// charset_traits.hh // This file is part of libpbe; see http://anyterm.org/ // (C) 2007-2008 Philip Endecott // Distributed under the Boost Software License, Version 1.0: // // Permission is hereby granted, free of charge, to any person or organization // obtaining a copy of the software and accompanying documentation covered by // this license (the "Software") to use, reproduce, display, distribute, // execute, and transmit the Software, and to prepare derivative works of the // Software, and to permit third-parties to whom the Software is furnished to // do so, all subject to the following: // // The copyright notices in the Software and this entire statement, including // the above license grant, this restriction and the following disclaimer, // must be included in all copies of the Software, in whole or in part, and // all derivative works of the Software, unless such copies or derivative // works are solely in the form of machine-executable object code generated by // a source language processor. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT // SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE // FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER // DEALINGS IN THE SOFTWARE. #ifndef libpbe_charset_charset_traits_hh #define libpbe_charset_charset_traits_hh #include "charset_t.hh" #include "char_t.hh" namespace pbe { // Character Set Traits // -------------------- // // charset_traits provides information about a character set cs, identified // by a member of the charset_t enumeration. This information comprises: // - The character type, char_t. // - The unit type, unit_t. // - The state type (for character sets like Shift-JIS), state_t. 
//     The character and unit types differ for variable-length encodings.
//     Since the majority of the character sets in charset_t are fixed-length
//     byte character sets with no shift-state, this is defined as a default.
//     Note that different character sets' state types are distinct, even if they
//     are all stateless character sets.
// - The boolean constants is_ascii_superset and is_iso8859_common_superset
//     (the latter is declared in the code below as is_ascii_plus_c1_superset),
//     which indicate whether the character set shares codes 0-127 with ASCII
//     and whether it shares codes 0-160 (inclusive) with the iso-8859 character
//     sets, respectively.  (The iso-8859 character sets are all ASCII supersets,
//     and they also have in common with each other characters 128 to 160
//     inclusive; 128 to 159 are control characters and 160 is non-breaking space.)
// - The following functions, for conversion between the character type and
//   a sequence of units:
//     template <typename const_unit_ptr_t> char_t decode(const_unit_ptr_t& p);
//       Decodes and returns one character pointed to by p, and advances p
//       to point to the next character.
//     template <typename unit_ptr_t> void encode(unit_ptr_t& p, char_t c);
//       Encodes the character c and stores it starting at *p, and advances
//       p to point after the character that it has written.
// - The following functions, for per-character iteration through units:
//     template <typename const_unit_ptr_t> void skip_forward_char(const_unit_ptr_t& p);
//       Advance p to point to the next character.
//     template <typename const_unit_ptr_t> void skip_backward_char(const_unit_ptr_t& p);
//       Move p back to point to the previous character.
//     template <typename const_unit_ptr_t> int char_length(const_unit_ptr_t& p);
//       Returns the length in units of the character pointed to by p.
// - The following to help client code allocate buffer space:
//     size_t max_characters(size_t n_units)
//     size_t typ_characters(size_t n_units)
//     size_t max_units(size_t n_characters)
//     size_t typ_units(size_t n_characters)
// Maybe add: check whether units pointed to are valid.
// Fixed-length character sets
// ---------------------------
//
// For fixed-length character sets, most of these things are trivial; the
// only variable is whether they have the common ASCII or ISO-8859 subset.
// Specialisations for such cases can inherit from one of the helper classes
// defined here.
//
// NOTE(review): throughout this section the angle-bracketed template
// parameter and argument lists are absent - every primary template starts
// with a bare "template", and every base class and specialisation is named
// without arguments.  They appear to have been lost when this text was
// extracted; they must be restored from the upstream libpbe source before
// this header can compile.

// Traits for an encoding in which one unit is exactly one character.
//   char_t, unit_t  Both are the same type, C.
//   state_t         An empty struct nested in the traits class.
//   is_ascii_superset, is_ascii_plus_c1_superset
//                   Both false here; the derived helpers below redeclare them.
// cset must be a template parameter so that the state_t for each character
// set is distinct.
template struct fixed_charset_traits {
  typedef C char_t;
  typedef C unit_t;
  struct state_t {};
  static const bool is_ascii_superset = false;
  static const bool is_ascii_plus_c1_superset = false;

  // Returns the unit at *p as a character and then advances p by one unit.
  template static char_t decode(const_unit_ptr_t& p) { return *(p++); }

  // Stores c at *p and then advances p by one unit.
  template static void encode(unit_ptr_t& p, char_t c) { *(p++) = c; }

  // Moves p forward by one unit, which here is one character.
  template static void skip_forward_char(const_unit_ptr_t& p) { ++p; }

  // Moves p back by one unit, which here is one character.
  template static void skip_backward_char(const_unit_ptr_t& p) { --p; }

  // Always 1: every character occupies a single unit, so p is not examined.
  template static int char_length(const_unit_ptr_t& p) { return 1; }

  // Buffer-sizing helpers.  Units and characters correspond one-to-one, so
  // the maximum and typical figures are both just the argument itself.
  static size_t max_characters(size_t n_units) { return n_units; }
  static size_t typ_characters(size_t n_units) { return n_units; }
  static size_t max_units(size_t n_characters) { return n_characters; }
  static size_t typ_units(size_t n_characters) { return n_characters; }
};

// Helper for fixed-length byte character sets; adds nothing of its own to
// fixed_charset_traits.
// NOTE(review): the arguments passed to fixed_charset_traits (presumably the
// character set and a byte-sized unit type) are missing here - confirm
// against upstream.
template struct fixed_byte_charset_traits: fixed_charset_traits {};

// As fixed_byte_charset_traits, but declares is_ascii_superset as true and
// is_ascii_plus_c1_superset as false.
template struct fixed_byte_ascii_superset_charset_traits: fixed_byte_charset_traits {
  static const bool is_ascii_superset = true;
  static const bool is_ascii_plus_c1_superset = false;
};

// As fixed_byte_charset_traits, but declares both is_ascii_superset and
// is_ascii_plus_c1_superset as true.
template struct fixed_byte_iso8859_common_superset_charset_traits: fixed_byte_charset_traits {
  static const bool is_ascii_superset = true;
  static const bool is_ascii_plus_c1_superset = true;
};

// Here is the base charset traits class, for which specialisations are made.
// The unspecialised template is deliberately empty.
template struct charset_traits {};

// For many of the fixed-length character sets, we define specialisations here.
// NOTE(review): the template argument that names the character set has been
// lost from every specialisation below (and from its base class), so which
// charset_t member each line refers to cannot be told from this text.
template <> struct charset_traits: fixed_byte_ascii_superset_charset_traits {};

// The ISO-8859 character sets:
template <> struct charset_traits: fixed_byte_iso8859_common_superset_charset_traits {};
template <> struct charset_traits: fixed_byte_iso8859_common_superset_charset_traits {};
template <> struct charset_traits: fixed_byte_iso8859_common_superset_charset_traits {};
template <> struct charset_traits: fixed_byte_iso8859_common_superset_charset_traits {};
template <> struct charset_traits: fixed_byte_iso8859_common_superset_charset_traits {};
template <> struct charset_traits: fixed_byte_iso8859_common_superset_charset_traits {};
template <> struct charset_traits: fixed_byte_iso8859_common_superset_charset_traits {};
template <> struct charset_traits: fixed_byte_iso8859_common_superset_charset_traits {};
template <> struct charset_traits: fixed_byte_iso8859_common_superset_charset_traits {};
template <> struct charset_traits: fixed_byte_iso8859_common_superset_charset_traits {};
template <> struct charset_traits: fixed_byte_iso8859_common_superset_charset_traits {};
template <> struct charset_traits: fixed_byte_iso8859_common_superset_charset_traits {};
template <> struct charset_traits: fixed_byte_iso8859_common_superset_charset_traits {};
template <> struct charset_traits: fixed_byte_iso8859_common_superset_charset_traits {};
template <> struct charset_traits: fixed_byte_iso8859_common_superset_charset_traits {};

// These sound related, but I don't know what they are:
// iso8859_6_e
// iso8859_6_i
// iso8859_supp

// The ISO-646 character sets.  These are 7-bit character sets with most characters shared
// with ASCII, but with some punctuation characters replaced by national accented characters
// and symbols.
// It might be useful to have a trait indicating that they're "approximately" ASCII.
// Specialisations of charset_traits that inherit everything from
// fixed_byte_charset_traits unchanged, so neither is_ascii_superset nor
// is_ascii_plus_c1_superset is true for them.  Per the comment that precedes
// this group, these are the ISO-646 character sets.
// NOTE(review): the template argument naming the character set is missing
// from each specialisation and from its base class (apparently stripped
// during extraction), so the lines are textually identical; restore the
// arguments from the upstream libpbe source.
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};

// This sounds related, but I don't know what it is:
// ISO_646.basic:1983

// These "code pages" are all 8-bit fixed-length encodings.
// Some/many of them will be ASCII supersets but this isn't identified here.
// (In some/most cases a definition is provided in the Unicode mapping tables.)
// Specialisations of charset_traits that inherit everything from
// fixed_byte_charset_traits unchanged, so neither is_ascii_superset nor
// is_ascii_plus_c1_superset is true for them.  Per the comment that precedes
// this group, these are the 8-bit "code page" encodings.
// NOTE(review): the template argument naming the character set is missing
// from each specialisation and from its base class (apparently stripped
// during extraction), so the lines are textually identical; restore the
// arguments from the upstream libpbe source.
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};
template <> struct charset_traits: fixed_byte_charset_traits {};

// The cyrillic KOI8 character sets are ASCII supersets.
// They use the helper that declares is_ascii_superset as true and
// is_ascii_plus_c1_superset as false.
template <> struct charset_traits: fixed_byte_ascii_superset_charset_traits {};
template <> struct charset_traits: fixed_byte_ascii_superset_charset_traits {};
template <> struct charset_traits: fixed_byte_ascii_superset_charset_traits {};

// The fixed Unicode formats.
// These derive directly from fixed_charset_traits rather than from one of the
// byte helpers.
// NOTE(review): presumably each supplies its own (wider) unit type as a
// template argument; the argument lists are missing here - confirm against
// upstream.
template <> struct charset_traits: fixed_charset_traits {};
template <> struct charset_traits: fixed_charset_traits {};

// For the variable-length character sets, specialisations are
// provided in separate files.  These are:
// utf8
// utf16
// iso_10646_utf_1
// unicode_1_1_utf_7
// UTF-7
// UTF16-BE
// UTF16-LE
// UTF32
// UTF32-BE
// UTF32-LE
// iso_2022_kr
// iso_2022_jp
// iso_2022_jp_2
// iso_2022_cn
// iso_2022_cn_ext
// shift_jis

// I don't yet know enough about the following character sets to supply traits
// for them.  I do not aim to do so for every single character set.
// iso_ir_142 // jis_x0201 // jis_encoding // euc_jp // Extended_UNIX_Code_Fixed_Width_for_Japanese // INVARIANT // iso_ir_2 // iso_ir_8_1 // iso_ir_8_2 // iso_ir_9_1 // iso_ir_9_2 // iso_ir_149 // euc_kr // iso_ir_13 // iso_ir_14 // iso_ir_18 // iso_ir_19 // iso_ir_27 // iso_ir_37 // iso_ir_42 // iso_ir_47 // iso_ir_49 // iso_ir_50 // iso_ir_51 // iso_ir_52 // iso_ir_53 // iso_ir_54 // iso_ir_55 // iso_ir_57 // iso_ir_58 // iso_ir_70 // iso_ir_87 // iso_ir_88 // iso_ir_89 // iso_ir_90 // iso_ir_91 // iso_ir_93 // iso_ir_94 // iso_ir_95 // iso_ir_96 // iso_ir_98 // iso_ir_99 // iso_ir_102 // iso_ir_103 // iso_ir_103 // iso_ir_123 // iso_ir_139 // iso_ir_143 // iso_ir_146 // iso_ir_147 // iso_ir_150 // iso_ir_152 // iso_ir_153 // iso_ir_155 // iso_ir_158 // iso_ir_159 // us-dk // dk-us // gb18030 // OSD_EBCDIC_DF04_15 // OSD_EBCDIC_DF03_IRV // OSD_EBCDIC_DF04_1 // ISO-11548-1 // KZ-1048 // ISO-10646-UCS-Basic // ISO-10646-Unicode-Latin1 // ISO-10646-J-1 // ISO-Unicode-IBM-1261 // ISO-Unicode-IBM-1268 // ISO-Unicode-IBM-1276 // ISO-Unicode-IBM-1264 // ISO-Unicode-IBM-1265 // UNICODE-1-1 // SCSU // CESU-8 // BOCU-1 // ISO-8859-1-Windows-3.0-Latin-1 // ISO-8859-1-Windows-3.1-Latin-1 // ISO-8859-2-Windows-Latin-2 // ISO-8859-9-Windows-Latin-5 // hp_roman8 // Adobe-Standard-Encoding // Ventura-US // Ventura-International // DEC-MCS // PC8-Danish-Norwegian // PC8-Turkish // IBM-Symbols // IBM-Thai // HP-Legal // HP-Pi-font // HP-Math8 // Adobe-Symbol-Encoding // HP-DeskTop // Ventura-Math // Microsoft-Publishing // Windows-31J // GB2312 // Big5 // macintosh // EBCDIC-AT-DE // EBCDIC-AT-DE-A // EBCDIC-CA-FR // EBCDIC-DK-NO // EBCDIC-DK-NO-A // EBCDIC-FI-SE // EBCDIC-FI-SE-A // EBCDIC-FR // EBCDIC-IT // EBCDIC-PT // EBCDIC-ES // EBCDIC-ES-A // EBCDIC-ES-S // EBCDIC-UK // EBCDIC-US // UNKNOWN-8BIT // MNEMONIC // MNEM // VISCII // VIQR // HZ-GB-2312 // Big5-HKSCS // Amiga1251 // KOI7-switched // BRF // TSCII // TIS-620 }; #endif