// iso8859.hh
// This file is part of libpbe; see http://anyterm.org/
// (C) 2008 Philip Endecott

// Distributed under the Boost Software License, Version 1.0:
//
// Permission is hereby granted, free of charge, to any person or organization
// obtaining a copy of the software and accompanying documentation covered by
// this license (the "Software") to use, reproduce, display, distribute,
// execute, and transmit the Software, and to prepare derivative works of the
// Software, and to permit third-parties to whom the Software is furnished to
// do so, all subject to the following:
//
// The copyright notices in the Software and this entire statement, including
// the above license grant, this restriction and the following disclaimer,
// must be included in all copies of the Software, in whole or in part, and
// all derivative works of the Software, unless such copies or derivative
// works are solely in the form of machine-executable object code generated by
// a source language processor.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
// SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
// FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.

#ifndef libpbe_charset_conv_iso8859_hh
#define libpbe_charset_conv_iso8859_hh

#include "charset/charset_t.hh"
#include "charset/char_t.hh"
#include "charset/charset_traits.hh"
#include "charset/char_conv.hh"

// NOTE(review): the two system header names were missing from the reviewed
// text.  <stdint.h> (uint8_t) and <algorithm> (std::fill) are what the code
// below requires - confirm against upstream.
#include <stdint.h>
#include <algorithm>


namespace pbe {

// Conversion of iso8859 characters
// --------------------------------
//
// This file implements character conversions to and from the iso8859 character
// sets.
// Conversions to Unicode characters // --------------------------------- // All iso8859 characters correspond to "basic multilingual plane" unicode // characters, i.e. they fit in a 16-bit ucs2 character. // iso8859-1 is a special case: the first 256 Unicode characters are // the iso8859-1 characters. IDENTITY_CHAR_CONV(cs::iso8859_1,cs::ucs2) IDENTITY_CHAR_CONV(cs::iso8859_1,cs::ucs4) // For the other character sets, characters 0 to 160 inclusivve map directly to // Unicode. Tables are used to map characters 161 to 255: typedef char16_t iso8859_table_t[95]; // character n in [n-161]. extern iso8859_table_t iso8859_tables[15]; // iso8859-n in [n-2]. // These tables are automatically generated from data from unicode.org. // Some character sets don't use some codes. The sentinel value -1 is stored // in the tables for these cases. (0xffff is not a valid ucs2 character.) // In the following, template parameter n is the character set number 2-16. template static inline int iso8859_to_ucs ( char8_t c ) { int i = static_cast(static_cast(c)); return (i<161) ? i : iso8859_tables[n-2][i-161]; } #define CONV_ISO8859_TO_UCS(N) \ template \ struct char_conv { \ char16_t operator() ( char8_t c, \ charset_traits::state_t&, \ charset_traits::state_t& ) \ { \ return iso8859_to_ucs(c); \ } \ }; \ template \ struct char_conv { \ char32_t operator() ( char8_t c, \ charset_traits::state_t&, \ charset_traits::state_t& ) \ { \ return iso8859_to_ucs(c); \ } \ }; CONV_ISO8859_TO_UCS(2) CONV_ISO8859_TO_UCS(3) CONV_ISO8859_TO_UCS(4) CONV_ISO8859_TO_UCS(5) CONV_ISO8859_TO_UCS(6) CONV_ISO8859_TO_UCS(7) CONV_ISO8859_TO_UCS(8) CONV_ISO8859_TO_UCS(9) CONV_ISO8859_TO_UCS(10) //CONV_ISO8859_TO_UCS(11) // This is missing from the IANA file // -11 should be Thai. 
// -12 is supposed to be missing; it's the abandoned Devanagari CONV_ISO8859_TO_UCS(13) CONV_ISO8859_TO_UCS(14) CONV_ISO8859_TO_UCS(15) CONV_ISO8859_TO_UCS(16) #undef CONV_ISO8859_TO_UCS // Conversion from Unicode characters // ---------------------------------- // iso8859-1 is a special case again: template struct char_conv { char8_t operator() ( char16_t c, charset_traits::state_t&, charset_traits::state_t& ) { if (c>0xff) { error_policy::no_equivalent(); } return c; } }; template struct char_conv { char8_t operator() ( char32_t c, charset_traits::state_t&, charset_traits::state_t& ) { if (c>0xff) { error_policy::no_equivalent(); } return c; } }; // For the other cases we use tables. // A single-level table-driven conversion would require large, sparse // tables; instead we break the unicode space into pages and have one table // for each combination of ucs page and iso8859 character set. // These tables are generated dynamically only as needed by invoking // the reverse functions above. template static inline const char8_t* mk_ucs_to_iso8859_page_table() { char8_t* table = new char8_t[256]; // never deleted std::fill(table,table+256,0); // 0 = no equivalent for (int c=161; c<256; ++c) { int unichar = iso8859_to_ucs(c); if ((unichar>>8) == page) { table[unichar&0xff] = c; } } return table; } template static inline char8_t char_conv_ucs_to_iso8859_lookup ( uint8_t point ) { /*FIXME THREAD SAFE*/ static const char8_t* table_p = mk_ucs_to_iso8859_page_table(); char8_t c = table_p[point]; if (c==0) { return error_policy::no_equivalent(); } return c; } template static inline char8_t ucs_to_iso8859 ( int c ) { if (c<=160) { return c; } else { int page = c>>8; uint8_t point = c&0xff; switch (page) { // These are the only pages that have any characters in any iso8859 character sets. // FIXME we could use specialisation to consider only those pages that apply to // a partiuclar character set. 
case 0x00: return char_conv_ucs_to_iso8859_lookup(point); case 0x01: return char_conv_ucs_to_iso8859_lookup(point); case 0x02: return char_conv_ucs_to_iso8859_lookup(point); case 0x03: return char_conv_ucs_to_iso8859_lookup(point); case 0x04: return char_conv_ucs_to_iso8859_lookup(point); case 0x05: return char_conv_ucs_to_iso8859_lookup(point); case 0x06: return char_conv_ucs_to_iso8859_lookup(point); case 0x0e: return char_conv_ucs_to_iso8859_lookup(point); case 0x1e: return char_conv_ucs_to_iso8859_lookup(point); case 0x20: return char_conv_ucs_to_iso8859_lookup(point); case 0x21: return char_conv_ucs_to_iso8859_lookup(point); default: return error_policy::no_equivalent(); } } } #define CONV_UCS_TO_ISO8859(N) \ template \ struct char_conv { \ char8_t operator() ( char16_t c, \ charset_traits::state_t&, \ charset_traits::state_t& ) \ { \ return ucs_to_iso8859(c); \ } \ }; \ template \ struct char_conv { \ char8_t operator() ( char32_t c, \ charset_traits::state_t&, \ charset_traits::state_t& ) \ { \ return ucs_to_iso8859(c); \ } \ }; CONV_UCS_TO_ISO8859(2) CONV_UCS_TO_ISO8859(3) CONV_UCS_TO_ISO8859(4) CONV_UCS_TO_ISO8859(5) CONV_UCS_TO_ISO8859(6) CONV_UCS_TO_ISO8859(7) CONV_UCS_TO_ISO8859(8) CONV_UCS_TO_ISO8859(9) CONV_UCS_TO_ISO8859(10) CONV_UCS_TO_ISO8859(13) CONV_UCS_TO_ISO8859(14) CONV_UCS_TO_ISO8859(15) CONV_UCS_TO_ISO8859(16) #undef CONV_UCS_TO_ISO8859 }; #endif