pandorafms/extras/anytermd/libpbe/include/charset/conv/iso8859.hh

256 lines
9.5 KiB
C++
Raw Normal View History

// iso8859.hh
// This file is part of libpbe; see http://anyterm.org/
// (C) 2008 Philip Endecott
// Distributed under the Boost Software License, Version 1.0:
//
// Permission is hereby granted, free of charge, to any person or organization
// obtaining a copy of the software and accompanying documentation covered by
// this license (the "Software") to use, reproduce, display, distribute,
// execute, and transmit the Software, and to prepare derivative works of the
// Software, and to permit third-parties to whom the Software is furnished to
// do so, all subject to the following:
//
// The copyright notices in the Software and this entire statement, including
// the above license grant, this restriction and the following disclaimer,
// must be included in all copies of the Software, in whole or in part, and
// all derivative works of the Software, unless such copies or derivative
// works are solely in the form of machine-executable object code generated by
// a source language processor.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
// SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
// FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.
#ifndef libpbe_charset_conv_iso8859_hh
#define libpbe_charset_conv_iso8859_hh
#include "charset/charset_t.hh"
#include "charset/char_t.hh"
#include "charset/charset_traits.hh"
#include "charset/char_conv.hh"
#include <boost/cstdint.hpp>
#include <algorithm>
namespace pbe {
// Conversion of iso8859 characters
// --------------------------------
//
// This file implements character conversions to and from the iso8859 character
// sets.
// Conversions to Unicode characters
// ---------------------------------
// All iso8859 characters correspond to "basic multilingual plane" unicode
// characters, i.e. they fit in a 16-bit ucs2 character.
// iso8859-1 is a special case: the first 256 Unicode characters are
// the iso8859-1 characters.
// NOTE(review): IDENTITY_CHAR_CONV is not defined in this file; presumably it
// declares a char_conv specialisation that passes the character value through
// unchanged -- confirm against charset/char_conv.hh.
IDENTITY_CHAR_CONV(cs::iso8859_1,cs::ucs2)
IDENTITY_CHAR_CONV(cs::iso8859_1,cs::ucs4)
// For the other character sets, characters 0 to 160 inclusive map directly to
// Unicode. Tables are used to map characters 161 to 255:
// One table per character set: entry [c-161] holds the ucs2 code point for
// iso8859 character c, for c in 161 to 255 (95 entries).
typedef char16_t iso8859_table_t[95]; // character n in [n-161].
// Declared here, defined elsewhere.  Entry [n-2] is the table for iso8859-n,
// giving 15 slots for n in 2 to 16.
extern iso8859_table_t iso8859_tables[15]; // iso8859-n in [n-2].
// These tables are automatically generated from data from unicode.org.
// Some character sets don't use some codes. The sentinel value -1 is stored
// in the tables for these cases. (As a 16-bit table entry this is 0xffff,
// which is not a valid ucs2 character.)
// In the following, template parameter n is the character set number 2-16.
// Map the iso8859-n character c to a Unicode code point.
// c is first reinterpreted as an unsigned 8-bit value.  Values below 161 are
// returned unchanged; values 161 to 255 are looked up in iso8859_tables[n-2],
// so the result for an unused code is whatever that table stores for it.
template <int n>
static inline int iso8859_to_ucs ( char8_t c )
{
  const int first_table_code = 161;
  const int code = static_cast<int>(static_cast<uint8_t>(c));
  if (code < first_table_code) {
    return code;
  }
  return iso8859_tables[n-2][code - first_table_code];
}
// CONV_ISO8859_TO_UCS(CHARSET_NUM) defines the two char_conv specialisations
// that convert an iso8859-CHARSET_NUM character to ucs2 and to ucs4.  Both
// simply forward to iso8859_to_ucs<CHARSET_NUM>; the conversion state
// arguments are ignored.
// NOTE(review): error_policy is never consulted here, so for a code that the
// character set does not use the table's sentinel value is returned as-is --
// confirm that this is intended.
#define CONV_ISO8859_TO_UCS(CHARSET_NUM) \
template <typename error_policy> \
struct char_conv<cs::iso8859_##CHARSET_NUM, cs::ucs2, error_policy> { \
  char16_t operator() ( char8_t c, \
                        charset_traits<cs::iso8859_##CHARSET_NUM>::state_t&, \
                        charset_traits<cs::ucs2>::state_t& ) \
  { \
    const int code_point = iso8859_to_ucs<CHARSET_NUM>(c); \
    return code_point; \
  } \
}; \
template <typename error_policy> \
struct char_conv<cs::iso8859_##CHARSET_NUM, cs::ucs4, error_policy> { \
  char32_t operator() ( char8_t c, \
                        charset_traits<cs::iso8859_##CHARSET_NUM>::state_t&, \
                        charset_traits<cs::ucs4>::state_t& ) \
  { \
    const int code_point = iso8859_to_ucs<CHARSET_NUM>(c); \
    return code_point; \
  } \
};

// Instantiate the conversions for each supported character set.
CONV_ISO8859_TO_UCS(2)
CONV_ISO8859_TO_UCS(3)
CONV_ISO8859_TO_UCS(4)
CONV_ISO8859_TO_UCS(5)
CONV_ISO8859_TO_UCS(6)
CONV_ISO8859_TO_UCS(7)
CONV_ISO8859_TO_UCS(8)
CONV_ISO8859_TO_UCS(9)
CONV_ISO8859_TO_UCS(10)
// iso8859-11 (Thai) is disabled because it is missing from the IANA file:
//CONV_ISO8859_TO_UCS(11)
// iso8859-12 is deliberately absent; it is the abandoned Devanagari set.
CONV_ISO8859_TO_UCS(13)
CONV_ISO8859_TO_UCS(14)
CONV_ISO8859_TO_UCS(15)
CONV_ISO8859_TO_UCS(16)

// The macro is only needed for the instantiations above.
#undef CONV_ISO8859_TO_UCS
// Conversion from Unicode characters
// ----------------------------------
// iso8859-1 is a special case again:
// Convert a ucs2 character to iso8859-1.
// Code points 0 to 0xff are returned unchanged; the conversion state
// arguments are ignored.  Anything above 0xff has no iso8859-1 equivalent, so
// the result is whatever error_policy::no_equivalent() yields, matching the
// table-driven conversions for the other iso8859 sets in this file.  Whether
// that call returns at all depends on the policy, which is defined elsewhere.
template <typename error_policy>
struct char_conv<cs::ucs2, cs::iso8859_1, error_policy> {
  char8_t operator() ( char16_t c,
                       charset_traits<cs::ucs2>::state_t&,
                       charset_traits<cs::iso8859_1>::state_t& )
  {
    if (c>0xff) {
      // Return the policy's result rather than discarding it and falling
      // through to return c truncated to its low byte.
      return error_policy::no_equivalent();
    }
    return static_cast<char8_t>(c);
  }
};
// Convert a ucs4 character to iso8859-1.
// Code points 0 to 0xff are returned unchanged; the conversion state
// arguments are ignored.  Anything above 0xff has no iso8859-1 equivalent, so
// the result is whatever error_policy::no_equivalent() yields, matching the
// table-driven conversions for the other iso8859 sets in this file.  Whether
// that call returns at all depends on the policy, which is defined elsewhere.
template <typename error_policy>
struct char_conv<cs::ucs4, cs::iso8859_1, error_policy> {
  char8_t operator() ( char32_t c,
                       charset_traits<cs::ucs4>::state_t&,
                       charset_traits<cs::iso8859_1>::state_t& )
  {
    if (c>0xff) {
      // Return the policy's result rather than discarding it and falling
      // through to return c truncated to its low byte.
      return error_policy::no_equivalent();
    }
    return static_cast<char8_t>(c);
  }
};
// For the other cases we use tables.
// A single-level table-driven conversion would require large, sparse
// tables; instead we break the unicode space into pages and have one table
// for each combination of ucs page and iso8859 character set.
// These tables are generated dynamically only as needed by invoking
// the reverse functions above.
// Build the reverse lookup table for one Unicode page of iso8859-n.
// The result has 256 entries indexed by the low byte of the code point.  Each
// entry is the iso8859-n character (161 to 255) whose code point lies in
// `page` (the code point shifted right by 8), or 0 when there is none.
// The array is allocated with new[] and never deleted.
template <int n, int page>
static inline const char8_t* mk_ucs_to_iso8859_page_table() {
  const int table_size = 256;
  char8_t* const lookup = new char8_t[table_size]; // never deleted
  std::fill(lookup, lookup+table_size, 0); // 0 = no equivalent
  // Only codes 161 to 255 go through the forward tables, so only those need
  // reverse entries.
  for (int iso_code=161; iso_code<table_size; ++iso_code) {
    const int code_point = iso8859_to_ucs<n>(iso_code);
    if ((code_point>>8) != page) {
      continue;
    }
    lookup[code_point&0xff] = iso_code;
  }
  return lookup;
}
// Look up the iso8859-n character for the code point with low byte `point`
// in Unicode page `page`.
// The page table is built on the first call and kept in a function-local
// static, so there is one table per combination of template arguments.
// A table entry of 0 means there is no equivalent; in that case the result
// of error_policy::no_equivalent() is returned instead.
template <int n, int page, typename error_policy>
static inline char8_t char_conv_ucs_to_iso8859_lookup ( uint8_t point )
{
  /*FIXME THREAD SAFE*/ static const char8_t* page_table = mk_ucs_to_iso8859_page_table<n,page>();
  const char8_t iso_char = page_table[point];
  if (iso_char != 0) {
    return iso_char;
  }
  return error_policy::no_equivalent();
}
// Convert the Unicode code point c to an iso8859-n character.
// Code points 0 to 160 are returned unchanged.  Larger values are split into
// a page (c>>8) and a point (c&0xff) and looked up in the per-page reverse
// table.  Negative values, pages that no iso8859 character set uses, and
// points with no table entry all yield error_policy::no_equivalent().
template <int n, typename error_policy>
static inline char8_t ucs_to_iso8859 ( int c )
{
  if (c<0) {
    // A negative value can arrive when a ucs4 value too large for int is
    // converted to the int parameter.  Without this check it would pass the
    // c<=160 test below and be returned as its low byte.
    return error_policy::no_equivalent();
  }
  if (c<=160) {
    return static_cast<char8_t>(c);
  }
  const int page = c>>8;
  const uint8_t point = c&0xff;
  switch (page) {
    // These are the only pages that have any characters in any iso8859 character sets.
    // FIXME we could use specialisation to consider only those pages that apply to
    // a particular character set.
    case 0x00: return char_conv_ucs_to_iso8859_lookup<n,0x00,error_policy>(point);
    case 0x01: return char_conv_ucs_to_iso8859_lookup<n,0x01,error_policy>(point);
    case 0x02: return char_conv_ucs_to_iso8859_lookup<n,0x02,error_policy>(point);
    case 0x03: return char_conv_ucs_to_iso8859_lookup<n,0x03,error_policy>(point);
    case 0x04: return char_conv_ucs_to_iso8859_lookup<n,0x04,error_policy>(point);
    case 0x05: return char_conv_ucs_to_iso8859_lookup<n,0x05,error_policy>(point);
    case 0x06: return char_conv_ucs_to_iso8859_lookup<n,0x06,error_policy>(point);
    case 0x0e: return char_conv_ucs_to_iso8859_lookup<n,0x0e,error_policy>(point);
    case 0x1e: return char_conv_ucs_to_iso8859_lookup<n,0x1e,error_policy>(point);
    case 0x20: return char_conv_ucs_to_iso8859_lookup<n,0x20,error_policy>(point);
    case 0x21: return char_conv_ucs_to_iso8859_lookup<n,0x21,error_policy>(point);
    default: return error_policy::no_equivalent();
  }
}
// CONV_UCS_TO_ISO8859(CHARSET_NUM) defines the two char_conv specialisations
// that convert a ucs2 or ucs4 character to iso8859-CHARSET_NUM.  Both forward
// to ucs_to_iso8859<CHARSET_NUM,error_policy>, passing the character as an
// int; the conversion state arguments are ignored.
#define CONV_UCS_TO_ISO8859(CHARSET_NUM) \
template <typename error_policy> \
struct char_conv<cs::ucs2, cs::iso8859_##CHARSET_NUM, error_policy> { \
  char8_t operator() ( char16_t c, \
                       charset_traits<cs::ucs2>::state_t&, \
                       charset_traits<cs::iso8859_##CHARSET_NUM>::state_t& ) \
  { \
    const int code_point = c; \
    return ucs_to_iso8859<CHARSET_NUM,error_policy>(code_point); \
  } \
}; \
template <typename error_policy> \
struct char_conv<cs::ucs4, cs::iso8859_##CHARSET_NUM, error_policy> { \
  char8_t operator() ( char32_t c, \
                       charset_traits<cs::ucs4>::state_t&, \
                       charset_traits<cs::iso8859_##CHARSET_NUM>::state_t& ) \
  { \
    const int code_point = c; \
    return ucs_to_iso8859<CHARSET_NUM,error_policy>(code_point); \
  } \
};

// Instantiate the conversions for the same character sets as the
// iso8859-to-Unicode direction (11 and 12 are not provided).
CONV_UCS_TO_ISO8859(2)
CONV_UCS_TO_ISO8859(3)
CONV_UCS_TO_ISO8859(4)
CONV_UCS_TO_ISO8859(5)
CONV_UCS_TO_ISO8859(6)
CONV_UCS_TO_ISO8859(7)
CONV_UCS_TO_ISO8859(8)
CONV_UCS_TO_ISO8859(9)
CONV_UCS_TO_ISO8859(10)
CONV_UCS_TO_ISO8859(13)
CONV_UCS_TO_ISO8859(14)
CONV_UCS_TO_ISO8859(15)
CONV_UCS_TO_ISO8859(16)

// The macro is only needed for the instantiations above.
#undef CONV_UCS_TO_ISO8859
};
#endif