256 lines
9.5 KiB
C++
256 lines
9.5 KiB
C++
// iso8859.hh
|
|
// This file is part of libpbe; see http://anyterm.org/
|
|
// (C) 2008 Philip Endecott
|
|
|
|
// Distributed under the Boost Software License, Version 1.0:
|
|
//
|
|
// Permission is hereby granted, free of charge, to any person or organization
|
|
// obtaining a copy of the software and accompanying documentation covered by
|
|
// this license (the "Software") to use, reproduce, display, distribute,
|
|
// execute, and transmit the Software, and to prepare derivative works of the
|
|
// Software, and to permit third-parties to whom the Software is furnished to
|
|
// do so, all subject to the following:
|
|
//
|
|
// The copyright notices in the Software and this entire statement, including
|
|
// the above license grant, this restriction and the following disclaimer,
|
|
// must be included in all copies of the Software, in whole or in part, and
|
|
// all derivative works of the Software, unless such copies or derivative
|
|
// works are solely in the form of machine-executable object code generated by
|
|
// a source language processor.
|
|
//
|
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
// FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
|
|
// SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
|
|
// FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
|
|
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
|
// DEALINGS IN THE SOFTWARE.
|
|
|
|
#ifndef libpbe_charset_conv_iso8859_hh
|
|
#define libpbe_charset_conv_iso8859_hh
|
|
|
|
#include "charset/charset_t.hh"
|
|
#include "charset/char_t.hh"
|
|
#include "charset/charset_traits.hh"
|
|
#include "charset/char_conv.hh"
|
|
|
|
#include <boost/cstdint.hpp>
|
|
|
|
#include <algorithm>
|
|
|
|
|
|
namespace pbe {
|
|
|
|
// Conversion of iso8859 characters
|
|
// --------------------------------
|
|
//
|
|
// This file implements character conversions to and from the iso8859 character
|
|
// sets.
|
|
|
|
|
|
// Conversions to Unicode characters
|
|
// ---------------------------------
|
|
|
|
// All iso8859 characters correspond to "basic multilingual plane" unicode
|
|
// characters, i.e. they fit in a 16-bit ucs2 character.
|
|
|
|
// iso8859-1 is a special case: the first 256 Unicode characters are
|
|
// the iso8859-1 characters.
|
|
|
|
// iso8859-1 -> ucs2 and iso8859-1 -> ucs4 both go through IDENTITY_CHAR_CONV
// (a macro that is not defined in this header).
IDENTITY_CHAR_CONV(cs::iso8859_1,cs::ucs2)
IDENTITY_CHAR_CONV(cs::iso8859_1,cs::ucs4)
|
|
|
|
|
|
// For the other character sets, characters 0 to 160 inclusive map directly to
|
|
// Unicode. Tables are used to map characters 161 to 255:
|
|
|
|
// One table row: the 95 ucs2 values for codes 161..255 of a single iso8859
// character set.
typedef char16_t iso8859_table_t[95]; // character n in [n-161].
|
|
// The per-character-set tables, one row per character set number 2..16.
// Only declared here; the definition is not part of this header.
extern iso8859_table_t iso8859_tables[15]; // iso8859-n in [n-2].
|
|
|
|
// These tables are automatically generated from data from unicode.org.
|
|
|
|
// Some character sets don't use some codes. The sentinel value -1 is stored
|
|
// in the tables for these cases. (0xffff is not a valid ucs2 character.)
|
|
|
|
// In the following, template parameter n is the character set number 2-16.
|
|
|
|
template <int n>
|
|
static inline int iso8859_to_ucs ( char8_t c )
|
|
{
|
|
int i = static_cast<int>(static_cast<uint8_t>(c));
|
|
return (i<161) ? i : iso8859_tables[n-2][i-161];
|
|
}
|
|
|
|
// CONV_ISO8859_TO_UCS(N) defines the two char_conv specialisations that
// convert an iso8859-N character to ucs2 and to ucs4.  Both forward to
// iso8859_to_ucs<N>; neither of the conversion-state arguments is used.
// NOTE(review): error_policy is not consulted here, so a code that the
// character set leaves unassigned is returned as whatever value the table
// holds for it - confirm that callers expect this.
#define CONV_ISO8859_TO_UCS(N)                                        \
  template <typename error_policy>                                    \
  struct char_conv<cs::iso8859_##N, cs::ucs2, error_policy> {         \
    char16_t operator() ( char8_t ch,                                 \
                          charset_traits<cs::iso8859_##N>::state_t&,  \
                          charset_traits<cs::ucs2>::state_t& )        \
    {                                                                 \
      return static_cast<char16_t>(iso8859_to_ucs<N>(ch));            \
    }                                                                 \
  };                                                                  \
  template <typename error_policy>                                    \
  struct char_conv<cs::iso8859_##N, cs::ucs4, error_policy> {         \
    char32_t operator() ( char8_t ch,                                 \
                          charset_traits<cs::iso8859_##N>::state_t&,  \
                          charset_traits<cs::ucs4>::state_t& )        \
    {                                                                 \
      return static_cast<char32_t>(iso8859_to_ucs<N>(ch));            \
    }                                                                 \
  };
|
|
|
|
// Define the to-Unicode conversions for each table-driven character set.
CONV_ISO8859_TO_UCS(2)
CONV_ISO8859_TO_UCS(3)
CONV_ISO8859_TO_UCS(4)
CONV_ISO8859_TO_UCS(5)
CONV_ISO8859_TO_UCS(6)
CONV_ISO8859_TO_UCS(7)
CONV_ISO8859_TO_UCS(8)
CONV_ISO8859_TO_UCS(9)
CONV_ISO8859_TO_UCS(10)
// iso8859-11 should be Thai, but it is missing from the IANA file, so its
// conversion stays disabled:
//CONV_ISO8859_TO_UCS(11)
// iso8859-12 is supposed to be missing; it is the abandoned Devanagari set.
CONV_ISO8859_TO_UCS(13)
CONV_ISO8859_TO_UCS(14)
CONV_ISO8859_TO_UCS(15)
CONV_ISO8859_TO_UCS(16)

// The helper macro is only needed for the definitions above.
#undef CONV_ISO8859_TO_UCS
|
|
|
|
|
|
// Conversion from Unicode characters
|
|
// ----------------------------------
|
|
|
|
// iso8859-1 is a special case again:
|
|
|
|
template <typename error_policy>
|
|
struct char_conv<cs::ucs2, cs::iso8859_1, error_policy> {
|
|
char8_t operator() ( char16_t c,
|
|
charset_traits<cs::ucs2>::state_t&,
|
|
charset_traits<cs::iso8859_1>::state_t& )
|
|
{
|
|
if (c>0xff) {
|
|
error_policy::no_equivalent();
|
|
}
|
|
return c;
|
|
}
|
|
};
|
|
|
|
template <typename error_policy>
|
|
struct char_conv<cs::ucs4, cs::iso8859_1, error_policy> {
|
|
char8_t operator() ( char32_t c,
|
|
charset_traits<cs::ucs4>::state_t&,
|
|
charset_traits<cs::iso8859_1>::state_t& )
|
|
{
|
|
if (c>0xff) {
|
|
error_policy::no_equivalent();
|
|
}
|
|
return c;
|
|
}
|
|
};
|
|
|
|
|
|
// For the other cases we use tables.
|
|
// A single-level table-driven conversion would require large, sparse
|
|
// tables; instead we break the unicode space into pages and have one table
|
|
// for each combination of ucs page and iso8859 character set.
|
|
// These tables are generated dynamically only as needed by invoking
|
|
// the reverse functions above.
|
|
|
|
|
|
template <int n, int page>
|
|
static inline const char8_t* mk_ucs_to_iso8859_page_table() {
|
|
char8_t* table = new char8_t[256]; // never deleted
|
|
std::fill(table,table+256,0); // 0 = no equivalent
|
|
for (int c=161; c<256; ++c) {
|
|
int unichar = iso8859_to_ucs<n>(c);
|
|
if ((unichar>>8) == page) {
|
|
table[unichar&0xff] = c;
|
|
}
|
|
}
|
|
return table;
|
|
}
|
|
|
|
template <int n, int page, typename error_policy>
|
|
static inline char8_t char_conv_ucs_to_iso8859_lookup ( uint8_t point )
|
|
{
|
|
/*FIXME THREAD SAFE*/ static const char8_t* table_p = mk_ucs_to_iso8859_page_table<n,page>();
|
|
char8_t c = table_p[point];
|
|
if (c==0) {
|
|
return error_policy::no_equivalent();
|
|
}
|
|
return c;
|
|
}
|
|
|
|
template <int n, typename error_policy>
|
|
static inline char8_t ucs_to_iso8859 ( int c )
|
|
{
|
|
if (c<=160) {
|
|
return c;
|
|
} else {
|
|
int page = c>>8;
|
|
uint8_t point = c&0xff;
|
|
switch (page) {
|
|
// These are the only pages that have any characters in any iso8859 character sets.
|
|
// FIXME we could use specialisation to consider only those pages that apply to
|
|
// a partiuclar character set.
|
|
case 0x00: return char_conv_ucs_to_iso8859_lookup<n,0x00,error_policy>(point);
|
|
case 0x01: return char_conv_ucs_to_iso8859_lookup<n,0x01,error_policy>(point);
|
|
case 0x02: return char_conv_ucs_to_iso8859_lookup<n,0x02,error_policy>(point);
|
|
case 0x03: return char_conv_ucs_to_iso8859_lookup<n,0x03,error_policy>(point);
|
|
case 0x04: return char_conv_ucs_to_iso8859_lookup<n,0x04,error_policy>(point);
|
|
case 0x05: return char_conv_ucs_to_iso8859_lookup<n,0x05,error_policy>(point);
|
|
case 0x06: return char_conv_ucs_to_iso8859_lookup<n,0x06,error_policy>(point);
|
|
case 0x0e: return char_conv_ucs_to_iso8859_lookup<n,0x0e,error_policy>(point);
|
|
case 0x1e: return char_conv_ucs_to_iso8859_lookup<n,0x1e,error_policy>(point);
|
|
case 0x20: return char_conv_ucs_to_iso8859_lookup<n,0x20,error_policy>(point);
|
|
case 0x21: return char_conv_ucs_to_iso8859_lookup<n,0x21,error_policy>(point);
|
|
default: return error_policy::no_equivalent();
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
// CONV_UCS_TO_ISO8859(N) defines the two char_conv specialisations that
// convert a ucs2 or ucs4 character to iso8859-N.  Both forward to
// ucs_to_iso8859<N,error_policy>; neither of the conversion-state arguments
// is used.
#define CONV_UCS_TO_ISO8859(N)                                        \
  template <typename error_policy>                                    \
  struct char_conv<cs::ucs2, cs::iso8859_##N, error_policy> {         \
    char8_t operator() ( char16_t ch,                                 \
                         charset_traits<cs::ucs2>::state_t&,          \
                         charset_traits<cs::iso8859_##N>::state_t& )  \
    {                                                                 \
      return ucs_to_iso8859<N,error_policy>(ch);                      \
    }                                                                 \
  };                                                                  \
  template <typename error_policy>                                    \
  struct char_conv<cs::ucs4, cs::iso8859_##N, error_policy> {         \
    char8_t operator() ( char32_t ch,                                 \
                         charset_traits<cs::ucs4>::state_t&,          \
                         charset_traits<cs::iso8859_##N>::state_t& )  \
    {                                                                 \
      return ucs_to_iso8859<N,error_policy>(ch);                      \
    }                                                                 \
  };
|
|
|
|
// Define the from-Unicode conversions for each table-driven character set
// (11 and 12 are skipped, matching the to-Unicode conversions).
CONV_UCS_TO_ISO8859(2)
CONV_UCS_TO_ISO8859(3)
CONV_UCS_TO_ISO8859(4)
CONV_UCS_TO_ISO8859(5)
CONV_UCS_TO_ISO8859(6)
CONV_UCS_TO_ISO8859(7)
CONV_UCS_TO_ISO8859(8)
CONV_UCS_TO_ISO8859(9)
CONV_UCS_TO_ISO8859(10)
CONV_UCS_TO_ISO8859(13)
CONV_UCS_TO_ISO8859(14)
CONV_UCS_TO_ISO8859(15)
CONV_UCS_TO_ISO8859(16)

// The helper macro is only needed for the definitions above.
#undef CONV_UCS_TO_ISO8859
|
|
|
|
};
|
|
|
|
#endif
|