pandorafms/extras/anytermd/libpbe/include/charset/conv/iso8859.hh

// iso8859.hh
// This file is part of libpbe; see http://anyterm.org/
// (C) 2008 Philip Endecott

// Distributed under the Boost Software License, Version 1.0:
//
// Permission is hereby granted, free of charge, to any person or organization
// obtaining a copy of the software and accompanying documentation covered by
// this license (the "Software") to use, reproduce, display, distribute,
// execute, and transmit the Software, and to prepare derivative works of the
// Software, and to permit third-parties to whom the Software is furnished to
// do so, all subject to the following:
//
// The copyright notices in the Software and this entire statement, including
// the above license grant, this restriction and the following disclaimer,
// must be included in all copies of the Software, in whole or in part, and
// all derivative works of the Software, unless such copies or derivative
// works are solely in the form of machine-executable object code generated by
// a source language processor.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
// SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
// FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.

#ifndef libpbe_charset_conv_iso8859_hh
#define libpbe_charset_conv_iso8859_hh

#include "charset/charset_t.hh"
#include "charset/char_t.hh"
#include "charset/charset_traits.hh"
#include "charset/char_conv.hh"

#include <boost/cstdint.hpp>

#include <algorithm>


namespace pbe {

// Conversion of iso8859 characters
// --------------------------------
//
// This file implements character conversions to and from the iso8859 character
// sets.


// Conversions to Unicode characters
// ---------------------------------

// All iso8859 characters correspond to "basic multilingual plane" unicode
// characters, i.e. they fit in a 16-bit ucs2 character.

// iso8859-1 is a special case: the first 256 Unicode characters are
// the iso8859-1 characters.

// NOTE(review): IDENTITY_CHAR_CONV is not defined in this file (presumably it
// comes from charset/char_conv.hh); it is assumed to declare a char_conv
// specialisation that passes the character value through unchanged - confirm.
IDENTITY_CHAR_CONV(cs::iso8859_1,cs::ucs2)
IDENTITY_CHAR_CONV(cs::iso8859_1,cs::ucs4)


// For the other character sets, characters 0 to 160 inclusive map directly to
// Unicode.  Tables are used to map characters 161 to 255:

// Each table row holds 95 entries, one for each of the codes 161 to 255.
// There are 15 rows, one for each of the character set numbers 2 to 16.
// Only the declaration is given here; the table data is defined elsewhere.
typedef char16_t iso8859_table_t[95];      // character n in [n-161].
extern iso8859_table_t iso8859_tables[15]; // iso8859-n in [n-2].

// These tables are automatically generated from data from unicode.org.

// Some character sets don't use some codes.  The sentinel value -1 (which is
// 0xffff once stored in a 16-bit table entry) is kept in the tables for these
// cases.  (0xffff is not a valid ucs2 character.)

// In the following, template parameter n is the character set number 2-16.

// Maps the iso8859-n character c to its Unicode code point.
// Codes below 161 are returned as they are; codes 161 to 255 are read from
// iso8859_tables[n-2], so a code that the character set does not use yields
// whatever value the table holds for it.
template <int n>
static inline int iso8859_to_ucs ( char8_t c )
{
  // Go via uint8_t so that the code is in the range 0 to 255 before it is
  // compared or used as an index.
  const uint8_t code = static_cast<uint8_t>(c);
  if (code>=161) {
    return iso8859_tables[n-2][code-161];
  }
  return code;
}

// CONV_ISO8859_TO_UCS(N) defines the two char_conv specialisations that
// convert an iso8859-N character to ucs2 and to ucs4.  Both simply forward to
// iso8859_to_ucs<N>; neither conversion state argument is used.
// NOTE(review): error_policy is not consulted here, so a code that iso8859-N
// does not use comes back as the value stored in the table for it - confirm
// that this is what callers expect.
#define CONV_ISO8859_TO_UCS(N)                                     \
template <typename error_policy>                                   \
struct char_conv<cs::iso8859_##N, cs::ucs2, error_policy> {        \
  char16_t operator() ( char8_t c,                                 \
                        charset_traits<cs::iso8859_##N>::state_t&, \
                        charset_traits<cs::ucs2>::state_t&  )      \
  {                                                                \
    const int ucs = iso8859_to_ucs<N>(c);                          \
    return static_cast<char16_t>(ucs);                             \
  }                                                                \
};                                                                 \
template <typename error_policy>                                   \
struct char_conv<cs::iso8859_##N, cs::ucs4, error_policy> {        \
  char32_t operator() ( char8_t c,                                 \
                        charset_traits<cs::iso8859_##N>::state_t&, \
                        charset_traits<cs::ucs4>::state_t&  )      \
  {                                                                \
    const int ucs = iso8859_to_ucs<N>(c);                          \
    return static_cast<char32_t>(ucs);                             \
  }                                                                \
};

// Define the conversions to ucs2 and ucs4 for iso8859-2 to iso8859-10 and
// for iso8859-13 to iso8859-16.
CONV_ISO8859_TO_UCS(2)
CONV_ISO8859_TO_UCS(3)
CONV_ISO8859_TO_UCS(4)
CONV_ISO8859_TO_UCS(5)
CONV_ISO8859_TO_UCS(6)
CONV_ISO8859_TO_UCS(7)
CONV_ISO8859_TO_UCS(8)
CONV_ISO8859_TO_UCS(9)
CONV_ISO8859_TO_UCS(10)
//CONV_ISO8859_TO_UCS(11)  // Disabled: iso8859-11 is missing from the IANA file.
                           // iso8859-11 should be Thai.
                           // iso8859-12 is supposed to be missing; it is the
                           // abandoned Devanagari character set.
CONV_ISO8859_TO_UCS(13)
CONV_ISO8859_TO_UCS(14)
CONV_ISO8859_TO_UCS(15)
CONV_ISO8859_TO_UCS(16)

// The macro is only needed for the definitions above.
#undef CONV_ISO8859_TO_UCS


// Conversion from Unicode characters
// ----------------------------------

// iso8859-1 is a special case again:

// Converts a ucs2 character to iso8859-1.
// Code points 0x00 to 0xff are returned unchanged, since the iso8859-1
// characters are the first 256 Unicode characters.  For any larger code point
// the result of error_policy::no_equivalent() is returned, in the same way as
// the table-driven ucs -> iso8859-n conversions later in this file.
// Neither conversion state argument is used.
template <typename error_policy>
struct char_conv<cs::ucs2, cs::iso8859_1, error_policy> {
  char8_t operator() ( char16_t c,
                       charset_traits<cs::ucs2>::state_t&,
                       charset_traits<cs::iso8859_1>::state_t& )
  {
    if (c>0xff) {
      // Hand back what the policy supplies rather than discarding it and
      // returning the code point truncated to 8 bits.
      return error_policy::no_equivalent();
    }
    return c;
  }
};

// Converts a ucs4 character to iso8859-1.
// Code points 0x00 to 0xff are returned unchanged, since the iso8859-1
// characters are the first 256 Unicode characters.  For any larger code point
// the result of error_policy::no_equivalent() is returned, in the same way as
// the table-driven ucs -> iso8859-n conversions later in this file.
// Neither conversion state argument is used.
template <typename error_policy>
struct char_conv<cs::ucs4, cs::iso8859_1, error_policy> {
  char8_t operator() ( char32_t c,
                       charset_traits<cs::ucs4>::state_t&,
                       charset_traits<cs::iso8859_1>::state_t& )
  {
    if (c>0xff) {
      // Hand back what the policy supplies rather than discarding it and
      // returning the code point truncated to 8 bits.
      return error_policy::no_equivalent();
    }
    return c;
  }
};


// For the other cases we use tables.
// A single-level table-driven conversion would require large, sparse
// tables; instead we break the unicode space into pages and have one table
// for each combination of ucs page and iso8859 character set.
// These tables are generated dynamically only as needed by invoking
// the reverse functions above.


// Builds the 256-entry reverse lookup table for one Unicode page (the high
// byte of the code point) and one iso8859-n character set.  The entry indexed
// by the low byte of a code point holds the matching iso8859-n code, or 0 if
// there is none.
// The table is filled by running each code from 161 to 255 through
// iso8859_to_ucs<n> and keeping those results that fall in the requested page.
// The returned array is allocated with new[] and is never freed.
template <int n, int page>
static inline const char8_t* mk_ucs_to_iso8859_page_table() {
  const int entries = 256;
  char8_t* const page_table = new char8_t[entries];  // never deleted
  std::fill(page_table, page_table+entries, 0);  // 0 = no equivalent
  int code = 161;
  while (code<256) {
    const int ucs = iso8859_to_ucs<n>(code);
    const int ucs_page = ucs>>8;
    const int ucs_point = ucs&0xff;
    if (ucs_page==page) {
      page_table[ucs_point] = code;
    }
    ++code;
  }
  return page_table;
}

// Looks up the iso8859-n code for the Unicode code point whose high byte is
// 'page' and whose low byte is 'point'.
// The page table is built the first time this function is called and is then
// kept in a function-local static, so each combination of template arguments
// has its own table.  A table entry of 0 means that there is no equivalent,
// in which case the result of error_policy::no_equivalent() is returned.
template <int n, int page, typename error_policy>
static inline char8_t char_conv_ucs_to_iso8859_lookup ( uint8_t point )
{
  /*FIXME THREAD SAFE*/ static const char8_t* const page_table = mk_ucs_to_iso8859_page_table<n,page>();
  const char8_t code = page_table[point];
  if (code!=0) {
    return code;
  }
  return error_policy::no_equivalent();
}

// Converts the Unicode code point c to the iso8859-n character set.
// Code points 0 to 160 are returned unchanged.  Any other code point is split
// into a page (the bits above the low byte) and a point (the low byte), and
// the point is looked up in the table for that page.
// The result of error_policy::no_equivalent() is returned when the page has
// no table, when the table has no entry for the point, or when c is negative.
template <int n, typename error_policy>
static inline char8_t ucs_to_iso8859 ( int c )
{
  if (c<0) {
    // A negative value is not a code point.  It would otherwise pass the
    // c<=160 test below and be returned truncated to 8 bits.
    // NOTE(review): this is presumably reachable when a ucs4 value that does
    // not fit in an int is passed in - confirm against the callers.
    return error_policy::no_equivalent();
  }
  if (c<=160) {
    return c;
  }
  const int page = c>>8;
  const uint8_t point = c&0xff;
  switch (page) {
    // These are the only pages that have any characters in any iso8859 character sets.
    // FIXME we could use specialisation to consider only those pages that apply to
    // a particular character set.
    case 0x00: return char_conv_ucs_to_iso8859_lookup<n,0x00,error_policy>(point);
    case 0x01: return char_conv_ucs_to_iso8859_lookup<n,0x01,error_policy>(point);
    case 0x02: return char_conv_ucs_to_iso8859_lookup<n,0x02,error_policy>(point);
    case 0x03: return char_conv_ucs_to_iso8859_lookup<n,0x03,error_policy>(point);
    case 0x04: return char_conv_ucs_to_iso8859_lookup<n,0x04,error_policy>(point);
    case 0x05: return char_conv_ucs_to_iso8859_lookup<n,0x05,error_policy>(point);
    case 0x06: return char_conv_ucs_to_iso8859_lookup<n,0x06,error_policy>(point);
    case 0x0e: return char_conv_ucs_to_iso8859_lookup<n,0x0e,error_policy>(point);
    case 0x1e: return char_conv_ucs_to_iso8859_lookup<n,0x1e,error_policy>(point);
    case 0x20: return char_conv_ucs_to_iso8859_lookup<n,0x20,error_policy>(point);
    case 0x21: return char_conv_ucs_to_iso8859_lookup<n,0x21,error_policy>(point);
    default:   return error_policy::no_equivalent();
  }
}


// CONV_UCS_TO_ISO8859(N) defines the two char_conv specialisations that
// convert a ucs2 or a ucs4 character to iso8859-N.  Both pass the code point
// as an int to ucs_to_iso8859<N,error_policy> and return its result; neither
// conversion state argument is used.
#define CONV_UCS_TO_ISO8859(N)                                        \
template <typename error_policy>                                      \
struct char_conv<cs::ucs2, cs::iso8859_##N, error_policy> {           \
  char8_t operator() ( char16_t c,                                    \
                       charset_traits<cs::ucs2>::state_t&,            \
                       charset_traits<cs::iso8859_##N>::state_t& )    \
  {                                                                   \
    const int code_point = c;                                         \
    return ucs_to_iso8859<N,error_policy>(code_point);                \
  }                                                                   \
};                                                                    \
template <typename error_policy>                                      \
struct char_conv<cs::ucs4, cs::iso8859_##N, error_policy> {           \
  char8_t operator() ( char32_t c,                                    \
                       charset_traits<cs::ucs4>::state_t&,            \
                       charset_traits<cs::iso8859_##N>::state_t& )    \
  {                                                                   \
    const int code_point = c;                                         \
    return ucs_to_iso8859<N,error_policy>(code_point);                \
  }                                                                   \
};

// Define the conversions from ucs2 and ucs4 for iso8859-2 to iso8859-10 and
// for iso8859-13 to iso8859-16.  There are no entries for 11 and 12.
CONV_UCS_TO_ISO8859(2)
CONV_UCS_TO_ISO8859(3)
CONV_UCS_TO_ISO8859(4)
CONV_UCS_TO_ISO8859(5)
CONV_UCS_TO_ISO8859(6)
CONV_UCS_TO_ISO8859(7)
CONV_UCS_TO_ISO8859(8)
CONV_UCS_TO_ISO8859(9)
CONV_UCS_TO_ISO8859(10)
CONV_UCS_TO_ISO8859(13)
CONV_UCS_TO_ISO8859(14)
CONV_UCS_TO_ISO8859(15)
CONV_UCS_TO_ISO8859(16)

// The macro is only needed for the definitions above.
#undef CONV_UCS_TO_ISO8859

};

#endif