pandorafms/extras/anytermd/libpbe/include/charset/utf8.hh

// utf8.hh
// This file is part of libpbe; see http://anyterm.org/
// (C) 2007-2008 Philip Endecott

// Distributed under the Boost Software License, Version 1.0:
//
// Permission is hereby granted, free of charge, to any person or organization
// obtaining a copy of the software and accompanying documentation covered by
// this license (the "Software") to use, reproduce, display, distribute,
// execute, and transmit the Software, and to prepare derivative works of the
// Software, and to permit third-parties to whom the Software is furnished to
// do so, all subject to the following:
//
// The copyright notices in the Software and this entire statement, including
// the above license grant, this restriction and the following disclaimer,
// must be included in all copies of the Software, in whole or in part, and
// all derivative works of the Software, unless such copies or derivative
// works are solely in the form of machine-executable object code generated by
// a source language processor.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
// SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
// FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.

#ifndef libpbe_charset_utf8_hh
#define libpbe_charset_utf8_hh

#include "char_t.hh"
#include "charset_t.hh"
#include "charset_traits.hh"

#include "compiler_magic.hh"


// UTF-8 Encoding and Decoding
// ---------------------------
//
// This file provides functions that do the bit-shuffling needed for UTF-8
// encoding and decoding in the form of a charset_traits specialisation.
//
// TODO there should be an error_policy template parameter to the decoding
// function so that the behaviour when invalid input is encountered can be
// defined.
//
// These functions perform significantly better than the alternatives that
// I benchmarked: Ken Thompson's originals, glibc's mbsrtowcs etc. and
// glibc's iconv.  Performance is somewhat sensitive to optimisation, however;
// the branch prediction hints that I've included make this less of a problem.


namespace pbe {

// charset_traits specialisation for UTF-8.
//
// Provides static functions to decode one character from a sequence of
// 8-bit units, encode one character to a sequence of 8-bit units, step
// between character boundaries, and estimate buffer sizes.
//
// Only the 1- to 4-byte forms are supported, and only Unicode scalar values
// (U+0000..U+D7FF and U+E000..U+10FFFF) are accepted by decode() and
// encode().  Overlong forms, UTF-16 surrogates and values above U+10FFFF are
// rejected.
//
// None of the functions does any bounds checking; the pointer/iterator
// arguments must refer to storage that is valid for every unit touched.
template <> struct charset_traits<cs::utf8> {

  typedef char32_t char_t;   // A decoded character.
  typedef char8_t unit_t;    // One encoded unit (a single byte).
  struct state_t {};         // Empty: carries no members.
  static const bool is_ascii_superset = true;
  static const bool is_ascii_plus_c1_superset = true;

  // Thrown by decode() when the input is not acceptable UTF-8.
  struct InvalidUTF8 {};  // FIXME see above error_policy needed

private:
  // Throw InvalidUTF8 unless 'condition' holds.  The failure case is hinted
  // as unlikely so that the checks stay off the hot path.
  static void check(bool condition) {
    IF_UNLIKELY(!condition) throw InvalidUTF8();
  }

public:
  // Decode the character that starts at *p and advance p past the units that
  // were consumed.
  //
  // Throws InvalidUTF8 if:
  //  - the first unit is not a valid lead byte (it is a continuation byte
  //    10xxxxxx, or one of 0xf8..0xff);
  //  - any following unit is not a continuation byte;
  //  - the sequence is an overlong encoding of a smaller value;
  //  - the value is a UTF-16 surrogate (U+D800..U+DFFF);
  //  - the value is greater than U+10FFFF.
  // When an exception is thrown p has been advanced past every unit read so
  // far, including the offending one.
  //
  // The lead byte is classified before any continuation byte is read, so an
  // invalid lead byte consumes exactly one unit.  Continuation bytes are
  // read and checked one at a time, so a non-continuation unit (such as a
  // terminating NUL) stops decoding immediately.
  template <typename const_char8_ptr_t>
  static char32_t decode(const_char8_ptr_t& p) {
    const char8_t b0 = *(p++);

    // 0xxxxxxx: one byte, ASCII.
    IF_LIKELY((b0&0x80)==0) {
      return b0;
    }

    // 110xxxxx 10xxxxxx: two bytes, U+0080..U+07FF.
    IF_LIKELY((b0&0xe0)==0xc0) {
      const char8_t b1 = *(p++);
      check((b1&0xc0)==0x80);
      const char32_t r = (b1&0x3f) | ((b0&0x1f)<<6);
      check(r>=0x80);  // reject overlong forms
      return r;
    }

    // 1110xxxx 10xxxxxx 10xxxxxx: three bytes, U+0800..U+FFFF.
    IF_LIKELY((b0&0xf0)==0xe0) {
      const char8_t b1 = *(p++);
      check((b1&0xc0)==0x80);
      const char8_t b2 = *(p++);
      check((b2&0xc0)==0x80);
      const char32_t r = (b2&0x3f) | ((b1&0x3f)<<6) | ((b0&0x0f)<<12);
      check(r>=0x800);             // reject overlong forms
      check((r&0xf800)!=0xd800);   // reject UTF-16 surrogates
      return r;
    }

    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx: four bytes, U+10000..U+10FFFF.
    IF_LIKELY((b0&0xf8)==0xf0) {
      const char8_t b1 = *(p++);
      check((b1&0xc0)==0x80);
      const char8_t b2 = *(p++);
      check((b2&0xc0)==0x80);
      const char8_t b3 = *(p++);
      check((b3&0xc0)==0x80);
      const char32_t r = (b3&0x3f) | ((b2&0x3f)<<6) | ((b1&0x3f)<<12) | ((b0&0x07)<<18);
      check(r>=0x10000);    // reject overlong forms
      check(r<=0x10ffff);   // reject values beyond the Unicode range
      return r;
    }

    // The original definition of UTF-8 also had 5- and 6-byte forms (lead
    // bytes 111110xx and 1111110x).  They correspond to values that are not
    // valid Unicode characters and are not part of the more recent spec, so
    // those lead bytes - like stray continuation bytes - are rejected here.
    check(false);
    return 0;  // not reached
  }


  // Encode character c as UTF-8, writing the units through p and advancing
  // p past them.  Between one and four units are written.
  //
  // Throws a string literal (const char*) if c cannot be represented: that
  // is, if c is a UTF-16 surrogate (U+D800..U+DFFF) or is greater than
  // U+10FFFF.  Nothing is written through p in that case.
  template <typename char8_ptr_t>
  static void encode(char8_ptr_t& p, char32_t c) {
    // One byte: 0xxxxxxx.
    IF_LIKELY(c<=0x7f) {
      *(p++) = c;
      return;
    }

    // Two bytes: 110xxxxx 10xxxxxx.
    IF_LIKELY(c<=0x7ff) {
      *(p++) = 0xc0 | (c>>6);
      *(p++) = 0x80 | (c&0x3f);
      return;
    }

    // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
    IF_LIKELY(c<=0xffff) {
      // Surrogates are not characters and must not appear in UTF-8.
      IF_UNLIKELY((c&0xf800)==0xd800) throw "can't represent this value in UTF8";
      *(p++) = 0xe0 | (c>>12);
      *(p++) = 0x80 | ((c>>6)&0x3f);
      *(p++) = 0x80 | (c&0x3f);
      return;
    }

    // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
    IF_LIKELY(c<=0x10ffff) {
      *(p++) = 0xf0 | (c>>18);
      *(p++) = 0x80 | ((c>>12)&0x3f);
      *(p++) = 0x80 | ((c>>6)&0x3f);
      *(p++) = 0x80 | (c&0x3f);
      return;
    }

    // As in decode(), the obsolete 5- and 6-byte forms are not produced, and
    // neither are 4-byte forms for values beyond U+10FFFF.
    throw "can't represent this value in UTF8";  // needs error_policy
  }


  // Skip forward and backward to the start of the next character.
  // We know the length of a character from its first byte, so in principle we
  // need only look at that to skip forward.  But I guess that it's actually
  // quicker to look at all bytes until a character-starting-byte is found,
  // because the bit manipulation is simpler and less code is needed.

private:
  // True if b can be the first byte of a character, i.e. it is not a
  // continuation byte.
  static bool char_start_byte(char8_t b) {
    // All non-first bytes of a UTF8 character are 10xxxxxx.
    return (b&0xc0) != 0x80;
  }

public:
  // Advance i by at least one unit, stopping at the first unit that is not a
  // continuation byte.  There is no bounds check: the caller must guarantee
  // that such a unit exists at or before the end of the valid range.
  template <typename char8_ptr_t>
  static void skip_forward_char(char8_ptr_t& i) {
    do {
      ++i;
    } while (!char_start_byte(*i));  // Maybe hint this?
  }

  // Move i back by at least one unit, stopping at the first unit that is not
  // a continuation byte.  There is no bounds check: the caller must guarantee
  // that such a unit exists at or after the start of the valid range.
  template <typename char8_ptr_t>
  static void skip_backward_char(char8_ptr_t& i) {
    do {
      --i;
    } while (!char_start_byte(*i));  // Maybe hint this?
  }

  // Return the number of units (1 to 4) in the character whose first unit is
  // *p, judged from that unit alone; p is not advanced and the following
  // units are not examined.  Returns 0 if *p is not a valid lead byte.
  template <typename const_char8_ptr_t>
  static int char_length(const_char8_ptr_t& p) {
    char8_t b = *p;
    if ((b&0x80)==0)    return 1;
    if ((b&0xe0)==0xc0) return 2;
    if ((b&0xf0)==0xe0) return 3;
    if ((b&0xf8)==0xf0) return 4;
    // As above, disable 5- and 6-byte forms:
    // if ((b&0xfc)==0xf8) return 5;
    // if ((b&0xfe)==0xfc) return 6;
    return 0;  // not reached for valid input
  }


  // Buffer-size estimates for converting between units and characters.
  // max_*: upper bounds (every unit may be a character; a character needs at
  // most 4 units).  typ_*: the typical figures chosen by this implementation.
  static size_t max_characters(size_t n_units) { return n_units; }
  static size_t typ_characters(size_t n_units) { return n_units; }
  static size_t max_units(size_t n_characters) { return 4*n_characters; }
  static size_t typ_units(size_t n_characters) { return 2*n_characters; }


};


}  // namespace pbe

#endif