pandorafms/extras/anytermd/libpbe/include/charset/utf8.hh

211 lines
6.8 KiB
C++

// utf8.hh
// This file is part of libpbe; see http://anyterm.org/
// (C) 2007-2008 Philip Endecott
// Distributed under the Boost Software License, Version 1.0:
//
// Permission is hereby granted, free of charge, to any person or organization
// obtaining a copy of the software and accompanying documentation covered by
// this license (the "Software") to use, reproduce, display, distribute,
// execute, and transmit the Software, and to prepare derivative works of the
// Software, and to permit third-parties to whom the Software is furnished to
// do so, all subject to the following:
//
// The copyright notices in the Software and this entire statement, including
// the above license grant, this restriction and the following disclaimer,
// must be included in all copies of the Software, in whole or in part, and
// all derivative works of the Software, unless such copies or derivative
// works are solely in the form of machine-executable object code generated by
// a source language processor.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
// SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
// FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.
#ifndef libpbe_charset_utf8_hh
#define libpbe_charset_utf8_hh
#include "char_t.hh"
#include "charset_t.hh"
#include "charset_traits.hh"
#include "compiler_magic.hh"
// UTF-8 Encoding and Decoding
// ---------------------------
//
// This file provides functions that do the bit-shuffling needed for UTF-8
// encoding and decoding in the form of a charset_traits specialisation.
//
// TODO there should be an error_policy template parameter to the decoding
// function so that the behaviour when invalid input is encountered can be
// defined.
//
// These functions perform significantly better than the alternatives that
// I benchmarked: Ken Thompson's originals, glibc's mbsrtowcs etc. and
// glibc's iconv. Performance is somewhat sensitive to optimisation, however;
// the branch prediction hints that I've included make this less of a problem.
namespace pbe {
template <> struct charset_traits<cs::utf8> {
// Type of a decoded character: decode() returns it and encode() accepts it.
typedef char32_t char_t;
// Type of one encoded unit: decode() reads and encode() writes values of
// this type through the caller's pointer/iterator.
typedef char8_t unit_t;
// Conversion state. It has no members; decode() and encode() below take no
// state argument.
struct state_t {};
// encode() writes any c <= 0x7f as a single unit equal to c.
static const bool is_ascii_superset = true;
// NOTE(review): encode() writes the C1 range (0x80..0x9f) as two units, so
// this flag presumably refers to the character repertoire rather than the
// unit values - confirm against the users of this flag.
static const bool is_ascii_plus_c1_superset = true;
// Exception type thrown by check() when decode() meets malformed input.
struct InvalidUTF8 {}; // FIXME see above error_policy needed
private:
// Validation helper for decode(): throws InvalidUTF8 when condition is false.
// The failing case is wrapped in IF_UNLIKELY so that the hint favours the
// path on which the input is well formed.
static void check(bool condition) {
  IF_UNLIKELY(!condition) {
    throw InvalidUTF8();
  }
}
public:
// Decode one character from the UTF-8 sequence that starts at p.
//
// p is advanced past every unit that is read. Returns the decoded code
// point. Throws InvalidUTF8 (via check()) when the sequence is malformed:
// a byte that is not a valid lead or continuation byte, an overlong form,
// a UTF-16 surrogate (U+D800..U+DFFF) or a value above U+10FFFF.
//
// No end-of-input test is made, and each continuation unit is fetched
// before the lead byte has been matched against the corresponding length
// pattern, so malformed input can cause up to four units to be read. The
// caller must make sure that many units are readable.
template <typename const_char8_ptr_t>
static char32_t decode(const_char8_ptr_t& p) {
  char8_t b0 = *(p++);
  // 0xxxxxxx: a single unit holding the character itself.
  IF_LIKELY((b0&0x80)==0) {
    return b0;
  }
  char8_t b1 = *(p++);
  check((b1&0xc0)==0x80);
  // 110xxxxx 10xxxxxx
  IF_LIKELY((b0&0xe0)==0xc0) {
    char32_t r = (b1&0x3f) | ((b0&0x1f)<<6);
    check(r>=0x80);  // reject overlong forms
    return r;
  }
  char8_t b2 = *(p++);
  check((b2&0xc0)==0x80);
  // 1110xxxx 10xxxxxx 10xxxxxx
  IF_LIKELY((b0&0xf0)==0xe0) {
    char32_t r = (b2&0x3f) | ((b1&0x3f)<<6) | ((b0&0x0f)<<12);
    // Reject overlong forms and the surrogate range, which must never
    // appear in UTF-8.
    check(r>=0x800 && (r<0xd800 || r>0xdfff));
    return r;
  }
  char8_t b3 = *(p++);
  check((b3&0xc0)==0x80);
  // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  IF_LIKELY((b0&0xf8)==0xf0) {
    char32_t r = (b3&0x3f) | ((b2&0x3f)<<6) | ((b1&0x3f)<<12) | ((b0&0x07)<<18);
    // Reject overlong forms and anything beyond the last Unicode code
    // point; the four-unit pattern could otherwise carry up to 0x1fffff.
    check(r>=0x10000 && r<=0x10ffff);
    return r;
  }
  // The original definition of UTF-8 includes 5- and 6-byte encodings.
  // But these correspond to values that are not valid Unicode characters,
  // and they are not included in the more recent spec. Included here
  // for interest:
  // char8_t b4 = *(p++);
  // check((b4&0xc0)==0x80);
  // IF_LIKELY((b0&0xfc)==0xf8) {
  // char32_t r = (b4&0x3f) | ((b3&0x3f)<<6) | ((b2&0x3f)<<12) | ((b1&0x3f)<<18) | ((b0&0x03)<<24);
  // check(r>=0x200000);
  // return r;
  // }
  // char8_t b5 = *(p++);
  // check((b5&0xc0)==0x80);
  // IF_LIKELY((b0&0xfe)==0xfc) {
  // char32_t r = (b5&0x3f) | ((b4&0x3f)<<6) | ((b3&0x3f)<<12) | ((b2&0x3f)<<18) | ((b1&0x3f)<<24) | ((b0&0x01)<<30);
  // check(r>=0x4000000);
  // return r;
  // }
  // b0 matched none of the lead-byte patterns above (a stray continuation
  // byte, or 0xf8 and up).
  check(false);
  return 0; // not reached
}
// Encode the code point c as UTF-8, writing one to four units through p.
//
// p is advanced past each unit written. If c cannot be represented in
// UTF-8 - a UTF-16 surrogate (U+D800..U+DFFF) or a value above U+10FFFF -
// a string literal (const char*) is thrown before anything is written.
//
// The branches nest so that the trailing continuation units are emitted by
// code shared between the 2-, 3- and 4-unit forms. The 5- and 6-unit forms
// of the original UTF-8 definition are deliberately not produced.
template <typename char8_ptr_t>
static void encode(char8_ptr_t& p, char32_t c) {
  IF_LIKELY(c<=0x7f) {
    *(p++) = c;
  } else {
    IF_LIKELY(c<=0x7ff) {
      *(p++) = 0xc0 | (c>>6);
    } else {
      IF_LIKELY(c<=0xffff) {
        // Surrogates are reserved for UTF-16 and have no UTF-8 encoding.
        if (c>=0xd800 && c<=0xdfff) {
          throw "can't represent this value in UTF8"; // needs error_policy
        }
        *(p++) = 0xe0 | (c>>12);
      } else {
        // U+10FFFF is the last code point; larger values would need the
        // invalid lead bytes 0xf5..0xf7.
        IF_LIKELY(c<=0x10ffff) {
          *(p++) = 0xf0 | (c>>18);
        } else {
          throw "can't represent this value in UTF8"; // needs error_policy
        }
        *(p++) = 0x80 | ((c>>12)&0x3f);
      }
      *(p++) = 0x80 | ((c>>6)&0x3f);
    }
    *(p++) = 0x80 | (c&0x3f);
  }
}
// Skip forward and backward to the start of the next character.
// We know the length of a character from its first byte, so in principle we
// need only look at that to skip forward. But I guess that it's actually
// quicker to look at all bytes until a character-starting-byte is found,
// because the bit manipulation is simpler and less code is needed.
private:
// True when b can begin a character, i.e. when it is not a continuation
// unit. Continuation units are exactly those of the form 10xxxxxx.
static bool char_start_byte(char8_t b) {
  const bool is_continuation = ((b&0xc0) == 0x80);
  return !is_continuation;
}
public:
// Move i forward to the next unit for which char_start_byte() is true.
// i always advances by at least one unit; no end-of-range test is made.
template <typename char8_ptr_t>
static void skip_forward_char(char8_ptr_t& i) {
  for (++i; !char_start_byte(*i); ++i) {
    // stepping over continuation units
  }
}
// Move i backward to the nearest earlier unit for which char_start_byte()
// is true. i always retreats by at least one unit; no start-of-range test
// is made.
template <typename char8_ptr_t>
static void skip_backward_char(char8_ptr_t& i) {
  for (--i; !char_start_byte(*i); --i) {
    // stepping over continuation units
  }
}
template <typename const_char8_ptr_t>
static int char_length(const_char8_ptr_t& p) {
char8_t b = *p;
if ((b&0x80)==0) return 1;
if ((b&0xe0)==0xc0) return 2;
if ((b&0xf0)==0xe0) return 3;
if ((b&0xf8)==0xf0) return 4;
// As above, disable 5- and 6-byte forms:
// if ((b&0xfc)==0xf8) return 5;
// if ((b&0xfe)==0xfc) return 6;
return 0; // not reached for valid input
}
// Largest number of characters that n_units units can hold: every
// character takes at least one unit.
static size_t max_characters(size_t n_units) {
  return n_units;
}
// Estimate of the number of characters in n_units units; the estimate used
// is one character per unit.
static size_t typ_characters(size_t n_units) {
  return n_units;
}
// Largest number of units needed to hold n_characters characters: encode()
// never writes more than four units per character.
static size_t max_units(size_t n_characters) {
  const size_t longest_form = 4;
  return n_characters * longest_form;
}
// Estimate of the number of units needed for n_characters characters; the
// estimate used is two units per character.
static size_t typ_units(size_t n_characters) {
  const size_t assumed_form = 2;
  return n_characters * assumed_form;
}
};
};
#endif