// utf8.hh
// This file is part of libpbe; see http://anyterm.org/
// (C) 2007-2008 Philip Endecott

// Distributed under the Boost Software License, Version 1.0:
//
// Permission is hereby granted, free of charge, to any person or organization
// obtaining a copy of the software and accompanying documentation covered by
// this license (the "Software") to use, reproduce, display, distribute,
// execute, and transmit the Software, and to prepare derivative works of the
// Software, and to permit third-parties to whom the Software is furnished to
// do so, all subject to the following:
//
// The copyright notices in the Software and this entire statement, including
// the above license grant, this restriction and the following disclaimer,
// must be included in all copies of the Software, in whole or in part, and
// all derivative works of the Software, unless such copies or derivative
// works are solely in the form of machine-executable object code generated by
// a source language processor.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
// SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
// FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.

#ifndef libpbe_charset_utf8_hh
#define libpbe_charset_utf8_hh

#include <cstddef>

#include "char_t.hh"
#include "charset_t.hh"
#include "charset_traits.hh"

#include "compiler_magic.hh"


// UTF-8 Encoding and Decoding
// ---------------------------
//
// This file provides functions that do the bit-shuffling needed for UTF-8
// encoding and decoding in the form of a charset_traits specialisation.
//
// TODO there should be an error_policy template parameter to the decoding
// function so that the behaviour when invalid input is encountered can be
// defined.
//
// These functions perform significantly better than the alternatives that
// I benchmarked: Ken Thompson's originals, glibc's mbsrtowcs etc. and
// glibc's iconv. Performance is somewhat sensitive to optimisation, however;
// the branch prediction hints that I've included make this less of a problem.


namespace pbe {
|
|
|
|
template <> struct charset_traits<cs::utf8> {
|
|
|
|
typedef char32_t char_t;
|
|
typedef char8_t unit_t;
|
|
struct state_t {};
|
|
static const bool is_ascii_superset = true;
|
|
static const bool is_ascii_plus_c1_superset = true;
|
|
|
|
struct InvalidUTF8 {}; // FIXME see above error_policy needed
|
|
|
|
private:
|
|
static void check(bool condition) {
|
|
IF_UNLIKELY(!condition) throw InvalidUTF8();
|
|
}
|
|
|
|
public:
|
|
template <typename const_char8_ptr_t>
|
|
static char32_t decode(const_char8_ptr_t& p) {
|
|
char8_t b0 = *(p++);
|
|
IF_LIKELY((b0&0x80)==0) {
|
|
return b0;
|
|
}
|
|
char8_t b1 = *(p++);
|
|
check((b1&0xc0)==0x80);
|
|
IF_LIKELY((b0&0xe0)==0xc0) {
|
|
char32_t r = (b1&0x3f) | ((b0&0x1f)<<6);
|
|
check(r>=0x80);
|
|
return r;
|
|
}
|
|
char8_t b2 = *(p++);
|
|
check((b2&0xc0)==0x80);
|
|
IF_LIKELY((b0&0xf0)==0xe0) {
|
|
char32_t r = (b2&0x3f) | ((b1&0x3f)<<6) | ((b0&0x0f)<<12);
|
|
check(r>=0x800);
|
|
return r;
|
|
}
|
|
char8_t b3 = *(p++);
|
|
check((b3&0xc0)==0x80);
|
|
IF_LIKELY((b0&0xf8)==0xf0) {
|
|
char32_t r = (b3&0x3f) | ((b2&0x3f)<<6) | ((b1&0x3f)<<12) | ((b0&0x07)<<18);
|
|
check(r>=0x10000);
|
|
return r;
|
|
}
|
|
// The original definition of UTF-8 includes 5- and 6-byte encodings.
|
|
// But these correspond to values that are not valid Unicode characters,
|
|
// and they are not included in the more recent spec. Included here
|
|
// for interest:
|
|
// char8_t b4 = *(p++);
|
|
// check((b4&0xc0)==0x80);
|
|
// IF_LIKELY((b0&0xfc)==0xf8) {
|
|
// char32_t r = (b4&0x3f) | ((b3&0x3f)<<6) | ((b2&0x3f)<<12) | ((b1&0x3f)<<18) | ((b0&0x03)<<24);
|
|
// check(r>=0x200000);
|
|
// return r;
|
|
// }
|
|
// char8_t b5 = *(p++);
|
|
// check((b5&0xc0)==0x80);
|
|
// IF_LIKELY((b0&0xfe)==0xfc) {
|
|
// char32_t r = (b5&0x3f) | ((b4&0x3f)<<6) | ((b3&0x3f)<<12) | ((b2&0x3f)<<18) | ((b1&0x3f)<<24) | ((b0&0x01)<<30);
|
|
// check(r>=0x4000000);
|
|
// return r;
|
|
// }
|
|
check(false);
|
|
return 0; // not reached
|
|
}
|
|
|
|
|
|
template <typename char8_ptr_t>
|
|
static void encode(char8_ptr_t& p, char32_t c) {
|
|
IF_LIKELY(c<=0x7f) {
|
|
*(p++) = c;
|
|
} else {
|
|
IF_LIKELY(c<=0x7ff) {
|
|
*(p++) = 0xc0 | (c>>6);
|
|
} else {
|
|
IF_LIKELY(c<=0xffff) {
|
|
*(p++) = 0xe0 | (c>>12);
|
|
} else {
|
|
IF_LIKELY(c<=0x1fffff) {
|
|
*(p++) = 0xf0 | (c>>18);
|
|
} else {
|
|
// As above, disable 5- and 6-byte forms:
|
|
// IF_LIKELY(c<=0x3ffffff) {
|
|
// *(p++) = 0xf8 | (c>>24);
|
|
// } else {
|
|
// if (c&0x80000000) {
|
|
throw "can't represent this value in UTF8"; // needs error_policy
|
|
// }
|
|
// *(p++) = 0xfc | (c>>30);
|
|
// *(p++) = 0x80 | ((c>>24)&0x3f);
|
|
// }
|
|
// *(p++) = 0x80 | ((c>>18)&0x3f);
|
|
}
|
|
*(p++) = 0x80 | ((c>>12)&0x3f);
|
|
}
|
|
*(p++) = 0x80 | ((c>>6)&0x3f);
|
|
}
|
|
*(p++) = 0x80 | (c&0x3f);
|
|
}
|
|
}
|
|
|
|
|
|
// Skip forward and backward to the start of the next character.
|
|
// We know the length of a character from its first byte, so in principle we
|
|
// need only look at that to skip forward. But I guess that it's actually
|
|
// quicker to look at all bytes until a character-starting-byte is found,
|
|
// because the bit manipulation is simpler and less code is needed.
|
|
|
|
private:
|
|
static bool char_start_byte(char8_t b) {
|
|
// All non-first bytes of a UTF8 character are 10xxxxxx.
|
|
return (b&0xc0) != 0x80;
|
|
}
|
|
|
|
public:
|
|
template <typename char8_ptr_t>
|
|
static void skip_forward_char(char8_ptr_t& i) {
|
|
do {
|
|
++i;
|
|
} while (!char_start_byte(*i)); // Maybe hint this?
|
|
}
|
|
|
|
template <typename char8_ptr_t>
|
|
static void skip_backward_char(char8_ptr_t& i) {
|
|
do {
|
|
--i;
|
|
} while (!char_start_byte(*i)); // Maybe hint this?
|
|
}
|
|
|
|
template <typename const_char8_ptr_t>
|
|
static int char_length(const_char8_ptr_t& p) {
|
|
char8_t b = *p;
|
|
if ((b&0x80)==0) return 1;
|
|
if ((b&0xe0)==0xc0) return 2;
|
|
if ((b&0xf0)==0xe0) return 3;
|
|
if ((b&0xf8)==0xf0) return 4;
|
|
// As above, disable 5- and 6-byte forms:
|
|
// if ((b&0xfc)==0xf8) return 5;
|
|
// if ((b&0xfe)==0xfc) return 6;
|
|
return 0; // not reached for valid input
|
|
}
|
|
|
|
|
|
static size_t max_characters(size_t n_units) { return n_units; }
|
|
static size_t typ_characters(size_t n_units) { return n_units; }
|
|
static size_t max_units(size_t n_characters) { return 4*n_characters; }
|
|
static size_t typ_units(size_t n_characters) { return 2*n_characters; }
|
|
|
|
|
|
};
|
|
|
|
|
|
};

#endif