// utf8.hh // This file is part of libpbe; see http://anyterm.org/ // (C) 2007-2008 Philip Endecott // Distributed under the Boost Software License, Version 1.0: // // Permission is hereby granted, free of charge, to any person or organization // obtaining a copy of the software and accompanying documentation covered by // this license (the "Software") to use, reproduce, display, distribute, // execute, and transmit the Software, and to prepare derivative works of the // Software, and to permit third-parties to whom the Software is furnished to // do so, all subject to the following: // // The copyright notices in the Software and this entire statement, including // the above license grant, this restriction and the following disclaimer, // must be included in all copies of the Software, in whole or in part, and // all derivative works of the Software, unless such copies or derivative // works are solely in the form of machine-executable object code generated by // a source language processor. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT // SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE // FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER // DEALINGS IN THE SOFTWARE. #ifndef libpbe_charset_utf8_hh #define libpbe_charset_utf8_hh #include "char_t.hh" #include "charset_t.hh" #include "charset_traits.hh" #include "compiler_magic.hh" // UTF-8 Encoding and Decoding // --------------------------- // // This file provides functions that do the bit-shuffling needed for UTF-8 // encoding and decoding in the form of a charset_traits specialisation. // // TODO there should be an error_policy template parameter to the decoding // function so that the behaviour when invalid input is encountered can be // defined. // // These functions perform significantly better than the alternaitves that // I benchmarked: Ken Thompson's originals, glibc's mbsrtowcs etc. and // glibc's iconv. Performance is somewhat sensitive to optimisation, however; // the branch prediction hints that I've included make this less of a problem. namespace pbe { template <> struct charset_traits { typedef char32_t char_t; typedef char8_t unit_t; struct state_t {}; static const bool is_ascii_superset = true; static const bool is_ascii_plus_c1_superset = true; struct InvalidUTF8 {}; // FIXME see above error_policy needed private: static void check(bool condition) { IF_UNLIKELY(!condition) throw InvalidUTF8(); } public: template static char32_t decode(const_char8_ptr_t& p) { char8_t b0 = *(p++); IF_LIKELY((b0&0x80)==0) { return b0; } char8_t b1 = *(p++); check((b1&0xc0)==0x80); IF_LIKELY((b0&0xe0)==0xc0) { char32_t r = (b1&0x3f) | ((b0&0x1f)<<6); check(r>=0x80); return r; } char8_t b2 = *(p++); check((b2&0xc0)==0x80); IF_LIKELY((b0&0xf0)==0xe0) { char32_t r = (b2&0x3f) | ((b1&0x3f)<<6) | ((b0&0x0f)<<12); check(r>=0x800); return r; } char8_t b3 = *(p++); check((b3&0xc0)==0x80); IF_LIKELY((b0&0xf8)==0xf0) { char32_t r = (b3&0x3f) | ((b2&0x3f)<<6) | ((b1&0x3f)<<12) | ((b0&0x07)<<18); check(r>=0x10000); return r; } // The original definition of UTF-8 includes 5- and 6-byte encodings. // But these correspond to values that are not valid Unicode characters, // and they are not included in the more recent spec. Included here // for interest: // char8_t b4 = *(p++); // check((b4&0xc0)==0x80); // IF_LIKELY((b0&0xfc)==0xf8) { // char32_t r = (b4&0x3f) | ((b3&0x3f)<<6) | ((b2&0x3f)<<12) | ((b1&0x3f)<<18) | ((b0&0x03)<<24); // check(r>=0x200000); // return r; // } // char8_t b5 = *(p++); // check((b5&0xc0)==0x80); // IF_LIKELY((b0&0xfe)==0xfc) { // char32_t r = (b5&0x3f) | ((b4&0x3f)<<6) | ((b3&0x3f)<<12) | ((b2&0x3f)<<18) | ((b1&0x3f)<<24) | ((b0&0x01)<<30); // check(r>=0x4000000); // return r; // } check(false); return 0; // not reached } template static void encode(char8_ptr_t& p, char32_t c) { IF_LIKELY(c<=0x7f) { *(p++) = c; } else { IF_LIKELY(c<=0x7ff) { *(p++) = 0xc0 | (c>>6); } else { IF_LIKELY(c<=0xffff) { *(p++) = 0xe0 | (c>>12); } else { IF_LIKELY(c<=0x1fffff) { *(p++) = 0xf0 | (c>>18); } else { // As above, disable 5- and 6-byte forms: // IF_LIKELY(c<=0x3ffffff) { // *(p++) = 0xf8 | (c>>24); // } else { // if (c&0x80000000) { throw "can't represent this value in UTF8"; // needs error_policy // } // *(p++) = 0xfc | (c>>30); // *(p++) = 0x80 | ((c>>24)&0x3f); // } // *(p++) = 0x80 | ((c>>18)&0x3f); } *(p++) = 0x80 | ((c>>12)&0x3f); } *(p++) = 0x80 | ((c>>6)&0x3f); } *(p++) = 0x80 | (c&0x3f); } } // Skip forward and backward to the start of the next character. // We know the length of a character from its first byte, so in principle we // need only look at that to skip forward. But I guess that it's actually // quicker to look at all bytes until a character-starting-byte is found, // because the bit manipulation is simpler and less code is needed. private: static bool char_start_byte(char8_t b) { // All non-first bytes of a UTF8 character are 10xxxxxx. return (b&0xc0) != 0x80; } public: template static void skip_forward_char(char8_ptr_t& i) { do { ++i; } while (!char_start_byte(*i)); // Maybe hint this? } template static void skip_backward_char(char8_ptr_t& i) { do { --i; } while (!char_start_byte(*i)); // Maybe hint this? } template static int char_length(const_char8_ptr_t& p) { char8_t b = *p; if ((b&0x80)==0) return 1; if ((b&0xe0)==0xc0) return 2; if ((b&0xf0)==0xe0) return 3; if ((b&0xf8)==0xf0) return 4; // As above, disable 5- and 6-byte forms: // if ((b&0xfc)==0xf8) return 5; // if ((b&0xfe)==0xfc) return 6; return 0; // not reached for valid input } static size_t max_characters(size_t n_units) { return n_units; } static size_t typ_characters(size_t n_units) { return n_units; } static size_t max_units(size_t n_characters) { return 4*n_characters; } static size_t typ_units(size_t n_characters) { return 2*n_characters; } }; }; #endif