Add 'third-party/utf8cpp/' from commit '82344d605146dca6b19abc21578482cecda7f5d7'

git-subtree-dir: third-party/utf8cpp git-subtree-mainline: 002f8e36f5ef8517cb9b870351bff9da617abe23 git-subtree-split: 82344d605146dca6b19abc21578482cecda7f5d7
2025-10-28 01:33:50 +01:00 · 2019-03-15 13:33:44 +01:00 · 2019-03-15 13:33:44 +01:00 · dd15f33c17
commit dd15f33c17
parent 002f8e36f5 82344d6051
16 changed files with 3149 additions and 0 deletions
--- a/third-party/utf8cpp/.gitignore
+++ b/third-party/utf8cpp/.gitignore
@ -0,0 +1,4 @@
+# VS Code:
+.vscode/
+# Often used by CMake 
+build/
--- a/third-party/utf8cpp/CMakeLists.txt
+++ b/third-party/utf8cpp/CMakeLists.txt
@ -0,0 +1,43 @@
+cmake_minimum_required (VERSION 3.0.2)
+project (utf8cpp VERSION 2.3.6 LANGUAGES CXX)
+
+# Tests and samples are built by default only when utf8cpp is the top-level
+# project. When this directory is pulled in with add_subdirectory() the
+# defaults are Off so the parent build does not gain extra executables or
+# CTest entries; both options can still be switched on explicitly.
+if(CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR)
+    set(UTF8_IS_ROOT_PROJECT On)
+else()
+    set(UTF8_IS_ROOT_PROJECT Off)
+endif()
+
+option(UTF8_TESTS "Enable tests for UTF8-CPP" ${UTF8_IS_ROOT_PROJECT})
+option(UTF8_SAMPLES "Enable building samples for UTF8-CPP" ${UTF8_IS_ROOT_PROJECT})
+
+# Header-only library: the target only carries include directories.
+add_library(utf8cpp INTERFACE)
+target_include_directories(utf8cpp INTERFACE
+    "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/source>"
+    $<INSTALL_INTERFACE:include/utf8cpp>
+)
+add_library(utf8::cpp ALIAS utf8cpp)
+
+# Location of the exported CMake package files
+if(WIN32 AND NOT CYGWIN)
+    set(DEF_INSTALL_CMAKE_DIR CMake)
+else()
+    include(GNUInstallDirs) # define CMAKE_INSTALL_*
+    set(DEF_INSTALL_CMAKE_DIR ${CMAKE_INSTALL_LIBDIR}/cmake/utf8cpp)
+endif()
+
+install(DIRECTORY source/ DESTINATION include/utf8cpp)
+install(TARGETS utf8cpp EXPORT utf8cppConfig)
+install(EXPORT utf8cppConfig DESTINATION ${DEF_INSTALL_CMAKE_DIR})
+
+if(UTF8_SAMPLES)
+    add_executable(utf8reader ${PROJECT_SOURCE_DIR}/test_drivers/utf8reader/utf8reader.cpp)
+    add_executable(docsample ${PROJECT_SOURCE_DIR}/samples/docsample.cpp)
+
+    target_link_libraries(utf8reader PRIVATE utf8::cpp)
+    target_link_libraries(docsample PRIVATE utf8::cpp)
+endif()
+
+if(UTF8_TESTS)
+    add_executable(smoke ${PROJECT_SOURCE_DIR}/test_drivers/smoke_test/test.cpp)
+    add_executable(negative ${PROJECT_SOURCE_DIR}/test_drivers/negative/negative.cpp)
+
+    target_link_libraries(smoke PRIVATE utf8::cpp)
+    target_link_libraries(negative PRIVATE utf8::cpp)
+
+    enable_testing()
+    add_test(smoke_test smoke)
+    # The negative test reads a data file containing invalid UTF-8
+    add_test(negative_test negative ${PROJECT_SOURCE_DIR}/test_data/negative/utf8_invalid.txt)
+endif()
--- a/third-party/utf8cpp/LICENSE
+++ b/third-party/utf8cpp/LICENSE
@ -0,0 +1,23 @@
+Boost Software License - Version 1.0 - August 17th, 2003
+
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
--- a/third-party/utf8cpp/README.md
+++ b/third-party/utf8cpp/README.md
--- a/third-party/utf8cpp/samples/docsample.cpp
+++ b/third-party/utf8cpp/samples/docsample.cpp
@ -0,0 +1,52 @@
+#include "../source/utf8.h"
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <vector>
+
+
+using namespace std;
+
+// Sample driver: reads a text file line by line, reports lines that contain
+// invalid UTF-8, prints the length (in code points) of the valid part of each
+// line and round-trips that part through UTF-16.
+//
+// Returns 0 when the file was processed, 1 when the command line is wrong or
+// the file cannot be opened.
+int main(int argc, char** argv)
+{
+    // Exactly one argument is expected: the path of the file to inspect
+    if (argc != 2) {
+        cerr << "\nUsage: docsample filename\n";
+        return 1;
+    }
+    const char* test_file_path = argv[1];
+    // Open the test file (must be UTF-8 encoded)
+    ifstream fs8(test_file_path);
+    if (!fs8.is_open()) {
+        cerr << "Could not open " << test_file_path << endl;
+        return 1;
+    }
+
+    unsigned line_count = 1;
+    string line;
+    // Play with all the lines in the file
+    while (getline(fs8, line)) {
+        // check for invalid utf-8 (for a simple yes/no check, there is also utf8::is_valid function)
+        string::iterator end_it = utf8::find_invalid(line.begin(), line.end());
+        // The valid prefix is needed both for the diagnostic and for the round-trip check
+        const string valid_part(line.begin(), end_it);
+        if (end_it != line.end()) {
+            cout << "Invalid UTF-8 encoding detected at line " << line_count << "\n";
+            cout << "This part is fine: " << valid_part << "\n";
+        }
+        // Get the line length (at least for the valid part); keep the
+        // iterator difference type instead of narrowing it to int
+        const string::difference_type length = utf8::distance(line.begin(), end_it);
+        cout << "Length of line " << line_count << " is " << length <<  "\n";
+
+        // Convert it to utf-16
+        vector<unsigned short> utf16line;
+        utf8::utf8to16(line.begin(), end_it, back_inserter(utf16line));
+        // And back to utf-8;
+        string utf8line;
+        utf8::utf16to8(utf16line.begin(), utf16line.end(), back_inserter(utf8line));
+        // Confirm that the conversion went OK:
+        if (utf8line != valid_part)
+            cout << "Error in UTF-16 conversion at line: " << line_count << "\n";
+
+        line_count++;
+    }
+
+    return 0;
+}
--- a/third-party/utf8cpp/source/utf8.h
+++ b/third-party/utf8cpp/source/utf8.h
@ -0,0 +1,34 @@
+// Copyright 2006 Nemanja Trifunovic
+
+/*
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+*/
+
+
+#ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
+#define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
+
+#include "utf8/checked.h"
+#include "utf8/unchecked.h"
+
+#endif // header guard
--- a/third-party/utf8cpp/source/utf8/checked.h
+++ b/third-party/utf8cpp/source/utf8/checked.h
@ -0,0 +1,327 @@
+// Copyright 2006-2016 Nemanja Trifunovic
+
+/*
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+*/
+
+
+#ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
+#define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
+
+#include "core.h"
+#include <cstddef>
+#include <stdexcept>
+
+namespace utf8
+{
+    // Common root of every exception type this library throws; it adds
+    // nothing to ::std::exception and only serves as a catch-all type.
+    class exception : public ::std::exception {};
+
+    // Exceptions that may be thrown from the library functions.
+    // Thrown when a value is not acceptable as a Unicode code point.
+    // The value handed to the constructor is available through code_point().
+    class invalid_code_point : public exception {
+        uint32_t rejected_cp;   // value passed in by the thrower
+    public:
+        invalid_code_point(uint32_t codepoint) : rejected_cp(codepoint) {}
+        virtual const char* what() const throw()
+        {
+            return "Invalid code point";
+        }
+        uint32_t code_point() const
+        {
+            return rejected_cp;
+        }
+    };
+
+    // Thrown when an octet sequence is not valid UTF-8.
+    // The octet handed to the constructor is available through utf8_octet().
+    class invalid_utf8 : public exception {
+        uint8_t bad_octet;   // octet passed in by the thrower
+    public:
+        invalid_utf8 (uint8_t u) : bad_octet(u) {}
+        virtual const char* what() const throw()
+        {
+            return "Invalid UTF-8";
+        }
+        uint8_t utf8_octet() const
+        {
+            return bad_octet;
+        }
+    };
+
+    // Thrown when a 16-bit unit sequence is not valid UTF-16.
+    // The unit handed to the constructor is available through utf16_word().
+    class invalid_utf16 : public exception {
+        uint16_t bad_word;   // 16-bit unit passed in by the thrower
+    public:
+        invalid_utf16 (uint16_t u) : bad_word(u) {}
+        virtual const char* what() const throw()
+        {
+            return "Invalid UTF-16";
+        }
+        uint16_t utf16_word() const
+        {
+            return bad_word;
+        }
+    };
+
+    // Thrown when the input range ends before an operation could complete
+    // (for example a sequence that is cut off by the end of the range).
+    class not_enough_room : public exception {
+    public:
+        virtual const char* what() const throw()
+        {
+            return "Not enough space";
+        }
+    };
+
+    /// The library API - functions intended to be called by the users
+
+    // Encodes the code point cp as UTF-8 and writes the resulting octets
+    // through result.
+    //
+    // Returns the output iterator positioned after the last written octet.
+    // Throws invalid_code_point when cp fails is_code_point_valid.
+    template <typename octet_iterator>
+    octet_iterator append(uint32_t cp, octet_iterator result)
+    {
+        if (!utf8::internal::is_code_point_valid(cp))
+            throw invalid_code_point(cp);
+
+        // one octet: 0xxxxxxx
+        if (cp < 0x80) {
+            *(result++) = static_cast<uint8_t>(cp);
+            return result;
+        }
+
+        // Every multi-octet form ends with the low six bits of cp
+        const uint8_t last_trail = static_cast<uint8_t>((cp & 0x3f) | 0x80);
+
+        // two octets: 110xxxxx 10xxxxxx
+        if (cp < 0x800) {
+            *(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0);
+            *(result++) = last_trail;
+            return result;
+        }
+
+        // The three and four octet forms share the second-to-last trail octet
+        const uint8_t middle_trail = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
+
+        // three octets: 1110xxxx 10xxxxxx 10xxxxxx
+        if (cp < 0x10000) {
+            *(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0);
+            *(result++) = middle_trail;
+            *(result++) = last_trail;
+            return result;
+        }
+
+        // four octets: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+        *(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0);
+        *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f) | 0x80);
+        *(result++) = middle_trail;
+        *(result++) = last_trail;
+        return result;
+    }
+
+    // Copies [start, end) to out, substituting the code point `replacement`
+    // (encoded as UTF-8) for every invalid sequence found.
+    //
+    // Returns the output iterator after the last written octet.
+    // Throws not_enough_room when validation reports that the input ends
+    // before a sequence is complete.
+    template <typename octet_iterator, typename output_iterator>
+    output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
+    {
+        while (start != end) {
+            octet_iterator seq_begin = start;
+            const internal::utf_error status = utf8::internal::validate_next(start, end);
+
+            if (status == internal::UTF8_OK) {
+                // Valid sequence: validate_next advanced start past it, copy it verbatim
+                for (octet_iterator src = seq_begin; src != start; ++src)
+                    *out++ = *src;
+            }
+            else if (status == internal::NOT_ENOUGH_ROOM) {
+                throw not_enough_room();
+            }
+            else {
+                // INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE or
+                // INVALID_CODE_POINT: emit one marker and skip the offending octet
+                out = utf8::append (replacement, out);
+                ++start;
+                if (status != internal::INVALID_LEAD) {
+                    // just one replacement mark for the sequence
+                    while (start != end && utf8::internal::is_trail(*start))
+                        ++start;
+                }
+            }
+        }
+        return out;
+    }
+
+    // Convenience overload of replace_invalid that uses 0xfffd as the
+    // replacement code point.
+    template <typename octet_iterator, typename output_iterator>
+    inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
+    {
+        const uint32_t default_marker = utf8::internal::mask16(0xfffd);
+        return utf8::replace_invalid(start, end, out, default_marker);
+    }
+
+    // Decodes the UTF-8 sequence starting at it and returns its code point.
+    // On success it is advanced past the sequence.
+    //
+    // Throws not_enough_room, invalid_utf8 (carrying the octet it points at)
+    // or invalid_code_point, depending on the error validate_next reports.
+    template <typename octet_iterator>
+    uint32_t next(octet_iterator& it, octet_iterator end)
+    {
+        uint32_t decoded = 0;
+        const internal::utf_error status = utf8::internal::validate_next(it, end, decoded);
+
+        if (status == internal::UTF8_OK)
+            return decoded;
+        if (status == internal::NOT_ENOUGH_ROOM)
+            throw not_enough_room();
+        if (status == internal::INVALID_CODE_POINT)
+            throw invalid_code_point(decoded);
+
+        // INVALID_LEAD, INCOMPLETE_SEQUENCE or OVERLONG_SEQUENCE
+        throw invalid_utf8(*it);
+    }
+
+    // Same as next(), but the caller's iterator is left untouched because
+    // it is taken by value.
+    template <typename octet_iterator>
+    uint32_t peek_next(octet_iterator it, octet_iterator end)
+    {
+        const uint32_t code_point = utf8::next(it, end);
+        return code_point;
+    }
+
+    // Moves it back to the beginning of the previous UTF-8 sequence and
+    // returns that sequence's code point.
+    //
+    // Throws not_enough_room when it already equals start, and invalid_utf8
+    // when start is reached while only trail octets have been seen.
+    template <typename octet_iterator>
+    uint32_t prior(octet_iterator& it, octet_iterator start)
+    {
+        // can't do much if it == start
+        if (it == start)
+            throw not_enough_room();
+
+        octet_iterator seq_end = it;
+        // Step backwards over trail octets until a non-trail octet is found
+        for (;;) {
+            --it;
+            if (!utf8::internal::is_trail(*it))
+                break;
+            if (it == start)
+                throw invalid_utf8(*it); // error - no lead byte in the sequence
+        }
+        return utf8::peek_next(it, seq_end);
+    }
+
+    /// Deprecated in versions that include "prior"
+    ///
+    /// Moves it back to the beginning of the previous UTF-8 sequence and
+    /// returns its code point. Throws invalid_utf8 when pass_start is reached
+    /// while only trail octets have been seen.
+    template <typename octet_iterator>
+    uint32_t previous(octet_iterator& it, octet_iterator pass_start)
+    {
+        octet_iterator seq_end = it;
+        for (;;) {
+            --it;
+            if (!utf8::internal::is_trail(*it))
+                break;
+            if (it == pass_start)
+                throw invalid_utf8(*it); // error - no lead byte in the sequence
+        }
+        // Decode on a copy so that it keeps pointing at the sequence start
+        octet_iterator cursor = it;
+        return utf8::next(cursor, seq_end);
+    }
+
+    // Advances it by n code points by calling next() n times; any exception
+    // thrown by next() propagates to the caller.
+    template <typename octet_iterator, typename distance_type>
+    void advance (octet_iterator& it, distance_type n, octet_iterator end)
+    {
+        distance_type done = 0;
+        while (done < n) {
+            utf8::next(it, end);
+            ++done;
+        }
+    }
+
+    // Counts the code points in [first, last) by decoding them one by one;
+    // any exception thrown by next() propagates to the caller.
+    template <typename octet_iterator>
+    typename std::iterator_traits<octet_iterator>::difference_type
+    distance (octet_iterator first, octet_iterator last)
+    {
+        typedef typename std::iterator_traits<octet_iterator>::difference_type count_type;
+        count_type code_points = 0;
+        while (first < last) {
+            utf8::next(first, last);
+            ++code_points;
+        }
+        return code_points;
+    }
+
+    // Converts the UTF-16 range [start, end) to UTF-8, writing through result.
+    //
+    // Returns the output iterator after the last written octet.
+    // Throws invalid_utf16 for a lead surrogate that is not followed by a
+    // trail surrogate and for a trail surrogate that has no lead.
+    template <typename u16bit_iterator, typename octet_iterator>
+    octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
+    {
+        while (start != end) {
+            uint32_t cp = utf8::internal::mask16(*start++);
+
+            if (utf8::internal::is_lead_surrogate(cp)) {
+                // A lead surrogate must be followed by a trail surrogate
+                if (start == end)
+                    throw invalid_utf16(static_cast<uint16_t>(cp));
+
+                const uint32_t trail = utf8::internal::mask16(*start++);
+                if (!utf8::internal::is_trail_surrogate(trail))
+                    throw invalid_utf16(static_cast<uint16_t>(trail));
+
+                // Combine the pair into a single code point
+                cp = (cp << 10) + trail + internal::SURROGATE_OFFSET;
+            }
+            else if (utf8::internal::is_trail_surrogate(cp)) {
+                // Lone trail surrogate
+                throw invalid_utf16(static_cast<uint16_t>(cp));
+            }
+
+            result = utf8::append(cp, result);
+        }
+        return result;
+    }
+
+    // Converts the UTF-8 range [start, end) to UTF-16, writing 16-bit units
+    // through result. Returns the output iterator after the last written
+    // unit; any exception thrown by next() propagates to the caller.
+    template <typename u16bit_iterator, typename octet_iterator>
+    u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
+    {
+        while (start < end) {
+            const uint32_t cp = utf8::next(start, end);
+            if (cp <= 0xffff) {
+                // Fits in a single 16-bit unit
+                *result++ = static_cast<uint16_t>(cp);
+            }
+            else {
+                // Above 0xffff: make a surrogate pair
+                *result++ = static_cast<uint16_t>((cp >> 10)   + internal::LEAD_OFFSET);
+                *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
+            }
+        }
+        return result;
+    }
+
+    // Converts the UTF-32 range [start, end) to UTF-8 by appending every
+    // code point in turn. Returns the output iterator after the last written
+    // octet; exceptions thrown by append() propagate to the caller.
+    template <typename octet_iterator, typename u32bit_iterator>
+    octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
+    {
+        for (; start != end; ++start)
+            result = utf8::append(*start, result);
+
+        return result;
+    }
+
+    // Converts the UTF-8 range [start, end) to UTF-32, writing one code point
+    // per decoded sequence through result. Returns the output iterator after
+    // the last written code point; exceptions thrown by next() propagate.
+    template <typename octet_iterator, typename u32bit_iterator>
+    u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
+    {
+        while (start < end) {
+            const uint32_t cp = utf8::next(start, end);
+            *result++ = cp;
+        }
+
+        return result;
+    }
+
+    // The iterator class
+    //
+    // Bidirectional iterator adaptor that walks an octet range one code point
+    // at a time. It wraps the current octet position together with the bounds
+    // of the range, so dereferencing and stepping can be validated through
+    // utf8::next and utf8::prior (whose exceptions propagate to the caller).
+    template <typename octet_iterator>
+    class iterator {
+      octet_iterator it;
+      octet_iterator range_start;
+      octet_iterator range_end;
+      public:
+      // Iterator traits are declared as member typedefs instead of being
+      // inherited from std::iterator, which is deprecated since C++17.
+      // The types are the ones std::iterator<std::bidirectional_iterator_tag,
+      // uint32_t> provided.
+      typedef uint32_t value_type;
+      typedef uint32_t* pointer;
+      typedef uint32_t& reference;
+      typedef std::ptrdiff_t difference_type;
+      typedef std::bidirectional_iterator_tag iterator_category;
+
+      // Value-initialize the wrapped iterators so that a default-constructed
+      // object never holds indeterminate values (e.g. for raw pointers).
+      iterator () : it(), range_start(), range_end() {}
+      // Creates an iterator at octet_it inside [rangestart, rangeend].
+      // Throws std::out_of_range when octet_it lies outside that range.
+      explicit iterator (const octet_iterator& octet_it,
+                         const octet_iterator& rangestart,
+                         const octet_iterator& rangeend) :
+               it(octet_it), range_start(rangestart), range_end(rangeend)
+      {
+          if (it < range_start || it > range_end)
+              throw std::out_of_range("Invalid utf-8 iterator position");
+      }
+      // the default "big three" are OK
+      // Returns the underlying octet iterator
+      octet_iterator base () const { return it; }
+      // Decodes the code point at the current position without moving
+      uint32_t operator * () const
+      {
+          octet_iterator temp = it;
+          return utf8::next(temp, range_end);
+      }
+      // Iterators are only comparable when they were created over the same
+      // range; otherwise std::logic_error is thrown.
+      bool operator == (const iterator& rhs) const
+      {
+          if (range_start != rhs.range_start || range_end != rhs.range_end)
+              throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
+          return (it == rhs.it);
+      }
+      bool operator != (const iterator& rhs) const
+      {
+          return !(operator == (rhs));
+      }
+      // Step forward by one code point
+      iterator& operator ++ ()
+      {
+          utf8::next(it, range_end);
+          return *this;
+      }
+      iterator operator ++ (int)
+      {
+          iterator temp = *this;
+          utf8::next(it, range_end);
+          return temp;
+      }
+      // Step backward by one code point
+      iterator& operator -- ()
+      {
+          utf8::prior(it, range_start);
+          return *this;
+      }
+      iterator operator -- (int)
+      {
+          iterator temp = *this;
+          utf8::prior(it, range_start);
+          return temp;
+      }
+    }; // class iterator
+
+} // namespace utf8
+
+#endif //header guard
+
+
--- a/third-party/utf8cpp/source/utf8/core.h
+++ b/third-party/utf8cpp/source/utf8/core.h
@ -0,0 +1,332 @@
+// Copyright 2006 Nemanja Trifunovic
+
+/*
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+*/
+
+
+#ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
+#define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
+
+#include <iterator>
+
+namespace utf8
+{
+    // The typedefs for 8-bit, 16-bit and 32-bit unsigned integers
+    // You may need to change them to match your system.
+    // These typedefs have the same names as ones from cstdint, or boost/cstdint
+    typedef unsigned char   uint8_t;    // one UTF-8 octet
+    typedef unsigned short  uint16_t;   // one UTF-16 code unit
+    typedef unsigned int    uint32_t;   // one code point; NOTE(review): presumes unsigned int has at least 32 bits on the target
+
+// Helper code - not intended to be directly called by the library users. May be changed at any time
+namespace internal
+{
+    // Unicode constants
+    // Leading (high) surrogates: 0xd800 - 0xdbff
+    // Trailing (low) surrogates: 0xdc00 - 0xdfff
+    const uint16_t LEAD_SURROGATE_MIN  = 0xd800u;
+    const uint16_t LEAD_SURROGATE_MAX  = 0xdbffu;
+    const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u;
+    const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu;
+    const uint16_t LEAD_OFFSET         = LEAD_SURROGATE_MIN - (0x10000 >> 10); // 0xd7c0; added to (cp >> 10) to build a lead surrogate
+    const uint32_t SURROGATE_OFFSET    = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN; // relies on unsigned wrap-around (0xfca02400 with 32-bit unsigned int); added when a surrogate pair is recombined
+
+    // Maximum valid value for a Unicode code point
+    const uint32_t CODE_POINT_MAX      = 0x0010ffffu;
+
+    // Returns the low 8 bits of oc as an unsigned octet, so that values
+    // read through a signed char type compare as 0..255.
+    template<typename octet_type>
+    inline uint8_t mask8(octet_type oc)
+    {
+        return static_cast<uint8_t>(oc & 0xff);
+    }
+    // Returns the low 16 bits of oc as an unsigned 16-bit value.
+    template<typename u16_type>
+    inline uint16_t mask16(u16_type oc)
+    {
+        return static_cast<uint16_t>(oc & 0xffff);
+    }
+    // True when oc is a UTF-8 trail (continuation) octet, i.e. its two
+    // most significant bits are 10.
+    template<typename octet_type>
+    inline bool is_trail(octet_type oc)
+    {
+        return ((utf8::internal::mask8(oc) & 0xc0) == 0x80);
+    }
+
+    // True when cp lies in the lead surrogate range
+    // [LEAD_SURROGATE_MIN, LEAD_SURROGATE_MAX].
+    template <typename u16>
+    inline bool is_lead_surrogate(u16 cp)
+    {
+        if (!(cp >= LEAD_SURROGATE_MIN))
+            return false;
+        return (cp <= LEAD_SURROGATE_MAX);
+    }
+
+    // True when cp lies in the trail surrogate range
+    // [TRAIL_SURROGATE_MIN, TRAIL_SURROGATE_MAX].
+    template <typename u16>
+    inline bool is_trail_surrogate(u16 cp)
+    {
+        if (!(cp >= TRAIL_SURROGATE_MIN))
+            return false;
+        return (cp <= TRAIL_SURROGATE_MAX);
+    }
+
+    // True when cp is any surrogate, lead or trail
+    // (range [LEAD_SURROGATE_MIN, TRAIL_SURROGATE_MAX]).
+    template <typename u16>
+    inline bool is_surrogate(u16 cp)
+    {
+        if (!(cp >= LEAD_SURROGATE_MIN))
+            return false;
+        return (cp <= TRAIL_SURROGATE_MAX);
+    }
+
+    // True when cp may be encoded: it does not exceed CODE_POINT_MAX and is
+    // not a surrogate.
+    template <typename u32>
+    inline bool is_code_point_valid(u32 cp)
+    {
+        if (!(cp <= CODE_POINT_MAX))
+            return false;
+        return !utf8::internal::is_surrogate(cp);
+    }
+
+    // Returns the length (1 to 4) of the UTF-8 sequence announced by the
+    // lead octet at lead_it, or 0 when that octet is not a valid lead.
+    template <typename octet_iterator>
+    inline typename std::iterator_traits<octet_iterator>::difference_type
+    sequence_length(octet_iterator lead_it)
+    {
+        const uint8_t lead = utf8::internal::mask8(*lead_it);
+
+        if (lead < 0x80)                // 0xxxxxxx
+            return 1;
+        if ((lead & 0xe0) == 0xc0)      // 110xxxxx
+            return 2;
+        if ((lead & 0xf0) == 0xe0)      // 1110xxxx
+            return 3;
+        if ((lead & 0xf8) == 0xf0)      // 11110xxx
+            return 4;
+
+        return 0;
+    }
+
+    // True when cp, decoded from a sequence of `length` octets, does not use
+    // the length expected for its value: 1 octet below 0x80, 2 below 0x800,
+    // 3 below 0x10000. Larger values are never reported as overlong.
+    template <typename octet_difference_type>
+    inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length)
+    {
+        if (cp < 0x80)
+            return (length != 1);
+        if (cp < 0x800)
+            return (length != 2);
+        if (cp < 0x10000)
+            return (length != 3);
+
+        return false;
+    }
+
+    // Result of decoding/validating a single UTF-8 sequence.
+    enum utf_error {
+        UTF8_OK,              // the sequence was decoded and passed all checks
+        NOT_ENOUGH_ROOM,      // the range ended before the sequence was complete
+        INVALID_LEAD,         // the first octet is not a valid lead octet
+        INCOMPLETE_SEQUENCE,  // an expected trail octet is not a trail octet
+        OVERLONG_SEQUENCE,    // the octet count does not match the code point's value
+        INVALID_CODE_POINT    // decoded value is a surrogate or above CODE_POINT_MAX
+    };
+
+    /// Helper for get_sequence_x
+    /// Advances it by one octet and checks that the new position is inside
+    /// the range and holds a trail octet.
+    template <typename octet_iterator>
+    utf_error increase_safely(octet_iterator& it, octet_iterator end)
+    {
+        ++it;
+        if (it == end)
+            return NOT_ENOUGH_ROOM;
+
+        return utf8::internal::is_trail(*it) ? UTF8_OK : INCOMPLETE_SEQUENCE;
+    }
+
+    #define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;}    
+
+    /// get_sequence_x functions decode utf-8 sequences of the length x
+    // Decodes a one-octet sequence: the code point is the octet itself.
+    // it is left on that octet. Returns NOT_ENOUGH_ROOM for an empty range.
+    template <typename octet_iterator>
+    utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t& code_point)
+    {
+        if (it != end) {
+            code_point = utf8::internal::mask8(*it);
+            return UTF8_OK;
+        }
+        return NOT_ENOUGH_ROOM;
+    }
+
+    // Decodes a two-octet sequence starting at it. On success it is left on
+    // the last octet of the sequence. code_point is written as soon as the
+    // lead octet is read, so it may hold a partial value on failure.
+    template <typename octet_iterator>
+    utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t& code_point)
+    {
+        if (it == end)
+            return NOT_ENOUGH_ROOM;
+
+        // Lead octet
+        code_point = utf8::internal::mask8(*it);
+
+        // Move to the trail octet, bailing out if it is missing or malformed
+        utf_error step = increase_safely(it, end);
+        if (step != UTF8_OK)
+            return step;
+
+        // 5 payload bits from the lead octet, 6 from the trail octet
+        code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f);
+
+        return UTF8_OK;
+    }
+
+    // Decodes a three-octet sequence starting at it. On success it is left
+    // on the last octet of the sequence. code_point is updated as octets are
+    // read, so it may hold a partial value on failure.
+    template <typename octet_iterator>
+    utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t& code_point)
+    {
+        if (it == end)
+            return NOT_ENOUGH_ROOM;
+
+        // Lead octet
+        code_point = utf8::internal::mask8(*it);
+
+        // First trail octet
+        utf_error step = increase_safely(it, end);
+        if (step != UTF8_OK)
+            return step;
+
+        code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff);
+
+        // Second trail octet
+        step = increase_safely(it, end);
+        if (step != UTF8_OK)
+            return step;
+
+        code_point += (*it) & 0x3f;
+
+        return UTF8_OK;
+    }
+
+    // Decodes a four-octet sequence starting at it. On success it is left on
+    // the last octet of the sequence. code_point is updated as octets are
+    // read, so it may hold a partial value on failure.
+    template <typename octet_iterator>
+    utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t& code_point)
+    {
+        if (it == end)
+            return NOT_ENOUGH_ROOM;
+
+        // Lead octet
+        code_point = utf8::internal::mask8(*it);
+
+        // First trail octet
+        utf_error step = increase_safely(it, end);
+        if (step != UTF8_OK)
+            return step;
+
+        code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff);
+
+        // Second trail octet
+        step = increase_safely(it, end);
+        if (step != UTF8_OK)
+            return step;
+
+        code_point += (utf8::internal::mask8(*it) << 6) & 0xfff;
+
+        // Third trail octet
+        step = increase_safely(it, end);
+        if (step != UTF8_OK)
+            return step;
+
+        code_point += (*it) & 0x3f;
+
+        return UTF8_OK;
+    }
+
+    #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR
+
+    // Decodes and validates the UTF-8 sequence that starts at it.
+    //
+    // On success returns UTF8_OK, stores the decoded value in code_point and
+    // advances it past the sequence. On failure it is restored to its
+    // original position and the error is returned. For INVALID_CODE_POINT the
+    // rejected value is also stored in code_point, so that callers can
+    // report which value was refused; for every other error code_point is
+    // left untouched.
+    template <typename octet_iterator>
+    utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t& code_point)
+    {
+        if (it == end)
+            return NOT_ENOUGH_ROOM;
+
+        // Save the original value of it so we can go back in case of failure
+        // Of course, it does not make much sense with i.e. stream iterators
+        octet_iterator original_it = it;
+
+        uint32_t cp = 0;
+        // Determine the sequence length based on the lead octet
+        typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;
+        const octet_difference_type length = utf8::internal::sequence_length(it);
+
+        // Get trail octets and calculate the code point
+        utf_error err = UTF8_OK;
+        switch (length) {
+            case 0:
+                return INVALID_LEAD;
+            case 1:
+                err = utf8::internal::get_sequence_1(it, end, cp);
+                break;
+            case 2:
+                err = utf8::internal::get_sequence_2(it, end, cp);
+                break;
+            case 3:
+                err = utf8::internal::get_sequence_3(it, end, cp);
+                break;
+            case 4:
+                err = utf8::internal::get_sequence_4(it, end, cp);
+                break;
+        }
+
+        if (err == UTF8_OK) {
+            // Decoding succeeded. Now, security checks...
+            if (!utf8::internal::is_code_point_valid(cp)) {
+                // Hand the rejected value back so it can be reported
+                code_point = cp;
+                err = INVALID_CODE_POINT;
+            }
+            else if (utf8::internal::is_overlong_sequence(cp, length)) {
+                err = OVERLONG_SEQUENCE;
+            }
+            else {
+                // Passed! Return here.
+                code_point = cp;
+                ++it;
+                return UTF8_OK;
+            }
+        }
+
+        // Failure branch - restore the original value of the iterator
+        it = original_it;
+        return err;
+    }
+
+    // Overload for callers that only need the validation result; the decoded
+    // code point is thrown away.
+    template <typename octet_iterator>
+    inline utf_error validate_next(octet_iterator& it, octet_iterator end)
+    {
+        uint32_t discarded = 0;
+        return utf8::internal::validate_next(it, end, discarded);
+    }
+
+} // namespace internal
+
+    /// The library API - functions intended to be called by the users
+
+    // Byte order mark: the three octets that encode U+FEFF in UTF-8 (compared against by starts_with_bom and is_bom)
+    const uint8_t bom[] = {0xef, 0xbb, 0xbf};
+
+    // Returns an iterator to the start of the first sequence in [start, end)
+    // that fails validation, or end when the whole range is valid UTF-8.
+    template <typename octet_iterator>
+    octet_iterator find_invalid(octet_iterator start, octet_iterator end)
+    {
+        octet_iterator cursor = start;
+        while (cursor != end) {
+            // validate_next advances cursor on success and leaves it in place on error
+            if (utf8::internal::validate_next(cursor, end) != internal::UTF8_OK)
+                break;
+        }
+        return cursor;
+    }
+
+    // True when find_invalid reports no invalid sequence in [start, end).
+    template <typename octet_iterator>
+    inline bool is_valid(octet_iterator start, octet_iterator end)
+    {
+        const octet_iterator first_bad = utf8::find_invalid(start, end);
+        return (first_bad == end);
+    }
+
+    // True when [it, end) begins with the three octets of bom. The range
+    // bound is checked before every read, so ranges shorter than three
+    // octets yield false.
+    template <typename octet_iterator>
+    inline bool starts_with_bom (octet_iterator it, octet_iterator end)
+    {
+        if (it == end || utf8::internal::mask8(*it++) != bom[0])
+            return false;
+        if (it == end || utf8::internal::mask8(*it++) != bom[1])
+            return false;
+        return (it != end) && (utf8::internal::mask8(*it) == bom[2]);
+    }
+	
+    //Deprecated in release 2.3
+    // True when the octets at it match bom. Up to three octets are read
+    // without any range check; the caller must make sure they exist.
+    template <typename octet_iterator>
+    inline bool is_bom (octet_iterator it)
+    {
+        if (utf8::internal::mask8(*it++) != bom[0])
+            return false;
+        if (utf8::internal::mask8(*it++) != bom[1])
+            return false;
+        return (utf8::internal::mask8(*it) == bom[2]);
+    }
+} // namespace utf8
+
+#endif // header guard
+
+
--- a/third-party/utf8cpp/source/utf8/unchecked.h
+++ b/third-party/utf8cpp/source/utf8/unchecked.h
@ -0,0 +1,228 @@
+// Copyright 2006 Nemanja Trifunovic
+
+/*
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+*/
+
+
+#ifndef UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
+#define UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
+
+#include <cstddef>
+#include <iterator>
+
+#include "core.h"
+
+namespace utf8
+{
+    namespace unchecked 
+    {
+        // Encodes code point `cp` as one to four UTF-8 octets written through
+        // `result` and returns the iterator advanced past the last octet
+        // written. `cp` is not checked for validity; its magnitude alone
+        // selects the sequence length.
+        template <typename octet_iterator>
+        octet_iterator append(uint32_t cp, octet_iterator result)
+        {
+            if (cp < 0x80) {
+                // one octet: the value is stored unchanged
+                *(result++) = static_cast<uint8_t>(cp);
+                return result;
+            }
+            if (cp < 0x800) {
+                // two octets: 110xxxxx 10xxxxxx
+                *(result++) = static_cast<uint8_t>((cp >> 6)          | 0xc0);
+                *(result++) = static_cast<uint8_t>((cp & 0x3f)        | 0x80);
+                return result;
+            }
+            if (cp < 0x10000) {
+                // three octets: 1110xxxx 10xxxxxx 10xxxxxx
+                *(result++) = static_cast<uint8_t>((cp >> 12)         | 0xe0);
+                *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
+                *(result++) = static_cast<uint8_t>((cp & 0x3f)        | 0x80);
+                return result;
+            }
+            // everything else: four octets, 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+            *(result++) = static_cast<uint8_t>((cp >> 18)         | 0xf0);
+            *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f)| 0x80);
+            *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
+            *(result++) = static_cast<uint8_t>((cp & 0x3f)        | 0x80);
+            return result;
+        }
+
+        // Decodes the UTF-8 sequence that begins at `it`, returns its code
+        // point and leaves `it` one past the last octet consumed. The octet
+        // count comes from internal::sequence_length; nothing is validated and
+        // there is no end-of-range check.
+        template <typename octet_iterator>
+        uint32_t next(octet_iterator& it)
+        {
+            typedef typename std::iterator_traits<octet_iterator>::difference_type length_type;
+
+            uint32_t code_point = utf8::internal::mask8(*it);
+            const length_type octets = utf8::internal::sequence_length(it);
+
+            if (octets == 2) {
+                // lead octet supplies the high bits, the trail octet the low six
+                ++it;
+                code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f);
+            }
+            else if (octets == 3) {
+                ++it;
+                code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff);
+                ++it;
+                code_point += (*it) & 0x3f;
+            }
+            else if (octets == 4) {
+                ++it;
+                code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff);
+                ++it;
+                code_point += (utf8::internal::mask8(*it) << 6) & 0xfff;
+                ++it;
+                code_point += (*it) & 0x3f;
+            }
+            // Any other length (including 1): the masked lead octet is the result.
+
+            ++it;  // step past the last octet of the sequence
+            return code_point;
+        }
+
+        // Returns the code point that starts at `it` without moving the
+        // caller's iterator: `it` is received by value, so only this local
+        // copy is advanced by next().
+        template <typename octet_iterator>
+        uint32_t peek_next(octet_iterator it)
+        {
+            const uint32_t code_point = utf8::unchecked::next(it);
+            return code_point;
+        }
+
+        // Moves `it` backwards to the nearest octet that is not a trail octet
+        // and returns the code point decoded from there. `it` is left on that
+        // octet. There is no lower bound: the backwards scan only stops when
+        // a non-trail octet is seen.
+        template <typename octet_iterator>
+        uint32_t prior(octet_iterator& it)
+        {
+            do {
+                --it;
+            } while (utf8::internal::is_trail(*it));
+
+            // Decode from a copy so that `it` stays on the start of the sequence.
+            octet_iterator lead = it;
+            return utf8::unchecked::next(lead);
+        }
+
+        // Deprecated alias of prior(), kept only for the sake of consistency
+        // (see utf8::previous). Forwards to utf8::unchecked::prior unchanged.
+        template <typename octet_iterator>
+        inline uint32_t previous(octet_iterator& it)
+        {
+            const uint32_t code_point = utf8::unchecked::prior(it);
+            return code_point;
+        }
+
+        // Moves `it` forward by `n` code points by calling next() `n` times
+        // and discarding the decoded values. Does nothing when `n` is not
+        // greater than zero.
+        template <typename octet_iterator, typename distance_type>
+        void advance (octet_iterator& it, distance_type n)
+        {
+            distance_type steps_taken = 0;
+            while (steps_taken < n) {
+                utf8::unchecked::next(it);
+                ++steps_taken;
+            }
+        }
+
+        // Counts the code points in [first, last) by decoding with next()
+        // until `first` is no longer less than `last`. The iterators are
+        // compared with operator<, so the iterator type must provide it.
+        template <typename octet_iterator>
+        typename std::iterator_traits<octet_iterator>::difference_type
+        distance (octet_iterator first, octet_iterator last)
+        {
+            typedef typename std::iterator_traits<octet_iterator>::difference_type count_type;
+
+            count_type code_points = 0;
+            while (first < last) {
+                utf8::unchecked::next(first);
+                ++code_points;
+            }
+            return code_points;
+        }
+
+        // Converts the UTF-16 code units in [start, end) to UTF-8 written
+        // through `result`, and returns the advanced output iterator. When a
+        // unit is a lead surrogate, the following unit is read as its trail
+        // surrogate without checking it against `end` or validating it.
+        template <typename u16bit_iterator, typename octet_iterator>
+        octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
+        {
+            while (start != end) {
+                uint32_t code_point = utf8::internal::mask16(*start);
+                ++start;
+
+                // Surrogate pair: fold the two units into one code point.
+                if (utf8::internal::is_lead_surrogate(code_point)) {
+                    const uint32_t trail = utf8::internal::mask16(*start);
+                    ++start;
+                    code_point = (code_point << 10) + trail + internal::SURROGATE_OFFSET;
+                }
+
+                result = utf8::unchecked::append(code_point, result);
+            }
+            return result;
+        }
+
+        // Converts the UTF-8 octets in [start, end) to UTF-16 code units
+        // written through `result`, and returns the advanced output iterator.
+        // Code points above 0xffff are emitted as a lead/trail surrogate pair;
+        // all others as a single unit.
+        template <typename u16bit_iterator, typename octet_iterator>
+        u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
+        {
+            while (start < end) {
+                const uint32_t code_point = utf8::unchecked::next(start);
+                if (code_point <= 0xffff) {
+                    *result++ = static_cast<uint16_t>(code_point);
+                    continue;
+                }
+                // needs a surrogate pair
+                *result++ = static_cast<uint16_t>((code_point >> 10)   + internal::LEAD_OFFSET);
+                *result++ = static_cast<uint16_t>((code_point & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
+            }
+            return result;
+        }
+
+        // Encodes every code point in [start, end) as UTF-8 through `result`
+        // using unchecked::append, and returns the advanced output iterator.
+        template <typename octet_iterator, typename u32bit_iterator>
+        octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
+        {
+            for (; start != end; ++start)
+                result = utf8::unchecked::append(*start, result);
+
+            return result;
+        }
+
+        // Decodes the UTF-8 octets in [start, end) with unchecked::next and
+        // writes one code point per sequence through `result`. Returns the
+        // advanced output iterator.
+        template <typename octet_iterator, typename u32bit_iterator>
+        u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
+        {
+            while (start < end) {
+                const uint32_t code_point = utf8::unchecked::next(start);
+                *result++ = code_point;
+            }
+
+            return result;
+        }
+
+        // The iterator class
+        //
+        // Read-only bidirectional iterator adaptor that walks an octet sequence
+        // one code point at a time. All decoding goes through the unchecked
+        // helpers, so nothing is validated and no range bounds are tracked.
+        template <typename octet_iterator>
+          class iterator {
+            octet_iterator it;
+            public:
+            // Iterator traits, spelled out as member typedefs. They are the
+            // same types the former base class
+            // std::iterator<std::bidirectional_iterator_tag, uint32_t> supplied;
+            // that base is deprecated since C++17 and triggers warnings there.
+            typedef uint32_t value_type;
+            typedef uint32_t* pointer;
+            typedef uint32_t& reference;
+            typedef std::ptrdiff_t difference_type;
+            typedef std::bidirectional_iterator_tag iterator_category;
+
+            iterator () {}
+            explicit iterator (const octet_iterator& octet_it): it(octet_it) {}
+            // the default "big three" are OK
+
+            // Returns a copy of the wrapped octet iterator at its current position.
+            octet_iterator base () const { return it; }
+
+            // Decodes the code point at the current position; the wrapped
+            // iterator itself is not moved (a copy is advanced instead).
+            uint32_t operator * () const
+            {
+                octet_iterator temp = it;
+                return utf8::unchecked::next(temp);
+            }
+            bool operator == (const iterator& rhs) const 
+            { 
+                return (it == rhs.it);
+            }
+            bool operator != (const iterator& rhs) const
+            {
+                return !(operator == (rhs));
+            }
+            // Pre-increment: skip the octets of the current sequence, as
+            // reported by internal::sequence_length.
+            iterator& operator ++ () 
+            {
+                ::std::advance(it, utf8::internal::sequence_length(it));
+                return *this;
+            }
+            // Post-increment: same step, but returns the position before it.
+            iterator operator ++ (int)
+            {
+                iterator temp = *this;
+                ::std::advance(it, utf8::internal::sequence_length(it));
+                return temp;
+            }  
+            // Pre-decrement: back up to the start of the preceding sequence.
+            iterator& operator -- ()
+            {
+                utf8::unchecked::prior(it);
+                return *this;
+            }
+            // Post-decrement: same step, but returns the position before it.
+            iterator operator -- (int)
+            {
+                iterator temp = *this;
+                utf8::unchecked::prior(it);
+                return temp;
+            }
+          }; // class iterator
+
+    } // namespace utf8::unchecked
+} // namespace utf8 
+
+
+#endif // header guard
+
--- a/third-party/utf8cpp/test_data/negative/utf8_invalid.txt
+++ b/third-party/utf8cpp/test_data/negative/utf8_invalid.txt
--- a/third-party/utf8cpp/test_data/utf8samples/UTF-8-demo.txt
+++ b/third-party/utf8cpp/test_data/utf8samples/UTF-8-demo.txt
@ -0,0 +1,212 @@
+
+UTF-8 encoded sample plain-text file
+‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾
+
+Markus Kuhn [ˈmaʳkʊs kuːn] <http://www.cl.cam.ac.uk/~mgk25/> — 2002-07-25
+
+
+The ASCII compatible UTF-8 encoding used in this plain-text file
+is defined in Unicode, ISO 10646-1, and RFC 2279.
+
+
+Using Unicode/UTF-8, you can write in emails and source code things such as
+
+Mathematics and sciences:
+
+  ∮ E⋅da = Q,  n → ∞, ∑ f(i) = ∏ g(i),      ⎧⎡⎛┌─────┐⎞⎤⎫
+                                            ⎪⎢⎜│a²+b³ ⎟⎥⎪
+  ∀x∈ℝ: ⌈x⌉ = −⌊−x⌋, α ∧ ¬β = ¬(¬α ∨ β),    ⎪⎢⎜│───── ⎟⎥⎪
+                                            ⎪⎢⎜⎷ c₈   ⎟⎥⎪
+  ℕ ⊆ ℕ₀ ⊂ ℤ ⊂ ℚ ⊂ ℝ ⊂ ℂ,                   ⎨⎢⎜       ⎟⎥⎬
+                                            ⎪⎢⎜ ∞     ⎟⎥⎪
+  ⊥ < a ≠ b ≡ c ≤ d ≪ ⊤ ⇒ (⟦A⟧ ⇔ ⟪B⟫),      ⎪⎢⎜ ⎲     ⎟⎥⎪
+                                            ⎪⎢⎜ ⎳aⁱ-bⁱ⎟⎥⎪
+  2H₂ + O₂ ⇌ 2H₂O, R = 4.7 kΩ, ⌀ 200 mm     ⎩⎣⎝i=1    ⎠⎦⎭
+
+Linguistics and dictionaries:
+
+  ði ıntəˈnæʃənəl fəˈnɛtık əsoʊsiˈeıʃn
+  Y [ˈʏpsilɔn], Yen [jɛn], Yoga [ˈjoːgɑ]
+
+APL:
+
+  ((V⍳V)=⍳⍴V)/V←,V    ⌷←⍳→⍴∆∇⊃‾⍎⍕⌈
+
+Nicer typography in plain text files:
+
+  ╔══════════════════════════════════════════╗
+  ║                                          ║
+  ║   • ‘single’ and “double” quotes         ║
+  ║                                          ║
+  ║   • Curly apostrophes: “We’ve been here” ║
+  ║                                          ║
+  ║   • Latin-1 apostrophe and accents: '´`  ║
+  ║                                          ║
+  ║   • ‚deutsche‘ „Anführungszeichen“       ║
+  ║                                          ║
+  ║   • †, ‡, ‰, •, 3–4, —, −5/+5, ™, …      ║
+  ║                                          ║
+  ║   • ASCII safety test: 1lI|, 0OD, 8B     ║
+  ║                      ╭─────────╮         ║
+  ║   • the euro symbol: │ 14.95 € │         ║
+  ║                      ╰─────────╯         ║
+  ╚══════════════════════════════════════════╝
+
+Combining characters:
+
+  STARGΛ̊TE SG-1, a = v̇ = r̈, a⃑ ⊥ b⃑
+
+Greek (in Polytonic):
+
+  The Greek anthem:
+
+  Σὲ γνωρίζω ἀπὸ τὴν κόψη
+  τοῦ σπαθιοῦ τὴν τρομερή,
+  σὲ γνωρίζω ἀπὸ τὴν ὄψη
+  ποὺ μὲ βία μετράει τὴ γῆ.
+
+  ᾿Απ᾿ τὰ κόκκαλα βγαλμένη
+  τῶν ῾Ελλήνων τὰ ἱερά
+  καὶ σὰν πρῶτα ἀνδρειωμένη
+  χαῖρε, ὦ χαῖρε, ᾿Ελευθεριά!
+
+  From a speech of Demosthenes in the 4th century BC:
+
+  Οὐχὶ ταὐτὰ παρίσταταί μοι γιγνώσκειν, ὦ ἄνδρες ᾿Αθηναῖοι,
+  ὅταν τ᾿ εἰς τὰ πράγματα ἀποβλέψω καὶ ὅταν πρὸς τοὺς
+  λόγους οὓς ἀκούω· τοὺς μὲν γὰρ λόγους περὶ τοῦ
+  τιμωρήσασθαι Φίλιππον ὁρῶ γιγνομένους, τὰ δὲ πράγματ᾿
+  εἰς τοῦτο προήκοντα,  ὥσθ᾿ ὅπως μὴ πεισόμεθ᾿ αὐτοὶ
+  πρότερον κακῶς σκέψασθαι δέον. οὐδέν οὖν ἄλλο μοι δοκοῦσιν
+  οἱ τὰ τοιαῦτα λέγοντες ἢ τὴν ὑπόθεσιν, περὶ ἧς βουλεύεσθαι,
+  οὐχὶ τὴν οὖσαν παριστάντες ὑμῖν ἁμαρτάνειν. ἐγὼ δέ, ὅτι μέν
+  ποτ᾿ ἐξῆν τῇ πόλει καὶ τὰ αὑτῆς ἔχειν ἀσφαλῶς καὶ Φίλιππον
+  τιμωρήσασθαι, καὶ μάλ᾿ ἀκριβῶς οἶδα· ἐπ᾿ ἐμοῦ γάρ, οὐ πάλαι
+  γέγονεν ταῦτ᾿ ἀμφότερα· νῦν μέντοι πέπεισμαι τοῦθ᾿ ἱκανὸν
+  προλαβεῖν ἡμῖν εἶναι τὴν πρώτην, ὅπως τοὺς συμμάχους
+  σώσομεν. ἐὰν γὰρ τοῦτο βεβαίως ὑπάρξῃ, τότε καὶ περὶ τοῦ
+  τίνα τιμωρήσεταί τις καὶ ὃν τρόπον ἐξέσται σκοπεῖν· πρὶν δὲ
+  τὴν ἀρχὴν ὀρθῶς ὑποθέσθαι, μάταιον ἡγοῦμαι περὶ τῆς
+  τελευτῆς ὁντινοῦν ποιεῖσθαι λόγον.
+
+  Δημοσθένους, Γ´ ᾿Ολυνθιακὸς
+
+Georgian:
+
+  From a Unicode conference invitation:
+
+  გთხოვთ ახლავე გაიაროთ რეგისტრაცია Unicode-ის მეათე საერთაშორისო
+  კონფერენციაზე დასასწრებად, რომელიც გაიმართება 10-12 მარტს,
+  ქ. მაინცში, გერმანიაში. კონფერენცია შეჰკრებს ერთად მსოფლიოს
+  ექსპერტებს ისეთ დარგებში როგორიცაა ინტერნეტი და Unicode-ი,
+  ინტერნაციონალიზაცია და ლოკალიზაცია, Unicode-ის გამოყენება
+  ოპერაციულ სისტემებსა, და გამოყენებით პროგრამებში, შრიფტებში,
+  ტექსტების დამუშავებასა და მრავალენოვან კომპიუტერულ სისტემებში.
+
+Russian:
+
+  From a Unicode conference invitation:
+
+  Зарегистрируйтесь сейчас на Десятую Международную Конференцию по
+  Unicode, которая состоится 10-12 марта 1997 года в Майнце в Германии.
+  Конференция соберет широкий круг экспертов по  вопросам глобального
+  Интернета и Unicode, локализации и интернационализации, воплощению и
+  применению Unicode в различных операционных системах и программных
+  приложениях, шрифтах, верстке и многоязычных компьютерных системах.
+
+Thai (UCS Level 2):
+
+  Excerpt from a poetry on The Romance of The Three Kingdoms (a Chinese
+  classic 'San Gua'):
+
+  [----------------------------|------------------------]
+    ๏ แผ่นดินฮั่นเสื่อมโทรมแสนสังเวช  พระปกเกศกองบู๊กู้ขึ้นใหม่
+  สิบสองกษัตริย์ก่อนหน้าแลถัดไป       สององค์ไซร้โง่เขลาเบาปัญญา
+    ทรงนับถือขันทีเป็นที่พึ่ง           บ้านเมืองจึงวิปริตเป็นนักหนา
+  โฮจิ๋นเรียกทัพทั่วหัวเมืองมา         หมายจะฆ่ามดชั่วตัวสำคัญ
+    เหมือนขับไสไล่เสือจากเคหา      รับหมาป่าเข้ามาเลยอาสัญ
+  ฝ่ายอ้องอุ้นยุแยกให้แตกกัน          ใช้สาวนั้นเป็นชนวนชื่นชวนใจ
+    พลันลิฉุยกุยกีกลับก่อเหตุ          ช่างอาเพศจริงหนาฟ้าร้องไห้
+  ต้องรบราฆ่าฟันจนบรรลัย           ฤๅหาใครค้ำชูกู้บรรลังก์ ฯ
+
+  (The above is a two-column text. If combining characters are handled
+  correctly, the lines of the second column should be aligned with the
+  | character above.)
+
+Ethiopian:
+
+  Proverbs in the Amharic language:
+
+  ሰማይ አይታረስ ንጉሥ አይከሰስ።
+  ብላ ካለኝ እንደአባቴ በቆመጠኝ።
+  ጌጥ ያለቤቱ ቁምጥና ነው።
+  ደሀ በሕልሙ ቅቤ ባይጠጣ ንጣት በገደለው።
+  የአፍ ወለምታ በቅቤ አይታሽም።
+  አይጥ በበላ ዳዋ ተመታ።
+  ሲተረጉሙ ይደረግሙ።
+  ቀስ በቀስ፥ ዕንቁላል በእግሩ ይሄዳል።
+  ድር ቢያብር አንበሳ ያስር።
+  ሰው እንደቤቱ እንጅ እንደ ጉረቤቱ አይተዳደርም።
+  እግዜር የከፈተውን ጉሮሮ ሳይዘጋው አይድርም።
+  የጎረቤት ሌባ፥ ቢያዩት ይስቅ ባያዩት ያጠልቅ።
+  ሥራ ከመፍታት ልጄን ላፋታት።
+  ዓባይ ማደሪያ የለው፥ ግንድ ይዞ ይዞራል።
+  የእስላም አገሩ መካ የአሞራ አገሩ ዋርካ።
+  ተንጋሎ ቢተፉ ተመልሶ ባፉ።
+  ወዳጅህ ማር ቢሆን ጨርስህ አትላሰው።
+  እግርህን በፍራሽህ ልክ ዘርጋ።
+
+Runes:
+
+  ᚻᛖ ᚳᚹᚫᚦ ᚦᚫᛏ ᚻᛖ ᛒᚢᛞᛖ ᚩᚾ ᚦᚫᛗ ᛚᚪᚾᛞᛖ ᚾᚩᚱᚦᚹᛖᚪᚱᛞᚢᛗ ᚹᛁᚦ ᚦᚪ ᚹᛖᛥᚫ
+
+  (Old English, which transcribed into Latin reads 'He cwaeth that he
+  bude thaem lande northweardum with tha Westsae.' and means 'He said
+  that he lived in the northern land near the Western Sea.')
+
+Braille:
+
+  ⡌⠁⠧⠑ ⠼⠁⠒  ⡍⠜⠇⠑⠹⠰⠎ ⡣⠕⠌
+
+  ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠙⠑⠁⠙⠒ ⠞⠕ ⠃⠑⠛⠔ ⠺⠊⠹⠲ ⡹⠻⠑ ⠊⠎ ⠝⠕ ⠙⠳⠃⠞
+  ⠱⠁⠞⠑⠧⠻ ⠁⠃⠳⠞ ⠹⠁⠞⠲ ⡹⠑ ⠗⠑⠛⠊⠌⠻ ⠕⠋ ⠙⠊⠎ ⠃⠥⠗⠊⠁⠇ ⠺⠁⠎
+  ⠎⠊⠛⠝⠫ ⠃⠹ ⠹⠑ ⠊⠇⠻⠛⠹⠍⠁⠝⠂ ⠹⠑ ⠊⠇⠻⠅⠂ ⠹⠑ ⠥⠝⠙⠻⠞⠁⠅⠻⠂
+  ⠁⠝⠙ ⠹⠑ ⠡⠊⠑⠋ ⠍⠳⠗⠝⠻⠲ ⡎⠊⠗⠕⠕⠛⠑ ⠎⠊⠛⠝⠫ ⠊⠞⠲ ⡁⠝⠙
+  ⡎⠊⠗⠕⠕⠛⠑⠰⠎ ⠝⠁⠍⠑ ⠺⠁⠎ ⠛⠕⠕⠙ ⠥⠏⠕⠝ ⠰⡡⠁⠝⠛⠑⠂ ⠋⠕⠗ ⠁⠝⠹⠹⠔⠛ ⠙⠑
+  ⠡⠕⠎⠑ ⠞⠕ ⠏⠥⠞ ⠙⠊⠎ ⠙⠁⠝⠙ ⠞⠕⠲
+
+  ⡕⠇⠙ ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠁⠎ ⠙⠑⠁⠙ ⠁⠎ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲
+
+  ⡍⠔⠙⠖ ⡊ ⠙⠕⠝⠰⠞ ⠍⠑⠁⠝ ⠞⠕ ⠎⠁⠹ ⠹⠁⠞ ⡊ ⠅⠝⠪⠂ ⠕⠋ ⠍⠹
+  ⠪⠝ ⠅⠝⠪⠇⠫⠛⠑⠂ ⠱⠁⠞ ⠹⠻⠑ ⠊⠎ ⠏⠜⠞⠊⠊⠥⠇⠜⠇⠹ ⠙⠑⠁⠙ ⠁⠃⠳⠞
+  ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲ ⡊ ⠍⠊⠣⠞ ⠙⠁⠧⠑ ⠃⠑⠲ ⠔⠊⠇⠔⠫⠂ ⠍⠹⠎⠑⠇⠋⠂ ⠞⠕
+  ⠗⠑⠛⠜⠙ ⠁ ⠊⠕⠋⠋⠔⠤⠝⠁⠊⠇ ⠁⠎ ⠹⠑ ⠙⠑⠁⠙⠑⠌ ⠏⠊⠑⠊⠑ ⠕⠋ ⠊⠗⠕⠝⠍⠕⠝⠛⠻⠹
+  ⠔ ⠹⠑ ⠞⠗⠁⠙⠑⠲ ⡃⠥⠞ ⠹⠑ ⠺⠊⠎⠙⠕⠍ ⠕⠋ ⠳⠗ ⠁⠝⠊⠑⠌⠕⠗⠎
+  ⠊⠎ ⠔ ⠹⠑ ⠎⠊⠍⠊⠇⠑⠆ ⠁⠝⠙ ⠍⠹ ⠥⠝⠙⠁⠇⠇⠪⠫ ⠙⠁⠝⠙⠎
+  ⠩⠁⠇⠇ ⠝⠕⠞ ⠙⠊⠌⠥⠗⠃ ⠊⠞⠂ ⠕⠗ ⠹⠑ ⡊⠳⠝⠞⠗⠹⠰⠎ ⠙⠕⠝⠑ ⠋⠕⠗⠲ ⡹⠳
+  ⠺⠊⠇⠇ ⠹⠻⠑⠋⠕⠗⠑ ⠏⠻⠍⠊⠞ ⠍⠑ ⠞⠕ ⠗⠑⠏⠑⠁⠞⠂ ⠑⠍⠏⠙⠁⠞⠊⠊⠁⠇⠇⠹⠂ ⠹⠁⠞
+  ⡍⠜⠇⠑⠹ ⠺⠁⠎ ⠁⠎ ⠙⠑⠁⠙ ⠁⠎ ⠁ ⠙⠕⠕⠗⠤⠝⠁⠊⠇⠲
+
+  (The first couple of paragraphs of "A Christmas Carol" by Dickens)
+
+Compact font selection example text:
+
+  ABCDEFGHIJKLMNOPQRSTUVWXYZ /0123456789
+  abcdefghijklmnopqrstuvwxyz £©µÀÆÖÞßéöÿ
+  –—‘“”„†•…‰™œŠŸž€ ΑΒΓΔΩαβγδω АБВГДабвгд
+  ∀∂∈ℝ∧∪≡∞ ↑↗↨↻⇣ ┐┼╔╘░►☺♀ ﬁ<>⑀₂ἠḂӥẄɐː⍎אԱა
+
+Greetings in various languages:
+
+  Hello world, Καλημέρα κόσμε, コンニチハ
+
+Box drawing alignment tests:                                          █
+                                                                      ▉
+  ╔══╦══╗  ┌──┬──┐  ╭──┬──╮  ╭──┬──╮  ┏━━┳━━┓  ┎┒┏┑   ╷  ╻ ┏┯┓ ┌┰┐    ▊ ╱╲╱╲╳╳╳
+  ║┌─╨─┐║  │╔═╧═╗│  │╒═╪═╕│  │╓─╁─╖│  ┃┌─╂─┐┃  ┗╃╄┙  ╶┼╴╺╋╸┠┼┨ ┝╋┥    ▋ ╲╱╲╱╳╳╳
+  ║│╲ ╱│║  │║   ║│  ││ │ ││  │║ ┃ ║│  ┃│ ╿ │┃  ┍╅╆┓   ╵  ╹ ┗┷┛ └┸┘    ▌ ╱╲╱╲╳╳╳
+  ╠╡ ╳ ╞╣  ├╢   ╟┤  ├┼─┼─┼┤  ├╫─╂─╫┤  ┣┿╾┼╼┿┫  ┕┛┖┚     ┌┄┄┐ ╎ ┏┅┅┓ ┋ ▍ ╲╱╲╱╳╳╳
+  ║│╱ ╲│║  │║   ║│  ││ │ ││  │║ ┃ ║│  ┃│ ╽ │┃  ░░▒▒▓▓██ ┊  ┆ ╎ ╏  ┇ ┋ ▎
+  ║└─╥─┘║  │╚═╤═╝│  │╘═╪═╛│  │╙─╀─╜│  ┃└─╂─┘┃  ░░▒▒▓▓██ ┊  ┆ ╎ ╏  ┇ ┋ ▏
+  ╚══╩══╝  └──┴──┘  ╰──┴──╯  ╰──┴──╯  ┗━━┻━━┛  ▗▄▖▛▀▜   └╌╌┘ ╎ ┗╍╍┛ ┋  ▁▂▃▄▅▆▇█
+                                               ▝▀▘▙▄▟
--- a/third-party/utf8cpp/test_data/utf8samples/Unicode_transcriptions.html
+++ b/third-party/utf8cpp/test_data/utf8samples/Unicode_transcriptions.html
@ -0,0 +1,167 @@
+? 	*Unicode Transcriptions* 	Notes <#Notes>
+
+Glyphs <http://www.macchiato.com/unicode/show.html> | Samples
+<http://www.macchiato.com/unicode/Unicode_transcriptions.html> | Charts
+<http://www.macchiato.com/unicode/charts.html> | UTF
+<http://www.macchiato.com/unicode/convert.html> | Forms
+<http://www-4.ibm.com/software/developer/library/utfencodingforms/> |
+Home <http://www.macchiato.com>.
+<http://member.linkexchange.com/cgi-bin/fc/fastcounter-login?750641>
+
+Name 	Text 	Image
+Arabic (Arabic) 	يونِكود 	?
+Arabic (Persian) 	یونی‌کُد 	/ ?/
+Armenian 	Յունիկօդ 	
+Bengali 	য়ূনিকোড 	
+Bopomofo 	ㄊㄨㄥ˅ ㄧˋ ㄇㄚ˅ 	
+ㄨㄢˋ ㄍㄨㄛˊ ㄇㄚ˅ 	
+Braille 	  	 
+Buhid 	  	 
+Canadian Aboriginal 	ᔫᗂᑰᑦ 	
+Cherokee 	ᏳᏂᎪᏛ 	
+Cypriot 	  	 
+Cyrillic (Russian) 	Юникод 	?
+Deseret (English) 	??????? 	
+Devanagari (Hindi) 	यूनिकोड 	?
+Ethiopic 	ዩኒኮድ 	
+Georgian 	უნიკოდი 	?
+Gothic 	  	 
+Greek 	Γιούνικοντ 	
+Gujarati 	યૂનિકોડ 	
+Gurmukhi 	ਯੂਨਿਕੋਡ 	
+Han (Chinese) 	统一码 	?
+統一碼 	?
+万国码 	?
+萬國碼 	?
+Hangul 	유니코드 	
+Hanunoo 	  	 
+Hebrew 	יוניקוד 	
+Hebrew (pointed) 	יוּנִיקוׁד 	
+Hebrew (Yiddish) 	יוניקאָד 	?
+Hiragana (Japanese) 	ゆにこおど 	 
+Katakana (Japanese) 	ユニコード 	?
+Kannada 	ಯೂನಿಕೋಡ್ 	
+Khmer 	យូនីគោដ 	
+Lao 	  	 
+Latin 	Unicode 	Unicode
+Latin (IPA <#English_Pronunciation>) 	ˈjunɪˌkoːd 	?
+Latin (Am. Dict. <#American_Dictionary>) 	Ūnĭcōde̽ 	?
+Limbu 	  	 
+Linear B 	  	 
+Malayalam 	യൂനികോഡ് 	
+Mongolian 	  	
+Myanmar 	  	
+Ogham 	ᚔᚒᚅᚔᚉᚑᚇ 	/ /
+Old Italic 	  	 
+Oriya 	ୟୂନିକୋଡ 	
+Osmanya 	  	 
+Runic (Anglo-Saxon) 	ᛡᚢᚾᛁᚳᚩᛞ 	
+Shavian 	  	 
+Sinhala 	යණනිකෞද් 	
+Syriac 	ܝܘܢܝܩܘܕ 	
+Tagbanwa 	  	 
+Tagalog 	  	 
+Tai Le 	  	 
+Tamil 	யூனிகோட் 	
+Telugu 	యూనికోడ్ 	
+Thaana 	  	
+Thai 	ยูนืโคด 	
+Tibetan (Dzongkha) 	ཨུ་ནི་ཀོཌྲ། 	
+Ugaritic 	  	 
+Yi 	  	
+
+
+      Notes:
+
+There are different ways to transcribe the word “Unicode”, depending on
+the language and script. In some cases there is only one language that
+customarily uses a given script; in others there are many languages. The
+goal here is at a minimum to collect at least one transcription for each
+script in a language customarily written in that script, with more
+languages if possible. If the transcription is the same for multiple
+languages in a script, then a single representative language is used.
+
+Still missing are transcriptions for the items above in RED (in at least
+one language). I would appreciate any other transcriptions, or
+corrections for the ones listed here. Send to mark3@macchiato.com
+<mailto:mark3@macchiato.com>, using the directions below:
+
+    * *Supplying Missing Items*
+          o Most Latin-script languages will follow the spelling, and
+            change the pronunciation. For any that would not, it would
+            be good to have the alternate spelling.
+          o For non-Latin scripts the goal is to match the English
+            pronunciation — /*not*/ spelling. Above is the IPA <#IPA>
+            (in phonemic transcription) that should be matched as
+            closely as possible (without sounding affected in the target
+            language)
+          o Text would be best in either the UTF-8 text, or the code
+            points in hex HTML. E.g. either of the following:
+                + "Юникод"
+                + "&#x042E;&#x043D;&#x0438;&#x043A;&#x043E;&#x0434;"
+                + Note: for / supplementary characters/
+                  <http://www.unicode.org/glossary/#supplementary_character>,
+                  there should be one hex number per code point, not two
+                  surrogates
+                  <http://www.unicode.org/glossary/#surrogate_code_point>:
+                      # &#x10000; /*not*/ &#xD800;&xDC00;
+          o If you have a good font, I'd also appreciate a GIF. It
+            should be *96 x 24* bits, with the text centered, in black
+            on white (plus grays if smoothed).
+    * *Other Comments*
+          o Because some browsers won't handle the text, both text and
+            GIF image are supplied. If you can’t read the text columns,
+            see Display Problems
+            <http://www.unicode.org/help/display_problems.html>.
+          o The Chinese versions (inc. Bopomofo) are translations, not
+            transcriptions, since "transcription in Chinese is pretty
+            lame" [J. Becker].
+          o There are other "translations" of Unicode that may be in
+            use, such as the Vietnamese "Thống Nhất Mã".
+          o For sample pages in different languages on the Unicode site,
+            see What is Unicode?
+            <http://www.unicode.org/unicode/standard/WhatIsUnicode.html>
+          o Americans are not generally used to IPA, and find a variety
+            of different systems in their dictionaries. This one leaves
+            the base letters as they are, and uses diacritics for
+            pronunciation.
+    * *Etymology of /Unicode/*
+          o Coined by J. Becker. Not related to previous usages, such as:
+                + A telegraphic code in which one word or set of letters
+                  represents a sentence or phrase; a telegram or message
+                  in this. (late 19th century, OED)
+          o According to my references, the prefix "uni" is directly
+            from Latin while the word "code" is through French.
+          o The original Indo-European apparently would have been
+            *oino-kau-do ("one strike give"): *kau apparently being
+            related to such English words as: hew, haggle, hoe, hag,
+            hay, hack, caudad, caudal, caudate, caudex, coda, codex,
+            codicil, coward, incus, and Kovač (personal name: "smith").
+                + I will leave the exact derivations to the exegetes,
+                  but I like the association with "haggle" myself.
+    * *Contributions*
+          o This draws on contributions or comments from:
+                + Dixon Au
+                + Joe Becker
+                + Maurice Bauhahn
+                + Abel Cheung
+                + Peter Constable
+                + Michael Everson
+                + Christopher John Fynn
+                + Michael Kaplan
+                + George Kiraz
+                + Abdul Malik
+                + Siva Nataraja
+                + Roozbeh Pournader
+                + Jonathan Rosenne
+                + Jungshik Shin
+
+------------------------------------------------------------------------
+	
+
+Terms of Use <http://www.macchiato.com/terms_of_use.html>. Last updated:
+MED - 04/20/2003 15:30:33.
+<http://member.linkexchange.com/cgi-bin/fc/fastcounter-login?750641>
+
+ 
+
--- a/third-party/utf8cpp/test_data/utf8samples/quickbrown.txt
+++ b/third-party/utf8cpp/test_data/utf8samples/quickbrown.txt
@ -0,0 +1,126 @@
+Sentences that contain all letters commonly used in a language
+--------------------------------------------------------------
+
+Markus Kuhn <http://www.cl.cam.ac.uk/~mgk25/> -- 2001-09-02
+
+This file is UTF-8 encoded.
+
+
+Danish (da)
+---------
+
+  Quizdeltagerne spiste jordbær med fløde, mens cirkusklovnen
+  Wolther spillede på xylofon.
+  (= Quiz contestants were eating strawbery with cream while Wolther
+  the circus clown played on xylophone.)
+
+German (de)
+-----------
+
+  Falsches Üben von Xylophonmusik quält jeden größeren Zwerg
+  (= Wrongful practicing of xylophone music tortures every larger dwarf)
+
+  Zwölf Boxkämpfer jagten Eva quer über den Sylter Deich
+  (= Twelve boxing fighters hunted Eva across the dike of Sylt)
+
+  Heizölrückstoßabdämpfung
+  (= fuel oil recoil absorber)
+  (jqvwxy missing, but all non-ASCII letters in one word)
+
+English (en)
+------------
+
+  The quick brown fox jumps over the lazy dog
+
+Spanish (es)
+------------
+
+  El pingüino Wenceslao hizo kilómetros bajo exhaustiva lluvia y 
+  frío, añoraba a su querido cachorro.
+  (Contains every letter and every accent, but not every combination
+  of vowel + acute.)
+
+French (fr)
+-----------
+
+  Portez ce vieux whisky au juge blond qui fume sur son île intérieure, à
+  côté de l'alcôve ovoïde, où les bûches se consument dans l'âtre, ce
+  qui lui permet de penser à la cænogenèse de l'être dont il est question
+  dans la cause ambiguë entendue à Moÿ, dans un capharnaüm qui,
+  pense-t-il, diminue çà et là la qualité de son œuvre. 
+
+  l'île exiguë
+  Où l'obèse jury mûr
+  Fête l'haï volapük,
+  Âne ex aéquo au whist,
+  Ôtez ce vœu déçu.
+
+  Le cœur déçu mais l'âme plutôt naïve, Louÿs rêva de crapaüter en
+  canoë au delà des îles, près du mälström où brûlent les novæ.
+
+Irish Gaelic (ga)
+-----------------
+
+  D'fhuascail Íosa, Úrmhac na hÓighe Beannaithe, pór Éava agus Ádhaimh
+
+Hungarian (hu)
+--------------
+
+  Árvíztűrő tükörfúrógép
+  (= flood-proof mirror-drilling machine, only all non-ASCII letters)
+
+Icelandic (is)
+--------------
+
+  Kæmi ný öxi hér ykist þjófum nú bæði víl og ádrepa
+
+  Sævör grét áðan því úlpan var ónýt
+  (some ASCII letters missing)
+
+Japanese (jp)
+-------------
+
+  Hiragana: (Iroha)
+
+  いろはにほへとちりぬるを
+  わかよたれそつねならむ
+  うゐのおくやまけふこえて
+  あさきゆめみしゑひもせす
+
+  Katakana:
+
+  イロハニホヘト チリヌルヲ ワカヨタレソ ツネナラム
+  ウヰノオクヤマ ケフコエテ アサキユメミシ ヱヒモセスン
+
+Hebrew (iw)
+-----------
+
+  ? דג סקרן שט בים מאוכזב ולפתע מצא לו חברה איך הקליטה
+
+Polish (pl)
+-----------
+
+  Pchnąć w tę łódź jeża lub ośm skrzyń fig
+  (= To push a hedgehog or eight bins of figs in this boat)
+
+Russian (ru)
+------------
+
+  В чащах юга жил бы цитрус? Да, но фальшивый экземпляр!
+  (= Would a citrus live in the bushes of south? Yes, but only a fake one!)
+
+Thai (th)
+---------
+
+  [--------------------------|------------------------]
+  ๏ เป็นมนุษย์สุดประเสริฐเลิศคุณค่า  กว่าบรรดาฝูงสัตว์เดรัจฉาน
+  จงฝ่าฟันพัฒนาวิชาการ           อย่าล้างผลาญฤๅเข่นฆ่าบีฑาใคร
+  ไม่ถือโทษโกรธแช่งซัดฮึดฮัดด่า     หัดอภัยเหมือนกีฬาอัชฌาสัย
+  ปฏิบัติประพฤติกฎกำหนดใจ        พูดจาให้จ๊ะๆ จ๋าๆ น่าฟังเอย ฯ
+
+  [The copyright for the Thai example is owned by The Computer
+  Association of Thailand under the Royal Patronage of His Majesty the
+  King.]
+
+Please let me know if you find others! Special thanks to the people
+from all over the world who contributed these sentences.
--- a/third-party/utf8cpp/test_drivers/negative/negative.cpp
+++ b/third-party/utf8cpp/test_drivers/negative/negative.cpp
@ -0,0 +1,53 @@
+#include "../../source/utf8.h"
+using namespace utf8;
+
+#include <string>
+#include <iostream>
+#include <fstream>
+#include <algorithm>
+using namespace std;
+
+// 1-based numbers of the lines in the test file that are expected to contain
+// invalid UTF-8. main() looks every line number up in this table to decide
+// whether a validation failure on that line is expected.
+const unsigned INVALID_LINES[] = { 75, 76, 83, 84, 85, 93, 102, 103, 105, 106, 107, 108, 109, 110, 114, 115, 116, 117, 124, 125, 130, 135, 140, 145, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 169, 175, 176, 177, 207, 208, 209, 210, 211, 220, 221, 222, 223, 224, 232, 233, 234, 235, 236, 247, 248, 249, 250, 251, 252, 253, 257, 258, 259, 260, 261, 262, 263, 264};
+// One-past-the-end pointer of INVALID_LINES, used as the end of the search range.
+const unsigned* INVALID_LINES_END = INVALID_LINES + sizeof(INVALID_LINES)/sizeof(unsigned);
+
+// Negative test driver.
+//
+// Usage: <program> <path-to-test-file>
+//
+// Reads the file line by line and checks every line with is_valid(). A line is
+// expected to be invalid exactly when its 1-based number appears in
+// INVALID_LINES; any disagreement is printed to stdout. Invalid lines are also
+// run through replace_invalid(), and the result is checked to be valid.
+//
+// Returns 1 when the argument count is wrong or the file cannot be opened, so
+// that a misconfigured run is not reported as a success. Returns 0 otherwise;
+// mismatches found in the data are only reported, not reflected in the exit
+// status.
+int main(int argc, char** argv)
+{
+    string test_file_path;
+    if (argc == 2)
+        test_file_path = argv[1];
+    else {
+        cout << "Wrong number of arguments" << endl;
+        return 1;
+    }
+    // Open the test file
+    ifstream fs8(test_file_path.c_str());
+    if (!fs8.is_open()) {
+        cout << "Could not open " << test_file_path << endl;
+        return 1;
+    }
+
+    // Read it line by line
+    unsigned int line_count = 0;
+    char byte;
+    while (!fs8.eof()) {
+        // Collect the octets up to (not including) the next '\n' or end of file.
+        string line;
+        while ((byte = static_cast<char>(fs8.get())) != '\n' && !fs8.eof())
+            line.push_back(byte);
+
+        // Incremented before the lookup, so line numbers are 1-based.
+        line_count++;
+        bool expected_valid = (find(INVALID_LINES, INVALID_LINES_END, line_count) == INVALID_LINES_END);
+        // Print out lines that contain unexpected invalid UTF-8
+        if (!is_valid(line.begin(), line.end())) {
+            if (expected_valid)
+                cout << "Unexpected invalid utf-8 at line " << line_count << '\n';
+
+            // try fixing it:
+            string fixed_line;
+            replace_invalid(line.begin(), line.end(), back_inserter(fixed_line));
+            if (!is_valid(fixed_line.begin(), fixed_line.end()))
+                cout << "replace_invalid() resulted in an invalid utf-8 at line " << line_count << '\n';
+        }
+        else if (!expected_valid)
+            cout << "Invalid utf-8 NOT detected at line " << line_count << '\n';
+    }
+    return 0;
+}
--- a/third-party/utf8cpp/test_drivers/smoke_test/test.cpp
+++ b/third-party/utf8cpp/test_drivers/smoke_test/test.cpp
@ -0,0 +1,298 @@
+#include <cstring>
+#include <cassert>
+#include <vector>
+#include "../../source/utf8.h"
+using namespace utf8;
+using namespace std;
+
+// Smoke test for the utf8 API: exercises each checked function once or twice
+// on small hand-encoded sequences, then repeats the same checks with the
+// functions from the utf8::unchecked namespace.
+// Every check is an assert(), and several assert expressions advance iterators
+// as a side effect, so the checks only run (and only make sense) in a build
+// where NDEBUG is not defined.
+int main()
+{
+    //append
+    unsigned char u[5] = {0,0,0,0,0};
+
+    append(0x0448, u);
+    assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3] == 0 && u[4] == 0);
+
+    append(0x65e5, u);
+    assert (u[0] == 0xe6 && u[1] == 0x97 && u[2] == 0xa5 && u[3] == 0 && u[4] == 0);
+
+    append(0x3044, u);
+    assert (u[0] == 0xe3 && u[1] == 0x81 && u[2] == 0x84 && u[3] == 0 && u[4] == 0);
+
+    append(0x10346, u);
+    assert (u[0] == 0xf0 && u[1] == 0x90 && u[2] == 0x8d && u[3] == 0x86 && u[4] == 0);
+
+
+    //next
+    // twochars holds 5 encoded bytes plus the terminating NUL, so +6 is one past the array.
+    const char* twochars = "\xe6\x97\xa5\xd1\x88";
+    const char* w = twochars;
+    int cp = next(w, twochars + 6);
+    assert (cp == 0x65e5);
+    assert (w == twochars + 3);
+
+    // threechars holds 9 encoded bytes (4 + 3 + 2) plus the terminating NUL.
+    const char* threechars = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88";
+    w = threechars;
+    cp = next(w, threechars + 9);
+    assert (cp == 0x10346);
+    assert (w == threechars + 4);
+    cp = next(w, threechars + 9);
+    assert (cp == 0x65e5);
+    assert (w == threechars + 7);
+    cp = next(w, threechars + 9);
+    assert (cp == 0x0448);
+    assert (w == threechars + 9);
+
+    //peek_next
+    const char* const cw = twochars;
+    cp = peek_next(cw, cw + 6);
+    assert (cp == 0x65e5);
+    assert (cw == twochars);
+
+    //prior
+    w = twochars + 3;
+    cp = prior (w, twochars);
+    assert (cp == 0x65e5);
+    assert (w == twochars);
+
+    w = threechars + 9;
+    cp = prior(w, threechars);
+    assert (cp == 0x0448);
+    assert (w == threechars + 7);
+    cp = prior(w, threechars);
+    assert (cp == 0x65e5);
+    assert (w == threechars + 4);
+    cp = prior(w, threechars);
+    assert (cp == 0x10346);
+    assert (w == threechars);
+
+    //previous (deprecated)
+    w = twochars + 3;
+    cp = previous (w, twochars - 1);
+    assert (cp == 0x65e5);
+    assert (w == twochars);
+
+    w = threechars + 9;
+    cp = previous(w, threechars - 1);
+    assert (cp == 0x0448);
+    assert (w == threechars + 7);
+    cp = previous(w, threechars -1);
+    assert (cp == 0x65e5);
+    assert (w == threechars + 4);
+    cp = previous(w, threechars - 1);
+    assert (cp == 0x10346);
+    assert (w == threechars);
+
+    // advance
+    w = twochars;
+    advance (w, 2, twochars + 6);
+    assert (w == twochars + 5);
+
+    // distance
+    size_t dist = utf8::distance(twochars, twochars + 5);
+    assert (dist == 2);
+
+    // utf32to8
+    int utf32string[] = {0x448, 0x65E5, 0x10346, 0};
+    vector<char> utf8result;
+    utf32to8(utf32string, utf32string + 3, back_inserter(utf8result));
+    assert (utf8result.size() == 9);
+    // try it with the return value; this overwrites the 9 bytes already stored
+    char* utf8_end = utf32to8(utf32string, utf32string + 3, &utf8result[0]);
+    assert (utf8_end == &utf8result[0] + 9);
+
+    //utf8to32
+    vector<int> utf32result;
+    utf8to32(twochars, twochars + 5, back_inserter(utf32result));
+    assert (utf32result.size() == 2);
+    // try it with the return value;
+    int* utf32_end = utf8to32(twochars, twochars + 5, &utf32result[0]);
+    assert (utf32_end == &utf32result[0] + 2);
+
+    //utf16to8
+    unsigned short utf16string[] = {0x41, 0x0448, 0x65e5, 0xd834, 0xdd1e};
+    utf8result.clear();
+    utf16to8(utf16string, utf16string + 5, back_inserter(utf8result));
+    assert (utf8result.size() == 10);
+    // try it with the return value;
+    utf8_end = utf16to8 (utf16string, utf16string + 5, &utf8result[0]);
+    assert (utf8_end == &utf8result[0] + 10);
+
+    //utf8to16
+    char utf8_with_surrogates[] = "\xe6\x97\xa5\xd1\x88\xf0\x9d\x84\x9e";
+    vector <unsigned short> utf16result;
+    utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result));
+    assert (utf16result.size() == 4);
+    assert (utf16result[2] == 0xd834);
+    assert (utf16result[3] == 0xdd1e);
+    // try it with the return value;
+    unsigned short* utf16_end = utf8to16 (utf8_with_surrogates, utf8_with_surrogates + 9, &utf16result[0]);
+    assert (utf16_end == &utf16result[0] + 4);
+
+    //find_invalid
+    char utf_invalid[] = "\xe6\x97\xa5\xd1\x88\xfa";
+    char* invalid = find_invalid(utf_invalid, utf_invalid + 6);
+    assert (invalid == utf_invalid + 5);
+
+    //is_valid
+    bool bvalid = is_valid(utf_invalid, utf_invalid + 6);
+    assert (bvalid == false);
+    bvalid = is_valid(utf8_with_surrogates, utf8_with_surrogates + 9);
+    assert (bvalid == true);
+
+    //starts_with_bom
+    unsigned char byte_order_mark[] = {0xef, 0xbb, 0xbf};
+    bool bbom = starts_with_bom(byte_order_mark, byte_order_mark + sizeof(byte_order_mark));
+    assert (bbom == true);
+    // threechars is a pointer, so sizeof would yield the pointer size rather
+    // than the sequence length; pass the real 9-byte length instead.
+    bool no_bbom = starts_with_bom(threechars, threechars + 9);
+    assert (no_bbom == false);
+
+    //is_bom
+    bool unsafe_bbom = is_bom(byte_order_mark);
+    assert (unsafe_bbom == true);
+
+
+    //replace_invalid
+    // sizeof on both arrays below counts the terminating NUL, so the NUL is
+    // part of the input and of the expected output.
+    char invalid_sequence[] = "a\x80\xe0\xa0\xc0\xaf\xed\xa0\x80z";
+    vector<char> replace_invalid_result;
+    replace_invalid (invalid_sequence, invalid_sequence + sizeof(invalid_sequence), std::back_inserter(replace_invalid_result), '?');
+    bvalid = is_valid(replace_invalid_result.begin(), replace_invalid_result.end());
+    assert (bvalid);
+    const char fixed_invalid_sequence[] = "a????z";
+    assert (sizeof(fixed_invalid_sequence) == replace_invalid_result.size());
+    assert (std::equal(replace_invalid_result.begin(), replace_invalid_result.begin() + sizeof(fixed_invalid_sequence), fixed_invalid_sequence));
+
+    // iterator
+    utf8::iterator<const char*> it(threechars, threechars, threechars + 9);
+    utf8::iterator<const char*> it2 = it;
+    assert (it2 == it);
+    assert (*it == 0x10346);
+    assert (*(++it) == 0x65e5);
+    assert ((*it++) == 0x65e5);
+    assert (*it == 0x0448);
+    assert (it != it2);
+    utf8::iterator<const char*> endit (threechars + 9, threechars, threechars + 9);
+    assert (++it == endit);
+    assert (*(--it) == 0x0448);
+    assert ((*it--) == 0x0448);
+    assert (*it == 0x65e5);
+    assert (--it == utf8::iterator<const char*>(threechars, threechars, threechars + 9));
+    assert (*it == 0x10346);
+
+    //////////////////////////////////////////////////////////
+    //// Unchecked variants
+    //////////////////////////////////////////////////////////
+    // Every call in this section is qualified with unchecked:: so that the
+    // unchecked overloads, not the checked ones found through
+    // "using namespace utf8", are the functions under test.
+
+    //append
+    memset(u, 0, 5);
+    unchecked::append(0x0448, u);
+    assert (u[0] == 0xd1 && u[1] == 0x88 && u[2] == 0 && u[3] == 0 && u[4] == 0);
+
+    unchecked::append(0x65e5, u);
+    assert (u[0] == 0xe6 && u[1] == 0x97 && u[2] == 0xa5 && u[3] == 0 && u[4] == 0);
+
+    unchecked::append(0x10346, u);
+    assert (u[0] == 0xf0 && u[1] == 0x90 && u[2] == 0x8d && u[3] == 0x86 && u[4] == 0);
+
+    //next
+    w = twochars;
+    cp = unchecked::next(w);
+    assert (cp == 0x65e5);
+    assert (w == twochars + 3);
+
+    w = threechars;
+    cp = unchecked::next(w);
+    assert (cp == 0x10346);
+    assert (w == threechars + 4);
+    cp = unchecked::next(w);
+    assert (cp == 0x65e5);
+    assert (w == threechars + 7);
+    cp = unchecked::next(w);
+    assert (cp == 0x0448);
+    assert (w == threechars + 9);
+
+    //peek_next
+    cp = unchecked::peek_next(cw);
+    assert (cp == 0x65e5);
+    assert (cw == twochars);
+
+
+    //previous (calls prior internally)
+
+    w = twochars + 3;
+    cp = unchecked::previous (w);
+    assert (cp == 0x65e5);
+    assert (w == twochars);
+
+    w = threechars + 9;
+    cp = unchecked::previous(w);
+    assert (cp == 0x0448);
+    assert (w == threechars + 7);
+    cp = unchecked::previous(w);
+    assert (cp == 0x65e5);
+    assert (w == threechars + 4);
+    cp = unchecked::previous(w);
+    assert (cp == 0x10346);
+    assert (w == threechars);
+
+    // advance
+    w = twochars;
+    unchecked::advance (w, 2);
+    assert (w == twochars + 5);
+
+    // distance
+    dist = unchecked::distance(twochars, twochars + 5);
+    assert (dist == 2);
+
+    // utf32to8
+    utf8result.clear();
+    unchecked::utf32to8(utf32string, utf32string + 3, back_inserter(utf8result));
+    assert (utf8result.size() == 9);
+    // try it with the return value;
+    utf8_end = unchecked::utf32to8(utf32string, utf32string + 3, &utf8result[0]);
+    assert(utf8_end == &utf8result[0] + 9);
+
+    //utf8to32
+    utf32result.clear();
+    unchecked::utf8to32(twochars, twochars + 5, back_inserter(utf32result));
+    assert (utf32result.size() == 2);
+    // try it with the return value;
+    utf32_end = unchecked::utf8to32(twochars, twochars + 5, &utf32result[0]);
+    assert (utf32_end == &utf32result[0] + 2);
+
+    //utf16to8
+    utf8result.clear();
+    unchecked::utf16to8(utf16string, utf16string + 5, back_inserter(utf8result));
+    assert (utf8result.size() == 10);
+    // try it with the return value;
+    utf8_end = unchecked::utf16to8 (utf16string, utf16string + 5, &utf8result[0]);
+    assert (utf8_end == &utf8result[0] + 10);
+
+    //utf8to16
+    utf16result.clear();
+    unchecked::utf8to16(utf8_with_surrogates, utf8_with_surrogates + 9, back_inserter(utf16result));
+    assert (utf16result.size() == 4);
+    assert (utf16result[2] == 0xd834);
+    assert (utf16result[3] == 0xdd1e);
+    // try it with the return value;
+    utf16_end = unchecked::utf8to16 (utf8_with_surrogates, utf8_with_surrogates + 9, &utf16result[0]);
+    assert (utf16_end == &utf16result[0] + 4);
+
+    // iterator
+    utf8::unchecked::iterator<const char*> un_it(threechars);
+    utf8::unchecked::iterator<const char*> un_it2 = un_it;
+    assert (un_it2 == un_it);
+    assert (*un_it == 0x10346);
+    assert (*(++un_it) == 0x65e5);
+    assert ((*un_it++) == 0x65e5);
+    assert (un_it != un_it2);
+    assert (*un_it == 0x0448);
+    utf8::unchecked::iterator<const char*> un_endit (threechars + 9);
+    assert (++un_it == un_endit);
+    assert (*(--un_it) == 0x0448);
+    assert ((*un_it--) == 0x0448);
+    assert (*un_it == 0x65e5);
+    assert (--un_it == utf8::unchecked::iterator<const char*>(threechars));
+    assert (*un_it == 0x10346);
+}
+
+
--- a/third-party/utf8cpp/test_drivers/utf8reader/utf8reader.cpp
+++ b/third-party/utf8cpp/test_drivers/utf8reader/utf8reader.cpp
@ -0,0 +1,160 @@
+#include "../../source/utf8.h"
+using namespace utf8;
+
+#include <string>
+#include <iostream>
+#include <fstream>
+#include <vector>
+using namespace std;
+
+// Sample/test driver: reads the file named by the single command-line argument
+// line by line. For the leading valid part of each line it round-trips the text
+// through UTF-16 and UTF-32, walks it forwards and backwards, and cross-checks
+// the checked functions against their unchecked:: counterparts, printing one
+// message per inconsistency found.
+// Returns 1 on a usage error or when the file cannot be opened, 0 otherwise.
+int main(int argc, char** argv)
+{
+    if (argc != 2) {
+        cout << "\nUsage: utfreader filename\n";
+        return 1;
+    }
+    const char* TEST_FILE_PATH = argv[1];
+    // Open the test file
+    ifstream fs8(TEST_FILE_PATH);
+    if (!fs8.is_open()) {
+        cout << "Could not open " << TEST_FILE_PATH << endl;
+        return 1;
+    }
+
+    // Read it line by line. getline() ends the loop on any stream failure, not
+    // just end-of-file, so a stream that cannot be read does not loop forever.
+    unsigned int line_count = 0;
+    string line;
+    while (std::getline(fs8, line)) {
+        line_count++;
+        // Play around with each line and convert it to utf16
+        string::iterator line_start = line.begin();
+        string::iterator line_end   = line.end();
+        // From here on line_end marks the end of the valid prefix of the line.
+        line_end = find_invalid(line_start, line_end);
+        if (line_end != line.end())
+            // Report the 0-based offset of the first invalid byte within the line.
+            cout << "Line " << line_count << ": Invalid utf-8 at byte " << int(line_end - line_start) << '\n';
+
+        // Convert it to utf-16 and write to the file
+        vector<unsigned short> utf16_line;
+        utf8to16(line_start, line_end, back_inserter(utf16_line));
+
+        // Back to utf-8 and compare it to the original line.
+        string back_to_utf8;
+        utf16to8(utf16_line.begin(), utf16_line.end(), back_inserter(back_to_utf8));
+        if (back_to_utf8.compare(string(line_start, line_end)) != 0)
+            cout << "Line " << line_count << ": Conversion to UTF-16 and back failed" << '\n';
+
+        // Now, convert it to utf-32, back to utf-8 and compare
+        vector <unsigned> utf32_line;
+        utf8to32(line_start, line_end, back_inserter(utf32_line));
+        back_to_utf8.clear();
+        utf32to8(utf32_line.begin(), utf32_line.end(), back_inserter(back_to_utf8));
+        if (back_to_utf8.compare(string(line_start, line_end)) != 0)
+            cout << "Line " << line_count << ": Conversion to UTF-32 and back failed" << '\n';
+
+        // Now, iterate and back
+        unsigned char_count = 0;
+        string::iterator it = line_start;
+        while (it != line_end) {
+            unsigned int next_cp = peek_next(it, line_end);
+            if (next(it, line_end) != next_cp)
+                cout << "Line " << line_count << ": Error: peek_next gave a different result than next" << '\n';
+            char_count++;
+        }
+        if (char_count != utf32_line.size())
+            cout << "Line " << line_count << ": Error in iterating with next - wrong number of characters" << '\n';
+
+        string::iterator adv_it = line_start;
+        utf8::advance(adv_it, char_count, line_end);
+        if (adv_it != line_end)
+            cout << "Line " << line_count << ": Error in advance function" << '\n';
+
+        if (string::size_type(utf8::distance(line_start, line_end)) != char_count)
+            cout << "Line " << line_count << ": Error in distance function" << '\n';
+
+        // Walk back to the start; char_count must return to zero.
+        while (it != line_start) {
+            previous(it, line.rend().base());
+            char_count--;
+        }
+        if (char_count != 0)
+            cout << "Line " << line_count << ": Error in iterating with previous - wrong number of characters" << '\n';
+
+        // Try utf8::iterator
+        utf8::iterator<string::iterator> u8it(line_start, line_start, line_end);
+        if (!utf32_line.empty() && *u8it != utf32_line.at(0))
+            cout << "Line " << line_count << ": Error in utf::iterator * operator" << '\n';
+        if (std::distance(u8it, utf8::iterator<string::iterator>(line_end, line_start, line_end)) != static_cast<int>(utf32_line.size()))
+            cout << "Line " << line_count << ": Error in using utf::iterator with std::distance - wrong number of characters" << '\n';
+
+        std::advance(u8it, utf32_line.size());
+        if (u8it != utf8::iterator<string::iterator>(line_end, line_start, line_end))
+            cout << "Line " << line_count << ": Error in using utf::iterator with std::advance" << '\n';
+
+
+        //======================== Now, the unchecked versions ======================
+        // Convert it to utf-16 and compare to the checked version
+        vector<unsigned short> utf16_line_unchecked;
+        unchecked::utf8to16(line_start, line_end, back_inserter(utf16_line_unchecked));
+
+        if (utf16_line != utf16_line_unchecked)
+            cout << "Line " << line_count << ": Error in unchecked::utf8to16" << '\n';
+
+        // Back to utf-8 and compare it to the original line.
+        back_to_utf8.clear();
+        unchecked::utf16to8(utf16_line_unchecked.begin(), utf16_line_unchecked.end(), back_inserter(back_to_utf8));
+        if (back_to_utf8.compare(string(line_start, line_end)) != 0)
+            cout << "Line " << line_count << ": Unchecked conversion to UTF-16 and back failed" << '\n';
+
+        // Now, convert it to utf-32, back to utf-8 and compare
+        vector <unsigned> utf32_line_unchecked;
+        unchecked::utf8to32(line_start, line_end, back_inserter(utf32_line_unchecked));
+        if (utf32_line != utf32_line_unchecked)
+            cout << "Line " << line_count << ": Error in unchecked::utf8to32" << '\n';
+
+        back_to_utf8.clear();
+        unchecked::utf32to8(utf32_line.begin(), utf32_line.end(), back_inserter(back_to_utf8));
+        if (back_to_utf8.compare(string(line_start, line_end)) != 0)
+            cout << "Line " << line_count << ": Unchecked conversion to UTF-32 and back failed" << '\n';
+
+        // Now, iterate and back
+        char_count = 0;
+        it = line_start;
+        while (it != line_end) {
+            unsigned int next_cp = unchecked::peek_next(it);
+            if (unchecked::next(it) != next_cp)
+                cout << "Line " << line_count << ": Error: unchecked::peek_next gave a different result than unchecked::next" << '\n';
+            char_count++;
+        }
+        if (char_count != utf32_line.size())
+            cout << "Line " << line_count << ": Error in iterating with unchecked::next - wrong number of characters" << '\n';
+
+        adv_it = line_start;
+        utf8::unchecked::advance(adv_it, char_count);
+        if (adv_it != line_end)
+            cout << "Line " << line_count << ": Error in unchecked::advance function" << '\n';
+
+        if (string::size_type(utf8::unchecked::distance(line_start, line_end)) != char_count)
+            cout << "Line " << line_count << ": Error in unchecked::distance function" << '\n';
+
+        while (it != line_start) {
+            unchecked::previous(it);
+            char_count--;
+        }
+        if (char_count != 0)
+            cout << "Line " << line_count << ": Error in iterating with unchecked::previous - wrong number of characters" << '\n';
+
+        // Try utf8::unchecked::iterator
+        utf8::unchecked::iterator<string::iterator> un_u8it(line_start);
+        if (!utf32_line.empty() && *un_u8it != utf32_line.at(0))
+            cout << "Line " << line_count << ": Error in utf::unchecked::iterator * operator" << '\n';
+        if (std::distance(un_u8it, utf8::unchecked::iterator<string::iterator>(line_end)) != static_cast<int>(utf32_line.size()))
+            cout << "Line " << line_count << ": Error in using utf::unchecked::iterator with std::distance - wrong number of characters" << '\n';
+
+        std::advance(un_u8it, utf32_line.size());
+        if (un_u8it != utf8::unchecked::iterator<string::iterator>(line_end))
+            cout << "Line " << line_count << ": Error in using utf::unchecked::iterator with std::advance" << '\n';
+    }
+    return 0;
+}