// Example use of charset.hh in ../include/charset.hh. // // To compile, you need to make charset.hh accessible on the include path. // Also, charset.hh includes my iconv wrapper, which is in ../include/Incover.hh, // and depends on other things in that directory; those dependencies could be // made to go away. // This source file should be viewed using a UTF8 editor, and its output when run should be // viewed on a UTF8 terminal. #include "charset.hh" #include #include #include using namespace pbe; using namespace std; void compile_time_tagged_strings_example() { // This example declares strings with compile-time-fixed character sets, converts // them to other compile-time-fixed character sets, combines them, and checks for // consistency: cout << "\ncompile_time_tagged_strings_example:\n"; utf8_string french = "Le traité simplifié prêt à être soumis " "à l'approbation des gouvernements"; latin1_string french_fixed = french.recode(); utf8_string icelandic = "Smjörið er brætt og hveitið smátt og smátt hrært út í það"; latin1_string icelandic_fixed = icelandic.recode(); utf8_string all = french + icelandic; latin1_string all_fixed = french_fixed + icelandic_fixed; if ((all.recode() == all_fixed) && (all == all_fixed.recode())) { cout << "Pass, both strings are '" << all << "'\n"; } } void utf8_const_iterator_example() { // This example shows how a string with a variable-width // character set can be iterated over character-at-a-time // or "unit"-at-a-time. cout << "\nutf8_const_iterator_example:\n"; utf8_string s = "Théâtre"; // My editor stores UTF8. // Iterate "unit" (byte) at a time: cout << "Here are the bytes of '" << s << "': " << hex; for (utf8_string::const_iterator i = s.begin(); i != s.end(); ++i) { char8_t c = *i; cout << static_cast(static_cast(c)) << " "; } // Iterate character at a time: cout << "\nHere are the characters of '" << s << "': "; for (utf8_string::const_character_iterator i = s.begin(); i != utf8_string::const_character_iterator( s.end() ); ++i) { utf8_char_t c = *i; // A 32-bit decoded Unicode character cout << static_cast(c) << " "; } cout << dec << "\n"; } void utf8_output_iterator_example() { // This example shows how a string with a variable-width // character set can be appended to using push_back and // an output iterator. cout << "\nutf8_output_iterator_example:\n"; utf8_string s; for (utf8_char_t c=64; c<96; ++c) { s.push_back(c); } utf8_string::character_output_iterator i(s); for (utf8_char_t c=150; c<200; ++c) { *i++ = c; // s.push_back(c); } cout << "Unicode characters 64 to 95 and 150 to 199:\n" << s << "\n"; } void utf8_word_split_example() { // This example demonstrates a case where a "unit" rather than a character iterator for a // UTF8 string is useful: because bytes < 128 can only ever represent single characters in // UTF8, we can treat a UTF8 string as a sequence of bytes when spliting at spaces. cout << "\nutf8_word_split_example:\n"; utf8_string s = "Yo también quemo la Corona española"; utf8_string::const_iterator i = s.begin(); utf8_string::const_iterator e = s.end(); utf8_string::const_iterator j; do { j = find(i,e,' '); utf8_string word(i,j); cout << word << "\n"; i = j+1; } while (j != e); } void ucs4_line_wrap_example() { // Sometimes a random-access character iterator is needed, but an iso_8859 or similar byte // character set can't be used because the characters in the content are not restricted. // In this case, ucs4 is normally the best choice - though its requirement for 4 bytes per // character may be considered a disadvantage in memory-limited applications. // This example uses random access to break a string into lines of <=40 characters each. cout << "\nucs4_line_wrap_example:\n"; utf8_string text_var = "Партия Единая Россия отказалась от формирования первой " "тройки федерального списка - его возглавил только президент " "Владимир Путин. Такое решение было принято на съезде Единой " "России во вторник. Накануне президент России дал согласие " "возглавить список Единой России на выборах в Госдуму."; ucs4_string text_fixed = text_var.recode(); for (unsigned int i=39; i() << "\n"; } // This example shows how a library-user can make a new character set available. // The example is the KOI8 character set, a fixed-width byte character set containing // cyrillic and latin characters. ////// This section needs some attention from a preprocessor expert; I want to use ////// a counter of some sort to allocate new charset_t values with a macro: ////// PBE_DEFINE_CHARSET(koi8); ////// But I can't see a good way to do it. For the time being, I'll choose a value ////// manually: const charset_t koi8 = static_cast(25); // Define charset_traits for KOI8: namespace pbe { template <> struct charset_traits { typedef char8_t unit_t; typedef char8_t char_t; }; }; typedef tagged_string koi8_string; void user_defined_charset_example() { charset_names[koi8] = "koi8"; cout << "\nuser_defined_charset_example:\n"; // We'll convert a string back and forth between utf8 and koi8: utf8_string u = "Код Обмена Информацией, 8 бит"; koi8_string k = u.recode(); utf8_string u2 = k.recode(); // KOI8 is a more compact encoiding than UTF8 for cyrillic: cout << "Length of UTF8 string = " << u2.length() << ", length of KOI8 string = " << k.length() << "\n"; } void runtime_tagged_example() { // This example shows how character sets known only at run-time can be used. // This is motivated by multipart MIME email, where each part can have a different // character set. But since MIME is rather complex to parse, this example uses // the following simpler format: the input byte sequence consists of a character // set name (in ascii) followed by data using that character set enclosed in {}, // followed by further content in another character set, and so on. // This example first creates such a message and then decomposes it. cout << "\nruntime_tagged_example:\n"; // We'll store the hybrid message in a std::string. string message = string("utf8{") + "El catalán, moneda lingüística" + "}" + "iso-8859-1{" + utf8_string("får årets Nobelpris i litteratur.").recode() + "}"; // + "ucs2{" + utf8_string("Директором СВР назначен Михаил Фрадков").recode() + "}"; // Now parse it into a list of run-time-tagged strings: typedef list strings_t; strings_t strings; string::const_iterator i = message.begin(); string::const_iterator e = message.end(); while (i != e) { string::const_iterator j = find(i,e,'{'); string charset_name(i,j); string::const_iterator k = find(j,e,'}'); string content(j+1,k); rt_tagged_string s(lookup_charset(charset_name),content); strings.push_back(s); i = k+1; } // Output the parsed strings, converting to UTF8 to do so: for (strings_t::const_iterator a = strings.begin(); a != strings.end(); ++a) { utf8_string u = a->recode(); cout << u << "\n"; } } // The following examples illustrate planned functionality that's not yet implemented: #if 1 #endif int main() { // These examples work: compile_time_tagged_strings_example(); utf8_const_iterator_example(); utf8_output_iterator_example(); utf8_word_split_example(); ucs4_line_wrap_example(); runtime_tagged_example(); // These examples don't yet work: #if 1 user_defined_charset_example(); #endif return 0; }