// Example use of charset.hh in ../include/charset.hh.
//
// To compile, you need to make charset.hh accessible on the include path.
// Also, charset.hh includes my iconv wrapper, which is in ../include/Incover.hh,
// and depends on other things in that directory; those dependencies could be
// made to go away.

// This source file should be viewed using a UTF8 editor, and its output when run should be
// viewed on a UTF8 terminal.

#include "charset.hh"
|
|||
|
|
|||
|
#include <iostream>
|
|||
|
#include <algorithm>
|
|||
|
#include <list>
|
|||
|
|
|||
|
using namespace pbe;
|
|||
|
using namespace std;
|
|||
|
|
|||
|
|
|||
|
void compile_time_tagged_strings_example()
|
|||
|
{
|
|||
|
// This example declares strings with compile-time-fixed character sets, converts
|
|||
|
// them to other compile-time-fixed character sets, combines them, and checks for
|
|||
|
// consistency:
|
|||
|
|
|||
|
cout << "\ncompile_time_tagged_strings_example:\n";
|
|||
|
|
|||
|
utf8_string french = "Le traité simplifié prêt à être soumis "
|
|||
|
"à l'approbation des gouvernements";
|
|||
|
latin1_string french_fixed = french.recode<latin1>();
|
|||
|
|
|||
|
utf8_string icelandic = "Smjörið er brætt og hveitið smátt og smátt hrært út í það";
|
|||
|
latin1_string icelandic_fixed = icelandic.recode<latin1>();
|
|||
|
|
|||
|
utf8_string all = french + icelandic;
|
|||
|
|
|||
|
latin1_string all_fixed = french_fixed + icelandic_fixed;
|
|||
|
|
|||
|
if ((all.recode<latin1>() == all_fixed)
|
|||
|
&& (all == all_fixed.recode<utf8>())) {
|
|||
|
cout << "Pass, both strings are '" << all << "'\n";
|
|||
|
}
|
|||
|
}
|
|||
|
|
|||
|
|
|||
|
void utf8_const_iterator_example()
|
|||
|
{
|
|||
|
// This example shows how a string with a variable-width
|
|||
|
// character set can be iterated over character-at-a-time
|
|||
|
// or "unit"-at-a-time.
|
|||
|
|
|||
|
cout << "\nutf8_const_iterator_example:\n";
|
|||
|
|
|||
|
utf8_string s = "Théâtre"; // My editor stores UTF8.
|
|||
|
|
|||
|
// Iterate "unit" (byte) at a time:
|
|||
|
cout << "Here are the bytes of '" << s << "': " << hex;
|
|||
|
for (utf8_string::const_iterator i = s.begin();
|
|||
|
i != s.end(); ++i) {
|
|||
|
char8_t c = *i;
|
|||
|
cout << static_cast<unsigned int>(static_cast<uint8_t>(c)) << " ";
|
|||
|
}
|
|||
|
|
|||
|
// Iterate character at a time:
|
|||
|
cout << "\nHere are the characters of '" << s << "': ";
|
|||
|
for (utf8_string::const_character_iterator i = s.begin();
|
|||
|
i != utf8_string::const_character_iterator( s.end() ); ++i) {
|
|||
|
utf8_char_t c = *i; // A 32-bit decoded Unicode character
|
|||
|
cout << static_cast<unsigned int>(c) << " ";
|
|||
|
}
|
|||
|
cout << dec << "\n";
|
|||
|
}
|
|||
|
|
|||
|
|
|||
|
void utf8_output_iterator_example()
|
|||
|
{
|
|||
|
// This example shows how a string with a variable-width
|
|||
|
// character set can be appended to using push_back and
|
|||
|
// an output iterator.
|
|||
|
|
|||
|
cout << "\nutf8_output_iterator_example:\n";
|
|||
|
|
|||
|
utf8_string s;
|
|||
|
|
|||
|
for (utf8_char_t c=64; c<96; ++c) {
|
|||
|
s.push_back(c);
|
|||
|
}
|
|||
|
|
|||
|
utf8_string::character_output_iterator i(s);
|
|||
|
|
|||
|
for (utf8_char_t c=150; c<200; ++c) {
|
|||
|
*i++ = c;
|
|||
|
// s.push_back(c);
|
|||
|
}
|
|||
|
|
|||
|
cout << "Unicode characters 64 to 95 and 150 to 199:\n"
|
|||
|
<< s << "\n";
|
|||
|
}
|
|||
|
|
|||
|
|
|||
|
void utf8_word_split_example()
|
|||
|
{
|
|||
|
// This example demonstrates a case where a "unit" rather than a character iterator for a
|
|||
|
// UTF8 string is useful: because bytes < 128 can only ever represent single characters in
|
|||
|
// UTF8, we can treat a UTF8 string as a sequence of bytes when spliting at spaces.
|
|||
|
|
|||
|
cout << "\nutf8_word_split_example:\n";
|
|||
|
|
|||
|
utf8_string s = "Yo también quemo la Corona española";
|
|||
|
utf8_string::const_iterator i = s.begin();
|
|||
|
utf8_string::const_iterator e = s.end();
|
|||
|
utf8_string::const_iterator j;
|
|||
|
do {
|
|||
|
j = find(i,e,' ');
|
|||
|
utf8_string word(i,j);
|
|||
|
cout << word << "\n";
|
|||
|
i = j+1;
|
|||
|
} while (j != e);
|
|||
|
}
|
|||
|
|
|||
|
|
|||
|
void ucs4_line_wrap_example()
|
|||
|
{
|
|||
|
// Sometimes a random-access character iterator is needed, but an iso_8859 or similar byte
|
|||
|
// character set can't be used because the characters in the content are not restricted.
|
|||
|
// In this case, ucs4 is normally the best choice - though its requirement for 4 bytes per
|
|||
|
// character may be considered a disadvantage in memory-limited applications.
|
|||
|
// This example uses random access to break a string into lines of <=40 characters each.
|
|||
|
|
|||
|
cout << "\nucs4_line_wrap_example:\n";
|
|||
|
|
|||
|
utf8_string text_var = "Партия Единая Россия отказалась от формирования первой "
|
|||
|
"тройки федерального списка - его возглавил только президент "
|
|||
|
"Владимир Путин. Такое решение было принято на съезде Единой "
|
|||
|
"России во вторник. Накануне президент России дал согласие "
|
|||
|
"возглавить список Единой России на выборах в Госдуму.";
|
|||
|
|
|||
|
ucs4_string text_fixed = text_var.recode<ucs4>();
|
|||
|
|
|||
|
for (unsigned int i=39; i<text_fixed.length(); i+=40) {
|
|||
|
while (text_fixed[i]!=' ') {
|
|||
|
--i;
|
|||
|
}
|
|||
|
text_fixed[i] = '\n';
|
|||
|
}
|
|||
|
|
|||
|
cout << text_fixed.recode<utf8>() << "\n";
|
|||
|
}
|
|||
|
|
|||
|
|
|||
|
// This example shows how a library-user can make a new character set available.
|
|||
|
// The example is the KOI8 character set, a fixed-width byte character set containing
|
|||
|
// cyrillic and latin characters.
|
|||
|
|
|||
|
////// This section needs some attention from a preprocessor expert; I want to use
|
|||
|
////// a counter of some sort to allocate new charset_t values with a macro:
|
|||
|
////// PBE_DEFINE_CHARSET(koi8);
|
|||
|
////// But I can't see a good way to do it. For the time being, I'll choose a value
|
|||
|
////// manually:
|
|||
|
const charset_t koi8 = static_cast<charset_t>(25);
|
|||
|
|
|||
|
// Define charset_traits for KOI8:
|
|||
|
namespace pbe {
|
|||
|
template <>
|
|||
|
struct charset_traits<koi8> {
|
|||
|
typedef char8_t unit_t;
|
|||
|
typedef char8_t char_t;
|
|||
|
};
|
|||
|
};
|
|||
|
typedef tagged_string<koi8> koi8_string;
|
|||
|
|
|||
|
void user_defined_charset_example()
|
|||
|
{
|
|||
|
charset_names[koi8] = "koi8";
|
|||
|
|
|||
|
cout << "\nuser_defined_charset_example:\n";
|
|||
|
|
|||
|
// We'll convert a string back and forth between utf8 and koi8:
|
|||
|
utf8_string u = "Код Обмена Информацией, 8 бит";
|
|||
|
koi8_string k = u.recode<koi8>();
|
|||
|
utf8_string u2 = k.recode<utf8>();
|
|||
|
|
|||
|
// KOI8 is a more compact encoiding than UTF8 for cyrillic:
|
|||
|
cout << "Length of UTF8 string = " << u2.length()
|
|||
|
<< ", length of KOI8 string = " << k.length() << "\n";
|
|||
|
}
|
|||
|
|
|||
|
|
|||
|
void runtime_tagged_example()
|
|||
|
{
|
|||
|
// This example shows how character sets known only at run-time can be used.
|
|||
|
// This is motivated by multipart MIME email, where each part can have a different
|
|||
|
// character set. But since MIME is rather complex to parse, this example uses
|
|||
|
// the following simpler format: the input byte sequence consists of a character
|
|||
|
// set name (in ascii) followed by data using that character set enclosed in {},
|
|||
|
// followed by further content in another character set, and so on.
|
|||
|
// This example first creates such a message and then decomposes it.
|
|||
|
|
|||
|
cout << "\nruntime_tagged_example:\n";
|
|||
|
|
|||
|
// We'll store the hybrid message in a std::string.
|
|||
|
string message =
|
|||
|
string("utf8{") + "El catalán, moneda lingüística" + "}"
|
|||
|
+ "iso-8859-1{" + utf8_string("får årets Nobelpris i litteratur.").recode<latin1>() + "}";
|
|||
|
// + "ucs2{" + utf8_string("Директором СВР назначен Михаил Фрадков").recode<ucs2>() + "}";
|
|||
|
|
|||
|
// Now parse it into a list of run-time-tagged strings:
|
|||
|
typedef list<rt_tagged_string> strings_t;
|
|||
|
strings_t strings;
|
|||
|
string::const_iterator i = message.begin();
|
|||
|
string::const_iterator e = message.end();
|
|||
|
while (i != e) {
|
|||
|
string::const_iterator j = find(i,e,'{');
|
|||
|
string charset_name(i,j);
|
|||
|
string::const_iterator k = find(j,e,'}');
|
|||
|
string content(j+1,k);
|
|||
|
rt_tagged_string s(lookup_charset(charset_name),content);
|
|||
|
strings.push_back(s);
|
|||
|
i = k+1;
|
|||
|
}
|
|||
|
|
|||
|
// Output the parsed strings, converting to UTF8 to do so:
|
|||
|
for (strings_t::const_iterator a = strings.begin();
|
|||
|
a != strings.end(); ++a) {
|
|||
|
utf8_string u = a->recode<utf8>();
|
|||
|
cout << u << "\n";
|
|||
|
}
|
|||
|
|
|||
|
}
|
|||
|
|
|||
|
|
|||
|
|
|||
|
// The following examples illustrate planned functionality that's not yet implemented:
|
|||
|
|
|||
|
#if 1
|
|||
|
|
|||
|
#endif
|
|||
|
|
|||
|
|
|||
|
|
|||
|
int main()
|
|||
|
{
|
|||
|
// These examples work:
|
|||
|
compile_time_tagged_strings_example();
|
|||
|
utf8_const_iterator_example();
|
|||
|
utf8_output_iterator_example();
|
|||
|
utf8_word_split_example();
|
|||
|
ucs4_line_wrap_example();
|
|||
|
|
|||
|
runtime_tagged_example();
|
|||
|
|
|||
|
// These examples don't yet work:
|
|||
|
#if 1
|
|||
|
user_defined_charset_example();
|
|||
|
#endif
|
|||
|
|
|||
|
return 0;
|
|||
|
}
|
|||
|
|