pandorafms/extras/anytermd/libpbe/examples/charsets.cc

263 lines
8.2 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// Example use of charset.hh in ../include/charset.hh.
//
// To compile, you need to make charset.hh accessible on the include path.
// Also, charset.hh includes my iconv wrapper, which is in ../include/Iconver.hh,
// and depends on other things in that directory; those dependencies could be
// made to go away.
// This source file should be viewed using a UTF8 editor, and its output when run should be
// viewed on a UTF8 terminal.
#include "charset.hh"
#include <iostream>
#include <algorithm>
#include <list>
using namespace pbe;
using namespace std;
void compile_time_tagged_strings_example()
{
// This example declares strings with compile-time-fixed character sets, converts
// them to other compile-time-fixed character sets, combines them, and checks for
// consistency:
cout << "\ncompile_time_tagged_strings_example:\n";
utf8_string french = "Le traité simplifié prêt à être soumis "
"à l'approbation des gouvernements";
latin1_string french_fixed = french.recode<latin1>();
utf8_string icelandic = "Smjörið er brætt og hveitið smátt og smátt hrært út í það";
latin1_string icelandic_fixed = icelandic.recode<latin1>();
utf8_string all = french + icelandic;
latin1_string all_fixed = french_fixed + icelandic_fixed;
if ((all.recode<latin1>() == all_fixed)
&& (all == all_fixed.recode<utf8>())) {
cout << "Pass, both strings are '" << all << "'\n";
}
}
void utf8_const_iterator_example()
{
// This example shows how a string with a variable-width
// character set can be iterated over character-at-a-time
// or "unit"-at-a-time.
cout << "\nutf8_const_iterator_example:\n";
utf8_string s = "Théâtre"; // My editor stores UTF8.
// Iterate "unit" (byte) at a time:
cout << "Here are the bytes of '" << s << "': " << hex;
for (utf8_string::const_iterator i = s.begin();
i != s.end(); ++i) {
char8_t c = *i;
cout << static_cast<unsigned int>(static_cast<uint8_t>(c)) << " ";
}
// Iterate character at a time:
cout << "\nHere are the characters of '" << s << "': ";
for (utf8_string::const_character_iterator i = s.begin();
i != utf8_string::const_character_iterator( s.end() ); ++i) {
utf8_char_t c = *i; // A 32-bit decoded Unicode character
cout << static_cast<unsigned int>(c) << " ";
}
cout << dec << "\n";
}
void utf8_output_iterator_example()
{
// This example shows how a string with a variable-width
// character set can be appended to using push_back and
// an output iterator.
cout << "\nutf8_output_iterator_example:\n";
utf8_string s;
for (utf8_char_t c=64; c<96; ++c) {
s.push_back(c);
}
utf8_string::character_output_iterator i(s);
for (utf8_char_t c=150; c<200; ++c) {
*i++ = c;
// s.push_back(c);
}
cout << "Unicode characters 64 to 95 and 150 to 199:\n"
<< s << "\n";
}
void utf8_word_split_example()
{
// This example demonstrates a case where a "unit" rather than a character iterator for a
// UTF8 string is useful: because bytes < 128 can only ever represent single characters in
// UTF8, we can treat a UTF8 string as a sequence of bytes when spliting at spaces.
cout << "\nutf8_word_split_example:\n";
utf8_string s = "Yo también quemo la Corona española";
utf8_string::const_iterator i = s.begin();
utf8_string::const_iterator e = s.end();
utf8_string::const_iterator j;
do {
j = find(i,e,' ');
utf8_string word(i,j);
cout << word << "\n";
i = j+1;
} while (j != e);
}
// Sometimes a random-access character iterator is needed, but an iso_8859 or
// similar byte character set can't be used because the characters in the
// content are not restricted.  In this case, ucs4 is normally the best choice -
// though its requirement for 4 bytes per character may be considered a
// disadvantage in memory-limited applications.
//
// This example uses random access to break a string into lines of <=40
// characters each by replacing suitable spaces with newlines.  A run of 40 or
// more characters with no space cannot be wrapped this way; such a run is left
// intact and the break is placed at the first space after it.
void ucs4_line_wrap_example()
{
  cout << "\nucs4_line_wrap_example:\n";

  utf8_string text_var = "Партия Единая Россия отказалась от формирования первой "
                         "тройки федерального списка - его возглавил только президент "
                         "Владимир Путин. Такое решение было принято на съезде Единой "
                         "России во вторник. Накануне президент России дал согласие "
                         "возглавить список Единой России на выборах в Госдуму.";
  ucs4_string text_fixed = text_var.recode<ucs4>();

  const unsigned int max_line_length = 40;
  unsigned int line_start = 0;  // Index of the first character of the current line.

  // Keep wrapping while the current line reaches the 40th column.
  while (line_start + (max_line_length - 1) < text_fixed.length()) {
    // Search backwards from the last permitted column for a space, but never
    // further back than the start of the current line: going further would
    // disturb lines that are already wrapped or run off the front of the text.
    unsigned int i = line_start + (max_line_length - 1);
    while (i > line_start && text_fixed[i] != ' ') {
      --i;
    }

    if (text_fixed[i] != ' ') {
      // No space anywhere in this line's window: look forwards instead.
      i = line_start + max_line_length;
      while (i < text_fixed.length() && text_fixed[i] != ' ') {
        ++i;
      }
      if (i >= text_fixed.length()) {
        break;  // No further break opportunity in the rest of the text.
      }
    }

    text_fixed[i] = '\n';
    line_start = i + 1;
  }

  cout << text_fixed.recode<utf8>() << "\n";
}
// This example shows how a library-user can make a new character set available.
// The example is the KOI8 character set, a fixed-width byte character set containing
// cyrillic and latin characters.
////// This section needs some attention from a preprocessor expert; I want to use
////// a counter of some sort to allocate new charset_t values with a macro:
////// PBE_DEFINE_CHARSET(koi8);
////// But I can't see a good way to do it. For the time being, I'll choose a value
////// manually:
// Identifier for the user-defined KOI8 character set.  The numeric value is
// picked by hand.
// NOTE(review): presumably it must not collide with any charset_t value that
// charset.hh already defines - confirm against that header.
const charset_t koi8 = static_cast<charset_t>(25);

// Define charset_traits for KOI8.  The specialisation has to live in namespace
// pbe, where the primary template is declared.  Both the storage unit and the
// character type are char8_t, i.e. one unit per character.
namespace pbe {
  template <>
  struct charset_traits<koi8> {
    typedef char8_t unit_t;
    typedef char8_t char_t;
  };
}

// String type tagged at compile time with the KOI8 character set.
typedef tagged_string<koi8> koi8_string;
void user_defined_charset_example()
{
charset_names[koi8] = "koi8";
cout << "\nuser_defined_charset_example:\n";
// We'll convert a string back and forth between utf8 and koi8:
utf8_string u = "Код Обмена Информацией, 8 бит";
koi8_string k = u.recode<koi8>();
utf8_string u2 = k.recode<utf8>();
// KOI8 is a more compact encoiding than UTF8 for cyrillic:
cout << "Length of UTF8 string = " << u2.length()
<< ", length of KOI8 string = " << k.length() << "\n";
}
void runtime_tagged_example()
{
// This example shows how character sets known only at run-time can be used.
// This is motivated by multipart MIME email, where each part can have a different
// character set. But since MIME is rather complex to parse, this example uses
// the following simpler format: the input byte sequence consists of a character
// set name (in ascii) followed by data using that character set enclosed in {},
// followed by further content in another character set, and so on.
// This example first creates such a message and then decomposes it.
cout << "\nruntime_tagged_example:\n";
// We'll store the hybrid message in a std::string.
string message =
string("utf8{") + "El catalán, moneda lingüística" + "}"
+ "iso-8859-1{" + utf8_string("får årets Nobelpris i litteratur.").recode<latin1>() + "}";
// + "ucs2{" + utf8_string("Директором СВР назначен Михаил Фрадков").recode<ucs2>() + "}";
// Now parse it into a list of run-time-tagged strings:
typedef list<rt_tagged_string> strings_t;
strings_t strings;
string::const_iterator i = message.begin();
string::const_iterator e = message.end();
while (i != e) {
string::const_iterator j = find(i,e,'{');
string charset_name(i,j);
string::const_iterator k = find(j,e,'}');
string content(j+1,k);
rt_tagged_string s(lookup_charset(charset_name),content);
strings.push_back(s);
i = k+1;
}
// Output the parsed strings, converting to UTF8 to do so:
for (strings_t::const_iterator a = strings.begin();
a != strings.end(); ++a) {
utf8_string u = a->recode<utf8>();
cout << u << "\n";
}
}
// The following examples illustrate planned functionality that's not yet implemented:
#if 1
#endif
// Runs every example in turn and returns 0.
int main()
{
  typedef void (*example_fn)();

  // Examples in the order they are run.  The last entry sits behind a
  // preprocessor switch (currently enabled) so that it can be compiled out.
  const example_fn examples[] = {
    compile_time_tagged_strings_example,
    utf8_const_iterator_example,
    utf8_output_iterator_example,
    utf8_word_split_example,
    ucs4_line_wrap_example,
    runtime_tagged_example,
#if 1
    user_defined_charset_example,
#endif
  };
  const unsigned int example_count = sizeof(examples) / sizeof(examples[0]);

  for (unsigned int n = 0; n < example_count; ++n) {
    examples[n]();
  }
  return 0;
}