2009-04-25 01:34:47 +02:00
|
|
|
// Utf8_16.h
|
|
|
|
// Copyright (C) 2002 Scott Kirkwood
|
|
|
|
//
|
|
|
|
// Permission to use, copy, modify, distribute and sell this code
|
|
|
|
// and its documentation for any purpose is hereby granted without fee,
|
|
|
|
// provided that the above copyright notice appear in all copies or
|
|
|
|
// any derived copies. Scott Kirkwood makes no representations
|
|
|
|
// about the suitability of this software for any purpose.
|
|
|
|
// It is provided "as is" without express or implied warranty.
|
|
|
|
//
|
|
|
|
// Notes: Used the UTF information I found at:
|
|
|
|
// http://www.cl.cam.ac.uk/~mgk25/unicode.html
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
//
|
|
|
|
// Modified 2006 Jens Lorenz
|
|
|
|
//
|
|
|
|
// - Clean up the sources
|
|
|
|
// - Removing UCS-Bug in Utf8_Iter
|
|
|
|
// - Add convert function in Utf8_16_Write
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
2009-09-04 02:10:01 +02:00
|
|
|
|
2009-04-25 01:34:47 +02:00
|
|
|
#pragma once
|
|
|
|
|
|
|
|
#include "Parameters.h"
|
|
|
|
|
|
|
|
#ifdef _MSC_VER
|
|
|
|
#pragma warning(disable: 4514) // unreferenced inline function has been removed
|
|
|
|
#endif
|
|
|
|
|
2019-09-19 13:24:58 +02:00
|
|
|
#include <memory>
|
|
|
|
#include "FileInterface.h"
|
|
|
|
|
|
|
|
|
2009-04-25 01:34:47 +02:00
|
|
|
class Utf8_16 {
|
|
|
|
public:
|
|
|
|
typedef unsigned short utf16; // 16 bits
|
|
|
|
typedef UCHAR utf8; // 8 bits
|
|
|
|
typedef UCHAR ubyte;
|
|
|
|
static const utf8 k_Boms[uniEnd][3];
|
|
|
|
};
|
|
|
|
|
|
|
|
// Reads UTF-16 and outputs UTF-8
|
|
|
|
class Utf16_Iter : public Utf8_16 {
|
|
|
|
public:
|
|
|
|
enum eState {
|
|
|
|
eStart,
|
2021-03-03 12:45:24 +01:00
|
|
|
eSurrogate
|
2009-04-25 01:34:47 +02:00
|
|
|
};
|
|
|
|
|
|
|
|
Utf16_Iter();
|
|
|
|
void reset();
|
|
|
|
void set(const ubyte* pBuf, size_t nLen, UniMode eEncoding);
|
2021-03-03 12:45:24 +01:00
|
|
|
bool get(utf8 *c);
|
2009-04-25 01:34:47 +02:00
|
|
|
void operator++();
|
|
|
|
eState getState() { return m_eState; };
|
2021-04-24 11:02:50 +02:00
|
|
|
operator bool() { return (m_pRead < m_pEnd) || (m_out1st != m_outLst); };
|
2009-04-25 01:34:47 +02:00
|
|
|
|
|
|
|
protected:
|
2021-03-03 12:45:24 +01:00
|
|
|
void read();
|
|
|
|
void pushout(ubyte c);
|
2009-04-25 01:34:47 +02:00
|
|
|
|
|
|
|
protected:
|
|
|
|
UniMode m_eEncoding;
|
|
|
|
eState m_eState;
|
2021-03-03 12:45:24 +01:00
|
|
|
utf8 m_out [16];
|
|
|
|
int m_out1st;
|
|
|
|
int m_outLst;
|
2009-04-25 01:34:47 +02:00
|
|
|
utf16 m_nCur16;
|
2021-03-03 12:45:24 +01:00
|
|
|
utf16 m_highSurrogate;
|
2009-04-25 01:34:47 +02:00
|
|
|
const ubyte* m_pBuf;
|
|
|
|
const ubyte* m_pRead;
|
|
|
|
const ubyte* m_pEnd;
|
|
|
|
};
|
|
|
|
|
|
|
|
// Reads UTF-8 and outputs UTF-16
|
|
|
|
class Utf8_Iter : public Utf8_16 {
|
|
|
|
public:
|
|
|
|
Utf8_Iter();
|
|
|
|
void reset();
|
|
|
|
void set(const ubyte* pBuf, size_t nLen, UniMode eEncoding);
|
2021-03-03 12:45:24 +01:00
|
|
|
bool get(utf16* c);
|
|
|
|
bool canGet() const { return m_out1st != m_outLst; }
|
|
|
|
void toStart();
|
2009-04-25 01:34:47 +02:00
|
|
|
void operator++();
|
2021-04-24 11:02:50 +02:00
|
|
|
operator bool() { return (m_pRead < m_pEnd) || (m_out1st != m_outLst); }
|
2009-04-25 01:34:47 +02:00
|
|
|
|
|
|
|
protected:
|
2021-03-03 12:45:24 +01:00
|
|
|
enum eState {eStart, eFollow};
|
|
|
|
void pushout(utf16 c);
|
2009-04-25 01:34:47 +02:00
|
|
|
protected:
|
|
|
|
UniMode m_eEncoding;
|
|
|
|
eState m_eState;
|
2021-03-03 12:45:24 +01:00
|
|
|
int m_code;
|
|
|
|
int m_count;
|
|
|
|
utf16 m_out [4];
|
|
|
|
int m_out1st, m_outLst;
|
2009-04-25 01:34:47 +02:00
|
|
|
const ubyte* m_pBuf;
|
|
|
|
const ubyte* m_pRead;
|
|
|
|
const ubyte* m_pEnd;
|
|
|
|
};
|
|
|
|
|
|
|
|
// Reads UTF16 and outputs UTF8
|
|
|
|
// Result type of Utf8_16_Read::utf8_7bits_8bits().
// The numeric values are spelled out explicitly and must not change.
enum u78 {
	utf8NoBOM  = 0,
	ascii7bits = 1,
	ascii8bits = 2
};
|
|
|
|
class Utf8_16_Read : public Utf8_16 {
|
|
|
|
public:
|
2022-02-09 04:40:16 +01:00
|
|
|
Utf8_16_Read() {};
|
2009-04-25 01:34:47 +02:00
|
|
|
~Utf8_16_Read();
|
|
|
|
|
|
|
|
size_t convert(char* buf, size_t len);
|
2015-05-30 19:47:44 +02:00
|
|
|
const char* getNewBuf() const { return (const char*) m_pNewBuf; }
|
|
|
|
size_t getNewSize() const { return m_nNewBufSize; }
|
2009-04-25 01:34:47 +02:00
|
|
|
|
|
|
|
UniMode getEncoding() const { return m_eEncoding; }
|
2016-06-05 20:29:21 +02:00
|
|
|
static UniMode determineEncoding(const unsigned char *buf, size_t bufLen);
|
2010-01-18 00:13:06 +01:00
|
|
|
|
2009-04-25 01:34:47 +02:00
|
|
|
protected:
|
|
|
|
void determineEncoding();
|
2010-01-18 00:13:06 +01:00
|
|
|
|
2009-04-25 01:34:47 +02:00
|
|
|
u78 utf8_7bits_8bits();
|
|
|
|
private:
|
2022-02-09 04:40:16 +01:00
|
|
|
UniMode m_eEncoding = uni8Bit;
|
|
|
|
ubyte* m_pBuf = nullptr;
|
|
|
|
ubyte* m_pNewBuf = nullptr;
|
2015-05-30 19:47:44 +02:00
|
|
|
// size of the new buffer
|
2022-02-09 04:40:16 +01:00
|
|
|
size_t m_nNewBufSize = 0;
|
2015-05-30 19:47:44 +02:00
|
|
|
// size of the previously allocated buffer (if != 0)
|
2022-02-09 04:40:16 +01:00
|
|
|
size_t m_nAllocatedBufSize = 0;
|
|
|
|
size_t m_nSkip = 0;
|
|
|
|
bool m_bFirstRead = true;
|
|
|
|
size_t m_nLen = 0;
|
2009-04-25 01:34:47 +02:00
|
|
|
Utf16_Iter m_Iter16;
|
|
|
|
};
|
|
|
|
|
|
|
|
// Read in a UTF-8 buffer and write out to UTF-16 or UTF-8
|
|
|
|
class Utf8_16_Write : public Utf8_16 {
|
|
|
|
public:
|
|
|
|
Utf8_16_Write();
|
|
|
|
~Utf8_16_Write();
|
|
|
|
|
|
|
|
void setEncoding(UniMode eType);
|
|
|
|
|
2021-10-01 17:39:05 +02:00
|
|
|
bool openFile(const TCHAR *name);
|
2022-12-31 03:29:40 +01:00
|
|
|
bool writeFile(const void* p, size_t _size);
|
2021-10-01 17:39:05 +02:00
|
|
|
void closeFile();
|
2009-04-25 01:34:47 +02:00
|
|
|
|
|
|
|
size_t convert(char* p, size_t _size);
|
|
|
|
char* getNewBuf() { return reinterpret_cast<char*>(m_pNewBuf); }
|
|
|
|
|
|
|
|
protected:
|
|
|
|
UniMode m_eEncoding;
|
2021-10-01 17:39:05 +02:00
|
|
|
std::unique_ptr<Win32_IO_File> m_pFile;
|
2009-04-25 01:34:47 +02:00
|
|
|
ubyte* m_pNewBuf;
|
|
|
|
size_t m_nBufSize;
|
|
|
|
bool m_bFirstWrite;
|
|
|
|
};
|
2009-09-04 02:10:01 +02:00
|
|
|
|