[NEW_FEATURE] Detect UTF-16 encoding (big-endian and little-endian) without a BOM.
git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@473 f5eea248-9336-0410-98b8-ebc06183d4e3
This commit is contained in:
parent
10aa27cfe5
commit
9867d7b578
Binary file not shown.
|
@ -5492,9 +5492,14 @@ void Notepad_plus::checkUnicodeMenuItems(UniMode um) const
|
||||||
case uni16BE : id = IDM_FORMAT_UCS_2BE; break;
|
case uni16BE : id = IDM_FORMAT_UCS_2BE; break;
|
||||||
case uni16LE : id = IDM_FORMAT_UCS_2LE; break;
|
case uni16LE : id = IDM_FORMAT_UCS_2LE; break;
|
||||||
case uniCookie : id = IDM_FORMAT_AS_UTF_8; break;
|
case uniCookie : id = IDM_FORMAT_AS_UTF_8; break;
|
||||||
default :
|
case uni8Bit : id = IDM_FORMAT_ANSI; break;
|
||||||
id = IDM_FORMAT_ANSI;
|
|
||||||
}
|
}
|
||||||
|
if (id == -1) //um == uni16BE_NoBOM || um == uni16LE_NoBOM
|
||||||
|
{
|
||||||
|
::CheckMenuRadioItem(_mainMenuHandle, IDM_FORMAT_ANSI, IDM_FORMAT_AS_UTF_8, IDM_FORMAT_ANSI, MF_BYCOMMAND);
|
||||||
|
::CheckMenuItem(_mainMenuHandle, IDM_FORMAT_ANSI, MF_UNCHECKED | MF_BYCOMMAND);
|
||||||
|
}
|
||||||
|
else
|
||||||
::CheckMenuRadioItem(_mainMenuHandle, IDM_FORMAT_ANSI, IDM_FORMAT_AS_UTF_8, id, MF_BYCOMMAND);
|
::CheckMenuRadioItem(_mainMenuHandle, IDM_FORMAT_ANSI, IDM_FORMAT_AS_UTF_8, id, MF_BYCOMMAND);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -522,7 +522,11 @@ private:
|
||||||
case uni16BE:
|
case uni16BE:
|
||||||
uniModeText = TEXT("UCS-2 Big Endian"); break;
|
uniModeText = TEXT("UCS-2 Big Endian"); break;
|
||||||
case uni16LE:
|
case uni16LE:
|
||||||
uniModeText = TEXT("UCS-2 little Endian"); break;
|
uniModeText = TEXT("UCS-2 Little Endian"); break;
|
||||||
|
case uni16BE_NoBOM:
|
||||||
|
uniModeText = TEXT("UCS-2 BE w/o BOM"); break;
|
||||||
|
case uni16LE_NoBOM:
|
||||||
|
uniModeText = TEXT("UCS-2 LE w/o BOM"); break;
|
||||||
case uniCookie:
|
case uniCookie:
|
||||||
uniModeText = TEXT("ANSI as UTF-8"); break;
|
uniModeText = TEXT("ANSI as UTF-8"); break;
|
||||||
default :
|
default :
|
||||||
|
|
|
@ -56,7 +56,7 @@ const int TAB_MULTILINE = 128; // 1000 0000
|
||||||
const int TAB_HIDE = 256; //1 0000 0000
|
const int TAB_HIDE = 256; //1 0000 0000
|
||||||
|
|
||||||
enum formatType {WIN_FORMAT, MAC_FORMAT, UNIX_FORMAT};
|
enum formatType {WIN_FORMAT, MAC_FORMAT, UNIX_FORMAT};
|
||||||
enum UniMode {uni8Bit=0, uniUTF8=1, uni16BE=2, uni16LE=3, uniCookie=4, uni7Bit=5, uniEnd};
|
enum UniMode {uni8Bit=0, uniUTF8=1, uni16BE=2, uni16LE=3, uniCookie=4, uni7Bit=5, uni16BE_NoBOM=6, uni16LE_NoBOM=7, uniEnd};
|
||||||
enum ChangeDetect {cdDisabled=0, cdEnabled=1, cdAutoUpdate=2, cdGo2end=3, cdAutoUpdateGo2end=4};
|
enum ChangeDetect {cdDisabled=0, cdEnabled=1, cdAutoUpdate=2, cdGo2end=3, cdAutoUpdateGo2end=4};
|
||||||
enum BackupFeature {bak_none = 0, bak_simple = 1, bak_verbose = 2};
|
enum BackupFeature {bak_none = 0, bak_simple = 1, bak_verbose = 2};
|
||||||
enum OpenSaveDirSetting {dir_followCurrent = 0, dir_last = 1, dir_userDef = 2};
|
enum OpenSaveDirSetting {dir_followCurrent = 0, dir_last = 1, dir_userDef = 2};
|
||||||
|
|
|
@ -41,7 +41,7 @@ Utf8_16_Read::Utf8_16_Read() {
|
||||||
|
|
||||||
Utf8_16_Read::~Utf8_16_Read()
|
Utf8_16_Read::~Utf8_16_Read()
|
||||||
{
|
{
|
||||||
if ((m_eEncoding == uni16BE) || (m_eEncoding == uni16LE))
|
if ((m_eEncoding == uni16BE) || (m_eEncoding == uni16LE) || (m_eEncoding == uni16BE_NoBOM) || (m_eEncoding == uni16LE_NoBOM))
|
||||||
{
|
{
|
||||||
delete [] m_pNewBuf;
|
delete [] m_pNewBuf;
|
||||||
m_pNewBuf = NULL;
|
m_pNewBuf = NULL;
|
||||||
|
@ -146,6 +146,8 @@ size_t Utf8_16_Read::convert(char* buf, size_t len)
|
||||||
ret = len - nSkip;
|
ret = len - nSkip;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
case uni16BE_NoBOM:
|
||||||
|
case uni16LE_NoBOM:
|
||||||
case uni16BE:
|
case uni16BE:
|
||||||
case uni16LE: {
|
case uni16LE: {
|
||||||
size_t newSize = len + len / 2 + 1;
|
size_t newSize = len + len / 2 + 1;
|
||||||
|
@ -186,22 +188,37 @@ void Utf8_16_Read::determineEncoding()
|
||||||
m_eEncoding = uni8Bit;
|
m_eEncoding = uni8Bit;
|
||||||
m_nSkip = 0;
|
m_nSkip = 0;
|
||||||
|
|
||||||
|
// detect UTF-16 big-endian with BOM
|
||||||
if (m_nLen > 1 && m_pBuf[0] == k_Boms[uni16BE][0] && m_pBuf[1] == k_Boms[uni16BE][1])
|
if (m_nLen > 1 && m_pBuf[0] == k_Boms[uni16BE][0] && m_pBuf[1] == k_Boms[uni16BE][1])
|
||||||
{
|
{
|
||||||
m_eEncoding = uni16BE;
|
m_eEncoding = uni16BE;
|
||||||
m_nSkip = 2;
|
m_nSkip = 2;
|
||||||
}
|
}
|
||||||
|
// detect UTF-16 little-endian with BOM
|
||||||
else if (m_nLen > 1 && m_pBuf[0] == k_Boms[uni16LE][0] && m_pBuf[1] == k_Boms[uni16LE][1])
|
else if (m_nLen > 1 && m_pBuf[0] == k_Boms[uni16LE][0] && m_pBuf[1] == k_Boms[uni16LE][1])
|
||||||
{
|
{
|
||||||
m_eEncoding = uni16LE;
|
m_eEncoding = uni16LE;
|
||||||
m_nSkip = 2;
|
m_nSkip = 2;
|
||||||
}
|
}
|
||||||
|
// detect UTF-8 with BOM
|
||||||
else if (m_nLen > 2 && m_pBuf[0] == k_Boms[uniUTF8][0] &&
|
else if (m_nLen > 2 && m_pBuf[0] == k_Boms[uniUTF8][0] &&
|
||||||
m_pBuf[1] == k_Boms[uniUTF8][1] && m_pBuf[2] == k_Boms[uniUTF8][2])
|
m_pBuf[1] == k_Boms[uniUTF8][1] && m_pBuf[2] == k_Boms[uniUTF8][2])
|
||||||
{
|
{
|
||||||
m_eEncoding = uniUTF8;
|
m_eEncoding = uniUTF8;
|
||||||
m_nSkip = 3;
|
m_nSkip = 3;
|
||||||
}
|
}
|
||||||
|
// try to detect UTF-16 little-endian without BOM
|
||||||
|
else if (m_nLen > 1 && m_pBuf[0] != NULL && m_pBuf[1] == NULL && IsTextUnicode(m_pBuf, m_nLen, NULL))
|
||||||
|
{
|
||||||
|
m_eEncoding = uni16LE_NoBOM;
|
||||||
|
m_nSkip = 0;
|
||||||
|
}
|
||||||
|
// try to detect UTF-16 big-endian without BOM
|
||||||
|
else if (m_nLen > 1 && m_pBuf[0] == NULL && m_pBuf[1] != NULL)
|
||||||
|
{
|
||||||
|
m_eEncoding = uni16BE_NoBOM;
|
||||||
|
m_nSkip = 0;
|
||||||
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
u78 detectedEncoding = utf8_7bits_8bits();
|
u78 detectedEncoding = utf8_7bits_8bits();
|
||||||
|
@ -281,6 +298,8 @@ size_t Utf8_16_Write::fwrite(const void* p, size_t _size)
|
||||||
ret = ::fwrite(p, _size, 1, m_pFile);
|
ret = ::fwrite(p, _size, 1, m_pFile);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
case uni16BE_NoBOM:
|
||||||
|
case uni16LE_NoBOM:
|
||||||
case uni16BE:
|
case uni16BE:
|
||||||
case uni16LE: {
|
case uni16LE: {
|
||||||
if (_size > m_nBufSize)
|
if (_size > m_nBufSize)
|
||||||
|
@ -338,6 +357,8 @@ size_t Utf8_16_Write::convert(char* p, size_t _size)
|
||||||
memcpy(&m_pNewBuf[3], p, _size);
|
memcpy(&m_pNewBuf[3], p, _size);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
case uni16BE_NoBOM:
|
||||||
|
case uni16LE_NoBOM:
|
||||||
case uni16BE:
|
case uni16BE:
|
||||||
case uni16LE: {
|
case uni16LE: {
|
||||||
m_pNewBuf = (ubyte*)new ubyte[sizeof(utf16) * (_size + 1)];
|
m_pNewBuf = (ubyte*)new ubyte[sizeof(utf16) * (_size + 1)];
|
||||||
|
@ -442,7 +463,7 @@ void Utf8_Iter::operator++()
|
||||||
void Utf8_Iter::toStart()
|
void Utf8_Iter::toStart()
|
||||||
{
|
{
|
||||||
m_eState = eStart;
|
m_eState = eStart;
|
||||||
if (m_eEncoding == uni16BE)
|
if (m_eEncoding == uni16BE || m_eEncoding == uni16BE_NoBOM)
|
||||||
{
|
{
|
||||||
swap();
|
swap();
|
||||||
}
|
}
|
||||||
|
@ -492,10 +513,13 @@ void Utf16_Iter::operator++()
|
||||||
switch (m_eState)
|
switch (m_eState)
|
||||||
{
|
{
|
||||||
case eStart:
|
case eStart:
|
||||||
if (m_eEncoding == uni16LE) {
|
if (m_eEncoding == uni16LE || m_eEncoding == uni16LE_NoBOM)
|
||||||
|
{
|
||||||
m_nCur16 = *m_pRead++;
|
m_nCur16 = *m_pRead++;
|
||||||
m_nCur16 |= static_cast<utf16>(*m_pRead << 8);
|
m_nCur16 |= static_cast<utf16>(*m_pRead << 8);
|
||||||
} else {
|
}
|
||||||
|
else //(m_eEncoding == uni16BE || m_eEncoding == uni16BE_NoBOM)
|
||||||
|
{
|
||||||
m_nCur16 = static_cast<utf16>(*m_pRead++ << 8);
|
m_nCur16 = static_cast<utf16>(*m_pRead++ << 8);
|
||||||
m_nCur16 |= *m_pRead;
|
m_nCur16 |= *m_pRead;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue