[NEW_FEATURE] Detect UTF-16 encoding (BE and LE) without BOM.

git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@473 f5eea248-9336-0410-98b8-ebc06183d4e3
This commit is contained in:
Don Ho 2009-04-29 18:07:30 +00:00
parent 10aa27cfe5
commit 9867d7b578
5 changed files with 41 additions and 8 deletions

Binary file not shown.

View File

@ -5492,9 +5492,14 @@ void Notepad_plus::checkUnicodeMenuItems(UniMode um) const
case uni16BE : id = IDM_FORMAT_UCS_2BE; break; case uni16BE : id = IDM_FORMAT_UCS_2BE; break;
case uni16LE : id = IDM_FORMAT_UCS_2LE; break; case uni16LE : id = IDM_FORMAT_UCS_2LE; break;
case uniCookie : id = IDM_FORMAT_AS_UTF_8; break; case uniCookie : id = IDM_FORMAT_AS_UTF_8; break;
default : case uni8Bit : id = IDM_FORMAT_ANSI; break;
id = IDM_FORMAT_ANSI;
} }
if (id == -1) //um == uni16BE_NoBOM || um == uni16LE_NoBOM
{
::CheckMenuRadioItem(_mainMenuHandle, IDM_FORMAT_ANSI, IDM_FORMAT_AS_UTF_8, IDM_FORMAT_ANSI, MF_BYCOMMAND);
::CheckMenuItem(_mainMenuHandle, IDM_FORMAT_ANSI, MF_UNCHECKED | MF_BYCOMMAND);
}
else
::CheckMenuRadioItem(_mainMenuHandle, IDM_FORMAT_ANSI, IDM_FORMAT_AS_UTF_8, id, MF_BYCOMMAND); ::CheckMenuRadioItem(_mainMenuHandle, IDM_FORMAT_ANSI, IDM_FORMAT_AS_UTF_8, id, MF_BYCOMMAND);
} }

View File

@ -522,7 +522,11 @@ private:
case uni16BE: case uni16BE:
uniModeText = TEXT("UCS-2 Big Endian"); break; uniModeText = TEXT("UCS-2 Big Endian"); break;
case uni16LE: case uni16LE:
uniModeText = TEXT("UCS-2 little Endian"); break; uniModeText = TEXT("UCS-2 Little Endian"); break;
case uni16BE_NoBOM:
uniModeText = TEXT("UCS-2 BE w/o BOM"); break;
case uni16LE_NoBOM:
uniModeText = TEXT("UCS-2 LE w/o BOM"); break;
case uniCookie: case uniCookie:
uniModeText = TEXT("ANSI as UTF-8"); break; uniModeText = TEXT("ANSI as UTF-8"); break;
default : default :

View File

@ -56,7 +56,7 @@ const int TAB_MULTILINE = 128; // 1000 0000
const int TAB_HIDE = 256; //1 0000 0000 const int TAB_HIDE = 256; //1 0000 0000
enum formatType {WIN_FORMAT, MAC_FORMAT, UNIX_FORMAT}; enum formatType {WIN_FORMAT, MAC_FORMAT, UNIX_FORMAT};
enum UniMode {uni8Bit=0, uniUTF8=1, uni16BE=2, uni16LE=3, uniCookie=4, uni7Bit=5, uniEnd}; enum UniMode {uni8Bit=0, uniUTF8=1, uni16BE=2, uni16LE=3, uniCookie=4, uni7Bit=5, uni16BE_NoBOM=6, uni16LE_NoBOM=7, uniEnd};
enum ChangeDetect {cdDisabled=0, cdEnabled=1, cdAutoUpdate=2, cdGo2end=3, cdAutoUpdateGo2end=4}; enum ChangeDetect {cdDisabled=0, cdEnabled=1, cdAutoUpdate=2, cdGo2end=3, cdAutoUpdateGo2end=4};
enum BackupFeature {bak_none = 0, bak_simple = 1, bak_verbose = 2}; enum BackupFeature {bak_none = 0, bak_simple = 1, bak_verbose = 2};
enum OpenSaveDirSetting {dir_followCurrent = 0, dir_last = 1, dir_userDef = 2}; enum OpenSaveDirSetting {dir_followCurrent = 0, dir_last = 1, dir_userDef = 2};

View File

@ -41,7 +41,7 @@ Utf8_16_Read::Utf8_16_Read() {
Utf8_16_Read::~Utf8_16_Read() Utf8_16_Read::~Utf8_16_Read()
{ {
if ((m_eEncoding == uni16BE) || (m_eEncoding == uni16LE)) if ((m_eEncoding == uni16BE) || (m_eEncoding == uni16LE) || (m_eEncoding == uni16BE_NoBOM) || (m_eEncoding == uni16LE_NoBOM))
{ {
delete [] m_pNewBuf; delete [] m_pNewBuf;
m_pNewBuf = NULL; m_pNewBuf = NULL;
@ -146,6 +146,8 @@ size_t Utf8_16_Read::convert(char* buf, size_t len)
ret = len - nSkip; ret = len - nSkip;
break; break;
} }
case uni16BE_NoBOM:
case uni16LE_NoBOM:
case uni16BE: case uni16BE:
case uni16LE: { case uni16LE: {
size_t newSize = len + len / 2 + 1; size_t newSize = len + len / 2 + 1;
@ -186,22 +188,37 @@ void Utf8_16_Read::determineEncoding()
m_eEncoding = uni8Bit; m_eEncoding = uni8Bit;
m_nSkip = 0; m_nSkip = 0;
// detect UTF-16 big-endian with BOM
if (m_nLen > 1 && m_pBuf[0] == k_Boms[uni16BE][0] && m_pBuf[1] == k_Boms[uni16BE][1]) if (m_nLen > 1 && m_pBuf[0] == k_Boms[uni16BE][0] && m_pBuf[1] == k_Boms[uni16BE][1])
{ {
m_eEncoding = uni16BE; m_eEncoding = uni16BE;
m_nSkip = 2; m_nSkip = 2;
} }
// detect UTF-16 little-endian with BOM
else if (m_nLen > 1 && m_pBuf[0] == k_Boms[uni16LE][0] && m_pBuf[1] == k_Boms[uni16LE][1]) else if (m_nLen > 1 && m_pBuf[0] == k_Boms[uni16LE][0] && m_pBuf[1] == k_Boms[uni16LE][1])
{ {
m_eEncoding = uni16LE; m_eEncoding = uni16LE;
m_nSkip = 2; m_nSkip = 2;
} }
// detect UTF-8 with BOM
else if (m_nLen > 2 && m_pBuf[0] == k_Boms[uniUTF8][0] && else if (m_nLen > 2 && m_pBuf[0] == k_Boms[uniUTF8][0] &&
m_pBuf[1] == k_Boms[uniUTF8][1] && m_pBuf[2] == k_Boms[uniUTF8][2]) m_pBuf[1] == k_Boms[uniUTF8][1] && m_pBuf[2] == k_Boms[uniUTF8][2])
{ {
m_eEncoding = uniUTF8; m_eEncoding = uniUTF8;
m_nSkip = 3; m_nSkip = 3;
} }
// try to detect UTF-16 little-endian without BOM
else if (m_nLen > 1 && m_pBuf[0] != NULL && m_pBuf[1] == NULL && IsTextUnicode(m_pBuf, m_nLen, NULL))
{
m_eEncoding = uni16LE_NoBOM;
m_nSkip = 0;
}
// try to detect UTF-16 big-endian without BOM
else if (m_nLen > 1 && m_pBuf[0] == NULL && m_pBuf[1] != NULL)
{
m_eEncoding = uni16BE_NoBOM;
m_nSkip = 0;
}
else else
{ {
u78 detectedEncoding = utf8_7bits_8bits(); u78 detectedEncoding = utf8_7bits_8bits();
@ -281,6 +298,8 @@ size_t Utf8_16_Write::fwrite(const void* p, size_t _size)
ret = ::fwrite(p, _size, 1, m_pFile); ret = ::fwrite(p, _size, 1, m_pFile);
break; break;
} }
case uni16BE_NoBOM:
case uni16LE_NoBOM:
case uni16BE: case uni16BE:
case uni16LE: { case uni16LE: {
if (_size > m_nBufSize) if (_size > m_nBufSize)
@ -338,6 +357,8 @@ size_t Utf8_16_Write::convert(char* p, size_t _size)
memcpy(&m_pNewBuf[3], p, _size); memcpy(&m_pNewBuf[3], p, _size);
break; break;
} }
case uni16BE_NoBOM:
case uni16LE_NoBOM:
case uni16BE: case uni16BE:
case uni16LE: { case uni16LE: {
m_pNewBuf = (ubyte*)new ubyte[sizeof(utf16) * (_size + 1)]; m_pNewBuf = (ubyte*)new ubyte[sizeof(utf16) * (_size + 1)];
@ -442,7 +463,7 @@ void Utf8_Iter::operator++()
void Utf8_Iter::toStart() void Utf8_Iter::toStart()
{ {
m_eState = eStart; m_eState = eStart;
if (m_eEncoding == uni16BE) if (m_eEncoding == uni16BE || m_eEncoding == uni16BE_NoBOM)
{ {
swap(); swap();
} }
@ -492,10 +513,13 @@ void Utf16_Iter::operator++()
switch (m_eState) switch (m_eState)
{ {
case eStart: case eStart:
if (m_eEncoding == uni16LE) { if (m_eEncoding == uni16LE || m_eEncoding == uni16LE_NoBOM)
{
m_nCur16 = *m_pRead++; m_nCur16 = *m_pRead++;
m_nCur16 |= static_cast<utf16>(*m_pRead << 8); m_nCur16 |= static_cast<utf16>(*m_pRead << 8);
} else { }
else //(m_eEncoding == uni16BE || m_eEncoding == uni16BE_NoBOM)
{
m_nCur16 = static_cast<utf16>(*m_pRead++ << 8); m_nCur16 = static_cast<utf16>(*m_pRead++ << 8);
m_nCur16 |= *m_pRead; m_nCur16 |= *m_pRead;
} }