mirror of
https://github.com/notepad-plus-plus/notepad-plus-plus.git
synced 2025-11-03 13:03:52 +01:00
Bypass Windows caching when flushing files after write. According Microsoft documentation this fixes the issues with saved file corruption (all NULs) on sudden power loss or restart. Microsoft documentation for reference: https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/fopen-wfopen?view=vs-2019 https://docs.microsoft.com/en-us/cpp/c-runtime-library/stream-i-o?view=vs-2019 Fix #6133, close #8952
597 lines
14 KiB
C++
597 lines
14 KiB
C++
// Utf8_16.cxx
|
|
// Copyright (C) 2002 Scott Kirkwood
|
|
//
|
|
// Permission to use, copy, modify, distribute and sell this code
|
|
// and its documentation for any purpose is hereby granted without fee,
|
|
// provided that the above copyright notice appear in all copies or
|
|
// any derived copies. Scott Kirkwood makes no representations
|
|
// about the suitability of this software for any purpose.
|
|
// It is provided "as is" without express or implied warranty.
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
//
|
|
// Modificated 2006 Jens Lorenz
|
|
//
|
|
// - Clean up the sources
|
|
// - Removing UCS-Bug in Utf8_Iter
|
|
// - Add convert function in Utf8_16_Write
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
#include "Utf8_16.h"
|
|
|
|
const Utf8_16::utf8 Utf8_16::k_Boms[][3] = {
|
|
{0x00, 0x00, 0x00}, // Unknown
|
|
{0xEF, 0xBB, 0xBF}, // UTF8
|
|
{0xFE, 0xFF, 0x00}, // Big endian
|
|
{0xFF, 0xFE, 0x00}, // Little endian
|
|
};
|
|
|
|
|
|
// ==================================================================
|
|
|
|
Utf8_16_Read::Utf8_16_Read() {
|
|
m_eEncoding = uni8Bit;
|
|
m_nAllocatedBufSize = 0;
|
|
m_nNewBufSize = 0;
|
|
m_pNewBuf = NULL;
|
|
m_bFirstRead = true;
|
|
}
|
|
|
|
Utf8_16_Read::~Utf8_16_Read()
|
|
{
|
|
if ((m_eEncoding == uni16BE) || (m_eEncoding == uni16LE) || (m_eEncoding == uni16BE_NoBOM) || (m_eEncoding == uni16LE_NoBOM))
|
|
{
|
|
delete [] m_pNewBuf;
|
|
m_pNewBuf = NULL;
|
|
}
|
|
}
|
|
|
|
// Returned value :
|
|
// 0 : utf8
|
|
// 1 : 7bits
|
|
// 2 : 8bits
|
|
u78 Utf8_16_Read::utf8_7bits_8bits()
|
|
{
|
|
int rv = 1;
|
|
int ASCII7only = 1;
|
|
utf8 *sx = (utf8 *)m_pBuf;
|
|
utf8 *endx = sx + m_nLen;
|
|
|
|
while (sx<endx)
|
|
{
|
|
if (*sx == '\0')
|
|
{ // For detection, we'll say that NUL means not UTF8
|
|
ASCII7only = 0;
|
|
rv = 0;
|
|
break;
|
|
}
|
|
else if ((*sx & 0x80) == 0x0)
|
|
{ // 0nnnnnnn If the byte's first hex code begins with 0-7, it is an ASCII character.
|
|
++sx;
|
|
}
|
|
else if ((*sx & (0x80+0x40)) == 0x80)
|
|
{ // 10nnnnnn 8 through B cannot be first hex codes
|
|
ASCII7only=0;
|
|
rv=0;
|
|
break;
|
|
}
|
|
else if ((*sx & (0x80+0x40+0x20)) == (0x80+0x40))
|
|
{ // 110xxxvv 10nnnnnn, 11 bit character
|
|
ASCII7only=0;
|
|
if (std::distance(sx, endx) < 2) {
|
|
rv=0; break;
|
|
}
|
|
if ( (sx[1]&(0x80+0x40)) != 0x80) {
|
|
rv=0; break;
|
|
}
|
|
sx+=2;
|
|
}
|
|
else if ((*sx & (0x80+0x40+0x20+0x10)) == (0x80+0x40+0x20))
|
|
{ // 1110qqqq 10xxxxvv 10nnnnnn, 16 bit character
|
|
ASCII7only=0;
|
|
if (std::distance(sx, endx) < 3) {
|
|
rv=0; break;
|
|
}
|
|
if ((sx[1]&(0x80+0x40)) != 0x80 || (sx[2]&(0x80+0x40)) != 0x80) {
|
|
rv=0; break;
|
|
}
|
|
sx+=3;
|
|
}
|
|
else if ((*sx & (0x80+0x40+0x20+0x10+0x8)) == (0x80+0x40+0x20+0x10))
|
|
{ // 11110qqq 10xxxxvv 10nnnnnn 10mmmmmm, 21 bit character
|
|
ASCII7only=0;
|
|
if (std::distance(sx, endx) < 4) {
|
|
rv=0; break;
|
|
}
|
|
if ((sx[1]&(0x80+0x40)) != 0x80 || (sx[2]&(0x80+0x40)) != 0x80 || (sx[3]&(0x80+0x40)) != 0x80) {
|
|
rv=0; break;
|
|
}
|
|
sx+=4;
|
|
}
|
|
else
|
|
{
|
|
ASCII7only=0;
|
|
rv=0;
|
|
break;
|
|
}
|
|
}
|
|
if (ASCII7only)
|
|
return ascii7bits;
|
|
if (rv)
|
|
return utf8NoBOM;
|
|
return ascii8bits;
|
|
}
|
|
|
|
size_t Utf8_16_Read::convert(char* buf, size_t len)
|
|
{
|
|
// bugfix by Jens Lorenz
|
|
static size_t nSkip = 0;
|
|
|
|
m_pBuf = (ubyte*)buf;
|
|
m_nLen = len;
|
|
m_nNewBufSize = 0;
|
|
|
|
if (m_bFirstRead == true)
|
|
{
|
|
determineEncoding();
|
|
nSkip = m_nSkip;
|
|
m_bFirstRead = false;
|
|
}
|
|
|
|
switch (m_eEncoding)
|
|
{
|
|
case uni7Bit:
|
|
case uni8Bit:
|
|
case uniCookie: {
|
|
// Do nothing, pass through
|
|
m_nAllocatedBufSize = 0;
|
|
m_pNewBuf = m_pBuf;
|
|
m_nNewBufSize = len;
|
|
break;
|
|
}
|
|
case uniUTF8: {
|
|
// Pass through after BOM
|
|
m_nAllocatedBufSize = 0;
|
|
m_pNewBuf = m_pBuf + nSkip;
|
|
m_nNewBufSize = len - nSkip;
|
|
break;
|
|
}
|
|
case uni16BE_NoBOM:
|
|
case uni16LE_NoBOM:
|
|
case uni16BE:
|
|
case uni16LE: {
|
|
size_t newSize = len + len / 2 + 1;
|
|
|
|
if (m_nAllocatedBufSize != newSize)
|
|
{
|
|
if (m_pNewBuf)
|
|
delete [] m_pNewBuf;
|
|
m_pNewBuf = NULL;
|
|
m_pNewBuf = new ubyte[newSize];
|
|
m_nAllocatedBufSize = newSize;
|
|
}
|
|
|
|
ubyte* pCur = m_pNewBuf;
|
|
|
|
m_Iter16.set(m_pBuf + nSkip, len - nSkip, m_eEncoding);
|
|
|
|
for (; m_Iter16; ++m_Iter16)
|
|
{
|
|
*pCur++ = m_Iter16.get();
|
|
}
|
|
m_nNewBufSize = pCur - m_pNewBuf;
|
|
break;
|
|
}
|
|
default:
|
|
break;
|
|
}
|
|
|
|
// necessary for second calls and more
|
|
nSkip = 0;
|
|
|
|
return m_nNewBufSize;
|
|
}
|
|
|
|
|
|
void Utf8_16_Read::determineEncoding()
|
|
{
|
|
INT uniTest = IS_TEXT_UNICODE_STATISTICS;
|
|
m_eEncoding = uni8Bit;
|
|
m_nSkip = 0;
|
|
|
|
// detect UTF-16 big-endian with BOM
|
|
if (m_nLen > 1 && m_pBuf[0] == k_Boms[uni16BE][0] && m_pBuf[1] == k_Boms[uni16BE][1])
|
|
{
|
|
m_eEncoding = uni16BE;
|
|
m_nSkip = 2;
|
|
}
|
|
// detect UTF-16 little-endian with BOM
|
|
else if (m_nLen > 1 && m_pBuf[0] == k_Boms[uni16LE][0] && m_pBuf[1] == k_Boms[uni16LE][1])
|
|
{
|
|
m_eEncoding = uni16LE;
|
|
m_nSkip = 2;
|
|
}
|
|
// detect UTF-8 with BOM
|
|
else if (m_nLen > 2 && m_pBuf[0] == k_Boms[uniUTF8][0] &&
|
|
m_pBuf[1] == k_Boms[uniUTF8][1] && m_pBuf[2] == k_Boms[uniUTF8][2])
|
|
{
|
|
m_eEncoding = uniUTF8;
|
|
m_nSkip = 3;
|
|
}
|
|
// try to detect UTF-16 little-endian without BOM
|
|
else if (m_nLen > 1 && m_nLen % 2 == 0 && m_pBuf[0] != NULL && m_pBuf[1] == NULL && IsTextUnicode(m_pBuf, static_cast<int32_t>(m_nLen), &uniTest))
|
|
{
|
|
m_eEncoding = uni16LE_NoBOM;
|
|
m_nSkip = 0;
|
|
}
|
|
/* UTF-16 big-endian without BOM detection is taken away scince this detection is very week
|
|
// try to detect UTF-16 big-endian without BOM
|
|
else if (m_nLen > 1 && m_pBuf[0] == NULL && m_pBuf[1] != NULL)
|
|
{
|
|
m_eEncoding = uni16BE_NoBOM;
|
|
m_nSkip = 0;
|
|
}
|
|
*/
|
|
else
|
|
{
|
|
u78 detectedEncoding = utf8_7bits_8bits();
|
|
if (detectedEncoding == utf8NoBOM)
|
|
m_eEncoding = uniCookie;
|
|
else if (detectedEncoding == ascii7bits)
|
|
m_eEncoding = uni7Bit;
|
|
else //(detectedEncoding == ascii8bits)
|
|
m_eEncoding = uni8Bit;
|
|
m_nSkip = 0;
|
|
}
|
|
}
|
|
|
|
UniMode Utf8_16_Read::determineEncoding(const unsigned char *buf, size_t bufLen)
|
|
{
|
|
// detect UTF-16 big-endian with BOM
|
|
if (bufLen > 1 && buf[0] == k_Boms[uni16BE][0] && buf[1] == k_Boms[uni16BE][1])
|
|
{
|
|
return uni16BE;
|
|
}
|
|
|
|
// detect UTF-16 little-endian with BOM
|
|
if (bufLen > 1 && buf[0] == k_Boms[uni16LE][0] && buf[1] == k_Boms[uni16LE][1])
|
|
{
|
|
return uni16LE;
|
|
}
|
|
|
|
// detect UTF-8 with BOM
|
|
if (bufLen > 2 && buf[0] == k_Boms[uniUTF8][0] &&
|
|
buf[1] == k_Boms[uniUTF8][1] && buf[2] == k_Boms[uniUTF8][2])
|
|
{
|
|
return uniUTF8;
|
|
}
|
|
|
|
return uni8Bit;
|
|
}
|
|
|
|
|
|
// ==================================================================
|
|
|
|
Utf8_16_Write::Utf8_16_Write()
|
|
{
|
|
m_eEncoding = uni8Bit;
|
|
m_pFile = NULL;
|
|
m_pNewBuf = NULL;
|
|
m_bFirstWrite = true;
|
|
m_nBufSize = 0;
|
|
}
|
|
|
|
Utf8_16_Write::~Utf8_16_Write()
|
|
{
|
|
fclose();
|
|
}
|
|
|
|
FILE * Utf8_16_Write::fopen(const TCHAR *_name, const TCHAR *_type)
|
|
{
|
|
m_pFile = ::generic_fopen(_name, _type);
|
|
|
|
m_bFirstWrite = true;
|
|
|
|
return m_pFile;
|
|
}
|
|
|
|
size_t Utf8_16_Write::fwrite(const void* p, size_t _size)
|
|
{
|
|
// no file open
|
|
if (!m_pFile)
|
|
{
|
|
return 0;
|
|
}
|
|
|
|
size_t ret = 0;
|
|
|
|
if (m_bFirstWrite)
|
|
{
|
|
switch (m_eEncoding)
|
|
{
|
|
case uniUTF8: {
|
|
::fwrite(k_Boms[m_eEncoding], 3, 1, m_pFile);
|
|
break;
|
|
}
|
|
case uni16BE:
|
|
case uni16LE:
|
|
::fwrite(k_Boms[m_eEncoding], 2, 1, m_pFile);
|
|
break;
|
|
default:
|
|
// nothing to do
|
|
break;
|
|
}
|
|
m_bFirstWrite = false;
|
|
}
|
|
|
|
switch (m_eEncoding)
|
|
{
|
|
case uni7Bit:
|
|
case uni8Bit:
|
|
case uniCookie:
|
|
case uniUTF8: {
|
|
// Normal write
|
|
ret = ::fwrite(p, _size, 1, m_pFile);
|
|
break;
|
|
}
|
|
case uni16BE_NoBOM:
|
|
case uni16LE_NoBOM:
|
|
case uni16BE:
|
|
case uni16LE: {
|
|
static const int bufSize = 64*1024;
|
|
utf16 buf[bufSize];
|
|
|
|
Utf8_Iter iter8;
|
|
iter8.set(static_cast<const ubyte*>(p), _size, m_eEncoding);
|
|
|
|
int bufIndex = 0;
|
|
while (iter8) {
|
|
if (iter8.canGet()) {
|
|
buf[bufIndex++] = iter8.get();
|
|
}
|
|
++iter8;
|
|
if (bufIndex == bufSize || !iter8) {
|
|
if (!::fwrite(buf, bufIndex*sizeof(utf16), 1, m_pFile)) return 0;
|
|
bufIndex = 0;
|
|
}
|
|
}
|
|
ret = 1;
|
|
break;
|
|
}
|
|
default:
|
|
break;
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
|
|
size_t Utf8_16_Write::convert(char* p, size_t _size)
|
|
{
|
|
if (m_pNewBuf)
|
|
{
|
|
delete [] m_pNewBuf;
|
|
}
|
|
|
|
switch (m_eEncoding)
|
|
{
|
|
case uni7Bit:
|
|
case uni8Bit:
|
|
case uniCookie: {
|
|
// Normal write
|
|
m_nBufSize = _size;
|
|
m_pNewBuf = (ubyte*)new ubyte[m_nBufSize];
|
|
memcpy(m_pNewBuf, p, _size);
|
|
break;
|
|
}
|
|
case uniUTF8: {
|
|
m_nBufSize = _size + 3;
|
|
m_pNewBuf = (ubyte*)new ubyte[m_nBufSize];
|
|
memcpy(m_pNewBuf, k_Boms[m_eEncoding], 3);
|
|
memcpy(&m_pNewBuf[3], p, _size);
|
|
break;
|
|
}
|
|
case uni16BE_NoBOM:
|
|
case uni16LE_NoBOM:
|
|
case uni16BE:
|
|
case uni16LE:
|
|
{
|
|
utf16* pCur = NULL;
|
|
|
|
if (m_eEncoding == uni16BE || m_eEncoding == uni16LE) {
|
|
// Write the BOM
|
|
m_pNewBuf = (ubyte*)new ubyte[sizeof(utf16) * (_size + 1)];
|
|
memcpy(m_pNewBuf, k_Boms[m_eEncoding], 2);
|
|
pCur = (utf16*)&m_pNewBuf[2];
|
|
} else {
|
|
m_pNewBuf = (ubyte*)new ubyte[sizeof(utf16) * _size];
|
|
pCur = (utf16*)m_pNewBuf;
|
|
}
|
|
|
|
Utf8_Iter iter8;
|
|
iter8.set(reinterpret_cast<const ubyte*>(p), _size, m_eEncoding);
|
|
|
|
for (; iter8; ++iter8) {
|
|
if (iter8.canGet()) {
|
|
*pCur++ = iter8.get();
|
|
}
|
|
}
|
|
m_nBufSize = (const char*)pCur - (const char*)m_pNewBuf;
|
|
break;
|
|
}
|
|
default:
|
|
break;
|
|
}
|
|
|
|
return m_nBufSize;
|
|
}
|
|
|
|
|
|
void Utf8_16_Write::setEncoding(UniMode eType)
|
|
{
|
|
m_eEncoding = eType;
|
|
}
|
|
|
|
|
|
void Utf8_16_Write::fclose()
|
|
{
|
|
if (m_pNewBuf)
|
|
{
|
|
delete [] m_pNewBuf;
|
|
m_pNewBuf = NULL;
|
|
}
|
|
|
|
if (m_pFile)
|
|
{
|
|
::fflush(m_pFile);
|
|
::fclose(m_pFile);
|
|
m_pFile = NULL;
|
|
}
|
|
}
|
|
|
|
|
|
//=================================================================
|
|
Utf8_Iter::Utf8_Iter()
|
|
{
|
|
reset();
|
|
}
|
|
|
|
void Utf8_Iter::reset()
|
|
{
|
|
m_pBuf = NULL;
|
|
m_pRead = NULL;
|
|
m_pEnd = NULL;
|
|
m_eState = eStart;
|
|
m_nCur = 0;
|
|
m_eEncoding = uni8Bit;
|
|
}
|
|
|
|
void Utf8_Iter::set(const ubyte* pBuf, size_t nLen, UniMode eEncoding)
|
|
{
|
|
m_pBuf = pBuf;
|
|
m_pRead = pBuf;
|
|
m_pEnd = pBuf + nLen;
|
|
m_eEncoding = eEncoding;
|
|
operator++();
|
|
// Note: m_eState, m_nCur not set
|
|
}
|
|
|
|
// Go to the next byte.
|
|
void Utf8_Iter::operator++()
|
|
{
|
|
switch (m_eState)
|
|
{
|
|
case eStart:
|
|
if (*m_pRead < 0x80) {
|
|
m_nCur = *m_pRead;
|
|
toStart();
|
|
} else if (*m_pRead < 0xE0) {
|
|
m_nCur = static_cast<utf16>((0x1F & *m_pRead) << 6);
|
|
m_eState = e2Bytes_Byte2;
|
|
} else {
|
|
m_nCur = static_cast<utf16>((0xF & *m_pRead) << 12);
|
|
m_eState = e3Bytes_Byte2;
|
|
}
|
|
break;
|
|
case e2Bytes_Byte2:
|
|
case e3Bytes_Byte3:
|
|
m_nCur |= static_cast<utf8>(0x3F & *m_pRead);
|
|
toStart();
|
|
break;
|
|
case e3Bytes_Byte2:
|
|
m_nCur |= static_cast<utf16>((0x3F & *m_pRead) << 6);
|
|
m_eState = e3Bytes_Byte3;
|
|
break;
|
|
}
|
|
++m_pRead;
|
|
}
|
|
|
|
void Utf8_Iter::toStart()
|
|
{
|
|
m_eState = eStart;
|
|
if (m_eEncoding == uni16BE || m_eEncoding == uni16BE_NoBOM)
|
|
{
|
|
swap();
|
|
}
|
|
}
|
|
|
|
void Utf8_Iter::swap()
|
|
{
|
|
utf8* p = reinterpret_cast<utf8*>(&m_nCur);
|
|
utf8 swapbyte = *p;
|
|
*p = *(p + 1);
|
|
*(p + 1) = swapbyte;
|
|
}
|
|
|
|
//==================================================
|
|
Utf16_Iter::Utf16_Iter()
|
|
{
|
|
reset();
|
|
}
|
|
|
|
void Utf16_Iter::reset()
|
|
{
|
|
m_pBuf = NULL;
|
|
m_pRead = NULL;
|
|
m_pEnd = NULL;
|
|
m_eState = eStart;
|
|
m_nCur = 0;
|
|
m_nCur16 = 0;
|
|
m_eEncoding = uni8Bit;
|
|
}
|
|
|
|
void Utf16_Iter::set(const ubyte* pBuf, size_t nLen, UniMode eEncoding)
|
|
{
|
|
m_pBuf = pBuf;
|
|
m_pRead = pBuf;
|
|
m_pEnd = pBuf + nLen;
|
|
m_eEncoding = eEncoding;
|
|
m_eState = eStart;
|
|
operator++();
|
|
// Note: m_eState, m_nCur, m_nCur16 not reinitalized.
|
|
}
|
|
|
|
// Goes to the next byte.
|
|
// Not the next symbol which you might expect.
|
|
// This way we can continue from a partial buffer that doesn't align
|
|
void Utf16_Iter::operator++()
|
|
{
|
|
switch (m_eState)
|
|
{
|
|
case eStart:
|
|
if (m_eEncoding == uni16LE || m_eEncoding == uni16LE_NoBOM)
|
|
{
|
|
m_nCur16 = *m_pRead++;
|
|
m_nCur16 |= static_cast<utf16>(*m_pRead << 8);
|
|
}
|
|
else //(m_eEncoding == uni16BE || m_eEncoding == uni16BE_NoBOM)
|
|
{
|
|
m_nCur16 = static_cast<utf16>(*m_pRead++ << 8);
|
|
m_nCur16 |= *m_pRead;
|
|
}
|
|
++m_pRead;
|
|
|
|
if (m_nCur16 < 0x80) {
|
|
m_nCur = static_cast<ubyte>(m_nCur16 & 0xFF);
|
|
m_eState = eStart;
|
|
} else if (m_nCur16 < 0x800) {
|
|
m_nCur = static_cast<ubyte>(0xC0 | m_nCur16 >> 6);
|
|
m_eState = e2Bytes2;
|
|
} else {
|
|
m_nCur = static_cast<ubyte>(0xE0 | m_nCur16 >> 12);
|
|
m_eState = e3Bytes2;
|
|
}
|
|
break;
|
|
case e2Bytes2:
|
|
case e3Bytes3:
|
|
m_nCur = static_cast<ubyte>(0x80 | m_nCur16 & 0x3F);
|
|
m_eState = eStart;
|
|
break;
|
|
case e3Bytes2:
|
|
m_nCur = static_cast<ubyte>(0x80 | ((m_nCur16 >> 6) & 0x3F));
|
|
m_eState = e3Bytes3;
|
|
break;
|
|
}
|
|
}
|
|
|
|
|