diff --git a/PowerEditor/src/ScitillaComponent/Buffer.cpp b/PowerEditor/src/ScitillaComponent/Buffer.cpp index 5d8932d33..c608adf84 100644 --- a/PowerEditor/src/ScitillaComponent/Buffer.cpp +++ b/PowerEditor/src/ScitillaComponent/Buffer.cpp @@ -692,10 +692,26 @@ bool FileManager::loadFileData(Document doc, const TCHAR * filename, Utf8_16_Rea size_t lenFile = 0; size_t lenConvert = 0; //just in case conversion results in 0, but file not empty bool isFirstTime = true; + int incompleteMultibyteChar = 0; //we do not want to call SCI_APPENDTEXT with an incomplete character if the buffer ends in the middle of one + char incompleteMultibyteChar_first = 0; do { - lenFile = fread(data, 1, blockSize, fp); - + lenFile = fread(data+incompleteMultibyteChar, 1, blockSize-incompleteMultibyteChar, fp) + incompleteMultibyteChar; + + // we might not know yet the encoding; we ensure that valid UTF-8 characters will not be cut in the middle, without causing problems if it's not UTF-8 + // TODO: all expressions for testing UTF chars should be put in inline functions, not directly in the code + if(lenFile == blockSize && (data[blockSize-1]&0x80) != 0) // possible multi-byte character that could be cut due to blockSize + { + incompleteMultibyteChar = 1; + while(incompleteMultibyteChar < 6 // longest "defined" UTF-8 code (including restricted codes not yet defined by Unicode) + && (data[blockSize-incompleteMultibyteChar]&0xC0) == 0x80) // is possibly a continuation byte in a multi-byte character + ++incompleteMultibyteChar; + // leave for the next buffer all bytes that could potentially be multi-byte UTF-8 at the end of current buffer + lenFile -= incompleteMultibyteChar; + incompleteMultibyteChar_first = data[lenFile]; // this byte can be erased by following code to put a null terminator + } + else incompleteMultibyteChar = 0; + // check if file contain any BOM if (isFirstTime) { @@ -722,6 +738,13 @@ bool FileManager::loadFileData(Document doc, const TCHAR * filename, Utf8_16_Rea lenConvert = UnicodeConvertor->convert(data, lenFile); _pscratchTilla->execute(SCI_APPENDTEXT, lenConvert, (LPARAM)(UnicodeConvertor->getNewBuf())); } + + if(incompleteMultibyteChar != 0) + { + // copy bytes to next buffer + memcpy(data, data+blockSize-incompleteMultibyteChar, incompleteMultibyteChar); + data[0] = incompleteMultibyteChar_first; + } } while (lenFile > 0); } __except(filter(GetExceptionCode(), GetExceptionInformation())) {