From 9ebb4b39f5b6341c246a3f25221b716d20b5d1df Mon Sep 17 00:00:00 2001 From: Don Ho Date: Thu, 26 Nov 2009 01:34:25 +0000 Subject: [PATCH] [NEW_FEATURE] Add auto-detection of HTML/XML file encoding. git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@573 f5eea248-9336-0410-98b8-ebc06183d4e3 --- PowerEditor/bin/change.log | 25 ++-- PowerEditor/src/Notepad_plus.cpp | 117 +++++++++++++++++++ PowerEditor/src/Notepad_plus.h | 1 + PowerEditor/src/Parameters.cpp | 60 ++++++++++ PowerEditor/src/Parameters.h | 2 + PowerEditor/src/ScitillaComponent/Buffer.cpp | 64 +--------- PowerEditor/src/ScitillaComponent/Buffer.h | 2 - 7 files changed, 195 insertions(+), 76 deletions(-) diff --git a/PowerEditor/bin/change.log b/PowerEditor/bin/change.log index 699f0bf91..5729cd02d 100644 --- a/PowerEditor/bin/change.log +++ b/PowerEditor/bin/change.log @@ -1,18 +1,19 @@ Notepad++ v5.6 new features and fixed bugs (from v5.5.1) : 1. Add languages encoding - Chinese traditional (BIG5), Chinese Simplified (GB2312), Japanese (Shift JIS), Korean (EUC), Thai (TIS-620), Hebrew (iso-8859-8), Hebrew (1255), Central European (1250), Cyrillic (1251), Cyrillic (KOI8-U), Cyrillic (KOI8-R), Cyrillic (Mac), Western European(1252), Greek (1253), Turkish(1254), Arabic (1256), Baltic (1257) and Vietnamese (1258). -2. Add COBOL, D, Gui4Cli, PowerShell and R language support. -3. Add Marker Jumper feature (Jump down/up : Ctrl+Num/Ctrl+Shift+Num). -4. Add indent guide line highlighting for html/xml tags. -5. Add system tray context menu and new command argument "-systemtray". -6. Add new command argument "--help". -7. Fix Calltip hint bug and add a new capacity in it. -8. Add the ability to add the second keyword group for user in both LISP and Scheme languages. -9. Fix the wrap symbol display problem. -10. Add SQL ESC symbol '\'. -11. Fix column editor insert number bug in virtual space mode. -12. Fix status bar displaying "-2 char" issue for a empty document. -13. 
Fix installation of NppShell64 failed issue in installer. +2. Add auto-detection of HTML and XML files encodings. +3. Add COBOL, D, Gui4Cli, PowerShell and R language support. +4. Add Marker Jumper feature (Jump down/up : Ctrl+Num/Ctrl+Shift+Num). +5. Add indent guide line highlighting for html/xml tags. +6. Add system tray context menu and new command argument "-systemtray". +7. Add new command argument "--help". +8. Fix Calltip hint bug and add a new capacity in it. +9. Add the ability to add the second keyword group for user in both LISP and Scheme languages. +10. Fix the wrap symbol display problem. +11. Add SQL ESC symbol '\'. +12. Fix column editor insert number bug in virtual space mode. +13. Fix status bar displaying "-2 char" issue for a empty document. +14. Fix installation of NppShell64 failed issue in installer. Included plugins (Unicode): diff --git a/PowerEditor/src/Notepad_plus.cpp b/PowerEditor/src/Notepad_plus.cpp index 6eb24fbde..aaa0f3c08 100644 --- a/PowerEditor/src/Notepad_plus.cpp +++ b/PowerEditor/src/Notepad_plus.cpp @@ -783,6 +783,11 @@ BufferID Notepad_plus::doOpen(const TCHAR *fileName, bool isReadOnly, int encodi scnN.nmhdr.idFrom = NULL; _pluginsManager.notify(&scnN); + if (encoding == -1) + { + encoding = getHtmlXmlEncoding(longFileName); + } + BufferID buffer = MainFileManager->loadFile(longFileName, NULL, encoding); if (buffer != BUFFER_INVALID) { @@ -858,6 +863,118 @@ BufferID Notepad_plus::doOpen(const TCHAR *fileName, bool isReadOnly, int encodi } } +int Notepad_plus::getHtmlXmlEncoding(const TCHAR *fileName) const // Sniffs up to the first 1024 bytes of an XML/HTML file for a declared encoding; returns the value from getCpFromStringValue(), or -1 when nothing usable is found (CP_ACP is reported as -1 too). +{ + // Get Language type + TCHAR *ext = PathFindExtension(fileName); + if (*ext == '.') //extension found + { + ext += 1; + } + else + { + return -1; + } + NppParameters *pNppParamInst = NppParameters::getInstance(); + LangType langT = pNppParamInst->getLangFromExt(ext); + if (langT != L_XML && langT != L_HTML) // only XML and HTML are sniffed; the former "&& langT == L_PHP" made this guard fire for PHP alone, so every other file type was scanned as HTML + return -1; + + // Get the beginning of file data + FILE *f = generic_fopen(fileName, 
TEXT("rb")); + if (!f) + return -1; + const int blockSize = 1024; // To ensure that length is long enough to capture the encoding in html + char data[blockSize]; + int lenFile = fread(data, 1, blockSize, f); + fclose(f); + + // Put data in _invisibleEditView + _invisibleEditView.execute(SCI_CLEARALL); + _invisibleEditView.execute(SCI_APPENDTEXT, lenFile, (LPARAM)data); + + const char *encodingAliasRegExpr = "[a-zA-Z0-9_-]+"; + + if (langT == L_XML) + { + // find encoding by RegExpr + + const char *xmlHeaderRegExpr = ""; // NOTE(review): this pattern literal is empty in this copy of the patch - possibly stripped in transit; confirm against the upstream commit + + int startPos = 0; + int endPos = lenFile-1; + _invisibleEditView.execute(SCI_SETSEARCHFLAGS, SCFIND_REGEXP|SCFIND_POSIX); + + _invisibleEditView.execute(SCI_SETTARGETSTART, startPos); + _invisibleEditView.execute(SCI_SETTARGETEND, endPos); + + int posFound = _invisibleEditView.execute(SCI_SEARCHINTARGET, strlen(xmlHeaderRegExpr), (LPARAM)xmlHeaderRegExpr); + if (posFound != -1) + { + const char *encodingBlockRegExpr = "encoding[ \\t]*=[ \\t]*\"[^\".]+\""; + posFound = _invisibleEditView.execute(SCI_SEARCHINTARGET, strlen(encodingBlockRegExpr), (LPARAM)encodingBlockRegExpr); + + const char *encodingRegExpr = "\".+\""; + posFound = _invisibleEditView.execute(SCI_SEARCHINTARGET, strlen(encodingRegExpr), (LPARAM)encodingRegExpr); + + posFound = _invisibleEditView.execute(SCI_SEARCHINTARGET, strlen(encodingAliasRegExpr), (LPARAM)encodingAliasRegExpr); + + startPos = int(_invisibleEditView.execute(SCI_GETTARGETSTART)); + endPos = int(_invisibleEditView.execute(SCI_GETTARGETEND)); + + char encodingStr[128]; + _invisibleEditView.getText(encodingStr, startPos, (endPos - startPos < int(sizeof(encodingStr))) ? endPos : startPos + int(sizeof(encodingStr)) - 1); // clamp so the copy cannot overrun encodingStr (assumes getText writes end-start chars plus a terminator - confirm) + + int enc = getCpFromStringValue(encodingStr); + return (enc==CP_ACP?-1:enc); + } + return -1; + } + else // if (langT == L_HTML) + { + // find encoding by RegExpr + const char *htmlHeaderRegExpr = ""; // NOTE(review): empty in this copy of the patch - possibly stripped in transit; confirm against the upstream commit + const char *htmlHeaderRegExpr2 = ""; // NOTE(review): same as above + + int startPos = 0; + int endPos = lenFile-1; + _invisibleEditView.execute(SCI_SETSEARCHFLAGS, SCFIND_REGEXP|SCFIND_POSIX); + + 
_invisibleEditView.execute(SCI_SETTARGETSTART, startPos); + _invisibleEditView.execute(SCI_SETTARGETEND, endPos); + + int posFound = _invisibleEditView.execute(SCI_SEARCHINTARGET, strlen(htmlHeaderRegExpr), (LPARAM)htmlHeaderRegExpr); + + if (posFound != -1) + { + const char *charsetBlockRegExpr = "charset[ \\t]*=[ \\t]*.+\""; + posFound = _invisibleEditView.execute(SCI_SEARCHINTARGET, strlen(charsetBlockRegExpr), (LPARAM)charsetBlockRegExpr); + + const char *charsetRegExpr = "=[ \\t]*[^\"]+"; + posFound = _invisibleEditView.execute(SCI_SEARCHINTARGET, strlen(charsetRegExpr), (LPARAM)charsetRegExpr); + + posFound = _invisibleEditView.execute(SCI_SEARCHINTARGET, strlen(encodingAliasRegExpr), (LPARAM)encodingAliasRegExpr); + + startPos = int(_invisibleEditView.execute(SCI_GETTARGETSTART)); + endPos = int(_invisibleEditView.execute(SCI_GETTARGETEND)); + + char encodingStr[128]; + _invisibleEditView.getText(encodingStr, startPos, (endPos - startPos < int(sizeof(encodingStr))) ? endPos : startPos + int(sizeof(encodingStr)) - 1); // clamp so the copy cannot overrun encodingStr (assumes getText writes end-start chars plus a terminator - confirm) + + int enc = getCpFromStringValue(encodingStr); + return (enc==CP_ACP?-1:enc); + } + else + { + posFound = _invisibleEditView.execute(SCI_SEARCHINTARGET, strlen(htmlHeaderRegExpr2), (LPARAM)htmlHeaderRegExpr2); + if (posFound == -1) + return -1; + //TODO + } + + return -1; + } +} bool Notepad_plus::doReload(BufferID id, bool alert) { diff --git a/PowerEditor/src/Notepad_plus.h b/PowerEditor/src/Notepad_plus.h index df72bf7a4..c7e7355be 100644 --- a/PowerEditor/src/Notepad_plus.h +++ b/PowerEditor/src/Notepad_plus.h @@ -265,6 +265,7 @@ public: bool replaceInFiles(); void setFindReplaceFolderFilter(const TCHAR *dir, const TCHAR *filters); vector addNppComponents(const TCHAR *destDir, const TCHAR *extFilterName, const TCHAR *extFilter); + int getHtmlXmlEncoding(const TCHAR *fileName) const; static HWND gNppHWND; //static handle to Notepad++ window, NULL if non-existant private: diff --git a/PowerEditor/src/Parameters.cpp b/PowerEditor/src/Parameters.cpp index d73bb02bd..a6fcf5d93 100644 --- a/PowerEditor/src/Parameters.cpp +++ 
b/PowerEditor/src/Parameters.cpp @@ -377,6 +377,34 @@ ScintillaKeyDefinition scintKeyDefs[] = { //array of accelerator keys for all po // }; +static bool isInList(const TCHAR *token, const TCHAR *list) { // Returns true when token matches (via generic_stricmp) one of the space-separated words in list; false when either pointer is null or nothing matches. + if ((!token) || (!list)) + return false; + TCHAR word[64]; + int i = 0; + int j = 0; + for (; i <= int(lstrlen(list)) ; i++) + { + if ((list[i] == ' ')||(list[i] == '\0')) + { + if (j != 0) + { + word[j] = '\0'; + j = 0; + + if (!generic_stricmp(token, word)) + return true; + } + } + else + { + if (j < int(sizeof(word) / sizeof(word[0])) - 1) // keep room for the terminator: an over-long word is truncated instead of overrunning word[] + word[j++] = list[i]; + } + } + return false; +}; + static int strVal(const TCHAR *str, int base) { if (!str) return -1; if (!str[0]) return 0; @@ -1988,6 +2016,38 @@ void NppParameters::feedUserLang(TiXmlNode *node) } } +LangType NppParameters::getLangFromExt(const TCHAR *ext) // Walks the languages from the last index down to 0 and returns the ID of the first one whose default or user extension list contains ext; L_TXT when none does. +{ + int i = getNbLang(); + i--; + while (i >= 0) + { + Lang *l = getLangFromIndex(i--); + + const TCHAR *defList = l->getDefaultExtList(); + const TCHAR *userList = NULL; + + LexerStylerArray &lsa = getLStylerArray(); + const TCHAR *lName = l->getLangName(); + LexerStyler *pLS = lsa.getLexerStylerByName(lName); + + if (pLS) + userList = pLS->getLexerUserExt(); + + generic_string list(TEXT("")); + if (defList) + list += defList; + if (userList) + { + list += TEXT(" "); + list += userList; + } + if (isInList(ext, list.c_str())) + return l->getLangID(); + } + return L_TXT; +} + void NppParameters::writeUserDefinedLang() { if (!_pXmlUserLangDoc) diff --git a/PowerEditor/src/Parameters.h b/PowerEditor/src/Parameters.h index 5f0491e57..c3187e199 100644 --- a/PowerEditor/src/Parameters.h +++ b/PowerEditor/src/Parameters.h @@ -1113,6 +1113,8 @@ public: }; int getNbLang() const {return _nbLang;}; + + LangType getLangFromExt(const TCHAR *ext); const TCHAR * getLangExtFromName(const TCHAR *langName) const { for (int i = 0 ; i < _nbLang ; i++) diff --git a/PowerEditor/src/ScitillaComponent/Buffer.cpp b/PowerEditor/src/ScitillaComponent/Buffer.cpp index 6b44609b7..76a41e059 100644 --- 
a/PowerEditor/src/ScitillaComponent/Buffer.cpp +++ b/PowerEditor/src/ScitillaComponent/Buffer.cpp @@ -34,34 +34,6 @@ const int blockSize = 128 * 1024 + 4; const int CR = 0x0D; const int LF = 0x0A; -static bool isInList(const TCHAR *token, const TCHAR *list) { - if ((!token) || (!list)) - return false; - TCHAR word[64]; - int i = 0; - int j = 0; - for (; i <= int(lstrlen(list)) ; i++) - { - if ((list[i] == ' ')||(list[i] == '\0')) - { - if (j != 0) - { - word[j] = '\0'; - j = 0; - - if (!generic_stricmp(token, word)) - return true; - } - } - else - { - word[j] = list[i]; - j++; - } - } - return false; -}; - Buffer::Buffer(FileManager * pManager, BufferID id, Document doc, DocFileStatus type, const TCHAR *fileName) //type must be either DOC_REGULAR or DOC_UNNAMED : _pManager(pManager), _id(id), _isDirty(false), _doc(doc), _isFileReadOnly(false), _isUserReadOnly(false), _recentTag(-1), _references(0), _canNotify(false), _timeStamp(0), _needReloading(false), _encoding(-1) @@ -84,6 +56,7 @@ Buffer::Buffer(FileManager * pManager, BufferID id, Document doc, DocFileStatus _canNotify = true; } + void Buffer::determinateFormat(const char *data) { _format = WIN_FORMAT; size_t len = strlen(data); @@ -169,7 +142,7 @@ void Buffer::setFileName(const TCHAR *fn, LangType defaultLang) else // if it's not user lang, then check if it's supported lang { _userLangExt[0] = '\0'; - newLang = getLangFromExt(ext); + newLang = pNppParamInst->getLangFromExt(ext); } } @@ -278,39 +251,6 @@ std::vector & Buffer::getHeaderLineState(ScintillaEditView * id return _foldStates.at(index); } -LangType Buffer::getLangFromExt(const TCHAR *ext) -{ - NppParameters *pNppParam = NppParameters::getInstance(); - int i = pNppParam->getNbLang(); - i--; - while (i >= 0) - { - Lang *l = pNppParam->getLangFromIndex(i--); - - const TCHAR *defList = l->getDefaultExtList(); - const TCHAR *userList = NULL; - - LexerStylerArray &lsa = pNppParam->getLStylerArray(); - const TCHAR *lName = l->getLangName(); - LexerStyler 
*pLS = lsa.getLexerStylerByName(lName); - - if (pLS) - userList = pLS->getLexerUserExt(); - - generic_string list(TEXT("")); - if (defList) - list += defList; - if (userList) - { - list += TEXT(" "); - list += userList; - } - if (isInList(ext, list.c_str())) - return l->getLangID(); - } - return L_TXT; -} - Lang * Buffer::getCurrentLang() const { NppParameters *pNppParam = NppParameters::getInstance(); int i = 0; diff --git a/PowerEditor/src/ScitillaComponent/Buffer.h b/PowerEditor/src/ScitillaComponent/Buffer.h index fa37d799e..69c8b86d3 100644 --- a/PowerEditor/src/ScitillaComponent/Buffer.h +++ b/PowerEditor/src/ScitillaComponent/Buffer.h @@ -139,8 +139,6 @@ public : //Destructor makes sure its purged Buffer(FileManager * pManager, BufferID id, Document doc, DocFileStatus type, const TCHAR *fileName); - LangType getLangFromExt(const TCHAR *ext); - // this method 1. copies the file name // 2. determinates the language from the ext of file name // 3. gets the last modified time