[NEW_FEATURE] Add auto-detection of HTML/XML file encoding.

git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@573 f5eea248-9336-0410-98b8-ebc06183d4e3
This commit is contained in:
Don Ho 2009-11-26 01:34:25 +00:00
parent 5b23ddeefb
commit 9ebb4b39f5
7 changed files with 195 additions and 76 deletions

View File

@ -1,18 +1,19 @@
Notepad++ v5.6 new features and fixed bugs (from v5.5.1) : Notepad++ v5.6 new features and fixed bugs (from v5.5.1) :
1. Add languages encoding - Chinese traditional (BIG5), Chinese Simplified (GB2312), Japanese (Shift JIS), Korean (EUC), Thai (TIS-620), Hebrew (iso-8859-8), Hebrew (1255), Central European (1250), Cyrillic (1251), Cyrillic (KOI8-U), Cyrillic (KOI8-R), Cyrillic (Mac), Western European(1252), Greek (1253), Turkish(1254), Arabic (1256), Baltic (1257) and Vietnamese (1258). 1. Add languages encoding - Chinese traditional (BIG5), Chinese Simplified (GB2312), Japanese (Shift JIS), Korean (EUC), Thai (TIS-620), Hebrew (iso-8859-8), Hebrew (1255), Central European (1250), Cyrillic (1251), Cyrillic (KOI8-U), Cyrillic (KOI8-R), Cyrillic (Mac), Western European(1252), Greek (1253), Turkish(1254), Arabic (1256), Baltic (1257) and Vietnamese (1258).
2. Add COBOL, D, Gui4Cli, PowerShell and R language support. 2. Add auto-detection of HTML and XML files encodings.
3. Add Marker Jumper feature (Jump down/up : Ctrl+Num/Ctrl+Shift+Num). 3. Add COBOL, D, Gui4Cli, PowerShell and R language support.
4. Add indent guide line highlighting for html/xml tags. 4. Add Marker Jumper feature (Jump down/up : Ctrl+Num/Ctrl+Shift+Num).
5. Add system tray context menu and new command argument "-systemtray". 5. Add indent guide line highlighting for html/xml tags.
6. Add new command argument "--help". 6. Add system tray context menu and new command argument "-systemtray".
7. Fix Calltip hint bug and add a new capacity in it. 7. Add new command argument "--help".
8. Add the ability to add the second keyword group for user in both LISP and Scheme languages. 8. Fix Calltip hint bug and add a new capacity in it.
9. Fix the wrap symbol display problem. 9. Add the ability to add the second keyword group for user in both LISP and Scheme languages.
10. Add SQL ESC symbol '\'. 10. Fix the wrap symbol display problem.
11. Fix column editor insert number bug in virtual space mode. 11. Add SQL ESC symbol '\'.
12. Fix status bar displaying "-2 char" issue for an empty document. 12. Fix column editor insert number bug in virtual space mode.
13. Fix installation of NppShell64 failed issue in installer. 13. Fix status bar displaying "-2 char" issue for an empty document.
14. Fix installation of NppShell64 failed issue in installer.
Included plugins (Unicode): Included plugins (Unicode):

View File

@ -783,6 +783,11 @@ BufferID Notepad_plus::doOpen(const TCHAR *fileName, bool isReadOnly, int encodi
scnN.nmhdr.idFrom = NULL; scnN.nmhdr.idFrom = NULL;
_pluginsManager.notify(&scnN); _pluginsManager.notify(&scnN);
if (encoding == -1)
{
encoding = getHtmlXmlEncoding(longFileName);
}
BufferID buffer = MainFileManager->loadFile(longFileName, NULL, encoding); BufferID buffer = MainFileManager->loadFile(longFileName, NULL, encoding);
if (buffer != BUFFER_INVALID) if (buffer != BUFFER_INVALID)
{ {
@ -858,6 +863,118 @@ BufferID Notepad_plus::doOpen(const TCHAR *fileName, bool isReadOnly, int encodi
} }
} }
int Notepad_plus::getHtmlXmlEncoding(const TCHAR *fileName) const
{
// Get Language type
TCHAR *ext = PathFindExtension(fileName);
if (*ext == '.') //extension found
{
ext += 1;
}
else
{
return -1;
}
NppParameters *pNppParamInst = NppParameters::getInstance();
LangType langT = pNppParamInst->getLangFromExt(ext);
if (langT != L_XML && langT != L_HTML && langT == L_PHP)
return -1;
// Get the begining of file data
FILE *f = generic_fopen(fileName, TEXT("rb"));
if (!f)
return -1;
const int blockSize = 1024; // To ensure that length is long enough to capture the encoding in html
char data[blockSize];
int lenFile = fread(data, 1, blockSize, f);
fclose(f);
// Put data in _invisibleEditView
_invisibleEditView.execute(SCI_CLEARALL);
_invisibleEditView.execute(SCI_APPENDTEXT, lenFile, (LPARAM)data);
const char *encodingAliasRegExpr = "[a-zA-Z0-9_-]+";
if (langT == L_XML)
{
// find encoding by RegExpr
const char *xmlHeaderRegExpr = "<?xml[ \\t]+version[ \\t]*=[ \\t]*\"[^\"]+\"[ \\t]+encoding[ \\t]*=[ \\t]*\"[^\"]+\"[ \\t]*.*?>";
int startPos = 0;
int endPos = lenFile-1;
_invisibleEditView.execute(SCI_SETSEARCHFLAGS, SCFIND_REGEXP|SCFIND_POSIX);
_invisibleEditView.execute(SCI_SETTARGETSTART, startPos);
_invisibleEditView.execute(SCI_SETTARGETEND, endPos);
int posFound = _invisibleEditView.execute(SCI_SEARCHINTARGET, strlen(xmlHeaderRegExpr), (LPARAM)xmlHeaderRegExpr);
if (posFound != -1)
{
const char *encodingBlockRegExpr = "encoding[ \\t]*=[ \\t]*\"[^\".]+\"";
posFound = _invisibleEditView.execute(SCI_SEARCHINTARGET, strlen(encodingBlockRegExpr), (LPARAM)encodingBlockRegExpr);
const char *encodingRegExpr = "\".+\"";
posFound = _invisibleEditView.execute(SCI_SEARCHINTARGET, strlen(encodingRegExpr), (LPARAM)encodingRegExpr);
posFound = _invisibleEditView.execute(SCI_SEARCHINTARGET, strlen(encodingAliasRegExpr), (LPARAM)encodingAliasRegExpr);
startPos = int(_invisibleEditView.execute(SCI_GETTARGETSTART));
endPos = int(_invisibleEditView.execute(SCI_GETTARGETEND));
char encodingStr[128];
_invisibleEditView.getText(encodingStr, startPos, endPos);
int enc = getCpFromStringValue(encodingStr);
return (enc==CP_ACP?-1:enc);
}
return -1;
}
else // if (langT == L_HTML)
{
// find encoding by RegExpr
const char *htmlHeaderRegExpr = "<meta[ \\t]+http-equiv[ \\t]*=[ \\t]*\"Content-Type\"[ \\t]+content[ \\t]*=[ \\t]*\"text/html;[ \\t]+charset[ \\t]*=[ \\t]*.+\"[ \\t]*/*>";
const char *htmlHeaderRegExpr2 = "<meta[ \\t]+content[ \\t]*=[ \\t]*\"text/html;[ \\t]+charset[ \\t]*=[ \\t]*.+\"[ \\t]*http-equiv[ \\t]*=[ \\t]*\"Content-Type\"[ \\t]+/*>";
int startPos = 0;
int endPos = lenFile-1;
_invisibleEditView.execute(SCI_SETSEARCHFLAGS, SCFIND_REGEXP|SCFIND_POSIX);
_invisibleEditView.execute(SCI_SETTARGETSTART, startPos);
_invisibleEditView.execute(SCI_SETTARGETEND, endPos);
int posFound = _invisibleEditView.execute(SCI_SEARCHINTARGET, strlen(htmlHeaderRegExpr), (LPARAM)htmlHeaderRegExpr);
if (posFound != -1)
{
const char *charsetBlockRegExpr = "charset[ \\t]*=[ \\t]*.+\"";
posFound = _invisibleEditView.execute(SCI_SEARCHINTARGET, strlen(charsetBlockRegExpr), (LPARAM)charsetBlockRegExpr);
const char *charsetRegExpr = "=[ \\t]*[^\"]+";
posFound = _invisibleEditView.execute(SCI_SEARCHINTARGET, strlen(charsetRegExpr), (LPARAM)charsetRegExpr);
posFound = _invisibleEditView.execute(SCI_SEARCHINTARGET, strlen(encodingAliasRegExpr), (LPARAM)encodingAliasRegExpr);
startPos = int(_invisibleEditView.execute(SCI_GETTARGETSTART));
endPos = int(_invisibleEditView.execute(SCI_GETTARGETEND));
char encodingStr[128];
_invisibleEditView.getText(encodingStr, startPos, endPos);
int enc = getCpFromStringValue(encodingStr);
return (enc==CP_ACP?-1:enc);
}
else
{
posFound = _invisibleEditView.execute(SCI_SEARCHINTARGET, strlen(htmlHeaderRegExpr2), (LPARAM)htmlHeaderRegExpr2);
if (posFound == -1)
return -1;
//TODO
}
return -1;
}
}
bool Notepad_plus::doReload(BufferID id, bool alert) bool Notepad_plus::doReload(BufferID id, bool alert)
{ {

View File

@ -265,6 +265,7 @@ public:
bool replaceInFiles(); bool replaceInFiles();
void setFindReplaceFolderFilter(const TCHAR *dir, const TCHAR *filters); void setFindReplaceFolderFilter(const TCHAR *dir, const TCHAR *filters);
vector<generic_string> addNppComponents(const TCHAR *destDir, const TCHAR *extFilterName, const TCHAR *extFilter); vector<generic_string> addNppComponents(const TCHAR *destDir, const TCHAR *extFilterName, const TCHAR *extFilter);
int getHtmlXmlEncoding(const TCHAR *fileName) const;
static HWND gNppHWND; //static handle to Notepad++ window, NULL if non-existant static HWND gNppHWND; //static handle to Notepad++ window, NULL if non-existant
private: private:

View File

@ -377,6 +377,34 @@ ScintillaKeyDefinition scintKeyDefs[] = { //array of accelerator keys for all po
// //
}; };
static bool isInList(const TCHAR *token, const TCHAR *list) {
if ((!token) || (!list))
return false;
TCHAR word[64];
int i = 0;
int j = 0;
for (; i <= int(lstrlen(list)) ; i++)
{
if ((list[i] == ' ')||(list[i] == '\0'))
{
if (j != 0)
{
word[j] = '\0';
j = 0;
if (!generic_stricmp(token, word))
return true;
}
}
else
{
word[j] = list[i];
j++;
}
}
return false;
};
static int strVal(const TCHAR *str, int base) { static int strVal(const TCHAR *str, int base) {
if (!str) return -1; if (!str) return -1;
if (!str[0]) return 0; if (!str[0]) return 0;
@ -1988,6 +2016,38 @@ void NppParameters::feedUserLang(TiXmlNode *node)
} }
} }
// Finds the language whose extension lists contain ext.
//
// ext : file extension handed to isInList() as the token to look for.
// Languages are examined from the last index down to index 0. For each one the
// default extension list and, when a lexer styler with the same name exists, its
// user extension list are joined with a space and searched.
// Returns the ID of the first language that matches, or L_TXT when none does.
LangType NppParameters::getLangFromExt(const TCHAR *ext)
{
	for (int langIndex = getNbLang() - 1 ; langIndex >= 0 ; --langIndex)
	{
		Lang *lang = getLangFromIndex(langIndex);
		const TCHAR *defaultExts = lang->getDefaultExtList();

		// The user-defined extensions live in the styler named after the language.
		LexerStylerArray &stylers = getLStylerArray();
		LexerStyler *styler = stylers.getLexerStylerByName(lang->getLangName());
		const TCHAR *userExts = styler ? styler->getLexerUserExt() : NULL;

		generic_string allExts;
		if (defaultExts)
			allExts += defaultExts;
		if (userExts)
		{
			allExts += TEXT(" ");
			allExts += userExts;
		}

		if (isInList(ext, allExts.c_str()))
			return lang->getLangID();
	}
	return L_TXT;
}
void NppParameters::writeUserDefinedLang() void NppParameters::writeUserDefinedLang()
{ {
if (!_pXmlUserLangDoc) if (!_pXmlUserLangDoc)

View File

@ -1113,6 +1113,8 @@ public:
}; };
int getNbLang() const {return _nbLang;}; int getNbLang() const {return _nbLang;};
LangType getLangFromExt(const TCHAR *ext);
const TCHAR * getLangExtFromName(const TCHAR *langName) const { const TCHAR * getLangExtFromName(const TCHAR *langName) const {
for (int i = 0 ; i < _nbLang ; i++) for (int i = 0 ; i < _nbLang ; i++)

View File

@ -34,34 +34,6 @@ const int blockSize = 128 * 1024 + 4;
const int CR = 0x0D; const int CR = 0x0D;
const int LF = 0x0A; const int LF = 0x0A;
static bool isInList(const TCHAR *token, const TCHAR *list) {
if ((!token) || (!list))
return false;
TCHAR word[64];
int i = 0;
int j = 0;
for (; i <= int(lstrlen(list)) ; i++)
{
if ((list[i] == ' ')||(list[i] == '\0'))
{
if (j != 0)
{
word[j] = '\0';
j = 0;
if (!generic_stricmp(token, word))
return true;
}
}
else
{
word[j] = list[i];
j++;
}
}
return false;
};
Buffer::Buffer(FileManager * pManager, BufferID id, Document doc, DocFileStatus type, const TCHAR *fileName) //type must be either DOC_REGULAR or DOC_UNNAMED Buffer::Buffer(FileManager * pManager, BufferID id, Document doc, DocFileStatus type, const TCHAR *fileName) //type must be either DOC_REGULAR or DOC_UNNAMED
: _pManager(pManager), _id(id), _isDirty(false), _doc(doc), _isFileReadOnly(false), _isUserReadOnly(false), _recentTag(-1), _references(0), : _pManager(pManager), _id(id), _isDirty(false), _doc(doc), _isFileReadOnly(false), _isUserReadOnly(false), _recentTag(-1), _references(0),
_canNotify(false), _timeStamp(0), _needReloading(false), _encoding(-1) _canNotify(false), _timeStamp(0), _needReloading(false), _encoding(-1)
@ -84,6 +56,7 @@ Buffer::Buffer(FileManager * pManager, BufferID id, Document doc, DocFileStatus
_canNotify = true; _canNotify = true;
} }
void Buffer::determinateFormat(const char *data) { void Buffer::determinateFormat(const char *data) {
_format = WIN_FORMAT; _format = WIN_FORMAT;
size_t len = strlen(data); size_t len = strlen(data);
@ -169,7 +142,7 @@ void Buffer::setFileName(const TCHAR *fn, LangType defaultLang)
else // if it's not user lang, then check if it's supported lang else // if it's not user lang, then check if it's supported lang
{ {
_userLangExt[0] = '\0'; _userLangExt[0] = '\0';
newLang = getLangFromExt(ext); newLang = pNppParamInst->getLangFromExt(ext);
} }
} }
@ -278,39 +251,6 @@ std::vector<HeaderLineState> & Buffer::getHeaderLineState(ScintillaEditView * id
return _foldStates.at(index); return _foldStates.at(index);
} }
// Finds the language whose extension lists contain ext.
//
// ext : file extension handed to isInList() as the token to look for.
// Languages known to NppParameters are examined from the last index down to
// index 0. For each one the default extension list and, when a lexer styler with
// the same name exists, its user extension list are joined with a space and searched.
// Returns the ID of the first language that matches, or L_TXT when none does.
LangType Buffer::getLangFromExt(const TCHAR *ext)
{
	NppParameters *params = NppParameters::getInstance();

	for (int langIndex = params->getNbLang() - 1 ; langIndex >= 0 ; --langIndex)
	{
		Lang *lang = params->getLangFromIndex(langIndex);
		const TCHAR *defaultExts = lang->getDefaultExtList();

		// The user-defined extensions live in the styler named after the language.
		LexerStylerArray &stylers = params->getLStylerArray();
		LexerStyler *styler = stylers.getLexerStylerByName(lang->getLangName());
		const TCHAR *userExts = styler ? styler->getLexerUserExt() : NULL;

		generic_string allExts;
		if (defaultExts)
			allExts += defaultExts;
		if (userExts)
		{
			allExts += TEXT(" ");
			allExts += userExts;
		}

		if (isInList(ext, allExts.c_str()))
			return lang->getLangID();
	}
	return L_TXT;
}
Lang * Buffer::getCurrentLang() const { Lang * Buffer::getCurrentLang() const {
NppParameters *pNppParam = NppParameters::getInstance(); NppParameters *pNppParam = NppParameters::getInstance();
int i = 0; int i = 0;

View File

@ -139,8 +139,6 @@ public :
//Destructor makes sure its purged //Destructor makes sure its purged
Buffer(FileManager * pManager, BufferID id, Document doc, DocFileStatus type, const TCHAR *fileName); Buffer(FileManager * pManager, BufferID id, Document doc, DocFileStatus type, const TCHAR *fileName);
LangType getLangFromExt(const TCHAR *ext);
// this method 1. copies the file name // this method 1. copies the file name
// 2. determinates the language from the ext of file name // 2. determinates the language from the ext of file name
// 3. gets the last modified time // 3. gets the last modified time