notepad-plus-plus/lexilla/lexers/LexTOML.cxx

486 lines
13 KiB
C++

// Scintilla source code edit control
/** @file LexTOML.cxx
** Lexer for TOML language.
**/
// Based on Zufu Liu's Notepad4 TOML lexer
// Modified for Scintilla by Jiri Techet, 2024
// The License.txt file describes the conditions under which this software may be distributed.
#include <cassert>
#include <cstring>
#include <string>
#include <string_view>
#include "ILexer.h"
#include "Scintilla.h"
#include "SciLexer.h"
#include "WordList.h"
#include "LexAccessor.h"
#include "Accessor.h"
#include "StyleContext.h"
#include "CharacterSet.h"
#include "LexerModule.h"
using namespace Lexilla;
namespace {
// Use an unnamed namespace to protect the functions and classes from name conflicts
// True when ch is one of the two end-of-line characters (CR or LF).
constexpr bool IsEOLChar(int ch) noexcept {
	switch (ch) {
	case '\r':
	case '\n':
		return true;
	default:
		return false;
	}
}
// True for the characters allowed in an identifier: an underscore or any
// character accepted by IsAlphaNumeric().
constexpr bool IsIdentifierChar(int ch) noexcept {
	if (ch == '_') {
		return true;
	}
	return IsAlphaNumeric(ch);
}
// Decides whether a non-identifier character still belongs to a number:
//  - a '.' that is not immediately followed by another '.', or
//  - a sign ('+' / '-') that directly follows an exponent marker ('e' / 'E').
constexpr bool IsNumberContinue(int chPrev, int ch, int chNext) noexcept {
	if (ch == '.') {
		return chNext != '.';
	}
	const bool isSign = (ch == '+') || (ch == '-');
	const bool afterExponent = (chPrev == 'e') || (chPrev == 'E');
	return isSign && afterExponent;
}
// True while ch can be part of a number token: either an identifier character
// or a continuation character accepted by IsNumberContinue().
constexpr bool IsDecimalNumber(int chPrev, int ch, int chNext) noexcept {
	if (IsIdentifierChar(ch)) {
		return true;
	}
	return IsNumberContinue(chPrev, ch, chNext);
}
// Recognises the separators that may continue a date-time token:
//  - '+', '-', ':' or '.' when followed by a digit;
//  - a space when followed by a sign or a digit.
constexpr bool IsISODateTime(int ch, int chNext) noexcept {
	switch (ch) {
	case '+':
	case '-':
	case ':':
	case '.':
		return IsADigit(chNext);
	case ' ':
		return chNext == '+' || chNext == '-' || IsADigit(chNext);
	default:
		return false;
	}
}
// Tracks an escape sequence inside a string: the style to return to once the
// escape is finished and how many more characters the escape may still cover.
struct EscapeSequence {
	int outerState = SCE_TOML_DEFAULT;
	int digitsLeft = 0;

	// Begins an escape whose first character after the backslash is chNext.
	// Returns false (leaving the members untouched) when the backslash is
	// directly followed by an end-of-line character. Any other character is
	// highlighted as an escape; 'x', 'u' and 'U' additionally allow hex digits.
	bool resetEscapeState(int state, int chNext) noexcept {
		if (IsEOLChar(chNext)) {
			return false;
		}
		outerState = state;
		switch (chNext) {
		case 'x':
			digitsLeft = 3;
			break;
		case 'u':
			digitsLeft = 5;
			break;
		case 'U':
			digitsLeft = 9;
			break;
		default:
			digitsLeft = 1;
			break;
		}
		return true;
	}

	// Consumes one position of the escape budget and reports whether the
	// escape ends at ch: either the budget is used up or ch is not a hex digit.
	bool atEscapeEnd(int ch) noexcept {
		const int remaining = --digitsLeft;
		if (remaining <= 0) {
			return true;
		}
		return !IsAHeXDigit(ch);
	}
};
// True for the two triple-quoted string styles.
constexpr bool IsTripleString(int state) noexcept {
	switch (state) {
	case SCE_TOML_TRIPLE_STRING_SQ:
	case SCE_TOML_TRIPLE_STRING_DQ:
		return true;
	default:
		return false;
	}
}
// True for the two string styles delimited by double quotes.
constexpr bool IsDoubleQuoted(int state) noexcept {
	switch (state) {
	case SCE_TOML_STRING_DQ:
	case SCE_TOML_TRIPLE_STRING_DQ:
		return true;
	default:
		return false;
	}
}
// Returns the quote character for a string style: '"' for the double-quoted
// styles, '\'' for every other state.
constexpr int GetStringQuote(int state) noexcept {
	if (IsDoubleQuoted(state)) {
		return '\"';
	}
	return '\'';
}
// True for the characters styled as operators: brackets, braces, comma,
// equals sign, dot, plus and minus.
constexpr bool IsTOMLOperator(int ch) noexcept {
	switch (ch) {
	case '[':
	case ']':
	case '{':
	case '}':
	case ',':
	case '=':
	case '.':
	case '+':
	case '-':
		return true;
	default:
		return false;
	}
}
// True for characters allowed in an unquoted key: identifier characters and '-'.
constexpr bool IsTOMLUnquotedKey(int ch) noexcept {
	if (ch == '-') {
		return true;
	}
	return IsIdentifierChar(ch);
}
// True for a space or any character in the range 0x09..0x0d
// (tab, line feed, vertical tab, form feed, carriage return).
constexpr bool IsWhiteSpace(int ch) noexcept {
	if (ch == ' ') {
		return true;
	}
	return ch >= 0x09 && ch <= 0x0d;
}
// Returns the first non-whitespace character at or after the current position
// that still lies on the current line, or '\0' when only whitespace remains
// before the start of the next line.
int GetLineNextChar(StyleContext& sc) {
	if (!IsWhiteSpace(sc.ch)) {
		return sc.ch;
	}
	const Sci_Position currentPos = static_cast<Sci_Position>(sc.currentPos);
	if (currentPos + 1 == sc.lineStartNext) {
		return '\0';
	}
	if (!IsWhiteSpace(sc.chNext)) {
		return sc.chNext;
	}
	// pos is an offset relative to the current position, whereas lineStartNext
	// is an absolute document position. Convert before comparing so the scan
	// stops at the end of this line: line ends count as whitespace, so an
	// unbounded scan would otherwise pick up a character from a following line.
	for (Sci_Position pos = 2; currentPos + pos < sc.lineStartNext; pos++) {
		const unsigned char chPos = sc.GetRelative(pos);
		if (!IsWhiteSpace(chPos)) {
			return chPos;
		}
	}
	return '\0';
}
// Called when a number, date-time, identifier or string token has just ended.
// Inside braces/brackets (braceCount != 0), a token whose next visible
// character on the line is '=', '.' or '-' is re-styled as a key and true is
// returned with the key style still active.
// Otherwise an identifier found in kwList is re-styled as a keyword, the state
// is reset to the default style and false is returned.
// kwList is only dereferenced when the current state is SCE_TOML_IDENTIFIER.
bool IsTOMLKey(StyleContext& sc, int braceCount, const WordList *kwList) {
	if (braceCount != 0) {
		switch (GetLineNextChar(sc)) {
		case '=':
		case '.':
		case '-':
			sc.ChangeState(SCE_TOML_KEY);
			return true;
		default:
			break;
		}
	}
	if (sc.state == SCE_TOML_IDENTIFIER) {
		char word[8];
		sc.GetCurrentLowered(word, sizeof(word));
#if defined(__clang__)
		__builtin_assume(kwList != nullptr); // suppress [clang-analyzer-core.CallAndMessage]
#endif
		if (kwList->InList(word)) {
			sc.ChangeState(SCE_TOML_KEYWORD);
		}
	}
	sc.SetState(SCE_TOML_DEFAULT);
	return false;
}
// Classification of a line, stored in the low two bits of the line state.
enum class TOMLLineType {
	None = 0,
	Table = 1,
	CommentLine = 2,
};
// Which kind of key segment the lexer is currently inside.
enum class TOMLKeyState {
	Unquoted = 0,
	Literal = 1,	// single-quoted
	Quoted = 2,	// double-quoted
	End = 3,
};
// Lexer entry point: walks the requested range with a StyleContext, assigns
// SCE_TOML_* styles and stores a packed per-line state (line type, table
// level, brace depth) through styler.SetLineState() at every line end.
//   startPos, lengthDoc - range handed to the StyleContext
//   initStyle           - style the StyleContext starts in
//   keywordLists[0]     - word list used to turn identifiers into keywords
//   styler              - accessor used for styling and line state
void ColouriseTOMLDoc(Sci_PositionU startPos, Sci_Position lengthDoc, int initStyle, WordList *keywordLists[], Accessor &styler) {
// Number of non-whitespace characters seen so far on the current line.
int visibleChars = 0;
// Last non-whitespace character seen on the current line (0 at line start).
int chPrevNonWhite = 0;
// Number of unquoted '.' separators seen in the current table header.
int tableLevel = 0;
// Nesting depth of '[' / '{' opened in value context; carried across lines.
int braceCount = 0;
TOMLLineType lineType = TOMLLineType::None;
TOMLKeyState keyState = TOMLKeyState::Unquoted;
EscapeSequence escSeq;
StyleContext sc(startPos, lengthDoc, initStyle, styler);
if (sc.currentLine > 0) {
const int lineState = styler.GetLineState(sc.currentLine - 1);
/*
Line state bit layout, from the lowest bit upwards:
2: lineType
8: tableLevel
8: braceCount
Only braceCount is restored here; lineType and tableLevel are reset at every line end.
*/
braceCount = (lineState >> 10) & 0xff;
}
while (sc.More()) {
// First part of each iteration: decide whether the current token ends here.
// A 'continue' re-examines the same character in the newly selected state
// because it skips the sc.Forward() at the bottom of the loop.
switch (sc.state) {
case SCE_TOML_OPERATOR:
// Operators are styled one character at a time.
sc.SetState(SCE_TOML_DEFAULT);
break;
case SCE_TOML_NUMBER:
if (!IsDecimalNumber(sc.chPrev, sc.ch, sc.chNext)) {
// The token may turn out to be a date-time, or a key inside braces.
if (IsISODateTime(sc.ch, sc.chNext)) {
sc.ChangeState(SCE_TOML_DATETIME);
} else if (IsTOMLKey(sc, braceCount, nullptr)) {
keyState = TOMLKeyState::Unquoted;
continue;
}
}
break;
case SCE_TOML_DATETIME:
if (!(IsIdentifierChar(sc.ch) || IsISODateTime(sc.ch, sc.chNext))) {
if (IsTOMLKey(sc, braceCount, nullptr)) {
keyState = TOMLKeyState::Unquoted;
continue;
}
}
break;
case SCE_TOML_IDENTIFIER:
if (!IsIdentifierChar(sc.ch)) {
// IsTOMLKey also performs the keyword lookup for identifiers.
if (IsTOMLKey(sc, braceCount, keywordLists[0])) {
keyState = TOMLKeyState::Unquoted;
continue;
}
}
break;
case SCE_TOML_TABLE:
case SCE_TOML_KEY:
// Table headers and keys share the quoted/unquoted segment handling.
if (sc.atLineStart) {
sc.SetState(SCE_TOML_DEFAULT);
} else {
switch (keyState) {
case TOMLKeyState::Literal:
// Closing quote of a single-quoted segment.
if (sc.ch == '\'') {
keyState = TOMLKeyState::Unquoted;
sc.Forward();
}
break;
case TOMLKeyState::Quoted:
// A backslash skips the following character; '"' closes the segment.
if (sc.ch == '\\') {
sc.Forward();
} else if (sc.ch == '\"') {
keyState = TOMLKeyState::Unquoted;
sc.Forward();
}
break;
default:
break;
}
if (keyState == TOMLKeyState::Unquoted) {
if (sc.ch == '\'') {
keyState = TOMLKeyState::Literal;
} else if (sc.ch == '\"') {
keyState = TOMLKeyState::Quoted;
} else if (sc.ch == '.') {
if (sc.state == SCE_TOML_TABLE) {
// Each unquoted dot in a table header adds one nesting level.
++tableLevel;
} else {
// In a dotted key the dot is styled as an operator and the key resumes after it.
chPrevNonWhite = '.';
sc.SetState(SCE_TOML_OPERATOR);
sc.ForwardSetState(SCE_TOML_KEY);
// TODO: skip space after dot
continue;
}
} else if (sc.state == SCE_TOML_TABLE && sc.ch == ']') {
// End of the table header: consume ']' or ']]'. If a comment follows on
// the same line, return to the default style so it can be styled as one.
keyState = TOMLKeyState::End;
sc.Forward();
if (sc.ch == ']') {
sc.Forward();
}
const int chNext = GetLineNextChar(sc);
if (chNext == '#') {
sc.SetState(SCE_TOML_DEFAULT);
}
} else if (sc.state == SCE_TOML_KEY && !IsTOMLUnquotedKey(sc.ch)) {
// Character that cannot be part of a bare key: '=' next ends the key;
// otherwise, unless a dot precedes or follows, the text is flagged as an error.
const int chNext = GetLineNextChar(sc);
if (chNext == '=') {
keyState = TOMLKeyState::End;
sc.SetState(SCE_TOML_DEFAULT);
} else if (chNext != '.' && chPrevNonWhite != '.') {
sc.ChangeState(SCE_TOML_ERROR);
continue;
}
}
}
}
break;
case SCE_TOML_STRING_SQ:
case SCE_TOML_STRING_DQ:
case SCE_TOML_TRIPLE_STRING_SQ:
case SCE_TOML_TRIPLE_STRING_DQ:
if (sc.atLineStart && !IsTripleString(sc.state)) {
// Single-line strings do not continue onto the next line.
sc.SetState(SCE_TOML_DEFAULT);
} else if (sc.ch == '\\' && IsDoubleQuoted(sc.state)) {
// Escapes are only recognised in double-quoted strings.
if (escSeq.resetEscapeState(sc.state, sc.chNext)) {
sc.SetState(SCE_TOML_ESCAPECHAR);
sc.Forward();
}
} else if (sc.ch == GetStringQuote(sc.state) &&
(!IsTripleString(sc.state) || (sc.Match(IsDoubleQuoted(sc.state) ? R"(""")" : R"(''')")))) {
// Closing delimiter: consume the whole run of identical quote characters.
while (sc.ch == sc.chNext) {
sc.Forward();
}
sc.Forward();
// A single-line string inside braces may actually be a quoted key.
if (!IsTripleString(sc.state) && IsTOMLKey(sc, braceCount, nullptr)) {
keyState = TOMLKeyState::Unquoted;
continue;
}
sc.SetState(SCE_TOML_DEFAULT);
}
break;
case SCE_TOML_ESCAPECHAR:
if (escSeq.atEscapeEnd(sc.ch)) {
// Return to the string style that contained the escape.
sc.SetState(escSeq.outerState);
continue;
}
break;
case SCE_TOML_ERROR:
if (sc.atLineStart) {
sc.SetState(SCE_TOML_DEFAULT);
} else if (sc.ch == '#') {
sc.SetState(SCE_TOML_COMMENT);
}
break;
case SCE_TOML_COMMENT:
if (sc.atLineStart) {
sc.SetState(SCE_TOML_DEFAULT);
}
break;
}
// Second part: in the default state, decide which token starts at this character.
if (sc.state == SCE_TOML_DEFAULT) {
if (sc.ch == '#') {
sc.SetState(SCE_TOML_COMMENT);
if (visibleChars == 0) {
// Only a comment that is the first visible text marks the line as a comment line.
lineType = TOMLLineType::CommentLine;
}
} else if (visibleChars == 0 && braceCount == 0) {
// First visible character of a line outside any bracket/brace:
// it has to start a table header or a key.
if (sc.ch == '[') {
tableLevel = 0;
sc.SetState(SCE_TOML_TABLE);
if (sc.chNext == '[') {
sc.Forward();
}
keyState = TOMLKeyState::Unquoted;
lineType = TOMLLineType::Table;
} else if (sc.ch == '\'' || sc.ch == '\"') {
keyState = (sc.ch == '\'')? TOMLKeyState::Literal : TOMLKeyState::Quoted;
sc.SetState(SCE_TOML_KEY);
} else if (IsTOMLUnquotedKey(sc.ch)) {
keyState = TOMLKeyState::Unquoted;
sc.SetState(SCE_TOML_KEY);
} else if (!isspacechar(sc.ch)) {
// each line must be: key = value
sc.SetState(SCE_TOML_ERROR);
}
} else {
// Value context: strings, numbers, identifiers, operators and inline-table keys.
if (sc.ch == '\'') {
if (sc.Match(R"(''')")) {
sc.SetState(SCE_TOML_TRIPLE_STRING_SQ);
sc.Forward(2);
} else {
sc.SetState(SCE_TOML_STRING_SQ);
}
} else if (sc.ch == '"') {
if (sc.Match(R"(""")")) {
sc.SetState(SCE_TOML_TRIPLE_STRING_DQ);
sc.Forward(2);
} else {
sc.SetState(SCE_TOML_STRING_DQ);
}
} else if (IsADigit(sc.ch)) {
sc.SetState(SCE_TOML_NUMBER);
} else if (IsLowerCase(sc.ch)) {
sc.SetState(SCE_TOML_IDENTIFIER);
} else if (IsTOMLOperator(sc.ch)) {
sc.SetState(SCE_TOML_OPERATOR);
// Track nesting depth; it never drops below zero.
if (sc.ch == '[' || sc.ch == '{') {
++braceCount;
} else if (sc.ch == ']' || sc.ch == '}') {
if (braceCount > 0) {
--braceCount;
}
}
} else if (braceCount && IsTOMLUnquotedKey(sc.ch)) {
// Key inside an inline table
keyState = TOMLKeyState::Unquoted;
sc.SetState(SCE_TOML_KEY);
}
}
}
if (!isspacechar(sc.ch)) {
chPrevNonWhite = sc.ch;
++visibleChars;
}
if (sc.atLineEnd) {
// Pack the per-line information (see the layout above) and reset the
// per-line trackers; braceCount is deliberately kept for the next line.
const int lineState = (tableLevel << 2) | (braceCount << 10) | static_cast<int>(lineType);
styler.SetLineState(sc.currentLine, lineState);
lineType = TOMLLineType::None;
visibleChars = 0;
chPrevNonWhite = 0;
tableLevel = 0;
keyState = TOMLKeyState::Unquoted;
}
sc.Forward();
}
sc.Complete();
}
// Extracts the line type stored in the low two bits of a line state.
constexpr TOMLLineType GetLineType(int lineState) noexcept {
	constexpr int lineTypeMask = 3;
	const int typeBits = lineState & lineTypeMask;
	return static_cast<TOMLLineType>(typeBits);
}
// Extracts the 8-bit table level stored above the two line-type bits of a line state.
constexpr int GetTableLevel(int lineState) noexcept {
	constexpr int lineTypeBits = 2;
	constexpr int tableLevelMask = 0xff;
	const int shifted = lineState >> lineTypeBits;
	return shifted & tableLevelMask;
}
// Code folding, based on LexProps. Fold levels are derived from the line state
// written by the lexer: table header lines become fold headers at a level given
// by their table level, and the first line of a run of comment lines becomes a
// fold header for that run. A header that turns out to have nothing to fold
// gets its header flag removed again when the following line is processed.
void FoldTOMLDoc(Sci_PositionU startPos, Sci_Position lengthDoc, int /*initStyle*/, WordList *[] /*keywordLists*/, Accessor &styler) {
	const Sci_Position rangeEnd = startPos + lengthDoc;
	const Sci_Position lastLine = styler.GetLine((rangeEnd == styler.Length()) ? rangeEnd : rangeEnd - 1);
	Sci_Position line = styler.GetLine(startPos);

	// Seed the "line above" information from the lines preceding the range.
	int levelAbove = SC_FOLDLEVELBASE;
	TOMLLineType typeAbove = TOMLLineType::None;
	TOMLLineType typeTwoAbove = TOMLLineType::None;
	if (line > 0) {
		levelAbove = styler.LevelAt(line - 1);
		typeAbove = GetLineType(styler.GetLineState(line - 1));
		if (line >= 2) {
			typeTwoAbove = GetLineType(styler.GetLineState(line - 2));
		}
	}
	// True while the line above carries a header flag that may still have to be removed.
	bool headPending = (typeAbove == TOMLLineType::CommentLine) && (levelAbove & SC_FOLDLEVELHEADERFLAG);

	for (; line <= lastLine; line++) {
		const int lineState = styler.GetLineState(line);
		const TOMLLineType lineType = GetLineType(lineState);
		// Evaluated before levelAbove can be adjusted below; only the branches
		// that leave levelAbove untouched read it.
		const bool aboveIsHeader = (levelAbove & SC_FOLDLEVELHEADERFLAG) != 0;
		int level;

		if (lineType == TOMLLineType::CommentLine) {
			level = aboveIsHeader ? (levelAbove & SC_FOLDLEVELNUMBERMASK) + 1 : levelAbove;
			// The first comment line of a run is a tentative fold header.
			headPending = (typeAbove != TOMLLineType::CommentLine);
			if (headPending) {
				level |= SC_FOLDLEVELHEADERFLAG;
			}
		} else {
			if (lineType == TOMLLineType::Table) {
				level = SC_FOLDLEVELBASE + GetTableLevel(lineState);
				if ((typeAbove == TOMLLineType::CommentLine) && levelAbove <= level) {
					// comment above nested table
					headPending = true;
					levelAbove = level - 1;
				} else if ((typeAbove == TOMLLineType::Table) && (levelAbove & SC_FOLDLEVELNUMBERMASK) >= level) {
					headPending = true; // empty table
				}
				level |= SC_FOLDLEVELHEADERFLAG;
			} else if (headPending) {
				level = levelAbove & SC_FOLDLEVELNUMBERMASK;
			} else if (aboveIsHeader) {
				level = (levelAbove & SC_FOLDLEVELNUMBERMASK) + 1;
			} else if ((typeAbove == TOMLLineType::CommentLine) && (typeTwoAbove == TOMLLineType::CommentLine)) {
				level = levelAbove - 1;
			} else {
				level = levelAbove;
			}
			if (headPending) {
				// Rewrite the line above without its header flag.
				headPending = false;
				styler.SetLevel(line - 1, levelAbove & SC_FOLDLEVELNUMBERMASK);
			}
		}

		styler.SetLevel(line, level);
		levelAbove = level;
		typeTwoAbove = typeAbove;
		typeAbove = lineType;
	}
}
} // unnamed namespace end
// Descriptions of the word lists accepted by the lexer; the array ends with a null pointer.
static const char *const tomlWordListDesc[] = {
	"Keywords",
	nullptr
};
// Lexer module definition: ties SCLEX_TOML and the name "toml" to the
// colourising function, the folding function and the word list descriptions.
extern const LexerModule lmTOML(SCLEX_TOML, ColouriseTOMLDoc, "toml", FoldTOMLDoc, tomlWordListDesc);