notepad-plus-plus/lexilla/lexers/LexR.cxx

// Scintilla source code edit control
/** @file LexR.cxx
 ** Lexer for R, S, SPlus Statistics Program (Heavily derived from CPP Lexer).
 **
 **/
// Copyright 1998-2002 by Neil Hodgson <neilh@scintilla.org>
// The License.txt file describes the conditions under which this software may be distributed.

#include <cassert>
#include <cctype>

#include <string>
#include <string_view>

#include "ILexer.h"
#include "Scintilla.h"
#include "SciLexer.h"

#include "WordList.h"
#include "LexAccessor.h"
#include "Accessor.h"
#include "StyleContext.h"
#include "CharacterSet.h"
#include "LexerModule.h"

using namespace Lexilla;

namespace {

inline bool IsAWordChar(int ch) noexcept {
	return (ch < 0x80) && (isalnum(ch) || ch == '.' || ch == '_');
}

inline bool IsAWordStart(int ch) noexcept {
	return (ch < 0x80) && (isalnum(ch) || ch == '_');
}

constexpr bool IsAnOperator(int ch) noexcept {
	// '.' left out as it is used to make up numbers
	if (ch == '-' || ch == '+' || ch == '!' || ch == '~' ||
	        ch == '?' || ch == ':' || ch == '*' || ch == '/' ||
	        ch == '^' || ch == '<' || ch == '>' || ch == '=' ||
	        ch == '&' || ch == '|' || ch == '$' || ch == '(' ||
	        ch == ')' || ch == '}' || ch == '{' || ch == '[' ||
		ch == ']')
		return true;
	return false;
}

constexpr bool IsOctalOrHex(int ch, bool hex) noexcept {
	return IsAnOctalDigit(ch) || (hex && IsAHeXDigit(ch));
}

// https://search.r-project.org/R/refmans/base/html/Quotes.html
struct EscapeSequence {
	int outerState = SCE_R_DEFAULT;
	int digitsLeft = 0;
	bool hex = false;
	bool brace = false;

	// highlight any character as escape sequence, unrecognized escape sequence is syntax error.
	void resetEscapeState(int state, int chNext) noexcept {
		outerState = state;
		digitsLeft = 1;
		hex = true;
		brace = false;
		if (chNext == 'x') {
			digitsLeft = 3;
		} else if (chNext == 'u') {
			digitsLeft = 5;
		} else if (chNext == 'U') {
			digitsLeft = 9;
		} else if (IsAnOctalDigit(chNext)) {
			digitsLeft = 3;
			hex = false;
		}
	}
	bool atEscapeEnd(int ch) noexcept {
		--digitsLeft;
		return digitsLeft <= 0 || !IsOctalOrHex(ch, hex);
	}
};

int CheckRawString(LexAccessor &styler, Sci_Position pos, int &dashCount) {
	dashCount = 0;
	while (true) {
		const char ch = styler.SafeGetCharAt(pos++);
		switch (ch) {
		case '-':
			++dashCount;
			break;
		case '(':
			return ')';
		case '[':
			return ']';
		case '{':
			return '}';
		default:
			dashCount = 0;
			return 0;
		}
	}
}

void ColouriseRDoc(Sci_PositionU startPos, Sci_Position length, int initStyle, WordList *keywordlists[],
                            Accessor &styler) {

	const WordList &keywords   = *keywordlists[0];
	const WordList &keywords2 = *keywordlists[1];
	const WordList &keywords3 = *keywordlists[2];
	// state for raw string
	int matchingDelimiter = 0;
	int dashCount = 0;

	// property lexer.r.escape.sequence
	//	Set to 1 to enable highlighting of escape sequences in strings.
	const bool escapeSequence = styler.GetPropertyInt("lexer.r.escape.sequence", 0) != 0;
	EscapeSequence escapeSeq;

	StyleContext sc(startPos, length, initStyle, styler);
	if (sc.currentLine > 0) {
		const int lineState = styler.GetLineState(sc.currentLine - 1);
		matchingDelimiter = lineState & 0xff;
		dashCount = lineState >> 8;
	}

	while (sc.More()) {
		// Determine if the current state should terminate.
		switch (sc.state) {
		case SCE_R_OPERATOR:
			sc.SetState(SCE_R_DEFAULT);
			break;

		case SCE_R_NUMBER:
			// https://cran.r-project.org/doc/manuals/r-release/R-lang.html#Literal-constants
			if (AnyOf(sc.ch, 'e', 'E', 'p', 'P') && (IsADigit(sc.chNext) || sc.chNext == '+' || sc.chNext == '-')) {
				sc.Forward(); // exponent part
			} else if (!(IsAHeXDigit(sc.ch) || (sc.ch == '.' && IsADigit(sc.chNext)))) {
				if (AnyOf(sc.ch, 'L', 'i')) {
					sc.Forward(); // integer and complex qualifier
				}
				sc.SetState(SCE_R_DEFAULT);
			}
			break;

		case SCE_R_IDENTIFIER:
			if (!IsAWordChar(sc.ch)) {
				char s[100];
				sc.GetCurrent(s, sizeof(s));
				if (keywords.InList(s)) {
					sc.ChangeState(SCE_R_KWORD);
				} else if  (keywords2.InList(s)) {
					sc.ChangeState(SCE_R_BASEKWORD);
				} else if  (keywords3.InList(s)) {
					sc.ChangeState(SCE_R_OTHERKWORD);
				}
				sc.SetState(SCE_R_DEFAULT);
			}
			break;

		case SCE_R_COMMENT:
			if (sc.MatchLineEnd()) {
				sc.SetState(SCE_R_DEFAULT);
			}
			break;

		case SCE_R_STRING:
		case SCE_R_STRING2:
		case SCE_R_BACKTICKS:
			if (sc.ch == '\\') {
				if (escapeSequence) {
					escapeSeq.resetEscapeState(sc.state, sc.chNext);
					sc.SetState(SCE_R_ESCAPESEQUENCE);
					sc.Forward();
					if (sc.chNext == '{' && AnyOf(sc.ch, 'u', 'U')) {
						escapeSeq.brace = true;
						sc.Forward();
					} else if (sc.MatchLineEnd()) {
						// don't highlight line ending as escape sequence:
						// escapeSeq.outerState is lost when editing on next line.
						sc.SetState(escapeSeq.outerState);
					}
				} else {
					sc.Forward(); // Skip all characters after the backslash
				}
			} else if ((sc.state == SCE_R_STRING && sc.ch == '\"')
				|| (sc.state == SCE_R_STRING2 && sc.ch == '\'')
				|| (sc.state == SCE_R_BACKTICKS && sc.ch == '`')) {
				sc.ForwardSetState(SCE_R_DEFAULT);
			}
			break;

		case SCE_R_ESCAPESEQUENCE:
			if (escapeSeq.atEscapeEnd(sc.ch)) {
				if (escapeSeq.brace && sc.ch == '}') {
					sc.Forward();
				}
				sc.SetState(escapeSeq.outerState);
				continue;
			}
			break;

		case SCE_R_RAWSTRING:
		case SCE_R_RAWSTRING2:
			while (sc.ch == matchingDelimiter) {
				sc.Forward();
				int count = dashCount;
				while (count != 0 && sc.ch == '-') {
					--count;
					sc.Forward();
				}
				if (count == 0 && sc.ch == ((sc.state == SCE_R_RAWSTRING) ? '\"' : '\'')) {
					matchingDelimiter = 0;
					dashCount = 0;
					sc.ForwardSetState(SCE_R_DEFAULT);
					break;
				}
			}
			break;

		case SCE_R_INFIX:
			if (sc.ch == '%') {
				sc.ForwardSetState(SCE_R_DEFAULT);
			} else if (sc.atLineEnd) {
				sc.ChangeState(SCE_R_INFIXEOL);
			}
			break;

		case SCE_R_INFIXEOL:
			if (sc.atLineStart) {
				sc.SetState(SCE_R_DEFAULT);
			}
			break;
		}

		// Determine if a new state should be entered.
		if (sc.state == SCE_R_DEFAULT) {
			if (IsADigit(sc.ch) || (sc.ch == '.' && IsADigit(sc.chNext))) {
				sc.SetState(SCE_R_NUMBER);
				if (sc.ch == '0' && AnyOf(sc.chNext, 'x', 'X')) {
					sc.Forward();
				}
			} else if (AnyOf(sc.ch, 'r', 'R') && AnyOf(sc.chNext, '\"', '\'')) {
				const int chNext = sc.chNext;
				matchingDelimiter = CheckRawString(styler, sc.currentPos + 2, dashCount);
				if (matchingDelimiter) {
					sc.SetState((chNext == '\"') ? SCE_R_RAWSTRING : SCE_R_RAWSTRING2);
					sc.Forward(dashCount + 2);
				} else {
					// syntax error
					sc.SetState(SCE_R_IDENTIFIER);
					sc.ForwardSetState((chNext == '\"') ? SCE_R_STRING : SCE_R_STRING2);
				}
			} else if (IsAWordStart(sc.ch) ) {
				sc.SetState(SCE_R_IDENTIFIER);
			} else if (sc.Match('#')) {
				sc.SetState(SCE_R_COMMENT);
			} else if (sc.ch == '\"') {
				sc.SetState(SCE_R_STRING);
			} else if (sc.ch == '%') {
				sc.SetState(SCE_R_INFIX);
			} else if (sc.ch == '\'') {
				sc.SetState(SCE_R_STRING2);
			} else if (sc.ch == '`') {
				sc.SetState(SCE_R_BACKTICKS);
			} else if (IsAnOperator(sc.ch)) {
				sc.SetState(SCE_R_OPERATOR);
			}
		}

		if (sc.atLineEnd) {
			const int lineState = matchingDelimiter | (dashCount << 8);
			styler.SetLineState(sc.currentLine, lineState);
		}
		sc.Forward();
	}
	sc.Complete();
}

// Store both the current line's fold level and the next lines in the
// level store to make it easy to pick up with each increment
// and to make it possible to fiddle the current level for "} else {".
void FoldRDoc(Sci_PositionU startPos, Sci_Position length, int, WordList *[],
                       Accessor &styler) {
	const bool foldCompact = styler.GetPropertyInt("fold.compact", 1) != 0;
	const bool foldAtElse = styler.GetPropertyInt("fold.at.else", 0) != 0;
	const Sci_PositionU endPos = startPos + length;
	int visibleChars = 0;
	Sci_Position lineCurrent = styler.GetLine(startPos);
	int levelCurrent = SC_FOLDLEVELBASE;
	if (lineCurrent > 0)
		levelCurrent = styler.LevelAt(lineCurrent-1) >> 16;
	int levelMinCurrent = levelCurrent;
	int levelNext = levelCurrent;
	char chNext = styler[startPos];
	int styleNext = styler.StyleAt(startPos);
	for (Sci_PositionU i = startPos; i < endPos; i++) {
		const char ch = chNext;
		chNext = styler.SafeGetCharAt(i + 1);
		const int style = styleNext;
		styleNext = styler.StyleAt(i + 1);
		const bool atEOL = (ch == '\r' && chNext != '\n') || (ch == '\n');
		if (style == SCE_R_OPERATOR) {
			if (ch == '{') {
				// Measure the minimum before a '{' to allow
				// folding on "} else {"
				if (levelMinCurrent > levelNext) {
					levelMinCurrent = levelNext;
				}
				levelNext++;
			} else if (ch == '}') {
				levelNext--;
			}
		}
		if (atEOL) {
			int levelUse = levelCurrent;
			if (foldAtElse) {
				levelUse = levelMinCurrent;
			}
			int lev = levelUse | levelNext << 16;
			if (visibleChars == 0 && foldCompact)
				lev |= SC_FOLDLEVELWHITEFLAG;
			if (levelUse < levelNext)
				lev |= SC_FOLDLEVELHEADERFLAG;
			if (lev != styler.LevelAt(lineCurrent)) {
				styler.SetLevel(lineCurrent, lev);
			}
			lineCurrent++;
			levelCurrent = levelNext;
			levelMinCurrent = levelCurrent;
			visibleChars = 0;
		}
		if (!isspacechar(ch))
			visibleChars++;
	}
}


const char * const RWordLists[] = {
		"Language Keywords",
		"Base / Default package function",
		"Other Package Functions",
		"Unused",
		"Unused",
		nullptr,
};

}

LexerModule lmR(SCLEX_R, ColouriseRDoc, "r", FoldRDoc, RWordLists);