notepad-plus-plus/lexilla/lexers/LexMarkdown.cxx
Christian Grasser ad79718fc8 Update to scintilla 5.5.2 & Lexilla 5.4.0
Release 5.5.2 ( https://www.scintilla.org/scintilla552.zip )

    Released 21 August 2024.
    Add SCI_SETCOPYSEPARATOR for separator between parts of a multiple selection when copied to the clipboard. Feature #1530.
    Add SCI_GETUNDOSEQUENCE to determine whether an undo sequence is active and its nesting depth.
    Add SCI_STYLESETSTRETCH to support condensed and expanded text styles.
    Add SCI_LINEINDENT and SCI_LINEDEDENT. Feature #1524.
    Fix bug on Cocoa where double-click stopped working when system had been running for a long time.
    On Cocoa implement more values of font weight and stretch.

Release 5.4.0 ( https://www.scintilla.org/lexilla540.zip )

    Released 21 August 2024.
    Inside Lexilla, LexerModule instances are now const. This will require changes to applications that modify Lexilla.cxx, which may be done to add custom lexers.
    Lexer added for TOML "toml".
    Bash: Handle backslash in heredoc delimiter. Issue #257.
    Progress: Fix lexing of nested comments. Pull request #258.
    Force lower-casing of case-insensitive keyword lists so keywords match in some lexers. Issue #259.

Close #15564
2024-08-23 02:59:58 +02:00

487 lines
18 KiB
C++

/******************************************************************
* LexMarkdown.cxx
*
* A simple Markdown lexer for scintilla.
*
* Includes highlighting for some extra features from the
* Pandoc implementation; strikeout, using '#.' as a default
* ordered list item marker, and delimited code blocks.
*
* Limitations:
*
* Standard indented code blocks are not highlighted at all,
* as it would conflict with other indentation schemes. Use
* delimited code blocks for blanket highlighting of an
* entire code block. Embedded HTML is not highlighted either.
* Blanket HTML highlighting has issues, because some Markdown
* implementations allow Markdown markup inside of the HTML. Also,
* there is a following blank line issue that can't be ignored,
* explained in the next paragraph. Embedded HTML and code
* blocks would be better supported with language specific
* highlighting.
*
* The highlighting aims to accurately reflect correct syntax,
* but a few restrictions are relaxed. Delimited code blocks are
* highlighted, even if the line following the code block is not blank.
* Requiring a blank line after a block, breaks the highlighting
* in certain cases, because of the way Scintilla ends up calling
* the lexer.
*
* Written by Jon Strait - jstrait@moonloop.net
*
* The License.txt file describes the conditions under which this
* software may be distributed.
*
*****************************************************************/
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <stdarg.h>
#include <assert.h>
#include <string>
#include <string_view>
#include "ILexer.h"
#include "Scintilla.h"
#include "SciLexer.h"
#include "WordList.h"
#include "LexAccessor.h"
#include "Accessor.h"
#include "StyleContext.h"
#include "CharacterSet.h"
#include "LexerModule.h"
using namespace Lexilla;
namespace {
constexpr bool IsNewline(const int ch) {
// sc.GetRelative(i) returns '\0' if out of range
return (ch == '\n' || ch == '\r' || ch == '\0');
}
}
// True if can follow ch down to the end with possibly trailing whitespace
// Does not set the state SCE_MARKDOWN_LINE_BEGIN as to allow further processing
static bool FollowToLineEnd(const int ch, const int state, const Sci_PositionU endPos, StyleContext &sc) {
Sci_Position i = 0;
while (sc.GetRelative(++i) == ch)
;
// Skip over whitespace
while (IsASpaceOrTab(sc.GetRelative(i)) && sc.currentPos + i < endPos)
++i;
if (IsNewline(sc.GetRelative(i)) || sc.currentPos + i == endPos) {
sc.SetState(state);
sc.Forward(i);
return true;
}
else return false;
}
// Set the state on text section from current to length characters,
// then set the rest until the newline to default, except for any characters matching token
static void SetStateAndZoom(const int state, const Sci_Position length, const int token, StyleContext &sc) {
sc.SetState(state);
sc.Forward(length);
sc.SetState(SCE_MARKDOWN_DEFAULT);
sc.Forward();
bool started = false;
while (sc.More() && !IsNewline(sc.ch)) {
if (sc.ch == token && !started) {
sc.SetState(state);
started = true;
}
else if (sc.ch != token) {
sc.SetState(SCE_MARKDOWN_DEFAULT);
started = false;
}
sc.Forward();
}
sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
}
// Does the previous line have more than spaces and tabs?
static bool HasPrevLineContent(StyleContext &sc) {
Sci_Position i = 0;
// Go back to the previous newline
while ((--i + (Sci_Position)sc.currentPos) >= 0 && !IsNewline(sc.GetRelative(i)))
;
while ((--i + (Sci_Position)sc.currentPos) >= 0) {
const int ch = sc.GetRelative(i);
if (ch == '\n')
break;
if (!((ch == '\r' || IsASpaceOrTab(ch))))
return true;
}
return false;
}
static bool AtTermStart(StyleContext &sc) {
return sc.currentPos == 0 || sc.chPrev == 0 || isspacechar(sc.chPrev);
}
static bool IsCompleteStyleRegion(StyleContext &sc, const char *token) {
bool found = false;
const size_t start = strlen(token);
Sci_Position i = static_cast<Sci_Position>(start);
while (!IsNewline(sc.GetRelative(i))) {
// make sure an empty pair of single-char tokens doesn't match
// with a longer token: {*}{*} != {**}
if (sc.GetRelative(i) == *token && sc.GetRelative(i - 1) != *token) {
found = start > 1U ? sc.GetRelative(i + 1) == token[1] : true;
break;
}
i++;
}
return AtTermStart(sc) && found;
}
static bool IsValidHrule(const Sci_PositionU endPos, StyleContext &sc) {
int count = 1;
Sci_Position i = 0;
for (;;) {
++i;
int c = sc.GetRelative(i);
if (c == sc.ch)
++count;
// hit a terminating character
else if (!IsASpaceOrTab(c) || sc.currentPos + i == endPos) {
// Are we a valid HRULE
if ((IsNewline(c) || sc.currentPos + i == endPos) &&
count >= 3 && !HasPrevLineContent(sc)) {
sc.SetState(SCE_MARKDOWN_HRULE);
sc.Forward(i);
sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
return true;
}
else {
sc.SetState(SCE_MARKDOWN_DEFAULT);
return false;
}
}
}
}
static void ColorizeMarkdownDoc(Sci_PositionU startPos, Sci_Position length, int initStyle,
WordList **, Accessor &styler) {
Sci_PositionU endPos = startPos + length;
int precharCount = 0;
bool isLinkNameDetecting = false;
// Don't advance on a new loop iteration and retry at the same position.
// Useful in the corner case of having to start at the beginning file position
// in the default state.
bool freezeCursor = false;
// property lexer.markdown.header.eolfill
// Set to 1 to highlight all ATX header text.
bool headerEOLFill = styler.GetPropertyInt("lexer.markdown.header.eolfill", 0) == 1;
StyleContext sc(startPos, static_cast<Sci_PositionU>(length), initStyle, styler);
while (sc.More()) {
// Skip past escaped characters
if (sc.ch == '\\') {
sc.Forward();
continue;
}
// A blockquotes resets the line semantics
if (sc.state == SCE_MARKDOWN_BLOCKQUOTE)
sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
// Conditional state-based actions
if (sc.state == SCE_MARKDOWN_CODE2) {
if (sc.Match("``")) {
const int closingSpan = (sc.GetRelative(2) == '`') ? 3 : 2;
sc.Forward(closingSpan);
sc.SetState(SCE_MARKDOWN_DEFAULT);
}
}
else if (sc.state == SCE_MARKDOWN_CODE) {
if (sc.ch == '`' && sc.chPrev != ' ')
sc.ForwardSetState(SCE_MARKDOWN_DEFAULT);
}
/* De-activated because it gets in the way of other valid indentation
* schemes, for example multiple paragraphs inside a list item.
// Code block
else if (sc.state == SCE_MARKDOWN_CODEBK) {
bool d = true;
if (IsNewline(sc.ch)) {
if (sc.chNext != '\t') {
for (int c = 1; c < 5; ++c) {
if (sc.GetRelative(c) != ' ')
d = false;
}
}
}
else if (sc.atLineStart) {
if (sc.ch != '\t' ) {
for (int i = 0; i < 4; ++i) {
if (sc.GetRelative(i) != ' ')
d = false;
}
}
}
if (!d)
sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
}
*/
// Strong
else if (sc.state == SCE_MARKDOWN_STRONG1) {
if ((sc.Match("**") && sc.chPrev != ' ') || IsNewline(sc.GetRelative(2))) {
sc.Forward(2);
sc.SetState(SCE_MARKDOWN_DEFAULT);
}
}
else if (sc.state == SCE_MARKDOWN_STRONG2) {
if ((sc.Match("__") && sc.chPrev != ' ') || IsNewline(sc.GetRelative(2))) {
sc.Forward(2);
sc.SetState(SCE_MARKDOWN_DEFAULT);
}
}
// Emphasis
else if (sc.state == SCE_MARKDOWN_EM1) {
if ((sc.ch == '*' && sc.chPrev != ' ') || IsNewline(sc.chNext))
sc.ForwardSetState(SCE_MARKDOWN_DEFAULT);
}
else if (sc.state == SCE_MARKDOWN_EM2) {
if ((sc.ch == '_' && sc.chPrev != ' ') || IsNewline(sc.chNext))
sc.ForwardSetState(SCE_MARKDOWN_DEFAULT);
}
else if (sc.state == SCE_MARKDOWN_CODEBK) {
if (sc.atLineStart && sc.Match("~~~")) {
Sci_Position i = 1;
while (!IsNewline(sc.GetRelative(i)) && sc.currentPos + i < endPos)
i++;
sc.Forward(i);
sc.SetState(SCE_MARKDOWN_DEFAULT);
}
}
else if (sc.state == SCE_MARKDOWN_STRIKEOUT) {
if ((sc.Match("~~") && sc.chPrev != ' ') || IsNewline(sc.GetRelative(2))) {
sc.Forward(2);
sc.SetState(SCE_MARKDOWN_DEFAULT);
}
}
else if (sc.state == SCE_MARKDOWN_LINE_BEGIN) {
// Header
if (sc.Match("######")) {
if (headerEOLFill)
sc.SetState(SCE_MARKDOWN_HEADER6);
else
SetStateAndZoom(SCE_MARKDOWN_HEADER6, 6, '#', sc);
}
else if (sc.Match("#####")) {
if (headerEOLFill)
sc.SetState(SCE_MARKDOWN_HEADER5);
else
SetStateAndZoom(SCE_MARKDOWN_HEADER5, 5, '#', sc);
}
else if (sc.Match("####")) {
if (headerEOLFill)
sc.SetState(SCE_MARKDOWN_HEADER4);
else
SetStateAndZoom(SCE_MARKDOWN_HEADER4, 4, '#', sc);
}
else if (sc.Match("###")) {
if (headerEOLFill)
sc.SetState(SCE_MARKDOWN_HEADER3);
else
SetStateAndZoom(SCE_MARKDOWN_HEADER3, 3, '#', sc);
}
else if (sc.Match("##")) {
if (headerEOLFill)
sc.SetState(SCE_MARKDOWN_HEADER2);
else
SetStateAndZoom(SCE_MARKDOWN_HEADER2, 2, '#', sc);
}
else if (sc.Match("#")) {
// Catch the special case of an unordered list
if (sc.chNext == '.' && IsASpaceOrTab(sc.GetRelative(2))) {
precharCount = 0;
sc.SetState(SCE_MARKDOWN_PRECHAR);
}
else if (headerEOLFill) {
sc.SetState(SCE_MARKDOWN_HEADER1);
}
else
SetStateAndZoom(SCE_MARKDOWN_HEADER1, 1, '#', sc);
}
// Code block
else if (sc.Match("~~~")) {
if (!HasPrevLineContent(sc))
sc.SetState(SCE_MARKDOWN_CODEBK);
else
sc.SetState(SCE_MARKDOWN_DEFAULT);
}
else if (sc.ch == '=') {
if (HasPrevLineContent(sc) && FollowToLineEnd('=', SCE_MARKDOWN_HEADER1, endPos, sc)) {
if (!headerEOLFill)
sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
}
else
sc.SetState(SCE_MARKDOWN_DEFAULT);
}
else if (sc.ch == '-') {
if (HasPrevLineContent(sc) && FollowToLineEnd('-', SCE_MARKDOWN_HEADER2, endPos, sc)) {
if (!headerEOLFill)
sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
}
else {
precharCount = 0;
sc.SetState(SCE_MARKDOWN_PRECHAR);
}
}
else if (IsNewline(sc.ch))
sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
else {
precharCount = 0;
sc.SetState(SCE_MARKDOWN_PRECHAR);
}
}
// The header lasts until the newline
else if (sc.state == SCE_MARKDOWN_HEADER1 || sc.state == SCE_MARKDOWN_HEADER2 ||
sc.state == SCE_MARKDOWN_HEADER3 || sc.state == SCE_MARKDOWN_HEADER4 ||
sc.state == SCE_MARKDOWN_HEADER5 || sc.state == SCE_MARKDOWN_HEADER6) {
if (headerEOLFill) {
if (sc.atLineStart) {
sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
freezeCursor = true;
}
}
else if (IsNewline(sc.ch))
sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
}
// New state only within the initial whitespace
if (sc.state == SCE_MARKDOWN_PRECHAR) {
// Blockquote
if (sc.ch == '>' && precharCount < 5)
sc.SetState(SCE_MARKDOWN_BLOCKQUOTE);
/*
// Begin of code block
else if (!HasPrevLineContent(sc) && (sc.chPrev == '\t' || precharCount >= 4))
sc.SetState(SCE_MARKDOWN_CODEBK);
*/
// HRule - Total of three or more hyphens, asterisks, or underscores
// on a line by themselves
else if ((sc.ch == '-' || sc.ch == '*' || sc.ch == '_') && IsValidHrule(endPos, sc))
;
// Unordered list
else if ((sc.ch == '-' || sc.ch == '*' || sc.ch == '+') && IsASpaceOrTab(sc.chNext)) {
sc.SetState(SCE_MARKDOWN_ULIST_ITEM);
sc.ForwardSetState(SCE_MARKDOWN_DEFAULT);
}
// Ordered list
else if (IsADigit(sc.ch)) {
int digitCount = 0;
while (IsADigit(sc.GetRelative(++digitCount)))
;
if (sc.GetRelative(digitCount) == '.' &&
IsASpaceOrTab(sc.GetRelative(digitCount + 1))) {
sc.SetState(SCE_MARKDOWN_OLIST_ITEM);
sc.Forward(digitCount + 1);
sc.SetState(SCE_MARKDOWN_DEFAULT);
} else {
// a textual number at the margin should be plain text
sc.SetState(SCE_MARKDOWN_DEFAULT);
}
}
// Alternate Ordered list
else if (sc.ch == '#' && sc.chNext == '.' && IsASpaceOrTab(sc.GetRelative(2))) {
sc.SetState(SCE_MARKDOWN_OLIST_ITEM);
sc.Forward(2);
sc.SetState(SCE_MARKDOWN_DEFAULT);
}
else if (sc.ch != ' ' || precharCount > 2)
sc.SetState(SCE_MARKDOWN_DEFAULT);
else
++precharCount;
}
// Any link
if (sc.state == SCE_MARKDOWN_LINK) {
if (sc.Match("](") && sc.GetRelative(-1) != '\\') {
sc.Forward(2);
isLinkNameDetecting = true;
}
else if (sc.Match("]:") && sc.GetRelative(-1) != '\\') {
sc.Forward(2);
sc.SetState(SCE_MARKDOWN_DEFAULT);
}
else if (!isLinkNameDetecting && sc.ch == ']' && sc.GetRelative(-1) != '\\') {
sc.Forward();
sc.SetState(SCE_MARKDOWN_DEFAULT);
}
else if (isLinkNameDetecting && sc.ch == ')' && sc.GetRelative(-1) != '\\') {
sc.Forward();
sc.SetState(SCE_MARKDOWN_DEFAULT);
isLinkNameDetecting = false;
}
}
// New state anywhere in doc
if (sc.state == SCE_MARKDOWN_DEFAULT) {
if (sc.atLineStart && sc.ch == '#') {
sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
freezeCursor = true;
}
// Links and Images
if (sc.Match("![")) {
sc.SetState(SCE_MARKDOWN_LINK);
sc.Forward(1);
}
else if (sc.ch == '[' && sc.GetRelative(-1) != '\\') {
sc.SetState(SCE_MARKDOWN_LINK);
}
// Code - also a special case for alternate inside spacing
else if (sc.Match("``") && sc.GetRelative(3) != ' ' && AtTermStart(sc)) {
const int openingSpan = (sc.GetRelative(2) == '`') ? 2 : 1;
sc.SetState(SCE_MARKDOWN_CODE2);
sc.Forward(openingSpan);
}
else if (sc.ch == '`' && sc.chNext != ' ' && IsCompleteStyleRegion(sc, "`")) {
sc.SetState(SCE_MARKDOWN_CODE);
}
// Strong
else if (sc.Match("**") && sc.GetRelative(2) != ' ' && IsCompleteStyleRegion(sc, "**")) {
sc.SetState(SCE_MARKDOWN_STRONG1);
sc.Forward();
}
else if (sc.Match("__") && sc.GetRelative(2) != ' ' && IsCompleteStyleRegion(sc, "__")) {
sc.SetState(SCE_MARKDOWN_STRONG2);
sc.Forward();
}
// Emphasis
else if (sc.ch == '*' && sc.chNext != ' ' && IsCompleteStyleRegion(sc, "*")) {
sc.SetState(SCE_MARKDOWN_EM1);
}
else if (sc.ch == '_' && sc.chNext != ' ' && IsCompleteStyleRegion(sc, "_")) {
sc.SetState(SCE_MARKDOWN_EM2);
}
// Strikeout
else if (sc.Match("~~") && !(sc.GetRelative(2) == '~' || sc.GetRelative(2) == ' ') &&
IsCompleteStyleRegion(sc, "~~")) {
sc.SetState(SCE_MARKDOWN_STRIKEOUT);
sc.Forward();
}
// Beginning of line
else if (IsNewline(sc.ch)) {
sc.SetState(SCE_MARKDOWN_LINE_BEGIN);
}
}
// Advance if not holding back the cursor for this iteration.
if (!freezeCursor)
sc.Forward();
freezeCursor = false;
}
sc.Complete();
}
extern const LexerModule lmMarkdown(SCLEX_MARKDOWN, ColorizeMarkdownDoc, "markdown");