mirror of
				https://github.com/notepad-plus-plus/notepad-plus-plus.git
				synced 2025-10-31 11:34:05 +01:00 
			
		
		
		
	git-svn-id: svn://svn.tuxfamily.org/svnroot/notepadplus/repository/trunk@461 f5eea248-9336-0410-98b8-ebc06183d4e3
		
			
				
	
	
		
			226 lines
		
	
	
		
			8.8 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
			
		
		
	
	
			226 lines
		
	
	
		
			8.8 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
| How to write a scintilla lexer
 | |
| 
 | |
| A lexer for a particular language determines how a specified range of
 | |
| text shall be colored.  Writing a lexer is relatively straightforward
 | |
| because the lexer need only color given text.  The harder job of
 | |
| determining how much text actually needs to be colored is handled by
 | |
| Scintilla itself, that is, the lexer's caller.
 | |
| 
 | |
| 
 | |
| Parameters
 | |
| 
 | |
| The lexer for language LLL has the following prototype:
 | |
| 
 | |
|     static void ColouriseLLLDoc (
 | |
|         unsigned int startPos, int length,
 | |
|         int initStyle,
 | |
|         WordList *keywordlists[],
 | |
|         Accessor &styler);
 | |
| 
 | |
| The styler parameter is an Accessor object.  The lexer must use this
 | |
| object to access the text to be colored.  The lexer gets the character
 | |
| at position i using styler.SafeGetCharAt(i);
 | |
| 
 | |
| The startPos and length parameters indicate the range of text to be
 | |
| recolored; the lexer must determine the proper color for all characters
 | |
| in positions startPos through startPos+length-1.
 | |
| 
 | |
| The initStyle parameter indicates the initial state, that is, the state
 | |
| at the character before startPos. States also indicate the coloring to
 | |
| be used for a particular range of text.
 | |
| 
 | |
| Note:  the character at startPos is assumed to start a line, so if a
 | |
| newline terminates the initStyle state the lexer should enter its
 | |
| default state (or whatever state should follow initStyle).
 | |
| 
 | |
| The keywordlists parameter specifies the keywords that the lexer must
 | |
| recognize.  A WordList class object contains methods that simplify
 | |
| the recognition of keywords.  Present lexers use a helper function
 | |
| called classifyWordLLL to recognize keywords.  These functions show how
 | |
| to use the keywordlists parameter to recognize keywords.  This
 | |
| documentation will not discuss keywords further.
 | |
| 
 | |
| 
 | |
| The lexer code
 | |
| 
 | |
| The task of a lexer can be summarized briefly: for each range r of
 | |
| characters that are to be colored the same, the lexer should call
 | |
| 
 | |
|     styler.ColourTo(i, state)
 | |
|         
 | |
| where i is the position of the last character of the range r.  The lexer
 | |
| should set the state variable to the coloring state of the character at
 | |
| position i and continue until the entire text has been colored.
 | |
| 
 | |
| Note 1:  the styler (Accessor) object remembers the i parameter in the
 | |
| previous calls to styler.ColourTo, so the single i parameter suffices to
 | |
| indicate a range of characters.
 | |
| 
 | |
| Note 2: As a side effect of calling styler.ColourTo(i,state), the
 | |
| coloring states of all characters in the range are remembered so that
 | |
| Scintilla may set the initStyle parameter correctly on future calls to
 | |
| the lexer.
 | |
| 
 | |
| 
 | |
| Lexer organization
 | |
| 
 | |
| There are at least two ways to organize the code of each lexer.  Present
 | |
| lexers use what might be called a "character-based" approach: the outer
 | |
| loop iterates over characters, like this:
 | |
| 
 | |
|   lengthDoc = startPos + length ;
 | |
|   for (unsigned int i = startPos; i < lengthDoc; i++) {
 | |
|     chNext = styler.SafeGetCharAt(i + 1);
 | |
|     << handle special cases >>
 | |
|     switch(state) {
 | |
|       // Handlers examine only ch and chNext.
 | |
|       // Handlers call styler.ColourTo(i,state) if the state changes.
 | |
|       case state_1: << handle ch in state 1 >>
 | |
|       case state_2: << handle ch in state 2 >>
 | |
|       ...
 | |
|       case state_n: << handle ch in state n >>
 | |
|     }
 | |
|     chPrev = ch;
 | |
|   }
 | |
|   styler.ColourTo(lengthDoc - 1, state);
 | |
| 
 | |
| 
 | |
| An alternative would be to use a "state-based" approach.  The outer loop
 | |
| would iterate over states, like this:
 | |
| 
 | |
|   lengthDoc = startPos+length ;
 | |
|   for ( unsigned int i = startPos ;; ) {
 | |
|     char ch = styler.SafeGetCharAt(i);
 | |
|     int new_state = 0 ;
 | |
|     switch ( state ) {
 | |
|       // scanners set new_state if they set the next state.
 | |
|       case state_1: << scan to the end of state 1 >> break ;
 | |
|       case state_2: << scan to the end of state 2 >> break ;
 | |
|       case default_state:
 | |
|         << scan to the next non-default state and set new_state >>
 | |
|     }
 | |
|     styler.ColourTo(i, state);
 | |
|     if ( i >= lengthDoc ) break ;
 | |
|     if ( ! new_state ) {
 | |
|       ch = styler.SafeGetCharAt(i);
 | |
|       << set state based on ch in the default state >>
 | |
|     }
 | |
|   }
 | |
|   styler.ColourTo(lengthDoc - 1, state);
 | |
| 
 | |
| This approach might seem to be more natural.  State scanners are simpler
 | |
| than character scanners because less needs to be done.  For example,
 | |
| there is no need to test for the start of a C string inside the scanner
 | |
| for a C comment.  Also this way makes it natural to define routines that
 | |
| could be used by more than one scanner; for example, a scanToEndOfLine
 | |
| routine.
 | |
| 
 | |
| However, the special cases handled in the main loop in the
 | |
| character-based approach would have to be handled by each state scanner,
 | |
| so both approaches have advantages.  These special cases are discussed
 | |
| below.
 | |
| 
 | |
| Special case: Lead characters
 | |
| 
 | |
| Lead bytes are part of DBCS processing for languages such as Japanese
 | |
| using an encoding such as Shift-JIS. In these encodings, extended
 | |
| (16-bit) characters are encoded as a lead byte followed by a trail byte.
 | |
| 
 | |
| Lead bytes are rarely of any lexical significance, normally only being
 | |
| allowed within strings and comments. In such contexts, lexers should
 | |
| ignore ch if styler.IsLeadByte(ch) returns TRUE.
 | |
| 
 | |
| Note: UTF-8 is simpler than Shift-JIS, so no special handling is
 | |
| applied for it. All UTF-8 extended characters are >= 128 and none are
 | |
| lexically significant in programming languages which, so far, use only
 | |
| characters in ASCII for operators, comment markers, etc.
 | |
| 
 | |
| 
 | |
| Special case: Folding
 | |
| 
 | |
| Folding may be performed in the lexer function. It is better to use a 
 | |
| separate folder function as that avoids some troublesome interaction 
 | |
| between styling and folding. The folder function will be run after the
 | |
| lexer function if folding is enabled. The rest of this section explains
 | |
| how to perform folding within the lexer function.
 | |
| 
 | |
| During initialization, lexers that support folding set
 | |
| 
 | |
|     bool fold = styler.GetPropertyInt("fold");
 | |
|         
 | |
| If folding is enabled in the editor, fold will be TRUE and the lexer
 | |
| should call:
 | |
| 
 | |
|     styler.SetLevel(line, level);
 | |
|         
 | |
| at the end of each line and just before exiting.
 | |
| 
 | |
| The line parameter is simply the count of the number of newlines seen. 
 | |
| Its initial value is styler.GetLine(startPos) and it is incremented
 | |
| (after calling styler.SetLevel) whenever a newline is seen.
 | |
| 
 | |
| The level parameter is the desired indentation level in the low 12 bits,
 | |
| along with flag bits in the upper four bits. The indentation level
 | |
| depends on the language.  For C++, it is incremented when the lexer sees
 | |
| a '{' and decremented when the lexer sees a '}' (outside of strings and
 | |
| comments, of course).
 | |
| 
 | |
| The following flag bits, defined in Scintilla.h, may be set or cleared
 | |
| in the level parameter. The SC_FOLDLEVELWHITEFLAG flag is set if the
 | |
| lexer considers that the line contains nothing but whitespace.  The
 | |
| SC_FOLDLEVELHEADERFLAG flag indicates that the line is a fold point. 
 | |
| This normally means that the next line has a greater level than the present
 | |
| line.  However, the lexer may have some other basis for determining a
 | |
| fold point.  For example, a lexer might create a header line for the
 | |
| first line of a function definition rather than the last.
 | |
| 
 | |
| The SC_FOLDLEVELNUMBERMASK mask denotes the level number in the low 12
 | |
| bits of the level param. This mask may be used to isolate either flags
 | |
| or level numbers.
 | |
| 
 | |
| For example, the C++ lexer contains the following code when a newline is
 | |
| seen:
 | |
| 
 | |
|   if (fold) {
 | |
|     int lev = levelPrev;
 | |
| 
 | |
|     // Set the "all whitespace" bit if the line is blank.
 | |
|     if (visChars == 0)
 | |
|       lev |= SC_FOLDLEVELWHITEFLAG;
 | |
| 
 | |
|     // Set the "header" bit if needed.
 | |
|     if ((levelCurrent > levelPrev) && (visChars > 0))
 | |
|       lev |= SC_FOLDLEVELHEADERFLAG;
 | |
|     styler.SetLevel(lineCurrent, lev);
 | |
|         
 | |
|     // reinitialize the folding vars describing the present line.
 | |
|     lineCurrent++;
 | |
|     visChars = 0;  // Number of non-whitespace characters on the line.
 | |
|     levelPrev = levelCurrent;
 | |
|   }
 | |
| 
 | |
| The following code appears in the C++ lexer just before exit:
 | |
| 
 | |
|   // Fill in the real level of the next line, keeping the current flags
 | |
|   // as they will be filled in later.
 | |
|   if (fold) {
 | |
|     // Mask off the level number, leaving only the previous flags.
 | |
|     int flagsNext = styler.LevelAt(lineCurrent);
 | |
|     flagsNext &= ~SC_FOLDLEVELNUMBERMASK;
 | |
|     styler.SetLevel(lineCurrent, levelPrev | flagsNext);
 | |
|   }
 | |
|         
 | |
| 
 | |
| Don't worry about performance
 | |
| 
 | |
| The writer of a lexer may safely ignore performance considerations: the
 | |
| cost of redrawing the screen is several orders of magnitude greater than
 | |
| the cost of function calls, etc.  Moreover, Scintilla performs all the
 | |
| important optimizations; Scintilla ensures that a lexer will be called
 | |
| only to recolor text that actually needs to be recolored.  Finally, it
 | |
| is not necessary to avoid extra calls to styler.ColourTo: the styler
 | |
| object buffers calls to ColourTo to avoid multiple updates of the
 | |
| screen.
 | |
| 
 | |
| Page contributed by Edward K. Ream |