mirror of
				https://github.com/go-gitea/gitea.git
				synced 2025-10-25 01:24:13 +02:00 
			
		
		
		
	* Server-side syntax hilighting for all code This PR does a few things: * Remove all traces of highlight.js * Use chroma library to provide fast syntax hilighting directly on the server * Provide syntax hilighting for diffs * Re-style both unified and split diffs views * Add custom syntax hilighting styling for both regular and arc-green Fixes #7729 Fixes #10157 Fixes #11825 Fixes #7728 Fixes #3872 Fixes #3682 And perhaps gets closer to #9553 * fix line marker * fix repo search * Fix single line select * properly load settings * npm uninstall highlight.js * review suggestion * code review * forgot to call function * fix test * Apply suggestions from code review suggestions from @silverwind thanks Co-authored-by: silverwind <me@silverwind.io> * code review * copy/paste error * Use const for highlight size limit * Update web_src/less/_repository.less Co-authored-by: Lauris BH <lauris@nix.lv> * update size limit to 1MB and other styling tweaks * fix highlighting for certain diff sections * fix test * add worker back as suggested Co-authored-by: silverwind <me@silverwind.io> Co-authored-by: Lauris BH <lauris@nix.lv>
		
			
				
	
	
		
			855 lines
		
	
	
		
			22 KiB
		
	
	
	
		
			Go
		
	
	
	
		
			Vendored
		
	
	
	
			
		
		
	
	
			855 lines
		
	
	
		
			22 KiB
		
	
	
	
		
			Go
		
	
	
	
		
			Vendored
		
	
	
	
| package syntax
 | |
| 
 | |
| import (
 | |
| 	"bytes"
 | |
| 	"encoding/binary"
 | |
| 	"fmt"
 | |
| 	"sort"
 | |
| 	"unicode"
 | |
| 	"unicode/utf8"
 | |
| )
 | |
| 
 | |
| // CharSet combines start-end rune ranges and unicode categories representing a set of characters
 | |
| type CharSet struct {
 | |
| 	ranges     []singleRange
 | |
| 	categories []category
 | |
| 	sub        *CharSet //optional subtractor
 | |
| 	negate     bool
 | |
| 	anything   bool
 | |
| }
 | |
| 
 | |
| type category struct {
 | |
| 	negate bool
 | |
| 	cat    string
 | |
| }
 | |
| 
 | |
| type singleRange struct {
 | |
| 	first rune
 | |
| 	last  rune
 | |
| }
 | |
| 
 | |
| const (
 | |
| 	spaceCategoryText = " "
 | |
| 	wordCategoryText  = "W"
 | |
| )
 | |
| 
 | |
| var (
 | |
| 	ecmaSpace = []rune{0x0009, 0x000e, 0x0020, 0x0021, 0x00a0, 0x00a1, 0x1680, 0x1681, 0x2000, 0x200b, 0x2028, 0x202a, 0x202f, 0x2030, 0x205f, 0x2060, 0x3000, 0x3001, 0xfeff, 0xff00}
 | |
| 	ecmaWord  = []rune{0x0030, 0x003a, 0x0041, 0x005b, 0x005f, 0x0060, 0x0061, 0x007b}
 | |
| 	ecmaDigit = []rune{0x0030, 0x003a}
 | |
| )
 | |
| 
 | |
| var (
 | |
| 	AnyClass          = getCharSetFromOldString([]rune{0}, false)
 | |
| 	ECMAAnyClass      = getCharSetFromOldString([]rune{0, 0x000a, 0x000b, 0x000d, 0x000e}, false)
 | |
| 	NoneClass         = getCharSetFromOldString(nil, false)
 | |
| 	ECMAWordClass     = getCharSetFromOldString(ecmaWord, false)
 | |
| 	NotECMAWordClass  = getCharSetFromOldString(ecmaWord, true)
 | |
| 	ECMASpaceClass    = getCharSetFromOldString(ecmaSpace, false)
 | |
| 	NotECMASpaceClass = getCharSetFromOldString(ecmaSpace, true)
 | |
| 	ECMADigitClass    = getCharSetFromOldString(ecmaDigit, false)
 | |
| 	NotECMADigitClass = getCharSetFromOldString(ecmaDigit, true)
 | |
| 
 | |
| 	WordClass     = getCharSetFromCategoryString(false, false, wordCategoryText)
 | |
| 	NotWordClass  = getCharSetFromCategoryString(true, false, wordCategoryText)
 | |
| 	SpaceClass    = getCharSetFromCategoryString(false, false, spaceCategoryText)
 | |
| 	NotSpaceClass = getCharSetFromCategoryString(true, false, spaceCategoryText)
 | |
| 	DigitClass    = getCharSetFromCategoryString(false, false, "Nd")
 | |
| 	NotDigitClass = getCharSetFromCategoryString(false, true, "Nd")
 | |
| )
 | |
| 
 | |
| var unicodeCategories = func() map[string]*unicode.RangeTable {
 | |
| 	retVal := make(map[string]*unicode.RangeTable)
 | |
| 	for k, v := range unicode.Scripts {
 | |
| 		retVal[k] = v
 | |
| 	}
 | |
| 	for k, v := range unicode.Categories {
 | |
| 		retVal[k] = v
 | |
| 	}
 | |
| 	for k, v := range unicode.Properties {
 | |
| 		retVal[k] = v
 | |
| 	}
 | |
| 	return retVal
 | |
| }()
 | |
| 
 | |
| func getCharSetFromCategoryString(negateSet bool, negateCat bool, cats ...string) func() *CharSet {
 | |
| 	if negateCat && negateSet {
 | |
| 		panic("BUG!  You should only negate the set OR the category in a constant setup, but not both")
 | |
| 	}
 | |
| 
 | |
| 	c := CharSet{negate: negateSet}
 | |
| 
 | |
| 	c.categories = make([]category, len(cats))
 | |
| 	for i, cat := range cats {
 | |
| 		c.categories[i] = category{cat: cat, negate: negateCat}
 | |
| 	}
 | |
| 	return func() *CharSet {
 | |
| 		//make a copy each time
 | |
| 		local := c
 | |
| 		//return that address
 | |
| 		return &local
 | |
| 	}
 | |
| }
 | |
| 
 | |
| func getCharSetFromOldString(setText []rune, negate bool) func() *CharSet {
 | |
| 	c := CharSet{}
 | |
| 	if len(setText) > 0 {
 | |
| 		fillFirst := false
 | |
| 		l := len(setText)
 | |
| 		if negate {
 | |
| 			if setText[0] == 0 {
 | |
| 				setText = setText[1:]
 | |
| 			} else {
 | |
| 				l++
 | |
| 				fillFirst = true
 | |
| 			}
 | |
| 		}
 | |
| 
 | |
| 		if l%2 == 0 {
 | |
| 			c.ranges = make([]singleRange, l/2)
 | |
| 		} else {
 | |
| 			c.ranges = make([]singleRange, l/2+1)
 | |
| 		}
 | |
| 
 | |
| 		first := true
 | |
| 		if fillFirst {
 | |
| 			c.ranges[0] = singleRange{first: 0}
 | |
| 			first = false
 | |
| 		}
 | |
| 
 | |
| 		i := 0
 | |
| 		for _, r := range setText {
 | |
| 			if first {
 | |
| 				// lower bound in a new range
 | |
| 				c.ranges[i] = singleRange{first: r}
 | |
| 				first = false
 | |
| 			} else {
 | |
| 				c.ranges[i].last = r - 1
 | |
| 				i++
 | |
| 				first = true
 | |
| 			}
 | |
| 		}
 | |
| 		if !first {
 | |
| 			c.ranges[i].last = utf8.MaxRune
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	return func() *CharSet {
 | |
| 		local := c
 | |
| 		return &local
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // Copy makes a deep copy to prevent accidental mutation of a set
 | |
| func (c CharSet) Copy() CharSet {
 | |
| 	ret := CharSet{
 | |
| 		anything: c.anything,
 | |
| 		negate:   c.negate,
 | |
| 	}
 | |
| 
 | |
| 	ret.ranges = append(ret.ranges, c.ranges...)
 | |
| 	ret.categories = append(ret.categories, c.categories...)
 | |
| 
 | |
| 	if c.sub != nil {
 | |
| 		sub := c.sub.Copy()
 | |
| 		ret.sub = &sub
 | |
| 	}
 | |
| 
 | |
| 	return ret
 | |
| }
 | |
| 
 | |
| // gets a human-readable description for a set string
 | |
| func (c CharSet) String() string {
 | |
| 	buf := &bytes.Buffer{}
 | |
| 	buf.WriteRune('[')
 | |
| 
 | |
| 	if c.IsNegated() {
 | |
| 		buf.WriteRune('^')
 | |
| 	}
 | |
| 
 | |
| 	for _, r := range c.ranges {
 | |
| 
 | |
| 		buf.WriteString(CharDescription(r.first))
 | |
| 		if r.first != r.last {
 | |
| 			if r.last-r.first != 1 {
 | |
| 				//groups that are 1 char apart skip the dash
 | |
| 				buf.WriteRune('-')
 | |
| 			}
 | |
| 			buf.WriteString(CharDescription(r.last))
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	for _, c := range c.categories {
 | |
| 		buf.WriteString(c.String())
 | |
| 	}
 | |
| 
 | |
| 	if c.sub != nil {
 | |
| 		buf.WriteRune('-')
 | |
| 		buf.WriteString(c.sub.String())
 | |
| 	}
 | |
| 
 | |
| 	buf.WriteRune(']')
 | |
| 
 | |
| 	return buf.String()
 | |
| }
 | |
| 
 | |
| // mapHashFill converts a charset into a buffer for use in maps
 | |
| func (c CharSet) mapHashFill(buf *bytes.Buffer) {
 | |
| 	if c.negate {
 | |
| 		buf.WriteByte(0)
 | |
| 	} else {
 | |
| 		buf.WriteByte(1)
 | |
| 	}
 | |
| 
 | |
| 	binary.Write(buf, binary.LittleEndian, len(c.ranges))
 | |
| 	binary.Write(buf, binary.LittleEndian, len(c.categories))
 | |
| 	for _, r := range c.ranges {
 | |
| 		buf.WriteRune(r.first)
 | |
| 		buf.WriteRune(r.last)
 | |
| 	}
 | |
| 	for _, ct := range c.categories {
 | |
| 		buf.WriteString(ct.cat)
 | |
| 		if ct.negate {
 | |
| 			buf.WriteByte(1)
 | |
| 		} else {
 | |
| 			buf.WriteByte(0)
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	if c.sub != nil {
 | |
| 		c.sub.mapHashFill(buf)
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // CharIn returns true if the rune is in our character set (either ranges or categories).
 | |
| // It handles negations and subtracted sub-charsets.
 | |
| func (c CharSet) CharIn(ch rune) bool {
 | |
| 	val := false
 | |
| 	// in s && !s.subtracted
 | |
| 
 | |
| 	//check ranges
 | |
| 	for _, r := range c.ranges {
 | |
| 		if ch < r.first {
 | |
| 			continue
 | |
| 		}
 | |
| 		if ch <= r.last {
 | |
| 			val = true
 | |
| 			break
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	//check categories if we haven't already found a range
 | |
| 	if !val && len(c.categories) > 0 {
 | |
| 		for _, ct := range c.categories {
 | |
| 			// special categories...then unicode
 | |
| 			if ct.cat == spaceCategoryText {
 | |
| 				if unicode.IsSpace(ch) {
 | |
| 					// we found a space so we're done
 | |
| 					// negate means this is a "bad" thing
 | |
| 					val = !ct.negate
 | |
| 					break
 | |
| 				} else if ct.negate {
 | |
| 					val = true
 | |
| 					break
 | |
| 				}
 | |
| 			} else if ct.cat == wordCategoryText {
 | |
| 				if IsWordChar(ch) {
 | |
| 					val = !ct.negate
 | |
| 					break
 | |
| 				} else if ct.negate {
 | |
| 					val = true
 | |
| 					break
 | |
| 				}
 | |
| 			} else if unicode.Is(unicodeCategories[ct.cat], ch) {
 | |
| 				// if we're in this unicode category then we're done
 | |
| 				// if negate=true on this category then we "failed" our test
 | |
| 				// otherwise we're good that we found it
 | |
| 				val = !ct.negate
 | |
| 				break
 | |
| 			} else if ct.negate {
 | |
| 				val = true
 | |
| 				break
 | |
| 			}
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	// negate the whole char set
 | |
| 	if c.negate {
 | |
| 		val = !val
 | |
| 	}
 | |
| 
 | |
| 	// get subtracted recurse
 | |
| 	if val && c.sub != nil {
 | |
| 		val = !c.sub.CharIn(ch)
 | |
| 	}
 | |
| 
 | |
| 	//log.Printf("Char '%v' in %v == %v", string(ch), c.String(), val)
 | |
| 	return val
 | |
| }
 | |
| 
 | |
| func (c category) String() string {
 | |
| 	switch c.cat {
 | |
| 	case spaceCategoryText:
 | |
| 		if c.negate {
 | |
| 			return "\\S"
 | |
| 		}
 | |
| 		return "\\s"
 | |
| 	case wordCategoryText:
 | |
| 		if c.negate {
 | |
| 			return "\\W"
 | |
| 		}
 | |
| 		return "\\w"
 | |
| 	}
 | |
| 	if _, ok := unicodeCategories[c.cat]; ok {
 | |
| 
 | |
| 		if c.negate {
 | |
| 			return "\\P{" + c.cat + "}"
 | |
| 		}
 | |
| 		return "\\p{" + c.cat + "}"
 | |
| 	}
 | |
| 	return "Unknown category: " + c.cat
 | |
| }
 | |
| 
 | |
| // CharDescription Produces a human-readable description for a single character.
 | |
| func CharDescription(ch rune) string {
 | |
| 	/*if ch == '\\' {
 | |
| 		return "\\\\"
 | |
| 	}
 | |
| 
 | |
| 	if ch > ' ' && ch <= '~' {
 | |
| 		return string(ch)
 | |
| 	} else if ch == '\n' {
 | |
| 		return "\\n"
 | |
| 	} else if ch == ' ' {
 | |
| 		return "\\ "
 | |
| 	}*/
 | |
| 
 | |
| 	b := &bytes.Buffer{}
 | |
| 	escape(b, ch, false) //fmt.Sprintf("%U", ch)
 | |
| 	return b.String()
 | |
| }
 | |
| 
 | |
| // According to UTS#18 Unicode Regular Expressions (http://www.unicode.org/reports/tr18/)
 | |
| // RL 1.4 Simple Word Boundaries  The class of <word_character> includes all Alphabetic
 | |
| // values from the Unicode character database, from UnicodeData.txt [UData], plus the U+200C
 | |
| // ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER.
 | |
| func IsWordChar(r rune) bool {
 | |
| 	//"L", "Mn", "Nd", "Pc"
 | |
| 	return unicode.In(r,
 | |
| 		unicode.Categories["L"], unicode.Categories["Mn"],
 | |
| 		unicode.Categories["Nd"], unicode.Categories["Pc"]) || r == '\u200D' || r == '\u200C'
 | |
| 	//return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_'
 | |
| }
 | |
| 
 | |
| func IsECMAWordChar(r rune) bool {
 | |
| 	return unicode.In(r,
 | |
| 		unicode.Categories["L"], unicode.Categories["Mn"],
 | |
| 		unicode.Categories["Nd"], unicode.Categories["Pc"])
 | |
| 
 | |
| 	//return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_'
 | |
| }
 | |
| 
 | |
| // SingletonChar will return the char from the first range without validation.
 | |
| // It assumes you have checked for IsSingleton or IsSingletonInverse and will panic given bad input
 | |
| func (c CharSet) SingletonChar() rune {
 | |
| 	return c.ranges[0].first
 | |
| }
 | |
| 
 | |
| func (c CharSet) IsSingleton() bool {
 | |
| 	return !c.negate && //negated is multiple chars
 | |
| 		len(c.categories) == 0 && len(c.ranges) == 1 && // multiple ranges and unicode classes represent multiple chars
 | |
| 		c.sub == nil && // subtraction means we've got multiple chars
 | |
| 		c.ranges[0].first == c.ranges[0].last // first and last equal means we're just 1 char
 | |
| }
 | |
| 
 | |
| func (c CharSet) IsSingletonInverse() bool {
 | |
| 	return c.negate && //same as above, but requires negated
 | |
| 		len(c.categories) == 0 && len(c.ranges) == 1 && // multiple ranges and unicode classes represent multiple chars
 | |
| 		c.sub == nil && // subtraction means we've got multiple chars
 | |
| 		c.ranges[0].first == c.ranges[0].last // first and last equal means we're just 1 char
 | |
| }
 | |
| 
 | |
| func (c CharSet) IsMergeable() bool {
 | |
| 	return !c.IsNegated() && !c.HasSubtraction()
 | |
| }
 | |
| 
 | |
| func (c CharSet) IsNegated() bool {
 | |
| 	return c.negate
 | |
| }
 | |
| 
 | |
| func (c CharSet) HasSubtraction() bool {
 | |
| 	return c.sub != nil
 | |
| }
 | |
| 
 | |
| func (c CharSet) IsEmpty() bool {
 | |
| 	return len(c.ranges) == 0 && len(c.categories) == 0 && c.sub == nil
 | |
| }
 | |
| 
 | |
| func (c *CharSet) addDigit(ecma, negate bool, pattern string) {
 | |
| 	if ecma {
 | |
| 		if negate {
 | |
| 			c.addRanges(NotECMADigitClass().ranges)
 | |
| 		} else {
 | |
| 			c.addRanges(ECMADigitClass().ranges)
 | |
| 		}
 | |
| 	} else {
 | |
| 		c.addCategories(category{cat: "Nd", negate: negate})
 | |
| 	}
 | |
| }
 | |
| 
 | |
| func (c *CharSet) addChar(ch rune) {
 | |
| 	c.addRange(ch, ch)
 | |
| }
 | |
| 
 | |
| func (c *CharSet) addSpace(ecma, negate bool) {
 | |
| 	if ecma {
 | |
| 		if negate {
 | |
| 			c.addRanges(NotECMASpaceClass().ranges)
 | |
| 		} else {
 | |
| 			c.addRanges(ECMASpaceClass().ranges)
 | |
| 		}
 | |
| 	} else {
 | |
| 		c.addCategories(category{cat: spaceCategoryText, negate: negate})
 | |
| 	}
 | |
| }
 | |
| 
 | |
| func (c *CharSet) addWord(ecma, negate bool) {
 | |
| 	if ecma {
 | |
| 		if negate {
 | |
| 			c.addRanges(NotECMAWordClass().ranges)
 | |
| 		} else {
 | |
| 			c.addRanges(ECMAWordClass().ranges)
 | |
| 		}
 | |
| 	} else {
 | |
| 		c.addCategories(category{cat: wordCategoryText, negate: negate})
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // Add set ranges and categories into ours -- no deduping or anything
 | |
| func (c *CharSet) addSet(set CharSet) {
 | |
| 	if c.anything {
 | |
| 		return
 | |
| 	}
 | |
| 	if set.anything {
 | |
| 		c.makeAnything()
 | |
| 		return
 | |
| 	}
 | |
| 	// just append here to prevent double-canon
 | |
| 	c.ranges = append(c.ranges, set.ranges...)
 | |
| 	c.addCategories(set.categories...)
 | |
| 	c.canonicalize()
 | |
| }
 | |
| 
 | |
| func (c *CharSet) makeAnything() {
 | |
| 	c.anything = true
 | |
| 	c.categories = []category{}
 | |
| 	c.ranges = AnyClass().ranges
 | |
| }
 | |
| 
 | |
| func (c *CharSet) addCategories(cats ...category) {
 | |
| 	// don't add dupes and remove positive+negative
 | |
| 	if c.anything {
 | |
| 		// if we've had a previous positive+negative group then
 | |
| 		// just return, we're as broad as we can get
 | |
| 		return
 | |
| 	}
 | |
| 
 | |
| 	for _, ct := range cats {
 | |
| 		found := false
 | |
| 		for _, ct2 := range c.categories {
 | |
| 			if ct.cat == ct2.cat {
 | |
| 				if ct.negate != ct2.negate {
 | |
| 					// oposite negations...this mean we just
 | |
| 					// take us as anything and move on
 | |
| 					c.makeAnything()
 | |
| 					return
 | |
| 				}
 | |
| 				found = true
 | |
| 				break
 | |
| 			}
 | |
| 		}
 | |
| 
 | |
| 		if !found {
 | |
| 			c.categories = append(c.categories, ct)
 | |
| 		}
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // Merges new ranges to our own
 | |
| func (c *CharSet) addRanges(ranges []singleRange) {
 | |
| 	if c.anything {
 | |
| 		return
 | |
| 	}
 | |
| 	c.ranges = append(c.ranges, ranges...)
 | |
| 	c.canonicalize()
 | |
| }
 | |
| 
 | |
| // Merges everything but the new ranges into our own
 | |
| func (c *CharSet) addNegativeRanges(ranges []singleRange) {
 | |
| 	if c.anything {
 | |
| 		return
 | |
| 	}
 | |
| 
 | |
| 	var hi rune
 | |
| 
 | |
| 	// convert incoming ranges into opposites, assume they are in order
 | |
| 	for _, r := range ranges {
 | |
| 		if hi < r.first {
 | |
| 			c.ranges = append(c.ranges, singleRange{hi, r.first - 1})
 | |
| 		}
 | |
| 		hi = r.last + 1
 | |
| 	}
 | |
| 
 | |
| 	if hi < utf8.MaxRune {
 | |
| 		c.ranges = append(c.ranges, singleRange{hi, utf8.MaxRune})
 | |
| 	}
 | |
| 
 | |
| 	c.canonicalize()
 | |
| }
 | |
| 
 | |
| func isValidUnicodeCat(catName string) bool {
 | |
| 	_, ok := unicodeCategories[catName]
 | |
| 	return ok
 | |
| }
 | |
| 
 | |
| func (c *CharSet) addCategory(categoryName string, negate, caseInsensitive bool, pattern string) {
 | |
| 	if !isValidUnicodeCat(categoryName) {
 | |
| 		// unknown unicode category, script, or property "blah"
 | |
| 		panic(fmt.Errorf("Unknown unicode category, script, or property '%v'", categoryName))
 | |
| 
 | |
| 	}
 | |
| 
 | |
| 	if caseInsensitive && (categoryName == "Ll" || categoryName == "Lu" || categoryName == "Lt") {
 | |
| 		// when RegexOptions.IgnoreCase is specified then {Ll} {Lu} and {Lt} cases should all match
 | |
| 		c.addCategories(
 | |
| 			category{cat: "Ll", negate: negate},
 | |
| 			category{cat: "Lu", negate: negate},
 | |
| 			category{cat: "Lt", negate: negate})
 | |
| 	}
 | |
| 	c.addCategories(category{cat: categoryName, negate: negate})
 | |
| }
 | |
| 
 | |
| func (c *CharSet) addSubtraction(sub *CharSet) {
 | |
| 	c.sub = sub
 | |
| }
 | |
| 
 | |
| func (c *CharSet) addRange(chMin, chMax rune) {
 | |
| 	c.ranges = append(c.ranges, singleRange{first: chMin, last: chMax})
 | |
| 	c.canonicalize()
 | |
| }
 | |
| 
 | |
| func (c *CharSet) addNamedASCII(name string, negate bool) bool {
 | |
| 	var rs []singleRange
 | |
| 
 | |
| 	switch name {
 | |
| 	case "alnum":
 | |
| 		rs = []singleRange{singleRange{'0', '9'}, singleRange{'A', 'Z'}, singleRange{'a', 'z'}}
 | |
| 	case "alpha":
 | |
| 		rs = []singleRange{singleRange{'A', 'Z'}, singleRange{'a', 'z'}}
 | |
| 	case "ascii":
 | |
| 		rs = []singleRange{singleRange{0, 0x7f}}
 | |
| 	case "blank":
 | |
| 		rs = []singleRange{singleRange{'\t', '\t'}, singleRange{' ', ' '}}
 | |
| 	case "cntrl":
 | |
| 		rs = []singleRange{singleRange{0, 0x1f}, singleRange{0x7f, 0x7f}}
 | |
| 	case "digit":
 | |
| 		c.addDigit(false, negate, "")
 | |
| 	case "graph":
 | |
| 		rs = []singleRange{singleRange{'!', '~'}}
 | |
| 	case "lower":
 | |
| 		rs = []singleRange{singleRange{'a', 'z'}}
 | |
| 	case "print":
 | |
| 		rs = []singleRange{singleRange{' ', '~'}}
 | |
| 	case "punct": //[!-/:-@[-`{-~]
 | |
| 		rs = []singleRange{singleRange{'!', '/'}, singleRange{':', '@'}, singleRange{'[', '`'}, singleRange{'{', '~'}}
 | |
| 	case "space":
 | |
| 		c.addSpace(true, negate)
 | |
| 	case "upper":
 | |
| 		rs = []singleRange{singleRange{'A', 'Z'}}
 | |
| 	case "word":
 | |
| 		c.addWord(true, negate)
 | |
| 	case "xdigit":
 | |
| 		rs = []singleRange{singleRange{'0', '9'}, singleRange{'A', 'F'}, singleRange{'a', 'f'}}
 | |
| 	default:
 | |
| 		return false
 | |
| 	}
 | |
| 
 | |
| 	if len(rs) > 0 {
 | |
| 		if negate {
 | |
| 			c.addNegativeRanges(rs)
 | |
| 		} else {
 | |
| 			c.addRanges(rs)
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	return true
 | |
| }
 | |
| 
 | |
| type singleRangeSorter []singleRange
 | |
| 
 | |
| func (p singleRangeSorter) Len() int           { return len(p) }
 | |
| func (p singleRangeSorter) Less(i, j int) bool { return p[i].first < p[j].first }
 | |
| func (p singleRangeSorter) Swap(i, j int)      { p[i], p[j] = p[j], p[i] }
 | |
| 
 | |
| // Logic to reduce a character class to a unique, sorted form.
 | |
| func (c *CharSet) canonicalize() {
 | |
| 	var i, j int
 | |
| 	var last rune
 | |
| 
 | |
| 	//
 | |
| 	// Find and eliminate overlapping or abutting ranges
 | |
| 	//
 | |
| 
 | |
| 	if len(c.ranges) > 1 {
 | |
| 		sort.Sort(singleRangeSorter(c.ranges))
 | |
| 
 | |
| 		done := false
 | |
| 
 | |
| 		for i, j = 1, 0; ; i++ {
 | |
| 			for last = c.ranges[j].last; ; i++ {
 | |
| 				if i == len(c.ranges) || last == utf8.MaxRune {
 | |
| 					done = true
 | |
| 					break
 | |
| 				}
 | |
| 
 | |
| 				CurrentRange := c.ranges[i]
 | |
| 				if CurrentRange.first > last+1 {
 | |
| 					break
 | |
| 				}
 | |
| 
 | |
| 				if last < CurrentRange.last {
 | |
| 					last = CurrentRange.last
 | |
| 				}
 | |
| 			}
 | |
| 
 | |
| 			c.ranges[j] = singleRange{first: c.ranges[j].first, last: last}
 | |
| 
 | |
| 			j++
 | |
| 
 | |
| 			if done {
 | |
| 				break
 | |
| 			}
 | |
| 
 | |
| 			if j < i {
 | |
| 				c.ranges[j] = c.ranges[i]
 | |
| 			}
 | |
| 		}
 | |
| 
 | |
| 		c.ranges = append(c.ranges[:j], c.ranges[len(c.ranges):]...)
 | |
| 	}
 | |
| }
 | |
| 
 | |
| // Adds to the class any lowercase versions of characters already
 | |
| // in the class. Used for case-insensitivity.
 | |
| func (c *CharSet) addLowercase() {
 | |
| 	if c.anything {
 | |
| 		return
 | |
| 	}
 | |
| 	toAdd := []singleRange{}
 | |
| 	for i := 0; i < len(c.ranges); i++ {
 | |
| 		r := c.ranges[i]
 | |
| 		if r.first == r.last {
 | |
| 			lower := unicode.ToLower(r.first)
 | |
| 			c.ranges[i] = singleRange{first: lower, last: lower}
 | |
| 		} else {
 | |
| 			toAdd = append(toAdd, r)
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	for _, r := range toAdd {
 | |
| 		c.addLowercaseRange(r.first, r.last)
 | |
| 	}
 | |
| 	c.canonicalize()
 | |
| }
 | |
| 
 | |
| /**************************************************************************
 | |
|     Let U be the set of Unicode character values and let L be the lowercase
 | |
|     function, mapping from U to U. To perform case insensitive matching of
 | |
|     character sets, we need to be able to map an interval I in U, say
 | |
| 
 | |
|         I = [chMin, chMax] = { ch : chMin <= ch <= chMax }
 | |
| 
 | |
|     to a set A such that A contains L(I) and A is contained in the union of
 | |
|     I and L(I).
 | |
| 
 | |
|     The table below partitions U into intervals on which L is non-decreasing.
 | |
|     Thus, for any interval J = [a, b] contained in one of these intervals,
 | |
|     L(J) is contained in [L(a), L(b)].
 | |
| 
 | |
|     It is also true that for any such J, [L(a), L(b)] is contained in the
 | |
|     union of J and L(J). This does not follow from L being non-decreasing on
 | |
|     these intervals. It follows from the nature of the L on each interval.
 | |
|     On each interval, L has one of the following forms:
 | |
| 
 | |
|         (1) L(ch) = constant            (LowercaseSet)
 | |
|         (2) L(ch) = ch + offset         (LowercaseAdd)
 | |
|         (3) L(ch) = ch | 1              (LowercaseBor)
 | |
|         (4) L(ch) = ch + (ch & 1)       (LowercaseBad)
 | |
| 
 | |
|     It is easy to verify that for any of these forms [L(a), L(b)] is
 | |
|     contained in the union of [a, b] and L([a, b]).
 | |
| ***************************************************************************/
 | |
| 
 | |
| const (
 | |
| 	LowercaseSet = 0 // Set to arg.
 | |
| 	LowercaseAdd = 1 // Add arg.
 | |
| 	LowercaseBor = 2 // Bitwise or with 1.
 | |
| 	LowercaseBad = 3 // Bitwise and with 1 and add original.
 | |
| )
 | |
| 
 | |
| type lcMap struct {
 | |
| 	chMin, chMax rune
 | |
| 	op, data     int32
 | |
| }
 | |
| 
 | |
| var lcTable = []lcMap{
 | |
| 	lcMap{'\u0041', '\u005A', LowercaseAdd, 32},
 | |
| 	lcMap{'\u00C0', '\u00DE', LowercaseAdd, 32},
 | |
| 	lcMap{'\u0100', '\u012E', LowercaseBor, 0},
 | |
| 	lcMap{'\u0130', '\u0130', LowercaseSet, 0x0069},
 | |
| 	lcMap{'\u0132', '\u0136', LowercaseBor, 0},
 | |
| 	lcMap{'\u0139', '\u0147', LowercaseBad, 0},
 | |
| 	lcMap{'\u014A', '\u0176', LowercaseBor, 0},
 | |
| 	lcMap{'\u0178', '\u0178', LowercaseSet, 0x00FF},
 | |
| 	lcMap{'\u0179', '\u017D', LowercaseBad, 0},
 | |
| 	lcMap{'\u0181', '\u0181', LowercaseSet, 0x0253},
 | |
| 	lcMap{'\u0182', '\u0184', LowercaseBor, 0},
 | |
| 	lcMap{'\u0186', '\u0186', LowercaseSet, 0x0254},
 | |
| 	lcMap{'\u0187', '\u0187', LowercaseSet, 0x0188},
 | |
| 	lcMap{'\u0189', '\u018A', LowercaseAdd, 205},
 | |
| 	lcMap{'\u018B', '\u018B', LowercaseSet, 0x018C},
 | |
| 	lcMap{'\u018E', '\u018E', LowercaseSet, 0x01DD},
 | |
| 	lcMap{'\u018F', '\u018F', LowercaseSet, 0x0259},
 | |
| 	lcMap{'\u0190', '\u0190', LowercaseSet, 0x025B},
 | |
| 	lcMap{'\u0191', '\u0191', LowercaseSet, 0x0192},
 | |
| 	lcMap{'\u0193', '\u0193', LowercaseSet, 0x0260},
 | |
| 	lcMap{'\u0194', '\u0194', LowercaseSet, 0x0263},
 | |
| 	lcMap{'\u0196', '\u0196', LowercaseSet, 0x0269},
 | |
| 	lcMap{'\u0197', '\u0197', LowercaseSet, 0x0268},
 | |
| 	lcMap{'\u0198', '\u0198', LowercaseSet, 0x0199},
 | |
| 	lcMap{'\u019C', '\u019C', LowercaseSet, 0x026F},
 | |
| 	lcMap{'\u019D', '\u019D', LowercaseSet, 0x0272},
 | |
| 	lcMap{'\u019F', '\u019F', LowercaseSet, 0x0275},
 | |
| 	lcMap{'\u01A0', '\u01A4', LowercaseBor, 0},
 | |
| 	lcMap{'\u01A7', '\u01A7', LowercaseSet, 0x01A8},
 | |
| 	lcMap{'\u01A9', '\u01A9', LowercaseSet, 0x0283},
 | |
| 	lcMap{'\u01AC', '\u01AC', LowercaseSet, 0x01AD},
 | |
| 	lcMap{'\u01AE', '\u01AE', LowercaseSet, 0x0288},
 | |
| 	lcMap{'\u01AF', '\u01AF', LowercaseSet, 0x01B0},
 | |
| 	lcMap{'\u01B1', '\u01B2', LowercaseAdd, 217},
 | |
| 	lcMap{'\u01B3', '\u01B5', LowercaseBad, 0},
 | |
| 	lcMap{'\u01B7', '\u01B7', LowercaseSet, 0x0292},
 | |
| 	lcMap{'\u01B8', '\u01B8', LowercaseSet, 0x01B9},
 | |
| 	lcMap{'\u01BC', '\u01BC', LowercaseSet, 0x01BD},
 | |
| 	lcMap{'\u01C4', '\u01C5', LowercaseSet, 0x01C6},
 | |
| 	lcMap{'\u01C7', '\u01C8', LowercaseSet, 0x01C9},
 | |
| 	lcMap{'\u01CA', '\u01CB', LowercaseSet, 0x01CC},
 | |
| 	lcMap{'\u01CD', '\u01DB', LowercaseBad, 0},
 | |
| 	lcMap{'\u01DE', '\u01EE', LowercaseBor, 0},
 | |
| 	lcMap{'\u01F1', '\u01F2', LowercaseSet, 0x01F3},
 | |
| 	lcMap{'\u01F4', '\u01F4', LowercaseSet, 0x01F5},
 | |
| 	lcMap{'\u01FA', '\u0216', LowercaseBor, 0},
 | |
| 	lcMap{'\u0386', '\u0386', LowercaseSet, 0x03AC},
 | |
| 	lcMap{'\u0388', '\u038A', LowercaseAdd, 37},
 | |
| 	lcMap{'\u038C', '\u038C', LowercaseSet, 0x03CC},
 | |
| 	lcMap{'\u038E', '\u038F', LowercaseAdd, 63},
 | |
| 	lcMap{'\u0391', '\u03AB', LowercaseAdd, 32},
 | |
| 	lcMap{'\u03E2', '\u03EE', LowercaseBor, 0},
 | |
| 	lcMap{'\u0401', '\u040F', LowercaseAdd, 80},
 | |
| 	lcMap{'\u0410', '\u042F', LowercaseAdd, 32},
 | |
| 	lcMap{'\u0460', '\u0480', LowercaseBor, 0},
 | |
| 	lcMap{'\u0490', '\u04BE', LowercaseBor, 0},
 | |
| 	lcMap{'\u04C1', '\u04C3', LowercaseBad, 0},
 | |
| 	lcMap{'\u04C7', '\u04C7', LowercaseSet, 0x04C8},
 | |
| 	lcMap{'\u04CB', '\u04CB', LowercaseSet, 0x04CC},
 | |
| 	lcMap{'\u04D0', '\u04EA', LowercaseBor, 0},
 | |
| 	lcMap{'\u04EE', '\u04F4', LowercaseBor, 0},
 | |
| 	lcMap{'\u04F8', '\u04F8', LowercaseSet, 0x04F9},
 | |
| 	lcMap{'\u0531', '\u0556', LowercaseAdd, 48},
 | |
| 	lcMap{'\u10A0', '\u10C5', LowercaseAdd, 48},
 | |
| 	lcMap{'\u1E00', '\u1EF8', LowercaseBor, 0},
 | |
| 	lcMap{'\u1F08', '\u1F0F', LowercaseAdd, -8},
 | |
| 	lcMap{'\u1F18', '\u1F1F', LowercaseAdd, -8},
 | |
| 	lcMap{'\u1F28', '\u1F2F', LowercaseAdd, -8},
 | |
| 	lcMap{'\u1F38', '\u1F3F', LowercaseAdd, -8},
 | |
| 	lcMap{'\u1F48', '\u1F4D', LowercaseAdd, -8},
 | |
| 	lcMap{'\u1F59', '\u1F59', LowercaseSet, 0x1F51},
 | |
| 	lcMap{'\u1F5B', '\u1F5B', LowercaseSet, 0x1F53},
 | |
| 	lcMap{'\u1F5D', '\u1F5D', LowercaseSet, 0x1F55},
 | |
| 	lcMap{'\u1F5F', '\u1F5F', LowercaseSet, 0x1F57},
 | |
| 	lcMap{'\u1F68', '\u1F6F', LowercaseAdd, -8},
 | |
| 	lcMap{'\u1F88', '\u1F8F', LowercaseAdd, -8},
 | |
| 	lcMap{'\u1F98', '\u1F9F', LowercaseAdd, -8},
 | |
| 	lcMap{'\u1FA8', '\u1FAF', LowercaseAdd, -8},
 | |
| 	lcMap{'\u1FB8', '\u1FB9', LowercaseAdd, -8},
 | |
| 	lcMap{'\u1FBA', '\u1FBB', LowercaseAdd, -74},
 | |
| 	lcMap{'\u1FBC', '\u1FBC', LowercaseSet, 0x1FB3},
 | |
| 	lcMap{'\u1FC8', '\u1FCB', LowercaseAdd, -86},
 | |
| 	lcMap{'\u1FCC', '\u1FCC', LowercaseSet, 0x1FC3},
 | |
| 	lcMap{'\u1FD8', '\u1FD9', LowercaseAdd, -8},
 | |
| 	lcMap{'\u1FDA', '\u1FDB', LowercaseAdd, -100},
 | |
| 	lcMap{'\u1FE8', '\u1FE9', LowercaseAdd, -8},
 | |
| 	lcMap{'\u1FEA', '\u1FEB', LowercaseAdd, -112},
 | |
| 	lcMap{'\u1FEC', '\u1FEC', LowercaseSet, 0x1FE5},
 | |
| 	lcMap{'\u1FF8', '\u1FF9', LowercaseAdd, -128},
 | |
| 	lcMap{'\u1FFA', '\u1FFB', LowercaseAdd, -126},
 | |
| 	lcMap{'\u1FFC', '\u1FFC', LowercaseSet, 0x1FF3},
 | |
| 	lcMap{'\u2160', '\u216F', LowercaseAdd, 16},
 | |
| 	lcMap{'\u24B6', '\u24D0', LowercaseAdd, 26},
 | |
| 	lcMap{'\uFF21', '\uFF3A', LowercaseAdd, 32},
 | |
| }
 | |
| 
 | |
| func (c *CharSet) addLowercaseRange(chMin, chMax rune) {
 | |
| 	var i, iMax, iMid int
 | |
| 	var chMinT, chMaxT rune
 | |
| 	var lc lcMap
 | |
| 
 | |
| 	for i, iMax = 0, len(lcTable); i < iMax; {
 | |
| 		iMid = (i + iMax) / 2
 | |
| 		if lcTable[iMid].chMax < chMin {
 | |
| 			i = iMid + 1
 | |
| 		} else {
 | |
| 			iMax = iMid
 | |
| 		}
 | |
| 	}
 | |
| 
 | |
| 	for ; i < len(lcTable); i++ {
 | |
| 		lc = lcTable[i]
 | |
| 		if lc.chMin > chMax {
 | |
| 			return
 | |
| 		}
 | |
| 		chMinT = lc.chMin
 | |
| 		if chMinT < chMin {
 | |
| 			chMinT = chMin
 | |
| 		}
 | |
| 
 | |
| 		chMaxT = lc.chMax
 | |
| 		if chMaxT > chMax {
 | |
| 			chMaxT = chMax
 | |
| 		}
 | |
| 
 | |
| 		switch lc.op {
 | |
| 		case LowercaseSet:
 | |
| 			chMinT = rune(lc.data)
 | |
| 			chMaxT = rune(lc.data)
 | |
| 			break
 | |
| 		case LowercaseAdd:
 | |
| 			chMinT += lc.data
 | |
| 			chMaxT += lc.data
 | |
| 			break
 | |
| 		case LowercaseBor:
 | |
| 			chMinT |= 1
 | |
| 			chMaxT |= 1
 | |
| 			break
 | |
| 		case LowercaseBad:
 | |
| 			chMinT += (chMinT & 1)
 | |
| 			chMaxT += (chMaxT & 1)
 | |
| 			break
 | |
| 		}
 | |
| 
 | |
| 		if chMinT < chMin || chMaxT > chMax {
 | |
| 			c.addRange(chMinT, chMaxT)
 | |
| 		}
 | |
| 	}
 | |
| }
 |