mirror of
				https://github.com/go-gitea/gitea.git
				synced 2025-10-25 17:44:32 +02:00 
			
		
		
		
	This is a large and complex PR, so let me explain its changes in detail. First, I had to create new index mappings for Bleve and Elasticsearch, as the current ones do not support search by filename. This requires Gitea to recreate the code search indexes (I do not know if this is a breaking change, but I feel it deserves a heads-up). I've used [this approach](https://www.elastic.co/guide/en/elasticsearch/reference/7.17/analysis-pathhierarchy-tokenizer.html) to model the filename index. It allows us to efficiently search for both the full path and the name of a file. Bleve, however, does not support this out of the box, so I had to code a brand new [token filter](https://blevesearch.com/docs/Token-Filters/) to generate the search terms. I also overhauled the `indexer_test.go` file. It now asserts the order of the expected results (this is important since matches based on the name of a file are more relevant than those based on its content). I've added new test scenarios that deal with searching by filename. They use a new repo included in the Gitea fixture. The screenshot below depicts how Gitea shows the search results. It shows results based on content in the same way as the current version does. For matches based on the filename, the first seven lines of the file contents are shown (BTW, this is how GitHub does it).  Resolves #32096 --------- Signed-off-by: Bruno Sofiato <bruno.sofiato@gmail.com>
		
			
				
	
	
		
			102 lines
		
	
	
		
			2.3 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
			
		
		
	
	
			102 lines
		
	
	
		
			2.3 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
| // Copyright 2024 The Gitea Authors. All rights reserved.
 | |
| // SPDX-License-Identifier: MIT
 | |
| 
 | |
| package path
 | |
| 
 | |
| import (
 | |
| 	"slices"
 | |
| 	"strings"
 | |
| 
 | |
| 	"github.com/blevesearch/bleve/v2/analysis"
 | |
| 	"github.com/blevesearch/bleve/v2/registry"
 | |
| )
 | |
| 
 | |
// Name is the identifier under which the path token filter is registered
// with the bleve registry.
const Name = "gitea/path"
 | |
| 
 | |
// TokenFilter is a stateless bleve token filter that combines the tokens of a
// file path into hierarchical path terms.
type TokenFilter struct{}

// NewTokenFilter returns a ready-to-use TokenFilter.
func NewTokenFilter() *TokenFilter {
	return new(TokenFilter)
}
 | |
| 
 | |
| func TokenFilterConstructor(config map[string]any, cache *registry.Cache) (analysis.TokenFilter, error) {
 | |
| 	return NewTokenFilter(), nil
 | |
| }
 | |
| 
 | |
| func (s *TokenFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
 | |
| 	if len(input) == 1 {
 | |
| 		// if there is only one token, we dont need to generate the reversed chain
 | |
| 		return generatePathTokens(input, false)
 | |
| 	}
 | |
| 
 | |
| 	normal := generatePathTokens(input, false)
 | |
| 	reversed := generatePathTokens(input, true)
 | |
| 
 | |
| 	return append(normal, reversed...)
 | |
| }
 | |
| 
 | |
| // Generates path tokens from the input tokens.
 | |
| // This mimics the behavior of the path hierarchy tokenizer in ES. It takes the input tokens and combine them, generating a term for each component
 | |
| // in tree (e.g., foo/bar/baz.md will generate foo, foo/bar, and foo/bar/baz.md).
 | |
| //
 | |
| // If the reverse flag is set, the order of the tokens is reversed (the same input will generate baz.md, baz.md/bar, baz.md/bar/foo). This is useful
 | |
| // to efficiently search for filenames without supplying the fullpath.
 | |
| func generatePathTokens(input analysis.TokenStream, reversed bool) analysis.TokenStream {
 | |
| 	terms := make([]string, 0, len(input))
 | |
| 	longestTerm := 0
 | |
| 
 | |
| 	if reversed {
 | |
| 		slices.Reverse(input)
 | |
| 	}
 | |
| 
 | |
| 	for i := 0; i < len(input); i++ {
 | |
| 		var sb strings.Builder
 | |
| 		sb.WriteString(string(input[0].Term))
 | |
| 
 | |
| 		for j := 1; j < i; j++ {
 | |
| 			sb.WriteString("/")
 | |
| 			sb.WriteString(string(input[j].Term))
 | |
| 		}
 | |
| 
 | |
| 		term := sb.String()
 | |
| 
 | |
| 		if longestTerm < len(term) {
 | |
| 			longestTerm = len(term)
 | |
| 		}
 | |
| 
 | |
| 		terms = append(terms, term)
 | |
| 	}
 | |
| 
 | |
| 	output := make(analysis.TokenStream, 0, len(terms))
 | |
| 
 | |
| 	for _, term := range terms {
 | |
| 		var start, end int
 | |
| 
 | |
| 		if reversed {
 | |
| 			start = 0
 | |
| 			end = len(term)
 | |
| 		} else {
 | |
| 			start = longestTerm - len(term)
 | |
| 			end = longestTerm
 | |
| 		}
 | |
| 
 | |
| 		token := analysis.Token{
 | |
| 			Position: 1,
 | |
| 			Start:    start,
 | |
| 			End:      end,
 | |
| 			Type:     analysis.AlphaNumeric,
 | |
| 			Term:     []byte(term),
 | |
| 		}
 | |
| 
 | |
| 		output = append(output, &token)
 | |
| 	}
 | |
| 
 | |
| 	return output
 | |
| }
 | |
| 
 | |
// init registers TokenFilterConstructor with the bleve registry under Name.
func init() {
	registry.RegisterTokenFilter(Name, TokenFilterConstructor)
}
 |