From 0ba9c8439eadd6c895b30ea434b398069e586c9b Mon Sep 17 00:00:00 2001
From: Rob Findley <rfindley@google.com>
Date: Fri, 29 Sep 2023 12:56:16 -0400
Subject: [PATCH] internal/fuzzy: several improvements for symbol matching

Following the edge case discovered in golang/go#60201, take a more
scientific approach to improving symbol match scoring:

- Add a conformance test that compares Matcher with SymbolMatcher,
  querying all identifiers in x/tools. The two are not expected to agree
  in all cases, but this test helped find interesting ranking edge
  cases, which are added to the ranking test.
- Don't count a capital letter in the middle of a sequence of capital
  letters (e.g. the M in YAML) as a word start. This was the
  inconsistency that led to golang/go#60201.
- Compute the sequence bonus before role score; role score should take
  precedent.
- Simplify the sequence scoring logic: a sequential character gets the
  same score as a word start, unless it is the final character in the
  pattern in which case we also adjust for whether it completes a word
  or segment. This feels like a reasonable heuristic.
- Fix a bug in final-rune adjustment where we were checking the next
  input rune for a segment start, not a separator.

Notably, the scoring improvements above were all derived from first
principles, and happened to also improve the conformance rate in the new
test.

Additionally, make the following cleanup:

- s/character/rune throughout, since that's what we mean
- add debugging support for more easily understanding the match
  algorithm
- add additional commentary
- add benchmarks

Fixes golang/go#60201

Change-Id: I838898c49cbb69af083a8cc837612da047778c40
Reviewed-on: https://go-review.googlesource.com/c/tools/+/531697
Reviewed-by: Alan Donovan <adonovan@google.com>
Auto-Submit: Robert Findley <rfindley@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
---
 gopls/internal/lsp/command/interface.go |   1 +
 internal/fuzzy/self_test.go             |  39 +++++
 internal/fuzzy/symbol.go                | 186 +++++++++++++++--------
 internal/fuzzy/symbol_test.go           | 192 +++++++++++++++++++++---
 4 files changed, 334 insertions(+), 84 deletions(-)
 create mode 100644 internal/fuzzy/self_test.go

diff --git a/gopls/internal/lsp/command/interface.go b/gopls/internal/lsp/command/interface.go
index 603e6121c8a..c3bd921fcf1 100644
--- a/gopls/internal/lsp/command/interface.go
+++ b/gopls/internal/lsp/command/interface.go
@@ -37,6 +37,7 @@ type Interface interface {
 	//
 	// Applies a fix to a region of source code.
 	ApplyFix(context.Context, ApplyFixArgs) error
+
 	// Test: Run test(s) (legacy)
 	//
 	// Runs `go test` for a specific set of test or benchmark functions.
diff --git a/internal/fuzzy/self_test.go b/internal/fuzzy/self_test.go
new file mode 100644
index 00000000000..fae0aeae249
--- /dev/null
+++ b/internal/fuzzy/self_test.go
@@ -0,0 +1,39 @@
+// Copyright 2023 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package fuzzy_test
+
+import (
+	"testing"
+
+	. "golang.org/x/tools/internal/fuzzy"
+)
+
+func BenchmarkSelf_Matcher(b *testing.B) {
+	idents := collectIdentifiers(b)
+	patterns := generatePatterns()
+
+	for i := 0; i < b.N; i++ {
+		for _, pattern := range patterns {
+			sm := NewMatcher(pattern)
+			for _, ident := range idents {
+				_ = sm.Score(ident)
+			}
+		}
+	}
+}
+
+func BenchmarkSelf_SymbolMatcher(b *testing.B) {
+	idents := collectIdentifiers(b)
+	patterns := generatePatterns()
+
+	for i := 0; i < b.N; i++ {
+		for _, pattern := range patterns {
+			sm := NewSymbolMatcher(pattern)
+			for _, ident := range idents {
+				_, _ = sm.Match([]string{ident})
+			}
+		}
+	}
+}
diff --git a/internal/fuzzy/symbol.go b/internal/fuzzy/symbol.go
index bf93041521b..5fe2ce3e2a3 100644
--- a/internal/fuzzy/symbol.go
+++ b/internal/fuzzy/symbol.go
@@ -5,6 +5,9 @@
 package fuzzy
 
 import (
+	"bytes"
+	"fmt"
+	"log"
 	"unicode"
 )
 
@@ -36,10 +39,12 @@ type SymbolMatcher struct {
 	segments    [256]uint8  // how many segments from the right is each rune
 }
 
+// Rune roles.
 const (
-	segmentStart uint32 = 1 << iota
-	wordStart
-	separator
+	segmentStart uint32 = 1 << iota // input rune starts a segment (i.e. follows '/' or '.')
+	wordStart                       // input rune starts a word, per camel-case naming rules
+	separator                       // input rune is a separator ('/' or '.')
+	upper                           // input rune is an upper case letter
 )
 
 // NewSymbolMatcher creates a SymbolMatcher that may be used to match the given
@@ -61,17 +66,17 @@ func NewSymbolMatcher(pattern string) *SymbolMatcher {
 	return m
 }
 
-// Match looks for the right-most match of the search pattern within the symbol
-// represented by concatenating the given chunks, returning its offset and
-// score.
+// Match searches for the right-most match of the search pattern within the
+// symbol represented by concatenating the given chunks.
 //
-// If a match is found, the first return value will hold the absolute byte
-// offset within all chunks for the start of the symbol. In other words, the
-// index of the match within strings.Join(chunks, ""). If no match is found,
-// the first return value will be -1.
+// If a match is found, the first result holds the absolute byte offset within
+// all chunks for the start of the symbol. In other words, the index of the
+// match within strings.Join(chunks, "").
 //
 // The second return value will be the score of the match, which is always
 // between 0 and 1, inclusive. A score of 0 indicates no match.
+//
+// If no match is found, Match returns (-1, 0).
 func (m *SymbolMatcher) Match(chunks []string) (int, float64) {
 	// Explicit behavior for an empty pattern.
 	//
@@ -81,11 +86,25 @@ func (m *SymbolMatcher) Match(chunks []string) (int, float64) {
 		return -1, 0
 	}
 
-	// First phase: populate the input buffer with lower-cased runes.
+	// Matching implements a heavily optimized linear scoring algorithm on the
+	// input. This is not guaranteed to produce the highest score, but works well
+	// enough, particularly due to the right-to-left significance of qualified
+	// symbols.
+	//
+	// Matching proceeds in three passes through the input:
+	//  - The first pass populates the input buffer and collects rune roles.
+	//  - The second pass proceeds right-to-left to find the right-most match.
+	//  - The third pass proceeds left-to-right from the start of the right-most
+	//    match, to find the most *compact* match, and computes the score of this
+	//    match.
+	//
+	// See below for more details of each pass, as well as the scoring algorithm.
+
+	// First pass: populate the input buffer out of the provided chunks
+	// (lower-casing in the process), and collect rune roles.
 	//
 	// We could also check for a forward match here, but since we'd have to write
 	// the entire input anyway this has negligible impact on performance.
-
 	var (
 		inputLen  = uint8(0)
 		modifiers = wordStart | segmentStart
@@ -107,7 +126,16 @@ input:
 				l = unicode.ToLower(r)
 			}
 			if l != r {
-				modifiers |= wordStart
+				modifiers |= upper
+
+				// If the current rune is capitalized *and the preceding rune was not*,
+				// mark this as a word start. This avoids spuriously high ranking of
+				// non-camelcase naming schemas, such as the
+				// yaml_PARSE_FLOW_SEQUENCE_ENTRY_MAPPING_END_STATE example of
+				// golang/go#60201.
+				if inputLen == 0 || m.roles[inputLen-1]&upper == 0 {
+					modifiers |= wordStart
+				}
 			}
 			m.inputBuffer[inputLen] = l
 			m.roles[inputLen] = modifiers
@@ -125,14 +153,13 @@ input:
 		}
 	}
 
-	// Second phase: find the right-most match, and count segments from the
+	// Second pass: find the right-most match, and count segments from the
 	// right.
-
 	var (
 		pi    = uint8(m.patternLen - 1) // pattern index
 		p     = m.pattern[pi]           // pattern rune
 		start = -1                      // start offset of match
-		rseg  = uint8(0)
+		rseg  = uint8(0)                // effective "depth" from the right of the current rune in consideration
 	)
 	const maxSeg = 3 // maximum number of segments from the right to count, for scoring purposes.
 
@@ -144,6 +171,8 @@ input:
 		m.segments[ii] = rseg
 		if p == r {
 			if pi == 0 {
+				// TODO(rfindley): BUG: the docstring for Match says that it returns an
+				// absolute byte offset, but clearly it is returning a rune offset here.
 				start = int(ii)
 				break
 			}
@@ -161,85 +190,120 @@ input:
 		return -1, 0
 	}
 
-	// Third phase: find the shortest match, and compute the score.
+	// Third pass: find the shortest match and compute the score.
 
-	// Score is the average score for each character.
+	// Score is the average score for each rune.
 	//
-	// A character score is the multiple of:
-	//   1. 1.0 if the character starts a segment or is preceded by a matching
-	//      character, 0.9 if the character starts a mid-segment word, else 0.6.
+	// A rune score is the multiple of:
+	//   1. The base score, which is 1.0 if the rune starts a segment, 0.9 if the
+	//      rune starts a mid-segment word, else 0.6.
 	//
-	//      Note that characters preceded by a matching character get the max
-	//      score of 1.0 so that sequential or exact matches are preferred, even
-	//      if they don't start/end at a segment or word boundary. For example, a
-	//      match for "func" in intfuncs should have a higher score than in
-	//      ifunmatched.
+	//      Runes preceded by a matching rune are treated the same as the start
+	//      of a mid-segment word (with a 0.9 score), so that sequential or exact
+	//      matches are preferred. We call this a sequential bonus.
 	//
-	//      For the final character match, the multiplier from (1) is reduced to
-	//      0.9 if the next character in the input is a mid-segment word, or 0.6
-	//      if the next character in the input is not a word or segment start.
-	//      This ensures that we favor whole-word or whole-segment matches over
-	//      prefix matches.
+	//      For the final rune match, this sequential bonus is reduced to 0.8 if
+	//      the next rune in the input is a mid-segment word, or 0.7 if the next
+	//      rune in the input is not a word or segment start. This ensures that
+	//      we favor whole-word or whole-segment matches over prefix matches.
 	//
-	//   2. 1.0 if the character is part of the last segment, otherwise
+	//   2. 1.0 if the rune is part of the last segment, otherwise
 	//      1.0-0.1*<segments from the right>, with a max segment count of 3.
 	//      Notably 1.0-0.1*3 = 0.7 > 0.6, so that foo/_/_/_/_ (a match very
-	//      early in a qualified symbol name) still scores higher than _f_o_o_
-	//      (a completely split match).
+	//      early in a qualified symbol name) still scores higher than _f_o_o_ (a
+	//      completely split match).
 	//
 	// This is a naive algorithm, but it is fast. There's lots of prior art here
 	// that could be leveraged. For example, we could explicitly consider
-	// character distance, and exact matches of words or segments.
+	// rune distance, and exact matches of words or segments.
 	//
 	// Also note that this might not actually find the highest scoring match, as
 	// doing so could require a non-linear algorithm, depending on how the score
 	// is calculated.
 
+	// debugging support
+	const debug = false // enable to log debugging information
+	var (
+		runeScores []float64
+		runeIdxs   []int
+	)
+
 	pi = 0
 	p = m.pattern[pi]
 
 	const (
-		segStreak  = 1.0 // start of segment or sequential match
-		wordStreak = 0.9 // start of word match
-		noStreak   = 0.6
-		perSegment = 0.1 // we count at most 3 segments above
+		segStartScore = 1.0 // base score of runes starting a segment
+		wordScore     = 0.9 // base score of runes starting or continuing a word
+		noStreak      = 0.6
+		perSegment    = 0.1 // we count at most 3 segments above
 	)
 
-	streakBonus := noStreak
 	totScore := 0.0
+	lastMatch := uint8(255)
 	for ii := uint8(start); ii < inputLen; ii++ {
 		r := m.inputBuffer[ii]
 		if r == p {
 			pi++
+			finalRune := pi >= m.patternLen
 			p = m.pattern[pi]
-			// Note: this could be optimized with some bit operations.
+
+			baseScore := noStreak
+
+			// Calculate the sequence bonus based on preceding matches.
+			//
+			// We do this first as it is overridden by role scoring below.
+			if lastMatch == ii-1 {
+				baseScore = wordScore
+				// Reduce the sequence bonus for the final rune of the pattern based on
+				// whether it borders a new segment or word.
+				if finalRune {
+					switch {
+					case ii == inputLen-1 || m.roles[ii+1]&separator != 0:
+						// Full segment: no reduction
+					case m.roles[ii+1]&wordStart != 0:
+						baseScore = wordScore - 0.1
+					default:
+						baseScore = wordScore - 0.2
+					}
+				}
+			}
+			lastMatch = ii
+
+			// Calculate the rune's role score. If the rune starts a segment or word,
+			// this overrides the sequence score, as the rune starts a new sequence.
 			switch {
-			case m.roles[ii]&segmentStart != 0 && segStreak > streakBonus:
-				streakBonus = segStreak
-			case m.roles[ii]&wordStart != 0 && wordStreak > streakBonus:
-				streakBonus = wordStreak
+			case m.roles[ii]&segmentStart != 0:
+				baseScore = segStartScore
+			case m.roles[ii]&wordStart != 0:
+				baseScore = wordScore
 			}
-			finalChar := pi >= m.patternLen
-			// finalCost := 1.0
-			if finalChar && streakBonus > noStreak {
-				switch {
-				case ii == inputLen-1 || m.roles[ii+1]&segmentStart != 0:
-					// Full segment: no reduction
-				case m.roles[ii+1]&wordStart != 0:
-					streakBonus = wordStreak
-				default:
-					streakBonus = noStreak
-				}
+
+			// Apply the segment-depth penalty (segments from the right).
+			runeScore := baseScore * (1.0 - float64(m.segments[ii])*perSegment)
+			if debug {
+				runeScores = append(runeScores, runeScore)
+				runeIdxs = append(runeIdxs, int(ii))
 			}
-			totScore += streakBonus * (1.0 - float64(m.segments[ii])*perSegment)
-			if finalChar {
+			totScore += runeScore
+			if finalRune {
 				break
 			}
-			streakBonus = segStreak // see above: sequential characters get the max score
-		} else {
-			streakBonus = noStreak
 		}
 	}
 
+	if debug {
+		// Format rune roles and scores in line:
+		// fo[o:.52].[b:1]a[r:.6]
+		var summary bytes.Buffer
+		last := 0
+		for i, idx := range runeIdxs {
+			summary.WriteString(string(m.inputBuffer[last:idx])) // encode runes
+			fmt.Fprintf(&summary, "[%s:%.2g]", string(m.inputBuffer[idx]), runeScores[i])
+			last = idx + 1
+		}
+		summary.WriteString(string(m.inputBuffer[last:inputLen])) // encode runes
+		log.Println(summary.String())
+	}
+
 	return start, totScore / float64(m.patternLen)
 }
diff --git a/internal/fuzzy/symbol_test.go b/internal/fuzzy/symbol_test.go
index 2a9d9b663bd..43e629d51ef 100644
--- a/internal/fuzzy/symbol_test.go
+++ b/internal/fuzzy/symbol_test.go
@@ -5,8 +5,12 @@
 package fuzzy_test
 
 import (
+	"go/ast"
+	"go/token"
+	"sort"
 	"testing"
 
+	"golang.org/x/tools/go/packages"
 	. "golang.org/x/tools/internal/fuzzy"
 )
 
@@ -34,30 +38,173 @@ func TestSymbolMatchIndex(t *testing.T) {
 }
 
 func TestSymbolRanking(t *testing.T) {
-	matcher := NewSymbolMatcher("test")
 
-	// symbols to match, in ascending order of ranking.
-	symbols := []string{
-		"this.is.better.than.most",
-		"test.foo.bar",
-		"thebest",
-		"test.foo",
-		"test.foo",
-		"atest",
-		"testage",
-		"tTest",
-		"foo.test",
-		"test",
+	// query -> symbols to match, in ascending order of score
+	queryRanks := map[string][]string{
+		"test": {
+			"this.is.better.than.most",
+			"test.foo.bar",
+			"thebest",
+			"atest",
+			"test.foo",
+			"testage",
+			"tTest",
+			"foo.test",
+		},
+		"parseside": { // golang/go#60201
+			"yaml_PARSE_FLOW_SEQUENCE_ENTRY_MAPPING_END_STATE",
+			"parseContext.parse_sidebyside",
+		},
+		"cvb": {
+			"filecache_test.testIPCValueB",
+			"cover.Boundary",
+		},
+		"dho": {
+			"gocommand.DebugHangingGoCommands",
+			"protocol.DocumentHighlightOptions",
+		},
+		"flg": {
+			"completion.FALLTHROUGH",
+			"main.flagGoCmd",
+		},
+		"fvi": {
+			"godoc.fileIndexVersion",
+			"macho.FlagSubsectionsViaSymbols",
+		},
 	}
-	prev := 0.0
-	for _, sym := range symbols {
-		_, score := matcher.Match([]string{sym})
-		t.Logf("Match(%q) = %v", sym, score)
-		if score < prev {
-			t.Errorf("Match(%q) = _, %v, want > %v", sym, score, prev)
+
+	for query, symbols := range queryRanks {
+		t.Run(query, func(t *testing.T) {
+			matcher := NewSymbolMatcher(query)
+			prev := 0.0
+			for _, sym := range symbols {
+				_, score := matcher.Match([]string{sym})
+				t.Logf("Match(%q) = %v", sym, score)
+				if score <= prev {
+					t.Errorf("Match(%q) = _, %v, want > %v", sym, score, prev)
+				}
+				prev = score
+			}
+		})
+	}
+}
+
+func TestMatcherSimilarities(t *testing.T) {
+	// This test compares the fuzzy matcher with the symbol matcher on a corpus
+	// of qualified identifiers extracted from x/tools.
+	//
+	// These two matchers are not expected to agree, but inspecting differences
+	// can be useful for finding interesting ranking edge cases.
+	t.Skip("unskip this test to compare matchers")
+
+	idents := collectIdentifiers(t)
+	t.Logf("collected %d unique identifiers", len(idents))
+
+	// TODO: use go1.21 slices.MaxFunc.
+	topMatch := func(score func(string) float64) string {
+		top := ""
+		topScore := 0.0
+		for _, cand := range idents {
+			if s := score(cand); s > topScore {
+				top = cand
+				topScore = s
+			}
 		}
-		prev = score
+		return top
 	}
+
+	agreed := 0
+	total := 0
+	bad := 0
+	patterns := generatePatterns()
+	for _, pattern := range patterns {
+		total++
+
+		fm := NewMatcher(pattern)
+		topFuzzy := topMatch(func(input string) float64 {
+			return float64(fm.Score(input))
+		})
+		sm := NewSymbolMatcher(pattern)
+		topSymbol := topMatch(func(input string) float64 {
+			_, score := sm.Match([]string{input})
+			return score
+		})
+		switch {
+		case topFuzzy == "" && topSymbol != "":
+			if false {
+				// The fuzzy matcher has a bug where it misses some matches; for this
+				// test we only care about the symbol matcher.
+				t.Logf("%q matched %q but no fuzzy match", pattern, topSymbol)
+			}
+			total--
+			bad++
+		case topFuzzy != "" && topSymbol == "":
+			t.Fatalf("%q matched %q but no symbol match", pattern, topFuzzy)
+		case topFuzzy == topSymbol:
+			agreed++
+		default:
+			// Enable this log to see mismatches.
+			if false {
+				t.Logf("mismatch for %q: fuzzy: %q, symbol: %q", pattern, topFuzzy, topSymbol)
+			}
+		}
+	}
+	t.Logf("fuzzy matchers agreed on %d out of %d queries (%d bad)", agreed, total, bad)
+}
+
+func collectIdentifiers(tb testing.TB) []string {
+	cfg := &packages.Config{
+		Mode:  packages.NeedName | packages.NeedSyntax | packages.NeedFiles,
+		Tests: true,
+	}
+	pkgs, err := packages.Load(cfg, "golang.org/x/tools/...")
+	if err != nil {
+		tb.Fatal(err)
+	}
+	uniqueIdents := make(map[string]bool)
+	decls := 0
+	for _, pkg := range pkgs {
+		for _, f := range pkg.Syntax {
+			for _, decl := range f.Decls {
+				decls++
+				switch decl := decl.(type) {
+				case *ast.GenDecl:
+					for _, spec := range decl.Specs {
+						switch decl.Tok {
+						case token.IMPORT:
+						case token.TYPE:
+							name := spec.(*ast.TypeSpec).Name.Name
+							qualified := pkg.Name + "." + name
+							uniqueIdents[qualified] = true
+						case token.CONST, token.VAR:
+							for _, n := range spec.(*ast.ValueSpec).Names {
+								qualified := pkg.Name + "." + n.Name
+								uniqueIdents[qualified] = true
+							}
+						}
+					}
+				}
+			}
+		}
+	}
+	var idents []string
+	for k := range uniqueIdents {
+		idents = append(idents, k)
+	}
+	sort.Strings(idents)
+	return idents
+}
+
+func generatePatterns() []string {
+	var patterns []string
+	for x := 'a'; x <= 'z'; x++ {
+		for y := 'a'; y <= 'z'; y++ {
+			for z := 'a'; z <= 'z'; z++ {
+				patterns = append(patterns, string(x)+string(y)+string(z))
+			}
+		}
+	}
+	return patterns
 }
 
 // Test that we strongly prefer exact matches.
@@ -89,9 +236,8 @@ func TestSymbolRanking_Issue60027(t *testing.T) {
 
 func TestChunkedMatch(t *testing.T) {
 	matcher := NewSymbolMatcher("test")
-
+	_, want := matcher.Match([]string{"test"})
 	chunked := [][]string{
-		{"test"},
 		{"", "test"},
 		{"test", ""},
 		{"te", "st"},
@@ -99,7 +245,7 @@ func TestChunkedMatch(t *testing.T) {
 
 	for _, chunks := range chunked {
 		offset, score := matcher.Match(chunks)
-		if offset != 0 || score != 1.0 {
+		if offset != 0 || score != want {
 			t.Errorf("Match(%v) = %v, %v, want 0, 1.0", chunks, offset, score)
 		}
 	}