From 0ba9c8439eadd6c895b30ea434b398069e586c9b Mon Sep 17 00:00:00 2001 From: Rob Findley Date: Fri, 29 Sep 2023 12:56:16 -0400 Subject: [PATCH] internal/fuzzy: several improvements for symbol matching Following the edge case discovered in golang/go#60201, take a more scientific approach to improving symbol match scoring: - Add a conformance test that compares Matcher with SymbolMatcher, querying all identifiers in x/tools. The two are not expected to agree in all cases, but this test helped find interesting ranking edge cases, which are added to the ranking test. - Don't count a capital letter in the middle of a sequence of capital letters (e.g. the M in YAML) as a word start. This was the inconsistency that led to golang/go#60201. - Compute the sequence bonus before role score; role score should take precedent. - Simplify the sequence scoring logic: a sequential character gets the same score as a word start, unless it is the final character in the pattern in which case we also adjust for whether it completes a word or segment. This feels like a reasonable heuristic. - Fix a bug in final-rune adjustment where we were checking the next input rune for a segment start, not a separator. Notably, the scoring improvements above were all derived from first principles, and happened to also improve the conformance rate in the new test. Additionally, make the following cleanup: - s/character/rune throughout, since that's what we mean - add debugging support for more easily understanding the match algorithm - add additional commentary - add benchmarks Fixes golang/go#60201 Change-Id: I838898c49cbb69af083a8cc837612da047778c40 Reviewed-on: https://go-review.googlesource.com/c/tools/+/531697 Reviewed-by: Alan Donovan Auto-Submit: Robert Findley LUCI-TryBot-Result: Go LUCI --- gopls/internal/lsp/command/interface.go | 1 + internal/fuzzy/self_test.go | 39 +++++ internal/fuzzy/symbol.go | 186 +++++++++++++++-------- internal/fuzzy/symbol_test.go | 192 +++++++++++++++++++++--- 4 files changed, 334 insertions(+), 84 deletions(-) create mode 100644 internal/fuzzy/self_test.go diff --git a/gopls/internal/lsp/command/interface.go b/gopls/internal/lsp/command/interface.go index 603e6121c8a..c3bd921fcf1 100644 --- a/gopls/internal/lsp/command/interface.go +++ b/gopls/internal/lsp/command/interface.go @@ -37,6 +37,7 @@ type Interface interface { // // Applies a fix to a region of source code. ApplyFix(context.Context, ApplyFixArgs) error + // Test: Run test(s) (legacy) // // Runs `go test` for a specific set of test or benchmark functions. diff --git a/internal/fuzzy/self_test.go b/internal/fuzzy/self_test.go new file mode 100644 index 00000000000..fae0aeae249 --- /dev/null +++ b/internal/fuzzy/self_test.go @@ -0,0 +1,39 @@ +// Copyright 2023 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package fuzzy_test + +import ( + "testing" + + . "golang.org/x/tools/internal/fuzzy" +) + +func BenchmarkSelf_Matcher(b *testing.B) { + idents := collectIdentifiers(b) + patterns := generatePatterns() + + for i := 0; i < b.N; i++ { + for _, pattern := range patterns { + sm := NewMatcher(pattern) + for _, ident := range idents { + _ = sm.Score(ident) + } + } + } +} + +func BenchmarkSelf_SymbolMatcher(b *testing.B) { + idents := collectIdentifiers(b) + patterns := generatePatterns() + + for i := 0; i < b.N; i++ { + for _, pattern := range patterns { + sm := NewSymbolMatcher(pattern) + for _, ident := range idents { + _, _ = sm.Match([]string{ident}) + } + } + } +} diff --git a/internal/fuzzy/symbol.go b/internal/fuzzy/symbol.go index bf93041521b..5fe2ce3e2a3 100644 --- a/internal/fuzzy/symbol.go +++ b/internal/fuzzy/symbol.go @@ -5,6 +5,9 @@ package fuzzy import ( + "bytes" + "fmt" + "log" "unicode" ) @@ -36,10 +39,12 @@ type SymbolMatcher struct { segments [256]uint8 // how many segments from the right is each rune } +// Rune roles. const ( - segmentStart uint32 = 1 << iota - wordStart - separator + segmentStart uint32 = 1 << iota // input rune starts a segment (i.e. follows '/' or '.') + wordStart // input rune starts a word, per camel-case naming rules + separator // input rune is a separator ('/' or '.') + upper // input rune is an upper case letter ) // NewSymbolMatcher creates a SymbolMatcher that may be used to match the given @@ -61,17 +66,17 @@ func NewSymbolMatcher(pattern string) *SymbolMatcher { return m } -// Match looks for the right-most match of the search pattern within the symbol -// represented by concatenating the given chunks, returning its offset and -// score. +// Match searches for the right-most match of the search pattern within the +// symbol represented by concatenating the given chunks. // -// If a match is found, the first return value will hold the absolute byte -// offset within all chunks for the start of the symbol. In other words, the -// index of the match within strings.Join(chunks, ""). If no match is found, -// the first return value will be -1. +// If a match is found, the first result holds the absolute byte offset within +// all chunks for the start of the symbol. In other words, the index of the +// match within strings.Join(chunks, ""). // // The second return value will be the score of the match, which is always // between 0 and 1, inclusive. A score of 0 indicates no match. +// +// If no match is found, Match returns (-1, 0). func (m *SymbolMatcher) Match(chunks []string) (int, float64) { // Explicit behavior for an empty pattern. // @@ -81,11 +86,25 @@ func (m *SymbolMatcher) Match(chunks []string) (int, float64) { return -1, 0 } - // First phase: populate the input buffer with lower-cased runes. + // Matching implements a heavily optimized linear scoring algorithm on the + // input. This is not guaranteed to produce the highest score, but works well + // enough, particularly due to the right-to-left significance of qualified + // symbols. + // + // Matching proceeds in three passes through the input: + // - The first pass populates the input buffer and collects rune roles. + // - The second pass proceeds right-to-left to find the right-most match. + // - The third pass proceeds left-to-right from the start of the right-most + // match, to find the most *compact* match, and computes the score of this + // match. + // + // See below for more details of each pass, as well as the scoring algorithm. + + // First pass: populate the input buffer out of the provided chunks + // (lower-casing in the process), and collect rune roles. // // We could also check for a forward match here, but since we'd have to write // the entire input anyway this has negligible impact on performance. - var ( inputLen = uint8(0) modifiers = wordStart | segmentStart @@ -107,7 +126,16 @@ input: l = unicode.ToLower(r) } if l != r { - modifiers |= wordStart + modifiers |= upper + + // If the current rune is capitalized *and the preceding rune was not*, + // mark this as a word start. This avoids spuriously high ranking of + // non-camelcase naming schemas, such as the + // yaml_PARSE_FLOW_SEQUENCE_ENTRY_MAPPING_END_STATE example of + // golang/go#60201. + if inputLen == 0 || m.roles[inputLen-1]&upper == 0 { + modifiers |= wordStart + } } m.inputBuffer[inputLen] = l m.roles[inputLen] = modifiers @@ -125,14 +153,13 @@ input: } } - // Second phase: find the right-most match, and count segments from the + // Second pass: find the right-most match, and count segments from the // right. - var ( pi = uint8(m.patternLen - 1) // pattern index p = m.pattern[pi] // pattern rune start = -1 // start offset of match - rseg = uint8(0) + rseg = uint8(0) // effective "depth" from the right of the current rune in consideration ) const maxSeg = 3 // maximum number of segments from the right to count, for scoring purposes. @@ -144,6 +171,8 @@ input: m.segments[ii] = rseg if p == r { if pi == 0 { + // TODO(rfindley): BUG: the docstring for Match says that it returns an + // absolute byte offset, but clearly it is returning a rune offset here. start = int(ii) break } @@ -161,85 +190,120 @@ input: return -1, 0 } - // Third phase: find the shortest match, and compute the score. + // Third pass: find the shortest match and compute the score. - // Score is the average score for each character. + // Score is the average score for each rune. // - // A character score is the multiple of: - // 1. 1.0 if the character starts a segment or is preceded by a matching - // character, 0.9 if the character starts a mid-segment word, else 0.6. + // A rune score is the multiple of: + // 1. The base score, which is 1.0 if the rune starts a segment, 0.9 if the + // rune starts a mid-segment word, else 0.6. // - // Note that characters preceded by a matching character get the max - // score of 1.0 so that sequential or exact matches are preferred, even - // if they don't start/end at a segment or word boundary. For example, a - // match for "func" in intfuncs should have a higher score than in - // ifunmatched. + // Runes preceded by a matching rune are treated the same as the start + // of a mid-segment word (with a 0.9 score), so that sequential or exact + // matches are preferred. We call this a sequential bonus. // - // For the final character match, the multiplier from (1) is reduced to - // 0.9 if the next character in the input is a mid-segment word, or 0.6 - // if the next character in the input is not a word or segment start. - // This ensures that we favor whole-word or whole-segment matches over - // prefix matches. + // For the final rune match, this sequential bonus is reduced to 0.8 if + // the next rune in the input is a mid-segment word, or 0.7 if the next + // rune in the input is not a word or segment start. This ensures that + // we favor whole-word or whole-segment matches over prefix matches. // - // 2. 1.0 if the character is part of the last segment, otherwise + // 2. 1.0 if the rune is part of the last segment, otherwise // 1.0-0.1*, with a max segment count of 3. // Notably 1.0-0.1*3 = 0.7 > 0.6, so that foo/_/_/_/_ (a match very - // early in a qualified symbol name) still scores higher than _f_o_o_ - // (a completely split match). + // early in a qualified symbol name) still scores higher than _f_o_o_ (a + // completely split match). // // This is a naive algorithm, but it is fast. There's lots of prior art here // that could be leveraged. For example, we could explicitly consider - // character distance, and exact matches of words or segments. + // rune distance, and exact matches of words or segments. // // Also note that this might not actually find the highest scoring match, as // doing so could require a non-linear algorithm, depending on how the score // is calculated. + // debugging support + const debug = false // enable to log debugging information + var ( + runeScores []float64 + runeIdxs []int + ) + pi = 0 p = m.pattern[pi] const ( - segStreak = 1.0 // start of segment or sequential match - wordStreak = 0.9 // start of word match - noStreak = 0.6 - perSegment = 0.1 // we count at most 3 segments above + segStartScore = 1.0 // base score of runes starting a segment + wordScore = 0.9 // base score of runes starting or continuing a word + noStreak = 0.6 + perSegment = 0.1 // we count at most 3 segments above ) - streakBonus := noStreak totScore := 0.0 + lastMatch := uint8(255) for ii := uint8(start); ii < inputLen; ii++ { r := m.inputBuffer[ii] if r == p { pi++ + finalRune := pi >= m.patternLen p = m.pattern[pi] - // Note: this could be optimized with some bit operations. + + baseScore := noStreak + + // Calculate the sequence bonus based on preceding matches. + // + // We do this first as it is overridden by role scoring below. + if lastMatch == ii-1 { + baseScore = wordScore + // Reduce the sequence bonus for the final rune of the pattern based on + // whether it borders a new segment or word. + if finalRune { + switch { + case ii == inputLen-1 || m.roles[ii+1]&separator != 0: + // Full segment: no reduction + case m.roles[ii+1]&wordStart != 0: + baseScore = wordScore - 0.1 + default: + baseScore = wordScore - 0.2 + } + } + } + lastMatch = ii + + // Calculate the rune's role score. If the rune starts a segment or word, + // this overrides the sequence score, as the rune starts a new sequence. switch { - case m.roles[ii]&segmentStart != 0 && segStreak > streakBonus: - streakBonus = segStreak - case m.roles[ii]&wordStart != 0 && wordStreak > streakBonus: - streakBonus = wordStreak + case m.roles[ii]&segmentStart != 0: + baseScore = segStartScore + case m.roles[ii]&wordStart != 0: + baseScore = wordScore } - finalChar := pi >= m.patternLen - // finalCost := 1.0 - if finalChar && streakBonus > noStreak { - switch { - case ii == inputLen-1 || m.roles[ii+1]&segmentStart != 0: - // Full segment: no reduction - case m.roles[ii+1]&wordStart != 0: - streakBonus = wordStreak - default: - streakBonus = noStreak - } + + // Apply the segment-depth penalty (segments from the right). + runeScore := baseScore * (1.0 - float64(m.segments[ii])*perSegment) + if debug { + runeScores = append(runeScores, runeScore) + runeIdxs = append(runeIdxs, int(ii)) } - totScore += streakBonus * (1.0 - float64(m.segments[ii])*perSegment) - if finalChar { + totScore += runeScore + if finalRune { break } - streakBonus = segStreak // see above: sequential characters get the max score - } else { - streakBonus = noStreak } } + if debug { + // Format rune roles and scores in line: + // fo[o:.52].[b:1]a[r:.6] + var summary bytes.Buffer + last := 0 + for i, idx := range runeIdxs { + summary.WriteString(string(m.inputBuffer[last:idx])) // encode runes + fmt.Fprintf(&summary, "[%s:%.2g]", string(m.inputBuffer[idx]), runeScores[i]) + last = idx + 1 + } + summary.WriteString(string(m.inputBuffer[last:inputLen])) // encode runes + log.Println(summary.String()) + } + return start, totScore / float64(m.patternLen) } diff --git a/internal/fuzzy/symbol_test.go b/internal/fuzzy/symbol_test.go index 2a9d9b663bd..43e629d51ef 100644 --- a/internal/fuzzy/symbol_test.go +++ b/internal/fuzzy/symbol_test.go @@ -5,8 +5,12 @@ package fuzzy_test import ( + "go/ast" + "go/token" + "sort" "testing" + "golang.org/x/tools/go/packages" . "golang.org/x/tools/internal/fuzzy" ) @@ -34,30 +38,173 @@ func TestSymbolMatchIndex(t *testing.T) { } func TestSymbolRanking(t *testing.T) { - matcher := NewSymbolMatcher("test") - // symbols to match, in ascending order of ranking. - symbols := []string{ - "this.is.better.than.most", - "test.foo.bar", - "thebest", - "test.foo", - "test.foo", - "atest", - "testage", - "tTest", - "foo.test", - "test", + // query -> symbols to match, in ascending order of score + queryRanks := map[string][]string{ + "test": { + "this.is.better.than.most", + "test.foo.bar", + "thebest", + "atest", + "test.foo", + "testage", + "tTest", + "foo.test", + }, + "parseside": { // golang/go#60201 + "yaml_PARSE_FLOW_SEQUENCE_ENTRY_MAPPING_END_STATE", + "parseContext.parse_sidebyside", + }, + "cvb": { + "filecache_test.testIPCValueB", + "cover.Boundary", + }, + "dho": { + "gocommand.DebugHangingGoCommands", + "protocol.DocumentHighlightOptions", + }, + "flg": { + "completion.FALLTHROUGH", + "main.flagGoCmd", + }, + "fvi": { + "godoc.fileIndexVersion", + "macho.FlagSubsectionsViaSymbols", + }, } - prev := 0.0 - for _, sym := range symbols { - _, score := matcher.Match([]string{sym}) - t.Logf("Match(%q) = %v", sym, score) - if score < prev { - t.Errorf("Match(%q) = _, %v, want > %v", sym, score, prev) + + for query, symbols := range queryRanks { + t.Run(query, func(t *testing.T) { + matcher := NewSymbolMatcher(query) + prev := 0.0 + for _, sym := range symbols { + _, score := matcher.Match([]string{sym}) + t.Logf("Match(%q) = %v", sym, score) + if score <= prev { + t.Errorf("Match(%q) = _, %v, want > %v", sym, score, prev) + } + prev = score + } + }) + } +} + +func TestMatcherSimilarities(t *testing.T) { + // This test compares the fuzzy matcher with the symbol matcher on a corpus + // of qualified identifiers extracted from x/tools. + // + // These two matchers are not expected to agree, but inspecting differences + // can be useful for finding interesting ranking edge cases. + t.Skip("unskip this test to compare matchers") + + idents := collectIdentifiers(t) + t.Logf("collected %d unique identifiers", len(idents)) + + // TODO: use go1.21 slices.MaxFunc. + topMatch := func(score func(string) float64) string { + top := "" + topScore := 0.0 + for _, cand := range idents { + if s := score(cand); s > topScore { + top = cand + topScore = s + } } - prev = score + return top } + + agreed := 0 + total := 0 + bad := 0 + patterns := generatePatterns() + for _, pattern := range patterns { + total++ + + fm := NewMatcher(pattern) + topFuzzy := topMatch(func(input string) float64 { + return float64(fm.Score(input)) + }) + sm := NewSymbolMatcher(pattern) + topSymbol := topMatch(func(input string) float64 { + _, score := sm.Match([]string{input}) + return score + }) + switch { + case topFuzzy == "" && topSymbol != "": + if false { + // The fuzzy matcher has a bug where it misses some matches; for this + // test we only care about the symbol matcher. + t.Logf("%q matched %q but no fuzzy match", pattern, topSymbol) + } + total-- + bad++ + case topFuzzy != "" && topSymbol == "": + t.Fatalf("%q matched %q but no symbol match", pattern, topFuzzy) + case topFuzzy == topSymbol: + agreed++ + default: + // Enable this log to see mismatches. + if false { + t.Logf("mismatch for %q: fuzzy: %q, symbol: %q", pattern, topFuzzy, topSymbol) + } + } + } + t.Logf("fuzzy matchers agreed on %d out of %d queries (%d bad)", agreed, total, bad) +} + +func collectIdentifiers(tb testing.TB) []string { + cfg := &packages.Config{ + Mode: packages.NeedName | packages.NeedSyntax | packages.NeedFiles, + Tests: true, + } + pkgs, err := packages.Load(cfg, "golang.org/x/tools/...") + if err != nil { + tb.Fatal(err) + } + uniqueIdents := make(map[string]bool) + decls := 0 + for _, pkg := range pkgs { + for _, f := range pkg.Syntax { + for _, decl := range f.Decls { + decls++ + switch decl := decl.(type) { + case *ast.GenDecl: + for _, spec := range decl.Specs { + switch decl.Tok { + case token.IMPORT: + case token.TYPE: + name := spec.(*ast.TypeSpec).Name.Name + qualified := pkg.Name + "." + name + uniqueIdents[qualified] = true + case token.CONST, token.VAR: + for _, n := range spec.(*ast.ValueSpec).Names { + qualified := pkg.Name + "." + n.Name + uniqueIdents[qualified] = true + } + } + } + } + } + } + } + var idents []string + for k := range uniqueIdents { + idents = append(idents, k) + } + sort.Strings(idents) + return idents +} + +func generatePatterns() []string { + var patterns []string + for x := 'a'; x <= 'z'; x++ { + for y := 'a'; y <= 'z'; y++ { + for z := 'a'; z <= 'z'; z++ { + patterns = append(patterns, string(x)+string(y)+string(z)) + } + } + } + return patterns } // Test that we strongly prefer exact matches. @@ -89,9 +236,8 @@ func TestSymbolRanking_Issue60027(t *testing.T) { func TestChunkedMatch(t *testing.T) { matcher := NewSymbolMatcher("test") - + _, want := matcher.Match([]string{"test"}) chunked := [][]string{ - {"test"}, {"", "test"}, {"test", ""}, {"te", "st"}, @@ -99,7 +245,7 @@ func TestChunkedMatch(t *testing.T) { for _, chunks := range chunked { offset, score := matcher.Match(chunks) - if offset != 0 || score != 1.0 { + if offset != 0 || score != want { t.Errorf("Match(%v) = %v, %v, want 0, 1.0", chunks, offset, score) } }