whatcanGOwrong

This commit is contained in:
2024-09-19 21:38:24 -04:00
commit d0ae4d841d
17908 changed files with 4096831 additions and 0 deletions
@@ -0,0 +1,183 @@
// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package fuzzy
import (
"unicode"
)
// RuneRole specifies the role of a rune in the context of an input.
type RuneRole byte
const (
// RNone specifies a rune without any role in the input (i.e., whitespace/non-ASCII).
RNone RuneRole = iota
// RSep specifies a rune with the role of segment separator.
RSep
// RTail specifies a rune which is a lower-case tail in a word in the input.
RTail
// RUCTail specifies a rune which is an upper-case tail in a word in the input.
RUCTail
// RHead specifies a rune which is the first character in a word in the input.
RHead
)
// RuneRoles detects the roles of each byte rune in an input string and stores it in the output
// slice. The rune role depends on the input type. Stops when it parsed all the runes in the string
// or when it filled the output. If output is nil, then it gets created.
func RuneRoles(candidate []byte, reuse []RuneRole) []RuneRole {
	var output []RuneRole
	if cap(reuse) < len(candidate) {
		output = make([]RuneRole, 0, len(candidate))
	} else {
		output = reuse[:0] // reuse the caller's backing array to avoid allocating
	}

	// prev/prev2 track the rune types of the one and two preceding bytes,
	// which drive head/tail classification and the retroactive fixup below.
	prev, prev2 := rtNone, rtNone
	for i := 0; i < len(candidate); i++ {
		r := rune(candidate[i])

		role := RNone

		// Classify the byte via the rt lookup table; bytes above ASCII
		// default to rtLower.
		curr := rtLower
		if candidate[i] <= unicode.MaxASCII {
			curr = runeType(rt[candidate[i]] - '0')
		}

		if curr == rtLower {
			if prev == rtNone || prev == rtPunct {
				role = RHead
			} else {
				role = RTail
			}
		} else if curr == rtUpper {
			role = RHead

			if prev == rtUpper {
				// This and previous characters are both upper case.

				if i+1 == len(candidate) {
					// This is last character, previous was also uppercase -> this is UCTail
					// i.e., (current char is C): aBC / BC / ABC
					role = RUCTail
				}
			}
		} else if curr == rtPunct {
			switch r {
			case '.', ':':
				role = RSep
			}
		}
		if curr != rtLower {
			// Retroactive fixup: a rune previously marked RHead turns out to be
			// the interior of an all-caps run, so demote it to RUCTail.
			if i > 1 && output[i-1] == RHead && prev2 == rtUpper && (output[i-2] == RHead || output[i-2] == RUCTail) {
				// The previous two characters were uppercase. The current one is not a lower case, so the
				// previous one can't be a HEAD. Make it a UCTail.
				// i.e., (last char is current char - B must be a UCTail): ABC / ZABC / AB.
				output[i-1] = RUCTail
			}
		}

		output = append(output, role)
		prev2 = prev
		prev = curr
	}
	return output
}
// runeType classifies a single ASCII byte for rune-role detection.
type runeType byte

const (
	rtNone  runeType = iota // any other ASCII byte (whitespace, control, most symbols)
	rtPunct                 // punctuation that can separate segments/words ('.', ':', '/', '_', ...)
	rtLower                 // lower-case letter or digit
	rtUpper                 // upper-case letter
)

// rt maps each ASCII code point (the index) to its runeType, encoded as the
// ASCII digit of the runeType value ('0'..'3'); see RuneRoles.
const rt = "00000000000000000000000000000000000000000000001122222222221000000333333333333333333333333330000002222222222222222222222222200000"
// LastSegment returns the substring representing the last segment from the input, where each
// byte has an associated RuneRole in the roles slice. This makes sense only for inputs of Symbol
// or Filename type.
func LastSegment(input string, roles []RuneRole) string {
	// Drop trailing separators; end becomes the exclusive bound of the segment.
	end := len(input)
	for end > 0 && roles[end-1] == RSep {
		end--
	}
	if end == 0 {
		// Input was empty or consisted only of separators.
		return ""
	}

	// Walk left until the byte before start is a separator (or we hit the
	// beginning of the input).
	start := end - 1
	for start > 0 && roles[start-1] != RSep {
		start--
	}
	return input[start:end]
}
// fromChunks copies string chunks into the given buffer.
// Copying stops once the buffer is full; any remaining bytes are discarded.
// The returned slice aliases buffer and covers exactly the copied bytes.
func fromChunks(chunks []string, buffer []byte) []byte {
	ii := 0
	for _, chunk := range chunks {
		// Bound by len(buffer), not cap(buffer): indexing past the length
		// panics even when extra capacity exists, and once full there is no
		// point iterating the remaining chunks.
		if ii >= len(buffer) {
			break
		}
		ii += copy(buffer[ii:], chunk)
	}
	return buffer[:ii]
}
// toLower transforms the input string to lower case, which is stored in the output byte slice.
// The lower casing considers only ASCII values - non ASCII values are left unmodified.
// If reuse has sufficient capacity it is used as the output backing array;
// otherwise a new slice is allocated. The returned slice has len(input) bytes.
func toLower(input []byte, reuse []byte) []byte {
	var output []byte
	if cap(reuse) < len(input) {
		output = make([]byte, len(input))
	} else {
		// Reslice to the exact length: reuse may have length shorter than its
		// capacity, and the indexed writes below require length, not capacity.
		output = reuse[:len(input)]
	}

	for i := 0; i < len(input); i++ {
		r := rune(input[i])
		if input[i] <= unicode.MaxASCII {
			if 'A' <= r && r <= 'Z' {
				r += 'a' - 'A'
			}
		}
		output[i] = byte(r)
	}
	return output
}
// WordConsumer defines a consumer for a word delimited by the [start,end) byte offsets in an input
// (start is inclusive, end is exclusive). It is the callback type used by Words.
type WordConsumer func(start, end int)
// Words finds word delimiters in an input based on its bytes' mappings to
// rune roles, and feeds each word's [start, end) offsets to consume.
func Words(roles []RuneRole, consume WordConsumer) {
	var wordStart int
	for i, r := range roles {
		if r == RTail || r == RUCTail {
			// Still inside the current word.
			continue
		}
		// r is a boundary: a new head, a separator, or a role-less byte.
		if i != wordStart {
			consume(wordStart, i)
		}
		if r == RHead {
			// The head byte begins the next word.
			wordStart = i
		} else {
			// Separators and role-less bytes belong to no word.
			wordStart = i + 1
		}
	}
	// Flush the trailing word, if any.
	if wordStart != len(roles) {
		consume(wordStart, len(roles))
	}
}
@@ -0,0 +1,141 @@
// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package fuzzy_test
import (
"bytes"
"sort"
"testing"
"golang.org/x/tools/internal/fuzzy"
)
// rolesTests maps an input string to its expected role string, one byte per
// input byte, using the rolesString encoding:
// 'C' head, 'c' lower-case tail, 'u' upper-case tail, '/' separator, ' ' none.
var rolesTests = []struct {
	str  string
	want string
}{
	{str: "abc::def::goo", want: "Ccc//Ccc//Ccc"},
	{str: "proto::Message", want: "Ccccc//Ccccccc"},
	{str: "AbstractSWTFactory", want: "CcccccccCuuCcccccc"},
	{str: "Abs012", want: "Cccccc"},
	{str: "/", want: " "},
	{str: "fOO", want: "CCu"},
	{str: "fo_oo.o_oo", want: "Cc Cc/C Cc"},
}
// rolesString renders a role slice compactly, one byte per role:
// ' ' (none), '/' (separator), 'c' (tail), 'u' (upper-case tail), 'C' (head).
func rolesString(roles []fuzzy.RuneRole) string {
	const legend = " /cuC"
	var sb bytes.Buffer
	for _, role := range roles {
		sb.WriteByte(legend[int(role)])
	}
	return sb.String()
}
// TestRoles checks RuneRoles against the rolesTests fixtures.
func TestRoles(t *testing.T) {
	for _, tc := range rolesTests {
		out := make([]fuzzy.RuneRole, len(tc.str))
		fuzzy.RuneRoles([]byte(tc.str), out)

		if got := rolesString(out); got != tc.want {
			t.Errorf("roles(%s) = %v; want %v", tc.str, got, tc.want)
		}
	}
}
// wordSplitTests lists inputs and the words Words is expected to produce
// (compared order-insensitively by diffStringLists).
var wordSplitTests = []struct {
	input string
	want  []string
}{
	{
		input: "foo bar baz",
		want:  []string{"foo", "bar", "baz"},
	},
	{
		input: "fooBarBaz",
		want:  []string{"foo", "Bar", "Baz"},
	},
	{
		input: "FOOBarBAZ",
		want:  []string{"FOO", "Bar", "BAZ"},
	},
	{
		input: "foo123_bar2Baz3",
		want:  []string{"foo123", "bar2", "Baz3"},
	},
}
// TestWordSplit checks Words against the wordSplitTests fixtures.
func TestWordSplit(t *testing.T) {
	for _, tc := range wordSplitTests {
		roles := fuzzy.RuneRoles([]byte(tc.input), nil)

		var words []string
		fuzzy.Words(roles, func(start, end int) {
			words = append(words, tc.input[start:end])
		})

		if !diffStringLists(tc.want, words) {
			t.Errorf("input %v: (want %v -> got %v)", tc.input, tc.want, words)
		}
	}
}
// diffStringLists reports whether a and b contain the same strings, ignoring
// order. The inputs are left unmodified.
func diffStringLists(a, b []string) bool {
	if len(a) != len(b) {
		return false
	}
	// Sort copies rather than the arguments: sorting in place would reorder
	// the callers' slices (including shared test fixtures) as a side effect.
	a = append([]string(nil), a...)
	b = append([]string(nil), b...)
	sort.Strings(a)
	sort.Strings(b)
	for i := range a {
		if a[i] != b[i] {
			return false
		}
	}
	return true
}
// lastSegmentSplitTests lists inputs and the expected result of LastSegment,
// i.e. the substring after the final '.'/'::' separator run.
var lastSegmentSplitTests = []struct {
	str  string
	want string
}{
	{
		str:  "identifier",
		want: "identifier",
	},
	{
		str:  "two_words",
		want: "two_words",
	},
	{
		str:  "first::second",
		want: "second",
	},
	{
		str:  "foo.bar.FOOBar_buz123_test",
		want: "FOOBar_buz123_test",
	},
}
// TestLastSegment checks LastSegment against the lastSegmentSplitTests fixtures.
func TestLastSegment(t *testing.T) {
	for _, tc := range lastSegmentSplitTests {
		roles := fuzzy.RuneRoles([]byte(tc.str), nil)

		if got := fuzzy.LastSegment(tc.str, roles); got != tc.want {
			t.Errorf("str %v: want %v; got %v", tc.str, tc.want, got)
		}
	}
}
// BenchmarkRoles measures RuneRoles throughput on a camel-case identifier,
// reusing the output slice across iterations.
func BenchmarkRoles(b *testing.B) {
	str := "AbstractSWTFactory"
	out := make([]fuzzy.RuneRole, len(str))

	for i := 0; i < b.N; i++ {
		fuzzy.RuneRoles([]byte(str), out)
	}
	b.SetBytes(int64(len(str)))
}
@@ -0,0 +1,434 @@
// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package fuzzy implements a fuzzy matching algorithm.
package fuzzy
import (
"bytes"
"fmt"
)
// Size limits. Oversized values are truncated rather than rejected: see
// NewMatcher (pattern) and ScoreChunks (candidate).
const (
	// MaxInputSize is the maximum size of the input scored against the fuzzy matcher. Longer inputs
	// will be truncated to this size.
	MaxInputSize = 127

	// MaxPatternSize is the maximum size of the pattern used to construct the fuzzy matcher. Longer
	// inputs are truncated to this size.
	MaxPatternSize = 63
)
// scoreVal packs a score together with a one-bit provenance flag (prevK)
// recording which table dimension the score was derived from.
type scoreVal int

// val unpacks the score component (arithmetic shift preserves sign).
func (s scoreVal) val() int {
	return int(s) >> 1
}

// prevK unpacks the provenance bit (0 or 1).
func (s scoreVal) prevK() int {
	return int(s) & 1
}

// score packs val and prevK into a single scoreVal.
func score(val int, prevK int /*0 or 1*/) scoreVal {
	// val<<1 has a zero low bit, so OR-ing in prevK is equivalent to adding it.
	return scoreVal(val<<1 | prevK)
}
// Matcher implements a fuzzy matching algorithm for scoring candidates against a pattern.
// The matcher does not support parallel usage.
type Matcher struct {
	pattern       string
	patternLower  []byte // lower-case version of the pattern
	patternShort  []byte // first characters of the pattern
	caseSensitive bool   // set if the pattern is mix-cased

	patternRoles []RuneRole // the role of each character in the pattern
	roles        []RuneRole // the role of each character in the tested string

	// scores is the dynamic-programming table, indexed by candidate position,
	// pattern position, and a 0/1 dimension distinguishing skip vs. match
	// states (see computeScore).
	scores [MaxInputSize + 1][MaxPatternSize + 1][2]scoreVal

	// scoreScale normalizes a raw integer score into [0, 1]; set by NewMatcher.
	scoreScale float32

	lastCandidateLen     int // in bytes
	lastCandidateMatched bool

	// Reusable buffers to avoid allocating for every candidate.
	//  - inputBuf stores the concatenated input chunks
	//  - lowerBuf stores the last candidate in lower-case
	//  - rolesBuf stores the calculated roles for each rune in the last
	//    candidate.
	inputBuf [MaxInputSize]byte
	lowerBuf [MaxInputSize]byte
	rolesBuf [MaxInputSize]RuneRole
}
// bestK returns the dimension (0 or 1) holding the higher score at table cell
// (i, j), preferring 0 on ties.
func (m *Matcher) bestK(i, j int) int {
	if m.scores[i][j][1].val() > m.scores[i][j][0].val() {
		return 1
	}
	return 0
}
// NewMatcher returns a new fuzzy matcher for scoring candidates against the provided pattern.
func NewMatcher(pattern string) *Matcher {
	if len(pattern) > MaxPatternSize {
		pattern = pattern[:MaxPatternSize]
	}

	m := &Matcher{
		pattern:      pattern,
		patternLower: toLower([]byte(pattern), nil),
	}

	// The pattern is treated as case-sensitive iff it contains any
	// upper-case ASCII character.
	for i, c := range m.patternLower {
		if pattern[i] != c {
			m.caseSensitive = true
			break
		}
	}

	// patternShort keeps at most the first three lower-cased bytes; it is
	// used to allow sufficiently long mid-word matches (see computeScore).
	short := m.patternLower
	if len(short) > 3 {
		short = short[:3]
	}
	m.patternShort = short

	m.patternRoles = RuneRoles([]byte(pattern), nil)

	if len(pattern) > 0 {
		// A perfect per-character score is 4, so dividing by
		// 4*len(pattern) maps raw scores into [0, 1].
		maxCharScore := 4
		m.scoreScale = 1 / float32(maxCharScore*len(pattern))
	}

	return m
}
// Score returns the score returned by matching the candidate to the pattern.
// This is not designed for parallel use. Multiple candidates must be scored sequentially.
// Returns a score between 0 and 1 (0 - no match, 1 - perfect match).
func (m *Matcher) Score(candidate string) float32 {
	// A single candidate is simply a one-chunk input.
	return m.ScoreChunks([]string{candidate})
}
// ScoreChunks scores the candidate formed by concatenating chunks against the
// pattern, returning a value in [0, 1] (0 - no match, 1 - perfect match).
// It records state used by MatchedRanges; not safe for parallel use.
func (m *Matcher) ScoreChunks(chunks []string) float32 {
	// Assemble and truncate the candidate into the reusable buffers.
	candidate := fromChunks(chunks, m.inputBuf[:])
	if len(candidate) > MaxInputSize {
		candidate = candidate[:MaxInputSize]
	}
	lower := toLower(candidate, m.lowerBuf[:])
	m.lastCandidateLen = len(candidate)

	if len(m.pattern) == 0 {
		// Empty patterns perfectly match candidates.
		return 1
	}

	if m.match(candidate, lower) {
		sc := m.computeScore(candidate, lower)
		// Scores near the minScore sentinel indicate no viable match path;
		// poorMatch filters out noise matches of scattered short fragments.
		if sc > minScore/2 && !m.poorMatch() {
			m.lastCandidateMatched = true
			if len(m.pattern) == len(candidate) {
				// Perfect match.
				return 1
			}

			if sc < 0 {
				sc = 0
			}
			normalizedScore := float32(sc) * m.scoreScale
			if normalizedScore > 1 {
				normalizedScore = 1
			}

			return normalizedScore
		}
	}

	m.lastCandidateMatched = false
	return 0
}
// minScore is a sentinel for unreachable scoring states: table cells are
// initialized to minScore<<1 (i.e. score(minScore, 0)), and results are
// compared against minScore/2 to detect that no real match path was found.
const minScore = -10000
// MatchedRanges returns matches ranges for the last scored string as a flattened array of
// [begin, end) byte offset pairs.
func (m *Matcher) MatchedRanges() []int {
	if len(m.pattern) == 0 || !m.lastCandidateMatched {
		return nil
	}
	i, j := m.lastCandidateLen, len(m.pattern)
	// Both dimensions near the minScore sentinel means no viable match path.
	if m.scores[i][j][0].val() < minScore/2 && m.scores[i][j][1].val() < minScore/2 {
		return nil
	}

	var ret []int
	// Backtrack through the score table from the end; k == 1 means candidate
	// byte i-1 was matched against pattern byte j-1.
	k := m.bestK(i, j)
	for i > 0 {
		take := (k == 1)
		k = m.scores[i][j][k].prevK()
		if take {
			// Ranges are accumulated back-to-front as (end, start) pairs;
			// extend the current pair when the match is contiguous.
			if len(ret) == 0 || ret[len(ret)-1] != i {
				ret = append(ret, i)
				ret = append(ret, i-1)
			} else {
				ret[len(ret)-1] = i - 1
			}
			j--
		}
		i--
	}
	// Reverse slice, turning the back-to-front pairs into [begin, end) order.
	for i := 0; i < len(ret)/2; i++ {
		ret[i], ret[len(ret)-1-i] = ret[len(ret)-1-i], ret[i]
	}
	return ret
}
// match reports whether the lower-cased pattern is a subsequence of
// candidateLower. On success it also computes the candidate's rune roles into
// m.roles for use by the scoring passes.
func (m *Matcher) match(candidate []byte, candidateLower []byte) bool {
	j := 0
	for i := 0; i < len(candidateLower) && j < len(m.patternLower); i++ {
		if candidateLower[i] == m.patternLower[j] {
			j++
		}
	}
	if j < len(m.patternLower) {
		// Some pattern byte was never found: not even a loose match.
		return false
	}

	// The input passes the simple test against pattern, so it is time to
	// classify its characters. Character roles are used below to find the
	// last segment.
	m.roles = RuneRoles(candidate, m.rolesBuf[:])

	return true
}
// computeScore fills the dynamic-programming table m.scores and returns the
// best raw score for matching the pattern against the candidate.
//
// Cell [i][j][k] is the best score using the first i candidate bytes and the
// first j pattern bytes; k records whether candidate byte i-1 was matched
// (k == 1) or skipped (k == 0). The caller must have run match() first so
// that m.roles is populated.
func (m *Matcher) computeScore(candidate []byte, candidateLower []byte) int {
	pattLen, candLen := len(m.pattern), len(candidate)

	// Row 0: nothing consumed. Only the empty/empty state is reachable.
	for j := 0; j <= len(m.pattern); j++ {
		m.scores[0][j][0] = minScore << 1
		m.scores[0][j][1] = minScore << 1
	}
	m.scores[0][0][0] = score(0, 0) // Start with 0.

	// Count segments and find where the last one starts.
	segmentsLeft, lastSegStart := 1, 0
	for i := 0; i < candLen; i++ {
		if m.roles[i] == RSep {
			segmentsLeft++
			lastSegStart = i + 1
		}
	}

	// A per-character bonus for a consecutive match.
	consecutiveBonus := 2
	for i := 1; i <= candLen; i++ {
		role := m.roles[i-1]

		// (A wordIdx counter per segment was previously tracked here but
		// never used; only the segment bookkeeping has an effect.)
		if role == RSep && segmentsLeft > 1 {
			segmentsLeft--
		}

		// Skipping the start of the first or the last segment is penalized,
		// to favor matches anchored at those positions.
		var skipPenalty int
		if i == 1 || (i-1) == lastSegStart {
			// Skipping the start of first or last segment.
			skipPenalty++
		}

		for j := 0; j <= pattLen; j++ {
			// By default, we don't have a match. Fill in the skip data.
			m.scores[i][j][1] = minScore << 1

			// Compute the skip score.
			k := 0
			if m.scores[i-1][j][0].val() < m.scores[i-1][j][1].val() {
				k = 1
			}

			skipScore := m.scores[i-1][j][k].val()
			// Do not penalize missing characters after the last matched segment.
			if j != pattLen {
				skipScore -= skipPenalty
			}

			m.scores[i][j][0] = score(skipScore, k)

			if j == 0 || candidateLower[i-1] != m.patternLower[j-1] {
				// Not a match.
				continue
			}
			pRole := m.patternRoles[j-1]

			if role == RTail && pRole == RHead {
				if j > 1 {
					// Not a match: a head in the pattern matches a tail character in the candidate.
					continue
				}
				// Special treatment for the first character of the pattern. We allow
				// matches in the middle of a word if they are long enough, at least
				// min(3, pattern.length) characters.
				if !bytes.HasPrefix(candidateLower[i-1:], m.patternShort) {
					continue
				}
			}

			// Compute the char score.
			var charScore int
			// Bonus 1: the char is in the candidate's last segment.
			if segmentsLeft <= 1 {
				charScore++
			}
			// Bonus 2: Case match or a Head in the pattern aligns with one in the word.
			// Single-case patterns lack segmentation signals and we assume any character
			// can be a head of a segment.
			if candidate[i-1] == m.pattern[j-1] || role == RHead && (!m.caseSensitive || pRole == RHead) {
				charScore++
			}

			// Penalty 1: pattern char is Head, candidate char is Tail.
			if role == RTail && pRole == RHead {
				charScore--
			}
			// Penalty 2: first pattern character matched in the middle of a word.
			if j == 1 && role == RTail {
				charScore -= 4
			}

			// Third dimension encodes whether there is a gap between the previous match and the current
			// one.
			for k := 0; k < 2; k++ {
				sc := m.scores[i-1][j-1][k].val() + charScore

				isConsecutive := k == 1 || i-1 == 0 || i-1 == lastSegStart
				if isConsecutive {
					// Bonus 3: a consecutive match. First character match also gets a bonus to
					// ensure prefix final match score normalizes to 1.0.
					// Logically, this is a part of charScore, but we have to compute it here because it
					// only applies for consecutive matches (k == 1).
					sc += consecutiveBonus
				}
				if k == 0 {
					// Penalty 3: Matching inside a segment (and previous char wasn't matched). Penalize for the lack
					// of alignment.
					if role == RTail || role == RUCTail {
						sc -= 3
					}
				}

				if sc > m.scores[i][j][1].val() {
					m.scores[i][j][1] = score(sc, k)
				}
			}
		}
	}

	result := m.scores[len(candidate)][len(m.pattern)][m.bestK(len(candidate), len(m.pattern))].val()

	return result
}
// ScoreTable returns the score table computed for the provided candidate. Used only for debugging.
// Each candidate byte produces two rows: "M" cells show the skip dimension
// (scores[i][j][0]) and "H" cells the match dimension (scores[i][j][1]), with
// the backtracking direction from prevK in parentheses.
func (m *Matcher) ScoreTable(candidate string) string {
	var buf bytes.Buffer

	// Header row: the pattern characters, plus a separator line.
	var line1, line2, separator bytes.Buffer
	line1.WriteString("\t")
	line2.WriteString("\t")
	for j := 0; j < len(m.pattern); j++ {
		line1.WriteString(fmt.Sprintf("%c\t\t", m.pattern[j]))
		separator.WriteString("----------------")
	}

	buf.WriteString(line1.String())
	buf.WriteString("\n")
	buf.WriteString(separator.String())
	buf.WriteString("\n")

	for i := 1; i <= len(candidate); i++ {
		line1.Reset()
		line2.Reset()

		line1.WriteString(fmt.Sprintf("%c\t", candidate[i-1]))
		line2.WriteString("\t")

		for j := 1; j <= len(m.pattern); j++ {
			line1.WriteString(fmt.Sprintf("M%6d(%c)\t", m.scores[i][j][0].val(), dir(m.scores[i][j][0].prevK())))
			line2.WriteString(fmt.Sprintf("H%6d(%c)\t", m.scores[i][j][1].val(), dir(m.scores[i][j][1].prevK())))
		}
		buf.WriteString(line1.String())
		buf.WriteString("\n")
		buf.WriteString(line2.String())
		buf.WriteString("\n")
		buf.WriteString(separator.String())
		buf.WriteString("\n")
	}

	return buf.String()
}
// dir maps a prevK bit to a letter for ScoreTable output: 0 -> 'M',
// anything else -> 'H'.
func dir(prevK int) rune {
	switch prevK {
	case 0:
		return 'M'
	default:
		return 'H'
	}
}
// poorMatch reports whether the last scored candidate matched the pattern
// mostly via scattered short (1-2 byte) fragments landing in the middle of
// words; more than one such fragment classifies the match as poor.
func (m *Matcher) poorMatch() bool {
	if len(m.pattern) < 2 {
		return false
	}

	i, j := m.lastCandidateLen, len(m.pattern)
	k := m.bestK(i, j)

	// runLen tracks the length of the current consecutive run of matched
	// bytes while backtracking. (Renamed from "len", which shadowed the
	// builtin.)
	var counter, runLen int
	for i > 0 {
		take := (k == 1)
		k = m.scores[i][j][k].prevK()
		if take {
			runLen++
			if k == 0 && runLen < 3 && m.roles[i-1] == RTail {
				// Short match in the middle of a word
				counter++
				if counter > 1 {
					return true
				}
			}
			j--
		} else {
			runLen = 0
		}
		i--
	}
	return false
}
// BestMatch returns the name most similar to the
// pattern, using fuzzy matching, or the empty string.
func BestMatch(pattern string, names []string) string {
	matcher := NewMatcher(pattern)

	best, highScore := "", float32(0) // minimum score is 0 (no match)
	for _, name := range names {
		// TODO: Improve scoring algorithm.
		switch sc := matcher.Score(name); {
		case sc > highScore:
			highScore = sc
			best = name
		case sc == 0:
			// Order matters in the fuzzy matching algorithm. If we find no match
			// when matching the target to the identifier, try matching the identifier
			// to the target.
			if rev := NewMatcher(name).Score(pattern); rev > highScore {
				highScore = rev
				best = name
			}
		}
	}
	return best
}
@@ -0,0 +1,294 @@
// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Benchmark results:
//
// BenchmarkMatcher-12 1000000 1615 ns/op 30.95 MB/s 0 B/op 0 allocs/op
package fuzzy_test
import (
"bytes"
"fmt"
"math"
"testing"
"golang.org/x/tools/internal/fuzzy"
)
// comparator pairs a float comparison predicate with a printable operator,
// used to state expected-score relations in table-driven tests.
type comparator struct {
	f     func(val, ref float32) bool
	descr string
}

// The three relations used by the score tests.
var (
	eq = comparator{descr: "==", f: func(val, ref float32) bool { return val == ref }}
	ge = comparator{descr: ">=", f: func(val, ref float32) bool { return val >= ref }}
	gt = comparator{descr: ">", f: func(val, ref float32) bool { return val > ref }}
)

// eval applies the predicate to val against ref.
func (c comparator) eval(val, ref float32) bool {
	return c.f(val, ref)
}

// String returns the operator text, e.g. ">=".
func (c comparator) String() string {
	return c.descr
}
// scoreTest describes one expectation: Score(candidate) must satisfy the
// embedded comparator when compared against ref.
type scoreTest struct {
	candidate string
	comparator
	ref float32
}
// matcherTests groups score expectations by pattern; each candidate's score
// is checked against ref via the embedded comparator.
var matcherTests = []struct {
	pattern string
	tests   []scoreTest
}{
	{
		pattern: "",
		tests: []scoreTest{
			{"def", eq, 1},
			{"Ab stuff c", eq, 1},
		},
	},
	{
		pattern: "abc",
		tests: []scoreTest{
			{"def", eq, 0},
			{"abd", eq, 0},
			{"abc", ge, 0},
			{"Abc", ge, 0},
			{"Ab stuff c", ge, 0},
		},
	},
	{
		pattern: "Abc",
		tests: []scoreTest{
			{"def", eq, 0},
			{"abd", eq, 0},
			{"abc", ge, 0},
			{"Abc", ge, 0},
			{"Ab stuff c", ge, 0},
		},
	},
	{
		pattern: "U",
		tests: []scoreTest{
			{"ErrUnexpectedEOF", gt, 0},
			{"ErrUnexpectedEOF.Error", eq, 0},
		},
	},
}
// TestScore checks Matcher.Score against the matcherTests fixtures.
func TestScore(t *testing.T) {
	for _, tc := range matcherTests {
		m := fuzzy.NewMatcher(tc.pattern)
		for _, sct := range tc.tests {
			sc := m.Score(sct.candidate)
			if !sct.comparator.eval(sc, sct.ref) {
				t.Errorf("m.Score(%q) = %.2g, want %s %v", sct.candidate, sc, sct.comparator, sct.ref)
			}
		}
	}
}
// compareCandidatesTestCases lists candidates in non-decreasing order of
// expected score for the given pattern.
var compareCandidatesTestCases = []struct {
	pattern           string
	orderedCandidates []string
}{
	{
		pattern: "Foo",
		orderedCandidates: []string{
			"Barfoo",
			"Faoo",
			"F_o_o",
			"FaoFooa",
			"BarFoo",
			"F__oo",
			"F_oo",
			"FooA",
			"FooBar",
			"Foo",
		},
	},
	{
		pattern: "U",
		orderedCandidates: []string{
			"ErrUnexpectedEOF.Error",
			"ErrUnexpectedEOF",
		},
	},
}
// TestCompareCandidateScores verifies that scores are monotone over each
// ordered candidate list and stay within [-1, 1].
func TestCompareCandidateScores(t *testing.T) {
	for _, tc := range compareCandidatesTestCases {
		m := fuzzy.NewMatcher(tc.pattern)

		prevScore, prevCand := float32(0), "MIN_SCORE"
		for _, cand := range tc.orderedCandidates {
			sc := m.Score(cand)

			if sc < prevScore {
				t.Errorf("%s[=%v] is scored lower than %s[=%v]", cand, sc, prevCand, prevScore)
			}
			if sc < -1 || sc > 1 {
				t.Errorf("%s score is %v; want value between [-1, 1]", cand, sc)
			}

			prevScore, prevCand = sc, cand
		}
	}
}
// fuzzyMatcherTestCases describes expected highlighting for pattern p against
// str: want encloses each matched byte range in brackets, and an empty want
// means the pattern must not match.
var fuzzyMatcherTestCases = []struct {
	p    string
	str  string
	want string
}{
	{p: "foo", str: "abc::foo", want: "abc::[foo]"},
	{p: "foo", str: "foo.foo", want: "foo.[foo]"},
	{p: "foo", str: "fo_oo.o_oo", want: "[fo]_oo.[o]_oo"},
	{p: "foo", str: "fo_oo.fo_oo", want: "fo_oo.[fo]_[o]o"},
	{p: "fo_o", str: "fo_oo.o_oo", want: "[f]o_oo.[o_o]o"},
	{p: "fOO", str: "fo_oo.o_oo", want: "[f]o_oo.[o]_[o]o"},
	{p: "tedit", str: "foo.TextEdit", want: "foo.[T]ext[Edit]"},
	{p: "TEdit", str: "foo.TextEdit", want: "foo.[T]ext[Edit]"},
	{p: "Tedit", str: "foo.TextEdit", want: "foo.[T]ext[Edit]"},
	{p: "Tedit", str: "foo.Textedit", want: "foo.[Te]xte[dit]"},
	{p: "TEdit", str: "foo.Textedit", want: ""},
	{p: "te", str: "foo.Textedit", want: "foo.[Te]xtedit"},
	{p: "ee", str: "foo.Textedit", want: ""}, // short middle of the word match
	{p: "ex", str: "foo.Textedit", want: "foo.T[ex]tedit"},
	{p: "exdi", str: "foo.Textedit", want: ""},  // short middle of the word match
	{p: "exdit", str: "foo.Textedit", want: ""}, // short middle of the word match
	{p: "extdit", str: "foo.Textedit", want: "foo.T[ext]e[dit]"},
	{p: "e", str: "foo.Textedit", want: "foo.T[e]xtedit"},
	{p: "E", str: "foo.Textedit", want: "foo.T[e]xtedit"},
	{p: "ed", str: "foo.Textedit", want: "foo.Text[ed]it"},
	{p: "edt", str: "foo.Textedit", want: ""}, // short middle of the word match
	{p: "edit", str: "foo.Textedit", want: "foo.Text[edit]"},
	{p: "edin", str: "foo.TexteditNum", want: "foo.Text[edi]t[N]um"},
	{p: "n", str: "node.GoNodeMax", want: "[n]ode.GoNodeMax"},
	{p: "N", str: "node.GoNodeMax", want: "[n]ode.GoNodeMax"},
	{p: "completio", str: "completion", want: "[completio]n"},
	{p: "completio", str: "completion.None", want: "[completio]n.None"},
}
// TestFuzzyMatcherRanges checks MatchedRanges (via highlightMatches) against
// the fuzzyMatcherTestCases fixtures.
func TestFuzzyMatcherRanges(t *testing.T) {
	for _, tc := range fuzzyMatcherTestCases {
		matcher := fuzzy.NewMatcher(tc.p)
		sc := matcher.Score(tc.str)

		switch {
		case tc.want == "":
			// Expected no match.
			if sc > 0 {
				t.Errorf("Score(%s, %s) = %v; want: <= 0", tc.p, tc.str, sc)
			}
		case sc < 0:
			t.Errorf("Score(%s, %s) = %v, want: > 0", tc.p, tc.str, sc)
		default:
			if got := highlightMatches(tc.str, matcher); got != tc.want {
				t.Errorf("highlightMatches(%s, %s) = %v, want: %v", tc.p, tc.str, got, tc.want)
			}
		}
	}
}
// scoreTestCases pins exact scores; want is the score rounded to five decimal
// places (see TestScores).
var scoreTestCases = []struct {
	p    string
	str  string
	want float64
}{
	// Score precision up to five digits. Modify if changing the score, but make sure the new values
	// are reasonable.
	{p: "abc", str: "abc", want: 1},
	{p: "abc", str: "Abc", want: 1},
	{p: "abc", str: "Abcdef", want: 1},
	{p: "strc", str: "StrCat", want: 1},
	{p: "abc_def", str: "abc_def_xyz", want: 1},
	{p: "abcdef", str: "abc_def_xyz", want: 0.91667},
	{p: "abcxyz", str: "abc_def_xyz", want: 0.91667},
	{p: "sc", str: "StrCat", want: 0.75},
	{p: "abc", str: "AbstrBasicCtor", want: 0.83333},
	{p: "foo", str: "abc::foo", want: 0.91667},
	{p: "afoo", str: "abc::foo", want: 0.9375},
	{p: "abr", str: "abc::bar", want: 0.5},
	{p: "br", str: "abc::bar", want: 0.25},
	{p: "aar", str: "abc::bar", want: 0.41667},
	{p: "edin", str: "foo.TexteditNum", want: 0.125},
	{p: "ediu", str: "foo.TexteditNum", want: 0},
	// We want the next two items to have roughly similar scores.
	{p: "up", str: "unique_ptr", want: 0.75},
	{p: "up", str: "upper_bound", want: 1},
}
// TestScores pins exact (rounded) score values against scoreTestCases.
func TestScores(t *testing.T) {
	for _, tc := range scoreTestCases {
		m := fuzzy.NewMatcher(tc.p)
		// Round to five decimal places, matching the fixtures' precision.
		got := math.Round(float64(m.Score(tc.str))*1e5) / 1e5
		if got != tc.want {
			t.Errorf("Score(%s, %s) = %v, want: %v", tc.p, tc.str, got, tc.want)
		}
	}
}
// highlightMatches renders str with each range matched by the matcher's last
// score wrapped in brackets, e.g. "foo.[T]ext[Edit]".
func highlightMatches(str string, matcher *fuzzy.Matcher) string {
	var out bytes.Buffer

	next := 0
	ranges := matcher.MatchedRanges()
	for i := 0; i+1 < len(ranges); i += 2 {
		start, end := ranges[i], ranges[i+1]
		fmt.Fprintf(&out, "%s[%s]", str[next:start], str[start:end])
		next = end
	}
	out.WriteString(str[next:])
	return out.String()
}
// BenchmarkMatcher measures Matcher.Score throughput over a small set of
// candidates, constructing the matcher once outside the timed loop.
func BenchmarkMatcher(b *testing.B) {
	pattern := "Foo"
	candidates := []string{
		"F_o_o",
		"Barfoo",
		"Faoo",
		"F__oo",
		"F_oo",
		"FaoFooa",
		"BarFoo",
		"FooA",
		"FooBar",
		"Foo",
	}

	matcher := fuzzy.NewMatcher(pattern)

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		for _, c := range candidates {
			matcher.Score(c)
		}
	}
	// Report throughput in candidate bytes per iteration.
	var numBytes int
	for _, c := range candidates {
		numBytes += len(c)
	}
	b.SetBytes(int64(numBytes))
}
@@ -0,0 +1,39 @@
// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package fuzzy_test
import (
"testing"
. "golang.org/x/tools/internal/fuzzy"
)
// BenchmarkSelf_Matcher benchmarks Matcher over identifiers gathered by
// collectIdentifiers and patterns from generatePatterns (both defined
// elsewhere in this package). Note that matcher construction is inside the
// timed loop, so it is included in the measurement.
func BenchmarkSelf_Matcher(b *testing.B) {
	idents := collectIdentifiers(b)
	patterns := generatePatterns()
	for i := 0; i < b.N; i++ {
		for _, pattern := range patterns {
			sm := NewMatcher(pattern)
			for _, ident := range idents {
				_ = sm.Score(ident)
			}
		}
	}
}
// BenchmarkSelf_SymbolMatcher is the SymbolMatcher counterpart of
// BenchmarkSelf_Matcher, using the same identifier and pattern sources so the
// two can be compared directly. Matcher construction is inside the timed loop.
func BenchmarkSelf_SymbolMatcher(b *testing.B) {
	idents := collectIdentifiers(b)
	patterns := generatePatterns()
	for i := 0; i < b.N; i++ {
		for _, pattern := range patterns {
			sm := NewSymbolMatcher(pattern)
			for _, ident := range idents {
				_, _ = sm.Match([]string{ident})
			}
		}
	}
}
@@ -0,0 +1,309 @@
// Copyright 2021 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package fuzzy
import (
"bytes"
"fmt"
"log"
"unicode"
)
// SymbolMatcher implements a fuzzy matching algorithm optimized for Go symbols
// of the form:
//
// example.com/path/to/package.object.field
//
// Knowing that we are matching symbols like this allows us to make the
// following optimizations:
// - We can incorporate right-to-left relevance directly into the score
// calculation.
// - We can match from right to left, discarding leading bytes if the input is
// too long.
// - We just take the right-most match without losing too much precision. This
// allows us to use an O(n) algorithm.
// - We can operate directly on chunked strings; in many cases we will
// be storing the package path and/or package name separately from the
// symbol or identifiers, so doing this avoids allocating strings.
// - We can return the index of the right-most match, allowing us to trim
// irrelevant qualification.
type SymbolMatcher struct {
	// Using buffers of length 256 is both a reasonable size for most qualified
	// symbols, and makes it easy to avoid bounds checks by using uint8 indexes.
	pattern     [256]rune // lower-cased pattern runes; only the first patternLen entries are valid
	patternLen  uint8
	inputBuffer [256]rune   // avoid allocating when considering chunks
	roles       [256]uint32 // which roles does a rune play (word start, etc.)
	segments    [256]uint8  // how many segments from the right is each rune
}
// Rune roles, stored as a bitmask per input rune in SymbolMatcher.roles.
const (
	segmentStart uint32 = 1 << iota // input rune starts a segment (i.e. follows '/' or '.')
	wordStart                       // input rune starts a word, per camel-case naming rules
	separator                       // input rune is a separator ('/' or '.')
	upper                           // input rune is an upper case letter
)
// NewSymbolMatcher creates a SymbolMatcher that may be used to match the given
// search pattern.
//
// Currently this matcher only accepts case-insensitive fuzzy patterns.
//
// An empty pattern matches no input.
func NewSymbolMatcher(pattern string) *SymbolMatcher {
	m := &SymbolMatcher{}
	for _, p := range pattern {
		m.pattern[m.patternLen] = unicode.ToLower(p)
		m.patternLen++
		if m.patternLen == 255 || int(m.patternLen) == len(pattern) {
			// break at 255 so that we can represent patternLen with a uint8.
			// NOTE(review): the second condition compares a rune count with a
			// byte length, so it only triggers for all-ASCII patterns; for
			// non-ASCII patterns the range loop simply runs to completion.
			break
		}
	}
	return m
}
// Match searches for the right-most match of the search pattern within the
// symbol represented by concatenating the given chunks.
//
// If a match is found, the first result holds the absolute byte offset within
// all chunks for the start of the symbol. In other words, the index of the
// match within strings.Join(chunks, "").
//
// The second return value will be the score of the match, which is always
// between 0 and 1, inclusive. A score of 0 indicates no match.
//
// If no match is found, Match returns (-1, 0).
func (m *SymbolMatcher) Match(chunks []string) (int, float64) {
// Explicit behavior for an empty pattern.
//
// As a minor optimization, this also avoids nilness checks later on, since
// the compiler can prove that m != nil.
if m.patternLen == 0 {
return -1, 0
}
// Matching implements a heavily optimized linear scoring algorithm on the
// input. This is not guaranteed to produce the highest score, but works well
// enough, particularly due to the right-to-left significance of qualified
// symbols.
//
// Matching proceeds in three passes through the input:
// - The first pass populates the input buffer and collects rune roles.
// - The second pass proceeds right-to-left to find the right-most match.
// - The third pass proceeds left-to-right from the start of the right-most
// match, to find the most *compact* match, and computes the score of this
// match.
//
// See below for more details of each pass, as well as the scoring algorithm.
// First pass: populate the input buffer out of the provided chunks
// (lower-casing in the process), and collect rune roles.
//
// We could also check for a forward match here, but since we'd have to write
// the entire input anyway this has negligible impact on performance.
var (
inputLen = uint8(0)
modifiers = wordStart | segmentStart
)
input:
for _, chunk := range chunks {
for _, r := range chunk {
if r == '.' || r == '/' {
modifiers |= separator
}
// optimization: avoid calls to unicode.ToLower, which can't be inlined.
l := r
if r <= unicode.MaxASCII {
if 'A' <= r && r <= 'Z' {
l = r + 'a' - 'A'
}
} else {
l = unicode.ToLower(r)
}
if l != r {
modifiers |= upper
// If the current rune is capitalized *and the preceding rune was not*,
// mark this as a word start. This avoids spuriously high ranking of
// non-camelcase naming schemas, such as the
// yaml_PARSE_FLOW_SEQUENCE_ENTRY_MAPPING_END_STATE example of
// golang/go#60201.
if inputLen == 0 || m.roles[inputLen-1]&upper == 0 {
modifiers |= wordStart
}
}
m.inputBuffer[inputLen] = l
m.roles[inputLen] = modifiers
inputLen++
if m.roles[inputLen-1]&separator != 0 {
modifiers = wordStart | segmentStart
} else {
modifiers = 0
}
// TODO: we should prefer the right-most input if it overflows, rather
// than the left-most as we're doing here.
if inputLen == 255 {
break input
}
}
}
// Second pass: find the right-most match, and count segments from the
// right.
var (
pi = uint8(m.patternLen - 1) // pattern index
p = m.pattern[pi] // pattern rune
start = -1 // start offset of match
rseg = uint8(0) // effective "depth" from the right of the current rune in consideration
)
const maxSeg = 3 // maximum number of segments from the right to count, for scoring purposes.
for ii := inputLen - 1; ; ii-- {
r := m.inputBuffer[ii]
if rseg < maxSeg && m.roles[ii]&separator != 0 {
rseg++
}
m.segments[ii] = rseg
if p == r {
if pi == 0 {
// TODO(rfindley): BUG: the docstring for Match says that it returns an
// absolute byte offset, but clearly it is returning a rune offset here.
start = int(ii)
break
}
pi--
p = m.pattern[pi]
}
// Don't check ii >= 0 in the loop condition: ii is a uint8.
if ii == 0 {
break
}
}
if start < 0 {
// no match: skip scoring
return -1, 0
}
// Third pass: find the shortest match and compute the score.
// Score is the average score for each rune.
//
// A rune score is the multiple of:
// 1. The base score, which is 1.0 if the rune starts a segment, 0.9 if the
// rune starts a mid-segment word, else 0.6.
//
// Runes preceded by a matching rune are treated the same as the start
// of a mid-segment word (with a 0.9 score), so that sequential or exact
// matches are preferred. We call this a sequential bonus.
//
// For the final rune match, this sequential bonus is reduced to 0.8 if
// the next rune in the input is a mid-segment word, or 0.7 if the next
// rune in the input is not a word or segment start. This ensures that
// we favor whole-word or whole-segment matches over prefix matches.
//
// 2. 1.0 if the rune is part of the last segment, otherwise
// 1.0-0.1*<segments from the right>, with a max segment count of 3.
// Notably 1.0-0.1*3 = 0.7 > 0.6, so that foo/_/_/_/_ (a match very
// early in a qualified symbol name) still scores higher than _f_o_o_ (a
// completely split match).
//
// This is a naive algorithm, but it is fast. There's lots of prior art here
// that could be leveraged. For example, we could explicitly consider
// rune distance, and exact matches of words or segments.
//
// Also note that this might not actually find the highest scoring match, as
// doing so could require a non-linear algorithm, depending on how the score
// is calculated.
// debugging support
const debug = false // enable to log debugging information
var (
runeScores []float64
runeIdxs []int
)
pi = 0
p = m.pattern[pi]
const (
segStartScore = 1.0 // base score of runes starting a segment
wordScore = 0.9 // base score of runes starting or continuing a word
noStreak = 0.6
perSegment = 0.1 // we count at most 3 segments above
)
totScore := 0.0
lastMatch := uint8(255)
for ii := uint8(start); ii < inputLen; ii++ {
r := m.inputBuffer[ii]
if r == p {
pi++
finalRune := pi >= m.patternLen
p = m.pattern[pi]
baseScore := noStreak
// Calculate the sequence bonus based on preceding matches.
//
// We do this first as it is overridden by role scoring below.
if lastMatch == ii-1 {
baseScore = wordScore
// Reduce the sequence bonus for the final rune of the pattern based on
// whether it borders a new segment or word.
if finalRune {
switch {
case ii == inputLen-1 || m.roles[ii+1]&separator != 0:
// Full segment: no reduction
case m.roles[ii+1]&wordStart != 0:
baseScore = wordScore - 0.1
default:
baseScore = wordScore - 0.2
}
}
}
lastMatch = ii
// Calculate the rune's role score. If the rune starts a segment or word,
// this overrides the sequence score, as the rune starts a new sequence.
switch {
case m.roles[ii]&segmentStart != 0:
baseScore = segStartScore
case m.roles[ii]&wordStart != 0:
baseScore = wordScore
}
// Apply the segment-depth penalty (segments from the right).
runeScore := baseScore * (1.0 - float64(m.segments[ii])*perSegment)
if debug {
runeScores = append(runeScores, runeScore)
runeIdxs = append(runeIdxs, int(ii))
}
totScore += runeScore
if finalRune {
break
}
}
}
if debug {
// Format rune roles and scores in line:
// fo[o:.52].[b:1]a[r:.6]
var summary bytes.Buffer
last := 0
for i, idx := range runeIdxs {
summary.WriteString(string(m.inputBuffer[last:idx])) // encode runes
fmt.Fprintf(&summary, "[%s:%.2g]", string(m.inputBuffer[idx]), runeScores[i])
last = idx + 1
}
summary.WriteString(string(m.inputBuffer[last:inputLen])) // encode runes
log.Println(summary.String())
}
return start, totScore / float64(m.patternLen)
}
@@ -0,0 +1,252 @@
// Copyright 2021 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package fuzzy_test
import (
"go/ast"
"go/token"
"sort"
"testing"
"golang.org/x/tools/go/packages"
. "golang.org/x/tools/internal/fuzzy"
)
// TestSymbolMatchIndex checks the match offset reported by Match for a
// variety of pattern/input pairs, including empty patterns, empty inputs,
// case-insensitive matches, and non-matches (offset -1).
func TestSymbolMatchIndex(t *testing.T) {
	cases := []struct {
		pattern, input string
		wantOffset     int
	}{
		{"test", "foo.TestFoo", 4},
		{"test", "test", 0},
		{"test", "Test", 0},
		{"test", "est", -1},
		{"t", "shortest", 7},
		{"", "foo", -1},
		{"", string([]rune{0}), -1}, // verify that we don't default to an empty pattern.
		{"anything", "", -1},
	}
	for _, tc := range cases {
		m := NewSymbolMatcher(tc.pattern)
		gotOffset, _ := m.Match([]string{tc.input})
		if gotOffset != tc.wantOffset {
			t.Errorf("NewSymbolMatcher(%q).Match(%q) = %v, _, want %v, _", tc.pattern, tc.input, gotOffset, tc.wantOffset)
		}
	}
}
// TestSymbolRanking verifies relative ranking: for each query, the listed
// candidate symbols must score in strictly ascending order.
func TestSymbolRanking(t *testing.T) {
	// query -> symbols to match, in ascending order of score
	queryRanks := map[string][]string{
		"test": {
			"this.is.better.than.most",
			"test.foo.bar",
			"thebest",
			"atest",
			"test.foo",
			"testage",
			"tTest",
			"foo.test",
		},
		"parseside": { // golang/go#60201
			"yaml_PARSE_FLOW_SEQUENCE_ENTRY_MAPPING_END_STATE",
			"parseContext.parse_sidebyside",
		},
		"cvb": {
			"filecache_test.testIPCValueB",
			"cover.Boundary",
		},
		"dho": {
			"gocommand.DebugHangingGoCommands",
			"protocol.DocumentHighlightOptions",
		},
		"flg": {
			"completion.FALLTHROUGH",
			"main.flagGoCmd",
		},
		"fvi": {
			"godoc.fileIndexVersion",
			"macho.FlagSubsectionsViaSymbols",
		},
	}
	for query, candidates := range queryRanks {
		t.Run(query, func(t *testing.T) {
			m := NewSymbolMatcher(query)
			lowerBound := 0.0
			for _, candidate := range candidates {
				_, score := m.Match([]string{candidate})
				t.Logf("Match(%q) = %v", candidate, score)
				if score <= lowerBound {
					t.Errorf("Match(%q) = _, %v, want > %v", candidate, score, lowerBound)
				}
				lowerBound = score
			}
		})
	}
}
// TestMatcherSimilarities cross-checks the plain fuzzy matcher against the
// symbol matcher over identifiers harvested from x/tools, reporting how often
// their top-ranked candidates coincide.
func TestMatcherSimilarities(t *testing.T) {
	// This test compares the fuzzy matcher with the symbol matcher on a corpus
	// of qualified identifiers extracted from x/tools.
	//
	// These two matchers are not expected to agree, but inspecting differences
	// can be useful for finding interesting ranking edge cases.
	t.Skip("unskip this test to compare matchers")

	corpus := collectIdentifiers(t)
	t.Logf("collected %d unique identifiers", len(corpus))

	// TODO: use go1.21 slices.MaxFunc.
	// best returns the highest-scoring identifier under the given scorer.
	best := func(scoreOf func(string) float64) string {
		winner, winnerScore := "", 0.0
		for _, ident := range corpus {
			if s := scoreOf(ident); s > winnerScore {
				winner, winnerScore = ident, s
			}
		}
		return winner
	}

	var agreed, total, bad int
	for _, pattern := range generatePatterns() {
		total++

		fm := NewMatcher(pattern)
		topFuzzy := best(func(input string) float64 {
			return float64(fm.Score(input))
		})

		sm := NewSymbolMatcher(pattern)
		topSymbol := best(func(input string) float64 {
			_, score := sm.Match([]string{input})
			return score
		})

		switch {
		case topFuzzy == "" && topSymbol != "":
			if false {
				// The fuzzy matcher has a bug where it misses some matches; for this
				// test we only care about the symbol matcher.
				t.Logf("%q matched %q but no fuzzy match", pattern, topSymbol)
			}
			total--
			bad++
		case topFuzzy != "" && topSymbol == "":
			t.Fatalf("%q matched %q but no symbol match", pattern, topFuzzy)
		case topFuzzy == topSymbol:
			agreed++
		default:
			// Enable this log to see mismatches.
			if false {
				t.Logf("mismatch for %q: fuzzy: %q, symbol: %q", pattern, topFuzzy, topSymbol)
			}
		}
	}
	t.Logf("fuzzy matchers agreed on %d out of %d queries (%d bad)", agreed, total, bad)
}
// collectIdentifiers loads all packages under golang.org/x/tools (including
// tests) and returns a sorted, de-duplicated list of qualified identifiers of
// the form "pkgname.Name" for every type, const, and var declaration.
// Function declarations and imports are intentionally excluded.
func collectIdentifiers(tb testing.TB) []string {
	cfg := &packages.Config{
		Mode:  packages.NeedName | packages.NeedSyntax | packages.NeedFiles,
		Tests: true,
	}
	pkgs, err := packages.Load(cfg, "golang.org/x/tools/...")
	if err != nil {
		tb.Fatal(err)
	}
	// Use a set to de-duplicate identifiers that appear in multiple package
	// variants (e.g. a package and its test variant).
	uniqueIdents := make(map[string]bool)
	for _, pkg := range pkgs {
		for _, f := range pkg.Syntax {
			for _, decl := range f.Decls {
				switch decl := decl.(type) {
				case *ast.GenDecl:
					for _, spec := range decl.Specs {
						switch decl.Tok {
						case token.IMPORT:
							// Skip imports: they are not symbols.
						case token.TYPE:
							name := spec.(*ast.TypeSpec).Name.Name
							qualified := pkg.Name + "." + name
							uniqueIdents[qualified] = true
						case token.CONST, token.VAR:
							for _, n := range spec.(*ast.ValueSpec).Names {
								qualified := pkg.Name + "." + n.Name
								uniqueIdents[qualified] = true
							}
						}
					}
				}
			}
		}
	}
	var idents []string
	for k := range uniqueIdents {
		idents = append(idents, k)
	}
	sort.Strings(idents)
	return idents
}
// generatePatterns returns every three-letter lower-case ASCII pattern
// ("aaa" through "zzz"), in lexicographic order.
func generatePatterns() []string {
	const letters = "abcdefghijklmnopqrstuvwxyz"
	patterns := make([]string, 0, len(letters)*len(letters)*len(letters))
	for _, a := range letters {
		for _, b := range letters {
			for _, c := range letters {
				patterns = append(patterns, string([]rune{a, b, c}))
			}
		}
	}
	return patterns
}
// Test that we strongly prefer exact matches.
//
// In golang/go#60027, we preferred "Runner" for the query "rune" over several
// results containing the word "rune" exactly. Following this observation,
// scoring was tweaked to more strongly emphasize sequential characters and
// exact matches.
func TestSymbolRanking_Issue60027(t *testing.T) {
	matcher := NewSymbolMatcher("rune")

	// symbols to match, in ascending order of ranking.
	symbols := []string{
		"Runner",
		"singleRuneParam",
		"Config.ifsRune",
		"Parser.rune",
	}
	prev := 0.0
	for _, sym := range symbols {
		_, score := matcher.Match([]string{sym})
		t.Logf("Match(%q) = %v", sym, score)
		// Unlike TestSymbolRanking, ties are tolerated here: only a strict
		// decrease is a failure, so the message says ">=", not ">".
		if score < prev {
			t.Errorf("Match(%q) = _, %v, want >= %v", sym, score, prev)
		}
		prev = score
	}
}
// TestChunkedMatch verifies that Match is insensitive to how the input is
// split into chunks: matching "test" against any chunking of "test" must
// produce the same offset and score as matching the unchunked string.
func TestChunkedMatch(t *testing.T) {
	matcher := NewSymbolMatcher("test")
	// The score for the unchunked input is the reference value; chunked
	// inputs must reproduce it exactly.
	_, want := matcher.Match([]string{"test"})
	chunked := [][]string{
		{"", "test"},
		{"test", ""},
		{"te", "st"},
	}

	for _, chunks := range chunked {
		offset, score := matcher.Match(chunks)
		if offset != 0 || score != want {
			// Report the actual expected score rather than a hard-coded 1.0,
			// since the reference score is computed above.
			t.Errorf("Match(%v) = %v, %v, want 0, %v", chunks, offset, score, want)
		}
	}
}