whatcanGOwrong
This commit is contained in:
+183
@@ -0,0 +1,183 @@
|
||||
// Copyright 2019 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package fuzzy
|
||||
|
||||
import (
|
||||
"unicode"
|
||||
)
|
||||
|
||||
// RuneRole specifies the role of a rune in the context of an input.
// The values are dense and start at zero so they can be used to index
// small lookup tables (see rolesString in the tests).
type RuneRole byte

const (
	// RNone specifies a rune without any role in the input (i.e., whitespace/non-ASCII).
	RNone RuneRole = iota
	// RSep specifies a rune with the role of segment separator.
	RSep
	// RTail specifies a rune which is a lower-case tail in a word in the input.
	RTail
	// RUCTail specifies a rune which is an upper-case tail in a word in the input.
	RUCTail
	// RHead specifies a rune which is the first character in a word in the input.
	RHead
)
|
||||
|
||||
// RuneRoles detects the roles of each byte rune in an input string and stores it in the output
// slice. The rune role depends on the input type. Stops when it parsed all the runes in the string
// or when it filled the output. If output is nil, then it gets created.
func RuneRoles(candidate []byte, reuse []RuneRole) []RuneRole {
	var output []RuneRole
	if cap(reuse) < len(candidate) {
		output = make([]RuneRole, 0, len(candidate))
	} else {
		output = reuse[:0]
	}

	// prev/prev2 hold the runeType of the previous one/two bytes; they drive
	// the head/tail decisions below.
	prev, prev2 := rtNone, rtNone
	for i := 0; i < len(candidate); i++ {
		r := rune(candidate[i])

		role := RNone

		// Classify the byte via the ASCII lookup table rt; bytes above
		// MaxASCII keep the rtLower default (treated as word tails).
		curr := rtLower
		if candidate[i] <= unicode.MaxASCII {
			curr = runeType(rt[candidate[i]] - '0')
		}

		if curr == rtLower {
			// A lower-case byte starts a word after a separator/none,
			// otherwise it continues one.
			if prev == rtNone || prev == rtPunct {
				role = RHead
			} else {
				role = RTail
			}
		} else if curr == rtUpper {
			role = RHead

			if prev == rtUpper {
				// This and previous characters are both upper case.

				if i+1 == len(candidate) {
					// This is last character, previous was also uppercase -> this is UCTail
					// i.e., (current char is C): aBC / BC / ABC
					role = RUCTail
				}
			}
		} else if curr == rtPunct {
			// Only '.' and ':' act as segment separators; other punctuation
			// (e.g. '_') keeps RNone.
			switch r {
			case '.', ':':
				role = RSep
			}
		}
		if curr != rtLower {
			// Retroactive fix-up: if the previous byte was marked RHead but a
			// non-lower byte follows an upper-case run, it cannot start a word.
			if i > 1 && output[i-1] == RHead && prev2 == rtUpper && (output[i-2] == RHead || output[i-2] == RUCTail) {
				// The previous two characters were uppercase. The current one is not a lower case, so the
				// previous one can't be a HEAD. Make it a UCTail.
				// i.e., (last char is current char - B must be a UCTail): ABC / ZABC / AB.
				output[i-1] = RUCTail
			}
		}

		output = append(output, role)
		prev2 = prev
		prev = curr
	}
	return output
}
|
||||
|
||||
// runeType classifies a single ASCII byte for role detection.
type runeType byte

const (
	rtNone  runeType = iota // non-printable, space, or other non-word byte
	rtPunct                 // punctuation ('.', '/', ':', ...)
	rtLower                 // lower-case letter (digits are classified here too; see rt)
	rtUpper                 // upper-case letter
)

// rt is a 128-entry lookup table mapping an ASCII code point to its runeType,
// encoded as the digit characters '0'..'3' (subtract '0' to decode).
// Notably, '.' and '/' map to rtPunct and the digits '0'-'9' map to rtLower,
// so digits behave as word tails.
const rt = "00000000000000000000000000000000000000000000001122222222221000000333333333333333333333333330000002222222222222222222222222200000"
|
||||
|
||||
// LastSegment returns the substring representing the last segment from the input, where each
|
||||
// byte has an associated RuneRole in the roles slice. This makes sense only for inputs of Symbol
|
||||
// or Filename type.
|
||||
func LastSegment(input string, roles []RuneRole) string {
|
||||
// Exclude ending separators.
|
||||
end := len(input) - 1
|
||||
for end >= 0 && roles[end] == RSep {
|
||||
end--
|
||||
}
|
||||
if end < 0 {
|
||||
return ""
|
||||
}
|
||||
|
||||
start := end - 1
|
||||
for start >= 0 && roles[start] != RSep {
|
||||
start--
|
||||
}
|
||||
|
||||
return input[start+1 : end+1]
|
||||
}
|
||||
|
||||
// fromChunks copies string chunks into the given buffer, stopping once the
// buffer's capacity is reached, and returns the filled prefix.
func fromChunks(chunks []string, buffer []byte) []byte {
	n, max := 0, cap(buffer)
	for _, chunk := range chunks {
		for j := 0; j < len(chunk) && n < max; j++ {
			buffer[n] = chunk[j]
			n++
		}
	}
	return buffer[:n]
}
|
||||
|
||||
// toLower transforms the input string to lower case, which is stored in the output byte slice.
// The lower casing considers only ASCII values - non ASCII values are left unmodified.
// Stops when parsed all input or when it filled the output slice. If output is nil, then it gets
// created.
func toLower(input []byte, reuse []byte) []byte {
	var output []byte
	if cap(reuse) < len(input) {
		output = make([]byte, len(input))
	} else {
		// Re-slice to the required length so the element writes below stay in
		// bounds even when len(reuse) < len(input) <= cap(reuse). (Previously
		// this used reuse as-is and would panic on a short-but-roomy slice.)
		output = reuse[:len(input)]
	}

	for i := 0; i < len(input); i++ {
		r := rune(input[i])
		if input[i] <= unicode.MaxASCII {
			if 'A' <= r && r <= 'Z' {
				r += 'a' - 'A'
			}
		}
		output[i] = byte(r)
	}
	return output
}
|
||||
|
||||
// WordConsumer defines a consumer for a word delimited by the [start,end) byte offsets in an input
|
||||
// (start is inclusive, end is exclusive).
|
||||
type WordConsumer func(start, end int)
|
||||
|
||||
// Words find word delimiters in an input based on its bytes' mappings to rune roles. The offset
|
||||
// delimiters for each word are fed to the provided consumer function.
|
||||
func Words(roles []RuneRole, consume WordConsumer) {
|
||||
var wordStart int
|
||||
for i, r := range roles {
|
||||
switch r {
|
||||
case RUCTail, RTail:
|
||||
case RHead, RNone, RSep:
|
||||
if i != wordStart {
|
||||
consume(wordStart, i)
|
||||
}
|
||||
wordStart = i
|
||||
if r != RHead {
|
||||
// Skip this character.
|
||||
wordStart = i + 1
|
||||
}
|
||||
}
|
||||
}
|
||||
if wordStart != len(roles) {
|
||||
consume(wordStart, len(roles))
|
||||
}
|
||||
}
|
||||
+141
@@ -0,0 +1,141 @@
|
||||
// Copyright 2019 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package fuzzy_test
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"sort"
|
||||
"testing"
|
||||
|
||||
"golang.org/x/tools/internal/fuzzy"
|
||||
)
|
||||
|
||||
// rolesTests maps inputs to expected role strings in the compact encoding of
// rolesString: 'C' = RHead, 'c' = RTail, 'u' = RUCTail, '/' = RSep, ' ' = RNone.
var rolesTests = []struct {
	str  string
	want string
}{
	{str: "abc::def::goo", want: "Ccc//Ccc//Ccc"},
	{str: "proto::Message", want: "Ccccc//Ccccccc"},
	{str: "AbstractSWTFactory", want: "CcccccccCuuCcccccc"},
	{str: "Abs012", want: "Cccccc"},
	{str: "/", want: " "},
	{str: "fOO", want: "CCu"},
	{str: "fo_oo.o_oo", want: "Cc Cc/C Cc"},
}

// rolesString renders a role slice as one character per role by indexing
// " /cuC" with the numeric role value.
func rolesString(roles []fuzzy.RuneRole) string {
	var buf bytes.Buffer
	for _, r := range roles {
		buf.WriteByte(" /cuC"[int(r)])
	}
	return buf.String()
}

// TestRoles checks fuzzy.RuneRoles against the expected encodings above.
func TestRoles(t *testing.T) {
	for _, tc := range rolesTests {
		gotRoles := make([]fuzzy.RuneRole, len(tc.str))
		fuzzy.RuneRoles([]byte(tc.str), gotRoles)
		got := rolesString(gotRoles)
		if got != tc.want {
			t.Errorf("roles(%s) = %v; want %v", tc.str, got, tc.want)
		}
	}
}
|
||||
|
||||
// wordSplitTests lists inputs and the words fuzzy.Words is expected to split
// them into (compared order-insensitively via diffStringLists).
var wordSplitTests = []struct {
	input string
	want  []string
}{
	{
		input: "foo bar baz",
		want:  []string{"foo", "bar", "baz"},
	},
	{
		input: "fooBarBaz",
		want:  []string{"foo", "Bar", "Baz"},
	},
	{
		input: "FOOBarBAZ",
		want:  []string{"FOO", "Bar", "BAZ"},
	},
	{
		input: "foo123_bar2Baz3",
		want:  []string{"foo123", "bar2", "Baz3"},
	},
}

// TestWordSplit feeds each input's rune roles to fuzzy.Words and collects the
// reported [start, end) ranges as substrings.
func TestWordSplit(t *testing.T) {
	for _, tc := range wordSplitTests {
		roles := fuzzy.RuneRoles([]byte(tc.input), nil)

		var got []string
		consumer := func(i, j int) {
			got = append(got, tc.input[i:j])
		}
		fuzzy.Words(roles, consumer)

		if eq := diffStringLists(tc.want, got); !eq {
			t.Errorf("input %v: (want %v -> got %v)", tc.input, tc.want, got)
		}
	}
}
|
||||
|
||||
// diffStringLists reports whether a and b contain the same strings, ignoring
// order. It does not mutate its arguments: the slices are cloned before
// sorting (the previous version sorted the caller's slices in place).
func diffStringLists(a, b []string) bool {
	if len(a) != len(b) {
		return false
	}
	// Sort copies so the caller's slices are left untouched.
	as := append([]string(nil), a...)
	bs := append([]string(nil), b...)
	sort.Strings(as)
	sort.Strings(bs)
	for i := range as {
		if as[i] != bs[i] {
			return false
		}
	}
	return true
}
|
||||
|
||||
// lastSegmentSplitTests maps inputs to the substring fuzzy.LastSegment is
// expected to return (everything after the final separator, with trailing
// separators trimmed).
var lastSegmentSplitTests = []struct {
	str  string
	want string
}{
	{
		str:  "identifier",
		want: "identifier",
	},
	{
		str:  "two_words",
		want: "two_words",
	},
	{
		str:  "first::second",
		want: "second",
	},
	{
		str:  "foo.bar.FOOBar_buz123_test",
		want: "FOOBar_buz123_test",
	},
}

// TestLastSegment checks fuzzy.LastSegment over the table above, computing
// the rune roles first as the function requires.
func TestLastSegment(t *testing.T) {
	for _, tc := range lastSegmentSplitTests {
		roles := fuzzy.RuneRoles([]byte(tc.str), nil)

		got := fuzzy.LastSegment(tc.str, roles)

		if got != tc.want {
			t.Errorf("str %v: want %v; got %v", tc.str, tc.want, got)
		}
	}
}
|
||||
|
||||
func BenchmarkRoles(b *testing.B) {
|
||||
str := "AbstractSWTFactory"
|
||||
out := make([]fuzzy.RuneRole, len(str))
|
||||
|
||||
for i := 0; i < b.N; i++ {
|
||||
fuzzy.RuneRoles([]byte(str), out)
|
||||
}
|
||||
b.SetBytes(int64(len(str)))
|
||||
}
|
||||
+434
@@ -0,0 +1,434 @@
|
||||
// Copyright 2019 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package fuzzy implements a fuzzy matching algorithm.
|
||||
package fuzzy
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
)
|
||||
|
||||
const (
	// MaxInputSize is the maximum size of the input scored against the fuzzy matcher. Longer inputs
	// will be truncated to this size.
	// Together with MaxPatternSize it bounds the fixed-size DP table in Matcher.
	MaxInputSize = 127
	// MaxPatternSize is the maximum size of the pattern used to construct the fuzzy matcher. Longer
	// inputs are truncated to this size.
	MaxPatternSize = 63
)
|
||||
|
||||
// scoreVal packs a score together with a one-bit backtracking marker
// ("prevK": which DP layer the score came from) into a single int.
// The score occupies the high bits; the marker is the low bit.
type scoreVal int

// val extracts the score component (arithmetic shift preserves sign).
func (s scoreVal) val() int {
	return int(s) >> 1
}

// prevK extracts the backtracking bit (0 or 1).
func (s scoreVal) prevK() int {
	return int(s) & 1
}

// score packs a value and a prevK bit (must be 0 or 1) into a scoreVal.
func score(v int, k int /*0 or 1*/) scoreVal {
	return scoreVal(v<<1 + k)
}
|
||||
|
||||
// Matcher implements a fuzzy matching algorithm for scoring candidates against a pattern.
// The matcher does not support parallel usage.
type Matcher struct {
	pattern       string
	patternLower  []byte // lower-case version of the pattern
	patternShort  []byte // first characters of the pattern
	caseSensitive bool   // set if the pattern is mix-cased

	patternRoles []RuneRole // the role of each character in the pattern
	roles        []RuneRole // the role of each character in the tested string

	// scores is the dynamic-programming table: scores[i][j][k] holds the best
	// score for matching the first j pattern bytes against the first i
	// candidate bytes, where k records whether candidate byte i was matched
	// (1) or skipped (0). Sized by the Max* limits so it never reallocates.
	scores [MaxInputSize + 1][MaxPatternSize + 1][2]scoreVal

	// scoreScale normalizes raw DP scores into [0, 1]; set in NewMatcher.
	scoreScale float32

	lastCandidateLen     int // in bytes
	lastCandidateMatched bool

	// Reusable buffers to avoid allocating for every candidate.
	// - inputBuf stores the concatenated input chunks
	// - lowerBuf stores the last candidate in lower-case
	// - rolesBuf stores the calculated roles for each rune in the last
	//   candidate.
	inputBuf [MaxInputSize]byte
	lowerBuf [MaxInputSize]byte
	rolesBuf [MaxInputSize]RuneRole
}
|
||||
|
||||
func (m *Matcher) bestK(i, j int) int {
|
||||
if m.scores[i][j][0].val() < m.scores[i][j][1].val() {
|
||||
return 1
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
// NewMatcher returns a new fuzzy matcher for scoring candidates against the provided pattern.
func NewMatcher(pattern string) *Matcher {
	// Truncate overly long patterns so they fit the fixed-size score table.
	if len(pattern) > MaxPatternSize {
		pattern = pattern[:MaxPatternSize]
	}

	m := &Matcher{
		pattern:      pattern,
		patternLower: toLower([]byte(pattern), nil),
	}

	// The pattern is treated as case-sensitive if lower-casing changed any
	// byte, i.e. it contains at least one upper-case ASCII letter.
	for i, c := range m.patternLower {
		if pattern[i] != c {
			m.caseSensitive = true
			break
		}
	}

	// patternShort (at most the first 3 lower-cased bytes) is used by the
	// scoring code for the mid-word first-character match exception.
	if len(pattern) > 3 {
		m.patternShort = m.patternLower[:3]
	} else {
		m.patternShort = m.patternLower
	}

	m.patternRoles = RuneRoles([]byte(pattern), nil)

	// Normalization factor: a perfect match earns maxCharScore per pattern
	// character, which scales to a final score of 1.
	if len(pattern) > 0 {
		maxCharScore := 4
		m.scoreScale = 1 / float32(maxCharScore*len(pattern))
	}

	return m
}
|
||||
|
||||
// Score returns the score returned by matching the candidate to the pattern.
|
||||
// This is not designed for parallel use. Multiple candidates must be scored sequentially.
|
||||
// Returns a score between 0 and 1 (0 - no match, 1 - perfect match).
|
||||
func (m *Matcher) Score(candidate string) float32 {
|
||||
return m.ScoreChunks([]string{candidate})
|
||||
}
|
||||
|
||||
// ScoreChunks scores the candidate formed by concatenating chunks against the
// pattern, returning a value between 0 and 1 (0 - no match, 1 - perfect
// match). Not safe for parallel use; it mutates the matcher's reusable
// buffers and last-candidate state.
func (m *Matcher) ScoreChunks(chunks []string) float32 {
	// Flatten the chunks into the reusable input buffer and truncate to the
	// supported maximum size.
	candidate := fromChunks(chunks, m.inputBuf[:])
	if len(candidate) > MaxInputSize {
		candidate = candidate[:MaxInputSize]
	}
	lower := toLower(candidate, m.lowerBuf[:])
	m.lastCandidateLen = len(candidate)

	if len(m.pattern) == 0 {
		// Empty patterns perfectly match candidates.
		return 1
	}

	if m.match(candidate, lower) {
		sc := m.computeScore(candidate, lower)
		if sc > minScore/2 && !m.poorMatch() {
			m.lastCandidateMatched = true
			if len(m.pattern) == len(candidate) {
				// Perfect match.
				return 1
			}

			// Clamp negative raw scores to 0, scale into [0, 1], and cap at 1.
			if sc < 0 {
				sc = 0
			}
			normalizedScore := float32(sc) * m.scoreScale
			if normalizedScore > 1 {
				normalizedScore = 1
			}

			return normalizedScore
		}
	}

	m.lastCandidateMatched = false
	return 0
}
|
||||
|
||||
// minScore marks unreachable/invalid DP cells; real match scores stay well
// above minScore/2, which is used as the validity threshold.
const minScore = -10000

// MatchedRanges returns matches ranges for the last scored string as a flattened array of
// [begin, end) byte offset pairs.
func (m *Matcher) MatchedRanges() []int {
	if len(m.pattern) == 0 || !m.lastCandidateMatched {
		return nil
	}
	i, j := m.lastCandidateLen, len(m.pattern)
	// Both layers below the threshold means no valid match was recorded.
	if m.scores[i][j][0].val() < minScore/2 && m.scores[i][j][1].val() < minScore/2 {
		return nil
	}

	// Backtrack through the DP table from (candLen, pattLen), collecting
	// [end, begin) pairs in reverse; adjacent matched bytes extend the
	// current range instead of opening a new one.
	var ret []int
	k := m.bestK(i, j)
	for i > 0 {
		take := (k == 1)
		k = m.scores[i][j][k].prevK()
		if take {
			if len(ret) == 0 || ret[len(ret)-1] != i {
				ret = append(ret, i)
				ret = append(ret, i-1)
			} else {
				// Contiguous with the previous match: extend its begin offset.
				ret[len(ret)-1] = i - 1
			}
			j--
		}
		i--
	}
	// Reverse slice so ranges are in ascending [begin, end) order.
	for i := 0; i < len(ret)/2; i++ {
		ret[i], ret[len(ret)-1-i] = ret[len(ret)-1-i], ret[i]
	}
	return ret
}
|
||||
|
||||
func (m *Matcher) match(candidate []byte, candidateLower []byte) bool {
|
||||
i, j := 0, 0
|
||||
for ; i < len(candidateLower) && j < len(m.patternLower); i++ {
|
||||
if candidateLower[i] == m.patternLower[j] {
|
||||
j++
|
||||
}
|
||||
}
|
||||
if j != len(m.patternLower) {
|
||||
return false
|
||||
}
|
||||
|
||||
// The input passes the simple test against pattern, so it is time to classify its characters.
|
||||
// Character roles are used below to find the last segment.
|
||||
m.roles = RuneRoles(candidate, m.rolesBuf[:])
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
// computeScore fills the dynamic-programming table m.scores for the candidate
// and returns the best raw (unnormalized) match score.
//
// m.scores[i][j][k] is the best score for matching the first j pattern bytes
// against the first i candidate bytes, with k recording whether candidate
// byte i was itself matched (k == 1) or skipped (k == 0).
func (m *Matcher) computeScore(candidate []byte, candidateLower []byte) int {
	pattLen, candLen := len(m.pattern), len(candidate)

	// Row 0: zero candidate bytes consumed. Only the (0 pattern bytes, skip)
	// cell is reachable; everything else is marked invalid.
	for j := 0; j <= len(m.pattern); j++ {
		m.scores[0][j][0] = minScore << 1
		m.scores[0][j][1] = minScore << 1
	}
	m.scores[0][0][0] = score(0, 0) // Start with 0.

	// Count segments and remember where the last one starts.
	segmentsLeft, lastSegStart := 1, 0
	for i := 0; i < candLen; i++ {
		if m.roles[i] == RSep {
			segmentsLeft++
			lastSegStart = i + 1
		}
	}

	// A per-character bonus for a consecutive match.
	consecutiveBonus := 2
	wordIdx := 0 // Word count within segment.
	for i := 1; i <= candLen; i++ {

		role := m.roles[i-1]
		isHead := role == RHead

		if isHead {
			wordIdx++
		} else if role == RSep && segmentsLeft > 1 {
			wordIdx = 0
			segmentsLeft--
		}

		// Penalize skipping the very first byte or the start of the last
		// segment (the most relevant positions to match).
		var skipPenalty int
		if i == 1 || (i-1) == lastSegStart {
			// Skipping the start of first or last segment.
			skipPenalty++
		}

		for j := 0; j <= pattLen; j++ {
			// By default, we don't have a match. Fill in the skip data.
			m.scores[i][j][1] = minScore << 1

			// Compute the skip score.
			k := 0
			if m.scores[i-1][j][0].val() < m.scores[i-1][j][1].val() {
				k = 1
			}

			skipScore := m.scores[i-1][j][k].val()
			// Do not penalize missing characters after the last matched segment.
			if j != pattLen {
				skipScore -= skipPenalty
			}
			m.scores[i][j][0] = score(skipScore, k)

			if j == 0 || candidateLower[i-1] != m.patternLower[j-1] {
				// Not a match.
				continue
			}
			pRole := m.patternRoles[j-1]

			if role == RTail && pRole == RHead {
				if j > 1 {
					// Not a match: a head in the pattern matches a tail character in the candidate.
					continue
				}
				// Special treatment for the first character of the pattern. We allow
				// matches in the middle of a word if they are long enough, at least
				// min(3, pattern.length) characters.
				if !bytes.HasPrefix(candidateLower[i-1:], m.patternShort) {
					continue
				}
			}

			// Compute the char score.
			var charScore int
			// Bonus 1: the char is in the candidate's last segment.
			if segmentsLeft <= 1 {
				charScore++
			}
			// Bonus 2: Case match or a Head in the pattern aligns with one in the word.
			// Single-case patterns lack segmentation signals and we assume any character
			// can be a head of a segment.
			if candidate[i-1] == m.pattern[j-1] || role == RHead && (!m.caseSensitive || pRole == RHead) {
				charScore++
			}

			// Penalty 1: pattern char is Head, candidate char is Tail.
			if role == RTail && pRole == RHead {
				charScore--
			}
			// Penalty 2: first pattern character matched in the middle of a word.
			if j == 1 && role == RTail {
				charScore -= 4
			}

			// Third dimension encodes whether there is a gap between the previous match and the current
			// one.
			for k := 0; k < 2; k++ {
				sc := m.scores[i-1][j-1][k].val() + charScore

				isConsecutive := k == 1 || i-1 == 0 || i-1 == lastSegStart
				if isConsecutive {
					// Bonus 3: a consecutive match. First character match also gets a bonus to
					// ensure prefix final match score normalizes to 1.0.
					// Logically, this is a part of charScore, but we have to compute it here because it
					// only applies for consecutive matches (k == 1).
					sc += consecutiveBonus
				}
				if k == 0 {
					// Penalty 3: Matching inside a segment (and previous char wasn't matched). Penalize for the lack
					// of alignment.
					if role == RTail || role == RUCTail {
						sc -= 3
					}
				}

				if sc > m.scores[i][j][1].val() {
					m.scores[i][j][1] = score(sc, k)
				}
			}
		}
	}

	result := m.scores[len(candidate)][len(m.pattern)][m.bestK(len(candidate), len(m.pattern))].val()

	return result
}
|
||||
|
||||
// ScoreTable returns the score table computed for the provided candidate. Used only for debugging.
// It renders the DP table left by the most recent scoring call; the candidate
// argument supplies only the row labels, so it should be the last scored
// string for the output to be meaningful.
func (m *Matcher) ScoreTable(candidate string) string {
	var buf bytes.Buffer

	// Header row: one column per pattern character, plus a separator line.
	var line1, line2, separator bytes.Buffer
	line1.WriteString("\t")
	line2.WriteString("\t")
	for j := 0; j < len(m.pattern); j++ {
		line1.WriteString(fmt.Sprintf("%c\t\t", m.pattern[j]))
		separator.WriteString("----------------")
	}

	buf.WriteString(line1.String())
	buf.WriteString("\n")
	buf.WriteString(separator.String())
	buf.WriteString("\n")

	// One row pair per candidate character: the "M" line shows the skip
	// layer (k == 0), the "H" line the match layer (k == 1), each with its
	// backtracking direction in parentheses (see dir).
	for i := 1; i <= len(candidate); i++ {
		line1.Reset()
		line2.Reset()

		line1.WriteString(fmt.Sprintf("%c\t", candidate[i-1]))
		line2.WriteString("\t")

		for j := 1; j <= len(m.pattern); j++ {
			line1.WriteString(fmt.Sprintf("M%6d(%c)\t", m.scores[i][j][0].val(), dir(m.scores[i][j][0].prevK())))
			line2.WriteString(fmt.Sprintf("H%6d(%c)\t", m.scores[i][j][1].val(), dir(m.scores[i][j][1].prevK())))
		}
		buf.WriteString(line1.String())
		buf.WriteString("\n")
		buf.WriteString(line2.String())
		buf.WriteString("\n")
		buf.WriteString(separator.String())
		buf.WriteString("\n")
	}

	return buf.String()
}
|
||||
|
||||
// dir renders a prevK backtracking bit as a letter for the debug score table:
// 'M' for layer 0 (skip/miss), 'H' for layer 1 (hit/match).
func dir(prevK int) rune {
	if prevK != 0 {
		return 'H'
	}
	return 'M'
}
|
||||
|
||||
func (m *Matcher) poorMatch() bool {
|
||||
if len(m.pattern) < 2 {
|
||||
return false
|
||||
}
|
||||
|
||||
i, j := m.lastCandidateLen, len(m.pattern)
|
||||
k := m.bestK(i, j)
|
||||
|
||||
var counter, len int
|
||||
for i > 0 {
|
||||
take := (k == 1)
|
||||
k = m.scores[i][j][k].prevK()
|
||||
if take {
|
||||
len++
|
||||
if k == 0 && len < 3 && m.roles[i-1] == RTail {
|
||||
// Short match in the middle of a word
|
||||
counter++
|
||||
if counter > 1 {
|
||||
return true
|
||||
}
|
||||
}
|
||||
j--
|
||||
} else {
|
||||
len = 0
|
||||
}
|
||||
i--
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// BestMatch returns the name most similar to the
|
||||
// pattern, using fuzzy matching, or the empty string.
|
||||
func BestMatch(pattern string, names []string) string {
|
||||
fuzz := NewMatcher(pattern)
|
||||
best := ""
|
||||
highScore := float32(0) // minimum score is 0 (no match)
|
||||
for _, name := range names {
|
||||
// TODO: Improve scoring algorithm.
|
||||
score := fuzz.Score(name)
|
||||
if score > highScore {
|
||||
highScore = score
|
||||
best = name
|
||||
} else if score == 0 {
|
||||
// Order matters in the fuzzy matching algorithm. If we find no match
|
||||
// when matching the target to the identifier, try matching the identifier
|
||||
// to the target.
|
||||
revFuzz := NewMatcher(name)
|
||||
revScore := revFuzz.Score(pattern)
|
||||
if revScore > highScore {
|
||||
highScore = revScore
|
||||
best = name
|
||||
}
|
||||
}
|
||||
}
|
||||
return best
|
||||
}
|
||||
+294
@@ -0,0 +1,294 @@
|
||||
// Copyright 2019 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Benchmark results:
|
||||
//
|
||||
// BenchmarkMatcher-12 1000000 1615 ns/op 30.95 MB/s 0 B/op 0 allocs/op
|
||||
package fuzzy_test
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"math"
|
||||
"testing"
|
||||
|
||||
"golang.org/x/tools/internal/fuzzy"
|
||||
)
|
||||
|
||||
// comparator pairs a float comparison with a printable symbol, letting
// table-driven tests express score expectations like ">= 0".
type comparator struct {
	f     func(val, ref float32) bool
	descr string
}

var (
	// eq: score must equal the reference exactly.
	eq = comparator{
		f: func(val, ref float32) bool {
			return val == ref
		},
		descr: "==",
	}
	// ge: score must be at least the reference.
	ge = comparator{
		f: func(val, ref float32) bool {
			return val >= ref
		},
		descr: ">=",
	}
	// gt: score must exceed the reference.
	gt = comparator{
		f: func(val, ref float32) bool {
			return val > ref
		},
		descr: ">",
	}
)

// eval applies the comparison to a score and its reference value.
func (c comparator) eval(val, ref float32) bool {
	return c.f(val, ref)
}

// String returns the comparison symbol (e.g. "=="), for test failure output.
func (c comparator) String() string {
	return c.descr
}
|
||||
|
||||
// scoreTest is one expectation: the candidate's score must satisfy the
// embedded comparator against ref.
type scoreTest struct {
	candidate string
	comparator
	ref float32
}

// matcherTests groups score expectations by pattern. An empty pattern is
// expected to score every candidate as a perfect match.
var matcherTests = []struct {
	pattern string
	tests   []scoreTest
}{
	{
		pattern: "",
		tests: []scoreTest{
			{"def", eq, 1},
			{"Ab stuff c", eq, 1},
		},
	},
	{
		pattern: "abc",
		tests: []scoreTest{
			{"def", eq, 0},
			{"abd", eq, 0},
			{"abc", ge, 0},
			{"Abc", ge, 0},
			{"Ab stuff c", ge, 0},
		},
	},
	{
		pattern: "Abc",
		tests: []scoreTest{
			{"def", eq, 0},
			{"abd", eq, 0},
			{"abc", ge, 0},
			{"Abc", ge, 0},
			{"Ab stuff c", ge, 0},
		},
	},
	{
		pattern: "U",
		tests: []scoreTest{
			{"ErrUnexpectedEOF", gt, 0},
			{"ErrUnexpectedEOF.Error", eq, 0},
		},
	},
}
|
||||
|
||||
func TestScore(t *testing.T) {
|
||||
for _, tc := range matcherTests {
|
||||
m := fuzzy.NewMatcher(tc.pattern)
|
||||
for _, sct := range tc.tests {
|
||||
score := m.Score(sct.candidate)
|
||||
if !sct.comparator.eval(score, sct.ref) {
|
||||
t.Errorf("m.Score(%q) = %.2g, want %s %v", sct.candidate, score, sct.comparator, sct.ref)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// compareCandidatesTestCases lists candidates in non-decreasing order of the
// score they are expected to receive for the pattern.
var compareCandidatesTestCases = []struct {
	pattern           string
	orderedCandidates []string
}{
	{
		pattern: "Foo",
		orderedCandidates: []string{
			"Barfoo",
			"Faoo",
			"F_o_o",
			"FaoFooa",
			"BarFoo",
			"F__oo",
			"F_oo",
			"FooA",
			"FooBar",
			"Foo",
		},
	},
	{
		pattern: "U",
		orderedCandidates: []string{
			"ErrUnexpectedEOF.Error",
			"ErrUnexpectedEOF",
		},
	},
}

// TestCompareCandidateScores verifies that scores respect the expected
// relative ordering above and always fall within [-1, 1].
func TestCompareCandidateScores(t *testing.T) {
	for _, tc := range compareCandidatesTestCases {
		m := fuzzy.NewMatcher(tc.pattern)

		var prevScore float32
		prevCand := "MIN_SCORE"
		for _, cand := range tc.orderedCandidates {
			score := m.Score(cand)
			if prevScore > score {
				t.Errorf("%s[=%v] is scored lower than %s[=%v]", cand, score, prevCand, prevScore)
			}
			if score < -1 || score > 1 {
				t.Errorf("%s score is %v; want value between [-1, 1]", cand, score)
			}
			prevScore = score
			prevCand = cand
		}
	}
}
|
||||
|
||||
// fuzzyMatcherTestCases maps a pattern and candidate to the expected
// highlighted form, where matched ranges are wrapped in square brackets
// (see highlightMatches). An empty want means the pattern must not match.
var fuzzyMatcherTestCases = []struct {
	p    string
	str  string
	want string
}{
	{p: "foo", str: "abc::foo", want: "abc::[foo]"},
	{p: "foo", str: "foo.foo", want: "foo.[foo]"},
	{p: "foo", str: "fo_oo.o_oo", want: "[fo]_oo.[o]_oo"},
	{p: "foo", str: "fo_oo.fo_oo", want: "fo_oo.[fo]_[o]o"},
	{p: "fo_o", str: "fo_oo.o_oo", want: "[f]o_oo.[o_o]o"},
	{p: "fOO", str: "fo_oo.o_oo", want: "[f]o_oo.[o]_[o]o"},
	{p: "tedit", str: "foo.TextEdit", want: "foo.[T]ext[Edit]"},
	{p: "TEdit", str: "foo.TextEdit", want: "foo.[T]ext[Edit]"},
	{p: "Tedit", str: "foo.TextEdit", want: "foo.[T]ext[Edit]"},
	{p: "Tedit", str: "foo.Textedit", want: "foo.[Te]xte[dit]"},
	{p: "TEdit", str: "foo.Textedit", want: ""},
	{p: "te", str: "foo.Textedit", want: "foo.[Te]xtedit"},
	{p: "ee", str: "foo.Textedit", want: ""}, // short middle of the word match
	{p: "ex", str: "foo.Textedit", want: "foo.T[ex]tedit"},
	{p: "exdi", str: "foo.Textedit", want: ""},  // short middle of the word match
	{p: "exdit", str: "foo.Textedit", want: ""}, // short middle of the word match
	{p: "extdit", str: "foo.Textedit", want: "foo.T[ext]e[dit]"},
	{p: "e", str: "foo.Textedit", want: "foo.T[e]xtedit"},
	{p: "E", str: "foo.Textedit", want: "foo.T[e]xtedit"},
	{p: "ed", str: "foo.Textedit", want: "foo.Text[ed]it"},
	{p: "edt", str: "foo.Textedit", want: ""}, // short middle of the word match
	{p: "edit", str: "foo.Textedit", want: "foo.Text[edit]"},
	{p: "edin", str: "foo.TexteditNum", want: "foo.Text[edi]t[N]um"},
	{p: "n", str: "node.GoNodeMax", want: "[n]ode.GoNodeMax"},
	{p: "N", str: "node.GoNodeMax", want: "[n]ode.GoNodeMax"},
	{p: "completio", str: "completion", want: "[completio]n"},
	{p: "completio", str: "completion.None", want: "[completio]n.None"},
}

// TestFuzzyMatcherRanges scores each candidate and checks both whether it
// matched and, when it did, that MatchedRanges highlights the expected bytes.
func TestFuzzyMatcherRanges(t *testing.T) {
	for _, tc := range fuzzyMatcherTestCases {
		matcher := fuzzy.NewMatcher(tc.p)
		score := matcher.Score(tc.str)
		if tc.want == "" {
			if score > 0 {
				t.Errorf("Score(%s, %s) = %v; want: <= 0", tc.p, tc.str, score)
			}
			continue
		}
		if score < 0 {
			t.Errorf("Score(%s, %s) = %v, want: > 0", tc.p, tc.str, score)
			continue
		}
		got := highlightMatches(tc.str, matcher)
		if tc.want != got {
			t.Errorf("highlightMatches(%s, %s) = %v, want: %v", tc.p, tc.str, got, tc.want)
		}
	}
}
|
||||
|
||||
// scoreTestCases pins exact (rounded) normalized scores for specific
// pattern/candidate pairs.
var scoreTestCases = []struct {
	p    string
	str  string
	want float64
}{
	// Score precision up to five digits. Modify if changing the score, but make sure the new values
	// are reasonable.
	{p: "abc", str: "abc", want: 1},
	{p: "abc", str: "Abc", want: 1},
	{p: "abc", str: "Abcdef", want: 1},
	{p: "strc", str: "StrCat", want: 1},
	{p: "abc_def", str: "abc_def_xyz", want: 1},
	{p: "abcdef", str: "abc_def_xyz", want: 0.91667},
	{p: "abcxyz", str: "abc_def_xyz", want: 0.91667},
	{p: "sc", str: "StrCat", want: 0.75},
	{p: "abc", str: "AbstrBasicCtor", want: 0.83333},
	{p: "foo", str: "abc::foo", want: 0.91667},
	{p: "afoo", str: "abc::foo", want: 0.9375},
	{p: "abr", str: "abc::bar", want: 0.5},
	{p: "br", str: "abc::bar", want: 0.25},
	{p: "aar", str: "abc::bar", want: 0.41667},
	{p: "edin", str: "foo.TexteditNum", want: 0.125},
	{p: "ediu", str: "foo.TexteditNum", want: 0},
	// We want the next two items to have roughly similar scores.
	{p: "up", str: "unique_ptr", want: 0.75},
	{p: "up", str: "upper_bound", want: 1},
}

// TestScores compares scores, rounded to five decimal places, against the
// pinned values above.
func TestScores(t *testing.T) {
	for _, tc := range scoreTestCases {
		matcher := fuzzy.NewMatcher(tc.p)
		got := math.Round(float64(matcher.Score(tc.str))*1e5) / 1e5
		if got != tc.want {
			t.Errorf("Score(%s, %s) = %v, want: %v", tc.p, tc.str, got, tc.want)
		}
	}
}
|
||||
|
||||
func highlightMatches(str string, matcher *fuzzy.Matcher) string {
|
||||
matches := matcher.MatchedRanges()
|
||||
|
||||
var buf bytes.Buffer
|
||||
index := 0
|
||||
for i := 0; i < len(matches)-1; i += 2 {
|
||||
s, e := matches[i], matches[i+1]
|
||||
fmt.Fprintf(&buf, "%s[%s]", str[index:s], str[s:e])
|
||||
index = e
|
||||
}
|
||||
buf.WriteString(str[index:])
|
||||
return buf.String()
|
||||
}
|
||||
|
||||
func BenchmarkMatcher(b *testing.B) {
|
||||
pattern := "Foo"
|
||||
candidates := []string{
|
||||
"F_o_o",
|
||||
"Barfoo",
|
||||
"Faoo",
|
||||
"F__oo",
|
||||
"F_oo",
|
||||
"FaoFooa",
|
||||
"BarFoo",
|
||||
"FooA",
|
||||
"FooBar",
|
||||
"Foo",
|
||||
}
|
||||
|
||||
matcher := fuzzy.NewMatcher(pattern)
|
||||
|
||||
b.ResetTimer()
|
||||
for i := 0; i < b.N; i++ {
|
||||
for _, c := range candidates {
|
||||
matcher.Score(c)
|
||||
}
|
||||
}
|
||||
var numBytes int
|
||||
for _, c := range candidates {
|
||||
numBytes += len(c)
|
||||
}
|
||||
b.SetBytes(int64(numBytes))
|
||||
}
|
||||
+39
@@ -0,0 +1,39 @@
|
||||
// Copyright 2023 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package fuzzy_test
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
. "golang.org/x/tools/internal/fuzzy"
|
||||
)
|
||||
|
||||
func BenchmarkSelf_Matcher(b *testing.B) {
|
||||
idents := collectIdentifiers(b)
|
||||
patterns := generatePatterns()
|
||||
|
||||
for i := 0; i < b.N; i++ {
|
||||
for _, pattern := range patterns {
|
||||
sm := NewMatcher(pattern)
|
||||
for _, ident := range idents {
|
||||
_ = sm.Score(ident)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkSelf_SymbolMatcher(b *testing.B) {
|
||||
idents := collectIdentifiers(b)
|
||||
patterns := generatePatterns()
|
||||
|
||||
for i := 0; i < b.N; i++ {
|
||||
for _, pattern := range patterns {
|
||||
sm := NewSymbolMatcher(pattern)
|
||||
for _, ident := range idents {
|
||||
_, _ = sm.Match([]string{ident})
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
+309
@@ -0,0 +1,309 @@
|
||||
// Copyright 2021 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package fuzzy
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"log"
|
||||
"unicode"
|
||||
)
|
||||
|
||||
// SymbolMatcher implements a fuzzy matching algorithm optimized for Go symbols
// of the form:
//
//	example.com/path/to/package.object.field
//
// Knowing that we are matching symbols like this allows us to make the
// following optimizations:
//   - We can incorporate right-to-left relevance directly into the score
//     calculation.
//   - We can match from right to left, discarding leading bytes if the input is
//     too long.
//   - We just take the right-most match without losing too much precision. This
//     allows us to use an O(n) algorithm.
//   - We can operate directly on chunked strings; in many cases we will
//     be storing the package path and/or package name separately from the
//     symbol or identifiers, so doing this avoids allocating strings.
//   - We can return the index of the right-most match, allowing us to trim
//     irrelevant qualification.
type SymbolMatcher struct {
	// Using buffers of length 256 is both a reasonable size for most qualified
	// symbols, and makes it easy to avoid bounds checks by using uint8 indexes.
	pattern     [256]rune   // the lower-cased search pattern (first patternLen entries are valid)
	patternLen  uint8       // number of valid runes in pattern; 0 means "match nothing"
	inputBuffer [256]rune   // avoid allocating when considering chunks
	roles       [256]uint32 // which roles does a rune play (word start, etc.)
	segments    [256]uint8  // how many segments from the right is each rune
}
|
||||
|
||||
// Rune roles.
//
// These are bit flags: a single input rune may play several roles at once,
// and the applicable flags are OR-ed together into SymbolMatcher.roles.
const (
	segmentStart uint32 = 1 << iota // input rune starts a segment (i.e. follows '/' or '.')
	wordStart                       // input rune starts a word, per camel-case naming rules
	separator                       // input rune is a separator ('/' or '.')
	upper                           // input rune is an upper case letter
)
|
||||
|
||||
// NewSymbolMatcher creates a SymbolMatcher that may be used to match the given
|
||||
// search pattern.
|
||||
//
|
||||
// Currently this matcher only accepts case-insensitive fuzzy patterns.
|
||||
//
|
||||
// An empty pattern matches no input.
|
||||
func NewSymbolMatcher(pattern string) *SymbolMatcher {
|
||||
m := &SymbolMatcher{}
|
||||
for _, p := range pattern {
|
||||
m.pattern[m.patternLen] = unicode.ToLower(p)
|
||||
m.patternLen++
|
||||
if m.patternLen == 255 || int(m.patternLen) == len(pattern) {
|
||||
// break at 255 so that we can represent patternLen with a uint8.
|
||||
break
|
||||
}
|
||||
}
|
||||
return m
|
||||
}
|
||||
|
||||
// Match searches for the right-most match of the search pattern within the
// symbol represented by concatenating the given chunks.
//
// If a match is found, the first result holds the absolute byte offset within
// all chunks for the start of the symbol. In other words, the index of the
// match within strings.Join(chunks, "").
//
// The second return value will be the score of the match, which is always
// between 0 and 1, inclusive. A score of 0 indicates no match.
//
// If no match is found, Match returns (-1, 0).
func (m *SymbolMatcher) Match(chunks []string) (int, float64) {
	// Explicit behavior for an empty pattern.
	//
	// As a minor optimization, this also avoids nilness checks later on, since
	// the compiler can prove that m != nil.
	if m.patternLen == 0 {
		return -1, 0
	}

	// Matching implements a heavily optimized linear scoring algorithm on the
	// input. This is not guaranteed to produce the highest score, but works well
	// enough, particularly due to the right-to-left significance of qualified
	// symbols.
	//
	// Matching proceeds in three passes through the input:
	//  - The first pass populates the input buffer and collects rune roles.
	//  - The second pass proceeds right-to-left to find the right-most match.
	//  - The third pass proceeds left-to-right from the start of the right-most
	//    match, to find the most *compact* match, and computes the score of this
	//    match.
	//
	// See below for more details of each pass, as well as the scoring algorithm.

	// First pass: populate the input buffer out of the provided chunks
	// (lower-casing in the process), and collect rune roles.
	//
	// We could also check for a forward match here, but since we'd have to write
	// the entire input anyway this has negligible impact on performance.
	var (
		inputLen  = uint8(0)
		modifiers = wordStart | segmentStart // roles pending for the next rune written
	)

input:
	for _, chunk := range chunks {
		for _, r := range chunk {
			if r == '.' || r == '/' {
				modifiers |= separator
			}
			// optimization: avoid calls to unicode.ToLower, which can't be inlined.
			l := r
			if r <= unicode.MaxASCII {
				if 'A' <= r && r <= 'Z' {
					l = r + 'a' - 'A'
				}
			} else {
				l = unicode.ToLower(r)
			}
			// l != r exactly when r had an upper-case form, i.e. r is upper case.
			if l != r {
				modifiers |= upper

				// If the current rune is capitalized *and the preceding rune was not*,
				// mark this as a word start. This avoids spuriously high ranking of
				// non-camelcase naming schemas, such as the
				// yaml_PARSE_FLOW_SEQUENCE_ENTRY_MAPPING_END_STATE example of
				// golang/go#60201.
				if inputLen == 0 || m.roles[inputLen-1]&upper == 0 {
					modifiers |= wordStart
				}
			}
			m.inputBuffer[inputLen] = l
			m.roles[inputLen] = modifiers
			inputLen++
			// A separator means the *next* rune starts a fresh word and segment;
			// m.roles[inputLen-1] is the role just recorded for this rune.
			if m.roles[inputLen-1]&separator != 0 {
				modifiers = wordStart | segmentStart
			} else {
				modifiers = 0
			}
			// TODO: we should prefer the right-most input if it overflows, rather
			// than the left-most as we're doing here.
			if inputLen == 255 {
				break input
			}
		}
	}

	// Second pass: find the right-most match, and count segments from the
	// right.
	var (
		pi    = uint8(m.patternLen - 1) // pattern index
		p     = m.pattern[pi]           // pattern rune
		start = -1                      // start offset of match
		rseg  = uint8(0)                // effective "depth" from the right of the current rune in consideration
	)
	const maxSeg = 3 // maximum number of segments from the right to count, for scoring purposes.

	// Note: if inputLen is 0, ii starts at 255 (uint8 wrap) and the loop scans
	// zeroed buffer entries; an empty pattern was rejected above, and a zero
	// pattern rune cannot equal a NUL input rune written by the first pass, so
	// NOTE(review): this relies on pattern[0] != 0 for non-empty patterns —
	// confirmed by NewSymbolMatcher only writing runes from the pattern string.
	for ii := inputLen - 1; ; ii-- {
		r := m.inputBuffer[ii]
		if rseg < maxSeg && m.roles[ii]&separator != 0 {
			rseg++
		}
		m.segments[ii] = rseg
		if p == r {
			if pi == 0 {
				// TODO(rfindley): BUG: the docstring for Match says that it returns an
				// absolute byte offset, but clearly it is returning a rune offset here.
				start = int(ii)
				break
			}
			pi--
			p = m.pattern[pi]
		}
		// Don't check ii >= 0 in the loop condition: ii is a uint8.
		if ii == 0 {
			break
		}
	}

	if start < 0 {
		// no match: skip scoring
		return -1, 0
	}

	// Third pass: find the shortest match and compute the score.

	// Score is the average score for each rune.
	//
	// A rune score is the multiple of:
	//  1. The base score, which is 1.0 if the rune starts a segment, 0.9 if the
	//     rune starts a mid-segment word, else 0.6.
	//
	//     Runes preceded by a matching rune are treated the same as the start
	//     of a mid-segment word (with a 0.9 score), so that sequential or exact
	//     matches are preferred. We call this a sequential bonus.
	//
	//     For the final rune match, this sequential bonus is reduced to 0.8 if
	//     the next rune in the input is a mid-segment word, or 0.7 if the next
	//     rune in the input is not a word or segment start. This ensures that
	//     we favor whole-word or whole-segment matches over prefix matches.
	//
	//  2. 1.0 if the rune is part of the last segment, otherwise
	//     1.0-0.1*<segments from the right>, with a max segment count of 3.
	//     Notably 1.0-0.1*3 = 0.7 > 0.6, so that foo/_/_/_/_ (a match very
	//     early in a qualified symbol name) still scores higher than _f_o_o_ (a
	//     completely split match).
	//
	// This is a naive algorithm, but it is fast. There's lots of prior art here
	// that could be leveraged. For example, we could explicitly consider
	// rune distance, and exact matches of words or segments.
	//
	// Also note that this might not actually find the highest scoring match, as
	// doing so could require a non-linear algorithm, depending on how the score
	// is calculated.

	// debugging support
	const debug = false // enable to log debugging information
	var (
		runeScores []float64
		runeIdxs   []int
	)

	// Re-walk the pattern from the left, starting at the right-most match
	// found by the second pass.
	pi = 0
	p = m.pattern[pi]

	const (
		segStartScore = 1.0 // base score of runes starting a segment
		wordScore     = 0.9 // base score of runes starting or continuing a word
		noStreak      = 0.6
		perSegment    = 0.1 // we count at most 3 segments above
	)

	totScore := 0.0
	lastMatch := uint8(255) // sentinel: no previous match (never a valid index here)
	for ii := uint8(start); ii < inputLen; ii++ {
		r := m.inputBuffer[ii]
		if r == p {
			pi++
			finalRune := pi >= m.patternLen
			// When finalRune is true this reads m.pattern[patternLen], an unwritten
			// (zero) entry; it is never used because we break below after scoring.
			p = m.pattern[pi]

			baseScore := noStreak

			// Calculate the sequence bonus based on preceding matches.
			//
			// We do this first as it is overridden by role scoring below.
			if lastMatch == ii-1 {
				baseScore = wordScore
				// Reduce the sequence bonus for the final rune of the pattern based on
				// whether it borders a new segment or word.
				if finalRune {
					switch {
					case ii == inputLen-1 || m.roles[ii+1]&separator != 0:
						// Full segment: no reduction
					case m.roles[ii+1]&wordStart != 0:
						baseScore = wordScore - 0.1
					default:
						baseScore = wordScore - 0.2
					}
				}
			}
			lastMatch = ii

			// Calculate the rune's role score. If the rune starts a segment or word,
			// this overrides the sequence score, as the rune starts a new sequence.
			switch {
			case m.roles[ii]&segmentStart != 0:
				baseScore = segStartScore
			case m.roles[ii]&wordStart != 0:
				baseScore = wordScore
			}

			// Apply the segment-depth penalty (segments from the right).
			runeScore := baseScore * (1.0 - float64(m.segments[ii])*perSegment)
			if debug {
				runeScores = append(runeScores, runeScore)
				runeIdxs = append(runeIdxs, int(ii))
			}
			totScore += runeScore
			if finalRune {
				break
			}
		}
	}

	if debug {
		// Format rune roles and scores in line:
		//  fo[o:.52].[b:1]a[r:.6]
		var summary bytes.Buffer
		last := 0
		for i, idx := range runeIdxs {
			summary.WriteString(string(m.inputBuffer[last:idx])) // encode runes
			fmt.Fprintf(&summary, "[%s:%.2g]", string(m.inputBuffer[idx]), runeScores[i])
			last = idx + 1
		}
		summary.WriteString(string(m.inputBuffer[last:inputLen])) // encode runes
		log.Println(summary.String())
	}

	return start, totScore / float64(m.patternLen)
}
|
||||
+252
@@ -0,0 +1,252 @@
|
||||
// Copyright 2021 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package fuzzy_test
|
||||
|
||||
import (
|
||||
"go/ast"
|
||||
"go/token"
|
||||
"sort"
|
||||
"testing"
|
||||
|
||||
"golang.org/x/tools/go/packages"
|
||||
. "golang.org/x/tools/internal/fuzzy"
|
||||
)
|
||||
|
||||
func TestSymbolMatchIndex(t *testing.T) {
|
||||
tests := []struct {
|
||||
pattern, input string
|
||||
want int
|
||||
}{
|
||||
{"test", "foo.TestFoo", 4},
|
||||
{"test", "test", 0},
|
||||
{"test", "Test", 0},
|
||||
{"test", "est", -1},
|
||||
{"t", "shortest", 7},
|
||||
{"", "foo", -1},
|
||||
{"", string([]rune{0}), -1}, // verify that we don't default to an empty pattern.
|
||||
{"anything", "", -1},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
matcher := NewSymbolMatcher(test.pattern)
|
||||
if got, _ := matcher.Match([]string{test.input}); got != test.want {
|
||||
t.Errorf("NewSymbolMatcher(%q).Match(%q) = %v, _, want %v, _", test.pattern, test.input, got, test.want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestSymbolRanking(t *testing.T) {
|
||||
|
||||
// query -> symbols to match, in ascending order of score
|
||||
queryRanks := map[string][]string{
|
||||
"test": {
|
||||
"this.is.better.than.most",
|
||||
"test.foo.bar",
|
||||
"thebest",
|
||||
"atest",
|
||||
"test.foo",
|
||||
"testage",
|
||||
"tTest",
|
||||
"foo.test",
|
||||
},
|
||||
"parseside": { // golang/go#60201
|
||||
"yaml_PARSE_FLOW_SEQUENCE_ENTRY_MAPPING_END_STATE",
|
||||
"parseContext.parse_sidebyside",
|
||||
},
|
||||
"cvb": {
|
||||
"filecache_test.testIPCValueB",
|
||||
"cover.Boundary",
|
||||
},
|
||||
"dho": {
|
||||
"gocommand.DebugHangingGoCommands",
|
||||
"protocol.DocumentHighlightOptions",
|
||||
},
|
||||
"flg": {
|
||||
"completion.FALLTHROUGH",
|
||||
"main.flagGoCmd",
|
||||
},
|
||||
"fvi": {
|
||||
"godoc.fileIndexVersion",
|
||||
"macho.FlagSubsectionsViaSymbols",
|
||||
},
|
||||
}
|
||||
|
||||
for query, symbols := range queryRanks {
|
||||
t.Run(query, func(t *testing.T) {
|
||||
matcher := NewSymbolMatcher(query)
|
||||
prev := 0.0
|
||||
for _, sym := range symbols {
|
||||
_, score := matcher.Match([]string{sym})
|
||||
t.Logf("Match(%q) = %v", sym, score)
|
||||
if score <= prev {
|
||||
t.Errorf("Match(%q) = _, %v, want > %v", sym, score, prev)
|
||||
}
|
||||
prev = score
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestMatcherSimilarities(t *testing.T) {
|
||||
// This test compares the fuzzy matcher with the symbol matcher on a corpus
|
||||
// of qualified identifiers extracted from x/tools.
|
||||
//
|
||||
// These two matchers are not expected to agree, but inspecting differences
|
||||
// can be useful for finding interesting ranking edge cases.
|
||||
t.Skip("unskip this test to compare matchers")
|
||||
|
||||
idents := collectIdentifiers(t)
|
||||
t.Logf("collected %d unique identifiers", len(idents))
|
||||
|
||||
// TODO: use go1.21 slices.MaxFunc.
|
||||
topMatch := func(score func(string) float64) string {
|
||||
top := ""
|
||||
topScore := 0.0
|
||||
for _, cand := range idents {
|
||||
if s := score(cand); s > topScore {
|
||||
top = cand
|
||||
topScore = s
|
||||
}
|
||||
}
|
||||
return top
|
||||
}
|
||||
|
||||
agreed := 0
|
||||
total := 0
|
||||
bad := 0
|
||||
patterns := generatePatterns()
|
||||
for _, pattern := range patterns {
|
||||
total++
|
||||
|
||||
fm := NewMatcher(pattern)
|
||||
topFuzzy := topMatch(func(input string) float64 {
|
||||
return float64(fm.Score(input))
|
||||
})
|
||||
sm := NewSymbolMatcher(pattern)
|
||||
topSymbol := topMatch(func(input string) float64 {
|
||||
_, score := sm.Match([]string{input})
|
||||
return score
|
||||
})
|
||||
switch {
|
||||
case topFuzzy == "" && topSymbol != "":
|
||||
if false {
|
||||
// The fuzzy matcher has a bug where it misses some matches; for this
|
||||
// test we only care about the symbol matcher.
|
||||
t.Logf("%q matched %q but no fuzzy match", pattern, topSymbol)
|
||||
}
|
||||
total--
|
||||
bad++
|
||||
case topFuzzy != "" && topSymbol == "":
|
||||
t.Fatalf("%q matched %q but no symbol match", pattern, topFuzzy)
|
||||
case topFuzzy == topSymbol:
|
||||
agreed++
|
||||
default:
|
||||
// Enable this log to see mismatches.
|
||||
if false {
|
||||
t.Logf("mismatch for %q: fuzzy: %q, symbol: %q", pattern, topFuzzy, topSymbol)
|
||||
}
|
||||
}
|
||||
}
|
||||
t.Logf("fuzzy matchers agreed on %d out of %d queries (%d bad)", agreed, total, bad)
|
||||
}
|
||||
|
||||
func collectIdentifiers(tb testing.TB) []string {
|
||||
cfg := &packages.Config{
|
||||
Mode: packages.NeedName | packages.NeedSyntax | packages.NeedFiles,
|
||||
Tests: true,
|
||||
}
|
||||
pkgs, err := packages.Load(cfg, "golang.org/x/tools/...")
|
||||
if err != nil {
|
||||
tb.Fatal(err)
|
||||
}
|
||||
uniqueIdents := make(map[string]bool)
|
||||
decls := 0
|
||||
for _, pkg := range pkgs {
|
||||
for _, f := range pkg.Syntax {
|
||||
for _, decl := range f.Decls {
|
||||
decls++
|
||||
switch decl := decl.(type) {
|
||||
case *ast.GenDecl:
|
||||
for _, spec := range decl.Specs {
|
||||
switch decl.Tok {
|
||||
case token.IMPORT:
|
||||
case token.TYPE:
|
||||
name := spec.(*ast.TypeSpec).Name.Name
|
||||
qualified := pkg.Name + "." + name
|
||||
uniqueIdents[qualified] = true
|
||||
case token.CONST, token.VAR:
|
||||
for _, n := range spec.(*ast.ValueSpec).Names {
|
||||
qualified := pkg.Name + "." + n.Name
|
||||
uniqueIdents[qualified] = true
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
var idents []string
|
||||
for k := range uniqueIdents {
|
||||
idents = append(idents, k)
|
||||
}
|
||||
sort.Strings(idents)
|
||||
return idents
|
||||
}
|
||||
|
||||
// generatePatterns returns all 26*26*26 three-letter lowercase ASCII query
// patterns, in lexicographic order ("aaa" through "zzz").
func generatePatterns() []string {
	patterns := make([]string, 0, 26*26*26)
	for a := byte('a'); a <= 'z'; a++ {
		for b := byte('a'); b <= 'z'; b++ {
			for c := byte('a'); c <= 'z'; c++ {
				patterns = append(patterns, string([]byte{a, b, c}))
			}
		}
	}
	return patterns
}
|
||||
|
||||
// Test that we strongly prefer exact matches.
|
||||
//
|
||||
// In golang/go#60027, we preferred "Runner" for the query "rune" over several
|
||||
// results containing the word "rune" exactly. Following this observation,
|
||||
// scoring was tweaked to more strongly emphasize sequential characters and
|
||||
// exact matches.
|
||||
func TestSymbolRanking_Issue60027(t *testing.T) {
|
||||
matcher := NewSymbolMatcher("rune")
|
||||
|
||||
// symbols to match, in ascending order of ranking.
|
||||
symbols := []string{
|
||||
"Runner",
|
||||
"singleRuneParam",
|
||||
"Config.ifsRune",
|
||||
"Parser.rune",
|
||||
}
|
||||
prev := 0.0
|
||||
for _, sym := range symbols {
|
||||
_, score := matcher.Match([]string{sym})
|
||||
t.Logf("Match(%q) = %v", sym, score)
|
||||
if score < prev {
|
||||
t.Errorf("Match(%q) = _, %v, want > %v", sym, score, prev)
|
||||
}
|
||||
prev = score
|
||||
}
|
||||
}
|
||||
|
||||
func TestChunkedMatch(t *testing.T) {
|
||||
matcher := NewSymbolMatcher("test")
|
||||
_, want := matcher.Match([]string{"test"})
|
||||
chunked := [][]string{
|
||||
{"", "test"},
|
||||
{"test", ""},
|
||||
{"te", "st"},
|
||||
}
|
||||
|
||||
for _, chunks := range chunked {
|
||||
offset, score := matcher.Match(chunks)
|
||||
if offset != 0 || score != want {
|
||||
t.Errorf("Match(%v) = %v, %v, want 0, 1.0", chunks, offset, score)
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user