whatcanGOwrong
This commit is contained in:
@@ -0,0 +1,203 @@
|
||||
// Copyright 2009 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Package utf8string provides an efficient way to index strings by rune rather than by byte.
|
||||
package utf8string // import "golang.org/x/exp/utf8string"
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
// String wraps a regular string with a small structure that provides more
|
||||
// efficient indexing by code point index, as opposed to byte index.
|
||||
// Scanning incrementally forwards or backwards is O(1) per index operation
|
||||
// (although not as fast a range clause going forwards). Random access is
|
||||
// O(N) in the length of the string, but the overhead is less than always
|
||||
// scanning from the beginning.
|
||||
// If the string is ASCII, random access is O(1).
|
||||
// Unlike the built-in string type, String has internal mutable state and
|
||||
// is not thread-safe.
|
||||
type String struct {
|
||||
str string
|
||||
numRunes int
|
||||
// If width > 0, the rune at runePos starts at bytePos and has the specified width.
|
||||
width int
|
||||
bytePos int
|
||||
runePos int
|
||||
nonASCII int // byte index of the first non-ASCII rune.
|
||||
}
|
||||
|
||||
// NewString returns a new UTF-8 string with the provided contents.
|
||||
func NewString(contents string) *String {
|
||||
return new(String).Init(contents)
|
||||
}
|
||||
|
||||
// Init initializes an existing String to hold the provided contents.
|
||||
// It returns a pointer to the initialized String.
|
||||
func (s *String) Init(contents string) *String {
|
||||
s.str = contents
|
||||
s.bytePos = 0
|
||||
s.runePos = 0
|
||||
for i := 0; i < len(contents); i++ {
|
||||
if contents[i] >= utf8.RuneSelf {
|
||||
// Not ASCII.
|
||||
s.numRunes = utf8.RuneCountInString(contents)
|
||||
_, s.width = utf8.DecodeRuneInString(contents)
|
||||
s.nonASCII = i
|
||||
return s
|
||||
}
|
||||
}
|
||||
// ASCII is simple. Also, the empty string is ASCII.
|
||||
s.numRunes = len(contents)
|
||||
s.width = 0
|
||||
s.nonASCII = len(contents)
|
||||
return s
|
||||
}
|
||||
|
||||
// String returns the contents of the String. This method also means the
|
||||
// String is directly printable by fmt.Print.
|
||||
func (s *String) String() string {
|
||||
return s.str
|
||||
}
|
||||
|
||||
// RuneCount returns the number of runes (Unicode code points) in the String.
|
||||
func (s *String) RuneCount() int {
|
||||
return s.numRunes
|
||||
}
|
||||
|
||||
// IsASCII returns a boolean indicating whether the String contains only ASCII bytes.
|
||||
func (s *String) IsASCII() bool {
|
||||
return s.width == 0
|
||||
}
|
||||
|
||||
// Slice returns the string sliced at rune positions [i:j].
|
||||
func (s *String) Slice(i, j int) string {
|
||||
// ASCII is easy. Let the compiler catch the indexing error if there is one.
|
||||
if j < s.nonASCII {
|
||||
return s.str[i:j]
|
||||
}
|
||||
if i < 0 || j > s.numRunes || i > j {
|
||||
panic(sliceOutOfRange)
|
||||
}
|
||||
if i == j {
|
||||
return ""
|
||||
}
|
||||
// For non-ASCII, after At(i), bytePos is always the position of the indexed character.
|
||||
var low, high int
|
||||
switch {
|
||||
case i < s.nonASCII:
|
||||
low = i
|
||||
case i == s.numRunes:
|
||||
low = len(s.str)
|
||||
default:
|
||||
s.At(i)
|
||||
low = s.bytePos
|
||||
}
|
||||
switch {
|
||||
case j == s.numRunes:
|
||||
high = len(s.str)
|
||||
default:
|
||||
s.At(j)
|
||||
high = s.bytePos
|
||||
}
|
||||
return s.str[low:high]
|
||||
}
|
||||
|
||||
// At returns the rune with index i in the String. The sequence of runes is the same
|
||||
// as iterating over the contents with a "for range" clause.
|
||||
func (s *String) At(i int) rune {
|
||||
// ASCII is easy. Let the compiler catch the indexing error if there is one.
|
||||
if i < s.nonASCII {
|
||||
return rune(s.str[i])
|
||||
}
|
||||
|
||||
// Now we do need to know the index is valid.
|
||||
if i < 0 || i >= s.numRunes {
|
||||
panic(outOfRange)
|
||||
}
|
||||
|
||||
var r rune
|
||||
|
||||
// Five easy common cases: within 1 spot of bytePos/runePos, or the beginning, or the end.
|
||||
// With these cases, all scans from beginning or end work in O(1) time per rune.
|
||||
switch {
|
||||
|
||||
case i == s.runePos-1: // backing up one rune
|
||||
r, s.width = utf8.DecodeLastRuneInString(s.str[0:s.bytePos])
|
||||
s.runePos = i
|
||||
s.bytePos -= s.width
|
||||
return r
|
||||
case i == s.runePos+1: // moving ahead one rune
|
||||
s.runePos = i
|
||||
s.bytePos += s.width
|
||||
fallthrough
|
||||
case i == s.runePos:
|
||||
r, s.width = utf8.DecodeRuneInString(s.str[s.bytePos:])
|
||||
return r
|
||||
case i == 0: // start of string
|
||||
r, s.width = utf8.DecodeRuneInString(s.str)
|
||||
s.runePos = 0
|
||||
s.bytePos = 0
|
||||
return r
|
||||
|
||||
case i == s.numRunes-1: // last rune in string
|
||||
r, s.width = utf8.DecodeLastRuneInString(s.str)
|
||||
s.runePos = i
|
||||
s.bytePos = len(s.str) - s.width
|
||||
return r
|
||||
}
|
||||
|
||||
// We need to do a linear scan. There are three places to start from:
|
||||
// 1) The beginning
|
||||
// 2) bytePos/runePos.
|
||||
// 3) The end
|
||||
// Choose the closest in rune count, scanning backwards if necessary.
|
||||
forward := true
|
||||
if i < s.runePos {
|
||||
// Between beginning and pos. Which is closer?
|
||||
// Since both i and runePos are guaranteed >= nonASCII, that's the
|
||||
// lowest location we need to start from.
|
||||
if i < (s.runePos-s.nonASCII)/2 {
|
||||
// Scan forward from beginning
|
||||
s.bytePos, s.runePos = s.nonASCII, s.nonASCII
|
||||
} else {
|
||||
// Scan backwards from where we are
|
||||
forward = false
|
||||
}
|
||||
} else {
|
||||
// Between pos and end. Which is closer?
|
||||
if i-s.runePos < (s.numRunes-s.runePos)/2 {
|
||||
// Scan forward from pos
|
||||
} else {
|
||||
// Scan backwards from end
|
||||
s.bytePos, s.runePos = len(s.str), s.numRunes
|
||||
forward = false
|
||||
}
|
||||
}
|
||||
if forward {
|
||||
// TODO: Is it much faster to use a range loop for this scan?
|
||||
for {
|
||||
r, s.width = utf8.DecodeRuneInString(s.str[s.bytePos:])
|
||||
if s.runePos == i {
|
||||
break
|
||||
}
|
||||
s.runePos++
|
||||
s.bytePos += s.width
|
||||
}
|
||||
} else {
|
||||
for {
|
||||
r, s.width = utf8.DecodeLastRuneInString(s.str[0:s.bytePos])
|
||||
s.runePos--
|
||||
s.bytePos -= s.width
|
||||
if s.runePos == i {
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
return r
|
||||
}
|
||||
|
||||
var outOfRange = errors.New("utf8string: index out of range")
|
||||
var sliceOutOfRange = errors.New("utf8string: slice index out of range")
|
||||
+123
@@ -0,0 +1,123 @@
|
||||
// Copyright 2009 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package utf8string
|
||||
|
||||
import (
|
||||
"math/rand"
|
||||
"testing"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
var testStrings = []string{
|
||||
"",
|
||||
"abcd",
|
||||
"☺☻☹",
|
||||
"日a本b語ç日ð本Ê語þ日¥本¼語i日©",
|
||||
"日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©",
|
||||
"\x80\x80\x80\x80",
|
||||
}
|
||||
|
||||
func TestScanForwards(t *testing.T) {
|
||||
for _, s := range testStrings {
|
||||
runes := []rune(s)
|
||||
str := NewString(s)
|
||||
if str.RuneCount() != len(runes) {
|
||||
t.Errorf("%s: expected %d runes; got %d", s, len(runes), str.RuneCount())
|
||||
break
|
||||
}
|
||||
for i, expect := range runes {
|
||||
got := str.At(i)
|
||||
if got != expect {
|
||||
t.Errorf("%s[%d]: expected %c (%U); got %c (%U)", s, i, expect, expect, got, got)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestScanBackwards(t *testing.T) {
|
||||
for _, s := range testStrings {
|
||||
runes := []rune(s)
|
||||
str := NewString(s)
|
||||
if str.RuneCount() != len(runes) {
|
||||
t.Errorf("%s: expected %d runes; got %d", s, len(runes), str.RuneCount())
|
||||
break
|
||||
}
|
||||
for i := len(runes) - 1; i >= 0; i-- {
|
||||
expect := runes[i]
|
||||
got := str.At(i)
|
||||
if got != expect {
|
||||
t.Errorf("%s[%d]: expected %c (%U); got %c (%U)", s, i, expect, expect, got, got)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func randCount() int {
|
||||
if testing.Short() {
|
||||
return 100
|
||||
}
|
||||
return 100000
|
||||
}
|
||||
|
||||
func TestRandomAccess(t *testing.T) {
|
||||
for _, s := range testStrings {
|
||||
if len(s) == 0 {
|
||||
continue
|
||||
}
|
||||
runes := []rune(s)
|
||||
str := NewString(s)
|
||||
if str.RuneCount() != len(runes) {
|
||||
t.Errorf("%s: expected %d runes; got %d", s, len(runes), str.RuneCount())
|
||||
break
|
||||
}
|
||||
for j := 0; j < randCount(); j++ {
|
||||
i := rand.Intn(len(runes))
|
||||
expect := runes[i]
|
||||
got := str.At(i)
|
||||
if got != expect {
|
||||
t.Errorf("%s[%d]: expected %c (%U); got %c (%U)", s, i, expect, expect, got, got)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestRandomSliceAccess(t *testing.T) {
|
||||
for _, s := range testStrings {
|
||||
if len(s) == 0 || s[0] == '\x80' { // the bad-UTF-8 string fools this simple test
|
||||
continue
|
||||
}
|
||||
runes := []rune(s)
|
||||
str := NewString(s)
|
||||
if str.RuneCount() != len(runes) {
|
||||
t.Errorf("%s: expected %d runes; got %d", s, len(runes), str.RuneCount())
|
||||
break
|
||||
}
|
||||
for k := 0; k < randCount(); k++ {
|
||||
i := rand.Intn(len(runes))
|
||||
j := rand.Intn(len(runes) + 1)
|
||||
if i > j { // include empty strings
|
||||
continue
|
||||
}
|
||||
expect := string(runes[i:j])
|
||||
got := str.Slice(i, j)
|
||||
if got != expect {
|
||||
t.Errorf("%s[%d:%d]: expected %q got %q", s, i, j, expect, got)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestLimitSliceAccess(t *testing.T) {
|
||||
for _, s := range testStrings {
|
||||
str := NewString(s)
|
||||
if str.Slice(0, 0) != "" {
|
||||
t.Error("failure with empty slice at beginning")
|
||||
}
|
||||
nr := utf8.RuneCountInString(s)
|
||||
if str.Slice(nr, nr) != "" {
|
||||
t.Error("failure with empty slice at end")
|
||||
}
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user