whatcanGOwrong

2024-09-19 21:38:24 -04:00
commit d0ae4d841d
17908 changed files with 4096831 additions and 0 deletions
@@ -0,0 +1,203 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package utf8string provides an efficient way to index strings by rune rather than by byte.
+package utf8string // import "golang.org/x/exp/utf8string"
+
+import (
+	"errors"
+	"unicode/utf8"
+)
+
+// String wraps a regular string with a small structure that provides more
+// efficient indexing by code point index, as opposed to byte index.
+// Scanning incrementally forwards or backwards is O(1) per index operation
+// (although not as fast as a range clause going forwards).  Random access is
+// O(N) in the length of the string, but the overhead is less than always
+// scanning from the beginning.
+// If the string is ASCII, random access is O(1).
+// Unlike the built-in string type, String has internal mutable state and
+// is not thread-safe.
+type String struct {
+	// str is the wrapped contents; numRunes is the number of runes in it.
+	str      string
+	numRunes int
+	// If width > 0, the rune at runePos starts at bytePos and has the specified width.
+	// Init leaves width at 0 when the contents are entirely ASCII, which is
+	// what IsASCII tests.  bytePos and runePos form a position cache that At
+	// updates on every lookup past the leading ASCII run.
+	width    int
+	bytePos  int
+	runePos  int
+	// For all-ASCII contents (including the empty string) Init sets nonASCII
+	// to len(str).
+	nonASCII int // byte index of the first non-ASCII rune.
+}
+
+// NewString allocates a fresh String and initializes it to hold contents.
+func NewString(contents string) *String {
+	s := &String{}
+	return s.Init(contents)
+}
+
+// Init resets s so that it holds contents, replacing any state left over from
+// earlier use, and returns s so that the call can be chained.
+func (s *String) Init(contents string) *String {
+	s.str = contents
+	s.bytePos, s.runePos = 0, 0
+
+	// Locate the first byte that falls outside the ASCII range, if any.
+	firstNonASCII := -1
+	for i := 0; i < len(contents); i++ {
+		if contents[i] >= utf8.RuneSelf {
+			firstNonASCII = i
+			break
+		}
+	}
+
+	if firstNonASCII < 0 {
+		// Pure ASCII (the empty string counts too): one rune per byte, and a
+		// zero width marks the String as ASCII-only.
+		s.numRunes = len(contents)
+		s.width = 0
+		s.nonASCII = len(contents)
+		return s
+	}
+
+	// A non-ASCII byte is present, so the runes have to be counted properly.
+	// The position cache starts at offset 0, so width is primed with the
+	// width of the rune found there.
+	s.numRunes = utf8.RuneCountInString(contents)
+	_, s.width = utf8.DecodeRuneInString(contents)
+	s.nonASCII = firstNonASCII
+	return s
+}
+
+// String returns the contents of the String, exactly as they were passed to
+// NewString or Init.  This method also means the String is directly printable
+// by fmt.Print.
+func (s *String) String() string {
+	return s.str
+}
+
+// RuneCount returns the number of runes (Unicode code points) in the String.
+// The count is computed once by Init, so this call does not scan the contents.
+func (s *String) RuneCount() int {
+	return s.numRunes
+}
+
+// IsASCII returns a boolean indicating whether the String contains only ASCII bytes.
+// The empty String is reported as ASCII.
+func (s *String) IsASCII() bool {
+	// Init leaves width at 0 only when it found no non-ASCII byte.
+	return s.width == 0
+}
+
+// Slice returns the string sliced at rune positions [i:j].
+func (s *String) Slice(i, j int) string {
+	// ASCII is easy.  Let the compiler catch the indexing error if there is one.
+	if j < s.nonASCII {
+		return s.str[i:j]
+	}
+	if i < 0 || j > s.numRunes || i > j {
+		panic(sliceOutOfRange)
+	}
+	if i == j {
+		return ""
+	}
+	// For non-ASCII, after At(i), bytePos is always the position of the indexed character.
+	var low, high int
+	switch {
+	case i < s.nonASCII:
+		low = i
+	case i == s.numRunes:
+		low = len(s.str)
+	default:
+		s.At(i)
+		low = s.bytePos
+	}
+	switch {
+	case j == s.numRunes:
+		high = len(s.str)
+	default:
+		s.At(j)
+		high = s.bytePos
+	}
+	return s.str[low:high]
+}
+
+// At returns the rune with index i in the String.  The sequence of runes is the same
+// as iterating over the contents with a "for range" clause.
+// At panics if i is out of range.  For indexes past the leading ASCII run it
+// also moves the cached position (runePos, bytePos, width) to the requested
+// rune, which is what makes a following lookup of a neighbouring index cheap.
+func (s *String) At(i int) rune {
+	// ASCII is easy.  Let the runtime catch the indexing error if there is one.
+	if i < s.nonASCII {
+		return rune(s.str[i])
+	}
+
+	// Now we do need to know the index is valid.
+	if i < 0 || i >= s.numRunes {
+		panic(outOfRange)
+	}
+
+	var r rune
+
+	// Five easy common cases: within 1 spot of bytePos/runePos, or the beginning, or the end.
+	// With these cases, all scans from beginning or end work in O(1) time per rune.
+	switch {
+
+	case i == s.runePos-1: // backing up one rune
+		r, s.width = utf8.DecodeLastRuneInString(s.str[0:s.bytePos])
+		s.runePos = i
+		s.bytePos -= s.width
+		return r
+	case i == s.runePos+1: // moving ahead one rune
+		s.runePos = i
+		s.bytePos += s.width
+		fallthrough
+	case i == s.runePos:
+		r, s.width = utf8.DecodeRuneInString(s.str[s.bytePos:])
+		return r
+	case i == 0: // start of string
+		r, s.width = utf8.DecodeRuneInString(s.str)
+		s.runePos = 0
+		s.bytePos = 0
+		return r
+
+	case i == s.numRunes-1: // last rune in string
+		r, s.width = utf8.DecodeLastRuneInString(s.str)
+		s.runePos = i
+		s.bytePos = len(s.str) - s.width
+		return r
+	}
+
+	// We need to do a linear scan.  There are three places to start from:
+	// 1) The beginning
+	// 2) bytePos/runePos.
+	// 3) The end
+	// Choose the closest in rune count, scanning backwards if necessary.
+	forward := true
+	if i < s.runePos {
+		// Between beginning and pos.  Which is closer?
+		// i >= nonASCII is guaranteed by the check at the top, and runePos > i
+		// here, so the end of the ASCII run is the lowest location we need to
+		// start from.  Inside that run byte and rune indexes coincide.
+		// Compare the two distances directly: i-nonASCII runes forward from
+		// the end of the ASCII run versus runePos-i runes backward from pos.
+		if i-s.nonASCII < s.runePos-i {
+			// Scan forward from beginning
+			s.bytePos, s.runePos = s.nonASCII, s.nonASCII
+		} else {
+			// Scan backwards from where we are
+			forward = false
+		}
+	} else if i-s.runePos >= (s.numRunes-s.runePos)/2 {
+		// Between pos and end, and the end is at least as close:
+		// scan backwards from end.
+		s.bytePos, s.runePos = len(s.str), s.numRunes
+		forward = false
+	}
+	// Otherwise i is between pos and end and pos is closer: scan forward from pos.
+	if forward {
+		// TODO: Is it much faster to use a range loop for this scan?
+		for {
+			r, s.width = utf8.DecodeRuneInString(s.str[s.bytePos:])
+			if s.runePos == i {
+				break
+			}
+			s.runePos++
+			s.bytePos += s.width
+		}
+	} else {
+		for {
+			r, s.width = utf8.DecodeLastRuneInString(s.str[0:s.bytePos])
+			s.runePos--
+			s.bytePos -= s.width
+			if s.runePos == i {
+				break
+			}
+		}
+	}
+	return r
+}
+
+// Values passed to panic when a rune index is invalid.
+var (
+	// outOfRange is the panic value At uses for a bad index.
+	outOfRange = errors.New("utf8string: index out of range")
+	// sliceOutOfRange is the panic value Slice uses for a bad index pair.
+	sliceOutOfRange = errors.New("utf8string: slice index out of range")
+)
@@ -0,0 +1,123 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package utf8string
+
+import (
+	"math/rand"
+	"testing"
+	"unicode/utf8"
+)
+
+// testStrings is the shared input table for the tests in this file.
+var testStrings = []string{
+	// Empty string.
+	"",
+	// ASCII only.
+	"abcd",
+	// Multi-byte runes only.
+	"☺☻☹",
+	// ASCII and multi-byte runes interleaved, then the same text three times over.
+	"日a本b語ç日ð本Ê語þ日¥本¼語i日©",
+	"日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©",
+	// Invalid UTF-8: four bare continuation bytes.
+	"\x80\x80\x80\x80",
+}
+
+// TestScanForwards walks every test string from the first rune index to the
+// last and compares each result of At with the []rune conversion of the string.
+func TestScanForwards(t *testing.T) {
+	for _, s := range testStrings {
+		want := []rune(s)
+		str := NewString(s)
+		if n := str.RuneCount(); n != len(want) {
+			t.Errorf("%s: expected %d runes; got %d", s, len(want), n)
+			break
+		}
+		for i := 0; i < len(want); i++ {
+			if got := str.At(i); got != want[i] {
+				t.Errorf("%s[%d]: expected %c (%U); got %c (%U)", s, i, want[i], want[i], got, got)
+			}
+		}
+	}
+}
+
+// TestScanBackwards walks every test string from the last rune index down to
+// the first and compares each result of At with the []rune conversion.
+func TestScanBackwards(t *testing.T) {
+	for _, s := range testStrings {
+		want := []rune(s)
+		str := NewString(s)
+		if n := str.RuneCount(); n != len(want) {
+			t.Errorf("%s: expected %d runes; got %d", s, len(want), n)
+			break
+		}
+		for remaining := len(want); remaining > 0; remaining-- {
+			i := remaining - 1
+			if got := str.At(i); got != want[i] {
+				t.Errorf("%s[%d]: expected %c (%U); got %c (%U)", s, i, want[i], want[i], got, got)
+			}
+		}
+	}
+}
+
+// randCount reports how many random probes the randomized tests perform:
+// 100 when the tests run in short mode, 100000 otherwise.
+func randCount() int {
+	count := 100000
+	if testing.Short() {
+		count = 100
+	}
+	return count
+}
+
+// TestRandomAccess probes every non-empty test string at randomly chosen rune
+// indexes and compares each result of At with the []rune conversion.
+func TestRandomAccess(t *testing.T) {
+	for _, s := range testStrings {
+		if s == "" {
+			// rand.Intn needs a positive bound, so there is nothing to probe.
+			continue
+		}
+		want := []rune(s)
+		str := NewString(s)
+		if n := str.RuneCount(); n != len(want) {
+			t.Errorf("%s: expected %d runes; got %d", s, len(want), n)
+			break
+		}
+		for trial, trials := 0, randCount(); trial < trials; trial++ {
+			i := rand.Intn(len(want))
+			if got := str.At(i); got != want[i] {
+				t.Errorf("%s[%d]: expected %c (%U); got %c (%U)", s, i, want[i], want[i], got, got)
+			}
+		}
+	}
+}
+
+// TestRandomSliceAccess slices the well-formed, non-empty test strings at
+// randomly chosen rune bounds and compares Slice with slicing the []rune form.
+func TestRandomSliceAccess(t *testing.T) {
+	for _, s := range testStrings {
+		// the bad-UTF-8 string fools this simple test
+		if s == "" || s[0] == '\x80' {
+			continue
+		}
+		want := []rune(s)
+		str := NewString(s)
+		if n := str.RuneCount(); n != len(want) {
+			t.Errorf("%s: expected %d runes; got %d", s, len(want), n)
+			break
+		}
+		for trial, trials := 0, randCount(); trial < trials; trial++ {
+			i := rand.Intn(len(want))
+			j := rand.Intn(len(want) + 1)
+			// Only inverted bounds are skipped, so empty slices (i == j) are
+			// exercised as well.
+			if i > j {
+				continue
+			}
+			expect := string(want[i:j])
+			if got := str.Slice(i, j); got != expect {
+				t.Errorf("%s[%d:%d]: expected %q got %q", s, i, j, expect, got)
+			}
+		}
+	}
+}
+
+// TestLimitSliceAccess checks that a zero-length slice taken at either end of
+// each test string comes back as the empty string.
+func TestLimitSliceAccess(t *testing.T) {
+	for _, s := range testStrings {
+		str := NewString(s)
+		if got := str.Slice(0, 0); got != "" {
+			t.Error("failure with empty slice at beginning")
+		}
+		end := utf8.RuneCountInString(s)
+		if got := str.Slice(end, end); got != "" {
+			t.Error("failure with empty slice at end")
+		}
+	}
+}