whatcanGOwrong

This commit is contained in:
2024-09-19 21:38:24 -04:00
commit d0ae4d841d
17908 changed files with 4096831 additions and 0 deletions
@@ -0,0 +1,187 @@
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package language
import (
"fmt"
"sort"
"golang.org/x/text/internal/language"
)
// The Coverage interface is used to define the level of coverage of an
// internationalization service. Note that not all types are supported by all
// services. As lists may be generated on the fly, it is recommended that users
// of a Coverage cache the results.
type Coverage interface {
// Tags returns the list of supported tags.
Tags() []Tag
// BaseLanguages returns the list of supported base languages.
BaseLanguages() []Base
// Scripts returns the list of supported scripts.
Scripts() []Script
// Regions returns the list of supported regions.
Regions() []Region
}
var (
// Supported defines a Coverage that lists all supported subtags. Tags
// always returns nil.
Supported Coverage = allSubtags{}
)
// TODO:
// - Support Variants, numbering systems.
// - CLDR coverage levels.
// - Set of common tags defined in this package.
type allSubtags struct{}
// Regions returns the list of supported regions. As all regions are in a
// consecutive range, it simply returns a slice of numbers in increasing order.
// The "undefined" region is not returned.
func (s allSubtags) Regions() []Region {
reg := make([]Region, language.NumRegions)
for i := range reg {
reg[i] = Region{language.Region(i + 1)}
}
return reg
}
// Scripts returns the list of supported scripts. As all scripts are in a
// consecutive range, it simply returns a slice of numbers in increasing order.
// The "undefined" script is not returned.
func (s allSubtags) Scripts() []Script {
scr := make([]Script, language.NumScripts)
for i := range scr {
scr[i] = Script{language.Script(i + 1)}
}
return scr
}
// BaseLanguages returns the list of all supported base languages. It generates
// the list by traversing the internal structures.
func (s allSubtags) BaseLanguages() []Base {
bs := language.BaseLanguages()
base := make([]Base, len(bs))
for i, b := range bs {
base[i] = Base{b}
}
return base
}
// Tags always returns nil.
func (s allSubtags) Tags() []Tag {
return nil
}
// coverage is used by NewCoverage which is used as a convenient way for
// creating Coverage implementations for partially defined data. Very often a
// package will only need to define a subset of slices. coverage provides a
// convenient way to do this. Moreover, packages using NewCoverage, instead of
// their own implementation, will not break if later new slice types are added.
type coverage struct {
tags func() []Tag
bases func() []Base
scripts func() []Script
regions func() []Region
}
func (s *coverage) Tags() []Tag {
if s.tags == nil {
return nil
}
return s.tags()
}
// bases implements sort.Interface and is used to sort base languages.
type bases []Base
func (b bases) Len() int {
return len(b)
}
func (b bases) Swap(i, j int) {
b[i], b[j] = b[j], b[i]
}
func (b bases) Less(i, j int) bool {
return b[i].langID < b[j].langID
}
// BaseLanguages returns the result from calling s.bases if it is specified or
// otherwise derives the set of supported base languages from tags.
func (s *coverage) BaseLanguages() []Base {
if s.bases == nil {
tags := s.Tags()
if len(tags) == 0 {
return nil
}
a := make([]Base, len(tags))
for i, t := range tags {
a[i] = Base{language.Language(t.lang())}
}
sort.Sort(bases(a))
k := 0
for i := 1; i < len(a); i++ {
if a[k] != a[i] {
k++
a[k] = a[i]
}
}
return a[:k+1]
}
return s.bases()
}
func (s *coverage) Scripts() []Script {
if s.scripts == nil {
return nil
}
return s.scripts()
}
func (s *coverage) Regions() []Region {
if s.regions == nil {
return nil
}
return s.regions()
}
// NewCoverage returns a Coverage for the given lists. It is typically used by
// packages providing internationalization services to define their level of
// coverage. A list may be of type []T or func() []T, where T is either Tag,
// Base, Script or Region. The returned Coverage derives the value for Bases
// from Tags if no func or slice for []Base is specified. For other unspecified
// types the returned Coverage will return nil for the respective methods.
func NewCoverage(list ...interface{}) Coverage {
s := &coverage{}
for _, x := range list {
switch v := x.(type) {
case func() []Base:
s.bases = v
case func() []Script:
s.scripts = v
case func() []Region:
s.regions = v
case func() []Tag:
s.tags = v
case []Base:
s.bases = func() []Base { return v }
case []Script:
s.scripts = func() []Script { return v }
case []Region:
s.regions = func() []Region { return v }
case []Tag:
s.tags = func() []Tag { return v }
default:
panic(fmt.Sprintf("language: unsupported set type %T", v))
}
}
return s
}
@@ -0,0 +1,156 @@
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package language
import (
"fmt"
"reflect"
"testing"
"golang.org/x/text/internal/language"
)
func TestSupported(t *testing.T) {
// To prove the results are correct for a type, we test that the number of
// results is identical to the number of results on record, that all results
// are distinct and that all results are valid.
tests := map[string]int{
"BaseLanguages": language.NumLanguages,
"Scripts": language.NumScripts,
"Regions": language.NumRegions,
"Tags": 0,
}
sup := reflect.ValueOf(Supported)
for name, num := range tests {
v := sup.MethodByName(name).Call(nil)[0]
if n := v.Len(); n != num {
t.Errorf("len(%s()) was %d; want %d", name, n, num)
}
dup := make(map[string]bool)
for i := 0; i < v.Len(); i++ {
x := v.Index(i).Interface()
// An invalid value will either cause a crash or result in a
// duplicate when passed to Sprint.
s := fmt.Sprint(x)
if dup[s] {
t.Errorf("%s: duplicate entry %q", name, s)
}
dup[s] = true
}
if len(dup) != v.Len() {
t.Errorf("%s: # unique entries was %d; want %d", name, len(dup), v.Len())
}
}
}
func TestNewCoverage(t *testing.T) {
bases := []Base{Base{0}, Base{3}, Base{7}}
scripts := []Script{Script{11}, Script{17}, Script{23}}
regions := []Region{Region{101}, Region{103}, Region{107}}
tags := []Tag{Make("pt"), Make("en"), Make("en-GB"), Make("en-US"), Make("pt-PT")}
fbases := func() []Base { return bases }
fscripts := func() []Script { return scripts }
fregions := func() []Region { return regions }
ftags := func() []Tag { return tags }
tests := []struct {
desc string
list []interface{}
bases []Base
scripts []Script
regions []Region
tags []Tag
}{
{
desc: "empty",
},
{
desc: "bases",
list: []interface{}{bases},
bases: bases,
},
{
desc: "scripts",
list: []interface{}{scripts},
scripts: scripts,
},
{
desc: "regions",
list: []interface{}{regions},
regions: regions,
},
{
desc: "bases derives from tags",
list: []interface{}{tags},
bases: []Base{Base{_en}, Base{_pt}},
tags: tags,
},
{
desc: "tags and bases",
list: []interface{}{tags, bases},
bases: bases,
tags: tags,
},
{
desc: "fully specified",
list: []interface{}{tags, bases, scripts, regions},
bases: bases,
scripts: scripts,
regions: regions,
tags: tags,
},
{
desc: "bases func",
list: []interface{}{fbases},
bases: bases,
},
{
desc: "scripts func",
list: []interface{}{fscripts},
scripts: scripts,
},
{
desc: "regions func",
list: []interface{}{fregions},
regions: regions,
},
{
desc: "tags func",
list: []interface{}{ftags},
bases: []Base{Base{_en}, Base{_pt}},
tags: tags,
},
{
desc: "tags and bases",
list: []interface{}{ftags, fbases},
bases: bases,
tags: tags,
},
{
desc: "fully specified",
list: []interface{}{ftags, fbases, fscripts, fregions},
bases: bases,
scripts: scripts,
regions: regions,
tags: tags,
},
}
for i, tt := range tests {
l := NewCoverage(tt.list...)
if a := l.BaseLanguages(); !reflect.DeepEqual(a, tt.bases) {
t.Errorf("%d:%s: BaseLanguages was %v; want %v", i, tt.desc, a, tt.bases)
}
if a := l.Scripts(); !reflect.DeepEqual(a, tt.scripts) {
t.Errorf("%d:%s: Scripts was %v; want %v", i, tt.desc, a, tt.scripts)
}
if a := l.Regions(); !reflect.DeepEqual(a, tt.regions) {
t.Errorf("%d:%s: Regions was %v; want %v", i, tt.desc, a, tt.regions)
}
if a := l.Tags(); !reflect.DeepEqual(a, tt.tags) {
t.Errorf("%d:%s: Tags was %v; want %v", i, tt.desc, a, tt.tags)
}
}
}
@@ -0,0 +1,92 @@
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package display
// This file contains sets of data for specific languages. Users can use these
// to create smaller collections of supported languages and reduce total table
// size.
// The variable names defined here correspond to those in package language.
var (
Afrikaans *Dictionary = &af // af
Amharic *Dictionary = &am // am
Arabic *Dictionary = &ar // ar
ModernStandardArabic *Dictionary = Arabic // ar-001
Azerbaijani *Dictionary = &az // az
Bulgarian *Dictionary = &bg // bg
Bengali *Dictionary = &bn // bn
Catalan *Dictionary = &ca // ca
Czech *Dictionary = &cs // cs
Danish *Dictionary = &da // da
German *Dictionary = &de // de
Greek *Dictionary = &el // el
English *Dictionary = &en // en
AmericanEnglish *Dictionary = English // en-US
BritishEnglish *Dictionary = English // en-GB
Spanish *Dictionary = &es // es
EuropeanSpanish *Dictionary = Spanish // es-ES
LatinAmericanSpanish *Dictionary = Spanish // es-419
Estonian *Dictionary = &et // et
Persian *Dictionary = &fa // fa
Finnish *Dictionary = &fi // fi
Filipino *Dictionary = &fil // fil
French *Dictionary = &fr // fr
Gujarati *Dictionary = &gu // gu
Hebrew *Dictionary = &he // he
Hindi *Dictionary = &hi // hi
Croatian *Dictionary = &hr // hr
Hungarian *Dictionary = &hu // hu
Armenian *Dictionary = &hy // hy
Indonesian *Dictionary = &id // id
Icelandic *Dictionary = &is // is
Italian *Dictionary = &it // it
Japanese *Dictionary = &ja // ja
Georgian *Dictionary = &ka // ka
Kazakh *Dictionary = &kk // kk
Khmer *Dictionary = &km // km
Kannada *Dictionary = &kn // kn
Korean *Dictionary = &ko // ko
Kirghiz *Dictionary = &ky // ky
Lao *Dictionary = &lo // lo
Lithuanian *Dictionary = &lt // lt
Latvian *Dictionary = &lv // lv
Macedonian *Dictionary = &mk // mk
Malayalam *Dictionary = &ml // ml
Mongolian *Dictionary = &mn // mn
Marathi *Dictionary = &mr // mr
Malay *Dictionary = &ms // ms
Burmese *Dictionary = &my // my
Nepali *Dictionary = &ne // ne
Dutch *Dictionary = &nl // nl
Norwegian *Dictionary = &no // no
Punjabi *Dictionary = &pa // pa
Polish *Dictionary = &pl // pl
Portuguese *Dictionary = &pt // pt
BrazilianPortuguese *Dictionary = Portuguese // pt-BR
EuropeanPortuguese *Dictionary = &ptPT // pt-PT
Romanian *Dictionary = &ro // ro
Russian *Dictionary = &ru // ru
Sinhala *Dictionary = &si // si
Slovak *Dictionary = &sk // sk
Slovenian *Dictionary = &sl // sl
Albanian *Dictionary = &sq // sq
Serbian *Dictionary = &sr // sr
SerbianLatin *Dictionary = &srLatn // sr
Swedish *Dictionary = &sv // sv
Swahili *Dictionary = &sw // sw
Tamil *Dictionary = &ta // ta
Telugu *Dictionary = &te // te
Thai *Dictionary = &th // th
Turkish *Dictionary = &tr // tr
Ukrainian *Dictionary = &uk // uk
Urdu *Dictionary = &ur // ur
Uzbek *Dictionary = &uz // uz
Vietnamese *Dictionary = &vi // vi
Chinese *Dictionary = &zh // zh
SimplifiedChinese *Dictionary = Chinese // zh-Hans
TraditionalChinese *Dictionary = &zhHant // zh-Hant
Zulu *Dictionary = &zu // zu
)
@@ -0,0 +1,39 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package display
import (
"fmt"
"testing"
"golang.org/x/text/internal/testtext"
)
func TestLinking(t *testing.T) {
base := getSize(t, `display.Tags(language.English).Name(language.English)`)
compact := getSize(t, `display.English.Languages().Name(language.English)`)
if d := base - compact; d < 1.5*1024*1024 {
t.Errorf("size(base) - size(compact) = %d - %d = was %d; want > 1.5MB", base, compact, d)
}
}
func getSize(t *testing.T, main string) int {
size, err := testtext.CodeSize(fmt.Sprintf(body, main))
if err != nil {
t.Skipf("skipping link size test; binary size could not be determined: %v", err)
}
return size
}
const body = `package main
import (
"golang.org/x/text/language"
"golang.org/x/text/language/display"
)
func main() {
%s
}
`
@@ -0,0 +1,420 @@
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:generate go run maketables.go -output tables.go
// Package display provides display names for languages, scripts and regions in
// a requested language.
//
// The data is based on CLDR's localeDisplayNames. It includes the names of the
// draft level "contributed" or "approved". The resulting tables are quite
// large. The display package is designed so that users can reduce the linked-in
// table sizes by cherry picking the languages one wishes to support. There is a
// Dictionary defined for a selected set of common languages for this purpose.
package display // import "golang.org/x/text/language/display"
import (
"fmt"
"strings"
"golang.org/x/text/internal/format"
"golang.org/x/text/language"
)
/*
TODO:
All fairly low priority at the moment:
- Include alternative and variants as an option (using func options).
- Option for returning the empty string for undefined values.
- Support variants, currencies, time zones, option names and other data
provided in CLDR.
- Do various optimizations:
- Reduce size of offset tables.
- Consider compressing infrequently used languages and decompress on demand.
*/
// A Formatter formats a tag in the current language. It is used in conjunction
// with the message package.
type Formatter struct {
lookup func(tag int, x interface{}) string
x interface{}
}
// Format implements "golang.org/x/text/internal/format".Formatter.
func (f Formatter) Format(state format.State, verb rune) {
// TODO: there are a lot of inefficiencies in this code. Fix it when we
// language.Tag has embedded compact tags.
t := state.Language()
_, index, _ := matcher.Match(t)
str := f.lookup(index, f.x)
if str == "" {
// TODO: use language-specific punctuation.
// TODO: use codePattern instead of language?
if unknown := f.lookup(index, language.Und); unknown != "" {
fmt.Fprintf(state, "%v (%v)", unknown, f.x)
} else {
fmt.Fprintf(state, "[language: %v]", f.x)
}
} else {
state.Write([]byte(str))
}
}
// Language returns a Formatter that renders the name for lang in the
// current language. x may be a language.Base or a language.Tag.
// It renders lang in the default language if no translation for the current
// language is supported.
func Language(lang interface{}) Formatter {
return Formatter{langFunc, lang}
}
// Region returns a Formatter that renders the name for region in the current
// language. region may be a language.Region or a language.Tag.
// It renders region in the default language if no translation for the current
// language is supported.
func Region(region interface{}) Formatter {
return Formatter{regionFunc, region}
}
// Script returns a Formatter that renders the name for script in the current
// language. script may be a language.Script or a language.Tag.
// It renders script in the default language if no translation for the current
// language is supported.
func Script(script interface{}) Formatter {
return Formatter{scriptFunc, script}
}
// Tag returns a Formatter that renders the name for tag in the current
// language. tag may be a language.Tag.
// It renders tag in the default language if no translation for the current
// language is supported.
func Tag(tag interface{}) Formatter {
return Formatter{tagFunc, tag}
}
// A Namer is used to get the name for a given value, such as a Tag, Language,
// Script or Region.
type Namer interface {
// Name returns a display string for the given value. A Namer returns an
// empty string for values it does not support. A Namer may support naming
// an unspecified value. For example, when getting the name for a region for
// a tag that does not have a defined Region, it may return the name for an
// unknown region. It is up to the user to filter calls to Name for values
// for which one does not want to have a name string.
Name(x interface{}) string
}
var (
// Supported lists the languages for which names are defined.
Supported language.Coverage
// The set of all possible values for which names are defined. Note that not
// all Namer implementations will cover all the values of a given type.
// A Namer will return the empty string for unsupported values.
Values language.Coverage
matcher language.Matcher
)
func init() {
tags := make([]language.Tag, numSupported)
s := supported
for i := range tags {
p := strings.IndexByte(s, '|')
tags[i] = language.Raw.Make(s[:p])
s = s[p+1:]
}
matcher = language.NewMatcher(tags)
Supported = language.NewCoverage(tags)
Values = language.NewCoverage(langTagSet.Tags, supportedScripts, supportedRegions)
}
// Languages returns a Namer for naming languages. It returns nil if there is no
// data for the given tag. The type passed to Name must be either language.Base
// or language.Tag. Note that the result may differ between passing a tag or its
// base language. For example, for English, passing "nl-BE" would return Flemish
// whereas passing "nl" returns "Dutch".
func Languages(t language.Tag) Namer {
if _, index, conf := matcher.Match(t); conf != language.No {
return languageNamer(index)
}
return nil
}
type languageNamer int
func langFunc(i int, x interface{}) string {
return nameLanguage(languageNamer(i), x)
}
func (n languageNamer) name(i int) string {
return lookup(langHeaders[:], int(n), i)
}
// Name implements the Namer interface for language names.
func (n languageNamer) Name(x interface{}) string {
return nameLanguage(n, x)
}
// nonEmptyIndex walks up the parent chain until a non-empty header is found.
// It returns -1 if no index could be found.
func nonEmptyIndex(h []header, index int) int {
for ; index != -1 && h[index].data == ""; index = int(parents[index]) {
}
return index
}
// Scripts returns a Namer for naming scripts. It returns nil if there is no
// data for the given tag. The type passed to Name must be either a
// language.Script or a language.Tag. It will not attempt to infer a script for
// tags with an unspecified script.
func Scripts(t language.Tag) Namer {
if _, index, conf := matcher.Match(t); conf != language.No {
if index = nonEmptyIndex(scriptHeaders[:], index); index != -1 {
return scriptNamer(index)
}
}
return nil
}
type scriptNamer int
func scriptFunc(i int, x interface{}) string {
return nameScript(scriptNamer(i), x)
}
func (n scriptNamer) name(i int) string {
return lookup(scriptHeaders[:], int(n), i)
}
// Name implements the Namer interface for script names.
func (n scriptNamer) Name(x interface{}) string {
return nameScript(n, x)
}
// Regions returns a Namer for naming regions. It returns nil if there is no
// data for the given tag. The type passed to Name must be either a
// language.Region or a language.Tag. It will not attempt to infer a region for
// tags with an unspecified region.
func Regions(t language.Tag) Namer {
if _, index, conf := matcher.Match(t); conf != language.No {
if index = nonEmptyIndex(regionHeaders[:], index); index != -1 {
return regionNamer(index)
}
}
return nil
}
type regionNamer int
func regionFunc(i int, x interface{}) string {
return nameRegion(regionNamer(i), x)
}
func (n regionNamer) name(i int) string {
return lookup(regionHeaders[:], int(n), i)
}
// Name implements the Namer interface for region names.
func (n regionNamer) Name(x interface{}) string {
return nameRegion(n, x)
}
// Tags returns a Namer for giving a full description of a tag. The names of
// scripts and regions that are not already implied by the language name will
// in appended within parentheses. It returns nil if there is not data for the
// given tag. The type passed to Name must be a tag.
func Tags(t language.Tag) Namer {
if _, index, conf := matcher.Match(t); conf != language.No {
return tagNamer(index)
}
return nil
}
type tagNamer int
func tagFunc(i int, x interface{}) string {
return nameTag(languageNamer(i), scriptNamer(i), regionNamer(i), x)
}
// Name implements the Namer interface for tag names.
func (n tagNamer) Name(x interface{}) string {
return nameTag(languageNamer(n), scriptNamer(n), regionNamer(n), x)
}
// lookup finds the name for an entry in a global table, traversing the
// inheritance hierarchy if needed.
func lookup(table []header, dict, want int) string {
for dict != -1 {
if s := table[dict].name(want); s != "" {
return s
}
dict = int(parents[dict])
}
return ""
}
// A Dictionary holds a collection of Namers for a single language. One can
// reduce the amount of data linked in to a binary by only referencing
// Dictionaries for the languages one needs to support instead of using the
// generic Namer factories.
type Dictionary struct {
parent *Dictionary
lang header
script header
region header
}
// Tags returns a Namer for giving a full description of a tag. The names of
// scripts and regions that are not already implied by the language name will
// in appended within parentheses. It returns nil if there is not data for the
// given tag. The type passed to Name must be a tag.
func (d *Dictionary) Tags() Namer {
return dictTags{d}
}
type dictTags struct {
d *Dictionary
}
// Name implements the Namer interface for tag names.
func (n dictTags) Name(x interface{}) string {
return nameTag(dictLanguages{n.d}, dictScripts{n.d}, dictRegions{n.d}, x)
}
// Languages returns a Namer for naming languages. It returns nil if there is no
// data for the given tag. The type passed to Name must be either language.Base
// or language.Tag. Note that the result may differ between passing a tag or its
// base language. For example, for English, passing "nl-BE" would return Flemish
// whereas passing "nl" returns "Dutch".
func (d *Dictionary) Languages() Namer {
return dictLanguages{d}
}
type dictLanguages struct {
d *Dictionary
}
func (n dictLanguages) name(i int) string {
for d := n.d; d != nil; d = d.parent {
if s := d.lang.name(i); s != "" {
return s
}
}
return ""
}
// Name implements the Namer interface for language names.
func (n dictLanguages) Name(x interface{}) string {
return nameLanguage(n, x)
}
// Scripts returns a Namer for naming scripts. It returns nil if there is no
// data for the given tag. The type passed to Name must be either a
// language.Script or a language.Tag. It will not attempt to infer a script for
// tags with an unspecified script.
func (d *Dictionary) Scripts() Namer {
return dictScripts{d}
}
type dictScripts struct {
d *Dictionary
}
func (n dictScripts) name(i int) string {
for d := n.d; d != nil; d = d.parent {
if s := d.script.name(i); s != "" {
return s
}
}
return ""
}
// Name implements the Namer interface for script names.
func (n dictScripts) Name(x interface{}) string {
return nameScript(n, x)
}
// Regions returns a Namer for naming regions. It returns nil if there is no
// data for the given tag. The type passed to Name must be either a
// language.Region or a language.Tag. It will not attempt to infer a region for
// tags with an unspecified region.
func (d *Dictionary) Regions() Namer {
return dictRegions{d}
}
type dictRegions struct {
d *Dictionary
}
func (n dictRegions) name(i int) string {
for d := n.d; d != nil; d = d.parent {
if s := d.region.name(i); s != "" {
return s
}
}
return ""
}
// Name implements the Namer interface for region names.
func (n dictRegions) Name(x interface{}) string {
return nameRegion(n, x)
}
// A SelfNamer implements a Namer that returns the name of language in this same
// language. It provides a very compact mechanism to provide a comprehensive
// list of languages to users in their native language.
type SelfNamer struct {
// Supported defines the values supported by this Namer.
Supported language.Coverage
}
var (
// Self is a shared instance of a SelfNamer.
Self *SelfNamer = &self
self = SelfNamer{language.NewCoverage(selfTagSet.Tags)}
)
// Name returns the name of a given language tag in the language identified by
// this tag. It supports both the language.Base and language.Tag types.
func (n SelfNamer) Name(x interface{}) string {
t, _ := language.All.Compose(x)
base, scr, reg := t.Raw()
baseScript := language.Script{}
if (scr == language.Script{} && reg != language.Region{}) {
// For looking up in the self dictionary, we need to select the
// maximized script. This is even the case if the script isn't
// specified.
s1, _ := t.Script()
if baseScript = getScript(base); baseScript != s1 {
scr = s1
}
}
i, scr, reg := selfTagSet.index(base, scr, reg)
if i == -1 {
return ""
}
// Only return the display name if the script matches the expected script.
if (scr != language.Script{}) {
if (baseScript == language.Script{}) {
baseScript = getScript(base)
}
if baseScript != scr {
return ""
}
}
return selfHeaders[0].name(i)
}
// getScript returns the maximized script for a base language.
func getScript(b language.Base) language.Script {
tag, _ := language.Raw.Compose(b)
scr, _ := tag.Script()
return scr
}
@@ -0,0 +1,714 @@
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package display
import (
"fmt"
"reflect"
"strings"
"testing"
"unicode"
"golang.org/x/text/internal/testtext"
"golang.org/x/text/language"
"golang.org/x/text/message"
)
// TODO: test that tables are properly dropped by the linker for various use
// cases.
var (
firstLang2aa = language.MustParseBase("aa")
lastLang2zu = language.MustParseBase("zu")
firstLang3ace = language.MustParseBase("ace")
lastLang3zza = language.MustParseBase("zza")
firstTagAr001 = language.MustParse("ar-001")
lastTagZhHant = language.MustParse("zh-Hant")
)
// TestValues tests that for all languages, regions, and scripts in Values, at
// least one language has a name defined for it by checking it exists in
// English, which is assumed to be the most comprehensive. It is also tested
// that a Namer returns "" for unsupported values.
func TestValues(t *testing.T) {
type testcase struct {
kind string
n Namer
}
// checkDefined checks that a value exists in a Namer.
checkDefined := func(x interface{}, namers []testcase) {
for _, n := range namers {
testtext.Run(t, fmt.Sprintf("%s.Name(%s)", n.kind, x), func(t *testing.T) {
if n.n.Name(x) == "" {
// As of version 28 there is no data for az-Arab in English,
// although there is useful data in other languages.
if x.(fmt.Stringer).String() == "az-Arab" {
return
}
t.Errorf("supported but no result")
}
})
}
}
// checkUnsupported checks that a value does not exist in a Namer.
checkUnsupported := func(x interface{}, namers []testcase) {
for _, n := range namers {
if got := n.n.Name(x); got != "" {
t.Fatalf("%s.Name(%s): unsupported tag gave non-empty result: %q", n.kind, x, got)
}
}
}
tags := map[language.Tag]bool{}
namers := []testcase{
{"Languages(en)", Languages(language.English)},
{"Tags(en)", Tags(language.English)},
{"English.Languages()", English.Languages()},
{"English.Tags()", English.Tags()},
}
for _, tag := range Values.Tags() {
checkDefined(tag, namers)
tags[tag] = true
}
for _, base := range language.Supported.BaseLanguages() {
tag, _ := language.All.Compose(base)
if !tags[tag] {
checkUnsupported(tag, namers)
}
}
regions := map[language.Region]bool{}
namers = []testcase{
{"Regions(en)", Regions(language.English)},
{"English.Regions()", English.Regions()},
}
for _, r := range Values.Regions() {
checkDefined(r, namers)
regions[r] = true
}
for _, r := range language.Supported.Regions() {
if r = r.Canonicalize(); !regions[r] {
checkUnsupported(r, namers)
}
}
scripts := map[language.Script]bool{}
namers = []testcase{
{"Scripts(en)", Scripts(language.English)},
{"English.Scripts()", English.Scripts()},
}
for _, s := range Values.Scripts() {
checkDefined(s, namers)
scripts[s] = true
}
for _, s := range language.Supported.Scripts() {
// Canonicalize the script.
tag, _ := language.DeprecatedScript.Compose(s)
if _, s, _ = tag.Raw(); !scripts[s] {
checkUnsupported(s, namers)
}
}
}
// TestSupported tests that we have at least some Namers for languages that we
// claim to support. To test the claims in the documentation, it also verifies
// that if a Namer is returned, it will have at least some data.
func TestSupported(t *testing.T) {
supportedTags := Supported.Tags()
if len(supportedTags) != numSupported {
t.Errorf("number of supported was %d; want %d", len(supportedTags), numSupported)
}
namerFuncs := []struct {
kind string
fn func(language.Tag) Namer
}{
{"Tags", Tags},
{"Languages", Languages},
{"Regions", Regions},
{"Scripts", Scripts},
}
// Verify that we have at least one Namer for all tags we claim to support.
tags := make(map[language.Tag]bool)
for _, tag := range supportedTags {
// Test we have at least one Namer for this supported Tag.
found := false
for _, kind := range namerFuncs {
if defined(t, kind.kind, kind.fn(tag), tag) {
found = true
}
}
if !found {
t.Errorf("%s: supported, but no data available", tag)
}
if tags[tag] {
t.Errorf("%s: included in Supported.Tags more than once", tag)
}
tags[tag] = true
}
// Verify that we have no Namers for tags we don't claim to support.
for _, base := range language.Supported.BaseLanguages() {
tag, _ := language.All.Compose(base)
// Skip tags that are supported after matching.
if _, _, conf := matcher.Match(tag); conf != language.No {
continue
}
// Test there are no Namers for this tag.
for _, kind := range namerFuncs {
if defined(t, kind.kind, kind.fn(tag), tag) {
t.Errorf("%[1]s(%[2]s) returns a Namer, but %[2]s is not in the set of supported Tags.", kind.kind, tag)
}
}
}
}
// defined reports whether n is a proper Namer, which means it is non-nil and
// must have at least one non-empty value.
func defined(t *testing.T, kind string, n Namer, tag language.Tag) bool {
if n == nil {
return false
}
switch kind {
case "Tags":
for _, t := range Values.Tags() {
if n.Name(t) != "" {
return true
}
}
case "Languages":
for _, t := range Values.BaseLanguages() {
if n.Name(t) != "" {
return true
}
}
case "Regions":
for _, t := range Values.Regions() {
if n.Name(t) != "" {
return true
}
}
case "Scripts":
for _, t := range Values.Scripts() {
if n.Name(t) != "" {
return true
}
}
}
t.Errorf("%s(%s) returns non-nil Namer without content", kind, tag)
return false
}
func TestCoverage(t *testing.T) {
en := language.English
tests := []struct {
n Namer
x interface{}
}{
{Languages(en), Values.Tags()},
{Scripts(en), Values.Scripts()},
{Regions(en), Values.Regions()},
}
for i, tt := range tests {
uniq := make(map[string]interface{})
v := reflect.ValueOf(tt.x)
for j := 0; j < v.Len(); j++ {
x := v.Index(j).Interface()
// As of version 28 there is no data for az-Arab in English,
// although there is useful data in other languages.
if x.(fmt.Stringer).String() == "az-Arab" {
continue
}
s := tt.n.Name(x)
if s == "" {
t.Errorf("%d:%d:%s: missing content", i, j, x)
} else if uniq[s] != nil {
t.Errorf("%d:%d:%s: identical return value %q for %v and %v", i, j, x, s, x, uniq[s])
}
uniq[s] = x
}
}
}
// TestUpdate tests whether dictionary entries for certain languages need to be
// updated. For some languages, some of the headers may be empty or they may be
// identical to the parent. This code detects if such entries need to be updated
// after a table update.
func TestUpdate(t *testing.T) {
tests := []struct {
d *Dictionary
tag string
}{
{ModernStandardArabic, "ar-001"},
{AmericanEnglish, "en-US"},
{EuropeanSpanish, "es-ES"},
{BrazilianPortuguese, "pt-BR"},
{SimplifiedChinese, "zh-Hans"},
}
for _, tt := range tests {
_, i, _ := matcher.Match(language.MustParse(tt.tag))
if !reflect.DeepEqual(tt.d.lang, langHeaders[i]) {
t.Errorf("%s: lang table update needed", tt.tag)
}
if !reflect.DeepEqual(tt.d.script, scriptHeaders[i]) {
t.Errorf("%s: script table update needed", tt.tag)
}
if !reflect.DeepEqual(tt.d.region, regionHeaders[i]) {
t.Errorf("%s: region table update needed", tt.tag)
}
}
}
func TestIndex(t *testing.T) {
notIn := []string{"aa", "xx", "zz", "aaa", "xxx", "zzz", "Aaaa", "Xxxx", "Zzzz"}
tests := []tagIndex{
{
"",
"",
"",
},
{
"bb",
"",
"",
},
{
"",
"bbb",
"",
},
{
"",
"",
"Bbbb",
},
{
"bb",
"bbb",
"Bbbb",
},
{
"bbccddyy",
"bbbcccdddyyy",
"BbbbCcccDdddYyyy",
},
}
for i, tt := range tests {
// Create the test set from the tagIndex.
cnt := 0
for sz := 2; sz <= 4; sz++ {
a := tt[sz-2]
for j := 0; j < len(a); j += sz {
s := a[j : j+sz]
if idx := tt.index(s); idx != cnt {
t.Errorf("%d:%s: index was %d; want %d", i, s, idx, cnt)
}
cnt++
}
}
if n := tt.len(); n != cnt {
t.Errorf("%d: len was %d; want %d", i, n, cnt)
}
for _, x := range notIn {
if idx := tt.index(x); idx != -1 {
t.Errorf("%d:%s: index was %d; want -1", i, x, idx)
}
}
}
}
func TestTag(t *testing.T) {
tests := []struct {
dict string
tag string
name string
}{
// sr is in Value.Languages(), but is not supported by agq.
{"agq", "sr", "|[language: sr]"},
{"nl", "nl", "Nederlands"},
// CLDR 30 dropped Vlaams as the word for nl-BE. It is still called
// Flemish in English, though. TODO: check if this is a CLDR bug.
// {"nl", "nl-BE", "Vlaams"},
{"nl", "nl-BE", "Nederlands (België)"},
{"nl", "vls", "West-Vlaams"},
{"en", "nl-BE", "Flemish"},
{"en", "en", "English"},
{"en", "en-GB", "British English"},
{"en", "en-US", "American English"}, // American English in CLDR 24+
{"ru", "ru", "русский"},
{"ru", "ru-RU", "русский (Россия)"},
{"ru", "ru-Cyrl", "русский (кириллица)"},
{"en", lastLang2zu.String(), "Zulu"},
{"en", firstLang2aa.String(), "Afar"},
{"en", lastLang3zza.String(), "Zaza"},
{"en", firstLang3ace.String(), "Achinese"},
{"en", firstTagAr001.String(), "Modern Standard Arabic"},
{"en", lastTagZhHant.String(), "Traditional Chinese"},
{"en", "aaa", "|Unknown language (aaa)"},
{"en", "zzj", "|Unknown language (zzj)"},
// If full tag doesn't match, try without script or region.
{"en", "aa-Hans", "Afar (Simplified Han)"},
{"en", "af-Arab", "Afrikaans (Arabic)"},
{"en", "zu-Cyrl", "Zulu (Cyrillic)"},
{"en", "aa-GB", "Afar (United Kingdom)"},
{"en", "af-NA", "Afrikaans (Namibia)"},
{"en", "zu-BR", "Zulu (Brazil)"},
// Correct inheritance and language selection.
{"zh", "zh-TW", "中文 (台湾)"},
{"zh", "zh-Hant-TW", "繁体中文 (台湾)"},
{"zh-Hant", "zh-TW", "中文 (台灣)"},
{"zh-Hant", "zh-Hant-TW", "繁體中文 (台灣)"},
// Some rather arbitrary interpretations for Serbian. This is arguably
// correct and consistent with the way zh-[Hant-]TW is handled. It will
// also give results more in line with the expectations if users
// explicitly use "sh".
{"sr-Latn", "sr-ME", "srpski (Crna Gora)"},
{"sr-Latn", "sr-Latn-ME", "srpskohrvatski (Crna Gora)"},
// Double script and region
{"nl", "en-Cyrl-BE", "Engels (Cyrillisch, België)"},
}
for _, tt := range tests {
t.Run(tt.dict+"/"+tt.tag, func(t *testing.T) {
name, fmtName := splitName(tt.name)
dict := language.MustParse(tt.dict)
tag := language.Raw.MustParse(tt.tag)
d := Tags(dict)
if n := d.Name(tag); n != name {
// There are inconsistencies w.r.t. capitalization in the tests
// due to CLDR's update procedure which treats modern and other
// languages differently.
// See https://unicode.org/cldr/trac/ticket/8051.
// TODO: use language capitalization to sanitize the strings.
t.Errorf("Name(%s) = %q; want %q", tag, n, name)
}
p := message.NewPrinter(dict)
if n := p.Sprint(Tag(tag)); n != fmtName {
t.Errorf("Tag(%s) = %q; want %q", tag, n, fmtName)
}
})
}
}
func splitName(names string) (name, formatName string) {
split := strings.Split(names, "|")
name, formatName = split[0], split[0]
if len(split) > 1 {
formatName = split[1]
}
return name, formatName
}
func TestLanguage(t *testing.T) {
tests := []struct {
dict string
tag string
name string
}{
// sr is in Value.Languages(), but is not supported by agq.
{"agq", "sr", "|[language: sr]"},
// CLDR 30 dropped Vlaams as the word for nl-BE. It is still called
// Flemish in English, though. TODO: this is probably incorrect.
// West-Vlaams (vls) is not Vlaams. West-Vlaams could be considered its
// own language, whereas Vlaams is generally Dutch. So expect to have
// to change these tests back.
{"nl", "nl", "Nederlands"},
{"nl", "vls", "West-Vlaams"},
{"nl", "nl-BE", "Nederlands"},
{"en", "pt", "Portuguese"},
{"en", "pt-PT", "European Portuguese"},
{"en", "pt-BR", "Brazilian Portuguese"},
{"en", "en", "English"},
{"en", "en-GB", "British English"},
{"en", "en-US", "American English"}, // American English in CLDR 24+
{"en", lastLang2zu.String(), "Zulu"},
{"en", firstLang2aa.String(), "Afar"},
{"en", lastLang3zza.String(), "Zaza"},
{"en", firstLang3ace.String(), "Achinese"},
{"en", firstTagAr001.String(), "Modern Standard Arabic"},
{"en", lastTagZhHant.String(), "Traditional Chinese"},
{"en", "aaa", "|Unknown language (aaa)"},
{"en", "zzj", "|Unknown language (zzj)"},
// If full tag doesn't match, try without script or region.
{"en", "aa-Hans", "Afar"},
{"en", "af-Arab", "Afrikaans"},
{"en", "zu-Cyrl", "Zulu"},
{"en", "aa-GB", "Afar"},
{"en", "af-NA", "Afrikaans"},
{"en", "zu-BR", "Zulu"},
{"agq", "zh-Hant", "|[language: zh-Hant]"},
{"en", "sh", "Serbo-Croatian"},
{"en", "sr-Latn", "Serbo-Croatian"},
{"en", "sr", "Serbian"},
{"en", "sr-ME", "Serbian"},
{"en", "sr-Latn-ME", "Serbo-Croatian"}, // See comments in TestTag.
}
for _, tt := range tests {
testtext.Run(t, tt.dict+"/"+tt.tag, func(t *testing.T) {
name, fmtName := splitName(tt.name)
dict := language.MustParse(tt.dict)
tag := language.Raw.MustParse(tt.tag)
p := message.NewPrinter(dict)
d := Languages(dict)
if n := d.Name(tag); n != name {
t.Errorf("Name(%v) = %q; want %q", tag, n, name)
}
if n := p.Sprint(Language(tag)); n != fmtName {
t.Errorf("Language(%v) = %q; want %q", tag, n, fmtName)
}
if len(tt.tag) <= 3 {
base := language.MustParseBase(tt.tag)
if n := d.Name(base); n != name {
t.Errorf("Name(%v) = %q; want %q", base, n, name)
}
if n := p.Sprint(Language(base)); n != fmtName {
t.Errorf("Language(%v) = %q; want %q", base, n, fmtName)
}
}
})
}
}
func TestScript(t *testing.T) {
tests := []struct {
dict string
scr string
name string
}{
{"nl", "Arab", "Arabisch"},
{"en", "Arab", "Arabic"},
{"en", "Zzzz", "Unknown Script"},
{"zh-Hant", "Hang", "韓文字"},
{"zh-Hant-HK", "Hang", "韓文字"},
{"zh", "Arab", "阿拉伯文"},
{"zh-Hans-HK", "Arab", "阿拉伯文"}, // same as zh
{"zh-Hant", "Arab", "阿拉伯文"},
{"zh-Hant-HK", "Arab", "阿拉伯文"}, // same as zh
// Canonicalized form
{"en", "Qaai", "Inherited"}, // deprecated script, now is Zinh
{"en", "sh", "Unknown Script"}, // sh canonicalizes to sr-Latn
{"en", "en", "Unknown Script"},
// Don't introduce scripts with canonicalization.
{"en", "sh", "Unknown Script"}, // sh canonicalizes to sr-Latn
}
for _, tt := range tests {
t.Run(tt.dict+"/"+tt.scr, func(t *testing.T) {
name, fmtName := splitName(tt.name)
dict := language.MustParse(tt.dict)
p := message.NewPrinter(dict)
d := Scripts(dict)
var tag language.Tag
if unicode.IsUpper(rune(tt.scr[0])) {
x := language.MustParseScript(tt.scr)
if n := d.Name(x); n != name {
t.Errorf("Name(%v) = %q; want %q", x, n, name)
}
if n := p.Sprint(Script(x)); n != fmtName {
t.Errorf("Script(%v) = %q; want %q", x, n, fmtName)
}
tag, _ = language.Raw.Compose(x)
} else {
tag = language.Raw.MustParse(tt.scr)
}
if n := d.Name(tag); n != name {
t.Errorf("Name(%v) = %q; want %q", tag, n, name)
}
if n := p.Sprint(Script(tag)); n != fmtName {
t.Errorf("Script(%v) = %q; want %q", tag, n, fmtName)
}
})
}
}
func TestRegion(t *testing.T) {
tests := []struct {
dict string
reg string
name string
}{
{"nl", "NL", "Nederland"},
{"en", "US", "United States"},
{"en", "ZZ", "Unknown Region"},
{"en-GB", "NL", "Netherlands"},
// Canonical equivalents
{"en", "UK", "United Kingdom"},
// No region
{"en", "pt", "Unknown Region"},
{"en", "und", "Unknown Region"},
// Don't introduce regions with canonicalization.
{"en", "mo", "Unknown Region"},
}
for _, tt := range tests {
t.Run(tt.dict+"/"+tt.reg, func(t *testing.T) {
dict := language.MustParse(tt.dict)
p := message.NewPrinter(dict)
d := Regions(dict)
var tag language.Tag
if unicode.IsUpper(rune(tt.reg[0])) {
// Region
x := language.MustParseRegion(tt.reg)
if n := d.Name(x); n != tt.name {
t.Errorf("Name(%v) = %q; want %q", x, n, tt.name)
}
if n := p.Sprint(Region(x)); n != tt.name {
t.Errorf("Region(%v) = %q; want %q", x, n, tt.name)
}
tag, _ = language.Raw.Compose(x)
} else {
tag = language.Raw.MustParse(tt.reg)
}
if n := d.Name(tag); n != tt.name {
t.Errorf("Name(%v) = %q; want %q", tag, n, tt.name)
}
if n := p.Sprint(Region(tag)); n != tt.name {
t.Errorf("Region(%v) = %q; want %q", tag, n, tt.name)
}
})
}
}
func TestSelf(t *testing.T) {
tests := []struct {
tag string
name string
}{
{"nl", "Nederlands"},
// CLDR 30 dropped Vlaams as the word for nl-BE. It is still called
// Flemish in English, though. TODO: check if this is a CLDR bug.
// {"nl-BE", "Vlaams"},
{"nl-BE", "Nederlands"},
{"en-GB", "British English"},
{lastLang2zu.String(), "isiZulu"},
{firstLang2aa.String(), ""}, // not defined
{lastLang3zza.String(), ""}, // not defined
{firstLang3ace.String(), ""}, // not defined
{firstTagAr001.String(), "العربية الرسمية الحديثة"},
{"ar", "العربية"},
{lastTagZhHant.String(), "繁體中文"},
{"aaa", ""},
{"zzj", ""},
// Drop entries that are not in the requested script, even if there is
// an entry for the language.
{"aa-Hans", ""},
{"af-Arab", ""},
{"zu-Cyrl", ""},
// Append the country name in the language of the matching language.
{"af-NA", "Afrikaans"},
{"zh", "中文"},
// zh-TW should match zh-Hant instead of zh!
{"zh-TW", "繁體中文"},
{"zh-Hant", "繁體中文"},
{"zh-Hans", "简体中文"},
{"zh-Hant-TW", "繁體中文"},
{"zh-Hans-TW", "简体中文"},
// Take the entry for sr which has the matching script.
// TODO: Capitalization changed as of CLDR 26, but change seems
// arbitrary. Revisit capitalization with revision 27. See
// https://unicode.org/cldr/trac/ticket/8051.
{"sr", "српски"},
// TODO: sr-ME should show up as Serbian or Montenegrin, not Serbo-
// Croatian. This is an artifact of the current algorithm, which is the
// way it is to have the preferred behavior for other languages such as
// Chinese. We can hardwire this case in the table generator or package
// code, but we first check if CLDR can be updated.
// {"sr-ME", "Srpski"}, // Is Srpskohrvatski
{"sr-Latn-ME", "srpskohrvatski"},
{"sr-Cyrl-ME", "српски"},
{"sr-NL", "српски"},
// NOTE: kk is defined, but in Cyrillic script. For China, Arab is the
// dominant script. We do not have data for kk-Arab and we chose to not
// fall back in such cases.
{"kk-CN", ""},
}
for i, tt := range tests {
d := Self
if n := d.Name(language.Raw.MustParse(tt.tag)); n != tt.name {
t.Errorf("%d:%s: was %q; want %q", i, tt.tag, n, tt.name)
}
}
}
func TestEquivalence(t *testing.T) {
testCases := []struct {
desc string
namer Namer
}{
{"Self", Self},
{"Tags", Tags(language.Romanian)},
{"Languages", Languages(language.Romanian)},
{"Scripts", Scripts(language.Romanian)},
}
for _, tc := range testCases {
t.Run(tc.desc, func(t *testing.T) {
ro := tc.namer.Name(language.Raw.MustParse("ro-MD"))
mo := tc.namer.Name(language.Raw.MustParse("mo"))
if ro != mo {
t.Errorf("%q != %q", ro, mo)
}
})
}
}
func TestDictionaryLang(t *testing.T) {
tests := []struct {
d *Dictionary
tag string
name string
}{
{English, "en", "English"},
{Portuguese, "af", "africâner"},
{EuropeanPortuguese, "af", "africanês"},
{English, "nl-BE", "Flemish"},
}
for i, test := range tests {
tag := language.MustParse(test.tag)
if got := test.d.Tags().Name(tag); got != test.name {
t.Errorf("%d:%v: got %s; want %s", i, tag, got, test.name)
}
if base, _ := language.Compose(tag.Base()); base == tag {
if got := test.d.Languages().Name(base); got != test.name {
t.Errorf("%d:%v: got %s; want %s", i, tag, got, test.name)
}
}
}
}
func TestDictionaryRegion(t *testing.T) {
tests := []struct {
d *Dictionary
region string
name string
}{
{English, "FR", "France"},
{Portuguese, "009", "Oceania"},
{EuropeanPortuguese, "009", "Oceânia"},
}
for i, test := range tests {
tag := language.MustParseRegion(test.region)
if got := test.d.Regions().Name(tag); got != test.name {
t.Errorf("%d:%v: got %s; want %s", i, tag, got, test.name)
}
}
}
func TestDictionaryScript(t *testing.T) {
tests := []struct {
d *Dictionary
script string
name string
}{
{English, "Cyrl", "Cyrillic"},
{EuropeanPortuguese, "Gujr", "guzerate"},
}
for i, test := range tests {
tag := language.MustParseScript(test.script)
if got := test.d.Scripts().Name(tag); got != test.name {
t.Errorf("%d:%v: got %s; want %s", i, tag, got, test.name)
}
}
}
@@ -0,0 +1,116 @@
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package display_test
import (
"fmt"
"golang.org/x/text/language"
"golang.org/x/text/language/display"
"golang.org/x/text/message"
)
func ExampleFormatter() {
message.SetString(language.Dutch, "In %v people speak %v.", "In %v spreekt men %v.")
fr := language.French
region, _ := fr.Region()
for _, tag := range []string{"en", "nl"} {
p := message.NewPrinter(language.Make(tag))
p.Printf("In %v people speak %v.", display.Region(region), display.Language(fr))
p.Println()
}
// Output:
// In France people speak French.
// In Frankrijk spreekt men Frans.
}
func ExampleNamer() {
supported := []string{
"en-US", "en-GB", "ja", "zh", "zh-Hans", "zh-Hant", "pt", "pt-PT", "ko", "ar", "el", "ru", "uk", "pa",
}
en := display.English.Languages()
for _, s := range supported {
t := language.MustParse(s)
fmt.Printf("%-20s (%s)\n", en.Name(t), display.Self.Name(t))
}
// Output:
// American English (American English)
// British English (British English)
// Japanese (日本語)
// Chinese (中文)
// Simplified Chinese (简体中文)
// Traditional Chinese (繁體中文)
// Portuguese (português)
// European Portuguese (português europeu)
// Korean (한국어)
// Arabic (العربية)
// Greek (Ελληνικά)
// Russian (русский)
// Ukrainian (українська)
// Punjabi (ਪੰਜਾਬੀ)
}
func ExampleTags() {
n := display.Tags(language.English)
fmt.Println(n.Name(language.Make("nl")))
fmt.Println(n.Name(language.Make("nl-BE")))
fmt.Println(n.Name(language.Make("nl-CW")))
fmt.Println(n.Name(language.Make("nl-Arab")))
fmt.Println(n.Name(language.Make("nl-Cyrl-RU")))
// Output:
// Dutch
// Flemish
// Dutch (Curaçao)
// Dutch (Arabic)
// Dutch (Cyrillic, Russia)
}
// ExampleDictionary shows how to reduce the amount of data linked into your
// binary by only using the predefined Dictionary variables of the languages you
// wish to support.
func ExampleDictionary() {
tags := []language.Tag{
language.English,
language.German,
language.Japanese,
language.Russian,
}
dicts := []*display.Dictionary{
display.English,
display.German,
display.Japanese,
display.Russian,
}
m := language.NewMatcher(tags)
getDict := func(t language.Tag) *display.Dictionary {
_, i, confidence := m.Match(t)
// Skip this check if you want to support a fall-back language, which
// will be the first one passed to NewMatcher.
if confidence == language.No {
return nil
}
return dicts[i]
}
// The matcher will match Swiss German to German.
n := getDict(language.Make("gsw")).Languages()
fmt.Println(n.Name(language.German))
fmt.Println(n.Name(language.Make("de-CH")))
fmt.Println(n.Name(language.Make("gsw")))
// Output:
// Deutsch
// Schweizer Hochdeutsch
// Schweizerdeutsch
}
@@ -0,0 +1,253 @@
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package display
// This file contains common lookup code that is shared between the various
// implementations of Namer and Dictionaries.
import (
"fmt"
"sort"
"strings"
"golang.org/x/text/language"
)
type namer interface {
// name gets the string for the given index. It should walk the
// inheritance chain if a value is not present in the base index.
name(idx int) string
}
func nameLanguage(n namer, x interface{}) string {
t, _ := language.All.Compose(x)
for {
i, _, _ := langTagSet.index(t.Raw())
if s := n.name(i); s != "" {
return s
}
if t = t.Parent(); t == language.Und {
return ""
}
}
}
func nameScript(n namer, x interface{}) string {
t, _ := language.DeprecatedScript.Compose(x)
_, s, _ := t.Raw()
return n.name(scriptIndex.index(s.String()))
}
func nameRegion(n namer, x interface{}) string {
t, _ := language.DeprecatedRegion.Compose(x)
_, _, r := t.Raw()
return n.name(regionIndex.index(r.String()))
}
func nameTag(langN, scrN, regN namer, x interface{}) string {
t, ok := x.(language.Tag)
if !ok {
return ""
}
const form = language.All &^ language.SuppressScript
if c, err := form.Canonicalize(t); err == nil {
t = c
}
_, sRaw, rRaw := t.Raw()
i, scr, reg := langTagSet.index(t.Raw())
for i != -1 {
if str := langN.name(i); str != "" {
if hasS, hasR := (scr != language.Script{}), (reg != language.Region{}); hasS || hasR {
ss, sr := "", ""
if hasS {
ss = scrN.name(scriptIndex.index(scr.String()))
}
if hasR {
sr = regN.name(regionIndex.index(reg.String()))
}
// TODO: use patterns in CLDR or at least confirm they are the
// same for all languages.
if ss != "" && sr != "" {
return fmt.Sprintf("%s (%s, %s)", str, ss, sr)
}
if ss != "" || sr != "" {
return fmt.Sprintf("%s (%s%s)", str, ss, sr)
}
}
return str
}
scr, reg = sRaw, rRaw
if t = t.Parent(); t == language.Und {
return ""
}
i, _, _ = langTagSet.index(t.Raw())
}
return ""
}
// header contains the data and indexes for a single namer.
// data contains a series of strings concatenated into one. index contains the
// offsets for a string in data. For example, consider a header that defines
// strings for the languages de, el, en, fi, and nl:
//
// header{
// data: "GermanGreekEnglishDutch",
// index: []uint16{0, 6, 11, 18, 18, 23},
// }
//
// For a language with index i, the string is defined by
// data[index[i]:index[i+1]]. So the number of elements in index is always one
// greater than the number of languages for which header defines a value.
// A string for a language may be empty, which means the name is undefined. In
// the above example, the name for fi (Finnish) is undefined.
type header struct {
data string
index []uint16
}
// name looks up the name for a tag in the dictionary, given its index.
func (h *header) name(i int) string {
if 0 <= i && i < len(h.index)-1 {
return h.data[h.index[i]:h.index[i+1]]
}
return ""
}
// tagSet is used to find the index of a language in a set of tags.
type tagSet struct {
single tagIndex
long []string
}
var (
langTagSet = tagSet{
single: langIndex,
long: langTagsLong,
}
// selfTagSet is used for indexing the language strings in their own
// language.
selfTagSet = tagSet{
single: selfIndex,
long: selfTagsLong,
}
zzzz = language.MustParseScript("Zzzz")
zz = language.MustParseRegion("ZZ")
)
// index returns the index of the tag for the given base, script and region or
// its parent if the tag is not available. If the match is for a parent entry,
// the excess script and region are returned.
func (ts *tagSet) index(base language.Base, scr language.Script, reg language.Region) (int, language.Script, language.Region) {
lang := base.String()
index := -1
if (scr != language.Script{} || reg != language.Region{}) {
if scr == zzzz {
scr = language.Script{}
}
if reg == zz {
reg = language.Region{}
}
i := sort.SearchStrings(ts.long, lang)
// All entries have either a script or a region and not both.
scrStr, regStr := scr.String(), reg.String()
for ; i < len(ts.long) && strings.HasPrefix(ts.long[i], lang); i++ {
if s := ts.long[i][len(lang)+1:]; s == scrStr {
scr = language.Script{}
index = i + ts.single.len()
break
} else if s == regStr {
reg = language.Region{}
index = i + ts.single.len()
break
}
}
}
if index == -1 {
index = ts.single.index(lang)
}
return index, scr, reg
}
func (ts *tagSet) Tags() []language.Tag {
tags := make([]language.Tag, 0, ts.single.len()+len(ts.long))
ts.single.keys(func(s string) {
tags = append(tags, language.Raw.MustParse(s))
})
for _, s := range ts.long {
tags = append(tags, language.Raw.MustParse(s))
}
return tags
}
func supportedScripts() []language.Script {
scr := make([]language.Script, 0, scriptIndex.len())
scriptIndex.keys(func(s string) {
scr = append(scr, language.MustParseScript(s))
})
return scr
}
func supportedRegions() []language.Region {
reg := make([]language.Region, 0, regionIndex.len())
regionIndex.keys(func(s string) {
reg = append(reg, language.MustParseRegion(s))
})
return reg
}
// tagIndex holds a concatenated lists of subtags of length 2 to 4, one string
// for each length, which can be used in combination with binary search to get
// the index associated with a tag.
// For example, a tagIndex{
//
// "arenesfrruzh", // 6 2-byte tags.
// "barwae", // 2 3-byte tags.
// "",
//
// }
// would mean that the 2-byte tag "fr" had an index of 3, and the 3-byte tag
// "wae" had an index of 7.
type tagIndex [3]string
func (t *tagIndex) index(s string) int {
sz := len(s)
if sz < 2 || 4 < sz {
return -1
}
a := t[sz-2]
index := sort.Search(len(a)/sz, func(i int) bool {
p := i * sz
return a[p:p+sz] >= s
})
p := index * sz
if end := p + sz; end > len(a) || a[p:end] != s {
return -1
}
// Add the number of tags for smaller sizes.
for i := 0; i < sz-2; i++ {
index += len(t[i]) / (i + 2)
}
return index
}
// len returns the number of tags that are contained in the tagIndex.
func (t *tagIndex) len() (n int) {
for i, s := range t {
n += len(s) / (i + 2)
}
return n
}
// keys calls f for each tag.
func (t *tagIndex) keys(f func(key string)) {
for i, s := range *t {
for ; s != ""; s = s[i+2:] {
f(s[:i+2])
}
}
}
@@ -0,0 +1,602 @@
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build ignore
// Generator for display name tables.
package main
import (
"bytes"
"flag"
"fmt"
"log"
"reflect"
"sort"
"strings"
"golang.org/x/text/internal/gen"
"golang.org/x/text/language"
"golang.org/x/text/unicode/cldr"
)
var (
test = flag.Bool("test", false,
"test existing tables; can be used to compare web data with package data.")
outputFile = flag.String("output", "tables.go", "output file")
stats = flag.Bool("stats", false, "prints statistics to stderr")
short = flag.Bool("short", false, `Use "short" alternatives, when available.`)
draft = flag.String("draft",
"contributed",
`Minimal draft requirements (approved, contributed, provisional, unconfirmed).`)
pkg = flag.String("package",
"display",
"the name of the package in which the generated file is to be included")
tags = newTagSet("tags",
[]language.Tag{},
"space-separated list of tags to include or empty for all")
dict = newTagSet("dict",
dictTags(),
"space-separated list or tags for which to include a Dictionary. "+
`"" means the common list from go.text/language.`)
)
func dictTags() (tag []language.Tag) {
// TODO: replace with language.Common.Tags() once supported.
const str = "af am ar ar-001 az bg bn ca cs da de el en en-US en-GB " +
"es es-ES es-419 et fa fi fil fr fr-CA gu he hi hr hu hy id is it ja " +
"ka kk km kn ko ky lo lt lv mk ml mn mr ms my ne nl no pa pl pt pt-BR " +
"pt-PT ro ru si sk sl sq sr sr-Latn sv sw ta te th tr uk ur uz vi " +
"zh zh-Hans zh-Hant zu"
for _, s := range strings.Split(str, " ") {
tag = append(tag, language.MustParse(s))
}
return tag
}
func main() {
gen.Init()
// Read the CLDR zip file.
r := gen.OpenCLDRCoreZip()
defer r.Close()
d := &cldr.Decoder{}
d.SetDirFilter("main", "supplemental")
d.SetSectionFilter("localeDisplayNames")
data, err := d.DecodeZip(r)
if err != nil {
log.Fatalf("DecodeZip: %v", err)
}
w := gen.NewCodeWriter()
defer w.WriteGoFile(*outputFile, "display")
gen.WriteCLDRVersion(w)
b := builder{
w: w,
data: data,
group: make(map[string]*group),
}
b.generate()
}
const tagForm = language.All
// tagSet is used to parse command line flags of tags. It implements the
// flag.Value interface.
type tagSet map[language.Tag]bool
func newTagSet(name string, tags []language.Tag, usage string) tagSet {
f := tagSet(make(map[language.Tag]bool))
for _, t := range tags {
f[t] = true
}
flag.Var(f, name, usage)
return f
}
// String implements the String method of the flag.Value interface.
func (f tagSet) String() string {
tags := []string{}
for t := range f {
tags = append(tags, t.String())
}
sort.Strings(tags)
return strings.Join(tags, " ")
}
// Set implements Set from the flag.Value interface.
func (f tagSet) Set(s string) error {
if s != "" {
for _, s := range strings.Split(s, " ") {
if s != "" {
tag, err := tagForm.Parse(s)
if err != nil {
return err
}
f[tag] = true
}
}
}
return nil
}
func (f tagSet) contains(t language.Tag) bool {
if len(f) == 0 {
return true
}
return f[t]
}
// builder is used to create all tables with display name information.
type builder struct {
w *gen.CodeWriter
data *cldr.CLDR
fromLocs []string
// destination tags for the current locale.
toTags []string
toTagIndex map[string]int
// list of supported tags
supported []language.Tag
// key-value pairs per group
group map[string]*group
// statistics
sizeIndex int // total size of all indexes of headers
sizeData int // total size of all data of headers
totalSize int
}
type group struct {
// Maps from a given language to the Namer data for this language.
lang map[language.Tag]keyValues
headers []header
toTags []string
threeStart int
fourPlusStart int
}
// set sets the typ to the name for locale loc.
func (g *group) set(t language.Tag, typ, name string) {
kv := g.lang[t]
if kv == nil {
kv = make(keyValues)
g.lang[t] = kv
}
if kv[typ] == "" {
kv[typ] = name
}
}
type keyValues map[string]string
type header struct {
tag language.Tag
data string
index []uint16
}
var versionInfo = `// Version is deprecated. Use CLDRVersion.
const Version = %#v
`
var self = language.MustParse("mul")
// generate builds and writes all tables.
func (b *builder) generate() {
fmt.Fprintf(b.w, versionInfo, cldr.Version)
b.filter()
b.setData("lang", func(g *group, loc language.Tag, ldn *cldr.LocaleDisplayNames) {
if ldn.Languages != nil {
for _, v := range ldn.Languages.Language {
lang := v.Type
if lang == "root" {
// We prefer the data from "und"
// TODO: allow both the data for root and und somehow.
continue
}
tag := tagForm.MustParse(lang)
if tags.contains(tag) {
g.set(loc, tag.String(), v.Data())
}
}
}
})
b.setData("script", func(g *group, loc language.Tag, ldn *cldr.LocaleDisplayNames) {
if ldn.Scripts != nil {
for _, v := range ldn.Scripts.Script {
code := language.MustParseScript(v.Type)
if code.IsPrivateUse() { // Qaaa..Qabx
// TODO: data currently appears to be very meager.
// Reconsider if we have data for English.
if loc == language.English {
log.Fatal("Consider including data for private use scripts.")
}
continue
}
g.set(loc, code.String(), v.Data())
}
}
})
b.setData("region", func(g *group, loc language.Tag, ldn *cldr.LocaleDisplayNames) {
if ldn.Territories != nil {
for _, v := range ldn.Territories.Territory {
g.set(loc, language.MustParseRegion(v.Type).String(), v.Data())
}
}
})
b.makeSupported()
b.writeParents()
b.writeGroup("lang")
b.writeGroup("script")
b.writeGroup("region")
b.w.WriteConst("numSupported", len(b.supported))
buf := bytes.Buffer{}
for _, tag := range b.supported {
fmt.Fprint(&buf, tag.String(), "|")
}
b.w.WriteConst("supported", buf.String())
b.writeDictionaries()
b.supported = []language.Tag{self}
// Compute the names of locales in their own language. Some of these names
// may be specified in their parent locales. We iterate the maximum depth
// of the parent three times to match successive parents of tags until a
// possible match is found.
for i := 0; i < 4; i++ {
b.setData("self", func(g *group, tag language.Tag, ldn *cldr.LocaleDisplayNames) {
parent := tag
if b, s, r := tag.Raw(); i > 0 && (s != language.Script{} && r == language.Region{}) {
parent, _ = language.Raw.Compose(b)
}
if ldn.Languages != nil {
for _, v := range ldn.Languages.Language {
key := tagForm.MustParse(v.Type)
saved := key
if key == parent {
g.set(self, tag.String(), v.Data())
}
for k := 0; k < i; k++ {
key = key.Parent()
}
if key == tag {
g.set(self, saved.String(), v.Data()) // set does not overwrite a value.
}
}
}
})
}
b.writeGroup("self")
}
func (b *builder) setData(name string, f func(*group, language.Tag, *cldr.LocaleDisplayNames)) {
b.sizeIndex = 0
b.sizeData = 0
b.toTags = nil
b.fromLocs = nil
b.toTagIndex = make(map[string]int)
g := b.group[name]
if g == nil {
g = &group{lang: make(map[language.Tag]keyValues)}
b.group[name] = g
}
for _, loc := range b.data.Locales() {
// We use RawLDML instead of LDML as we are managing our own inheritance
// in this implementation.
ldml := b.data.RawLDML(loc)
// We do not support the POSIX variant (it is not a supported BCP 47
// variant). This locale also doesn't happen to contain any data, so
// we'll skip it by checking for this.
tag, err := tagForm.Parse(loc)
if err != nil {
if ldml.LocaleDisplayNames != nil {
log.Fatalf("setData: %v", err)
}
continue
}
if ldml.LocaleDisplayNames != nil && tags.contains(tag) {
f(g, tag, ldml.LocaleDisplayNames)
}
}
}
func (b *builder) filter() {
filter := func(s *cldr.Slice) {
if *short {
s.SelectOnePerGroup("alt", []string{"short", ""})
} else {
s.SelectOnePerGroup("alt", []string{"stand-alone", ""})
}
d, err := cldr.ParseDraft(*draft)
if err != nil {
log.Fatalf("filter: %v", err)
}
s.SelectDraft(d)
}
for _, loc := range b.data.Locales() {
if ldn := b.data.RawLDML(loc).LocaleDisplayNames; ldn != nil {
if ldn.Languages != nil {
s := cldr.MakeSlice(&ldn.Languages.Language)
if filter(&s); len(ldn.Languages.Language) == 0 {
ldn.Languages = nil
}
}
if ldn.Scripts != nil {
s := cldr.MakeSlice(&ldn.Scripts.Script)
if filter(&s); len(ldn.Scripts.Script) == 0 {
ldn.Scripts = nil
}
}
if ldn.Territories != nil {
s := cldr.MakeSlice(&ldn.Territories.Territory)
if filter(&s); len(ldn.Territories.Territory) == 0 {
ldn.Territories = nil
}
}
}
}
}
// makeSupported creates a list of all supported locales.
func (b *builder) makeSupported() {
// tags across groups
for _, g := range b.group {
for t, _ := range g.lang {
b.supported = append(b.supported, t)
}
}
b.supported = b.supported[:unique(tagsSorter(b.supported))]
}
type tagsSorter []language.Tag
func (a tagsSorter) Len() int { return len(a) }
func (a tagsSorter) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
func (a tagsSorter) Less(i, j int) bool { return a[i].String() < a[j].String() }
func (b *builder) writeGroup(name string) {
g := b.group[name]
for _, kv := range g.lang {
for t, _ := range kv {
g.toTags = append(g.toTags, t)
}
}
g.toTags = g.toTags[:unique(tagsBySize(g.toTags))]
// Allocate header per supported value.
g.headers = make([]header, len(b.supported))
for i, sup := range b.supported {
kv, ok := g.lang[sup]
if !ok {
g.headers[i].tag = sup
continue
}
data := []byte{}
index := make([]uint16, len(g.toTags), len(g.toTags)+1)
for j, t := range g.toTags {
index[j] = uint16(len(data))
data = append(data, kv[t]...)
}
index = append(index, uint16(len(data)))
// Trim the tail of the index.
// TODO: indexes can be reduced in size quite a bit more.
n := len(index)
for ; n >= 2 && index[n-2] == index[n-1]; n-- {
}
index = index[:n]
// Workaround for a bug in CLDR 26.
// See https://unicode.org/cldr/trac/ticket/8042.
if cldr.Version == "26" && sup.String() == "hsb" {
data = bytes.Replace(data, []byte{'"'}, nil, 1)
}
g.headers[i] = header{sup, string(data), index}
}
g.writeTable(b.w, name)
}
type tagsBySize []string
func (l tagsBySize) Len() int { return len(l) }
func (l tagsBySize) Swap(i, j int) { l[i], l[j] = l[j], l[i] }
func (l tagsBySize) Less(i, j int) bool {
a, b := l[i], l[j]
// Sort single-tag entries based on size first. Otherwise alphabetic.
if len(a) != len(b) && (len(a) <= 4 || len(b) <= 4) {
return len(a) < len(b)
}
return a < b
}
// parentIndices returns slice a of len(tags) where tags[a[i]] is the parent
// of tags[i].
func parentIndices(tags []language.Tag) []int16 {
index := make(map[language.Tag]int16)
for i, t := range tags {
index[t] = int16(i)
}
// Construct default parents.
parents := make([]int16, len(tags))
for i, t := range tags {
parents[i] = -1
for t = t.Parent(); t != language.Und; t = t.Parent() {
if j, ok := index[t]; ok {
parents[i] = j
break
}
}
}
return parents
}
func (b *builder) writeParents() {
parents := parentIndices(b.supported)
fmt.Fprintf(b.w, "var parents = ")
b.w.WriteArray(parents)
}
// writeKeys writes keys to a special index used by the display package.
// tags are assumed to be sorted by length.
func writeKeys(w *gen.CodeWriter, name string, keys []string) {
w.Size += int(3 * reflect.TypeOf("").Size())
w.WriteComment("Number of keys: %d", len(keys))
fmt.Fprintf(w, "var (\n\t%sIndex = tagIndex{\n", name)
for i := 2; i <= 4; i++ {
sub := []string{}
for _, t := range keys {
if len(t) != i {
break
}
sub = append(sub, t)
}
s := strings.Join(sub, "")
w.WriteString(s)
fmt.Fprintf(w, ",\n")
keys = keys[len(sub):]
}
fmt.Fprintln(w, "\t}")
if len(keys) > 0 {
w.Size += int(reflect.TypeOf([]string{}).Size())
fmt.Fprintf(w, "\t%sTagsLong = ", name)
w.WriteSlice(keys)
}
fmt.Fprintln(w, ")\n")
}
// identifier creates an identifier from the given tag.
func identifier(t language.Tag) string {
return strings.Replace(t.String(), "-", "", -1)
}
func (h *header) writeEntry(w *gen.CodeWriter, name string) {
if len(dict) > 0 && dict.contains(h.tag) {
fmt.Fprintf(w, "\t{ // %s\n", h.tag)
fmt.Fprintf(w, "\t\t%[1]s%[2]sStr,\n\t\t%[1]s%[2]sIdx,\n", identifier(h.tag), name)
fmt.Fprintln(w, "\t},")
} else if len(h.data) == 0 {
fmt.Fprintln(w, "\t\t{}, //", h.tag)
} else {
fmt.Fprintf(w, "\t{ // %s\n", h.tag)
w.WriteString(h.data)
fmt.Fprintln(w, ",")
w.WriteSlice(h.index)
fmt.Fprintln(w, ",\n\t},")
}
}
// write the data for the given header as single entries. The size for this data
// was already accounted for in writeEntry.
func (h *header) writeSingle(w *gen.CodeWriter, name string) {
if len(dict) > 0 && dict.contains(h.tag) {
tag := identifier(h.tag)
w.WriteConst(tag+name+"Str", h.data)
// Note that we create a slice instead of an array. If we use an array
// we need to refer to it as a[:] in other tables, which will cause the
// array to always be included by the linker. See Issue 7651.
w.WriteVar(tag+name+"Idx", h.index)
}
}
// writeTable writes an entry for a single Namer.
func (g *group) writeTable(w *gen.CodeWriter, name string) {
start := w.Size
writeKeys(w, name, g.toTags)
w.Size += len(g.headers) * int(reflect.ValueOf(g.headers[0]).Type().Size())
fmt.Fprintf(w, "var %sHeaders = [%d]header{\n", name, len(g.headers))
title := strings.Title(name)
for _, h := range g.headers {
h.writeEntry(w, title)
}
fmt.Fprintln(w, "}\n")
for _, h := range g.headers {
h.writeSingle(w, title)
}
n := w.Size - start
fmt.Fprintf(w, "// Total size for %s: %d bytes (%d KB)\n\n", name, n, n/1000)
}
func (b *builder) writeDictionaries() {
fmt.Fprintln(b.w, "// Dictionary entries of frequent languages")
fmt.Fprintln(b.w, "var (")
parents := parentIndices(b.supported)
for i, t := range b.supported {
if dict.contains(t) {
ident := identifier(t)
fmt.Fprintf(b.w, "\t%s = Dictionary{ // %s\n", ident, t)
if p := parents[i]; p == -1 {
fmt.Fprintln(b.w, "\t\tnil,")
} else {
fmt.Fprintf(b.w, "\t\t&%s,\n", identifier(b.supported[p]))
}
fmt.Fprintf(b.w, "\t\theader{%[1]sLangStr, %[1]sLangIdx},\n", ident)
fmt.Fprintf(b.w, "\t\theader{%[1]sScriptStr, %[1]sScriptIdx},\n", ident)
fmt.Fprintf(b.w, "\t\theader{%[1]sRegionStr, %[1]sRegionIdx},\n", ident)
fmt.Fprintln(b.w, "\t}")
}
}
fmt.Fprintln(b.w, ")")
var s string
var a []uint16
sz := reflect.TypeOf(s).Size()
sz += reflect.TypeOf(a).Size()
sz *= 3
sz += reflect.TypeOf(&a).Size()
n := int(sz) * len(dict)
fmt.Fprintf(b.w, "// Total size for %d entries: %d bytes (%d KB)\n\n", len(dict), n, n/1000)
b.w.Size += n
}
// unique sorts the given lists and removes duplicate entries by swapping them
// past position k, where k is the number of unique values. It returns k.
func unique(a sort.Interface) int {
if a.Len() == 0 {
return 0
}
sort.Sort(a)
k := 1
for i := 1; i < a.Len(); i++ {
if a.Less(k-1, i) {
if k != i {
a.Swap(k, i)
}
k++
}
}
return k
}
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,98 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package language implements BCP 47 language tags and related functionality.
//
// The most important function of package language is to match a list of
// user-preferred languages to a list of supported languages.
// It alleviates the developer of dealing with the complexity of this process
// and provides the user with the best experience
// (see https://blog.golang.org/matchlang).
//
// # Matching preferred against supported languages
//
// A Matcher for an application that supports English, Australian English,
// Danish, and standard Mandarin can be created as follows:
//
// var matcher = language.NewMatcher([]language.Tag{
// language.English, // The first language is used as fallback.
// language.MustParse("en-AU"),
// language.Danish,
// language.Chinese,
// })
//
// This list of supported languages is typically implied by the languages for
// which there exists translations of the user interface.
//
// User-preferred languages usually come as a comma-separated list of BCP 47
// language tags.
// The MatchString finds best matches for such strings:
//
// handler(w http.ResponseWriter, r *http.Request) {
// lang, _ := r.Cookie("lang")
// accept := r.Header.Get("Accept-Language")
// tag, _ := language.MatchStrings(matcher, lang.String(), accept)
//
// // tag should now be used for the initialization of any
// // locale-specific service.
// }
//
// The Matcher's Match method can be used to match Tags directly.
//
// Matchers are aware of the intricacies of equivalence between languages, such
// as deprecated subtags, legacy tags, macro languages, mutual
// intelligibility between scripts and languages, and transparently passing
// BCP 47 user configuration.
// For instance, it will know that a reader of Bokmål Danish can read Norwegian
// and will know that Cantonese ("yue") is a good match for "zh-HK".
//
// # Using match results
//
// To guarantee a consistent user experience to the user it is important to
// use the same language tag for the selection of any locale-specific services.
// For example, it is utterly confusing to substitute spelled-out numbers
// or dates in one language in text of another language.
// More subtly confusing is using the wrong sorting order or casing
// algorithm for a certain language.
//
// All the packages in x/text that provide locale-specific services
// (e.g. collate, cases) should be initialized with the tag that was
// obtained at the start of an interaction with the user.
//
// Note that Tag that is returned by Match and MatchString may differ from any
// of the supported languages, as it may contain carried over settings from
// the user tags.
// This may be inconvenient when your application has some additional
// locale-specific data for your supported languages.
// Match and MatchString both return the index of the matched supported tag
// to simplify associating such data with the matched tag.
//
// # Canonicalization
//
// If one uses the Matcher to compare languages one does not need to
// worry about canonicalization.
//
// The meaning of a Tag varies per application. The language package
// therefore delays canonicalization and preserves information as much
// as possible. The Matcher, however, will always take into account that
// two different tags may represent the same language.
//
// By default, only legacy and deprecated tags are converted into their
// canonical equivalent. All other information is preserved. This approach makes
// the confidence scores more accurate and allows matchers to distinguish
// between variants that are otherwise lost.
//
// As a consequence, two tags that should be treated as identical according to
// BCP 47 or CLDR, like "en-Latn" and "en", will be represented differently. The
// Matcher handles such distinctions, though, and is aware of the
// equivalence relations. The CanonType type can be used to alter the
// canonicalization form.
//
// # References
//
// BCP 47 - Tags for Identifying Languages http://tools.ietf.org/html/bcp47
package language // import "golang.org/x/text/language"
// TODO: explanation on how to match languages for your own locale-specific
// service.
@@ -0,0 +1,411 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package language_test
import (
"fmt"
"net/http"
"golang.org/x/text/language"
)
func ExampleCanonType() {
p := func(id string) {
fmt.Printf("Default(%s) -> %s\n", id, language.Make(id))
fmt.Printf("BCP47(%s) -> %s\n", id, language.BCP47.Make(id))
fmt.Printf("Macro(%s) -> %s\n", id, language.Macro.Make(id))
fmt.Printf("All(%s) -> %s\n", id, language.All.Make(id))
}
p("en-Latn")
p("sh")
p("zh-cmn")
p("bjd")
p("iw-Latn-fonipa-u-cu-usd")
// Output:
// Default(en-Latn) -> en-Latn
// BCP47(en-Latn) -> en
// Macro(en-Latn) -> en-Latn
// All(en-Latn) -> en
// Default(sh) -> sr-Latn
// BCP47(sh) -> sh
// Macro(sh) -> sh
// All(sh) -> sr-Latn
// Default(zh-cmn) -> cmn
// BCP47(zh-cmn) -> cmn
// Macro(zh-cmn) -> zh
// All(zh-cmn) -> zh
// Default(bjd) -> drl
// BCP47(bjd) -> drl
// Macro(bjd) -> bjd
// All(bjd) -> drl
// Default(iw-Latn-fonipa-u-cu-usd) -> he-Latn-fonipa-u-cu-usd
// BCP47(iw-Latn-fonipa-u-cu-usd) -> he-Latn-fonipa-u-cu-usd
// Macro(iw-Latn-fonipa-u-cu-usd) -> iw-Latn-fonipa-u-cu-usd
// All(iw-Latn-fonipa-u-cu-usd) -> he-Latn-fonipa-u-cu-usd
}
func ExampleTag_Base() {
fmt.Println(language.Make("und").Base())
fmt.Println(language.Make("und-US").Base())
fmt.Println(language.Make("und-NL").Base())
fmt.Println(language.Make("und-419").Base()) // Latin America
fmt.Println(language.Make("und-ZZ").Base())
// Output:
// en Low
// en High
// nl High
// es Low
// en Low
}
func ExampleTag_Script() {
en := language.Make("en")
sr := language.Make("sr")
sr_Latn := language.Make("sr_Latn")
fmt.Println(en.Script())
fmt.Println(sr.Script())
// Was a script explicitly specified?
_, c := sr.Script()
fmt.Println(c == language.Exact)
_, c = sr_Latn.Script()
fmt.Println(c == language.Exact)
// Output:
// Latn High
// Cyrl Low
// false
// true
}
func ExampleTag_Region() {
ru := language.Make("ru")
en := language.Make("en")
fmt.Println(ru.Region())
fmt.Println(en.Region())
// Output:
// RU Low
// US Low
}
func ExampleRegion_TLD() {
us := language.MustParseRegion("US")
gb := language.MustParseRegion("GB")
uk := language.MustParseRegion("UK")
bu := language.MustParseRegion("BU")
fmt.Println(us.TLD())
fmt.Println(gb.TLD())
fmt.Println(uk.TLD())
fmt.Println(bu.TLD())
fmt.Println(us.Canonicalize().TLD())
fmt.Println(gb.Canonicalize().TLD())
fmt.Println(uk.Canonicalize().TLD())
fmt.Println(bu.Canonicalize().TLD())
// Output:
// US <nil>
// UK <nil>
// UK <nil>
// ZZ language: region is not a valid ccTLD
// US <nil>
// UK <nil>
// UK <nil>
// MM <nil>
}
func ExampleCompose() {
nl, _ := language.ParseBase("nl")
us, _ := language.ParseRegion("US")
de := language.Make("de-1901-u-co-phonebk")
jp := language.Make("ja-JP")
fi := language.Make("fi-x-ing")
u, _ := language.ParseExtension("u-nu-arabic")
x, _ := language.ParseExtension("x-piglatin")
// Combine a base language and region.
fmt.Println(language.Compose(nl, us))
// Combine a base language and extension.
fmt.Println(language.Compose(nl, x))
// Replace the region.
fmt.Println(language.Compose(jp, us))
// Combine several tags.
fmt.Println(language.Compose(us, nl, u))
// Replace the base language of a tag.
fmt.Println(language.Compose(de, nl))
fmt.Println(language.Compose(de, nl, u))
// Remove the base language.
fmt.Println(language.Compose(de, language.Base{}))
// Remove all variants.
fmt.Println(language.Compose(de, []language.Variant{}))
// Remove all extensions.
fmt.Println(language.Compose(de, []language.Extension{}))
fmt.Println(language.Compose(fi, []language.Extension{}))
// Remove all variants and extensions.
fmt.Println(language.Compose(de.Raw()))
// An error is gobbled or returned if non-nil.
fmt.Println(language.Compose(language.ParseRegion("ZA")))
fmt.Println(language.Compose(language.ParseRegion("HH")))
// Compose uses the same Default canonicalization as Make.
fmt.Println(language.Compose(language.Raw.Parse("en-Latn-UK")))
// Call compose on a different CanonType for different results.
fmt.Println(language.All.Compose(language.Raw.Parse("en-Latn-UK")))
// Output:
// nl-US <nil>
// nl-x-piglatin <nil>
// ja-US <nil>
// nl-US-u-nu-arabic <nil>
// nl-1901-u-co-phonebk <nil>
// nl-1901-u-co-phonebk-nu-arabic <nil>
// und-1901-u-co-phonebk <nil>
// de-u-co-phonebk <nil>
// de-1901 <nil>
// fi <nil>
// de <nil>
// und-ZA <nil>
// und language: subtag "HH" is well-formed but unknown
// en-Latn-GB <nil>
// en-GB <nil>
}
func ExampleParse_errors() {
for _, s := range []string{"Foo", "Bar", "Foobar"} {
_, err := language.Parse(s)
if err != nil {
if inv, ok := err.(language.ValueError); ok {
fmt.Println(inv.Subtag())
} else {
fmt.Println(s)
}
}
}
for _, s := range []string{"en", "aa-Uuuu", "AC", "ac-u"} {
_, err := language.Parse(s)
switch e := err.(type) {
case language.ValueError:
fmt.Printf("%s: culprit %q\n", s, e.Subtag())
case nil:
// No error.
default:
// A syntax error.
fmt.Printf("%s: ill-formed\n", s)
}
}
// Output:
// foo
// Foobar
// aa-Uuuu: culprit "Uuuu"
// AC: culprit "ac"
// ac-u: ill-formed
}
func ExampleTag_Parent() {
p := func(tag string) {
fmt.Printf("parent(%v): %v\n", tag, language.Make(tag).Parent())
}
p("zh-CN")
// Australian English inherits from World English.
p("en-AU")
// If the tag has a different maximized script from its parent, a tag with
// this maximized script is inserted. This allows different language tags
// which have the same base language and script in common to inherit from
// a common set of settings.
p("zh-HK")
// If the maximized script of the parent is not identical, CLDR will skip
// inheriting from it, as it means there will not be many entries in common
// and inheriting from it is nonsensical.
p("zh-Hant")
// The parent of a tag with variants and extensions is the tag with all
// variants and extensions removed.
p("de-1994-u-co-phonebk")
// Remove default script.
p("de-Latn-LU")
// Output:
// parent(zh-CN): zh
// parent(en-AU): en-001
// parent(zh-HK): zh-Hant
// parent(zh-Hant): und
// parent(de-1994-u-co-phonebk): de
// parent(de-Latn-LU): de
}
// ExampleMatcher_bestMatch gives some examples of getting the best match of
// a set of tags to any of the tags of given set.
func ExampleMatcher() {
// This is the set of tags from which we want to pick the best match. These
// can be, for example, the supported languages for some package.
tags := []language.Tag{
language.English, // en
language.BritishEnglish, // en-GB
language.French, // fr
language.Afrikaans, // af
language.BrazilianPortuguese, // pt-BR
language.EuropeanPortuguese, // pt-PT
language.SimplifiedChinese, // zh-Hans
language.Raw.Make("iw-IL"), // Hebrew from Israel
language.Raw.Make("iw"), // Hebrew
language.Raw.Make("he"), // Hebrew
}
m := language.NewMatcher(tags)
// A simple match.
fmt.Println(m.Match(language.Make("fr")))
// Australian English is closer to British English than American English.
// The resulting match is "en-GB-u-rg-auzzzz". The first language listed,
// "en-GB", is the matched language. Next is the region override prefix
// "-u-rg-", the region override "au", and the region override suffix "zzzz".
// The region override is for things like currency, dates, and measurement
// systems.
fmt.Println(m.Match(language.Make("en-AU")))
// Default to the first tag passed to the Matcher if there is no match.
fmt.Println(m.Match(language.Make("ar")))
// Get the default tag.
fmt.Println(m.Match())
fmt.Println("----")
// We match SimplifiedChinese, but with Low confidence.
fmt.Println(m.Match(language.TraditionalChinese))
// British English is closer to Australian English than Traditional Chinese
// to Simplified Chinese.
fmt.Println(m.Match(language.TraditionalChinese, language.Make("en-AU")))
fmt.Println("----")
// In case a multiple variants of a language are available, the most spoken
// variant is typically returned.
fmt.Println(m.Match(language.Portuguese))
// Pick the first value passed to Match in case of a tie.
fmt.Println(m.Match(language.Dutch, language.Make("fr-BE"), language.Make("af-NA")))
fmt.Println(m.Match(language.Dutch, language.Make("af-NA"), language.Make("fr-BE")))
fmt.Println("----")
// If a Matcher is initialized with a language and its deprecated version,
// it will distinguish between them.
fmt.Println(m.Match(language.Raw.Make("iw")))
// However, for non-exact matches, it will treat deprecated versions as
// equivalent and consider other factors first.
fmt.Println(m.Match(language.Raw.Make("he-IL")))
fmt.Println("----")
// User settings passed to the Unicode extension are ignored for matching
// and preserved in the returned tag.
fmt.Println(m.Match(language.Make("de-u-co-phonebk"), language.Make("fr-u-cu-frf")))
// Even if the matching language is different.
fmt.Println(m.Match(language.Make("de-u-co-phonebk"), language.Make("br-u-cu-frf")))
// If there is no matching language, the options of the first preferred tag are used.
fmt.Println(m.Match(language.Make("de-u-co-phonebk")))
// Output:
// fr 2 Exact
// en-GB-u-rg-auzzzz 1 High
// en 0 No
// en 0 No
// ----
// zh-Hans 6 Low
// en-GB-u-rg-auzzzz 1 High
// ----
// pt-BR 4 Exact
// fr-u-rg-bezzzz 2 High
// af-u-rg-nazzzz 3 High
// ----
// iw-IL 7 Exact
// he-u-rg-ilzzzz 9 Exact
// ----
// fr-u-cu-frf 2 Exact
// fr-u-cu-frf 2 High
// en-u-co-phonebk 0 No
}
func ExampleMatchStrings() {
// languages supported by this service:
matcher := language.NewMatcher([]language.Tag{
language.English, language.Dutch, language.German,
})
http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
lang, _ := r.Cookie("lang")
tag, _ := language.MatchStrings(matcher, lang.String(), r.Header.Get("Accept-Language"))
fmt.Println("User language:", tag)
})
}
func ExampleComprehends() {
// Various levels of comprehensibility.
fmt.Println(language.Comprehends(language.English, language.English))
fmt.Println(language.Comprehends(language.AmericanEnglish, language.BritishEnglish))
// An explicit Und results in no match.
fmt.Println(language.Comprehends(language.English, language.Und))
fmt.Println("----")
// There is usually no mutual comprehensibility between different scripts.
fmt.Println(language.Comprehends(language.Make("en-Dsrt"), language.English))
// One exception is for Traditional versus Simplified Chinese, albeit with
// a low confidence.
fmt.Println(language.Comprehends(language.TraditionalChinese, language.SimplifiedChinese))
fmt.Println("----")
// A Swiss German speaker will often understand High German.
fmt.Println(language.Comprehends(language.Make("gsw"), language.Make("de")))
// The converse is not generally the case.
fmt.Println(language.Comprehends(language.Make("de"), language.Make("gsw")))
// Output:
// Exact
// High
// No
// ----
// No
// Low
// ----
// High
// No
}
func ExampleTag_values() {
us := language.MustParseRegion("US")
en := language.MustParseBase("en")
lang, _, region := language.AmericanEnglish.Raw()
fmt.Println(lang == en, region == us)
lang, _, region = language.BritishEnglish.Raw()
fmt.Println(lang == en, region == us)
// Tags can be compared for exact equivalence using '=='.
en_us, _ := language.Compose(en, us)
fmt.Println(en_us == language.AmericanEnglish)
// Output:
// true true
// true false
// true
}
@@ -0,0 +1,307 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build ignore
// Language tag table generator.
// Data read from the web.
package main
import (
"flag"
"fmt"
"io"
"log"
"sort"
"strconv"
"strings"
"golang.org/x/text/internal/gen"
"golang.org/x/text/internal/language"
"golang.org/x/text/unicode/cldr"
)
var (
test = flag.Bool("test",
false,
"test existing tables; can be used to compare web data with package data.")
outputFile = flag.String("output",
"tables.go",
"output file for generated tables")
)
func main() {
gen.Init()
w := gen.NewCodeWriter()
defer w.WriteGoFile("tables.go", "language")
b := newBuilder(w)
gen.WriteCLDRVersion(w)
b.writeConstants()
b.writeMatchData()
}
type builder struct {
w *gen.CodeWriter
hw io.Writer // MultiWriter for w and w.Hash
data *cldr.CLDR
supp *cldr.SupplementalData
}
func (b *builder) langIndex(s string) uint16 {
return uint16(language.MustParseBase(s))
}
func (b *builder) regionIndex(s string) int {
return int(language.MustParseRegion(s))
}
func (b *builder) scriptIndex(s string) int {
return int(language.MustParseScript(s))
}
func newBuilder(w *gen.CodeWriter) *builder {
r := gen.OpenCLDRCoreZip()
defer r.Close()
d := &cldr.Decoder{}
data, err := d.DecodeZip(r)
if err != nil {
log.Fatal(err)
}
b := builder{
w: w,
hw: io.MultiWriter(w, w.Hash),
data: data,
supp: data.Supplemental(),
}
return &b
}
// writeConsts computes f(v) for all v in values and writes the results
// as constants named _v to a single constant block.
func (b *builder) writeConsts(f func(string) int, values ...string) {
fmt.Fprintln(b.w, "const (")
for _, v := range values {
fmt.Fprintf(b.w, "\t_%s = %v\n", v, f(v))
}
fmt.Fprintln(b.w, ")")
}
// TODO: region inclusion data will probably not be use used in future matchers.
var langConsts = []string{
"de", "en", "fr", "it", "mo", "no", "nb", "pt", "sh", "mul", "und",
}
var scriptConsts = []string{
"Latn", "Hani", "Hans", "Hant", "Qaaa", "Qaai", "Qabx", "Zinh", "Zyyy",
"Zzzz",
}
var regionConsts = []string{
"001", "419", "BR", "CA", "ES", "GB", "MD", "PT", "UK", "US",
"ZZ", "XA", "XC", "XK", // Unofficial tag for Kosovo.
}
func (b *builder) writeConstants() {
b.writeConsts(func(s string) int { return int(b.langIndex(s)) }, langConsts...)
b.writeConsts(b.regionIndex, regionConsts...)
b.writeConsts(b.scriptIndex, scriptConsts...)
}
type mutualIntelligibility struct {
want, have uint16
distance uint8
oneway bool
}
type scriptIntelligibility struct {
wantLang, haveLang uint16
wantScript, haveScript uint8
distance uint8
// Always oneway
}
type regionIntelligibility struct {
lang uint16 // compact language id
script uint8 // 0 means any
group uint8 // 0 means any; if bit 7 is set it means inverse
distance uint8
// Always twoway.
}
// writeMatchData writes tables with languages and scripts for which there is
// mutual intelligibility. The data is based on CLDR's languageMatching data.
// Note that we use a different algorithm than the one defined by CLDR and that
// we slightly modify the data. For example, we convert scores to confidence levels.
// We also drop all region-related data as we use a different algorithm to
// determine region equivalence.
func (b *builder) writeMatchData() {
lm := b.supp.LanguageMatching.LanguageMatches
cldr.MakeSlice(&lm).SelectAnyOf("type", "written_new")
regionHierarchy := map[string][]string{}
for _, g := range b.supp.TerritoryContainment.Group {
regions := strings.Split(g.Contains, " ")
regionHierarchy[g.Type] = append(regionHierarchy[g.Type], regions...)
}
// Regions start at 1, so the slice must be one larger than the number of
// regions.
regionToGroups := make([]uint8, language.NumRegions+1)
idToIndex := map[string]uint8{}
for i, mv := range lm[0].MatchVariable {
if i > 6 {
log.Fatalf("Too many groups: %d", i)
}
idToIndex[mv.Id] = uint8(i + 1)
// TODO: also handle '-'
for _, r := range strings.Split(mv.Value, "+") {
todo := []string{r}
for k := 0; k < len(todo); k++ {
r := todo[k]
regionToGroups[b.regionIndex(r)] |= 1 << uint8(i)
todo = append(todo, regionHierarchy[r]...)
}
}
}
b.w.WriteVar("regionToGroups", regionToGroups)
// maps language id to in- and out-of-group region.
paradigmLocales := [][3]uint16{}
locales := strings.Split(lm[0].ParadigmLocales[0].Locales, " ")
for i := 0; i < len(locales); i += 2 {
x := [3]uint16{}
for j := 0; j < 2; j++ {
pc := strings.SplitN(locales[i+j], "-", 2)
x[0] = b.langIndex(pc[0])
if len(pc) == 2 {
x[1+j] = uint16(b.regionIndex(pc[1]))
}
}
paradigmLocales = append(paradigmLocales, x)
}
b.w.WriteVar("paradigmLocales", paradigmLocales)
b.w.WriteType(mutualIntelligibility{})
b.w.WriteType(scriptIntelligibility{})
b.w.WriteType(regionIntelligibility{})
matchLang := []mutualIntelligibility{}
matchScript := []scriptIntelligibility{}
matchRegion := []regionIntelligibility{}
// Convert the languageMatch entries in lists keyed by desired language.
for _, m := range lm[0].LanguageMatch {
// Different versions of CLDR use different separators.
desired := strings.Replace(m.Desired, "-", "_", -1)
supported := strings.Replace(m.Supported, "-", "_", -1)
d := strings.Split(desired, "_")
s := strings.Split(supported, "_")
if len(d) != len(s) {
log.Fatalf("not supported: desired=%q; supported=%q", desired, supported)
continue
}
distance, _ := strconv.ParseInt(m.Distance, 10, 8)
switch len(d) {
case 2:
if desired == supported && desired == "*_*" {
continue
}
// language-script pair.
matchScript = append(matchScript, scriptIntelligibility{
wantLang: uint16(b.langIndex(d[0])),
haveLang: uint16(b.langIndex(s[0])),
wantScript: uint8(b.scriptIndex(d[1])),
haveScript: uint8(b.scriptIndex(s[1])),
distance: uint8(distance),
})
if m.Oneway != "true" {
matchScript = append(matchScript, scriptIntelligibility{
wantLang: uint16(b.langIndex(s[0])),
haveLang: uint16(b.langIndex(d[0])),
wantScript: uint8(b.scriptIndex(s[1])),
haveScript: uint8(b.scriptIndex(d[1])),
distance: uint8(distance),
})
}
case 1:
if desired == supported && desired == "*" {
continue
}
if distance == 1 {
// nb == no is already handled by macro mapping. Check there
// really is only this case.
if d[0] != "no" || s[0] != "nb" {
log.Fatalf("unhandled equivalence %s == %s", s[0], d[0])
}
continue
}
// TODO: consider dropping oneway field and just doubling the entry.
matchLang = append(matchLang, mutualIntelligibility{
want: uint16(b.langIndex(d[0])),
have: uint16(b.langIndex(s[0])),
distance: uint8(distance),
oneway: m.Oneway == "true",
})
case 3:
if desired == supported && desired == "*_*_*" {
continue
}
if desired != supported {
// This is now supported by CLDR, but only one case, which
// should already be covered by paradigm locales. For instance,
// test case "und, en, en-GU, en-IN, en-GB ; en-ZA ; en-GB" in
// testdata/CLDRLocaleMatcherTest.txt tests this.
if supported != "en_*_GB" {
log.Fatalf("not supported: desired=%q; supported=%q", desired, supported)
}
continue
}
ri := regionIntelligibility{
lang: b.langIndex(d[0]),
distance: uint8(distance),
}
if d[1] != "*" {
ri.script = uint8(b.scriptIndex(d[1]))
}
switch {
case d[2] == "*":
ri.group = 0x80 // not contained in anything
case strings.HasPrefix(d[2], "$!"):
ri.group = 0x80
d[2] = "$" + d[2][len("$!"):]
fallthrough
case strings.HasPrefix(d[2], "$"):
ri.group |= idToIndex[d[2]]
}
matchRegion = append(matchRegion, ri)
default:
log.Fatalf("not supported: desired=%q; supported=%q", desired, supported)
}
}
sort.SliceStable(matchLang, func(i, j int) bool {
return matchLang[i].distance < matchLang[j].distance
})
b.w.WriteComment(`
matchLang holds pairs of langIDs of base languages that are typically
mutually intelligible. Each pair is associated with a confidence and
whether the intelligibility goes one or both ways.`)
b.w.WriteVar("matchLang", matchLang)
b.w.WriteComment(`
matchScript holds pairs of scriptIDs where readers of one script
can typically also read the other. Each is associated with a confidence.`)
sort.SliceStable(matchScript, func(i, j int) bool {
return matchScript[i].distance < matchScript[j].distance
})
b.w.WriteVar("matchScript", matchScript)
sort.SliceStable(matchRegion, func(i, j int) bool {
return matchRegion[i].distance < matchRegion[j].distance
})
b.w.WriteVar("matchRegion", matchRegion)
}
@@ -0,0 +1,48 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package language_test
import (
"fmt"
"net/http"
"strings"
"golang.org/x/text/language"
)
// matcher is a language.Matcher configured for all supported languages.
var matcher = language.NewMatcher([]language.Tag{
language.BritishEnglish,
language.Norwegian,
language.German,
})
// handler is an http.HandlerFunc.
func handler(w http.ResponseWriter, r *http.Request) {
t, q, err := language.ParseAcceptLanguage(r.Header.Get("Accept-Language"))
// We ignore the error: the default language will be selected for t == nil.
tag, _, _ := matcher.Match(t...)
fmt.Printf("%17v (t: %6v; q: %3v; err: %v)\n", tag, t, q, err)
}
func ExampleParseAcceptLanguage() {
for _, al := range []string{
"nn;q=0.3, en-us;q=0.8, en,",
"gsw, en;q=0.7, en-US;q=0.8",
"gsw, nl, da",
"invalid",
} {
// Create dummy request with Accept-Language set and pass it to handler.
r, _ := http.NewRequest("GET", "example.com", strings.NewReader("Hello"))
r.Header.Set("Accept-Language", al)
handler(nil, r)
}
// Output:
// en-GB (t: [ en en-US nn]; q: [ 1 0.8 0.3]; err: <nil>)
// en-GB-u-rg-uszzzz (t: [ gsw en-US en]; q: [ 1 0.8 0.7]; err: <nil>)
// de (t: [ gsw nl da]; q: [ 1 1 1]; err: <nil>)
// en-GB (t: []; q: []; err: language: tag is not well-formed)
}
@@ -0,0 +1,605 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:generate go run gen.go -output tables.go
package language
// TODO: Remove above NOTE after:
// - verifying that tables are dropped correctly (most notably matcher tables).
import (
"strings"
"golang.org/x/text/internal/language"
"golang.org/x/text/internal/language/compact"
)
// Tag represents a BCP 47 language tag. It is used to specify an instance of a
// specific language or locale. All language tag values are guaranteed to be
// well-formed.
type Tag compact.Tag
func makeTag(t language.Tag) (tag Tag) {
return Tag(compact.Make(t))
}
func (t *Tag) tag() language.Tag {
return (*compact.Tag)(t).Tag()
}
func (t *Tag) isCompact() bool {
return (*compact.Tag)(t).IsCompact()
}
// TODO: improve performance.
func (t *Tag) lang() language.Language { return t.tag().LangID }
func (t *Tag) region() language.Region { return t.tag().RegionID }
func (t *Tag) script() language.Script { return t.tag().ScriptID }
// Make is a convenience wrapper for Parse that omits the error.
// In case of an error, a sensible default is returned.
func Make(s string) Tag {
return Default.Make(s)
}
// Make is a convenience wrapper for c.Parse that omits the error.
// In case of an error, a sensible default is returned.
func (c CanonType) Make(s string) Tag {
t, _ := c.Parse(s)
return t
}
// Raw returns the raw base language, script and region, without making an
// attempt to infer their values.
func (t Tag) Raw() (b Base, s Script, r Region) {
tt := t.tag()
return Base{tt.LangID}, Script{tt.ScriptID}, Region{tt.RegionID}
}
// IsRoot returns true if t is equal to language "und".
func (t Tag) IsRoot() bool {
return compact.Tag(t).IsRoot()
}
// CanonType can be used to enable or disable various types of canonicalization.
type CanonType int
const (
// Replace deprecated base languages with their preferred replacements.
DeprecatedBase CanonType = 1 << iota
// Replace deprecated scripts with their preferred replacements.
DeprecatedScript
// Replace deprecated regions with their preferred replacements.
DeprecatedRegion
// Remove redundant scripts.
SuppressScript
// Normalize legacy encodings. This includes legacy languages defined in
// CLDR as well as bibliographic codes defined in ISO-639.
Legacy
// Map the dominant language of a macro language group to the macro language
// subtag. For example cmn -> zh.
Macro
// The CLDR flag should be used if full compatibility with CLDR is required.
// There are a few cases where language.Tag may differ from CLDR. To follow all
// of CLDR's suggestions, use All|CLDR.
CLDR
// Raw can be used to Compose or Parse without Canonicalization.
Raw CanonType = 0
// Replace all deprecated tags with their preferred replacements.
Deprecated = DeprecatedBase | DeprecatedScript | DeprecatedRegion
// All canonicalizations recommended by BCP 47.
BCP47 = Deprecated | SuppressScript
// All canonicalizations.
All = BCP47 | Legacy | Macro
// Default is the canonicalization used by Parse, Make and Compose. To
// preserve as much information as possible, canonicalizations that remove
// potentially valuable information are not included. The Matcher is
// designed to recognize similar tags that would be the same if
// they were canonicalized using All.
Default = Deprecated | Legacy
canonLang = DeprecatedBase | Legacy | Macro
// TODO: LikelyScript, LikelyRegion: suppress similar to ICU.
)
// canonicalize returns the canonicalized equivalent of the tag and
// whether there was any change.
func canonicalize(c CanonType, t language.Tag) (language.Tag, bool) {
if c == Raw {
return t, false
}
changed := false
if c&SuppressScript != 0 {
if t.LangID.SuppressScript() == t.ScriptID {
t.ScriptID = 0
changed = true
}
}
if c&canonLang != 0 {
for {
if l, aliasType := t.LangID.Canonicalize(); l != t.LangID {
switch aliasType {
case language.Legacy:
if c&Legacy != 0 {
if t.LangID == _sh && t.ScriptID == 0 {
t.ScriptID = _Latn
}
t.LangID = l
changed = true
}
case language.Macro:
if c&Macro != 0 {
// We deviate here from CLDR. The mapping "nb" -> "no"
// qualifies as a typical Macro language mapping. However,
// for legacy reasons, CLDR maps "no", the macro language
// code for Norwegian, to the dominant variant "nb". This
// change is currently under consideration for CLDR as well.
// See https://unicode.org/cldr/trac/ticket/2698 and also
// https://unicode.org/cldr/trac/ticket/1790 for some of the
// practical implications. TODO: this check could be removed
// if CLDR adopts this change.
if c&CLDR == 0 || t.LangID != _nb {
changed = true
t.LangID = l
}
}
case language.Deprecated:
if c&DeprecatedBase != 0 {
if t.LangID == _mo && t.RegionID == 0 {
t.RegionID = _MD
}
t.LangID = l
changed = true
// Other canonicalization types may still apply.
continue
}
}
} else if c&Legacy != 0 && t.LangID == _no && c&CLDR != 0 {
t.LangID = _nb
changed = true
}
break
}
}
if c&DeprecatedScript != 0 {
if t.ScriptID == _Qaai {
changed = true
t.ScriptID = _Zinh
}
}
if c&DeprecatedRegion != 0 {
if r := t.RegionID.Canonicalize(); r != t.RegionID {
changed = true
t.RegionID = r
}
}
return t, changed
}
// Canonicalize returns the canonicalized equivalent of the tag.
func (c CanonType) Canonicalize(t Tag) (Tag, error) {
// First try fast path.
if t.isCompact() {
if _, changed := canonicalize(c, compact.Tag(t).Tag()); !changed {
return t, nil
}
}
// It is unlikely that one will canonicalize a tag after matching. So do
// a slow but simple approach here.
if tag, changed := canonicalize(c, t.tag()); changed {
tag.RemakeString()
return makeTag(tag), nil
}
return t, nil
}
// Confidence indicates the level of certainty for a given return value.
// For example, Serbian may be written in Cyrillic or Latin script.
// The confidence level indicates whether a value was explicitly specified,
// whether it is typically the only possible value, or whether there is
// an ambiguity.
type Confidence int
const (
No Confidence = iota // full confidence that there was no match
Low // most likely value picked out of a set of alternatives
High // value is generally assumed to be the correct match
Exact // exact match or explicitly specified value
)
var confName = []string{"No", "Low", "High", "Exact"}
func (c Confidence) String() string {
return confName[c]
}
// String returns the canonical string representation of the language tag.
func (t Tag) String() string {
return t.tag().String()
}
// MarshalText implements encoding.TextMarshaler.
func (t Tag) MarshalText() (text []byte, err error) {
return t.tag().MarshalText()
}
// UnmarshalText implements encoding.TextUnmarshaler.
func (t *Tag) UnmarshalText(text []byte) error {
var tag language.Tag
err := tag.UnmarshalText(text)
*t = makeTag(tag)
return err
}
// Base returns the base language of the language tag. If the base language is
// unspecified, an attempt will be made to infer it from the context.
// It uses a variant of CLDR's Add Likely Subtags algorithm. This is subject to change.
func (t Tag) Base() (Base, Confidence) {
if b := t.lang(); b != 0 {
return Base{b}, Exact
}
tt := t.tag()
c := High
if tt.ScriptID == 0 && !tt.RegionID.IsCountry() {
c = Low
}
if tag, err := tt.Maximize(); err == nil && tag.LangID != 0 {
return Base{tag.LangID}, c
}
return Base{0}, No
}
// Script infers the script for the language tag. If it was not explicitly given, it will infer
// a most likely candidate.
// If more than one script is commonly used for a language, the most likely one
// is returned with a low confidence indication. For example, it returns (Cyrl, Low)
// for Serbian.
// If a script cannot be inferred (Zzzz, No) is returned. We do not use Zyyy (undetermined)
// as one would suspect from the IANA registry for BCP 47. In a Unicode context Zyyy marks
// common characters (like 1, 2, 3, '.', etc.) and is therefore more like multiple scripts.
// See https://www.unicode.org/reports/tr24/#Values for more details. Zzzz is also used for
// unknown value in CLDR. (Zzzz, Exact) is returned if Zzzz was explicitly specified.
// Note that an inferred script is never guaranteed to be the correct one. Latin is
// almost exclusively used for Afrikaans, but Arabic has been used for some texts
// in the past. Also, the script that is commonly used may change over time.
// It uses a variant of CLDR's Add Likely Subtags algorithm. This is subject to change.
func (t Tag) Script() (Script, Confidence) {
if scr := t.script(); scr != 0 {
return Script{scr}, Exact
}
tt := t.tag()
sc, c := language.Script(_Zzzz), No
if scr := tt.LangID.SuppressScript(); scr != 0 {
// Note: it is not always the case that a language with a suppress
// script value is only written in one script (e.g. kk, ms, pa).
if tt.RegionID == 0 {
return Script{scr}, High
}
sc, c = scr, High
}
if tag, err := tt.Maximize(); err == nil {
if tag.ScriptID != sc {
sc, c = tag.ScriptID, Low
}
} else {
tt, _ = canonicalize(Deprecated|Macro, tt)
if tag, err := tt.Maximize(); err == nil && tag.ScriptID != sc {
sc, c = tag.ScriptID, Low
}
}
return Script{sc}, c
}
// Region returns the region for the language tag. If it was not explicitly given, it will
// infer a most likely candidate from the context.
// It uses a variant of CLDR's Add Likely Subtags algorithm. This is subject to change.
func (t Tag) Region() (Region, Confidence) {
if r := t.region(); r != 0 {
return Region{r}, Exact
}
tt := t.tag()
if tt, err := tt.Maximize(); err == nil {
return Region{tt.RegionID}, Low // TODO: differentiate between high and low.
}
tt, _ = canonicalize(Deprecated|Macro, tt)
if tag, err := tt.Maximize(); err == nil {
return Region{tag.RegionID}, Low
}
return Region{_ZZ}, No // TODO: return world instead of undetermined?
}
// Variants returns the variants specified explicitly for this language tag.
// or nil if no variant was specified.
func (t Tag) Variants() []Variant {
if !compact.Tag(t).MayHaveVariants() {
return nil
}
v := []Variant{}
x, str := "", t.tag().Variants()
for str != "" {
x, str = nextToken(str)
v = append(v, Variant{x})
}
return v
}
// Parent returns the CLDR parent of t. In CLDR, missing fields in data for a
// specific language are substituted with fields from the parent language.
// The parent for a language may change for newer versions of CLDR.
//
// Parent returns a tag for a less specific language that is mutually
// intelligible or Und if there is no such language. This may not be the same as
// simply stripping the last BCP 47 subtag. For instance, the parent of "zh-TW"
// is "zh-Hant", and the parent of "zh-Hant" is "und".
func (t Tag) Parent() Tag {
return Tag(compact.Tag(t).Parent())
}
// nextToken returns token t and the rest of the string.
func nextToken(s string) (t, tail string) {
p := strings.Index(s[1:], "-")
if p == -1 {
return s[1:], ""
}
p++
return s[1:p], s[p:]
}
// Extension is a single BCP 47 extension.
type Extension struct {
s string
}
// String returns the string representation of the extension, including the
// type tag.
func (e Extension) String() string {
return e.s
}
// ParseExtension parses s as an extension and returns it on success.
func ParseExtension(s string) (e Extension, err error) {
ext, err := language.ParseExtension(s)
return Extension{ext}, err
}
// Type returns the one-byte extension type of e. It returns 0 for the zero
// exception.
func (e Extension) Type() byte {
if e.s == "" {
return 0
}
return e.s[0]
}
// Tokens returns the list of tokens of e.
func (e Extension) Tokens() []string {
return strings.Split(e.s, "-")
}
// Extension returns the extension of type x for tag t. It will return
// false for ok if t does not have the requested extension. The returned
// extension will be invalid in this case.
func (t Tag) Extension(x byte) (ext Extension, ok bool) {
if !compact.Tag(t).MayHaveExtensions() {
return Extension{}, false
}
e, ok := t.tag().Extension(x)
return Extension{e}, ok
}
// Extensions returns all extensions of t.
func (t Tag) Extensions() []Extension {
if !compact.Tag(t).MayHaveExtensions() {
return nil
}
e := []Extension{}
for _, ext := range t.tag().Extensions() {
e = append(e, Extension{ext})
}
return e
}
// TypeForKey returns the type associated with the given key, where key and type
// are of the allowed values defined for the Unicode locale extension ('u') in
// https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
// TypeForKey will traverse the inheritance chain to get the correct value.
//
// If there are multiple types associated with a key, only the first will be
// returned. If there is no type associated with a key, it returns the empty
// string.
func (t Tag) TypeForKey(key string) string {
if !compact.Tag(t).MayHaveExtensions() {
if key != "rg" && key != "va" {
return ""
}
}
return t.tag().TypeForKey(key)
}
// SetTypeForKey returns a new Tag with the key set to type, where key and type
// are of the allowed values defined for the Unicode locale extension ('u') in
// https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
// An empty value removes an existing pair with the same key.
func (t Tag) SetTypeForKey(key, value string) (Tag, error) {
tt, err := t.tag().SetTypeForKey(key, value)
return makeTag(tt), err
}
// NumCompactTags is the number of compact tags. The maximum tag is
// NumCompactTags-1.
const NumCompactTags = compact.NumCompactTags
// CompactIndex returns an index, where 0 <= index < NumCompactTags, for tags
// for which data exists in the text repository.The index will change over time
// and should not be stored in persistent storage. If t does not match a compact
// index, exact will be false and the compact index will be returned for the
// first match after repeatedly taking the Parent of t.
func CompactIndex(t Tag) (index int, exact bool) {
id, exact := compact.LanguageID(compact.Tag(t))
return int(id), exact
}
var root = language.Tag{}
// Base is an ISO 639 language code, used for encoding the base language
// of a language tag.
type Base struct {
langID language.Language
}
// ParseBase parses a 2- or 3-letter ISO 639 code.
// It returns a ValueError if s is a well-formed but unknown language identifier
// or another error if another error occurred.
func ParseBase(s string) (Base, error) {
l, err := language.ParseBase(s)
return Base{l}, err
}
// String returns the BCP 47 representation of the base language.
func (b Base) String() string {
return b.langID.String()
}
// ISO3 returns the ISO 639-3 language code.
func (b Base) ISO3() string {
return b.langID.ISO3()
}
// IsPrivateUse reports whether this language code is reserved for private use.
func (b Base) IsPrivateUse() bool {
return b.langID.IsPrivateUse()
}
// Script is a 4-letter ISO 15924 code for representing scripts.
// It is idiomatically represented in title case.
type Script struct {
scriptID language.Script
}
// ParseScript parses a 4-letter ISO 15924 code.
// It returns a ValueError if s is a well-formed but unknown script identifier
// or another error if another error occurred.
func ParseScript(s string) (Script, error) {
sc, err := language.ParseScript(s)
return Script{sc}, err
}
// String returns the script code in title case.
// It returns "Zzzz" for an unspecified script.
func (s Script) String() string {
return s.scriptID.String()
}
// IsPrivateUse reports whether this script code is reserved for private use.
func (s Script) IsPrivateUse() bool {
return s.scriptID.IsPrivateUse()
}
// Region is an ISO 3166-1 or UN M.49 code for representing countries and regions.
type Region struct {
regionID language.Region
}
// EncodeM49 returns the Region for the given UN M.49 code.
// It returns an error if r is not a valid code.
func EncodeM49(r int) (Region, error) {
rid, err := language.EncodeM49(r)
return Region{rid}, err
}
// ParseRegion parses a 2- or 3-letter ISO 3166-1 or a UN M.49 code.
// It returns a ValueError if s is a well-formed but unknown region identifier
// or another error if another error occurred.
func ParseRegion(s string) (Region, error) {
r, err := language.ParseRegion(s)
return Region{r}, err
}
// String returns the BCP 47 representation for the region.
// It returns "ZZ" for an unspecified region.
func (r Region) String() string {
return r.regionID.String()
}
// ISO3 returns the 3-letter ISO code of r.
// Note that not all regions have a 3-letter ISO code.
// In such cases this method returns "ZZZ".
func (r Region) ISO3() string {
return r.regionID.ISO3()
}
// M49 returns the UN M.49 encoding of r, or 0 if this encoding
// is not defined for r.
func (r Region) M49() int {
return r.regionID.M49()
}
// IsPrivateUse reports whether r has the ISO 3166 User-assigned status. This
// may include private-use tags that are assigned by CLDR and used in this
// implementation. So IsPrivateUse and IsCountry can be simultaneously true.
func (r Region) IsPrivateUse() bool {
return r.regionID.IsPrivateUse()
}
// IsCountry returns whether this region is a country or autonomous area. This
// includes non-standard definitions from CLDR.
func (r Region) IsCountry() bool {
return r.regionID.IsCountry()
}
// IsGroup returns whether this region defines a collection of regions. This
// includes non-standard definitions from CLDR.
func (r Region) IsGroup() bool {
return r.regionID.IsGroup()
}
// Contains returns whether Region c is contained by Region r. It returns true
// if c == r.
func (r Region) Contains(c Region) bool {
return r.regionID.Contains(c.regionID)
}
// TLD returns the country code top-level domain (ccTLD). UK is returned for GB.
// In all other cases it returns either the region itself or an error.
//
// This method may return an error for a region for which there exists a
// canonical form with a ccTLD. To get that ccTLD canonicalize r first. The
// region will already be canonicalized it was obtained from a Tag that was
// obtained using any of the default methods.
func (r Region) TLD() (Region, error) {
tld, err := r.regionID.TLD()
return Region{tld}, err
}
// Canonicalize returns the region or a possible replacement if the region is
// deprecated. It will not return a replacement for deprecated regions that
// are split into multiple regions.
func (r Region) Canonicalize() Region {
return Region{r.regionID.Canonicalize()}
}
// Variant represents a registered variant of a language as defined by BCP 47.
type Variant struct {
variant string
}
// ParseVariant parses and returns a Variant. An error is returned if s is not
// a valid variant.
func ParseVariant(s string) (Variant, error) {
v, err := language.ParseVariant(s)
return Variant{v.String()}, err
}
// String returns the string representation of the variant.
func (v Variant) String() string {
return v.variant
}
@@ -0,0 +1,788 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package language
import (
"reflect"
"testing"
)
func TestTagSize(t *testing.T) {
id := Tag{}
typ := reflect.TypeOf(id)
if typ.Size() > 24 {
t.Errorf("size of Tag was %d; want 24", typ.Size())
}
}
func TestIsRoot(t *testing.T) {
loc := Tag{}
if !loc.IsRoot() {
t.Errorf("unspecified should be root.")
}
for i, tt := range parseTests() {
loc, _ := Parse(tt.in)
undef := tt.lang == "und" && tt.script == "" && tt.region == "" && tt.ext == ""
if loc.IsRoot() != undef {
t.Errorf("%d: was %v; want %v", i, loc.IsRoot(), undef)
}
}
}
func TestEquality(t *testing.T) {
for i, tt := range parseTests() {
s := tt.in
tag := Make(s)
t1 := Make(tag.String())
if tag != t1 {
t.Errorf("%d:%s: equality test 1 failed\n got: %#v\nwant: %#v)", i, s, t1, tag)
}
t2, _ := Compose(tag)
if tag != t2 {
t.Errorf("%d:%s: equality test 2 failed\n got: %#v\nwant: %#v", i, s, t2, tag)
}
}
}
func TestString(t *testing.T) {
tests := []string{
"no-u-rg-dkzzzz",
}
for i, s := range tests {
tag := Make(s)
if tag.String() != s {
t.Errorf("%d:%s: got %s: want %s (%#v)", i, s, tag.String(), s, tag)
}
}
}
func TestMarshal(t *testing.T) {
testCases := []string{
// TODO: these values will change with each CLDR update. This issue
// will be solved if we decide to fix the indexes.
"und",
"ca-ES-valencia",
"ca-ES-valencia-u-va-posix",
"ca-ES-valencia-u-co-phonebk",
"ca-ES-valencia-u-co-phonebk-va-posix",
"x-klingon",
"en-US",
"en-US-u-va-posix",
"en",
"en-u-co-phonebk",
"en-001",
"sh",
"en-GB-u-rg-uszzzz",
"en-GB-u-rg-uszzzz-va-posix",
"en-GB-u-co-phonebk-rg-uszzzz",
// Invalid tags should also roundtrip.
"en-GB-u-co-phonebk-rg-uszz",
}
for _, tc := range testCases {
var tag Tag
err := tag.UnmarshalText([]byte(tc))
if err != nil {
t.Errorf("UnmarshalText(%q): unexpected error: %v", tc, err)
}
b, err := tag.MarshalText()
if err != nil {
t.Errorf("MarshalText(%q): unexpected error: %v", tc, err)
}
if got := string(b); got != tc {
t.Errorf("%s: got %q; want %q", tc, got, tc)
}
}
}
func TestBase(t *testing.T) {
tests := []struct {
loc, lang string
conf Confidence
}{
{"und", "en", Low},
{"x-abc", "und", No},
{"en", "en", Exact},
{"und-Cyrl", "ru", High},
// If a region is not included, the official language should be English.
{"und-US", "en", High},
// TODO: not-explicitly listed scripts should probably be und, No
// Modify addTags to return info on how the match was derived.
// {"und-Aghb", "und", No},
}
for i, tt := range tests {
loc, _ := Parse(tt.loc)
lang, conf := loc.Base()
if lang.String() != tt.lang {
t.Errorf("%d: language was %s; want %s", i, lang, tt.lang)
}
if conf != tt.conf {
t.Errorf("%d: confidence was %d; want %d", i, conf, tt.conf)
}
}
}
func TestParseBase(t *testing.T) {
tests := []struct {
in string
out string
ok bool
}{
{"en", "en", true},
{"EN", "en", true},
{"nld", "nl", true},
{"dut", "dut", true}, // bibliographic
{"aaj", "und", false}, // unknown
{"qaa", "qaa", true},
{"a", "und", false},
{"", "und", false},
{"aaaa", "und", false},
}
for i, tt := range tests {
x, err := ParseBase(tt.in)
if x.String() != tt.out || err == nil != tt.ok {
t.Errorf("%d:%s: was %s, %v; want %s, %v", i, tt.in, x, err == nil, tt.out, tt.ok)
}
if y, _, _ := Raw.Make(tt.out).Raw(); x != y {
t.Errorf("%d:%s: tag was %s; want %s", i, tt.in, x, y)
}
}
}
func TestScript(t *testing.T) {
tests := []struct {
loc, scr string
conf Confidence
}{
{"und", "Latn", Low},
{"en-Latn", "Latn", Exact},
{"en", "Latn", High},
{"sr", "Cyrl", Low},
{"kk", "Cyrl", High},
{"kk-CN", "Arab", Low},
{"cmn", "Hans", Low},
{"ru", "Cyrl", High},
{"ru-RU", "Cyrl", High},
{"yue", "Hant", Low},
{"x-abc", "Zzzz", Low},
{"und-zyyy", "Zyyy", Exact},
}
for i, tt := range tests {
loc, _ := Parse(tt.loc)
sc, conf := loc.Script()
if sc.String() != tt.scr {
t.Errorf("%d:%s: script was %s; want %s", i, tt.loc, sc, tt.scr)
}
if conf != tt.conf {
t.Errorf("%d:%s: confidence was %d; want %d", i, tt.loc, conf, tt.conf)
}
}
}
func TestParseScript(t *testing.T) {
tests := []struct {
in string
out string
ok bool
}{
{"Latn", "Latn", true},
{"zzzz", "Zzzz", true},
{"zyyy", "Zyyy", true},
{"Latm", "Zzzz", false},
{"Zzz", "Zzzz", false},
{"", "Zzzz", false},
{"Zzzxx", "Zzzz", false},
}
for i, tt := range tests {
x, err := ParseScript(tt.in)
if x.String() != tt.out || err == nil != tt.ok {
t.Errorf("%d:%s: was %s, %v; want %s, %v", i, tt.in, x, err == nil, tt.out, tt.ok)
}
if err == nil {
if _, y, _ := Raw.Make("und-" + tt.out).Raw(); x != y {
t.Errorf("%d:%s: tag was %s; want %s", i, tt.in, x, y)
}
}
}
}
func TestRegion(t *testing.T) {
tests := []struct {
loc, reg string
conf Confidence
}{
{"und", "US", Low},
{"en", "US", Low},
{"zh-Hant", "TW", Low},
{"en-US", "US", Exact},
{"cmn", "CN", Low},
{"ru", "RU", Low},
{"yue", "HK", Low},
{"x-abc", "ZZ", Low},
}
for i, tt := range tests {
loc, _ := Raw.Parse(tt.loc)
reg, conf := loc.Region()
if reg.String() != tt.reg {
t.Errorf("%d:%s: region was %s; want %s", i, tt.loc, reg, tt.reg)
}
if conf != tt.conf {
t.Errorf("%d:%s: confidence was %d; want %d", i, tt.loc, conf, tt.conf)
}
}
}
func TestEncodeM49(t *testing.T) {
tests := []struct {
m49 int
code string
ok bool
}{
{1, "001", true},
{840, "US", true},
{899, "ZZ", false},
}
for i, tt := range tests {
if r, err := EncodeM49(tt.m49); r.String() != tt.code || err == nil != tt.ok {
t.Errorf("%d:%d: was %s, %v; want %s, %v", i, tt.m49, r, err == nil, tt.code, tt.ok)
}
}
for i := 1; i <= 1000; i++ {
if r, err := EncodeM49(i); err == nil && r.M49() == 0 {
t.Errorf("%d has no error, but maps to undefined region", i)
}
}
}
func TestParseRegion(t *testing.T) {
tests := []struct {
in string
out string
ok bool
}{
{"001", "001", true},
{"840", "US", true},
{"899", "ZZ", false},
{"USA", "US", true},
{"US", "US", true},
{"BC", "ZZ", false},
{"C", "ZZ", false},
{"CCCC", "ZZ", false},
{"01", "ZZ", false},
}
for i, tt := range tests {
r, err := ParseRegion(tt.in)
if r.String() != tt.out || err == nil != tt.ok {
t.Errorf("%d:%s: was %s, %v; want %s, %v", i, tt.in, r, err == nil, tt.out, tt.ok)
}
if err == nil {
if _, _, y := Raw.Make("und-" + tt.out).Raw(); r != y {
t.Errorf("%d:%s: tag was %s; want %s", i, tt.in, r, y)
}
}
}
}
func TestIsCountry(t *testing.T) {
tests := []struct {
reg string
country bool
}{
{"US", true},
{"001", false},
{"958", false},
{"419", false},
{"203", true},
{"020", true},
{"900", false},
{"999", false},
{"QO", false},
{"EU", false},
{"AA", false},
{"XK", true},
}
for i, tt := range tests {
r, _ := ParseRegion(tt.reg)
if r.IsCountry() != tt.country {
t.Errorf("%d: IsCountry(%s) was %v; want %v", i, tt.reg, r.IsCountry(), tt.country)
}
}
}
func TestIsGroup(t *testing.T) {
tests := []struct {
reg string
group bool
}{
{"US", false},
{"001", true},
{"958", false},
{"419", true},
{"203", false},
{"020", false},
{"900", false},
{"999", false},
{"QO", true},
{"EU", true},
{"AA", false},
{"XK", false},
}
for i, tt := range tests {
r, _ := ParseRegion(tt.reg)
if r.IsGroup() != tt.group {
t.Errorf("%d: IsGroup(%s) was %v; want %v", i, tt.reg, r.IsGroup(), tt.group)
}
}
}
func TestContains(t *testing.T) {
tests := []struct {
enclosing, contained string
contains bool
}{
// A region contains itself.
{"US", "US", true},
{"001", "001", true},
// Direct containment.
{"001", "002", true},
{"039", "XK", true},
{"150", "XK", true},
{"EU", "AT", true},
{"QO", "AQ", true},
// Indirect containemnt.
{"001", "US", true},
{"001", "419", true},
{"001", "013", true},
// No containment.
{"US", "001", false},
{"155", "EU", false},
}
for i, tt := range tests {
r := MustParseRegion(tt.enclosing)
con := MustParseRegion(tt.contained)
if got := r.Contains(con); got != tt.contains {
t.Errorf("%d: %s.Contains(%s) was %v; want %v", i, tt.enclosing, tt.contained, got, tt.contains)
}
}
}
func TestRegionCanonicalize(t *testing.T) {
for i, tt := range []struct{ in, out string }{
{"UK", "GB"},
{"TP", "TL"},
{"QU", "EU"},
{"SU", "SU"},
{"VD", "VN"},
{"DD", "DE"},
} {
r := MustParseRegion(tt.in)
want := MustParseRegion(tt.out)
if got := r.Canonicalize(); got != want {
t.Errorf("%d: got %v; want %v", i, got, want)
}
}
}
func TestRegionTLD(t *testing.T) {
for _, tt := range []struct {
in, out string
ok bool
}{
{"EH", "EH", true},
{"FR", "FR", true},
{"TL", "TL", true},
// In ccTLD before in ISO.
{"GG", "GG", true},
// Non-standard assignment of ccTLD to ISO code.
{"GB", "UK", true},
// Exceptionally reserved in ISO and valid ccTLD.
{"UK", "UK", true},
{"AC", "AC", true},
{"EU", "EU", true},
{"SU", "SU", true},
// Exceptionally reserved in ISO and invalid ccTLD.
{"CP", "ZZ", false},
{"DG", "ZZ", false},
{"EA", "ZZ", false},
{"FX", "ZZ", false},
{"IC", "ZZ", false},
{"TA", "ZZ", false},
// Transitionally reserved in ISO (e.g. deprecated) but valid ccTLD as
// it is still being phased out.
{"AN", "AN", true},
{"TP", "TP", true},
// Transitionally reserved in ISO (e.g. deprecated) and invalid ccTLD.
// Defined in package language as it has a mapping in CLDR.
{"BU", "ZZ", false},
{"CS", "ZZ", false},
{"NT", "ZZ", false},
{"YU", "ZZ", false},
{"ZR", "ZZ", false},
// Not defined in package: SF.
// Indeterminately reserved in ISO.
// Defined in package language as it has a legacy mapping in CLDR.
{"DY", "ZZ", false},
{"RH", "ZZ", false},
{"VD", "ZZ", false},
// Not defined in package: EW, FL, JA, LF, PI, RA, RB, RC, RI, RL, RM,
// RN, RP, WG, WL, WV, and YV.
// Not assigned in ISO, but legacy definitions in CLDR.
{"DD", "ZZ", false},
{"YD", "ZZ", false},
// Normal mappings but somewhat special status in ccTLD.
{"BL", "BL", true},
{"MF", "MF", true},
{"BV", "BV", true},
{"SJ", "SJ", true},
// Have values when normalized, but not as is.
{"QU", "ZZ", false},
// ISO Private Use.
{"AA", "ZZ", false},
{"QM", "ZZ", false},
{"QO", "ZZ", false},
{"XA", "ZZ", false},
{"XK", "ZZ", false}, // Sometimes used for Kosovo, but invalid ccTLD.
} {
if tt.in == "" {
continue
}
r := MustParseRegion(tt.in)
var want Region
if tt.out != "ZZ" {
want = MustParseRegion(tt.out)
}
tld, err := r.TLD()
if got := err == nil; got != tt.ok {
t.Errorf("error(%v): got %v; want %v", r, got, tt.ok)
}
if tld != want {
t.Errorf("TLD(%v): got %v; want %v", r, tld, want)
}
}
}
func TestCanonicalize(t *testing.T) {
// TODO: do a full test using CLDR data in a separate regression test.
tests := []struct {
in, out string
option CanonType
}{
{"en-Latn", "en", SuppressScript},
{"sr-Cyrl", "sr-Cyrl", SuppressScript},
{"sh", "sr-Latn", Legacy},
{"sh-HR", "sr-Latn-HR", Legacy},
{"sh-Cyrl-HR", "sr-Cyrl-HR", Legacy},
{"tl", "fil", Legacy},
{"no", "no", Legacy},
{"no", "nb", Legacy | CLDR},
{"cmn", "cmn", Legacy},
{"cmn", "zh", Macro},
{"cmn-u-co-stroke", "zh-u-co-stroke", Macro},
{"yue", "yue", Macro},
{"nb", "no", Macro},
{"nb", "nb", Macro | CLDR},
{"no", "no", Macro},
{"no", "no", Macro | CLDR},
{"iw", "he", DeprecatedBase},
{"iw", "he", Deprecated | CLDR},
{"mo", "ro-MD", Deprecated}, // Adopted by CLDR as of version 25.
{"alb", "sq", Legacy}, // bibliographic
{"dut", "nl", Legacy}, // bibliographic
// As of CLDR 25, mo is no longer considered a legacy mapping.
{"mo", "mo", Legacy | CLDR},
{"und-AN", "und-AN", Deprecated},
{"und-YD", "und-YE", DeprecatedRegion},
{"und-YD", "und-YD", DeprecatedBase},
{"und-Qaai", "und-Zinh", DeprecatedScript},
{"und-Qaai", "und-Qaai", DeprecatedBase},
{"drh", "mn", All}, // drh -> khk -> mn
{"en-GB-u-rg-uszzzz", "en-GB-u-rg-uszzzz", Raw},
{"en-GB-u-rg-USZZZZ", "en-GB-u-rg-uszzzz", Raw},
// TODO: use different exact values for language and regional tag?
{"en-GB-u-rg-uszzzz-va-posix", "en-GB-u-rg-uszzzz-va-posix", Raw},
{"en-GB-u-rg-uszzzz-co-phonebk", "en-GB-u-co-phonebk-rg-uszzzz", Raw},
// Invalid region specifications are left as is.
{"en-GB-u-rg-usz", "en-GB-u-rg-usz", Raw},
{"en-GB-u-rg-usz-va-posix", "en-GB-u-rg-usz-va-posix", Raw},
{"en-GB-u-rg-usz-co-phonebk", "en-GB-u-co-phonebk-rg-usz", Raw},
// CVE-2020-28851
// invalid key-value pair of -u- extension.
{"ES-u-000-00", "es-u-000-00", Raw},
{"ES-u-000-00-v-00", "es-u-000-00-v-00", Raw},
// reordered and unknown extension.
{"ES-v-00-u-000-00", "es-u-000-00-v-00", Raw},
}
for i, tt := range tests {
in, _ := Raw.Parse(tt.in)
in, _ = tt.option.Canonicalize(in)
if in.String() != tt.out {
t.Errorf("%d:%s: was %s; want %s", i, tt.in, in.String(), tt.out)
}
}
// Test idempotence.
for _, base := range Supported.BaseLanguages() {
tag, _ := Raw.Compose(base)
got, _ := All.Canonicalize(tag)
want, _ := All.Canonicalize(got)
if got != want {
t.Errorf("idem(%s): got %s; want %s", tag, got, want)
}
}
}
func TestTypeForKey(t *testing.T) {
tests := []struct{ key, in, out string }{
{"co", "en", ""},
{"co", "en-u-abc", ""},
{"co", "en-u-co-phonebk", "phonebk"},
{"co", "en-u-co-phonebk-cu-aud", "phonebk"},
{"co", "x-foo-u-co-phonebk", ""},
{"va", "en-US-u-va-posix", "posix"},
{"rg", "en-u-rg-gbzzzz", "gbzzzz"},
{"nu", "en-u-co-phonebk-nu-arabic", "arabic"},
{"kc", "cmn-u-co-stroke", ""},
{"rg", "cmn-u-rg", ""},
{"rg", "cmn-u-rg-co-stroke", ""},
{"co", "cmn-u-rg-co-stroke", "stroke"},
{"co", "cmn-u-co-rg-gbzzzz", ""},
{"rg", "cmn-u-co-rg-gbzzzz", "gbzzzz"},
{"rg", "cmn-u-rg-gbzzzz-nlzzzz", "gbzzzz"},
}
for _, tt := range tests {
if v := Make(tt.in).TypeForKey(tt.key); v != tt.out {
t.Errorf("%q[%q]: was %q; want %q", tt.in, tt.key, v, tt.out)
}
}
}
func TestParent(t *testing.T) {
tests := []struct{ in, out string }{
// Strip variants and extensions first
{"de-u-co-phonebk", "de"},
{"de-1994", "de"},
{"de-Latn-1994", "de"}, // remove superfluous script.
// Ensure the canonical Tag for an entry is in the chain for base-script
// pairs.
{"zh-Hans", "zh"},
// Skip the script if it is the maximized version. CLDR files for the
// skipped tag are always empty.
{"zh-Hans-TW", "zh"},
{"zh-Hans-CN", "zh"},
// Insert the script if the maximized script is not the same as the
// maximized script of the base language.
{"zh-TW", "zh-Hant"},
{"zh-HK", "zh-Hant"},
{"zh-Hant-TW", "zh-Hant"},
{"zh-Hant-HK", "zh-Hant"},
// Non-default script skips to und.
// CLDR
{"az-Cyrl", "und"},
{"bs-Cyrl", "und"},
{"en-Dsrt", "und"},
{"ha-Arab", "und"},
{"mn-Mong", "und"},
{"pa-Arab", "und"},
{"shi-Latn", "und"},
{"sr-Latn", "und"},
{"uz-Arab", "und"},
{"uz-Cyrl", "und"},
{"vai-Latn", "und"},
{"zh-Hant", "und"},
// extra
{"nl-Cyrl", "und"},
// World english inherits from en-001.
{"en-150", "en-001"},
{"en-AU", "en-001"},
{"en-BE", "en-001"},
{"en-GG", "en-001"},
{"en-GI", "en-001"},
{"en-HK", "en-001"},
{"en-IE", "en-001"},
{"en-IM", "en-001"},
{"en-IN", "en-001"},
{"en-JE", "en-001"},
{"en-MT", "en-001"},
{"en-NZ", "en-001"},
{"en-PK", "en-001"},
{"en-SG", "en-001"},
// Spanish in Latin-American countries have es-419 as parent.
{"es-AR", "es-419"},
{"es-BO", "es-419"},
{"es-CL", "es-419"},
{"es-CO", "es-419"},
{"es-CR", "es-419"},
{"es-CU", "es-419"},
{"es-DO", "es-419"},
{"es-EC", "es-419"},
{"es-GT", "es-419"},
{"es-HN", "es-419"},
{"es-MX", "es-419"},
{"es-NI", "es-419"},
{"es-PA", "es-419"},
{"es-PE", "es-419"},
{"es-PR", "es-419"},
{"es-PY", "es-419"},
{"es-SV", "es-419"},
{"es-US", "es-419"},
{"es-UY", "es-419"},
{"es-VE", "es-419"},
// exceptions (according to CLDR)
{"es-CW", "es"},
// Inherit from pt-PT, instead of pt for these countries.
{"pt-AO", "pt-PT"},
{"pt-CV", "pt-PT"},
{"pt-GW", "pt-PT"},
{"pt-MO", "pt-PT"},
{"pt-MZ", "pt-PT"},
{"pt-ST", "pt-PT"},
{"pt-TL", "pt-PT"},
{"en-GB-u-co-phonebk-rg-uszzzz", "en-GB"},
{"en-GB-u-rg-uszzzz", "en-GB"},
{"en-US-u-va-posix", "en-US"},
// Difference between language and regional tag.
{"ca-ES-valencia", "ca-ES"},
{"ca-ES-valencia-u-rg-ptzzzz", "ca-ES"},
{"en-US-u-va-variant", "en-US"},
{"en-u-va-variant", "en"},
{"en-u-rg-gbzzzz", "en"},
{"en-US-u-rg-gbzzzz", "en-US"},
{"nl-US-u-rg-gbzzzz", "nl-US"},
}
for _, tt := range tests {
tag := Raw.MustParse(tt.in)
if p := Raw.MustParse(tt.out); p != tag.Parent() {
t.Errorf("%s: was %v; want %v", tt.in, tag.Parent(), p)
}
}
}
var (
// Tags without error that don't need to be changed.
benchBasic = []string{
"en",
"en-Latn",
"en-GB",
"za",
"zh-Hant",
"zh",
"zh-HK",
"ar-MK",
"en-CA",
"fr-CA",
"fr-CH",
"fr",
"lv",
"he-IT",
"tlh",
"ja",
"ja-Jpan",
"ja-Jpan-JP",
"de-1996",
"de-CH",
"sr",
"sr-Latn",
}
// Tags with extensions, not changes required.
benchExt = []string{
"x-a-b-c-d",
"x-aa-bbbb-cccccccc-d",
"en-x_cc-b-bbb-a-aaa",
"en-c_cc-b-bbb-a-aaa-x-x",
"en-u-co-phonebk",
"en-Cyrl-u-co-phonebk",
"en-US-u-co-phonebk-cu-xau",
"en-nedix-u-co-phonebk",
"en-t-t0-abcd",
"en-t-nl-latn",
"en-t-t0-abcd-x-a",
"en_t_pt_MLt",
"en-t-fr-est",
}
// Change, but not memory allocation required.
benchSimpleChange = []string{
"EN",
"i-klingon",
"en-latn",
"zh-cmn-Hans-CN",
"iw-NL",
}
// Change and memory allocation required.
benchChangeAlloc = []string{
"en-c_cc-b-bbb-a-aaa",
"en-u-cu-xua-co-phonebk",
"en-u-cu-xua-co-phonebk-a-cd",
"en-u-def-abc-cu-xua-co-phonebk",
"en-t-en-Cyrl-NL-1994",
"en-t-en-Cyrl-NL-1994-t0-abc-def",
}
// Tags that result in errors.
benchErr = []string{
// IllFormed
"x_A.-B-C_D",
"en-u-cu-co-phonebk",
"en-u-cu-xau-co",
"en-t-nl-abcd",
// Invalid
"xx",
"nl-Uuuu",
"nl-QB",
}
benchChange = append(benchSimpleChange, benchChangeAlloc...)
benchAll = append(append(append(benchBasic, benchExt...), benchChange...), benchErr...)
)
func doParse(b *testing.B, tag []string) {
for i := 0; i < b.N; i++ {
// Use the modulo instead of looping over all tags so that we get a somewhat
// meaningful ns/op.
Parse(tag[i%len(tag)])
}
}
func BenchmarkParse(b *testing.B) {
doParse(b, benchAll)
}
func BenchmarkParseBasic(b *testing.B) {
doParse(b, benchBasic)
}
func BenchmarkParseError(b *testing.B) {
doParse(b, benchErr)
}
func BenchmarkParseSimpleChange(b *testing.B) {
doParse(b, benchSimpleChange)
}
func BenchmarkParseChangeAlloc(b *testing.B) {
doParse(b, benchChangeAlloc)
}
@@ -0,0 +1,281 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package language
import (
"testing"
)
func TestRegionID(t *testing.T) {
tests := []struct {
in, out string
}{
{"_ ", ""},
{"_000", ""},
{"419", "419"},
{"AA", "AA"},
{"ATF", "TF"},
{"HV", "HV"},
{"CT", "CT"},
{"DY", "DY"},
{"IC", "IC"},
{"FQ", "FQ"},
{"JT", "JT"},
{"ZZ", "ZZ"},
{"EU", "EU"},
{"QO", "QO"},
{"FX", "FX"},
}
for i, tt := range tests {
if tt.in[0] == '_' {
id := tt.in[1:]
if _, err := ParseRegion(id); err == nil {
t.Errorf("%d:err(%s): found nil; want error", i, id)
}
continue
}
want, _ := ParseRegion(tt.in)
if s := want.String(); s != tt.out {
t.Errorf("%d:%s: found %q; want %q", i, tt.in, s, tt.out)
}
if len(tt.in) == 2 {
want, _ := ParseRegion(tt.in)
if s := want.String(); s != tt.out {
t.Errorf("%d:getISO2(%s): found %q; want %q", i, tt.in, s, tt.out)
}
}
}
}
func TestRegionISO3(t *testing.T) {
tests := []struct {
from, iso3, to string
}{
{" ", "ZZZ", "ZZ"},
{"000", "ZZZ", "ZZ"},
{"AA", "AAA", ""},
{"CT", "CTE", ""},
{"DY", "DHY", ""},
{"EU", "QUU", ""},
{"HV", "HVO", ""},
{"IC", "ZZZ", "ZZ"},
{"JT", "JTN", ""},
{"PZ", "PCZ", ""},
{"QU", "QUU", "EU"},
{"QO", "QOO", ""},
{"YD", "YMD", ""},
{"FQ", "ATF", "TF"},
{"TF", "ATF", ""},
{"FX", "FXX", ""},
{"ZZ", "ZZZ", ""},
{"419", "ZZZ", "ZZ"},
}
for _, tt := range tests {
r, _ := ParseRegion(tt.from)
if s := r.ISO3(); s != tt.iso3 {
t.Errorf("iso3(%q): found %q; want %q", tt.from, s, tt.iso3)
}
if tt.iso3 == "" {
continue
}
want := tt.to
if tt.to == "" {
want = tt.from
}
r, _ = ParseRegion(want)
if id, _ := ParseRegion(tt.iso3); id != r {
t.Errorf("%s: found %q; want %q", tt.iso3, id, want)
}
}
}
func TestRegionM49(t *testing.T) {
fromTests := []struct {
m49 int
id string
}{
{0, ""},
{-1, ""},
{1000, ""},
{10000, ""},
{001, "001"},
{104, "MM"},
{180, "CD"},
{230, "ET"},
{231, "ET"},
{249, "FX"},
{250, "FR"},
{276, "DE"},
{278, "DD"},
{280, "DE"},
{419, "419"},
{626, "TL"},
{736, "SD"},
{840, "US"},
{854, "BF"},
{891, "CS"},
{899, ""},
{958, "AA"},
{966, "QT"},
{967, "EU"},
{999, "ZZ"},
}
for _, tt := range fromTests {
id, err := EncodeM49(tt.m49)
if want, have := err != nil, tt.id == ""; want != have {
t.Errorf("error(%d): have %v; want %v", tt.m49, have, want)
continue
}
r, _ := ParseRegion(tt.id)
if r != id {
t.Errorf("region(%d): have %s; want %s", tt.m49, id, r)
}
}
toTests := []struct {
m49 int
id string
}{
{0, "000"},
{0, "IC"}, // Some codes don't have an ID
{001, "001"},
{104, "MM"},
{104, "BU"},
{180, "CD"},
{180, "ZR"},
{231, "ET"},
{250, "FR"},
{249, "FX"},
{276, "DE"},
{278, "DD"},
{419, "419"},
{626, "TL"},
{626, "TP"},
{729, "SD"},
{826, "GB"},
{840, "US"},
{854, "BF"},
{891, "YU"},
{891, "CS"},
{958, "AA"},
{966, "QT"},
{967, "EU"},
{967, "QU"},
{999, "ZZ"},
// For codes that don't have an M49 code use the replacement value,
// if available.
{854, "HV"}, // maps to Burkino Faso
}
for _, tt := range toTests {
r, _ := ParseRegion(tt.id)
if r.M49() != tt.m49 {
t.Errorf("m49(%q): have %d; want %d", tt.id, r.M49(), tt.m49)
}
}
}
func TestRegionDeprecation(t *testing.T) {
tests := []struct{ in, out string }{
{"BU", "MM"},
{"BUR", "MM"},
{"CT", "KI"},
{"DD", "DE"},
{"DDR", "DE"},
{"DY", "BJ"},
{"FX", "FR"},
{"HV", "BF"},
{"JT", "UM"},
{"MI", "UM"},
{"NH", "VU"},
{"NQ", "AQ"},
{"PU", "UM"},
{"PZ", "PA"},
{"QU", "EU"},
{"RH", "ZW"},
{"TP", "TL"},
{"UK", "GB"},
{"VD", "VN"},
{"WK", "UM"},
{"YD", "YE"},
{"NL", "NL"},
}
for _, tt := range tests {
rIn, _ := ParseRegion(tt.in)
rOut, _ := ParseRegion(tt.out)
r := rIn.Canonicalize()
if rOut == rIn && r.String() == "ZZ" {
t.Errorf("%s: was %q; want %q", tt.in, r, tt.in)
}
if rOut != rIn && r != rOut {
t.Errorf("%s: was %q; want %q", tt.in, r, tt.out)
}
}
}
func TestIsPrivateUse(t *testing.T) {
type test struct {
s string
private bool
}
tests := []test{
{"en", false},
{"und", false},
{"pzn", false},
{"qaa", true},
{"qtz", true},
{"qua", false},
}
for i, tt := range tests {
x, _ := ParseBase(tt.s)
if b := x.IsPrivateUse(); b != tt.private {
t.Errorf("%d: langID.IsPrivateUse(%s) was %v; want %v", i, tt.s, b, tt.private)
}
}
tests = []test{
{"001", false},
{"419", false},
{"899", false},
{"900", false},
{"957", false},
{"958", true},
{"AA", true},
{"AC", false},
{"EU", false}, // CLDR grouping, exceptionally reserved in ISO.
{"QU", true}, // Canonicalizes to EU, User-assigned in ISO.
{"QO", true}, // CLDR grouping, User-assigned in ISO.
{"QA", false},
{"QM", true},
{"QZ", true},
{"XA", true},
{"XK", true}, // Assigned to Kosovo in CLDR, User-assigned in ISO.
{"XZ", true},
{"ZW", false},
{"ZZ", true},
}
for i, tt := range tests {
x, _ := ParseRegion(tt.s)
if b := x.IsPrivateUse(); b != tt.private {
t.Errorf("%d: regionID.IsPrivateUse(%s) was %v; want %v", i, tt.s, b, tt.private)
}
}
tests = []test{
{"Latn", false},
{"Laaa", false}, // invalid
{"Qaaa", true},
{"Qabx", true},
{"Qaby", false},
{"Zyyy", false},
{"Zzzz", false},
}
for i, tt := range tests {
x, _ := ParseScript(tt.s)
if b := x.IsPrivateUse(); b != tt.private {
t.Errorf("%d: scriptID.IsPrivateUse(%s) was %v; want %v", i, tt.s, b, tt.private)
}
}
}
@@ -0,0 +1,735 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package language
import (
"errors"
"strings"
"golang.org/x/text/internal/language"
)
// A MatchOption configures a Matcher.
type MatchOption func(*matcher)
// PreferSameScript will, in the absence of a match, result in the first
// preferred tag with the same script as a supported tag to match this supported
// tag. The default is currently true, but this may change in the future.
func PreferSameScript(preferSame bool) MatchOption {
return func(m *matcher) { m.preferSameScript = preferSame }
}
// TODO(v1.0.0): consider making Matcher a concrete type, instead of interface.
// There doesn't seem to be too much need for multiple types.
// Making it a concrete type allows MatchStrings to be a method, which will
// improve its discoverability.
// MatchStrings parses and matches the given strings until one of them matches
// the language in the Matcher. A string may be an Accept-Language header as
// handled by ParseAcceptLanguage. The default language is returned if no
// other language matched.
func MatchStrings(m Matcher, lang ...string) (tag Tag, index int) {
for _, accept := range lang {
desired, _, err := ParseAcceptLanguage(accept)
if err != nil {
continue
}
if tag, index, conf := m.Match(desired...); conf != No {
return tag, index
}
}
tag, index, _ = m.Match()
return
}
// Matcher is the interface that wraps the Match method.
//
// Match returns the best match for any of the given tags, along with
// a unique index associated with the returned tag and a confidence
// score.
type Matcher interface {
Match(t ...Tag) (tag Tag, index int, c Confidence)
}
// Comprehends reports the confidence score for a speaker of a given language
// to being able to comprehend the written form of an alternative language.
func Comprehends(speaker, alternative Tag) Confidence {
_, _, c := NewMatcher([]Tag{alternative}).Match(speaker)
return c
}
// NewMatcher returns a Matcher that matches an ordered list of preferred tags
// against a list of supported tags based on written intelligibility, closeness
// of dialect, equivalence of subtags and various other rules. It is initialized
// with the list of supported tags. The first element is used as the default
// value in case no match is found.
//
// Its Match method matches the first of the given Tags to reach a certain
// confidence threshold. The tags passed to Match should therefore be specified
// in order of preference. Extensions are ignored for matching.
//
// The index returned by the Match method corresponds to the index of the
// matched tag in t, but is augmented with the Unicode extension ('u')of the
// corresponding preferred tag. This allows user locale options to be passed
// transparently.
func NewMatcher(t []Tag, options ...MatchOption) Matcher {
return newMatcher(t, options)
}
func (m *matcher) Match(want ...Tag) (t Tag, index int, c Confidence) {
var tt language.Tag
match, w, c := m.getBest(want...)
if match != nil {
tt, index = match.tag, match.index
} else {
// TODO: this should be an option
tt = m.default_.tag
if m.preferSameScript {
outer:
for _, w := range want {
script, _ := w.Script()
if script.scriptID == 0 {
// Don't do anything if there is no script, such as with
// private subtags.
continue
}
for i, h := range m.supported {
if script.scriptID == h.maxScript {
tt, index = h.tag, i
break outer
}
}
}
}
// TODO: select first language tag based on script.
}
if w.RegionID != tt.RegionID && w.RegionID != 0 {
if w.RegionID != 0 && tt.RegionID != 0 && tt.RegionID.Contains(w.RegionID) {
tt.RegionID = w.RegionID
tt.RemakeString()
} else if r := w.RegionID.String(); len(r) == 2 {
// TODO: also filter macro and deprecated.
tt, _ = tt.SetTypeForKey("rg", strings.ToLower(r)+"zzzz")
}
}
// Copy options from the user-provided tag into the result tag. This is hard
// to do after the fact, so we do it here.
// TODO: add in alternative variants to -u-va-.
// TODO: add preferred region to -u-rg-.
if e := w.Extensions(); len(e) > 0 {
b := language.Builder{}
b.SetTag(tt)
for _, e := range e {
b.AddExt(e)
}
tt = b.Make()
}
return makeTag(tt), index, c
}
// ErrMissingLikelyTagsData indicates no information was available
// to compute likely values of missing tags.
var ErrMissingLikelyTagsData = errors.New("missing likely tags data")
// func (t *Tag) setTagsFrom(id Tag) {
// t.LangID = id.LangID
// t.ScriptID = id.ScriptID
// t.RegionID = id.RegionID
// }
// Tag Matching
// CLDR defines an algorithm for finding the best match between two sets of language
// tags. The basic algorithm defines how to score a possible match and then find
// the match with the best score
// (see https://www.unicode.org/reports/tr35/#LanguageMatching).
// Using scoring has several disadvantages. The scoring obfuscates the importance of
// the various factors considered, making the algorithm harder to understand. Using
// scoring also requires the full score to be computed for each pair of tags.
//
// We will use a different algorithm which aims to have the following properties:
// - clarity on the precedence of the various selection factors, and
// - improved performance by allowing early termination of a comparison.
//
// Matching algorithm (overview)
// Input:
// - supported: a set of supported tags
// - default: the default tag to return in case there is no match
// - desired: list of desired tags, ordered by preference, starting with
// the most-preferred.
//
// Algorithm:
// 1) Set the best match to the lowest confidence level
// 2) For each tag in "desired":
// a) For each tag in "supported":
// 1) compute the match between the two tags.
// 2) if the match is better than the previous best match, replace it
// with the new match. (see next section)
// b) if the current best match is Exact and pin is true the result will be
// frozen to the language found thusfar, although better matches may
// still be found for the same language.
// 3) If the best match so far is below a certain threshold, return "default".
//
// Ranking:
// We use two phases to determine whether one pair of tags are a better match
// than another pair of tags. First, we determine a rough confidence level. If the
// levels are different, the one with the highest confidence wins.
// Second, if the rough confidence levels are identical, we use a set of tie-breaker
// rules.
//
// The confidence level of matching a pair of tags is determined by finding the
// lowest confidence level of any matches of the corresponding subtags (the
// result is deemed as good as its weakest link).
// We define the following levels:
// Exact - An exact match of a subtag, before adding likely subtags.
// MaxExact - An exact match of a subtag, after adding likely subtags.
// [See Note 2].
// High - High level of mutual intelligibility between different subtag
// variants.
// Low - Low level of mutual intelligibility between different subtag
// variants.
// No - No mutual intelligibility.
//
// The following levels can occur for each type of subtag:
// Base: Exact, MaxExact, High, Low, No
// Script: Exact, MaxExact [see Note 3], Low, No
// Region: Exact, MaxExact, High
// Variant: Exact, High
// Private: Exact, No
//
// Any result with a confidence level of Low or higher is deemed a possible match.
// Once a desired tag matches any of the supported tags with a level of MaxExact
// or higher, the next desired tag is not considered (see Step 2.b).
// Note that CLDR provides languageMatching data that defines close equivalence
// classes for base languages, scripts and regions.
//
// Tie-breaking
// If we get the same confidence level for two matches, we apply a sequence of
// tie-breaking rules. The first that succeeds defines the result. The rules are
// applied in the following order.
// 1) Original language was defined and was identical.
// 2) Original region was defined and was identical.
// 3) Distance between two maximized regions was the smallest.
// 4) Original script was defined and was identical.
// 5) Distance from want tag to have tag using the parent relation [see Note 5.]
// If there is still no winner after these rules are applied, the first match
// found wins.
//
// Notes:
// [2] In practice, as matching of Exact is done in a separate phase from
// matching the other levels, we reuse the Exact level to mean MaxExact in
// the second phase. As a consequence, we only need the levels defined by
// the Confidence type. The MaxExact confidence level is mapped to High in
// the public API.
// [3] We do not differentiate between maximized script values that were derived
// from suppressScript versus most likely tag data. We determined that in
// ranking the two, one ranks just after the other. Moreover, the two cannot
// occur concurrently. As a consequence, they are identical for practical
// purposes.
// [4] In case of deprecated, macro-equivalents and legacy mappings, we assign
// the MaxExact level to allow iw vs he to still be a closer match than
// en-AU vs en-US, for example.
// [5] In CLDR a locale inherits fields that are unspecified for this locale
// from its parent. Therefore, if a locale is a parent of another locale,
// it is a strong measure for closeness, especially when no other tie
// breaker rule applies. One could also argue it is inconsistent, for
// example, when pt-AO matches pt (which CLDR equates with pt-BR), even
// though its parent is pt-PT according to the inheritance rules.
//
// Implementation Details:
// There are several performance considerations worth pointing out. Most notably,
// we preprocess as much as possible (within reason) at the time of creation of a
// matcher. This includes:
// - creating a per-language map, which includes data for the raw base language
// and its canonicalized variant (if applicable),
// - expanding entries for the equivalence classes defined in CLDR's
// languageMatch data.
// The per-language map ensures that typically only a very small number of tags
// need to be considered. The pre-expansion of canonicalized subtags and
// equivalence classes reduces the amount of map lookups that need to be done at
// runtime.
// matcher keeps a set of supported language tags, indexed by language.
type matcher struct {
default_ *haveTag
supported []*haveTag
index map[language.Language]*matchHeader
passSettings bool
preferSameScript bool
}
// matchHeader has the lists of tags for exact matches and matches based on
// maximized and canonicalized tags for a given language.
type matchHeader struct {
haveTags []*haveTag
original bool
}
// haveTag holds a supported Tag and its maximized script and region. The maximized
// or canonicalized language is not stored as it is not needed during matching.
type haveTag struct {
tag language.Tag
// index of this tag in the original list of supported tags.
index int
// conf is the maximum confidence that can result from matching this haveTag.
// When conf < Exact this means it was inserted after applying a CLDR equivalence rule.
conf Confidence
// Maximized region and script.
maxRegion language.Region
maxScript language.Script
// altScript may be checked as an alternative match to maxScript. If altScript
// matches, the confidence level for this match is Low. Theoretically there
// could be multiple alternative scripts. This does not occur in practice.
altScript language.Script
// nextMax is the index of the next haveTag with the same maximized tags.
nextMax uint16
}
func makeHaveTag(tag language.Tag, index int) (haveTag, language.Language) {
max := tag
if tag.LangID != 0 || tag.RegionID != 0 || tag.ScriptID != 0 {
max, _ = canonicalize(All, max)
max, _ = max.Maximize()
max.RemakeString()
}
return haveTag{tag, index, Exact, max.RegionID, max.ScriptID, altScript(max.LangID, max.ScriptID), 0}, max.LangID
}
// altScript returns an alternative script that may match the given script with
// a low confidence. At the moment, the langMatch data allows for at most one
// script to map to another and we rely on this to keep the code simple.
func altScript(l language.Language, s language.Script) language.Script {
for _, alt := range matchScript {
// TODO: also match cases where language is not the same.
if (language.Language(alt.wantLang) == l || language.Language(alt.haveLang) == l) &&
language.Script(alt.haveScript) == s {
return language.Script(alt.wantScript)
}
}
return 0
}
// addIfNew adds a haveTag to the list of tags only if it is a unique tag.
// Tags that have the same maximized values are linked by index.
func (h *matchHeader) addIfNew(n haveTag, exact bool) {
h.original = h.original || exact
// Don't add new exact matches.
for _, v := range h.haveTags {
if equalsRest(v.tag, n.tag) {
return
}
}
// Allow duplicate maximized tags, but create a linked list to allow quickly
// comparing the equivalents and bail out.
for i, v := range h.haveTags {
if v.maxScript == n.maxScript &&
v.maxRegion == n.maxRegion &&
v.tag.VariantOrPrivateUseTags() == n.tag.VariantOrPrivateUseTags() {
for h.haveTags[i].nextMax != 0 {
i = int(h.haveTags[i].nextMax)
}
h.haveTags[i].nextMax = uint16(len(h.haveTags))
break
}
}
h.haveTags = append(h.haveTags, &n)
}
// header returns the matchHeader for the given language. It creates one if
// it doesn't already exist.
func (m *matcher) header(l language.Language) *matchHeader {
if h := m.index[l]; h != nil {
return h
}
h := &matchHeader{}
m.index[l] = h
return h
}
func toConf(d uint8) Confidence {
if d <= 10 {
return High
}
if d < 30 {
return Low
}
return No
}
// newMatcher builds an index for the given supported tags and returns it as
// a matcher. It also expands the index by considering various equivalence classes
// for a given tag.
func newMatcher(supported []Tag, options []MatchOption) *matcher {
m := &matcher{
index: make(map[language.Language]*matchHeader),
preferSameScript: true,
}
for _, o := range options {
o(m)
}
if len(supported) == 0 {
m.default_ = &haveTag{}
return m
}
// Add supported languages to the index. Add exact matches first to give
// them precedence.
for i, tag := range supported {
tt := tag.tag()
pair, _ := makeHaveTag(tt, i)
m.header(tt.LangID).addIfNew(pair, true)
m.supported = append(m.supported, &pair)
}
m.default_ = m.header(supported[0].lang()).haveTags[0]
// Keep these in two different loops to support the case that two equivalent
// languages are distinguished, such as iw and he.
for i, tag := range supported {
tt := tag.tag()
pair, max := makeHaveTag(tt, i)
if max != tt.LangID {
m.header(max).addIfNew(pair, true)
}
}
// update is used to add indexes in the map for equivalent languages.
// update will only add entries to original indexes, thus not computing any
// transitive relations.
update := func(want, have uint16, conf Confidence) {
if hh := m.index[language.Language(have)]; hh != nil {
if !hh.original {
return
}
hw := m.header(language.Language(want))
for _, ht := range hh.haveTags {
v := *ht
if conf < v.conf {
v.conf = conf
}
v.nextMax = 0 // this value needs to be recomputed
if v.altScript != 0 {
v.altScript = altScript(language.Language(want), v.maxScript)
}
hw.addIfNew(v, conf == Exact && hh.original)
}
}
}
// Add entries for languages with mutual intelligibility as defined by CLDR's
// languageMatch data.
for _, ml := range matchLang {
update(ml.want, ml.have, toConf(ml.distance))
if !ml.oneway {
update(ml.have, ml.want, toConf(ml.distance))
}
}
// Add entries for possible canonicalizations. This is an optimization to
// ensure that only one map lookup needs to be done at runtime per desired tag.
// First we match deprecated equivalents. If they are perfect equivalents
// (their canonicalization simply substitutes a different language code, but
// nothing else), the match confidence is Exact, otherwise it is High.
for i, lm := range language.AliasMap {
// If deprecated codes match and there is no fiddling with the script
// or region, we consider it an exact match.
conf := Exact
if language.AliasTypes[i] != language.Macro {
if !isExactEquivalent(language.Language(lm.From)) {
conf = High
}
update(lm.To, lm.From, conf)
}
update(lm.From, lm.To, conf)
}
return m
}
// getBest gets the best matching tag in m for any of the given tags, taking into
// account the order of preference of the given tags.
func (m *matcher) getBest(want ...Tag) (got *haveTag, orig language.Tag, c Confidence) {
best := bestMatch{}
for i, ww := range want {
w := ww.tag()
var max language.Tag
// Check for exact match first.
h := m.index[w.LangID]
if w.LangID != 0 {
if h == nil {
continue
}
// Base language is defined.
max, _ = canonicalize(Legacy|Deprecated|Macro, w)
// A region that is added through canonicalization is stronger than
// a maximized region: set it in the original (e.g. mo -> ro-MD).
if w.RegionID != max.RegionID {
w.RegionID = max.RegionID
}
// TODO: should we do the same for scripts?
// See test case: en, sr, nl ; sh ; sr
max, _ = max.Maximize()
} else {
// Base language is not defined.
if h != nil {
for i := range h.haveTags {
have := h.haveTags[i]
if equalsRest(have.tag, w) {
return have, w, Exact
}
}
}
if w.ScriptID == 0 && w.RegionID == 0 {
// We skip all tags matching und for approximate matching, including
// private tags.
continue
}
max, _ = w.Maximize()
if h = m.index[max.LangID]; h == nil {
continue
}
}
pin := true
for _, t := range want[i+1:] {
if w.LangID == t.lang() {
pin = false
break
}
}
// Check for match based on maximized tag.
for i := range h.haveTags {
have := h.haveTags[i]
best.update(have, w, max.ScriptID, max.RegionID, pin)
if best.conf == Exact {
for have.nextMax != 0 {
have = h.haveTags[have.nextMax]
best.update(have, w, max.ScriptID, max.RegionID, pin)
}
return best.have, best.want, best.conf
}
}
}
if best.conf <= No {
if len(want) != 0 {
return nil, want[0].tag(), No
}
return nil, language.Tag{}, No
}
return best.have, best.want, best.conf
}
// bestMatch accumulates the best match so far.
type bestMatch struct {
have *haveTag
want language.Tag
conf Confidence
pinnedRegion language.Region
pinLanguage bool
sameRegionGroup bool
// Cached results from applying tie-breaking rules.
origLang bool
origReg bool
paradigmReg bool
regGroupDist uint8
origScript bool
}
// update updates the existing best match if the new pair is considered to be a
// better match. To determine if the given pair is a better match, it first
// computes the rough confidence level. If this surpasses the current match, it
// will replace it and update the tie-breaker rule cache. If there is a tie, it
// proceeds with applying a series of tie-breaker rules. If there is no
// conclusive winner after applying the tie-breaker rules, it leaves the current
// match as the preferred match.
//
// If pin is true and have and tag are a strong match, it will henceforth only
// consider matches for this language. This corresponds to the idea that most
// users have a strong preference for the first defined language. A user can
// still prefer a second language over a dialect of the preferred language by
// explicitly specifying dialects, e.g. "en, nl, en-GB". In this case pin should
// be false.
func (m *bestMatch) update(have *haveTag, tag language.Tag, maxScript language.Script, maxRegion language.Region, pin bool) {
// Bail if the maximum attainable confidence is below that of the current best match.
c := have.conf
if c < m.conf {
return
}
// Don't change the language once we already have found an exact match.
if m.pinLanguage && tag.LangID != m.want.LangID {
return
}
// Pin the region group if we are comparing tags for the same language.
if tag.LangID == m.want.LangID && m.sameRegionGroup {
_, sameGroup := regionGroupDist(m.pinnedRegion, have.maxRegion, have.maxScript, m.want.LangID)
if !sameGroup {
return
}
}
if c == Exact && have.maxScript == maxScript {
// If there is another language and then another entry of this language,
// don't pin anything, otherwise pin the language.
m.pinLanguage = pin
}
if equalsRest(have.tag, tag) {
} else if have.maxScript != maxScript {
// There is usually very little comprehension between different scripts.
// In a few cases there may still be Low comprehension. This possibility
// is pre-computed and stored in have.altScript.
if Low < m.conf || have.altScript != maxScript {
return
}
c = Low
} else if have.maxRegion != maxRegion {
if High < c {
// There is usually a small difference between languages across regions.
c = High
}
}
// We store the results of the computations of the tie-breaker rules along
// with the best match. There is no need to do the checks once we determine
// we have a winner, but we do still need to do the tie-breaker computations.
// We use "beaten" to keep track if we still need to do the checks.
beaten := false // true if the new pair defeats the current one.
if c != m.conf {
if c < m.conf {
return
}
beaten = true
}
// Tie-breaker rules:
// We prefer if the pre-maximized language was specified and identical.
origLang := have.tag.LangID == tag.LangID && tag.LangID != 0
if !beaten && m.origLang != origLang {
if m.origLang {
return
}
beaten = true
}
// We prefer if the pre-maximized region was specified and identical.
origReg := have.tag.RegionID == tag.RegionID && tag.RegionID != 0
if !beaten && m.origReg != origReg {
if m.origReg {
return
}
beaten = true
}
regGroupDist, sameGroup := regionGroupDist(have.maxRegion, maxRegion, maxScript, tag.LangID)
if !beaten && m.regGroupDist != regGroupDist {
if regGroupDist > m.regGroupDist {
return
}
beaten = true
}
paradigmReg := isParadigmLocale(tag.LangID, have.maxRegion)
if !beaten && m.paradigmReg != paradigmReg {
if !paradigmReg {
return
}
beaten = true
}
// Next we prefer if the pre-maximized script was specified and identical.
origScript := have.tag.ScriptID == tag.ScriptID && tag.ScriptID != 0
if !beaten && m.origScript != origScript {
if m.origScript {
return
}
beaten = true
}
// Update m to the newly found best match.
if beaten {
m.have = have
m.want = tag
m.conf = c
m.pinnedRegion = maxRegion
m.sameRegionGroup = sameGroup
m.origLang = origLang
m.origReg = origReg
m.paradigmReg = paradigmReg
m.origScript = origScript
m.regGroupDist = regGroupDist
}
}
func isParadigmLocale(lang language.Language, r language.Region) bool {
for _, e := range paradigmLocales {
if language.Language(e[0]) == lang && (r == language.Region(e[1]) || r == language.Region(e[2])) {
return true
}
}
return false
}
// regionGroupDist computes the distance between two regions based on their
// CLDR grouping.
func regionGroupDist(a, b language.Region, script language.Script, lang language.Language) (dist uint8, same bool) {
const defaultDistance = 4
aGroup := uint(regionToGroups[a]) << 1
bGroup := uint(regionToGroups[b]) << 1
for _, ri := range matchRegion {
if language.Language(ri.lang) == lang && (ri.script == 0 || language.Script(ri.script) == script) {
group := uint(1 << (ri.group &^ 0x80))
if 0x80&ri.group == 0 {
if aGroup&bGroup&group != 0 { // Both regions are in the group.
return ri.distance, ri.distance == defaultDistance
}
} else {
if (aGroup|bGroup)&group == 0 { // Both regions are not in the group.
return ri.distance, ri.distance == defaultDistance
}
}
}
}
return defaultDistance, true
}
// equalsRest compares everything except the language.
func equalsRest(a, b language.Tag) bool {
// TODO: don't include extensions in this comparison. To do this efficiently,
// though, we should handle private tags separately.
return a.ScriptID == b.ScriptID && a.RegionID == b.RegionID && a.VariantOrPrivateUseTags() == b.VariantOrPrivateUseTags()
}
// isExactEquivalent returns true if canonicalizing the language will not alter
// the script or region of a tag.
func isExactEquivalent(l language.Language) bool {
for _, o := range notEquivalent {
if o == l {
return false
}
}
return true
}
var notEquivalent []language.Language
func init() {
// Create a list of all languages for which canonicalization may alter the
// script or region.
for _, lm := range language.AliasMap {
tag := language.Tag{LangID: language.Language(lm.From)}
if tag, _ = canonicalize(All, tag); tag.ScriptID != 0 || tag.RegionID != 0 {
notEquivalent = append(notEquivalent, language.Language(lm.From))
}
}
// Maximize undefined regions of paradigm locales.
for i, v := range paradigmLocales {
t := language.Tag{LangID: language.Language(v[0])}
max, _ := t.Maximize()
if v[1] == 0 {
paradigmLocales[i][1] = uint16(max.RegionID)
}
if v[2] == 0 {
paradigmLocales[i][2] = uint16(max.RegionID)
}
}
}
@@ -0,0 +1,384 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package language
import (
"bytes"
"flag"
"fmt"
"os"
"path"
"path/filepath"
"strings"
"testing"
"unicode/utf8"
"golang.org/x/text/internal/testtext"
"golang.org/x/text/internal/ucd"
)
var verbose = flag.Bool("verbose", false, "set to true to print the internal tables of matchers")
func TestCompliance(t *testing.T) {
filepath.Walk("testdata", func(file string, info os.FileInfo, err error) error {
if info.IsDir() {
return nil
}
r, err := os.Open(file)
if err != nil {
t.Fatal(err)
}
ucd.Parse(r, func(p *ucd.Parser) {
name := strings.ReplaceAll(path.Join(p.String(0), p.String(1)), " ", "")
if skip[name] {
return
}
t.Run(info.Name()+"/"+short(name), func(t *testing.T) {
supported := makeTagList(p.String(0))
desired := makeTagList(p.String(1))
gotCombined, index, conf := NewMatcher(supported).Match(desired...)
gotMatch := supported[index]
wantMatch := Raw.Make(p.String(2)) // wantMatch may be null
if gotMatch != wantMatch {
t.Fatalf("match: got %q; want %q (%v)", gotMatch, wantMatch, conf)
}
if tag := strings.TrimSpace(p.String(3)); tag != "" {
wantCombined := Raw.MustParse(tag)
if err == nil && gotCombined != wantCombined {
t.Errorf("combined: got %q; want %q (%v)", gotCombined, wantCombined, conf)
}
}
})
})
return nil
})
}
func short(s string) string {
if len(s) <= 50 {
return s
}
var i int
for i = 1; i < utf8.UTFMax && !utf8.RuneStart(s[50-i]); i++ {
}
return s[:50-i] + "…"
}
var skip = map[string]bool{
// TODO: bugs
// Honor the wildcard match. This may only be useful to select non-exact
// stuff.
"mul,af/nl": true, // match: got "af"; want "mul"
// TODO: include other extensions.
// combined: got "en-GB-u-ca-buddhist-nu-arab"; want "en-GB-fonipa-t-m0-iso-i0-pinyin-u-ca-buddhist-nu-arab"
"und,en-GB-u-sd-gbsct/en-fonipa-u-nu-Arab-ca-buddhist-t-m0-iso-i0-pinyin": true,
// Inconsistencies with Mark Davis' implementation where it is not clear
// which is better.
// Inconsistencies in combined. I think the Go approach is more appropriate.
// We could use -u-rg- as alternative.
"und,fr/fr-BE-fonipa": true, // combined: got "fr"; want "fr-BE-fonipa"
"und,fr-CA/fr-BE-fonipa": true, // combined: got "fr-CA"; want "fr-BE-fonipa"
"und,fr-fonupa/fr-BE-fonipa": true, // combined: got "fr-fonupa"; want "fr-BE-fonipa"
"und,no/nn-BE-fonipa": true, // combined: got "no"; want "no-BE-fonipa"
"50,und,fr-CA-fonupa/fr-BE-fonipa": true, // combined: got "fr-CA-fonupa"; want "fr-BE-fonipa"
// The initial number is a threshold. As we don't use scoring, we will not
// implement this.
"50,und,fr-Cyrl-CA-fonupa/fr-BE-fonipa": true,
// match: got "und"; want "fr-Cyrl-CA-fonupa"
// combined: got "und"; want "fr-Cyrl-BE-fonipa"
// Other interesting cases to test:
// - Should same language or same script have the preference if there is
// usually no understanding of the other script?
// - More specific region in desired may replace enclosing supported.
}
func makeTagList(s string) (tags []Tag) {
for _, s := range strings.Split(s, ",") {
tags = append(tags, mk(strings.TrimSpace(s)))
}
return tags
}
func TestMatchStrings(t *testing.T) {
testCases := []struct {
supported string
desired string // strings separated by |
tag string
index int
}{{
supported: "en",
desired: "",
tag: "en",
index: 0,
}, {
supported: "en",
desired: "nl",
tag: "en",
index: 0,
}, {
supported: "en,nl",
desired: "nl",
tag: "nl",
index: 1,
}, {
supported: "en,nl",
desired: "nl|en",
tag: "nl",
index: 1,
}, {
supported: "en-GB,nl",
desired: "en ; q=0.1,nl",
tag: "nl",
index: 1,
}, {
supported: "en-GB,nl",
desired: "en;q=0.005 | dk; q=0.1,nl ",
tag: "en-GB",
index: 0,
}, {
// do not match faulty tags with und
supported: "en,und",
desired: "|en",
tag: "en",
index: 0,
}}
for _, tc := range testCases {
t.Run(path.Join(tc.supported, tc.desired), func(t *testing.T) {
m := NewMatcher(makeTagList(tc.supported))
tag, index := MatchStrings(m, strings.Split(tc.desired, "|")...)
if tag.String() != tc.tag || index != tc.index {
t.Errorf("got %v, %d; want %v, %d", tag, index, tc.tag, tc.index)
}
})
}
}
func TestRegionGroups(t *testing.T) {
testCases := []struct {
a, b string
distance uint8
}{
{"zh-TW", "zh-HK", 5},
{"zh-MO", "zh-HK", 4},
{"es-ES", "es-AR", 5},
{"es-ES", "es", 4},
{"es-419", "es-MX", 4},
{"es-AR", "es-MX", 4},
{"es-ES", "es-MX", 5},
{"es-PT", "es-MX", 5},
}
for _, tc := range testCases {
a := MustParse(tc.a)
aScript, _ := a.Script()
b := MustParse(tc.b)
bScript, _ := b.Script()
if aScript != bScript {
t.Errorf("scripts differ: %q vs %q", aScript, bScript)
continue
}
d, _ := regionGroupDist(a.region(), b.region(), aScript.scriptID, a.lang())
if d != tc.distance {
t.Errorf("got %q; want %q", d, tc.distance)
}
}
}
func TestIsParadigmLocale(t *testing.T) {
testCases := map[string]bool{
"en-US": true,
"en-GB": true,
"en-VI": false,
"es-GB": false,
"es-ES": true,
"es-419": true,
}
for str, want := range testCases {
tt := Make(str)
tag := tt.tag()
got := isParadigmLocale(tag.LangID, tag.RegionID)
if got != want {
t.Errorf("isPL(%q) = %v; want %v", str, got, want)
}
}
}
// Implementation of String methods for various types for debugging purposes.
func (m *matcher) String() string {
w := &bytes.Buffer{}
fmt.Fprintln(w, "Default:", m.default_)
for tag, h := range m.index {
fmt.Fprintf(w, " %s: %v\n", tag, h)
}
return w.String()
}
func (h *matchHeader) String() string {
w := &bytes.Buffer{}
fmt.Fprint(w, "haveTag: ")
for _, h := range h.haveTags {
fmt.Fprintf(w, "%v, ", h)
}
return w.String()
}
func (t haveTag) String() string {
return fmt.Sprintf("%v:%d:%v:%v-%v|%v", t.tag, t.index, t.conf, t.maxRegion, t.maxScript, t.altScript)
}
func TestIssue43834(t *testing.T) {
matcher := NewMatcher([]Tag{English})
// ZZ is the largest region code and should not cause overflow.
desired, _, err := ParseAcceptLanguage("en-ZZ")
if err != nil {
t.Error(err)
}
_, i, _ := matcher.Match(desired...)
if i != 0 {
t.Errorf("got %v; want 0", i)
}
}
func TestBestMatchAlloc(t *testing.T) {
m := NewMatcher(makeTagList("en sr nl"))
// Go allocates when creating a list of tags from a single tag!
list := []Tag{English}
avg := testtext.AllocsPerRun(100, func() {
m.Match(list...)
})
if avg > 0 {
t.Errorf("got %f; want 0", avg)
}
}
var benchHave = []Tag{
mk("en"),
mk("en-GB"),
mk("za"),
mk("zh-Hant"),
mk("zh-Hans-CN"),
mk("zh"),
mk("zh-HK"),
mk("ar-MK"),
mk("en-CA"),
mk("fr-CA"),
mk("fr-US"),
mk("fr-CH"),
mk("fr"),
mk("lt"),
mk("lv"),
mk("iw"),
mk("iw-NL"),
mk("he"),
mk("he-IT"),
mk("tlh"),
mk("ja"),
mk("ja-Jpan"),
mk("ja-Jpan-JP"),
mk("de"),
mk("de-CH"),
mk("de-AT"),
mk("de-DE"),
mk("sr"),
mk("sr-Latn"),
mk("sr-Cyrl"),
mk("sr-ME"),
}
var benchWant = [][]Tag{
[]Tag{
mk("en"),
},
[]Tag{
mk("en-AU"),
mk("de-HK"),
mk("nl"),
mk("fy"),
mk("lv"),
},
[]Tag{
mk("en-AU"),
mk("de-HK"),
mk("nl"),
mk("fy"),
},
[]Tag{
mk("ja-Hant"),
mk("da-HK"),
mk("nl"),
mk("zh-TW"),
},
[]Tag{
mk("ja-Hant"),
mk("da-HK"),
mk("nl"),
mk("hr"),
},
}
func BenchmarkMatch(b *testing.B) {
m := newMatcher(benchHave, nil)
for i := 0; i < b.N; i++ {
for _, want := range benchWant {
m.getBest(want...)
}
}
}
func BenchmarkMatchExact(b *testing.B) {
want := mk("en")
m := newMatcher(benchHave, nil)
for i := 0; i < b.N; i++ {
m.getBest(want)
}
}
func BenchmarkMatchAltLanguagePresent(b *testing.B) {
want := mk("hr")
m := newMatcher(benchHave, nil)
for i := 0; i < b.N; i++ {
m.getBest(want)
}
}
func BenchmarkMatchAltLanguageNotPresent(b *testing.B) {
want := mk("nn")
m := newMatcher(benchHave, nil)
for i := 0; i < b.N; i++ {
m.getBest(want)
}
}
func BenchmarkMatchAltScriptPresent(b *testing.B) {
want := mk("zh-Hant-CN")
m := newMatcher(benchHave, nil)
for i := 0; i < b.N; i++ {
m.getBest(want)
}
}
func BenchmarkMatchAltScriptNotPresent(b *testing.B) {
want := mk("fr-Cyrl")
m := newMatcher(benchHave, nil)
for i := 0; i < b.N; i++ {
m.getBest(want)
}
}
func BenchmarkMatchLimitedExact(b *testing.B) {
want := []Tag{mk("he-NL"), mk("iw-NL")}
m := newMatcher(benchHave, nil)
for i := 0; i < b.N; i++ {
m.getBest(want...)
}
}
@@ -0,0 +1,256 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package language
import (
"errors"
"sort"
"strconv"
"strings"
"golang.org/x/text/internal/language"
)
// ValueError is returned by any of the parsing functions when the
// input is well-formed but the respective subtag is not recognized
// as a valid value.
type ValueError interface {
error
// Subtag returns the subtag for which the error occurred.
Subtag() string
}
// Parse parses the given BCP 47 string and returns a valid Tag. If parsing
// failed it returns an error and any part of the tag that could be parsed.
// If parsing succeeded but an unknown value was found, it returns
// ValueError. The Tag returned in this case is just stripped of the unknown
// value. All other values are preserved. It accepts tags in the BCP 47 format
// and extensions to this standard defined in
// https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
// The resulting tag is canonicalized using the default canonicalization type.
func Parse(s string) (t Tag, err error) {
return Default.Parse(s)
}
// Parse parses the given BCP 47 string and returns a valid Tag. If parsing
// failed it returns an error and any part of the tag that could be parsed.
// If parsing succeeded but an unknown value was found, it returns
// ValueError. The Tag returned in this case is just stripped of the unknown
// value. All other values are preserved. It accepts tags in the BCP 47 format
// and extensions to this standard defined in
// https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
// The resulting tag is canonicalized using the canonicalization type c.
func (c CanonType) Parse(s string) (t Tag, err error) {
defer func() {
if recover() != nil {
t = Tag{}
err = language.ErrSyntax
}
}()
tt, err := language.Parse(s)
if err != nil {
return makeTag(tt), err
}
tt, changed := canonicalize(c, tt)
if changed {
tt.RemakeString()
}
return makeTag(tt), err
}
// Compose creates a Tag from individual parts, which may be of type Tag, Base,
// Script, Region, Variant, []Variant, Extension, []Extension or error. If a
// Base, Script or Region or slice of type Variant or Extension is passed more
// than once, the latter will overwrite the former. Variants and Extensions are
// accumulated, but if two extensions of the same type are passed, the latter
// will replace the former. For -u extensions, though, the key-type pairs are
// added, where later values overwrite older ones. A Tag overwrites all former
// values and typically only makes sense as the first argument. The resulting
// tag is returned after canonicalizing using the Default CanonType. If one or
// more errors are encountered, one of the errors is returned.
func Compose(part ...interface{}) (t Tag, err error) {
return Default.Compose(part...)
}
// Compose creates a Tag from individual parts, which may be of type Tag, Base,
// Script, Region, Variant, []Variant, Extension, []Extension or error. If a
// Base, Script or Region or slice of type Variant or Extension is passed more
// than once, the latter will overwrite the former. Variants and Extensions are
// accumulated, but if two extensions of the same type are passed, the latter
// will replace the former. For -u extensions, though, the key-type pairs are
// added, where later values overwrite older ones. A Tag overwrites all former
// values and typically only makes sense as the first argument. The resulting
// tag is returned after canonicalizing using CanonType c. If one or more errors
// are encountered, one of the errors is returned.
func (c CanonType) Compose(part ...interface{}) (t Tag, err error) {
defer func() {
if recover() != nil {
t = Tag{}
err = language.ErrSyntax
}
}()
var b language.Builder
if err = update(&b, part...); err != nil {
return und, err
}
b.Tag, _ = canonicalize(c, b.Tag)
return makeTag(b.Make()), err
}
var errInvalidArgument = errors.New("invalid Extension or Variant")
func update(b *language.Builder, part ...interface{}) (err error) {
for _, x := range part {
switch v := x.(type) {
case Tag:
b.SetTag(v.tag())
case Base:
b.Tag.LangID = v.langID
case Script:
b.Tag.ScriptID = v.scriptID
case Region:
b.Tag.RegionID = v.regionID
case Variant:
if v.variant == "" {
err = errInvalidArgument
break
}
b.AddVariant(v.variant)
case Extension:
if v.s == "" {
err = errInvalidArgument
break
}
b.SetExt(v.s)
case []Variant:
b.ClearVariants()
for _, v := range v {
b.AddVariant(v.variant)
}
case []Extension:
b.ClearExtensions()
for _, e := range v {
b.SetExt(e.s)
}
// TODO: support parsing of raw strings based on morphology or just extensions?
case error:
if v != nil {
err = v
}
}
}
return
}
var errInvalidWeight = errors.New("ParseAcceptLanguage: invalid weight")
var errTagListTooLarge = errors.New("tag list exceeds max length")
// ParseAcceptLanguage parses the contents of an Accept-Language header as
// defined in http://www.ietf.org/rfc/rfc2616.txt and returns a list of Tags and
// a list of corresponding quality weights. It is more permissive than RFC 2616
// and may return non-nil slices even if the input is not valid.
// The Tags will be sorted by highest weight first and then by first occurrence.
// Tags with a weight of zero will be dropped. An error will be returned if the
// input could not be parsed.
func ParseAcceptLanguage(s string) (tag []Tag, q []float32, err error) {
defer func() {
if recover() != nil {
tag = nil
q = nil
err = language.ErrSyntax
}
}()
if strings.Count(s, "-") > 1000 {
return nil, nil, errTagListTooLarge
}
var entry string
for s != "" {
if entry, s = split(s, ','); entry == "" {
continue
}
entry, weight := split(entry, ';')
// Scan the language.
t, err := Parse(entry)
if err != nil {
id, ok := acceptFallback[entry]
if !ok {
return nil, nil, err
}
t = makeTag(language.Tag{LangID: id})
}
// Scan the optional weight.
w := 1.0
if weight != "" {
weight = consume(weight, 'q')
weight = consume(weight, '=')
// consume returns the empty string when a token could not be
// consumed, resulting in an error for ParseFloat.
if w, err = strconv.ParseFloat(weight, 32); err != nil {
return nil, nil, errInvalidWeight
}
// Drop tags with a quality weight of 0.
if w <= 0 {
continue
}
}
tag = append(tag, t)
q = append(q, float32(w))
}
sort.Stable(&tagSort{tag, q})
return tag, q, nil
}
// consume removes a leading token c from s and returns the result or the empty
// string if there is no such token.
func consume(s string, c byte) string {
if s == "" || s[0] != c {
return ""
}
return strings.TrimSpace(s[1:])
}
func split(s string, c byte) (head, tail string) {
if i := strings.IndexByte(s, c); i >= 0 {
return strings.TrimSpace(s[:i]), strings.TrimSpace(s[i+1:])
}
return strings.TrimSpace(s), ""
}
// Add hack mapping to deal with a small number of cases that occur
// in Accept-Language (with reasonable frequency).
var acceptFallback = map[string]language.Language{
"english": _en,
"deutsch": _de,
"italian": _it,
"french": _fr,
"*": _mul, // defined in the spec to match all languages.
}
type tagSort struct {
tag []Tag
q []float32
}
func (s *tagSort) Len() int {
return len(s.q)
}
func (s *tagSort) Less(i, j int) bool {
return s.q[i] > s.q[j]
}
func (s *tagSort) Swap(i, j int) {
s.tag[i], s.tag[j] = s.tag[j], s.tag[i]
s.q[i], s.q[j] = s.q[j], s.q[i]
}
@@ -0,0 +1,409 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package language
import (
"strings"
"testing"
"golang.org/x/text/internal/language"
)
// equalTags compares language, script and region subtags only.
func (t Tag) equalTags(a Tag) bool {
return t.lang() == a.lang() &&
t.script() == a.script() &&
t.region() == a.region()
}
var errSyntax = language.ErrSyntax
type parseTest struct {
i int // the index of this test
in string
lang, script, region string
variants, ext string
extList []string // only used when more than one extension is present
invalid bool
rewrite bool // special rewrite not handled by parseTag
changed bool // string needed to be reformatted
}
func parseTests() []parseTest {
tests := []parseTest{
{in: "root", lang: "und"},
{in: "und", lang: "und"},
{in: "en", lang: "en"},
{in: "en-US-u-va-posix", lang: "en", region: "US", ext: "u-va-posix"},
{in: "ca-ES-valencia", lang: "ca", region: "ES", variants: "valencia"},
{in: "en-US-u-rg-gbzzzz", lang: "en", region: "US", ext: "u-rg-gbzzzz"},
{in: "xy", lang: "und", invalid: true},
{in: "en-ZY", lang: "en", invalid: true},
{in: "gsw", lang: "gsw"},
{in: "sr_Latn", lang: "sr", script: "Latn"},
{in: "af-Arab", lang: "af", script: "Arab"},
{in: "nl-BE", lang: "nl", region: "BE"},
{in: "es-419", lang: "es", region: "419"},
{in: "und-001", lang: "und", region: "001"},
{in: "de-latn-be", lang: "de", script: "Latn", region: "BE"},
// Variants
{in: "de-1901", lang: "de", variants: "1901"},
// Accept with unsuppressed script.
{in: "de-Latn-1901", lang: "de", script: "Latn", variants: "1901"},
// Specialized.
{in: "sl-rozaj", lang: "sl", variants: "rozaj"},
{in: "sl-rozaj-lipaw", lang: "sl", variants: "rozaj-lipaw"},
{in: "sl-rozaj-biske", lang: "sl", variants: "rozaj-biske"},
{in: "sl-rozaj-biske-1994", lang: "sl", variants: "rozaj-biske-1994"},
{in: "sl-rozaj-1994", lang: "sl", variants: "rozaj-1994"},
// Maximum number of variants while adhering to prefix rules.
{in: "sl-rozaj-biske-1994-alalc97-fonipa-fonupa-fonxsamp", lang: "sl", variants: "rozaj-biske-1994-alalc97-fonipa-fonupa-fonxsamp"},
// Sorting.
{in: "sl-1994-biske-rozaj", lang: "sl", variants: "rozaj-biske-1994", changed: true},
{in: "sl-rozaj-biske-1994-alalc97-fonupa-fonipa-fonxsamp", lang: "sl", variants: "rozaj-biske-1994-alalc97-fonipa-fonupa-fonxsamp", changed: true},
{in: "nl-fonxsamp-alalc97-fonipa-fonupa", lang: "nl", variants: "alalc97-fonipa-fonupa-fonxsamp", changed: true},
// Duplicates variants are removed, but not an error.
{in: "nl-fonupa-fonupa", lang: "nl", variants: "fonupa"},
// Variants that do not have correct prefixes. We still accept these.
{in: "de-Cyrl-1901", lang: "de", script: "Cyrl", variants: "1901"},
{in: "sl-rozaj-lipaw-1994", lang: "sl", variants: "rozaj-lipaw-1994"},
{in: "sl-1994-biske-rozaj-1994-biske-rozaj", lang: "sl", variants: "rozaj-biske-1994", changed: true},
{in: "de-Cyrl-1901", lang: "de", script: "Cyrl", variants: "1901"},
// Invalid variant.
{in: "de-1902", lang: "de", variants: "", invalid: true},
{in: "EN_CYRL", lang: "en", script: "Cyrl"},
// private use and extensions
{in: "x-a-b-c-d", ext: "x-a-b-c-d"},
{in: "x_A.-B-C_D", ext: "x-b-c-d", invalid: true, changed: true},
{in: "x-aa-bbbb-cccccccc-d", ext: "x-aa-bbbb-cccccccc-d"},
{in: "en-c_cc-b-bbb-a-aaa", lang: "en", changed: true, extList: []string{"a-aaa", "b-bbb", "c-cc"}},
{in: "en-x_cc-b-bbb-a-aaa", lang: "en", ext: "x-cc-b-bbb-a-aaa", changed: true},
{in: "en-c_cc-b-bbb-a-aaa-x-x", lang: "en", changed: true, extList: []string{"a-aaa", "b-bbb", "c-cc", "x-x"}},
{in: "en-v-c", lang: "en", ext: "", invalid: true},
{in: "en-v-abcdefghi", lang: "en", ext: "", invalid: true},
{in: "en-v-abc-x", lang: "en", ext: "v-abc", invalid: true},
{in: "en-v-abc-x-", lang: "en", ext: "v-abc", invalid: true},
{in: "en-v-abc-w-x-xx", lang: "en", extList: []string{"v-abc", "x-xx"}, invalid: true, changed: true},
{in: "en-v-abc-w-y-yx", lang: "en", extList: []string{"v-abc", "y-yx"}, invalid: true, changed: true},
{in: "en-v-c-abc", lang: "en", ext: "c-abc", invalid: true, changed: true},
{in: "en-v-w-abc", lang: "en", ext: "w-abc", invalid: true, changed: true},
{in: "en-v-x-abc", lang: "en", ext: "x-abc", invalid: true, changed: true},
{in: "en-v-x-a", lang: "en", ext: "x-a", invalid: true, changed: true},
{in: "en-9-aa-0-aa-z-bb-x-a", lang: "en", extList: []string{"0-aa", "9-aa", "z-bb", "x-a"}, changed: true},
{in: "en-u-c", lang: "en", ext: "", invalid: true},
{in: "en-u-co-phonebk", lang: "en", ext: "u-co-phonebk"},
{in: "en-u-co-phonebk-ca", lang: "en", ext: "u-ca-co-phonebk", invalid: true},
{in: "en-u-nu-arabic-co-phonebk-ca", lang: "en", ext: "u-ca-co-phonebk-nu-arabic", invalid: true, changed: true},
{in: "en-u-nu-arabic-co-phonebk-ca-x", lang: "en", ext: "u-ca-co-phonebk-nu-arabic", invalid: true, changed: true},
{in: "en-u-nu-arabic-co-phonebk-ca-s", lang: "en", ext: "u-ca-co-phonebk-nu-arabic", invalid: true, changed: true},
{in: "en-u-nu-arabic-co-phonebk-ca-a12345678", lang: "en", ext: "u-ca-co-phonebk-nu-arabic", invalid: true, changed: true},
{in: "en-u-co-phonebook", lang: "en", ext: "u-co", invalid: true},
{in: "en-u-co-phonebook-cu-xau", lang: "en", ext: "u-co-cu-xau", invalid: true, changed: true},
{in: "en-Cyrl-u-co-phonebk", lang: "en", script: "Cyrl", ext: "u-co-phonebk"},
{in: "en-US-u-co-phonebk", lang: "en", region: "US", ext: "u-co-phonebk"},
{in: "en-US-u-co-phonebk-cu-xau", lang: "en", region: "US", ext: "u-co-phonebk-cu-xau"},
{in: "en-scotland-u-co-phonebk", lang: "en", variants: "scotland", ext: "u-co-phonebk"},
{in: "en-u-cu-xua-co-phonebk", lang: "en", ext: "u-co-phonebk-cu-xua", changed: true},
{in: "en-u-def-abc-cu-xua-co-phonebk", lang: "en", ext: "u-abc-def-co-phonebk-cu-xua", changed: true},
{in: "en-u-def-abc", lang: "en", ext: "u-abc-def", changed: true},
{in: "en-u-cu-xua-co-phonebk-a-cd", lang: "en", extList: []string{"a-cd", "u-co-phonebk-cu-xua"}, changed: true},
// Invalid "u" extension. Drop invalid parts.
{in: "en-u-cu-co-phonebk", lang: "en", extList: []string{"u-co-phonebk-cu"}, invalid: true, changed: true},
{in: "en-u-cu-xau-co", lang: "en", extList: []string{"u-co-cu-xau"}, invalid: true},
// We allow duplicate keys as the LDML spec does not explicitly prohibit it.
// TODO: Consider eliminating duplicates and returning an error.
{in: "en-u-cu-xau-co-phonebk-cu-xau", lang: "en", ext: "u-co-phonebk-cu-xau", changed: true},
{in: "en-t-en-Cyrl-NL-fonipa", lang: "en", ext: "t-en-cyrl-nl-fonipa", changed: true},
{in: "en-t-en-Cyrl-NL-fonipa-t0-abc-def", lang: "en", ext: "t-en-cyrl-nl-fonipa-t0-abc-def", changed: true},
{in: "en-t-t0-abcd", lang: "en", ext: "t-t0-abcd"},
// Not necessary to have changed here.
{in: "en-t-nl-abcd", lang: "en", ext: "t-nl", invalid: true},
{in: "en-t-nl-latn", lang: "en", ext: "t-nl-latn"},
{in: "en-t-t0-abcd-x-a", lang: "en", extList: []string{"t-t0-abcd", "x-a"}},
{in: "en_t_pt_MLt", lang: "en", ext: "t-pt-mlt", changed: true},
{in: "en-t-fr-est", lang: "en", ext: "t-fr-est", changed: false},
{in: "fr-est", lang: "et", changed: true},
{in: "fr-est-t-fr-est", lang: "et", ext: "t-fr-est", changed: true},
{in: "fr-est-Cyrl", lang: "et", script: "Cyrl", changed: true},
// invalid
{in: "", lang: "und", invalid: true},
{in: "-", lang: "und", invalid: true},
{in: "x", lang: "und", invalid: true},
{in: "x-", lang: "und", invalid: true},
{in: "x--", lang: "und", invalid: true},
{in: "a-a-b-c-d", lang: "und", invalid: true},
{in: "en-", lang: "en", invalid: true},
{in: "enne-", lang: "und", invalid: true},
{in: "en.", lang: "und", invalid: true},
{in: "en.-latn", lang: "und", invalid: true},
{in: "en.-en", lang: "en", invalid: true},
{in: "x-a-tooManyChars-c-d", ext: "x-a-c-d", invalid: true, changed: true},
{in: "a-tooManyChars-c-d", lang: "und", invalid: true},
// TODO: check key-value validity
// { in: "en-u-cu-xd", lang: "en", ext: "u-cu-xd", invalid: true },
{in: "en-t-abcd", lang: "en", invalid: true},
{in: "en-Latn-US-en", lang: "en", script: "Latn", region: "US", invalid: true},
// rewrites (more tests in TestGrandfathered)
{in: "zh-min-nan", lang: "nan"},
{in: "zh-yue", lang: "yue"},
{in: "zh-xiang", lang: "hsn", rewrite: true},
{in: "zh-guoyu", lang: "cmn", rewrite: true},
{in: "iw", lang: "iw"},
{in: "sgn-BE-FR", lang: "sfb", rewrite: true},
{in: "i-klingon", lang: "tlh", rewrite: true},
}
for i, tt := range tests {
tests[i].i = i
if tt.extList != nil {
tests[i].ext = strings.Join(tt.extList, "-")
}
if tt.ext != "" && tt.extList == nil {
tests[i].extList = []string{tt.ext}
}
}
return tests
}
// partChecks runs checks for each part by calling the function returned by f.
func partChecks(t *testing.T, f func(*parseTest) (Tag, bool)) {
for i, tt := range parseTests() {
tag, skip := f(&tt)
if skip {
continue
}
if l, _ := language.ParseBase(tt.lang); l != tag.lang() {
t.Errorf("%d: lang was %q; want %q", i, tag.lang(), l)
}
if sc, _ := language.ParseScript(tt.script); sc != tag.script() {
t.Errorf("%d: script was %q; want %q", i, tag.script(), sc)
}
if r, _ := language.ParseRegion(tt.region); r != tag.region() {
t.Errorf("%d: region was %q; want %q", i, tag.region(), r)
}
v := tag.tag().Variants()
if v != "" {
v = v[1:]
}
if v != tt.variants {
t.Errorf("%d: variants was %q; want %q", i, v, tt.variants)
}
if e := strings.Join(tag.tag().Extensions(), "-"); e != tt.ext {
t.Errorf("%d: extensions were %q; want %q", i, e, tt.ext)
}
}
}
func TestParse(t *testing.T) {
partChecks(t, func(tt *parseTest) (id Tag, skip bool) {
id, _ = Raw.Parse(tt.in)
return id, false
})
}
func TestErrors(t *testing.T) {
mkInvalid := func(s string) error {
return language.NewValueError([]byte(s))
}
tests := []struct {
in string
out error
}{
// invalid subtags.
{"ac", mkInvalid("ac")},
{"AC", mkInvalid("ac")},
{"aa-Uuuu", mkInvalid("Uuuu")},
{"aa-AB", mkInvalid("AB")},
// ill-formed wins over invalid.
{"ac-u", errSyntax},
{"ac-u-ca", mkInvalid("ac")},
{"ac-u-ca-co-pinyin", mkInvalid("ac")},
{"noob", errSyntax},
}
for _, tt := range tests {
_, err := Parse(tt.in)
if err != tt.out {
t.Errorf("%s: was %q; want %q", tt.in, err, tt.out)
}
}
}
func TestCompose1(t *testing.T) {
partChecks(t, func(tt *parseTest) (id Tag, skip bool) {
l, _ := ParseBase(tt.lang)
s, _ := ParseScript(tt.script)
r, _ := ParseRegion(tt.region)
v := []Variant{}
for _, x := range strings.Split(tt.variants, "-") {
p, _ := ParseVariant(x)
v = append(v, p)
}
e := []Extension{}
for _, x := range tt.extList {
p, _ := ParseExtension(x)
e = append(e, p)
}
id, _ = Raw.Compose(l, s, r, v, e)
return id, false
})
}
func TestCompose2(t *testing.T) {
partChecks(t, func(tt *parseTest) (id Tag, skip bool) {
l, _ := ParseBase(tt.lang)
s, _ := ParseScript(tt.script)
r, _ := ParseRegion(tt.region)
p := []interface{}{l, s, r, s, r, l}
for _, x := range strings.Split(tt.variants, "-") {
if x != "" {
v, _ := ParseVariant(x)
p = append(p, v)
}
}
for _, x := range tt.extList {
e, _ := ParseExtension(x)
p = append(p, e)
}
id, _ = Raw.Compose(p...)
return id, false
})
}
func TestCompose3(t *testing.T) {
partChecks(t, func(tt *parseTest) (id Tag, skip bool) {
id, _ = Raw.Parse(tt.in)
id, _ = Raw.Compose(id)
return id, false
})
}
func mk(s string) Tag {
return Raw.Make(s)
}
func TestParseAcceptLanguage(t *testing.T) {
type res struct {
t Tag
q float32
}
en := []res{{mk("en"), 1.0}}
tests := []struct {
out []res
in string
ok bool
}{
{en, "en", true},
{en, " en", true},
{en, "en ", true},
{en, " en ", true},
{en, "en,", true},
{en, ",en", true},
{en, ",,,en,,,", true},
{en, ",en;q=1", true},
// We allow an empty input, contrary to spec.
{nil, "", true},
{[]res{{mk("aa"), 1}}, "aa;", true}, // allow unspecified weight
// errors
{nil, ";", false},
{nil, "$", false},
{nil, "e;", false},
{nil, "x;", false},
{nil, "x", false},
{nil, "ac", false}, // non-existing language
{nil, "aa;q", false},
{nil, "aa;q=", false},
{nil, "aa;q=.", false},
{nil, "00-t-0o", false},
// odd fallbacks
{
[]res{{mk("en"), 0.1}},
" english ;q=.1",
true,
},
{
[]res{{mk("it"), 1.0}, {mk("de"), 1.0}, {mk("fr"), 1.0}},
" italian, deutsch, french",
true,
},
// lists
{
[]res{{mk("en"), 0.1}},
"en;q=.1",
true,
},
{
[]res{{mk("mul"), 1.0}},
"*",
true,
},
{
[]res{{mk("en"), 1.0}, {mk("de"), 1.0}},
"en,de",
true,
},
{
[]res{{mk("en"), 1.0}, {mk("de"), .5}},
"en,de;q=0.5",
true,
},
{
[]res{{mk("de"), 0.8}, {mk("en"), 0.5}},
" en ; q = 0.5 , , de;q=0.8",
true,
},
{
[]res{{mk("en"), 1.0}, {mk("de"), 1.0}, {mk("fr"), 1.0}, {mk("tlh"), 1.0}},
"en,de,fr,i-klingon",
true,
},
// sorting
{
[]res{{mk("tlh"), 0.4}, {mk("de"), 0.2}, {mk("fr"), 0.2}, {mk("en"), 0.1}},
"en;q=0.1,de;q=0.2,fr;q=0.2,i-klingon;q=0.4",
true,
},
// dropping
{
[]res{{mk("fr"), 0.2}, {mk("en"), 0.1}},
"en;q=0.1,de;q=0,fr;q=0.2,i-klingon;q=0.0",
true,
},
}
for i, tt := range tests {
tags, qs, e := ParseAcceptLanguage(tt.in)
if e == nil != tt.ok {
t.Errorf("%d:%s:err: was %v; want %v", i, tt.in, e == nil, tt.ok)
}
for j, tag := range tags {
if out := tt.out[j]; !tag.equalTags(out.t) || qs[j] != out.q {
t.Errorf("%d:%s: was %s, %1f; want %s, %1f", i, tt.in, tag, qs[j], out.t, out.q)
break
}
}
}
}
func TestParseAcceptLanguageTooBig(t *testing.T) {
s := strings.Repeat("en-x-a-", 333)
_, _, err := ParseAcceptLanguage(s)
if err != language.ErrSyntax {
t.Errorf("ParseAcceptLanguage() unexpected error: got %v, want %v", err, language.ErrSyntax)
}
s += "en-x-a"
_, _, err = ParseAcceptLanguage(s)
if err != errTagListTooLarge {
t.Errorf("ParseAcceptLanguage() unexpected error: got %v, want %v", err, errTagListTooLarge)
}
}
@@ -0,0 +1,298 @@
// Code generated by running "go generate" in golang.org/x/text. DO NOT EDIT.
package language
// CLDRVersion is the CLDR version from which the tables in this package are derived.
const CLDRVersion = "32"
const (
_de = 269
_en = 313
_fr = 350
_it = 505
_mo = 784
_no = 879
_nb = 839
_pt = 960
_sh = 1031
_mul = 806
_und = 0
)
const (
_001 = 1
_419 = 31
_BR = 65
_CA = 73
_ES = 111
_GB = 124
_MD = 189
_PT = 239
_UK = 307
_US = 310
_ZZ = 358
_XA = 324
_XC = 326
_XK = 334
)
const (
_Latn = 91
_Hani = 57
_Hans = 59
_Hant = 60
_Qaaa = 149
_Qaai = 157
_Qabx = 198
_Zinh = 255
_Zyyy = 260
_Zzzz = 261
)
var regionToGroups = []uint8{ // 359 elements
// Entry 0 - 3F
0x00, 0x00, 0x00, 0x04, 0x04, 0x00, 0x00, 0x04,
0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04,
0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x00,
0x00, 0x04, 0x00, 0x00, 0x04, 0x01, 0x00, 0x00,
0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x00, 0x04,
// Entry 40 - 7F
0x04, 0x04, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00,
0x04, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x04, 0x00, 0x00, 0x04, 0x00, 0x00, 0x04,
0x00, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x00,
0x08, 0x00, 0x04, 0x00, 0x00, 0x08, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x04,
// Entry 80 - BF
0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x04, 0x00,
0x00, 0x00, 0x04, 0x01, 0x00, 0x04, 0x02, 0x00,
0x04, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00,
0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x08, 0x08, 0x00, 0x00, 0x00, 0x04,
// Entry C0 - FF
0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02,
0x01, 0x04, 0x08, 0x04, 0x00, 0x00, 0x00, 0x00,
0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00,
0x00, 0x00, 0x00, 0x04, 0x00, 0x05, 0x00, 0x00,
0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
// Entry 100 - 13F
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04,
0x00, 0x00, 0x00, 0x04, 0x04, 0x00, 0x00, 0x00,
0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x08, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x05, 0x04,
0x00, 0x00, 0x04, 0x00, 0x04, 0x04, 0x05, 0x00,
// Entry 140 - 17F
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
} // Size: 383 bytes
var paradigmLocales = [][3]uint16{ // 3 elements
0: [3]uint16{0x139, 0x0, 0x7c},
1: [3]uint16{0x13e, 0x0, 0x1f},
2: [3]uint16{0x3c0, 0x41, 0xef},
} // Size: 42 bytes
type mutualIntelligibility struct {
want uint16
have uint16
distance uint8
oneway bool
}
type scriptIntelligibility struct {
wantLang uint16
haveLang uint16
wantScript uint8
haveScript uint8
distance uint8
}
type regionIntelligibility struct {
lang uint16
script uint8
group uint8
distance uint8
}
// matchLang holds pairs of langIDs of base languages that are typically
// mutually intelligible. Each pair is associated with a confidence and
// whether the intelligibility goes one or both ways.
var matchLang = []mutualIntelligibility{ // 113 elements
0: {want: 0x1d1, have: 0xb7, distance: 0x4, oneway: false},
1: {want: 0x407, have: 0xb7, distance: 0x4, oneway: false},
2: {want: 0x407, have: 0x1d1, distance: 0x4, oneway: false},
3: {want: 0x407, have: 0x432, distance: 0x4, oneway: false},
4: {want: 0x43a, have: 0x1, distance: 0x4, oneway: false},
5: {want: 0x1a3, have: 0x10d, distance: 0x4, oneway: true},
6: {want: 0x295, have: 0x10d, distance: 0x4, oneway: true},
7: {want: 0x101, have: 0x36f, distance: 0x8, oneway: false},
8: {want: 0x101, have: 0x347, distance: 0x8, oneway: false},
9: {want: 0x5, have: 0x3e2, distance: 0xa, oneway: true},
10: {want: 0xd, have: 0x139, distance: 0xa, oneway: true},
11: {want: 0x16, have: 0x367, distance: 0xa, oneway: true},
12: {want: 0x21, have: 0x139, distance: 0xa, oneway: true},
13: {want: 0x56, have: 0x13e, distance: 0xa, oneway: true},
14: {want: 0x58, have: 0x3e2, distance: 0xa, oneway: true},
15: {want: 0x71, have: 0x3e2, distance: 0xa, oneway: true},
16: {want: 0x75, have: 0x139, distance: 0xa, oneway: true},
17: {want: 0x82, have: 0x1be, distance: 0xa, oneway: true},
18: {want: 0xa5, have: 0x139, distance: 0xa, oneway: true},
19: {want: 0xb2, have: 0x15e, distance: 0xa, oneway: true},
20: {want: 0xdd, have: 0x153, distance: 0xa, oneway: true},
21: {want: 0xe5, have: 0x139, distance: 0xa, oneway: true},
22: {want: 0xe9, have: 0x3a, distance: 0xa, oneway: true},
23: {want: 0xf0, have: 0x15e, distance: 0xa, oneway: true},
24: {want: 0xf9, have: 0x15e, distance: 0xa, oneway: true},
25: {want: 0x100, have: 0x139, distance: 0xa, oneway: true},
26: {want: 0x130, have: 0x139, distance: 0xa, oneway: true},
27: {want: 0x13c, have: 0x139, distance: 0xa, oneway: true},
28: {want: 0x140, have: 0x151, distance: 0xa, oneway: true},
29: {want: 0x145, have: 0x13e, distance: 0xa, oneway: true},
30: {want: 0x158, have: 0x101, distance: 0xa, oneway: true},
31: {want: 0x16d, have: 0x367, distance: 0xa, oneway: true},
32: {want: 0x16e, have: 0x139, distance: 0xa, oneway: true},
33: {want: 0x16f, have: 0x139, distance: 0xa, oneway: true},
34: {want: 0x17e, have: 0x139, distance: 0xa, oneway: true},
35: {want: 0x190, have: 0x13e, distance: 0xa, oneway: true},
36: {want: 0x194, have: 0x13e, distance: 0xa, oneway: true},
37: {want: 0x1a4, have: 0x1be, distance: 0xa, oneway: true},
38: {want: 0x1b4, have: 0x139, distance: 0xa, oneway: true},
39: {want: 0x1b8, have: 0x139, distance: 0xa, oneway: true},
40: {want: 0x1d4, have: 0x15e, distance: 0xa, oneway: true},
41: {want: 0x1d7, have: 0x3e2, distance: 0xa, oneway: true},
42: {want: 0x1d9, have: 0x139, distance: 0xa, oneway: true},
43: {want: 0x1e7, have: 0x139, distance: 0xa, oneway: true},
44: {want: 0x1f8, have: 0x139, distance: 0xa, oneway: true},
45: {want: 0x20e, have: 0x1e1, distance: 0xa, oneway: true},
46: {want: 0x210, have: 0x139, distance: 0xa, oneway: true},
47: {want: 0x22d, have: 0x15e, distance: 0xa, oneway: true},
48: {want: 0x242, have: 0x3e2, distance: 0xa, oneway: true},
49: {want: 0x24a, have: 0x139, distance: 0xa, oneway: true},
50: {want: 0x251, have: 0x139, distance: 0xa, oneway: true},
51: {want: 0x265, have: 0x139, distance: 0xa, oneway: true},
52: {want: 0x274, have: 0x48a, distance: 0xa, oneway: true},
53: {want: 0x28a, have: 0x3e2, distance: 0xa, oneway: true},
54: {want: 0x28e, have: 0x1f9, distance: 0xa, oneway: true},
55: {want: 0x2a3, have: 0x139, distance: 0xa, oneway: true},
56: {want: 0x2b5, have: 0x15e, distance: 0xa, oneway: true},
57: {want: 0x2b8, have: 0x139, distance: 0xa, oneway: true},
58: {want: 0x2be, have: 0x139, distance: 0xa, oneway: true},
59: {want: 0x2c3, have: 0x15e, distance: 0xa, oneway: true},
60: {want: 0x2ed, have: 0x139, distance: 0xa, oneway: true},
61: {want: 0x2f1, have: 0x15e, distance: 0xa, oneway: true},
62: {want: 0x2fa, have: 0x139, distance: 0xa, oneway: true},
63: {want: 0x2ff, have: 0x7e, distance: 0xa, oneway: true},
64: {want: 0x304, have: 0x139, distance: 0xa, oneway: true},
65: {want: 0x30b, have: 0x3e2, distance: 0xa, oneway: true},
66: {want: 0x31b, have: 0x1be, distance: 0xa, oneway: true},
67: {want: 0x31f, have: 0x1e1, distance: 0xa, oneway: true},
68: {want: 0x320, have: 0x139, distance: 0xa, oneway: true},
69: {want: 0x331, have: 0x139, distance: 0xa, oneway: true},
70: {want: 0x351, have: 0x139, distance: 0xa, oneway: true},
71: {want: 0x36a, have: 0x347, distance: 0xa, oneway: false},
72: {want: 0x36a, have: 0x36f, distance: 0xa, oneway: true},
73: {want: 0x37a, have: 0x139, distance: 0xa, oneway: true},
74: {want: 0x387, have: 0x139, distance: 0xa, oneway: true},
75: {want: 0x389, have: 0x139, distance: 0xa, oneway: true},
76: {want: 0x38b, have: 0x15e, distance: 0xa, oneway: true},
77: {want: 0x390, have: 0x139, distance: 0xa, oneway: true},
78: {want: 0x395, have: 0x139, distance: 0xa, oneway: true},
79: {want: 0x39d, have: 0x139, distance: 0xa, oneway: true},
80: {want: 0x3a5, have: 0x139, distance: 0xa, oneway: true},
81: {want: 0x3be, have: 0x139, distance: 0xa, oneway: true},
82: {want: 0x3c4, have: 0x13e, distance: 0xa, oneway: true},
83: {want: 0x3d4, have: 0x10d, distance: 0xa, oneway: true},
84: {want: 0x3d9, have: 0x139, distance: 0xa, oneway: true},
85: {want: 0x3e5, have: 0x15e, distance: 0xa, oneway: true},
86: {want: 0x3e9, have: 0x1be, distance: 0xa, oneway: true},
87: {want: 0x3fa, have: 0x139, distance: 0xa, oneway: true},
88: {want: 0x40c, have: 0x139, distance: 0xa, oneway: true},
89: {want: 0x423, have: 0x139, distance: 0xa, oneway: true},
90: {want: 0x429, have: 0x139, distance: 0xa, oneway: true},
91: {want: 0x431, have: 0x139, distance: 0xa, oneway: true},
92: {want: 0x43b, have: 0x139, distance: 0xa, oneway: true},
93: {want: 0x43e, have: 0x1e1, distance: 0xa, oneway: true},
94: {want: 0x445, have: 0x139, distance: 0xa, oneway: true},
95: {want: 0x450, have: 0x139, distance: 0xa, oneway: true},
96: {want: 0x461, have: 0x139, distance: 0xa, oneway: true},
97: {want: 0x467, have: 0x3e2, distance: 0xa, oneway: true},
98: {want: 0x46f, have: 0x139, distance: 0xa, oneway: true},
99: {want: 0x476, have: 0x3e2, distance: 0xa, oneway: true},
100: {want: 0x3883, have: 0x139, distance: 0xa, oneway: true},
101: {want: 0x480, have: 0x139, distance: 0xa, oneway: true},
102: {want: 0x482, have: 0x139, distance: 0xa, oneway: true},
103: {want: 0x494, have: 0x3e2, distance: 0xa, oneway: true},
104: {want: 0x49d, have: 0x139, distance: 0xa, oneway: true},
105: {want: 0x4ac, have: 0x529, distance: 0xa, oneway: true},
106: {want: 0x4b4, have: 0x139, distance: 0xa, oneway: true},
107: {want: 0x4bc, have: 0x3e2, distance: 0xa, oneway: true},
108: {want: 0x4e5, have: 0x15e, distance: 0xa, oneway: true},
109: {want: 0x4f2, have: 0x139, distance: 0xa, oneway: true},
110: {want: 0x512, have: 0x139, distance: 0xa, oneway: true},
111: {want: 0x518, have: 0x139, distance: 0xa, oneway: true},
112: {want: 0x52f, have: 0x139, distance: 0xa, oneway: true},
} // Size: 702 bytes
// matchScript holds pairs of scriptIDs where readers of one script
// can typically also read the other. Each is associated with a confidence.
var matchScript = []scriptIntelligibility{ // 26 elements
0: {wantLang: 0x432, haveLang: 0x432, wantScript: 0x5b, haveScript: 0x20, distance: 0x5},
1: {wantLang: 0x432, haveLang: 0x432, wantScript: 0x20, haveScript: 0x5b, distance: 0x5},
2: {wantLang: 0x58, haveLang: 0x3e2, wantScript: 0x5b, haveScript: 0x20, distance: 0xa},
3: {wantLang: 0xa5, haveLang: 0x139, wantScript: 0xe, haveScript: 0x5b, distance: 0xa},
4: {wantLang: 0x1d7, haveLang: 0x3e2, wantScript: 0x8, haveScript: 0x20, distance: 0xa},
5: {wantLang: 0x210, haveLang: 0x139, wantScript: 0x2e, haveScript: 0x5b, distance: 0xa},
6: {wantLang: 0x24a, haveLang: 0x139, wantScript: 0x4f, haveScript: 0x5b, distance: 0xa},
7: {wantLang: 0x251, haveLang: 0x139, wantScript: 0x53, haveScript: 0x5b, distance: 0xa},
8: {wantLang: 0x2b8, haveLang: 0x139, wantScript: 0x58, haveScript: 0x5b, distance: 0xa},
9: {wantLang: 0x304, haveLang: 0x139, wantScript: 0x6f, haveScript: 0x5b, distance: 0xa},
10: {wantLang: 0x331, haveLang: 0x139, wantScript: 0x76, haveScript: 0x5b, distance: 0xa},
11: {wantLang: 0x351, haveLang: 0x139, wantScript: 0x22, haveScript: 0x5b, distance: 0xa},
12: {wantLang: 0x395, haveLang: 0x139, wantScript: 0x83, haveScript: 0x5b, distance: 0xa},
13: {wantLang: 0x39d, haveLang: 0x139, wantScript: 0x36, haveScript: 0x5b, distance: 0xa},
14: {wantLang: 0x3be, haveLang: 0x139, wantScript: 0x5, haveScript: 0x5b, distance: 0xa},
15: {wantLang: 0x3fa, haveLang: 0x139, wantScript: 0x5, haveScript: 0x5b, distance: 0xa},
16: {wantLang: 0x40c, haveLang: 0x139, wantScript: 0xd6, haveScript: 0x5b, distance: 0xa},
17: {wantLang: 0x450, haveLang: 0x139, wantScript: 0xe6, haveScript: 0x5b, distance: 0xa},
18: {wantLang: 0x461, haveLang: 0x139, wantScript: 0xe9, haveScript: 0x5b, distance: 0xa},
19: {wantLang: 0x46f, haveLang: 0x139, wantScript: 0x2c, haveScript: 0x5b, distance: 0xa},
20: {wantLang: 0x476, haveLang: 0x3e2, wantScript: 0x5b, haveScript: 0x20, distance: 0xa},
21: {wantLang: 0x4b4, haveLang: 0x139, wantScript: 0x5, haveScript: 0x5b, distance: 0xa},
22: {wantLang: 0x4bc, haveLang: 0x3e2, wantScript: 0x5b, haveScript: 0x20, distance: 0xa},
23: {wantLang: 0x512, haveLang: 0x139, wantScript: 0x3e, haveScript: 0x5b, distance: 0xa},
24: {wantLang: 0x529, haveLang: 0x529, wantScript: 0x3b, haveScript: 0x3c, distance: 0xf},
25: {wantLang: 0x529, haveLang: 0x529, wantScript: 0x3c, haveScript: 0x3b, distance: 0x13},
} // Size: 232 bytes
var matchRegion = []regionIntelligibility{ // 15 elements
0: {lang: 0x3a, script: 0x0, group: 0x4, distance: 0x4},
1: {lang: 0x3a, script: 0x0, group: 0x84, distance: 0x4},
2: {lang: 0x139, script: 0x0, group: 0x1, distance: 0x4},
3: {lang: 0x139, script: 0x0, group: 0x81, distance: 0x4},
4: {lang: 0x13e, script: 0x0, group: 0x3, distance: 0x4},
5: {lang: 0x13e, script: 0x0, group: 0x83, distance: 0x4},
6: {lang: 0x3c0, script: 0x0, group: 0x3, distance: 0x4},
7: {lang: 0x3c0, script: 0x0, group: 0x83, distance: 0x4},
8: {lang: 0x529, script: 0x3c, group: 0x2, distance: 0x4},
9: {lang: 0x529, script: 0x3c, group: 0x82, distance: 0x4},
10: {lang: 0x3a, script: 0x0, group: 0x80, distance: 0x5},
11: {lang: 0x139, script: 0x0, group: 0x80, distance: 0x5},
12: {lang: 0x13e, script: 0x0, group: 0x80, distance: 0x5},
13: {lang: 0x3c0, script: 0x0, group: 0x80, distance: 0x5},
14: {lang: 0x529, script: 0x3c, group: 0x80, distance: 0x5},
} // Size: 114 bytes
// Total table size 1473 bytes (1KiB); checksum: 7BB90B5C
@@ -0,0 +1,145 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package language
import "golang.org/x/text/internal/language/compact"
// TODO: Various sets of commonly use tags and regions.
// MustParse is like Parse, but panics if the given BCP 47 tag cannot be parsed.
// It simplifies safe initialization of Tag values.
func MustParse(s string) Tag {
t, err := Parse(s)
if err != nil {
panic(err)
}
return t
}
// MustParse is like Parse, but panics if the given BCP 47 tag cannot be parsed.
// It simplifies safe initialization of Tag values.
func (c CanonType) MustParse(s string) Tag {
t, err := c.Parse(s)
if err != nil {
panic(err)
}
return t
}
// MustParseBase is like ParseBase, but panics if the given base cannot be parsed.
// It simplifies safe initialization of Base values.
func MustParseBase(s string) Base {
b, err := ParseBase(s)
if err != nil {
panic(err)
}
return b
}
// MustParseScript is like ParseScript, but panics if the given script cannot be
// parsed. It simplifies safe initialization of Script values.
func MustParseScript(s string) Script {
scr, err := ParseScript(s)
if err != nil {
panic(err)
}
return scr
}
// MustParseRegion is like ParseRegion, but panics if the given region cannot be
// parsed. It simplifies safe initialization of Region values.
func MustParseRegion(s string) Region {
r, err := ParseRegion(s)
if err != nil {
panic(err)
}
return r
}
var (
und = Tag{}
Und Tag = Tag{}
Afrikaans Tag = Tag(compact.Afrikaans)
Amharic Tag = Tag(compact.Amharic)
Arabic Tag = Tag(compact.Arabic)
ModernStandardArabic Tag = Tag(compact.ModernStandardArabic)
Azerbaijani Tag = Tag(compact.Azerbaijani)
Bulgarian Tag = Tag(compact.Bulgarian)
Bengali Tag = Tag(compact.Bengali)
Catalan Tag = Tag(compact.Catalan)
Czech Tag = Tag(compact.Czech)
Danish Tag = Tag(compact.Danish)
German Tag = Tag(compact.German)
Greek Tag = Tag(compact.Greek)
English Tag = Tag(compact.English)
AmericanEnglish Tag = Tag(compact.AmericanEnglish)
BritishEnglish Tag = Tag(compact.BritishEnglish)
Spanish Tag = Tag(compact.Spanish)
EuropeanSpanish Tag = Tag(compact.EuropeanSpanish)
LatinAmericanSpanish Tag = Tag(compact.LatinAmericanSpanish)
Estonian Tag = Tag(compact.Estonian)
Persian Tag = Tag(compact.Persian)
Finnish Tag = Tag(compact.Finnish)
Filipino Tag = Tag(compact.Filipino)
French Tag = Tag(compact.French)
CanadianFrench Tag = Tag(compact.CanadianFrench)
Gujarati Tag = Tag(compact.Gujarati)
Hebrew Tag = Tag(compact.Hebrew)
Hindi Tag = Tag(compact.Hindi)
Croatian Tag = Tag(compact.Croatian)
Hungarian Tag = Tag(compact.Hungarian)
Armenian Tag = Tag(compact.Armenian)
Indonesian Tag = Tag(compact.Indonesian)
Icelandic Tag = Tag(compact.Icelandic)
Italian Tag = Tag(compact.Italian)
Japanese Tag = Tag(compact.Japanese)
Georgian Tag = Tag(compact.Georgian)
Kazakh Tag = Tag(compact.Kazakh)
Khmer Tag = Tag(compact.Khmer)
Kannada Tag = Tag(compact.Kannada)
Korean Tag = Tag(compact.Korean)
Kirghiz Tag = Tag(compact.Kirghiz)
Lao Tag = Tag(compact.Lao)
Lithuanian Tag = Tag(compact.Lithuanian)
Latvian Tag = Tag(compact.Latvian)
Macedonian Tag = Tag(compact.Macedonian)
Malayalam Tag = Tag(compact.Malayalam)
Mongolian Tag = Tag(compact.Mongolian)
Marathi Tag = Tag(compact.Marathi)
Malay Tag = Tag(compact.Malay)
Burmese Tag = Tag(compact.Burmese)
Nepali Tag = Tag(compact.Nepali)
Dutch Tag = Tag(compact.Dutch)
Norwegian Tag = Tag(compact.Norwegian)
Punjabi Tag = Tag(compact.Punjabi)
Polish Tag = Tag(compact.Polish)
Portuguese Tag = Tag(compact.Portuguese)
BrazilianPortuguese Tag = Tag(compact.BrazilianPortuguese)
EuropeanPortuguese Tag = Tag(compact.EuropeanPortuguese)
Romanian Tag = Tag(compact.Romanian)
Russian Tag = Tag(compact.Russian)
Sinhala Tag = Tag(compact.Sinhala)
Slovak Tag = Tag(compact.Slovak)
Slovenian Tag = Tag(compact.Slovenian)
Albanian Tag = Tag(compact.Albanian)
Serbian Tag = Tag(compact.Serbian)
SerbianLatin Tag = Tag(compact.SerbianLatin)
Swedish Tag = Tag(compact.Swedish)
Swahili Tag = Tag(compact.Swahili)
Tamil Tag = Tag(compact.Tamil)
Telugu Tag = Tag(compact.Telugu)
Thai Tag = Tag(compact.Thai)
Turkish Tag = Tag(compact.Turkish)
Ukrainian Tag = Tag(compact.Ukrainian)
Urdu Tag = Tag(compact.Urdu)
Uzbek Tag = Tag(compact.Uzbek)
Vietnamese Tag = Tag(compact.Vietnamese)
Chinese Tag = Tag(compact.Chinese)
SimplifiedChinese Tag = Tag(compact.SimplifiedChinese)
TraditionalChinese Tag = Tag(compact.TraditionalChinese)
Zulu Tag = Tag(compact.Zulu)
)
@@ -0,0 +1,389 @@
# TODO: this file has not yet been included in the main CLDR release.
# The intent is to verify this file against the Go implementation and then
# correct the cases and add merge in other interesting test cases.
# See TestCLDRCompliance in match_test.go, as well as the list of exceptions
# defined in the map skip below it, for the work in progress.
# Data-driven test for the XLocaleMatcher.
# Format
# • Everything after "#" is a comment
# • Arguments are separated by ";". They are:
# supported ; desired ; expected
# • The supported may have the threshold distance reset as a first item, eg 50, en, fr
# A line starting with @debug will reach a statement in the test code where you can put a breakpoint for debugging
# The test code also supports reformatting this file, by setting the REFORMAT flag.
##################################################
# testParentLocales
# es-419, es-AR, and es-MX are in a cluster; es is in a different one
es-419, es-ES ; es-AR ; es-419
es-ES, es-419 ; es-AR ; es-419
es-419, es ; es-AR ; es-419
es, es-419 ; es-AR ; es-419
es-MX, es ; es-AR ; es-MX
es, es-MX ; es-AR ; es-MX
# en-GB, en-AU, and en-NZ are in a cluster; en in a different one
en-GB, en-US ; en-AU ; en-GB
en-US, en-GB ; en-AU ; en-GB
en-GB, en ; en-AU ; en-GB
en, en-GB ; en-AU ; en-GB
en-NZ, en-US ; en-AU ; en-NZ
en-US, en-NZ ; en-AU ; en-NZ
en-NZ, en ; en-AU ; en-NZ
en, en-NZ ; en-AU ; en-NZ
# pt-AU and pt-PT in one cluster; pt-BR in another
pt-PT, pt-BR ; pt-AO ; pt-PT
pt-BR, pt-PT ; pt-AO ; pt-PT
pt-PT, pt ; pt-AO ; pt-PT
pt, pt-PT ; pt-AO ; pt-PT
zh-MO, zh-TW ; zh-HK ; zh-MO
zh-TW, zh-MO ; zh-HK ; zh-MO
zh-MO, zh-TW ; zh-HK ; zh-MO
zh-TW, zh-MO ; zh-HK ; zh-MO
zh-MO, zh-CN ; zh-HK ; zh-MO
zh-CN, zh-MO ; zh-HK ; zh-MO
zh-MO, zh ; zh-HK ; zh-MO
zh, zh-MO ; zh-HK ; zh-MO
##################################################
# testChinese
zh-CN, zh-TW, iw ; zh-Hant-TW ; zh-TW
zh-CN, zh-TW, iw ; zh-Hant ; zh-TW
zh-CN, zh-TW, iw ; zh-TW ; zh-TW
zh-CN, zh-TW, iw ; zh-Hans-CN ; zh-CN
zh-CN, zh-TW, iw ; zh-CN ; zh-CN
zh-CN, zh-TW, iw ; zh ; zh-CN
##################################################
# testenGB
fr, en, en-GB, es-419, es-MX, es ; en-NZ ; en-GB
fr, en, en-GB, es-419, es-MX, es ; es-ES ; es
fr, en, en-GB, es-419, es-MX, es ; es-AR ; es-419
fr, en, en-GB, es-419, es-MX, es ; es-MX ; es-MX
##################################################
# testFallbacks
91, en, hi ; sa ; hi
##################################################
# testBasics
fr, en-GB, en ; en-GB ; en-GB
fr, en-GB, en ; en ; en
fr, en-GB, en ; fr ; fr
fr, en-GB, en ; ja ; fr # return first if no match
##################################################
# testFallback
# check that script fallbacks are handled right
zh-CN, zh-TW, iw ; zh-Hant ; zh-TW
zh-CN, zh-TW, iw ; zh ; zh-CN
zh-CN, zh-TW, iw ; zh-Hans-CN ; zh-CN
zh-CN, zh-TW, iw ; zh-Hant-HK ; zh-TW
zh-CN, zh-TW, iw ; he-IT ; iw
##################################################
# testSpecials
# check that nearby languages are handled
en, fil, ro, nn ; tl ; fil
en, fil, ro, nn ; mo ; ro
en, fil, ro, nn ; nb ; nn
# make sure default works
en, fil, ro, nn ; ja ; en
##################################################
# testRegionalSpecials
# verify that en-AU is closer to en-GB than to en (which is en-US)
en, en-GB, es, es-419 ; es-MX ; es-419
en, en-GB, es, es-419 ; en-AU ; en-GB
en, en-GB, es, es-419 ; es-ES ; es
##################################################
# testHK
# HK and MO are closer to each other for Hant than to TW
zh, zh-TW, zh-MO ; zh-HK ; zh-MO
zh, zh-TW, zh-HK ; zh-MO ; zh-HK
##################################################
# testMatch-exact
# see localeDistance.txt
##################################################
# testMatch-none
# see localeDistance.txt
##################################################
# testMatch-matchOnMazimized
zh, zh-Hant ; und-TW ; zh-Hant # und-TW should be closer to zh-Hant than to zh
en-Hant-TW, und-TW ; zh-Hant ; und-TW # zh-Hant should be closer to und-TW than to en-Hant-TW
en-Hant-TW, und-TW ; zh ; und-TW # zh should be closer to und-TW than to en-Hant-TW
##################################################
# testMatchGrandfatheredCode
fr, i-klingon, en-Latn-US ; en-GB-oed ; en-Latn-US
##################################################
# testGetBestMatchForList-exactMatch
fr, en-GB, ja, es-ES, es-MX ; ja, de ; ja
##################################################
# testGetBestMatchForList-simpleVariantMatch
fr, en-GB, ja, es-ES, es-MX ; de, en-US ; en-GB # Intentionally avoiding a perfect-match or two candidates for variant matches.
# Fallback.
fr, en-GB, ja, es-ES, es-MX ; de, zh ; fr
##################################################
# testGetBestMatchForList-matchOnMaximized
# Check that if the preference is maximized already, it works as well.
en, ja ; ja-Jpan-JP, en-AU ; ja # Match for ja-Jpan-JP (maximized already)
# ja-JP matches ja on likely subtags, and it's listed first, thus it wins over the second preference en-GB.
en, ja ; ja-JP, en-US ; ja # Match for ja-Jpan-JP (maximized already)
# Check that if the preference is maximized already, it works as well.
en, ja ; ja-Jpan-JP, en-US ; ja # Match for ja-Jpan-JP (maximized already)
##################################################
# testGetBestMatchForList-noMatchOnMaximized
# Regression test for http://b/5714572 .
# de maximizes to de-DE. Pick the exact match for the secondary language instead.
en, de, fr, ja ; de-CH, fr ; de
##################################################
# testBestMatchForTraditionalChinese
# Scenario: An application that only supports Simplified Chinese (and some other languages),
# but does not support Traditional Chinese. zh-Hans-CN could be replaced with zh-CN, zh, or
# zh-Hans, it wouldn't make much of a difference.
# The script distance (simplified vs. traditional Han) is considered small enough
# to be an acceptable match. The regional difference is considered almost insignificant.
fr, zh-Hans-CN, en-US ; zh-TW ; zh-Hans-CN
fr, zh-Hans-CN, en-US ; zh-Hant ; zh-Hans-CN
# For geo-political reasons, you might want to avoid a zh-Hant -> zh-Hans match.
# In this case, if zh-TW, zh-HK or a tag starting with zh-Hant is requested, you can
# change your call to getBestMatch to include a 2nd language preference.
# "en" is a better match since its distance to "en-US" is closer than the distance
# from "zh-TW" to "zh-CN" (script distance).
fr, zh-Hans-CN, en-US ; zh-TW, en ; en-US
fr, zh-Hans-CN, en-US ; zh-Hant-CN, en, en ; en-US
fr, zh-Hans-CN, en-US ; zh-Hans, en ; zh-Hans-CN
##################################################
# testUndefined
# When the undefined language doesn't match anything in the list,
# getBestMatch returns the default, as usual.
it, fr ; und ; it
# When it *does* occur in the list, bestMatch returns it, as expected.
it, und ; und ; und
# The unusual part: max("und") = "en-Latn-US", and since matching is based on maximized
# tags, the undefined language would normally match English. But that would produce the
# counterintuitive results that getBestMatch("und", XLocaleMatcher("it,en")) would be "en", and
# getBestMatch("en", XLocaleMatcher("it,und")) would be "und".
# To avoid that, we change the matcher's definitions of max
# so that max("und")="und". That produces the following, more desirable
# results:
it, en ; und ; it
it, und ; en ; it
##################################################
# testGetBestMatch-regionDistance
es-AR, es ; es-MX ; es-AR
fr, en, en-GB ; en-CA ; en-GB
de-AT, de-DE, de-CH ; de ; de-DE
##################################################
# testAsymmetry
mul, nl ; af ; nl # af => nl
mul, af ; nl ; mul # but nl !=> af
##################################################
# testGetBestMatchForList-matchOnMaximized2
# ja-JP matches ja on likely subtags, and it's listed first, thus it wins over the second preference en-GB.
fr, en-GB, ja, es-ES, es-MX ; ja-JP, en-GB ; ja # Match for ja-JP, with likely region subtag
# Check that if the preference is maximized already, it works as well.
fr, en-GB, ja, es-ES, es-MX ; ja-Jpan-JP, en-GB ; ja # Match for ja-Jpan-JP (maximized already)
##################################################
# testGetBestMatchForList-closeEnoughMatchOnMaximized
en-GB, en, de, fr, ja ; de-CH, fr ; de
en-GB, en, de, fr, ja ; en-US, ar, nl, de, ja ; en
##################################################
# testGetBestMatchForPortuguese
# pt might be supported and not pt-PT
# European user who prefers Spanish over Brazillian Portuguese as a fallback.
pt-PT, pt-BR, es, es-419 ; pt-PT, es, pt ; pt-PT
pt-PT, pt, es, es-419 ; pt-PT, es, pt ; pt-PT # pt implicit
# Brazillian user who prefers South American Spanish over European Portuguese as a fallback.
# The asymmetry between this case and above is because it's "pt-PT" that's missing between the
# matchers as "pt-BR" is a much more common language.
pt-PT, pt-BR, es, es-419 ; pt, es-419, pt-PT ; pt-BR
pt-PT, pt-BR, es, es-419 ; pt-PT, es, pt ; pt-PT
pt-PT, pt, es, es-419 ; pt-PT, es, pt ; pt-PT
pt-PT, pt, es, es-419 ; pt, es-419, pt-PT ; pt
pt-BR, es, es-419 ; pt, es-419, pt-PT ; pt-BR
# Code that adds the user's country can get "pt-US" for a user's language.
# That should fall back to "pt-BR".
pt-PT, pt-BR, es, es-419 ; pt-US, pt-PT ; pt-BR
pt-PT, pt, es, es-419 ; pt-US, pt-PT, pt ; pt # pt-BR implicit
##################################################
# testVariantWithScriptMatch 1 and 2
fr, en, sv ; en-GB ; en
fr, en, sv ; en-GB ; en
en, sv ; en-GB, sv ; en
##################################################
# testLongLists
en, sv ; sv ; sv
af, am, ar, az, be, bg, bn, bs, ca, cs, cy, cy, da, de, el, en, en-GB, es, es-419, et, eu, fa, fi, fil, fr, ga, gl, gu, hi, hr, hu, hy, id, is, it, iw, ja, ka, kk, km, kn, ko, ky, lo, lt, lv, mk, ml, mn, mr, ms, my, ne, nl, no, pa, pl, pt, pt-PT, ro, ru, si, sk, sl, sq, sr, sr-Latn, sv, sw, ta, te, th, tr, uk, ur, uz, vi, zh-CN, zh-TW, zu ; sv ; sv
af, af-NA, af-ZA, agq, agq-CM, ak, ak-GH, am, am-ET, ar, ar-001, ar-AE, ar-BH, ar-DJ, ar-DZ, ar-EG, ar-EH, ar-ER, ar-IL, ar-IQ, ar-JO, ar-KM, ar-KW, ar-LB, ar-LY, ar-MA, ar-MR, ar-OM, ar-PS, ar-QA, ar-SA, ar-SD, ar-SO, ar-SS, ar-SY, ar-TD, ar-TN, ar-YE, as, as-IN, asa, asa-TZ, ast, ast-ES, az, az-Cyrl, az-Cyrl-AZ, az-Latn, az-Latn-AZ, bas, bas-CM, be, be-BY, bem, bem-ZM, bez, bez-TZ, bg, bg-BG, bm, bm-ML, bn, bn-BD, bn-IN, bo, bo-CN, bo-IN, br, br-FR, brx, brx-IN, bs, bs-Cyrl, bs-Cyrl-BA, bs-Latn, bs-Latn-BA, ca, ca-AD, ca-ES, ca-ES-VALENCIA, ca-FR, ca-IT, ce, ce-RU, cgg, cgg-UG, chr, chr-US, ckb, ckb-IQ, ckb-IR, cs, cs-CZ, cu, cu-RU, cy, cy-GB, da, da-DK, da-GL, dav, dav-KE, de, de-AT, de-BE, de-CH, de-DE, de-LI, de-LU, dje, dje-NE, dsb, dsb-DE, dua, dua-CM, dyo, dyo-SN, dz, dz-BT, ebu, ebu-KE, ee, ee-GH, ee-TG, el, el-CY, el-GR, en, en-001, en-150, en-AG, en-AI, en-AS, en-AT, en-AU, en-BB, en-BE, en-BI, en-BM, en-BS, en-BW, en-BZ, en-CA, en-CC, en-CH, en-CK, en-CM, en-CX, en-CY, en-DE, en-DG, en-DK, en-DM, en-ER, en-FI, en-FJ, en-FK, en-FM, en-GB, en-GD, en-GG, en-GH, en-GI, en-GM, en-GU, en-GY, en-HK, en-IE, en-IL, en-IM, en-IN, en-IO, en-JE, en-JM, en-KE, en-KI, en-KN, en-KY, en-LC, en-LR, en-LS, en-MG, en-MH, en-MO, en-MP, en-MS, en-MT, en-MU, en-MW, en-MY, en-NA, en-NF, en-NG, en-NL, en-NR, en-NU, en-NZ, en-PG, en-PH, en-PK, en-PN, en-PR, en-PW, en-RW, en-SB, en-SC, en-SD, en-SE, en-SG, en-SH, en-SI, en-SL, en-SS, en-SX, en-SZ, en-TC, en-TK, en-TO, en-TT, en-TV, en-TZ, en-UG, en-UM, en-US, en-US-POSIX, en-VC, en-VG, en-VI, en-VU, en-WS, en-ZA, en-ZM, en-ZW, eo, eo-001, es, es-419, es-AR, es-BO, es-CL, es-CO, es-CR, es-CU, es-DO, es-EA, es-EC, es-ES, es-GQ, es-GT, es-HN, es-IC, es-MX, es-NI, es-PA, es-PE, es-PH, es-PR, es-PY, es-SV, es-US, es-UY, es-VE, et, et-EE, eu, eu-ES, ewo, ewo-CM, fa, fa-AF, fa-IR, ff, ff-CM, ff-GN, ff-MR, ff-SN, fi, fi-FI, fil, fil-PH, fo, fo-DK, fo-FO, fr, fr-BE, fr-BF, fr-BI, fr-BJ, fr-BL, fr-CA, fr-CD, fr-CF, fr-CG, fr-CH, fr-CI, fr-CM, fr-DJ, fr-DZ, fr-FR, fr-GA, fr-GF, fr-GN, fr-GP, fr-GQ, fr-HT, fr-KM, fr-LU, fr-MA, fr-MC, fr-MF, fr-MG, fr-ML, fr-MQ, fr-MR, fr-MU, fr-NC, fr-NE, fr-PF, fr-PM, fr-RE, fr-RW, fr-SC, fr-SN, fr-SY, fr-TD, fr-TG, fr-TN, fr-VU, fr-WF, fr-YT, fur, fur-IT, fy, fy-NL, ga, ga-IE, gd, gd-GB, gl, gl-ES, gsw, gsw-CH, gsw-FR, gsw-LI, gu, gu-IN, guz, guz-KE, gv, gv-IM, ha, ha-GH, ha-NE, ha-NG, haw, haw-US, he, he-IL, hi, hi-IN, hr, hr-BA, hr-HR, hsb, hsb-DE, hu, hu-HU, hy, hy-AM, id, id-ID, ig, ig-NG, ii, ii-CN, is, is-IS, it, it-CH, it-IT, it-SM, ja, ja-JP, jgo, jgo-CM, jmc, jmc-TZ, ka, ka-GE, kab, kab-DZ, kam, kam-KE, kde, kde-TZ, kea, kea-CV, khq, khq-ML, ki, ki-KE, kk, kk-KZ, kkj, kkj-CM, kl, kl-GL, kln, kln-KE, km, km-KH, kn, kn-IN, ko, ko-KP, ko-KR, kok, kok-IN, ks, ks-IN, ksb, ksb-TZ, ksf, ksf-CM, ksh, ksh-DE, kw, kw-GB, ky, ky-KG, lag, lag-TZ, lb, lb-LU, lg, lg-UG, lkt, lkt-US, ln, ln-AO, ln-CD, ln-CF, ln-CG, lo, lo-LA, lrc, lrc-IQ, lrc-IR, lt, lt-LT, lu, lu-CD, luo, luo-KE, luy, luy-KE, lv, lv-LV, mas, mas-KE, mas-TZ, mer, mer-KE, mfe, mfe-MU, mg, mg-MG, mgh, mgh-MZ, mgo, mgo-CM, mk, mk-MK, ml, ml-IN, mn, mn-MN, mr, mr-IN, ms, ms-BN, ms-MY, ms-SG, mt, mt-MT, mua, mua-CM, my, my-MM, mzn, mzn-IR, naq, naq-NA, nb, nb-NO, nb-SJ, nd, nd-ZW, ne, ne-IN, ne-NP, nl, nl-AW, nl-BE, nl-BQ, nl-CW, nl-NL, nl-SR, nl-SX, nmg, nmg-CM, nn, nn-NO, nnh, nnh-CM, nus, nus-SS, nyn, nyn-UG, om, om-ET, om-KE, or, or-IN, os, os-GE, os-RU, pa, pa-Arab, pa-Arab-PK, pa-Guru, pa-Guru-IN, pl, pl-PL, prg, prg-001, ps, ps-AF, pt, pt-AO, pt-BR, pt-CV, pt-GW, pt-MO, pt-MZ, pt-PT, pt-ST, pt-TL, qu, qu-BO, qu-EC, qu-PE, rm, rm-CH, rn, rn-BI, ro, ro-MD, ro-RO, rof, rof-TZ, root, ru, ru-BY, ru-KG, ru-KZ, ru-MD, ru-RU, ru-UA, rw, rw-RW, rwk, rwk-TZ, sah, sah-RU, saq, saq-KE, sbp, sbp-TZ, se, se-FI, se-NO, se-SE, seh, seh-MZ, ses, ses-ML, sg, sg-CF, shi, shi-Latn, shi-Latn-MA, shi-Tfng, shi-Tfng-MA, si, si-LK, sk, sk-SK, sl, sl-SI, smn, smn-FI, sn, sn-ZW, so, so-DJ, so-ET, so-KE, so-SO, sq, sq-AL, sq-MK, sq-XK, sr, sr-Cyrl, sr-Cyrl-BA, sr-Cyrl-ME, sr-Cyrl-RS, sr-Cyrl-XK, sr-Latn, sr-Latn-BA, sr-Latn-ME, sr-Latn-RS, sr-Latn-XK, sv, sv-AX, sv-FI, sv-SE, sw, sw-CD, sw-KE, sw-TZ, sw-UG, ta, ta-IN, ta-LK, ta-MY, ta-SG, te, te-IN, teo, teo-KE, teo-UG, th, th-TH, ti, ti-ER, ti-ET, tk, tk-TM, to, to-TO, tr, tr-CY, tr-TR, twq, twq-NE, tzm, tzm-MA, ug, ug-CN, uk, uk-UA, ur, ur-IN, ur-PK, uz, uz-Arab, uz-Arab-AF, uz-Cyrl, uz-Cyrl-UZ, uz-Latn, uz-Latn-UZ, vai, vai-Latn, vai-Latn-LR, vai-Vaii, vai-Vaii-LR, vi, vi-VN, vo, vo-001, vun, vun-TZ, wae, wae-CH, xog, xog-UG, yav, yav-CM, yi, yi-001, yo, yo-BJ, yo-NG, zgh, zgh-MA, zh, zh-Hans, zh-Hans-CN, zh-Hans-HK, zh-Hans-MO, zh-Hans-SG, zh-Hant, zh-Hant-HK, zh-Hant-MO, zh-Hant-TW, zu, zu-ZA ; sv ; sv
##################################################
# test8288
it, en ; und ; it
it, en ; und, en ; en
# examples from
# https://unicode.org/repos/cldr/tags/latest/common/bcp47/
# https://unicode.org/repos/cldr/tags/latest/common/validity/variant.xml
##################################################
# testUnHack
en-NZ, en-IT ; en-US ; en-NZ
##################################################
# testEmptySupported => null
; en ; null
##################################################
# testVariantsAndExtensions
##################################################
# tests the .combine() method
und, fr ; fr-BE-fonipa ; fr ; fr-BE-fonipa
und, fr-CA ; fr-BE-fonipa ; fr-CA ; fr-BE-fonipa
und, fr-fonupa ; fr-BE-fonipa ; fr-fonupa ; fr-BE-fonipa
und, no ; nn-BE-fonipa ; no ; no-BE-fonipa
und, en-GB-u-sd-gbsct ; en-fonipa-u-nu-Arab-ca-buddhist-t-m0-iso-i0-pinyin ; en-GB-u-sd-gbsct ; en-GB-fonipa-u-nu-Arab-ca-buddhist-t-m0-iso-i0-pinyin
en-PSCRACK, de-PSCRACK, fr-PSCRACK, pt-PT-PSCRACK ; fr-PSCRACK ; fr-PSCRACK
en-PSCRACK, de-PSCRACK, fr-PSCRACK, pt-PT-PSCRACK ; fr ; fr-PSCRACK
en-PSCRACK, de-PSCRACK, fr-PSCRACK, pt-PT-PSCRACK ; de-CH ; de-PSCRACK
##################################################
# testClusters
# we favor es-419 over others in cluster. Clusters: es- {ES, MA, EA} {419, AR, MX}
und, es, es-MA, es-MX, es-419 ; es-AR ; es-419
und, es-MA, es, es-419, es-MX ; es-AR ; es-419
und, es, es-MA, es-MX, es-419 ; es-EA ; es
und, es-MA, es, es-419, es-MX ; es-EA ; es
# of course, fall back to within cluster
und, es, es-MA, es-MX ; es-AR ; es-MX
und, es-MA, es, es-MX ; es-AR ; es-MX
und, es-MA, es-MX, es-419 ; es-EA ; es-MA
und, es-MA, es-419, es-MX ; es-EA ; es-MA
# we favor es-GB over others in cluster. Clusters: en- {US, GU, VI} {GB, IN, ZA}
und, en, en-GU, en-IN, en-GB ; en-ZA ; en-GB
und, en-GU, en, en-GB, en-IN ; en-ZA ; en-GB
und, en, en-GU, en-IN, en-GB ; en-VI ; en
und, en-GU, en, en-GB, en-IN ; en-VI ; en
# of course, fall back to within cluster
und, en, en-GU, en-IN ; en-ZA ; en-IN
und, en-GU, en, en-IN ; en-ZA ; en-IN
und, en-GU, en-IN, en-GB ; en-VI ; en-GU
und, en-GU, en-GB, en-IN ; en-VI ; en-GU
##################################################
# testThreshold
@Threshold=60
50, und, fr-CA-fonupa ; fr-BE-fonipa ; fr-CA-fonupa ; fr-BE-fonipa
50, und, fr-Cyrl-CA-fonupa ; fr-BE-fonipa ; fr-Cyrl-CA-fonupa ; fr-Cyrl-BE-fonipa
@Threshold=-1 # restore
##################################################
# testScriptFirst
@DistanceOption=SCRIPT_FIRST
@debug
ru, fr ; zh, pl ; fr
ru, fr ; zh-Cyrl, pl ; ru
hr, en-Cyrl; sr ; en-Cyrl
da, ru, hr; sr ; ru
@@ -0,0 +1,231 @@
# basics
fr, en-GB, en ; en-GB ; en-GB
fr, en-GB, en ; en-US ; en
fr, en-GB, en ; fr-FR ; fr
fr, en-GB, en ; ja-JP ; fr
# script fallbacks
zh-CN, zh-TW, iw ; zh-Hant ; zh-TW
zh-CN, zh-TW, iw ; zh ; zh-CN
zh-CN, zh-TW, iw ; zh-Hans-CN ; zh-CN
zh-CN, zh-TW, iw ; zh-Hant-HK ; zh-TW
zh-CN, zh-TW, iw ; he-IT ; iw ; iw-u-rg-itzzzz
# language-specific script fallbacks 1
en, sr, nl ; sr-Latn ; sr
en, sr, nl ; sh ; sr # different script, but seems okay and is as CLDR suggests
en, sr, nl ; hr ; en
en, sr, nl ; bs ; en
en, sr, nl ; nl-Cyrl ; sr
# language-specific script fallbacks 2
en, sh ; sr ; sh
en, sh ; sr-Cyrl ; sh
en, sh ; hr ; sh
# don't match hr to sr-Latn
en, sr-Latn ; hr ; en
# both deprecated and not
fil, tl, iw, he ; he-IT ; he
fil, tl, iw, he ; he ; he
fil, tl, iw, he ; iw ; iw
fil, tl, iw, he ; fil-IT ; fil
fil, tl, iw, he ; fil ; fil
fil, tl, iw, he ; tl ; tl
# nearby languages
en, fil, ro, nn ; tl ; fil
en, fil, ro, nn ; mo ; ro
en, fil, ro, nn ; nb ; nn
en, fil, ro, nn ; ja ; en
# nearby languages: Nynorsk to Bokmål
en, nb ; nn ; nb
# nearby languages: Danish does not match nn
en, nn ; da ; en
# nearby languages: Danish matches no
en, no ; da ; no
# nearby languages: Danish matches nb
en, nb ; da ; nb
# prefer matching languages over language variants.
nn, en-GB ; no, en-US ; en-GB
nn, en-GB ; nb, en-US ; en-GB
# deprecated version is closer than same language with other differences
nl, he, en-GB ; iw, en-US ; he
# macro equivalent is closer than same language with other differences
nl, zh, en-GB, no ; cmn, en-US ; zh
nl, zh, en-GB, no ; nb, en-US ; no
# legacy equivalent is closer than same language with other differences
nl, fil, en-GB ; tl, en-US ; fil
# distinguish near equivalents
en, ro, mo, ro-MD ; ro ; ro
en, ro, mo, ro-MD ; mo ; mo
en, ro, mo, ro-MD ; ro-MD ; ro-MD
# maximization of legacy
sr-Cyrl, sr-Latn, ro, ro-MD ; sh ; sr-Latn
sr-Cyrl, sr-Latn, ro, ro-MD ; mo ; ro-MD
# empty
; fr ; und
; en ; und
# private use subtags
fr, en-GB, x-bork, es-ES, es-419 ; x-piglatin ; fr
fr, en-GB, x-bork, es-ES, es-419 ; x-bork ; x-bork
# grandfathered codes
fr, i-klingon, en-Latn-US ; en-GB-oed ; en-Latn-US
fr, i-klingon, en-Latn-US ; i-klingon ; tlh
# simple variant match
fr, en-GB, ja, es-ES, es-MX ; de, en-US ; en-GB
fr, en-GB, ja, es-ES, es-MX ; de, zh ; fr
# best match for traditional Chinese
fr, zh-Hans-CN, en-US ; zh-TW ; zh-Hans-CN
fr, zh-Hans-CN, en-US ; zh-Hant ; zh-Hans-CN
fr, zh-Hans-CN, en-US ; zh-TW, en ; en-US
fr, zh-Hans-CN, en-US ; zh-Hant-CN, en ; en-US
fr, zh-Hans-CN, en-US ; zh-Hans, en ; zh-Hans-CN
# more specific script should win in case regions are identical
af, af-Latn, af-Arab ; af ; af
af, af-Latn, af-Arab ; af-ZA ; af
af, af-Latn, af-Arab ; af-Latn-ZA ; af-Latn
af, af-Latn, af-Arab ; af-Latn ; af-Latn
# more specific region should win
nl, nl-NL, nl-BE ; nl ; nl
nl, nl-NL, nl-BE ; nl-Latn ; nl
nl, nl-NL, nl-BE ; nl-Latn-NL ; nl-NL
nl, nl-NL, nl-BE ; nl-NL ; nl-NL
# region may replace matched if matched is enclosing
es-419,es ; es-MX ; es-419 ; es-MX
es-419,es ; es-SG ; es
# more specific region wins over more specific script
nl, nl-Latn, nl-NL, nl-BE ; nl ; nl
nl, nl-Latn, nl-NL, nl-BE ; nl-Latn ; nl-Latn
nl, nl-Latn, nl-NL, nl-BE ; nl-NL ; nl-NL
nl, nl-Latn, nl-NL, nl-BE ; nl-Latn-NL ; nl-NL
# region distance Portuguese
pt, pt-PT ; pt-ES ; pt-PT
# if no preferred locale specified, pick top language, not regional
en, fr, fr-CA, fr-CH ; fr-US ; fr ; fr-u-rg-uszzzz
# region distance German
de-AT, de-DE, de-CH ; de ; de-DE
# en-AU is closer to en-GB than to en (which is en-US)
en, en-GB, es-ES, es-419 ; en-AU ; en-GB
en, en-GB, es-ES, es-419 ; es-MX ; es-419 ; es-MX
en, en-GB, es-ES, es-419 ; es-PT ; es-ES
# undefined
it, fr ; und ; it
# und does not match en
it, en ; und ; it
# undefined in priority list
it, und ; und ; und
it, und ; en ; it
# undefined
it, fr, zh ; und-FR ; fr
it, fr, zh ; und-CN ; zh
it, fr, zh ; und-Hans ; zh
it, fr, zh ; und-Hant ; zh
it, fr, zh ; und-Latn ; it
# match on maximized tag
fr, en-GB, ja, es-ES, es-MX ; ja-JP, en-GB ; ja
fr, en-GB, ja, es-ES, es-MX ; ja-Jpan-JP, en-GB ; ja
# pick best maximized tag
ja, ja-Jpan-US, ja-JP, en, ru ; ja-Jpan, ru ; ja
ja, ja-Jpan-US, ja-JP, en, ru ; ja-JP, ru ; ja-JP
ja, ja-Jpan-US, ja-JP, en, ru ; ja-US, ru ; ja-Jpan-US
# termination: pick best maximized match
ja, ja-Jpan, ja-JP, en, ru ; ja-Jpan-JP, ru ; ja-JP
ja, ja-Jpan, ja-JP, en, ru ; ja-Jpan, ru ; ja-Jpan
# same language over exact, but distinguish when user is explicit
fr, en-GB, ja, es-ES, es-MX ; ja, de ; ja
en, de, fr, ja ; de-CH, fr ; de # TODO: ; de-u-rg-CH
en-GB, nl ; en, nl ; en-GB
en-GB, nl ; en, nl, en-GB ; nl
# parent relation preserved
en, en-US, en-GB, es, es-419, pt, pt-BR, pt-PT, zh, zh-Hant, zh-Hant-HK ; en-150 ; en-GB
en, en-US, en-GB, es, es-419, pt, pt-BR, pt-PT, zh, zh-Hant, zh-Hant-HK ; en-AU ; en-GB
en, en-US, en-GB, es, es-419, pt, pt-BR, pt-PT, zh, zh-Hant, zh-Hant-HK ; en-BE ; en-GB
en, en-US, en-GB, es, es-419, pt, pt-BR, pt-PT, zh, zh-Hant, zh-Hant-HK ; en-GG ; en-GB
en, en-US, en-GB, es, es-419, pt, pt-BR, pt-PT, zh, zh-Hant, zh-Hant-HK ; en-GI ; en-GB
en, en-US, en-GB, es, es-419, pt, pt-BR, pt-PT, zh, zh-Hant, zh-Hant-HK ; en-HK ; en-GB
en, en-US, en-GB, es, es-419, pt, pt-BR, pt-PT, zh, zh-Hant, zh-Hant-HK ; en-IE ; en-GB
en, en-US, en-GB, es, es-419, pt, pt-BR, pt-PT, zh, zh-Hant, zh-Hant-HK ; en-IM ; en-GB
en, en-US, en-GB, es, es-419, pt, pt-BR, pt-PT, zh, zh-Hant, zh-Hant-HK ; en-IN ; en-GB
en, en-US, en-GB, es, es-419, pt, pt-BR, pt-PT, zh, zh-Hant, zh-Hant-HK ; en-JE ; en-GB
en, en-US, en-GB, es, es-419, pt, pt-BR, pt-PT, zh, zh-Hant, zh-Hant-HK ; en-MT ; en-GB
en, en-US, en-GB, es, es-419, pt, pt-BR, pt-PT, zh, zh-Hant, zh-Hant-HK ; en-NZ ; en-GB
en, en-US, en-GB, es, es-419, pt, pt-BR, pt-PT, zh, zh-Hant, zh-Hant-HK ; en-PK ; en-GB
en, en-US, en-GB, es, es-419, pt, pt-BR, pt-PT, zh, zh-Hant, zh-Hant-HK ; en-SG ; en-GB
en, en-US, en-GB, es, es-419, pt, pt-BR, pt-PT, zh, zh-Hant, zh-Hant-HK ; en-DE ; en-GB
en, en-US, en-GB, es, es-419, pt, pt-BR, pt-PT, zh, zh-Hant, zh-Hant-HK ; en-MT ; en-GB
en, en-US, en-GB, es, es-419, pt, pt-BR, pt-PT, zh, zh-Hant, zh-Hant-HK ; es-AR ; es-419 ; es-AR
en, en-US, en-GB, es, es-419, pt, pt-BR, pt-PT, zh, zh-Hant, zh-Hant-HK ; es-BO ; es-419 ; es-BO
en, en-US, en-GB, es, es-419, pt, pt-BR, pt-PT, zh, zh-Hant, zh-Hant-HK ; es-CL ; es-419 ; es-CL
en, en-US, en-GB, es, es-419, pt, pt-BR, pt-PT, zh, zh-Hant, zh-Hant-HK ; es-CO ; es-419 ; es-CO
en, en-US, en-GB, es, es-419, pt, pt-BR, pt-PT, zh, zh-Hant, zh-Hant-HK ; es-CR ; es-419 ; es-CR
en, en-US, en-GB, es, es-419, pt, pt-BR, pt-PT, zh, zh-Hant, zh-Hant-HK ; es-CU ; es-419 ; es-CU
en, en-US, en-GB, es, es-419, pt, pt-BR, pt-PT, zh, zh-Hant, zh-Hant-HK ; es-DO ; es-419 ; es-DO
en, en-US, en-GB, es, es-419, pt, pt-BR, pt-PT, zh, zh-Hant, zh-Hant-HK ; es-EC ; es-419 ; es-EC
en, en-US, en-GB, es, es-419, pt, pt-BR, pt-PT, zh, zh-Hant, zh-Hant-HK ; es-GT ; es-419 ; es-GT
en, en-US, en-GB, es, es-419, pt, pt-BR, pt-PT, zh, zh-Hant, zh-Hant-HK ; es-HN ; es-419 ; es-HN
en, en-US, en-GB, es, es-419, pt, pt-BR, pt-PT, zh, zh-Hant, zh-Hant-HK ; es-MX ; es-419 ; es-MX
en, en-US, en-GB, es, es-419, pt, pt-BR, pt-PT, zh, zh-Hant, zh-Hant-HK ; es-NI ; es-419 ; es-NI
en, en-US, en-GB, es, es-419, pt, pt-BR, pt-PT, zh, zh-Hant, zh-Hant-HK ; es-PA ; es-419 ; es-PA
en, en-US, en-GB, es, es-419, pt, pt-BR, pt-PT, zh, zh-Hant, zh-Hant-HK ; es-PE ; es-419 ; es-PE
en, en-US, en-GB, es, es-419, pt, pt-BR, pt-PT, zh, zh-Hant, zh-Hant-HK ; es-PR ; es-419 ; es-PR
en, en-US, en-GB, es, es-419, pt, pt-BR, pt-PT, zh, zh-Hant, zh-Hant-HK ; es-PT ; es
en, en-US, en-GB, es, es-419, pt, pt-BR, pt-PT, zh, zh-Hant, zh-Hant-HK ; es-PY ; es-419 ; es-PY
en, en-US, en-GB, es, es-419, pt, pt-BR, pt-PT, zh, zh-Hant, zh-Hant-HK ; es-SV ; es-419 ; es-SV
en, en-US, en-GB, es, es-419, pt, pt-BR, pt-PT, zh, zh-Hant, zh-Hant-HK ; es-US ; es-419
en, en-US, en-GB, es, es-419, pt, pt-BR, pt-PT, zh, zh-Hant, zh-Hant-HK ; es-UY ; es-419 ; es-UY
en, en-US, en-GB, es, es-419, pt, pt-BR, pt-PT, zh, zh-Hant, zh-Hant-HK ; es-VE ; es-419 ; es-VE
en, en-US, en-GB, es, es-419, pt, pt-BR, pt-PT, zh, zh-Hant, zh-Hant-HK ; pt-AO ; pt-PT
en, en-US, en-GB, es, es-419, pt, pt-BR, pt-PT, zh, zh-Hant, zh-Hant-HK ; pt-CV ; pt-PT
en, en-US, en-GB, es, es-419, pt, pt-BR, pt-PT, zh, zh-Hant, zh-Hant-HK ; pt-GW ; pt-PT
en, en-US, en-GB, es, es-419, pt, pt-BR, pt-PT, zh, zh-Hant, zh-Hant-HK ; pt-MO ; pt-PT
en, en-US, en-GB, es, es-419, pt, pt-BR, pt-PT, zh, zh-Hant, zh-Hant-HK ; pt-MZ ; pt-PT
en, en-US, en-GB, es, es-419, pt, pt-BR, pt-PT, zh, zh-Hant, zh-Hant-HK ; pt-ST ; pt-PT
en, en-US, en-GB, es, es-419, pt, pt-BR, pt-PT, zh, zh-Hant, zh-Hant-HK ; pt-TL ; pt-PT
# preserve extensions
en, de, sl-nedis ; de-FR-u-co-phonebk ; de ; de-u-co-phonebk-rg-frzzzz
en, de, sl-nedis ; sl-nedis-u-cu-eur ; sl-nedis ; sl-nedis-u-cu-eur
en, de, sl-nedis ; sl-u-cu-eur ; sl-nedis ; sl-nedis-u-cu-eur
en, de, sl-nedis ; sl-HR-nedis-u-cu-eur ; sl-nedis ; sl-nedis-u-cu-eur-rg-hrzzzz
en, de, sl-nedis ; de-t-m0-iso-i0-pinyin ; de ; de-t-m0-iso-i0-pinyin
und, nl ; nl-BE-fonipa ; nl ; nl-u-rg-bezzzz
und, nl-CA ; nl-BE-fonipa ; nl-CA ; nl-CA-u-rg-bezzzz
und, nl-fonupa ; nl-BE-fonipa ; nl-fonupa ; nl-fonupa-u-rg-bezzzz
und, no ; nn-DK-fonipa ; no ; no-u-rg-dkzzzz
und, en-GB-u-sd-usca ; en-US-fonipa-u-nu-Arab-ca-buddhist-sd-usdc-t-m0-iso-i0-pinyin ; en-GB-u-sd-usca ; en-GB-t-m0-iso-i0-pinyin-u-ca-buddhist-nu-Arab-rg-uszzzz-sd-usca