Files
LearnGO/go/pkg/mod/mvdan.cc/xurls/v2@v2.5.0/generate/unicodegen/main.go
T
2024-09-19 21:38:24 -04:00

153 lines
4.1 KiB
Go

// Copyright (c) 2015, Daniel Martí <mvdan@mvdan.cc>
// See LICENSE for licensing information
package main
import (
"log"
"os"
"strconv"
"strings"
"text/template"
"unicode"
)
// path is the name of the file that writeUnicode creates and that main
// reports in its log output.
const path = "unicode.go"

// tmpl renders the generated source file: a "package xurls" file declaring
// the allowedUcsChar and allowedUcsCharMinusPunc constants. It is executed
// with a map providing the "withPunc" and "withoutPunc" keys, whose values
// are inserted verbatim as the constants' initializers.
var tmpl = template.Must(template.New("tlds").Parse(`// Generated by unicodegen
package xurls
const allowedUcsChar = {{.withPunc}}
const allowedUcsCharMinusPunc = {{.withoutPunc}}
`))
// visit invokes fn once for every code point described by rt. The 16-bit
// ranges are walked first, then the 32-bit ranges, each in table order;
// within a range, code points are produced from Lo up to and including Hi
// in steps of Stride.
func visit(rt *unicode.RangeTable, fn func(rune)) {
	// emit walks one inclusive range, already widened to rune.
	emit := func(first, last, step rune) {
		for cp := first; cp <= last; cp += step {
			fn(cp)
		}
	}

	for i := range rt.R16 {
		span := rt.R16[i]
		emit(rune(span.Lo), rune(span.Hi), rune(span.Stride))
	}
	for i := range rt.R32 {
		span := rt.R32[i]
		emit(rune(span.Lo), rune(span.Hi), rune(span.Stride))
	}
}
// writeUnicode generates the file named by path, which declares the
// allowedUcsChar and allowedUcsCharMinusPunc constants for package xurls.
//
// Starting from the code point ranges permitted by RFC 3987, it removes every
// rune in unicode.Z (separators) to obtain the first constant, and then
// additionally removes every rune in unicode.Po (punctuation) to obtain the
// second. Each result is rendered as the contents of a regular expression
// character class and written through tmpl as a quoted Go string.
//
// It returns the first error encountered while creating the file, executing
// the template, or closing the file.
func writeUnicode() error {
	// rfc3987Ranges contains the ranges of valid code points specified by RFC 3987.
	rfc3987Ranges := [][2]rune{
		{0xA0, 0xD7FF},
		{0xF900, 0xFDCF},
		{0xFDF0, 0xFFEF},
		{0x10000, 0x1FFFD},
		{0x20000, 0x2FFFD},
		{0x30000, 0x3FFFD},
		{0x40000, 0x4FFFD},
		{0x50000, 0x5FFFD},
		{0x60000, 0x6FFFD},
		{0x70000, 0x7FFFD},
		{0x80000, 0x8FFFD},
		{0x90000, 0x9FFFD},
		{0xA0000, 0xAFFFD},
		{0xB0000, 0xBFFFD},
		{0xC0000, 0xCFFFD},
		{0xD0000, 0xDFFFD},
		{0xE1000, 0xEFFFD},
	}

	// removeRune accepts a slice of inclusive code point ranges (in ascending order)
	// and returns a slice that is equivalent except for excluding a specified rune
	// by removing/replacing/splitting any range containing it.
	// The input slice is never modified: when `cp` is found, a freshly allocated
	// slice is returned; when it is not, the input is returned as is.
	// Its linear searches over the ranges (including those added by previous invocations)
	// are inefficient, but acceptable because this code runs only at build time.
	removeRune := func(ranges [][2]rune, cp rune) [][2]rune {
		for i, r := range ranges {
			// Ranges are in ascending order. Skip any that precede `cp`,
			// and bail out upon reaching one that follows `cp`.
			if r[1] < cp {
				continue
			}
			if cp < r[0] {
				break
			}

			// `cp` is in this range. Rebuild the slice around it, keeping
			// whatever part of the range lies on either side of `cp`; a
			// range may contribute zero, one, or two pieces.
			out := make([][2]rune, 0, len(ranges)+1)
			out = append(out, ranges[:i]...)
			if r[0] < cp {
				out = append(out, [2]rune{r[0], cp - 1})
			}
			if cp < r[1] {
				out = append(out, [2]rune{cp + 1, r[1]})
			}
			return append(out, ranges[i+1:]...)
		}
		return ranges
	}

	// sepFreeRanges excludes separators from rfc3987Ranges.
	// No defensive copy is needed because removeRune does not mutate its input.
	sepFreeRanges := rfc3987Ranges
	visit(unicode.Z, func(cp rune) {
		sepFreeRanges = removeRune(sepFreeRanges, cp)
	})

	// puncFreeRanges excludes punctuation from sepFreeRanges.
	puncFreeRanges := sepFreeRanges
	visit(unicode.Po, func(cp rune) {
		puncFreeRanges = removeRune(puncFreeRanges, cp)
	})

	// Build the corresponding regular expression character class contents.
	// The result is returned as a string rather than a strings.Builder,
	// because a non-zero Builder must not be copied.
	characterClassContents := func(ranges [][2]rune) string {
		var builder strings.Builder
		for _, r := range ranges {
			// regexp.QuoteMeta is not necessary because all metacharacters are ASCII.
			// cf. https://golang.org/s/re2syntax and
			// https://cs.opensource.google/go/go/+/refs/tags/go1.17.6:src/regexp/regexp.go;l=721
			builder.WriteRune(r[0])
			if r[0] == r[1] {
				// Single-element range: the lone rune is enough.
				continue
			}
			builder.WriteRune('-')
			builder.WriteRune(r[1])
		}
		return builder.String()
	}
	allowedUcsChar := characterClassContents(sepFreeRanges)
	allowedUcsCharMinusPunc := characterClassContents(puncFreeRanges)

	// Write to file.
	f, err := os.Create(path)
	if err != nil {
		return err
	}
	execErr := tmpl.Execute(f, map[string]string{
		"withPunc":    strconv.Quote(allowedUcsChar),
		"withoutPunc": strconv.Quote(allowedUcsCharMinusPunc),
	})
	// Always close the file, and report a Close failure when rendering
	// itself succeeded: for a file that was written to, Close can be the
	// point at which a write error finally surfaces.
	closeErr := f.Close()
	if execErr != nil {
		return execErr
	}
	return closeErr
}
// main announces the file being generated, runs writeUnicode, and terminates
// the process via log.Fatalf if generation reports an error.
func main() {
	log.Printf("Generating %s...", path)

	err := writeUnicode()
	if err == nil {
		return
	}
	log.Fatalf("Could not write path: %v", err)
}