// Copyright (c) 2015, Daniel Martí
// See LICENSE for licensing information

// This program generates the file named by path. The generated file declares
// two string constants for package xurls holding regular expression character
// class contents: the code points allowed by RFC 3987 with separators removed,
// and the same set with "other punctuation" removed as well.
package main

import (
	"log"
	"os"
	"strconv"
	"strings"
	"text/template"
	"unicode"
)

// path is the file that writeUnicode creates (or truncates) in the current
// working directory.
const path = "unicode.go"

// tmpl renders the generated source file. It expects a map with the keys
// "withPunc" and "withoutPunc", each holding an already-quoted Go string
// literal.
var tmpl = template.Must(template.New("tlds").Parse(`// Generated by unicodegen

package xurls

const allowedUcsChar = {{.withPunc}}

const allowedUcsCharMinusPunc = {{.withoutPunc}}
`))

// visit calls fn once for every rune described by rt, walking the 16-bit
// ranges first and the 32-bit ranges second, honoring each range's stride.
func visit(rt *unicode.RangeTable, fn func(rune)) {
	for _, r16 := range rt.R16 {
		for r := rune(r16.Lo); r <= rune(r16.Hi); r += rune(r16.Stride) {
			fn(r)
		}
	}
	for _, r32 := range rt.R32 {
		for r := rune(r32.Lo); r <= rune(r32.Hi); r += rune(r32.Stride) {
			fn(r)
		}
	}
}

// removeRune takes inclusive code point ranges in ascending order and returns
// an equivalent list that no longer covers cp, by deleting, shrinking, or
// splitting the range that contains it. If no range contains cp the list is
// returned unchanged.
//
// The edit is performed in place: the backing array of ranges may be
// modified, so callers must use the returned slice and must not rely on the
// argument afterwards.
//
// The linear search over the ranges (including those added by previous
// invocations) is inefficient, but acceptable because this code runs only at
// build time.
func removeRune(ranges [][2]rune, cp rune) [][2]rune {
	for i, r := range ranges {
		lo, hi := r[0], r[1]
		switch {
		case hi < cp:
			// This range precedes cp; keep looking.
			continue
		case cp < lo:
			// Ranges are ascending, so no later range can contain cp.
			return ranges
		case lo == hi:
			// cp is the only element: drop the whole range.
			return append(ranges[:i], ranges[i+1:]...)
		case cp == lo:
			// Drop the first element of this range.
			ranges[i][0] = cp + 1
			return ranges
		case cp == hi:
			// Drop the last element of this range.
			ranges[i][1] = cp - 1
			return ranges
		default:
			// cp is strictly inside: split into two ranges around it.
			ranges = append(ranges, [2]rune{})
			copy(ranges[i+2:], ranges[i+1:])
			ranges[i] = [2]rune{lo, cp - 1}
			ranges[i+1] = [2]rune{cp + 1, hi}
			return ranges
		}
	}
	return ranges
}

// allowedRanges returns two ascending lists of inclusive code point ranges:
// sepFree is the RFC 3987 set minus every rune in unicode.Z (separators), and
// puncFree is sepFree minus every rune in unicode.Po (other punctuation).
// The two results do not share storage.
func allowedRanges() (sepFree, puncFree [][2]rune) {
	// The ranges of valid code points specified by RFC 3987.
	sepFree = [][2]rune{
		{0xA0, 0xD7FF},
		{0xF900, 0xFDCF},
		{0xFDF0, 0xFFEF},
		{0x10000, 0x1FFFD},
		{0x20000, 0x2FFFD},
		{0x30000, 0x3FFFD},
		{0x40000, 0x4FFFD},
		{0x50000, 0x5FFFD},
		{0x60000, 0x6FFFD},
		{0x70000, 0x7FFFD},
		{0x80000, 0x8FFFD},
		{0x90000, 0x9FFFD},
		{0xA0000, 0xAFFFD},
		{0xB0000, 0xBFFFD},
		{0xC0000, 0xCFFFD},
		{0xD0000, 0xDFFFD},
		{0xE1000, 0xEFFFD},
	}
	visit(unicode.Z, func(cp rune) {
		sepFree = removeRune(sepFree, cp)
	})

	// Copy before removing punctuation: removeRune edits its argument in
	// place, and sepFree must stay intact.
	puncFree = append([][2]rune{}, sepFree...)
	visit(unicode.Po, func(cp rune) {
		puncFree = removeRune(puncFree, cp)
	})
	return sepFree, puncFree
}

// characterClassContents renders ranges as the contents of a regular
// expression character class: a single rune for a one-element range, and
// "lo-hi" otherwise.
//
// regexp.QuoteMeta is not necessary as long as every rune is non-ASCII,
// because all metacharacters are ASCII.
// cf. https://golang.org/s/re2syntax and
// https://cs.opensource.google/go/go/+/refs/tags/go1.17.6:src/regexp/regexp.go;l=721
func characterClassContents(ranges [][2]rune) string {
	var b strings.Builder
	for _, r := range ranges {
		b.WriteRune(r[0])
		if r[0] == r[1] {
			continue
		}
		b.WriteRune('-')
		b.WriteRune(r[1])
	}
	// Return the string rather than the Builder: a non-zero Builder must not
	// be copied.
	return b.String()
}

// writeUnicode computes the allowed character classes and writes them, as
// quoted Go string constants, to the file named by path. It returns any error
// from creating, rendering into, or closing that file.
func writeUnicode() error {
	sepFree, puncFree := allowedRanges()
	data := map[string]string{
		"withPunc":    strconv.Quote(characterClassContents(sepFree)),
		"withoutPunc": strconv.Quote(characterClassContents(puncFree)),
	}

	f, err := os.Create(path)
	if err != nil {
		return err
	}
	if err := tmpl.Execute(f, data); err != nil {
		// The render error is the one worth reporting; the close is only
		// cleanup at this point.
		_ = f.Close()
		return err
	}
	// Close can surface a deferred write failure, so report it instead of
	// silently leaving a truncated file behind.
	return f.Close()
}

func main() {
	log.Printf("Generating %s...", path)
	if err := writeUnicode(); err != nil {
		log.Fatalf("Could not write path: %v", err)
	}
}