whatcanGOwrong
This commit is contained in:
@@ -0,0 +1,2 @@
|
||||
# To prevent CRLF breakages on Windows for fragile files, like testdata.
|
||||
* -text
|
||||
@@ -0,0 +1 @@
|
||||
github: mvdan
|
||||
@@ -0,0 +1,23 @@
|
||||
on: [push, pull_request]
|
||||
name: Test
|
||||
jobs:
|
||||
test:
|
||||
strategy:
|
||||
matrix:
|
||||
go-version: [1.19.x, 1.20.x]
|
||||
os: [ubuntu-latest, macos-11, windows-latest]
|
||||
runs-on: ${{ matrix.os }}
|
||||
steps:
|
||||
- uses: actions/setup-go@v3
|
||||
with:
|
||||
go-version: ${{ matrix.go-version }}
|
||||
- uses: actions/checkout@v3
|
||||
- run: go test ./...
|
||||
- run: go test -race ./...
|
||||
|
||||
# Static checks from this point forward. Only run on one Go version and on
|
||||
# Linux, since it's the fastest platform, and the tools behave the same.
|
||||
- if: matrix.os == 'ubuntu-latest' && matrix.go-version == '1.20.x'
|
||||
run: diff <(echo -n) <(gofmt -s -d .)
|
||||
- if: matrix.os == 'ubuntu-latest' && matrix.go-version == '1.20.x'
|
||||
run: go vet ./...
|
||||
@@ -0,0 +1,3 @@
|
||||
cmd/xurls/xurls
|
||||
generate/tldsgen/tldsgen
|
||||
generate/regexgen/regexgen
|
||||
@@ -0,0 +1,27 @@
|
||||
Copyright (c) 2015, Daniel Martí. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the following disclaimer
|
||||
in the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
* Neither the name of the copyright holder nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
@@ -0,0 +1,37 @@
|
||||
# xurls
|
||||
|
||||
[![Go Reference](https://pkg.go.dev/badge/mvdan.cc/xurls/v2.svg)](https://pkg.go.dev/mvdan.cc/xurls/v2)
|
||||
|
||||
Extract urls from text using regular expressions. Requires Go 1.19 or later.
|
||||
|
||||
```go
|
||||
import "mvdan.cc/xurls/v2"
|
||||
|
||||
func main() {
|
||||
rxRelaxed := xurls.Relaxed()
|
||||
rxRelaxed.FindString("Do gophers live in golang.org?") // "golang.org"
|
||||
rxRelaxed.FindString("This string does not have a URL") // ""
|
||||
|
||||
rxStrict := xurls.Strict()
|
||||
rxStrict.FindAllString("must have scheme: http://foo.com/.", -1) // []string{"http://foo.com/"}
|
||||
rxStrict.FindAllString("no scheme, no match: foo.com", -1) // []string{}
|
||||
}
|
||||
```
|
||||
|
||||
Since the API is centered around [regexp.Regexp](https://golang.org/pkg/regexp/#Regexp),
|
||||
many other methods are available, such as finding the [byte indexes](https://golang.org/pkg/regexp/#Regexp.FindAllIndex)
|
||||
for all matches.
|
||||
|
||||
The regular expressions are compiled when the API is first called.
|
||||
Any subsequent calls will use the same regular expression pointers.
|
||||
|
||||
#### cmd/xurls
|
||||
|
||||
To install the tool globally:
|
||||
|
||||
    go install mvdan.cc/xurls/v2/cmd/xurls@latest
|
||||
|
||||
```shell
|
||||
$ echo "Do gophers live in http://golang.org?" | xurls
|
||||
http://golang.org
|
||||
```
|
||||
@@ -0,0 +1,293 @@
|
||||
// Copyright (c) 2015, Daniel Martí <mvdan@mvdan.cc>
|
||||
// See LICENSE for licensing information
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"errors"
|
||||
"flag"
|
||||
"fmt"
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"regexp"
|
||||
"runtime/debug"
|
||||
"strings"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"golang.org/x/mod/module"
|
||||
|
||||
"mvdan.cc/xurls/v2"
|
||||
)
|
||||
|
||||
var (
|
||||
matching = flag.String("m", "", "")
|
||||
relaxed = flag.Bool("r", false, "")
|
||||
fix boolString
|
||||
version = flag.Bool("version", false, "")
|
||||
)
|
||||
|
||||
// boolString is a flag value that records the raw text it was given.
//
// Because IsBoolFlag reports true, the flag package accepts the flag with no
// argument and passes "true" to Set in that case, so the same flag can be used
// both as a plain switch and with an explicit string value.
type boolString string

// Set stores val verbatim. It never fails.
func (s *boolString) Set(val string) error {
	stored := boolString(val)
	*s = stored
	return nil
}

// Get returns the stored text as a plain string.
func (s *boolString) Get() any {
	return s.String()
}

// String returns the stored text.
func (s *boolString) String() string {
	text := *s
	return string(text)
}

// IsBoolFlag marks the value as usable without an explicit argument.
func (*boolString) IsBoolFlag() bool {
	return true
}
|
||||
|
||||
func init() {
|
||||
flag.Var(&fix, "fix", "")
|
||||
flag.Usage = func() {
|
||||
fmt.Fprint(os.Stderr, `
|
||||
Usage: xurls [-h] [files]
|
||||
|
||||
xurls extracts urls from text using regular expressions.
|
||||
If no files are given, it reads from standard input.
|
||||
|
||||
-m <regexp> only match urls whose scheme matches a regexp
|
||||
example: 'https?://|mailto:'
|
||||
-r also match urls without a scheme (relaxed)
|
||||
-version print version and exit
|
||||
|
||||
When the -fix or -fix=auto flag is used, xurls instead attempts to replace
|
||||
any urls which result in a permanent redirect (301 or 308).
|
||||
It also fails if any urls fail to load, so that they may be removed or replaced.
|
||||
To replace urls which result in temporary redirect as well, use -fix=all.
|
||||
`[1:])
|
||||
}
|
||||
}
|
||||
|
||||
func scanPath(re *regexp.Regexp, path string) error {
|
||||
in := os.Stdin
|
||||
out := io.Writer(os.Stdout)
|
||||
var outBuf *bytes.Buffer
|
||||
if path != "-" {
|
||||
var err error
|
||||
in, err = os.Open(path)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if fix != "" {
|
||||
outBuf = new(bytes.Buffer)
|
||||
out = outBuf
|
||||
}
|
||||
defer in.Close()
|
||||
}
|
||||
|
||||
// A maximum of 32 parallel requests.
|
||||
maxWeight := int64(32)
|
||||
seq := newSequencer(maxWeight, out, os.Stderr)
|
||||
|
||||
userAgent := fmt.Sprintf("mvdan.cc/xurls %s", readVersion())
|
||||
scanner := bufio.NewScanner(in)
|
||||
|
||||
// Doesn't need to be part of reporterState as order doesn't matter.
|
||||
var atomicFixedCount uint32
|
||||
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text() + "\n"
|
||||
matches := re.FindAllStringIndex(line, -1)
|
||||
if fix == "" {
|
||||
for _, pair := range matches {
|
||||
match := line[pair[0]:pair[1]]
|
||||
fmt.Printf("%s\n", match)
|
||||
}
|
||||
continue
|
||||
}
|
||||
weight := int64(len(matches))
|
||||
if weight > maxWeight {
|
||||
weight = maxWeight
|
||||
}
|
||||
seq.Add(weight, func(r *reporter) error {
|
||||
offsetWithinLine := 0
|
||||
for _, pair := range matches {
|
||||
// The indexes are based on the original line.
|
||||
pair[0] += offsetWithinLine
|
||||
pair[1] += offsetWithinLine
|
||||
match := line[pair[0]:pair[1]]
|
||||
origURL, err := url.Parse(match)
|
||||
if err != nil {
|
||||
r.appendBroken(match, err.Error())
|
||||
continue
|
||||
}
|
||||
fixed := origURL.String()
|
||||
switch origURL.Scheme {
|
||||
case "http", "https":
|
||||
// See if the URL redirects somewhere.
|
||||
client := &http.Client{
|
||||
Timeout: 10 * time.Second,
|
||||
CheckRedirect: func(req *http.Request, via []*http.Request) error {
|
||||
if len(via) >= 10 {
|
||||
return errors.New("stopped after 10 redirects")
|
||||
}
|
||||
switch req.Response.StatusCode {
|
||||
case http.StatusMovedPermanently, http.StatusPermanentRedirect:
|
||||
// "auto" and "all" fix permanent redirects.
|
||||
case http.StatusFound, http.StatusSeeOther, http.StatusTemporaryRedirect:
|
||||
// Only "all" fixes temporary redirects.
|
||||
if fix != "all" {
|
||||
return http.ErrUseLastResponse
|
||||
}
|
||||
default:
|
||||
// Any other redirects are ignored.
|
||||
return http.ErrUseLastResponse
|
||||
}
|
||||
// Inherit the fragment if empty.
|
||||
if req.URL.Fragment == "" {
|
||||
req.URL.Fragment = origURL.Fragment
|
||||
}
|
||||
fixed = req.URL.String()
|
||||
return nil
|
||||
},
|
||||
}
|
||||
method := http.MethodHead
|
||||
retry:
|
||||
req, err := http.NewRequest(method, fixed, nil)
|
||||
if err != nil {
|
||||
r.appendBroken(match, err.Error())
|
||||
continue
|
||||
}
|
||||
req.Header.Set("User-Agent", userAgent)
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
r.appendBroken(match, err.Error())
|
||||
continue
|
||||
}
|
||||
if code := resp.StatusCode; code >= 400 {
|
||||
if code == http.StatusMethodNotAllowed {
|
||||
method = http.MethodGet
|
||||
resp.Body.Close()
|
||||
goto retry
|
||||
}
|
||||
r.appendBroken(match, fmt.Sprintf("%d %s", code, http.StatusText(code)))
|
||||
}
|
||||
resp.Body.Close()
|
||||
}
|
||||
if fixed != match {
|
||||
// Replace the url, and update offsetWithinLine.
|
||||
newLine := line[:pair[0]] + fixed + line[pair[1]:]
|
||||
offsetWithinLine += len(newLine) - len(line)
|
||||
line = newLine
|
||||
atomic.AddUint32(&atomicFixedCount, 1)
|
||||
}
|
||||
}
|
||||
io.WriteString(r, line) // add the fixed line to outBuf
|
||||
return nil
|
||||
})
|
||||
if err := scanner.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
state := seq.finalState()
|
||||
if state.exitCode != 0 {
|
||||
panic("we aren't using sequencer for any errors")
|
||||
}
|
||||
// Note that all goroutines have stopped at this point.
|
||||
if atomicFixedCount > 0 && path != "-" {
|
||||
in.Close()
|
||||
// Overwrite the file, if we weren't reading stdin. Report its
|
||||
// path too.
|
||||
fmt.Println(path)
|
||||
if err := ioutil.WriteFile(path, outBuf.Bytes(), 0o666); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
if len(state.brokenURLs) > 0 {
|
||||
var s strings.Builder
|
||||
fmt.Fprintf(&s, "found %d broken urls in %q:\n", len(state.brokenURLs), path)
|
||||
for _, broken := range state.brokenURLs {
|
||||
fmt.Fprintf(&s, " * %s - %s\n", broken.url, broken.reason)
|
||||
}
|
||||
return errors.New(s.String())
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// main runs the command and exits with the status that main1 reports.
func main() {
	status := main1()
	os.Exit(status)
}
|
||||
|
||||
func main1() int {
|
||||
flag.Parse()
|
||||
if *version {
|
||||
fmt.Println(readVersion())
|
||||
return 0
|
||||
}
|
||||
if *relaxed && *matching != "" {
|
||||
fmt.Fprintln(os.Stderr, "-r and -m at the same time don't make much sense")
|
||||
return 1
|
||||
}
|
||||
switch fix {
|
||||
case "": // disabled by default
|
||||
case "false": // disabled via -fix=false; normalize
|
||||
fix = ""
|
||||
case "auto", "all": // enabled via -fix=auto, -fix=all, etc
|
||||
case "true": // enabled via -fix; normalize
|
||||
fix = "auto"
|
||||
}
|
||||
var re *regexp.Regexp
|
||||
if *relaxed {
|
||||
re = xurls.Relaxed()
|
||||
} else if *matching != "" {
|
||||
var err error
|
||||
if re, err = xurls.StrictMatchingScheme(*matching); err != nil {
|
||||
fmt.Fprintln(os.Stderr, err)
|
||||
return 1
|
||||
}
|
||||
} else {
|
||||
re = xurls.Strict()
|
||||
}
|
||||
args := flag.Args()
|
||||
if len(args) == 0 {
|
||||
args = []string{"-"}
|
||||
}
|
||||
for _, path := range args {
|
||||
if err := scanPath(re, path); err != nil {
|
||||
fmt.Fprintln(os.Stderr, err)
|
||||
return 1
|
||||
}
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
// Borrowed from https://github.com/burrowers/garble.
|
||||
|
||||
func readVersion() string {
|
||||
info, ok := debug.ReadBuildInfo()
|
||||
if !ok {
|
||||
return "unknown"
|
||||
}
|
||||
mod := &info.Main
|
||||
if mod.Replace != nil {
|
||||
mod = mod.Replace
|
||||
}
|
||||
|
||||
// Until https://github.com/golang/go/issues/50603 is implemented,
|
||||
// manually construct something like a pseudo-version.
|
||||
// TODO: remove when this code is dead, hopefully in Go 1.20.
|
||||
if mod.Version == "(devel)" {
|
||||
var vcsTime time.Time
|
||||
var vcsRevision string
|
||||
for _, setting := range info.Settings {
|
||||
switch setting.Key {
|
||||
case "vcs.time":
|
||||
// If the format is invalid, we'll print a zero timestamp.
|
||||
vcsTime, _ = time.Parse(time.RFC3339Nano, setting.Value)
|
||||
case "vcs.revision":
|
||||
vcsRevision = setting.Value
|
||||
if len(vcsRevision) > 12 {
|
||||
vcsRevision = vcsRevision[:12]
|
||||
}
|
||||
}
|
||||
}
|
||||
if vcsRevision != "" {
|
||||
mod.Version = module.PseudoVersion("", "", vcsTime, vcsRevision)
|
||||
}
|
||||
}
|
||||
return mod.Version
|
||||
}
|
||||
@@ -0,0 +1,125 @@
|
||||
// Copyright (c) 2019, Daniel Martí <mvdan@mvdan.cc>
|
||||
// See LICENSE for licensing information
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"net"
|
||||
"net/http"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"github.com/rogpeppe/go-internal/testscript"
|
||||
)
|
||||
|
||||
func TestMain(m *testing.M) {
|
||||
os.Exit(testscript.RunMain(m, map[string]func() int{
|
||||
"xurls": main1,
|
||||
}))
|
||||
}
|
||||
|
||||
func TestScript(t *testing.T) {
|
||||
t.Parallel()
|
||||
testscript.Run(t, testscript.Params{
|
||||
Dir: filepath.Join("testdata", "script"),
|
||||
RequireExplicitExec: true,
|
||||
Setup: func(env *testscript.Env) error {
|
||||
mux := http.NewServeMux()
|
||||
handle := func(method, pattern string, handler func(http.ResponseWriter, *http.Request)) {
|
||||
mux.HandleFunc(pattern, func(w http.ResponseWriter, r *http.Request) {
|
||||
if r.Method != method {
|
||||
t.Errorf("expected all requests to be %q, got %q", method, r.Method)
|
||||
}
|
||||
handler(w, r)
|
||||
})
|
||||
}
|
||||
handle("HEAD", "/plain-head", func(w http.ResponseWriter, r *http.Request) {
|
||||
w.WriteHeader(200)
|
||||
})
|
||||
handle("HEAD", "/redir-1", func(w http.ResponseWriter, r *http.Request) {
|
||||
http.Redirect(w, r, "/plain-head", http.StatusMovedPermanently)
|
||||
})
|
||||
handle("HEAD", "/redir-2", func(w http.ResponseWriter, r *http.Request) {
|
||||
http.Redirect(w, r, "/redir-1", http.StatusMovedPermanently)
|
||||
})
|
||||
|
||||
handle("HEAD", "/redir-longer", func(w http.ResponseWriter, r *http.Request) {
|
||||
http.Redirect(w, r, "/redir-longtarget", http.StatusMovedPermanently)
|
||||
})
|
||||
handle("HEAD", "/redir-longtarget", func(w http.ResponseWriter, r *http.Request) {
|
||||
w.WriteHeader(200)
|
||||
})
|
||||
handle("HEAD", "/redir-fragment", func(w http.ResponseWriter, r *http.Request) {
|
||||
http.Redirect(w, r, "/plain-head#bar", http.StatusMovedPermanently)
|
||||
})
|
||||
|
||||
handle("HEAD", "/redir-301", func(w http.ResponseWriter, r *http.Request) {
|
||||
http.Redirect(w, r, "/plain-head", 301)
|
||||
})
|
||||
handle("HEAD", "/redir-302", func(w http.ResponseWriter, r *http.Request) {
|
||||
http.Redirect(w, r, "/plain-head", 302)
|
||||
})
|
||||
handle("HEAD", "/redir-303", func(w http.ResponseWriter, r *http.Request) {
|
||||
http.Redirect(w, r, "/plain-head", 303)
|
||||
})
|
||||
handle("HEAD", "/redir-307", func(w http.ResponseWriter, r *http.Request) {
|
||||
http.Redirect(w, r, "/plain-head", 307)
|
||||
})
|
||||
handle("HEAD", "/redir-308", func(w http.ResponseWriter, r *http.Request) {
|
||||
http.Redirect(w, r, "/plain-head", 308)
|
||||
})
|
||||
|
||||
handle("HEAD", "/404", func(w http.ResponseWriter, r *http.Request) {
|
||||
http.Error(w, "", 404)
|
||||
})
|
||||
handle("HEAD", "/500", func(w http.ResponseWriter, r *http.Request) {
|
||||
http.Error(w, "", 500)
|
||||
})
|
||||
|
||||
handle("GET", "/plain-get", func(w http.ResponseWriter, r *http.Request) {
|
||||
fmt.Fprintf(w, "plaintext")
|
||||
})
|
||||
mux.HandleFunc("/get-only", func(w http.ResponseWriter, r *http.Request) {
|
||||
if r.Method == "GET" {
|
||||
http.Redirect(w, r, "/plain-get", 301)
|
||||
} else {
|
||||
http.Error(w, "", 405)
|
||||
}
|
||||
})
|
||||
|
||||
ln, err := net.Listen("tcp", ":0")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
server := &http.Server{Handler: mux}
|
||||
go server.Serve(ln)
|
||||
env.Vars = append(env.Vars, "SERVER=http://"+ln.Addr().String())
|
||||
env.Defer(func() {
|
||||
if err := server.Shutdown(context.TODO()); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
})
|
||||
return nil
|
||||
},
|
||||
Cmds: map[string]func(ts *testscript.TestScript, neg bool, args []string){
|
||||
"expand": func(ts *testscript.TestScript, neg bool, args []string) {
|
||||
if neg {
|
||||
ts.Fatalf("unsupported: ! expand")
|
||||
}
|
||||
if len(args) == 0 {
|
||||
ts.Fatalf("usage: expand file...")
|
||||
}
|
||||
for _, arg := range args {
|
||||
data := ts.ReadFile(arg)
|
||||
data = os.Expand(data, ts.Getenv)
|
||||
err := ioutil.WriteFile(ts.MkAbs(arg), []byte(data), 0o666)
|
||||
ts.Check(err)
|
||||
}
|
||||
},
|
||||
},
|
||||
})
|
||||
}
|
||||
@@ -0,0 +1,156 @@
|
||||
// Copyright 2009 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// The code below is borrowed from Go's cmd/gofmt as of 1.18beta1.
|
||||
// We tweaked it slightly to add the "broken URLs" result.
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"context"
|
||||
"go/scanner"
|
||||
"io"
|
||||
|
||||
"golang.org/x/sync/semaphore"
|
||||
)
|
||||
|
||||
// A sequencer performs concurrent tasks that may write output, but emits that
|
||||
// output in a deterministic order.
|
||||
type sequencer struct {
|
||||
maxWeight int64
|
||||
sem *semaphore.Weighted // weighted by input bytes (an approximate proxy for memory overhead)
|
||||
prev <-chan *reporterState // 1-buffered
|
||||
}
|
||||
|
||||
// newSequencer returns a sequencer that allows concurrent tasks up to maxWeight
|
||||
// and writes tasks' output to out and err.
|
||||
func newSequencer(maxWeight int64, out, err io.Writer) *sequencer {
|
||||
sem := semaphore.NewWeighted(maxWeight)
|
||||
prev := make(chan *reporterState, 1)
|
||||
prev <- &reporterState{out: out, err: err}
|
||||
return &sequencer{
|
||||
maxWeight: maxWeight,
|
||||
sem: sem,
|
||||
prev: prev,
|
||||
}
|
||||
}
|
||||
|
||||
// Add blocks until the sequencer has enough weight to spare, then adds f as a
|
||||
// task to be executed concurrently.
|
||||
//
|
||||
// If the weight is either negative or larger than the sequencer's maximum
|
||||
// weight, Add blocks until all other tasks have completed, then the task
|
||||
// executes exclusively (blocking all other calls to Add until it completes).
|
||||
//
|
||||
// f may run concurrently in a goroutine, but its output to the passed-in
|
||||
// reporter will be sequential relative to the other tasks in the sequencer.
|
||||
//
|
||||
// If f invokes a method on the reporter, execution of that method may block
|
||||
// until the previous task has finished. (To maximize concurrency, f should
|
||||
// avoid invoking the reporter until it has finished any parallelizable work.)
|
||||
//
|
||||
// If f returns a non-nil error, that error will be reported after f's output
|
||||
// (if any) and will cause a nonzero final exit code.
|
||||
func (s *sequencer) Add(weight int64, f func(*reporter) error) {
|
||||
if weight < 0 || weight > s.maxWeight {
|
||||
weight = s.maxWeight
|
||||
}
|
||||
if err := s.sem.Acquire(context.TODO(), weight); err != nil {
|
||||
// Change the task from "execute f" to "report err".
|
||||
weight = 0
|
||||
f = func(*reporter) error { return err }
|
||||
}
|
||||
|
||||
r := &reporter{prev: s.prev}
|
||||
next := make(chan *reporterState, 1)
|
||||
s.prev = next
|
||||
|
||||
// Start f in parallel: it can run until it invokes a method on r, at which
|
||||
// point it will block until the previous task releases the output state.
|
||||
go func() {
|
||||
if err := f(r); err != nil {
|
||||
r.Report(err)
|
||||
}
|
||||
next <- r.getState() // Release the next task.
|
||||
s.sem.Release(weight)
|
||||
}()
|
||||
}
|
||||
|
||||
// GetExitCode waits for all previously-added tasks to complete, then returns an
|
||||
// exit code for the sequence suitable for passing to os.Exit.
|
||||
func (s *sequencer) GetExitCode() int {
|
||||
c := make(chan int, 1)
|
||||
s.Add(0, func(r *reporter) error {
|
||||
c <- r.ExitCode()
|
||||
return nil
|
||||
})
|
||||
return <-c
|
||||
}
|
||||
|
||||
func (s *sequencer) finalState() reporterState {
|
||||
c := make(chan reporterState, 1)
|
||||
s.Add(0, func(r *reporter) error {
|
||||
c <- *r.getState()
|
||||
return nil
|
||||
})
|
||||
return <-c
|
||||
}
|
||||
|
||||
// A reporter is one task's handle for emitting output, broken URLs and
// errors.
type reporter struct {
	prev  <-chan *reporterState // delivers the shared state once it is this reporter's turn
	state *reporterState        // nil until received from prev
}

// reporterState is the state passed along from one reporter to the next.
//
// Only one reporter at a time may have access to a reporterState.
type reporterState struct {
	out, err io.Writer
	exitCode int

	brokenURLs []brokenURL
}

// brokenURL pairs a URL with the reason it was considered broken.
type brokenURL struct {
	url    string
	reason string
}

// getState returns the shared state, blocking the first time it is called
// until the state has been sent on prev.
func (r *reporter) getState() *reporterState {
	if r.state != nil {
		return r.state
	}
	r.state = <-r.prev
	return r.state
}

// Write sends p to the output stream.
//
// A write error is returned to the caller and has no effect on the exit code.
func (r *reporter) Write(p []byte) (int, error) {
	st := r.getState()
	return st.out.Write(p)
}

// appendBroken records url as broken, together with the reason.
func (r *reporter) appendBroken(url, reason string) {
	st := r.getState()
	st.brokenURLs = append(st.brokenURLs, brokenURL{url: url, reason: reason})
}

// Report prints err to the error stream and sets the exit code to 2.
// It panics if err is nil.
func (r *reporter) Report(err error) {
	if err == nil {
		panic("Report with nil error")
	}
	shared := r.getState()
	scanner.PrintError(shared.err, err)
	shared.exitCode = 2
}

// ExitCode returns the exit code recorded so far.
func (r *reporter) ExitCode() int {
	st := r.getState()
	return st.exitCode
}
|
||||
@@ -0,0 +1,33 @@
|
||||
stdin input
|
||||
exec xurls
|
||||
stdout 'https://foo.com'
|
||||
! stdout 'bar.com'
|
||||
! stdout 'custom://some-data'
|
||||
! stderr .
|
||||
|
||||
! exec xurls missing
|
||||
! stdout .
|
||||
stderr 'open missing'
|
||||
|
||||
exec xurls input
|
||||
stdout 'https://foo.com'
|
||||
! stdout 'bar.com'
|
||||
! stdout 'custom://some-data'
|
||||
! stderr .
|
||||
|
||||
exec xurls -r input
|
||||
stdout 'https://foo.com'
|
||||
stdout 'bar.com'
|
||||
! stdout 'custom://some-data'
|
||||
! stderr .
|
||||
|
||||
exec xurls -m 'custom://' input
|
||||
! stdout 'https://foo.com'
|
||||
! stdout 'bar.com'
|
||||
stdout 'custom://some-data'
|
||||
! stderr .
|
||||
|
||||
-- input --
|
||||
First, a link with a scheme, https://foo.com.
|
||||
Then, one without a scheme, like bar.com.
|
||||
Also, a link with a custom scheme, custom://some-data.
|
||||
@@ -0,0 +1,120 @@
|
||||
expand nothing
|
||||
cp nothing nothing.orig
|
||||
|
||||
expand redirects
|
||||
expand redirects.golden-auto
|
||||
expand redirects.golden-all
|
||||
cp redirects redirects.orig
|
||||
|
||||
expand broken
|
||||
expand broken.golden
|
||||
cp broken broken.orig
|
||||
|
||||
exec xurls -fix nothing
|
||||
! stdout .
|
||||
! stderr .
|
||||
cmp nothing nothing.orig
|
||||
|
||||
stdin redirects
|
||||
exec xurls -fix
|
||||
cmp stdout redirects.golden-auto
|
||||
cmp redirects redirects.orig
|
||||
! stderr .
|
||||
|
||||
exec xurls -fix redirects
|
||||
stdout '^redirects$'
|
||||
! stderr .
|
||||
cmp redirects redirects.golden-auto
|
||||
cp redirects.orig redirects
|
||||
|
||||
exec xurls -fix=auto redirects
|
||||
cmp redirects redirects.golden-auto
|
||||
cp redirects.orig redirects
|
||||
|
||||
exec xurls -fix=all redirects
|
||||
cmp redirects redirects.golden-all
|
||||
cp redirects.orig redirects
|
||||
|
||||
! exec xurls -fix broken
|
||||
stdout -count=1 '^broken$'
|
||||
stderr -count=1 '5 broken urls'
|
||||
stderr -count=2 '/404 - 404 Not Found'
|
||||
stderr -count=2 '/500 - 500 Internal Server Error'
|
||||
stderr -count=1 'totallydoesnotexist.localhost/ - Head .* dial tcp'
|
||||
cmp broken broken.golden
|
||||
|
||||
-- nothing --
|
||||
No redirect: ${SERVER}/plain-head
|
||||
-- redirects --
|
||||
No redirect: ${SERVER}/plain-head
|
||||
One redirect: ${SERVER}/redir-1
|
||||
Two redirects: ${SERVER}/redir-2
|
||||
Redirect inherits fragment: ${SERVER}/redir-1#foo
|
||||
Redirect replaces fragment: ${SERVER}/redir-fragment#foo
|
||||
|
||||
Three links in one line: ${SERVER}/redir-1 + ${SERVER}//redir-1 + ${SERVER}///redir-1
|
||||
|
||||
Redirect to a longer path ${SERVER}/redir-longer with trailing text
|
||||
|
||||
Permanent redirect codes:
|
||||
* ${SERVER}/redir-301
|
||||
* ${SERVER}/redir-308
|
||||
|
||||
Temporary redirect codes:
|
||||
* ${SERVER}/redir-302
|
||||
* ${SERVER}/redir-303
|
||||
* ${SERVER}/redir-307
|
||||
|
||||
Only GET allowed, HEAD fails: ${SERVER}/get-only
|
||||
-- redirects.golden-auto --
|
||||
No redirect: ${SERVER}/plain-head
|
||||
One redirect: ${SERVER}/plain-head
|
||||
Two redirects: ${SERVER}/plain-head
|
||||
Redirect inherits fragment: ${SERVER}/plain-head#foo
|
||||
Redirect replaces fragment: ${SERVER}/plain-head#bar
|
||||
|
||||
Three links in one line: ${SERVER}/plain-head + ${SERVER}/plain-head + ${SERVER}/plain-head
|
||||
|
||||
Redirect to a longer path ${SERVER}/redir-longtarget with trailing text
|
||||
|
||||
Permanent redirect codes:
|
||||
* ${SERVER}/plain-head
|
||||
* ${SERVER}/plain-head
|
||||
|
||||
Temporary redirect codes:
|
||||
* ${SERVER}/redir-302
|
||||
* ${SERVER}/redir-303
|
||||
* ${SERVER}/redir-307
|
||||
|
||||
Only GET allowed, HEAD fails: ${SERVER}/plain-get
|
||||
-- redirects.golden-all --
|
||||
No redirect: ${SERVER}/plain-head
|
||||
One redirect: ${SERVER}/plain-head
|
||||
Two redirects: ${SERVER}/plain-head
|
||||
Redirect inherits fragment: ${SERVER}/plain-head#foo
|
||||
Redirect replaces fragment: ${SERVER}/plain-head#bar
|
||||
|
||||
Three links in one line: ${SERVER}/plain-head + ${SERVER}/plain-head + ${SERVER}/plain-head
|
||||
|
||||
Redirect to a longer path ${SERVER}/redir-longtarget with trailing text
|
||||
|
||||
Permanent redirect codes:
|
||||
* ${SERVER}/plain-head
|
||||
* ${SERVER}/plain-head
|
||||
|
||||
Temporary redirect codes:
|
||||
* ${SERVER}/plain-head
|
||||
* ${SERVER}/plain-head
|
||||
* ${SERVER}/plain-head
|
||||
|
||||
Only GET allowed, HEAD fails: ${SERVER}/plain-get
|
||||
-- broken --
|
||||
One redirect: ${SERVER}/redir-1
|
||||
404 errors: ${SERVER}/404 ${SERVER}/404
|
||||
500 errors: ${SERVER}/500 ${SERVER}/500
|
||||
Dial error: http://totallydoesnotexist.localhost/
|
||||
-- broken.golden --
|
||||
One redirect: ${SERVER}/plain-head
|
||||
404 errors: ${SERVER}/404 ${SERVER}/404
|
||||
500 errors: ${SERVER}/500 ${SERVER}/500
|
||||
Dial error: http://totallydoesnotexist.localhost/
|
||||
@@ -0,0 +1,11 @@
|
||||
exec xurls -h
|
||||
! stderr 'flag provided but not defined'
|
||||
stderr 'Usage: xurls'
|
||||
! stderr 'help requested' # don't duplicate usage output
|
||||
! stderr '-test\.' # don't show the test binary's usage func
|
||||
|
||||
! exec xurls -r -m="whatever"
|
||||
stderr 'at the same time'
|
||||
|
||||
! exec xurls -m="bad(regexp"
|
||||
stderr 'missing closing \)'
|
||||
@@ -0,0 +1,5 @@
|
||||
# Note that "go test" does not embed vcs information by default.
|
||||
# We copied the code from another project which is tested,
|
||||
# so there's no need to fully test the VCS aspect.
|
||||
exec xurls -version
|
||||
stdout '\(devel\)'
|
||||
@@ -0,0 +1,19 @@
|
||||
// Copyright (c) 2015, Daniel Martí <mvdan@mvdan.cc>
|
||||
// See LICENSE for licensing information
|
||||
|
||||
package xurls_test
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"mvdan.cc/xurls/v2"
|
||||
)
|
||||
|
||||
func Example() {
|
||||
rx := xurls.Relaxed()
|
||||
fmt.Println(rx.FindString("Do gophers live in http://golang.org?"))
|
||||
fmt.Println(rx.FindAllString("foo.com is http://foo.com/.", -1))
|
||||
// Output:
|
||||
// http://golang.org
|
||||
// [foo.com http://foo.com/]
|
||||
}
|
||||
@@ -0,0 +1,74 @@
|
||||
// Copyright (c) 2017, Shreyas Khare <skhare@rapid7.com>
|
||||
// See LICENSE for licensing information
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/csv"
|
||||
"io"
|
||||
"log"
|
||||
"net/http"
|
||||
"os"
|
||||
"strings"
|
||||
"text/template"
|
||||
)
|
||||
|
||||
// path is the name of the file that gets generated.
const path = "schemes.go"

// schemesTmplText is the source of the generated file. It expects a value
// with a Schemes field and writes one back-quoted, tab-indented entry per
// scheme.
const schemesTmplText = `// Generated by schemesgen

package xurls

// Schemes is a sorted list of all IANA assigned schemes.
//
// Source: https://www.iana.org/assignments/uri-schemes/uri-schemes-1.csv
var Schemes = []string{
{{range $scheme := .Schemes}}` + "\t`{{$scheme}}`" + `,
{{end}}}
`

// schemesTmpl is the parsed form of schemesTmplText.
var schemesTmpl = template.Must(template.New("schemes").Parse(schemesTmplText))
|
||||
|
||||
// schemeList downloads the IANA URI scheme registry as CSV and returns the
// first column of every data row, in file order, leaving out rows whose first
// column contains "OBSOLETE".
//
// Any failure ends the program via log.Fatal: the request failing, an HTTP
// error status, a missing header row, or malformed CSV.
func schemeList() []string {
	const source = "https://www.iana.org/assignments/uri-schemes/uri-schemes-1.csv"

	resp, err := http.Get(source)
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()
	// Without this check an error page would be parsed as if it were the
	// registry and end up in the generated file.
	if resp.StatusCode >= 400 {
		log.Fatalf("%s: %s", source, resp.Status)
	}

	r := csv.NewReader(resp.Body)
	// The first row holds the column headers.
	if _, err := r.Read(); err != nil {
		log.Fatalf("%s: reading header: %v", source, err)
	}
	var schemes []string
	for {
		record, err := r.Read()
		if err == io.EOF {
			break
		}
		if err != nil {
			log.Fatal(err)
		}
		if strings.Contains(record[0], "OBSOLETE") {
			continue // skip obsolete schemes; note the scheme column is abused
		}
		schemes = append(schemes, record[0])
	}
	return schemes
}
|
||||
|
||||
func writeSchemes(schemes []string) error {
|
||||
f, err := os.Create(path)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer f.Close()
|
||||
return schemesTmpl.Execute(f, struct {
|
||||
Schemes []string
|
||||
}{
|
||||
Schemes: schemes,
|
||||
})
|
||||
}
|
||||
|
||||
func main() {
|
||||
schemes := schemeList()
|
||||
log.Printf("Generating %s...", path)
|
||||
if err := writeSchemes(schemes); err != nil {
|
||||
log.Fatalf("Could not write path: %v", err)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,111 @@
|
||||
// Copyright (c) 2015, Daniel Martí <mvdan@mvdan.cc>
|
||||
// See LICENSE for licensing information
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log"
|
||||
"net/http"
|
||||
"os"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
"text/template"
|
||||
)
|
||||
|
||||
// path is the name of the file that gets generated.
const path = "tlds.go"

// tldsTmplText is the source of the generated file. It expects a value with
// URLs and TLDs fields: the URLs are listed in the doc comment, and each TLD
// becomes one back-quoted, tab-indented entry.
const tldsTmplText = `// Generated by tldsgen

package xurls

// TLDs is a sorted list of all public top-level domains.
//
// Sources:{{range $_, $url := .URLs}}
// - {{$url}}{{end}}
var TLDs = []string{
{{range $_, $tld := .TLDs}}` + "\t`{{$tld}}`" + `,
{{end}}}
`

// tldsTmpl is the parsed form of tldsTmplText.
var tldsTmpl = template.Must(template.New("tlds").Parse(tldsTmplText))
|
||||
|
||||
// cleanTld lower-cases tld. Names starting with "xn--" are rejected by
// returning the empty string.
func cleanTld(tld string) string {
	lowered := strings.ToLower(tld)
	if !strings.HasPrefix(lowered, "xn--") {
		return lowered
	}
	return ""
}
|
||||
|
||||
func fetchFromURL(wg *sync.WaitGroup, url, pat string, tldSet map[string]bool) {
|
||||
defer wg.Done()
|
||||
log.Printf("Fetching %s", url)
|
||||
resp, err := http.Get(url)
|
||||
if err == nil && resp.StatusCode >= 400 {
|
||||
err = errors.New(resp.Status)
|
||||
}
|
||||
if err != nil {
|
||||
panic(fmt.Errorf("%s: %s", url, err))
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
scanner := bufio.NewScanner(resp.Body)
|
||||
re := regexp.MustCompile(pat)
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
tld := re.FindString(line)
|
||||
tld = cleanTld(tld)
|
||||
if tld == "" {
|
||||
continue
|
||||
}
|
||||
tldSet[tld] = true
|
||||
}
|
||||
if err := scanner.Err(); err != nil {
|
||||
panic(fmt.Errorf("%s: %s", url, err))
|
||||
}
|
||||
}
|
||||
|
||||
func tldList() ([]string, []string) {
|
||||
var urls []string
|
||||
var wg sync.WaitGroup
|
||||
tldSet := make(map[string]bool)
|
||||
fromURL := func(url, pat string) {
|
||||
urls = append(urls, url)
|
||||
wg.Add(1)
|
||||
go fetchFromURL(&wg, url, pat, tldSet)
|
||||
}
|
||||
fromURL("https://data.iana.org/TLD/tlds-alpha-by-domain.txt", `^[^#]+$`)
|
||||
fromURL("https://publicsuffix.org/list/effective_tld_names.dat", `^[^/.]+$`)
|
||||
wg.Wait()
|
||||
|
||||
tlds := make([]string, 0, len(tldSet))
|
||||
for tld := range tldSet {
|
||||
tlds = append(tlds, tld)
|
||||
}
|
||||
|
||||
sort.Strings(tlds)
|
||||
return tlds, urls
|
||||
}
|
||||
|
||||
func writeTlds(tlds, urls []string) error {
|
||||
f, err := os.Create(path)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
defer f.Close()
|
||||
return tldsTmpl.Execute(f, struct {
|
||||
TLDs []string
|
||||
URLs []string
|
||||
}{
|
||||
TLDs: tlds,
|
||||
URLs: urls,
|
||||
})
|
||||
}
|
||||
|
||||
func main() {
|
||||
tlds, urls := tldList()
|
||||
log.Printf("Generating %s...", path)
|
||||
writeTlds(tlds, urls)
|
||||
}
|
||||
@@ -0,0 +1,152 @@
|
||||
// Copyright (c) 2015, Daniel Martí <mvdan@mvdan.cc>
|
||||
// See LICENSE for licensing information
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"log"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
"text/template"
|
||||
"unicode"
|
||||
)
|
||||
|
||||
// path is the name of the Go source file this generator creates; it is
// passed to os.Create as-is, so it is resolved against the working directory.
const path = "unicode.go"
|
||||
|
||||
// tmpl is the template for the generated file. It is executed with a map
// whose "withPunc" and "withoutPunc" entries are already-quoted Go string
// literals, inserted verbatim as the values of the two constants.
var tmpl = template.Must(template.New("tlds").Parse(`// Generated by unicodegen

package xurls

const allowedUcsChar = {{.withPunc}}

const allowedUcsCharMinusPunc = {{.withoutPunc}}
`))
|
||||
|
||||
// visit calls fn once for every code point described by rt: first those of
// the 16-bit ranges, then those of the 32-bit ranges, each range walked from
// Lo to Hi inclusive in steps of Stride.
func visit(rt *unicode.RangeTable, fn func(rune)) {
	// walk enumerates a single range after widening its bounds to rune.
	walk := func(lo, hi, stride rune) {
		for cp := lo; cp <= hi; cp += stride {
			fn(cp)
		}
	}
	for i := range rt.R16 {
		span := rt.R16[i]
		walk(rune(span.Lo), rune(span.Hi), rune(span.Stride))
	}
	for i := range rt.R32 {
		span := rt.R32[i]
		walk(rune(span.Lo), rune(span.Hi), rune(span.Stride))
	}
}
|
||||
|
||||
func writeUnicode() error {
|
||||
// rfc3987Ranges contains the ranges of valid code points specified by RFC 3987.
|
||||
rfc3987Ranges := [][2]rune{
|
||||
{0xA0, 0xD7FF},
|
||||
{0xF900, 0xFDCF},
|
||||
{0xFDF0, 0xFFEF},
|
||||
{0x10000, 0x1FFFD},
|
||||
{0x20000, 0x2FFFD},
|
||||
{0x30000, 0x3FFFD},
|
||||
{0x40000, 0x4FFFD},
|
||||
{0x50000, 0x5FFFD},
|
||||
{0x60000, 0x6FFFD},
|
||||
{0x70000, 0x7FFFD},
|
||||
{0x80000, 0x8FFFD},
|
||||
{0x90000, 0x9FFFD},
|
||||
{0xA0000, 0xAFFFD},
|
||||
{0xB0000, 0xBFFFD},
|
||||
{0xC0000, 0xCFFFD},
|
||||
{0xD0000, 0xDFFFD},
|
||||
{0xE1000, 0xEFFFD},
|
||||
}
|
||||
|
||||
// removeRune accepts a slice of inclusive code point ranges (in ascending order)
|
||||
// and returns a new slice that is equivalent except for excluding a specified rune
|
||||
// by removing/replacing/splitting any range containing it.
|
||||
// Its linear searches over the ranges (including those added by previous invocations)
|
||||
// are inefficient, but acceptable because this code runs only at build time.
|
||||
removeRune := func(ranges [][2]rune, cp rune) [][2]rune {
|
||||
for i, r := range ranges {
|
||||
// Ranges are in ascending order. Skip any that precede `cp`,
|
||||
// and bail out upon reaching one that follows `cp`.
|
||||
if r[1] < cp {
|
||||
continue
|
||||
} else if cp < r[0] {
|
||||
break
|
||||
}
|
||||
|
||||
// `cp` is in this range and must be removed from it.
|
||||
if cp == r[0] && cp == r[1] {
|
||||
// Remove this single-element range.
|
||||
return append(ranges[0:i], ranges[i+1:]...)
|
||||
} else if cp == r[0] {
|
||||
// Remove the first element of this range.
|
||||
newRange := [2]rune{r[0] + 1, r[1]}
|
||||
newTail := append([][2]rune{newRange}, ranges[i+1:]...)
|
||||
return append(ranges[0:i], newTail...)
|
||||
} else if cp == r[1] {
|
||||
// Remove the last element of this range.
|
||||
newRange := [2]rune{r[0], r[1] - 1}
|
||||
newTail := append([][2]rune{newRange}, ranges[i+1:]...)
|
||||
return append(ranges[0:i], newTail...)
|
||||
} else {
|
||||
// Split this range.
|
||||
newTail := append(
|
||||
[][2]rune{
|
||||
{r[0], cp - 1},
|
||||
{cp + 1, r[1]},
|
||||
},
|
||||
ranges[i+1:]...)
|
||||
return append(ranges[0:i], newTail...)
|
||||
}
|
||||
}
|
||||
return ranges
|
||||
}
|
||||
|
||||
// sepFreeRanges excludes separators from rfc3987Ranges.
|
||||
sepFreeRanges := append([][2]rune{}, rfc3987Ranges...)
|
||||
visit(unicode.Z, func(cp rune) {
|
||||
sepFreeRanges = removeRune(sepFreeRanges, cp)
|
||||
})
|
||||
|
||||
// puncFreeRanges excludes punctuation from sepFreeRanges.
|
||||
puncFreeRanges := append([][2]rune{}, sepFreeRanges...)
|
||||
visit(unicode.Po, func(cp rune) {
|
||||
puncFreeRanges = removeRune(puncFreeRanges, cp)
|
||||
})
|
||||
|
||||
// Build the corresponding regular expression character class contents.
|
||||
characterClassContents := func(ranges [][2]rune) strings.Builder {
|
||||
var builder strings.Builder
|
||||
for _, r := range ranges {
|
||||
// regexp.QuoteMeta is not necessary because all metacharacters are ASCII.
|
||||
// cf. https://golang.org/s/re2syntax and
|
||||
// https://cs.opensource.google/go/go/+/refs/tags/go1.17.6:src/regexp/regexp.go;l=721
|
||||
builder.WriteRune(r[0])
|
||||
if r[0] == r[1] {
|
||||
continue
|
||||
}
|
||||
builder.WriteRune('-')
|
||||
builder.WriteRune(r[1])
|
||||
}
|
||||
return builder
|
||||
}
|
||||
allowedUcsChar := characterClassContents(sepFreeRanges)
|
||||
allowedUcsCharMinusPunc := characterClassContents(puncFreeRanges)
|
||||
|
||||
// Write to file.
|
||||
f, err := os.Create(path)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer f.Close()
|
||||
return tmpl.Execute(f, map[string]string{
|
||||
"withPunc": strconv.Quote(allowedUcsChar.String()),
|
||||
"withoutPunc": strconv.Quote(allowedUcsCharMinusPunc.String()),
|
||||
})
|
||||
}
|
||||
|
||||
func main() {
|
||||
log.Printf("Generating %s...", path)
|
||||
if err := writeUnicode(); err != nil {
|
||||
log.Fatalf("Could not write path: %v", err)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,9 @@
|
||||
module mvdan.cc/xurls/v2
|
||||
|
||||
go 1.19
|
||||
|
||||
require (
|
||||
github.com/rogpeppe/go-internal v1.10.0
|
||||
golang.org/x/mod v0.10.0
|
||||
golang.org/x/sync v0.1.0
|
||||
)
|
||||
@@ -0,0 +1,6 @@
|
||||
github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ=
|
||||
github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog=
|
||||
golang.org/x/mod v0.10.0 h1:lFO9qtOdlre5W1jxS3r/4szv2/6iXxScdzjoBMXNhYk=
|
||||
golang.org/x/mod v0.10.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
|
||||
golang.org/x/sync v0.1.0 h1:wsuoTGHzEhffawBOhz5CYhcrV4IdKZbEyZjBMuTp12o=
|
||||
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
@@ -0,0 +1,375 @@
|
||||
// Generated by schemesgen
|
||||
|
||||
package xurls
|
||||
|
||||
// Schemes is a sorted list of all IANA assigned schemes.
|
||||
//
|
||||
// Source: https://www.iana.org/assignments/uri-schemes/uri-schemes-1.csv
|
||||
var Schemes = []string{
|
||||
`aaa`,
|
||||
`aaas`,
|
||||
`about`,
|
||||
`acap`,
|
||||
`acct`,
|
||||
`acd`,
|
||||
`acr`,
|
||||
`adiumxtra`,
|
||||
`adt`,
|
||||
`afp`,
|
||||
`afs`,
|
||||
`aim`,
|
||||
`amss`,
|
||||
`android`,
|
||||
`appdata`,
|
||||
`apt`,
|
||||
`ar`,
|
||||
`ark`,
|
||||
`attachment`,
|
||||
`aw`,
|
||||
`barion`,
|
||||
`bb`,
|
||||
`beshare`,
|
||||
`bitcoin`,
|
||||
`bitcoincash`,
|
||||
`blob`,
|
||||
`bolo`,
|
||||
`browserext`,
|
||||
`cabal`,
|
||||
`calculator`,
|
||||
`callto`,
|
||||
`cap`,
|
||||
`cast`,
|
||||
`casts`,
|
||||
`chrome`,
|
||||
`chrome-extension`,
|
||||
`cid`,
|
||||
`coap`,
|
||||
`coap+tcp`,
|
||||
`coap+ws`,
|
||||
`coaps`,
|
||||
`coaps+tcp`,
|
||||
`coaps+ws`,
|
||||
`com-eventbrite-attendee`,
|
||||
`content`,
|
||||
`content-type`,
|
||||
`crid`,
|
||||
`cstr`,
|
||||
`cvs`,
|
||||
`dab`,
|
||||
`dat`,
|
||||
`data`,
|
||||
`dav`,
|
||||
`diaspora`,
|
||||
`dict`,
|
||||
`did`,
|
||||
`dis`,
|
||||
`dlna-playcontainer`,
|
||||
`dlna-playsingle`,
|
||||
`dns`,
|
||||
`dntp`,
|
||||
`doi`,
|
||||
`dpp`,
|
||||
`drm`,
|
||||
`drop`,
|
||||
`dtmi`,
|
||||
`dtn`,
|
||||
`dvb`,
|
||||
`dvx`,
|
||||
`dweb`,
|
||||
`ed2k`,
|
||||
`eid`,
|
||||
`elsi`,
|
||||
`embedded`,
|
||||
`ens`,
|
||||
`ethereum`,
|
||||
`example`,
|
||||
`facetime`,
|
||||
`fax`,
|
||||
`feed`,
|
||||
`feedready`,
|
||||
`fido`,
|
||||
`file`,
|
||||
`filesystem`,
|
||||
`finger`,
|
||||
`first-run-pen-experience`,
|
||||
`fish`,
|
||||
`fm`,
|
||||
`ftp`,
|
||||
`fuchsia-pkg`,
|
||||
`geo`,
|
||||
`gg`,
|
||||
`git`,
|
||||
`gitoid`,
|
||||
`gizmoproject`,
|
||||
`go`,
|
||||
`gopher`,
|
||||
`graph`,
|
||||
`grd`,
|
||||
`gtalk`,
|
||||
`h323`,
|
||||
`ham`,
|
||||
`hcap`,
|
||||
`hcp`,
|
||||
`http`,
|
||||
`https`,
|
||||
`hxxp`,
|
||||
`hxxps`,
|
||||
`hydrazone`,
|
||||
`hyper`,
|
||||
`iax`,
|
||||
`icap`,
|
||||
`icon`,
|
||||
`im`,
|
||||
`imap`,
|
||||
`info`,
|
||||
`iotdisco`,
|
||||
`ipfs`,
|
||||
`ipn`,
|
||||
`ipns`,
|
||||
`ipp`,
|
||||
`ipps`,
|
||||
`irc`,
|
||||
`irc6`,
|
||||
`ircs`,
|
||||
`iris`,
|
||||
`iris.beep`,
|
||||
`iris.lwz`,
|
||||
`iris.xpc`,
|
||||
`iris.xpcs`,
|
||||
`isostore`,
|
||||
`itms`,
|
||||
`jabber`,
|
||||
`jar`,
|
||||
`jms`,
|
||||
`keyparc`,
|
||||
`lastfm`,
|
||||
`lbry`,
|
||||
`ldap`,
|
||||
`ldaps`,
|
||||
`leaptofrogans`,
|
||||
`lorawan`,
|
||||
`lpa`,
|
||||
`lvlt`,
|
||||
`magnet`,
|
||||
`mailserver`,
|
||||
`mailto`,
|
||||
`maps`,
|
||||
`market`,
|
||||
`matrix`,
|
||||
`message`,
|
||||
`microsoft.windows.camera`,
|
||||
`microsoft.windows.camera.multipicker`,
|
||||
`microsoft.windows.camera.picker`,
|
||||
`mid`,
|
||||
`mms`,
|
||||
`modem`,
|
||||
`mongodb`,
|
||||
`moz`,
|
||||
`ms-access`,
|
||||
`ms-appinstaller`,
|
||||
`ms-browser-extension`,
|
||||
`ms-calculator`,
|
||||
`ms-drive-to`,
|
||||
`ms-enrollment`,
|
||||
`ms-excel`,
|
||||
`ms-eyecontrolspeech`,
|
||||
`ms-gamebarservices`,
|
||||
`ms-gamingoverlay`,
|
||||
`ms-getoffice`,
|
||||
`ms-help`,
|
||||
`ms-infopath`,
|
||||
`ms-inputapp`,
|
||||
`ms-lockscreencomponent-config`,
|
||||
`ms-media-stream-id`,
|
||||
`ms-meetnow`,
|
||||
`ms-mixedrealitycapture`,
|
||||
`ms-mobileplans`,
|
||||
`ms-newsandinterests`,
|
||||
`ms-officeapp`,
|
||||
`ms-people`,
|
||||
`ms-project`,
|
||||
`ms-powerpoint`,
|
||||
`ms-publisher`,
|
||||
`ms-remotedesktop-launch`,
|
||||
`ms-restoretabcompanion`,
|
||||
`ms-screenclip`,
|
||||
`ms-screensketch`,
|
||||
`ms-search`,
|
||||
`ms-search-repair`,
|
||||
`ms-secondary-screen-controller`,
|
||||
`ms-secondary-screen-setup`,
|
||||
`ms-settings`,
|
||||
`ms-settings-airplanemode`,
|
||||
`ms-settings-bluetooth`,
|
||||
`ms-settings-camera`,
|
||||
`ms-settings-cellular`,
|
||||
`ms-settings-cloudstorage`,
|
||||
`ms-settings-connectabledevices`,
|
||||
`ms-settings-displays-topology`,
|
||||
`ms-settings-emailandaccounts`,
|
||||
`ms-settings-language`,
|
||||
`ms-settings-location`,
|
||||
`ms-settings-lock`,
|
||||
`ms-settings-nfctransactions`,
|
||||
`ms-settings-notifications`,
|
||||
`ms-settings-power`,
|
||||
`ms-settings-privacy`,
|
||||
`ms-settings-proximity`,
|
||||
`ms-settings-screenrotation`,
|
||||
`ms-settings-wifi`,
|
||||
`ms-settings-workplace`,
|
||||
`ms-spd`,
|
||||
`ms-stickers`,
|
||||
`ms-sttoverlay`,
|
||||
`ms-transit-to`,
|
||||
`ms-useractivityset`,
|
||||
`ms-virtualtouchpad`,
|
||||
`ms-visio`,
|
||||
`ms-walk-to`,
|
||||
`ms-whiteboard`,
|
||||
`ms-whiteboard-cmd`,
|
||||
`ms-word`,
|
||||
`msnim`,
|
||||
`msrp`,
|
||||
`msrps`,
|
||||
`mss`,
|
||||
`mt`,
|
||||
`mtqp`,
|
||||
`mumble`,
|
||||
`mupdate`,
|
||||
`mvn`,
|
||||
`news`,
|
||||
`nfs`,
|
||||
`ni`,
|
||||
`nih`,
|
||||
`nntp`,
|
||||
`notes`,
|
||||
`num`,
|
||||
`ocf`,
|
||||
`oid`,
|
||||
`onenote`,
|
||||
`onenote-cmd`,
|
||||
`opaquelocktoken`,
|
||||
`openpgp4fpr`,
|
||||
`otpauth`,
|
||||
`p1`,
|
||||
`pack`,
|
||||
`palm`,
|
||||
`paparazzi`,
|
||||
`payment`,
|
||||
`payto`,
|
||||
`pkcs11`,
|
||||
`platform`,
|
||||
`pop`,
|
||||
`pres`,
|
||||
`prospero`,
|
||||
`proxy`,
|
||||
`pwid`,
|
||||
`psyc`,
|
||||
`pttp`,
|
||||
`qb`,
|
||||
`query`,
|
||||
`quic-transport`,
|
||||
`redis`,
|
||||
`rediss`,
|
||||
`reload`,
|
||||
`res`,
|
||||
`resource`,
|
||||
`rmi`,
|
||||
`rsync`,
|
||||
`rtmfp`,
|
||||
`rtmp`,
|
||||
`rtsp`,
|
||||
`rtsps`,
|
||||
`rtspu`,
|
||||
`sarif`,
|
||||
`secondlife`,
|
||||
`secret-token`,
|
||||
`service`,
|
||||
`session`,
|
||||
`sftp`,
|
||||
`sgn`,
|
||||
`shc`,
|
||||
`sieve`,
|
||||
`simpleledger`,
|
||||
`simplex`,
|
||||
`sip`,
|
||||
`sips`,
|
||||
`skype`,
|
||||
`smb`,
|
||||
`smp`,
|
||||
`sms`,
|
||||
`smtp`,
|
||||
`snews`,
|
||||
`snmp`,
|
||||
`soap.beep`,
|
||||
`soap.beeps`,
|
||||
`soldat`,
|
||||
`spiffe`,
|
||||
`spotify`,
|
||||
`ssb`,
|
||||
`ssh`,
|
||||
`starknet`,
|
||||
`steam`,
|
||||
`stun`,
|
||||
`stuns`,
|
||||
`submit`,
|
||||
`svn`,
|
||||
`swh`,
|
||||
`swid`,
|
||||
`swidpath`,
|
||||
`tag`,
|
||||
`taler`,
|
||||
`teamspeak`,
|
||||
`tel`,
|
||||
`teliaeid`,
|
||||
`telnet`,
|
||||
`tftp`,
|
||||
`things`,
|
||||
`thismessage`,
|
||||
`tip`,
|
||||
`tn3270`,
|
||||
`tool`,
|
||||
`turn`,
|
||||
`turns`,
|
||||
`tv`,
|
||||
`udp`,
|
||||
`unreal`,
|
||||
`upt`,
|
||||
`urn`,
|
||||
`ut2004`,
|
||||
`uuid-in-package`,
|
||||
`v-event`,
|
||||
`vemmi`,
|
||||
`ventrilo`,
|
||||
`ves`,
|
||||
`videotex`,
|
||||
`vnc`,
|
||||
`view-source`,
|
||||
`vscode`,
|
||||
`vscode-insiders`,
|
||||
`vsls`,
|
||||
`w3`,
|
||||
`wais`,
|
||||
`web3`,
|
||||
`wcr`,
|
||||
`webcal`,
|
||||
`web+ap`,
|
||||
`wifi`,
|
||||
`wpid`,
|
||||
`ws`,
|
||||
`wss`,
|
||||
`wtai`,
|
||||
`wyciwyg`,
|
||||
`xcon`,
|
||||
`xcon-userid`,
|
||||
`xfire`,
|
||||
`xmlrpc.beep`,
|
||||
`xmlrpc.beeps`,
|
||||
`xmpp`,
|
||||
`xri`,
|
||||
`ymsgr`,
|
||||
`z39.50`,
|
||||
`z39.50r`,
|
||||
`z39.50s`,
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,24 @@
|
||||
// Copyright (c) 2015, Daniel Martí <mvdan@mvdan.cc>
|
||||
// See LICENSE for licensing information
|
||||
|
||||
package xurls
|
||||
|
||||
// PseudoTLDs is a sorted list of some widely used unofficial TLDs.
|
||||
//
|
||||
// Sources:
|
||||
// - https://en.wikipedia.org/wiki/Pseudo-top-level_domain
|
||||
// - https://en.wikipedia.org/wiki/Category:Pseudo-top-level_domains
|
||||
// - https://tools.ietf.org/html/draft-grothoff-iesg-special-use-p2p-names-00
|
||||
// - https://www.iana.org/assignments/special-use-domain-names/special-use-domain-names.xhtml
|
||||
var PseudoTLDs = []string{
|
||||
`bit`, // Namecoin
|
||||
`example`, // Example domain
|
||||
`exit`, // Tor exit node
|
||||
`gnu`, // GNS by public key
|
||||
`i2p`, // I2P network
|
||||
`invalid`, // Invalid domain
|
||||
`local`, // Local network
|
||||
`localhost`, // Local network
|
||||
`test`, // Test domain
|
||||
`zkey`, // GNS domain name
|
||||
}
|
||||
@@ -0,0 +1,7 @@
|
||||
// Generated by unicodegen
|
||||
|
||||
package xurls
|
||||
|
||||
const allowedUcsChar = "¡-ᙿᚁ-\u1fff\u200b-‧\u202a-\u202e‰-⁞\u2060-\u2fff、-\ud7ff豈-\ufdcfﷰ-\uffef𐀀-\U0001fffd𠀀-\U0002fffd𰀀-\U0003fffd\U00040000-\U0004fffd\U00050000-\U0005fffd\U00060000-\U0006fffd\U00070000-\U0007fffd\U00080000-\U0008fffd\U00090000-\U0009fffd\U000a0000-\U000afffd\U000b0000-\U000bfffd\U000c0000-\U000cfffd\U000d0000-\U000dfffd\U000e1000-\U000efffd"
|
||||
|
||||
const allowedUcsCharMinusPunc = "¢-¦¨-µ¸-¾À-ͽͿ-ΆΈ-ՙՠ-ֈ֊-ֿׁ-ׂׄ-ׇׅ-ײ\u05f5-؈؋؎-ؚ\u061c-\u061dؠ-٩ٮ-ۓە-ۿ\u070e-߶ߺ-\u082f\u083f-\u085d\u085f-ॣ०-९ॱ-ৼ৾-ੵ\u0a77-૯૱-\u0c76౸-ಃಅ-ෳ\u0df5-๎๐-๙\u0e5c-༃༓༕-྄྆-࿏࿕-࿘\u0fdb-၉ၐ-ჺჼ-፟፩-᙭ᙯ-ᙿᚁ-ᛪᛮ-᜴\u1737-៓ៗ៛-\u17ff᠆᠋-\u1943᥆-\u1a1dᨠ-\u1a9fᪧ\u1aae-᭙᭡-\u1bfbᰀ-\u1c3a᱀-ᱽᲀ-Ჿ\u1cc8-᳔᳒-\u1fff\u200b-―‘-‟\u202a-\u202e‹-›‿-⁀⁄-⁆⁒⁔\u2060-\u2cf8⳽ⴀ-ⵯ\u2d71-ⷿ⸂-⸅⸉-⸊⸌-⸍⸗⸚⸜-⸝⸠-⸩ⸯ⸺-⸻⹀⹂⹐-⹑\u2e53-\u2fff〄-〼〾-ヺー-ꓽꔀ-ꘌꘐ-꙲ꙴ-꙽ꙿ-꛱\ua6f8-ꡳ\ua878-\ua8cd꣐-ꣷꣻꣽ-꤭ꤰ-\ua95eꥠ-꧀\ua9ce-\ua9ddꧠ-\uaa5bꩠ-ꫝꫠ-ꫯꫲ-ꯪ꯬-\ud7ff豈-\ufdcfﷰ-️︗-︘\ufe1a-︯︱-﹄﹇-﹈﹍-﹏\ufe53﹘-﹞﹢-\ufe67﹩\ufe6c-\uff00$(-)+-0-9<->A-[]-⦆「-」ヲ-\uffef𐀀-\U000100ff\U00010103-\U0001039e𐎠-𐏏𐏑-\U0001056e\U00010570-\U00010856𐡘-\U0001091e𐤠-\U0001093e\U00010940-\U00010a4f\U00010a59-𐩾𐪀-𐫯\U00010af7-\U00010b38𐭀-\U00010b98\U00010b9d-𐽔\U00010f5a-𑁆\U0001104e-𑂺\U000110bd\U000110c2-𑄿𑅄-𑅳𑅶-𑇄𑇉-𑇌𑇎-𑇚𑇜\U000111e0-𑈷𑈾-𑊨\U000112aa-𑑊𑑐-𑑙\U0001145c𑑞-𑓅𑓇-𑗀𑗘-𑙀𑙄-\U0001165f\U0001166d-𑜻𑜿-𑠺\U0001183c-𑥃\U00011947-𑧡𑧣-𑨾𑩇-𑪙𑪝\U00011aa3-𑱀\U00011c46-\U00011c6f𑱲-𑻶\U00011ef9-\U00011ffe𒀀-\U0001246f\U00012475-\U00016a6d\U00016a70-𖫴\U00016af6-𖬶𖬼-𖭃𖭅-𖺖\U00016e9b-𖿡𖿣-𛲞\U0001bca0-𝪆\U0001da8c-\U0001e95d\U0001e960-\U0001fffd𠀀-\U0002fffd𰀀-\U0003fffd\U00040000-\U0004fffd\U00050000-\U0005fffd\U00060000-\U0006fffd\U00070000-\U0007fffd\U00080000-\U0008fffd\U00090000-\U0009fffd\U000a0000-\U000afffd\U000b0000-\U000bfffd\U000c0000-\U000cfffd\U000d0000-\U000dfffd\U000e1000-\U000efffd"
|
||||
@@ -0,0 +1,200 @@
|
||||
// Copyright (c) 2015, Daniel Martí <mvdan@mvdan.cc>
|
||||
// See LICENSE for licensing information
|
||||
|
||||
// Package xurls extracts urls from plain text using regular expressions.
|
||||
package xurls
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
"strings"
|
||||
"sync"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
//go:generate go run ./generate/tldsgen
|
||||
//go:generate go run ./generate/schemesgen
|
||||
//go:generate go run ./generate/unicodegen
|
||||
|
||||
const (
|
||||
// pathCont is based on https://www.rfc-editor.org/rfc/rfc3987#section-2.2
|
||||
// but does not match separators anywhere or most punctuation in final position,
|
||||
// to avoid creating asymmetries like
|
||||
// `Did you know that **<a href="...">https://example.com/**</a> is reserved for documentation?`
|
||||
// from `Did you know that **https://example.com/** is reserved for documentation?`.
|
||||
unreservedChar = `a-zA-Z0-9\-._~`
|
||||
endUnreservedChar = `a-zA-Z0-9\-_~`
|
||||
midSubDelimChar = `!$&'*+,;=`
|
||||
endSubDelimChar = `$&+=`
|
||||
midIPathSegmentChar = unreservedChar + `%` + midSubDelimChar + `:@` + allowedUcsChar
|
||||
endIPathSegmentChar = endUnreservedChar + `%` + endSubDelimChar + allowedUcsCharMinusPunc
|
||||
iPrivateChar = `\x{E000}-\x{F8FF}\x{F0000}-\x{FFFFD}\x{100000}-\x{10FFFD}`
|
||||
midIChar = `/?#\\` + midIPathSegmentChar + iPrivateChar
|
||||
endIChar = `/#` + endIPathSegmentChar + iPrivateChar
|
||||
wellParen = `\((?:[` + midIChar + `]|\([` + midIChar + `]*\))*\)`
|
||||
wellBrack = `\[(?:[` + midIChar + `]|\[[` + midIChar + `]*\])*\]`
|
||||
wellBrace = `\{(?:[` + midIChar + `]|\{[` + midIChar + `]*\})*\}`
|
||||
wellAll = wellParen + `|` + wellBrack + `|` + wellBrace
|
||||
pathCont = `(?:[` + midIChar + `]*(?:` + wellAll + `|[` + endIChar + `]))+`
|
||||
|
||||
letter = `\p{L}`
|
||||
mark = `\p{M}`
|
||||
number = `\p{N}`
|
||||
iriChar = letter + mark + number
|
||||
iri = `[` + iriChar + `](?:[` + iriChar + `\-]*[` + iriChar + `])?`
|
||||
subdomain = `(?:` + iri + `\.)+`
|
||||
octet = `(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])`
|
||||
ipv4Addr = octet + `\.` + octet + `\.` + octet + `\.` + octet
|
||||
|
||||
// ipv6Addr is based on https://datatracker.ietf.org/doc/html/rfc4291#section-2.2
|
||||
// with a specific alternative for each valid count of leading 16-bit hexadecimal "chomps"
|
||||
// that have not been replaced with a `::` elision.
|
||||
h4 = `[0-9a-fA-F]{1,4}`
|
||||
ipv6AddrMinusEmpty = `(?:` +
|
||||
// 7 colon-terminated chomps, followed by a final chomp or the rest of an elision.
|
||||
`(?:` + h4 + `:){7}(?:` + h4 + `|:)|` +
|
||||
// 6 chomps, followed by an IPv4 address or elision with final chomp or final elision.
|
||||
`(?:` + h4 + `:){6}(?:` + ipv4Addr + `|:` + h4 + `|:)|` +
|
||||
// 5 chomps, followed by an elision with optional IPv4 or up to 2 final chomps.
|
||||
`(?:` + h4 + `:){5}(?::` + ipv4Addr + `|(?::` + h4 + `){1,2}|:)|` +
|
||||
// 4 chomps, followed by an elision with optional IPv4 (optionally preceded by a chomp) or
|
||||
// up to 3 final chomps.
|
||||
`(?:` + h4 + `:){4}(?:(?::` + h4 + `){0,1}:` + ipv4Addr + `|(?::` + h4 + `){1,3}|:)|` +
|
||||
// 3 chomps, followed by an elision with optional IPv4 (preceded by up to 2 chomps) or
|
||||
// up to 4 final chomps.
|
||||
`(?:` + h4 + `:){3}(?:(?::` + h4 + `){0,2}:` + ipv4Addr + `|(?::` + h4 + `){1,4}|:)|` +
|
||||
// 2 chomps, followed by an elision with optional IPv4 (preceded by up to 3 chomps) or
|
||||
// up to 5 final chomps.
|
||||
`(?:` + h4 + `:){2}(?:(?::` + h4 + `){0,3}:` + ipv4Addr + `|(?::` + h4 + `){1,5}|:)|` +
|
||||
// 1 chomp, followed by an elision with optional IPv4 (preceded by up to 4 chomps) or
|
||||
// up to 6 final chomps.
|
||||
`(?:` + h4 + `:){1}(?:(?::` + h4 + `){0,4}:` + ipv4Addr + `|(?::` + h4 + `){1,6}|:)|` +
|
||||
// elision, followed by optional IPv4 (preceded by up to 5 chomps) or
|
||||
// up to 7 final chomps.
|
||||
// `:` is an intentionally omitted alternative, to avoid matching `::`.
|
||||
`:(?:(?::` + h4 + `){0,5}:` + ipv4Addr + `|(?::` + h4 + `){1,7})` +
|
||||
`)`
|
||||
ipv6Addr = `(?:` + ipv6AddrMinusEmpty + `|::)`
|
||||
ipAddrMinusEmpty = `(?:` + ipv6AddrMinusEmpty + `|\b` + ipv4Addr + `\b)`
|
||||
port = `(?::[0-9]*)?`
|
||||
)
|
||||
|
||||
// AnyScheme can be passed to StrictMatchingScheme to match any possibly valid
|
||||
// scheme, and not just the known ones.
|
||||
var AnyScheme = `(?:[a-zA-Z][a-zA-Z.\-+]*://|` + anyOf(SchemesNoAuthority...) + `:)`
|
||||
|
||||
// SchemesNoAuthority is a sorted list of some well-known url schemes that are
|
||||
// followed by ":" instead of "://". The list includes both officially
|
||||
// registered and unofficial schemes.
|
||||
var SchemesNoAuthority = []string{
|
||||
`bitcoin`, // Bitcoin
|
||||
`cid`, // Content-ID
|
||||
`file`, // Files
|
||||
`magnet`, // Torrent magnets
|
||||
`mailto`, // Mail
|
||||
`mid`, // Message-ID
|
||||
`sms`, // SMS
|
||||
`tel`, // Telephone
|
||||
`xmpp`, // XMPP
|
||||
}
|
||||
|
||||
// SchemesUnofficial is a sorted list of some well-known url schemes which
|
||||
// aren't officially registered just yet. They tend to correspond to software.
|
||||
//
|
||||
// Mostly collected from https://en.wikipedia.org/wiki/List_of_URI_schemes#Unofficial_but_common_URI_schemes.
|
||||
var SchemesUnofficial = []string{
|
||||
`gemini`, // gemini
|
||||
`jdbc`, // Java database Connectivity
|
||||
`moz-extension`, // Firefox extension
|
||||
`postgres`, // PostgreSQL (short form)
|
||||
`postgresql`, // PostgreSQL
|
||||
`slack`, // Slack
|
||||
`zoommtg`, // Zoom (desktop)
|
||||
`zoomus`, // Zoom (mobile)
|
||||
}
|
||||
|
||||
// The regular expressions are compiled when the API is first called.
|
||||
// Any subsequent calls will use the same regular expression pointers.
|
||||
//
|
||||
// We do not need to make a copy of them for each API call,
|
||||
// as Copy is now only useful if one copy calls Longest but not another,
|
||||
// and we always call Longest after compiling the regular expression.
|
||||
var (
|
||||
strictRe *regexp.Regexp
|
||||
strictInit sync.Once
|
||||
|
||||
relaxedRe *regexp.Regexp
|
||||
relaxedInit sync.Once
|
||||
)
|
||||
|
||||
// anyOf returns a non-capturing regular expression group that matches any
// one of strs literally. Each string is escaped with regexp.QuoteMeta and
// the alternatives are joined with "|"; with no arguments the result is
// the empty group "(?:)".
func anyOf(strs ...string) string {
	quoted := make([]string, len(strs))
	for i := range strs {
		quoted[i] = regexp.QuoteMeta(strs[i])
	}
	return "(?:" + strings.Join(quoted, "|") + ")"
}
|
||||
|
||||
func strictExp() string {
|
||||
schemes := `(?:(?i)(?:` + anyOf(Schemes...) + `|` + anyOf(SchemesUnofficial...) + `)://|` + anyOf(SchemesNoAuthority...) + `:)`
|
||||
return schemes + pathCont
|
||||
}
|
||||
|
||||
func relaxedExp() string {
|
||||
var asciiTLDs, unicodeTLDs []string
|
||||
for i, tld := range TLDs {
|
||||
if tld[0] >= utf8.RuneSelf {
|
||||
asciiTLDs = TLDs[:i:i]
|
||||
unicodeTLDs = TLDs[i:]
|
||||
break
|
||||
}
|
||||
}
|
||||
punycode := `xn--[a-z0-9-]+`
|
||||
|
||||
// Use \b to make sure ASCII TLDs are immediately followed by a word break.
|
||||
// We can't do that with unicode TLDs, as they don't see following
|
||||
// whitespace as a word break.
|
||||
tlds := `(?:(?i)` + punycode + `|` + anyOf(append(asciiTLDs, PseudoTLDs...)...) + `\b|` + anyOf(unicodeTLDs...) + `)`
|
||||
domain := subdomain + tlds
|
||||
|
||||
hostName := `(?:` + domain + `|\[` + ipv6Addr + `\]|\b` + ipv4Addr + `\b)`
|
||||
webURL := hostName + port + `(?:/` + pathCont + `|/)?`
|
||||
email := `[a-zA-Z0-9._%\-+]+@` + domain
|
||||
return strictExp() + `|` + webURL + `|` + email + `|` + ipv6AddrMinusEmpty
|
||||
}
|
||||
|
||||
// Strict produces a regexp that matches any URL with a scheme in the
// Schemes, SchemesUnofficial or SchemesNoAuthority lists.
//
// The expression is compiled on the first call and set to leftmost-longest
// matching; every call returns the same *regexp.Regexp.
func Strict() *regexp.Regexp {
	strictInit.Do(func() {
		strictRe = regexp.MustCompile(strictExp())
		strictRe.Longest()
	})
	return strictRe
}
|
||||
|
||||
// Relaxed produces a regexp that matches any URL matched by Strict, plus any
// URL with no scheme, any email address, and any bare (non-empty) IPv6
// address.
//
// The expression is compiled on the first call and set to leftmost-longest
// matching; every call returns the same *regexp.Regexp.
func Relaxed() *regexp.Regexp {
	relaxedInit.Do(func() {
		relaxedRe = regexp.MustCompile(relaxedExp())
		relaxedRe.Longest()
	})
	return relaxedRe
}
|
||||
|
||||
// StrictMatchingScheme produces a regexp similar to Strict, but requiring that
|
||||
// the scheme match the given regular expression. See AnyScheme too.
|
||||
func StrictMatchingScheme(exp string) (*regexp.Regexp, error) {
|
||||
strictMatching := `(?i)(?:` + exp + `)(?-i)` + pathCont
|
||||
re, err := regexp.Compile(strictMatching)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
re.Longest()
|
||||
return re, nil
|
||||
}
|
||||
@@ -0,0 +1,469 @@
|
||||
// Copyright (c) 2015, Daniel Martí <mvdan@mvdan.cc>
|
||||
// See LICENSE for licensing information
|
||||
|
||||
package xurls
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"regexp"
|
||||
"sync"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// testCase pairs an input text with the match expected from FindString.
//
// want is interpreted by wantStr: a string is the exact expected match, the
// bool true means the whole of in is expected to match, and any other value
// (such as nil or false) means no match is expected.
type testCase struct {
	in   string
	want interface{}
}
|
||||
|
||||
// wantStr turns a test expectation into the string FindString should return:
// a string is used as-is, the bool true stands for the whole input in, and
// everything else (nil, false, other types) stands for "no match", i.e. "".
func wantStr(in string, want interface{}) string {
	if s, isString := want.(string); isString {
		return s
	}
	if matchAll, isBool := want.(bool); isBool && matchAll {
		return in
	}
	return ""
}
|
||||
|
||||
func doTest(t *testing.T, name string, re *regexp.Regexp, cases []testCase) {
|
||||
for i, c := range cases {
|
||||
t.Run(fmt.Sprintf("%s/%03d", name, i), func(t *testing.T) {
|
||||
want := wantStr(c.in, c.want)
|
||||
for _, surround := range []string{"", "\n"} {
|
||||
in := surround + c.in + surround
|
||||
got := re.FindString(in)
|
||||
if got != want {
|
||||
t.Errorf(`FindString(%q) got %q, want %q`, in, got, want)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
var constantTestCases = []testCase{
|
||||
{``, nil},
|
||||
{` `, nil},
|
||||
{`:`, nil},
|
||||
{`::`, nil},
|
||||
{`:::`, nil},
|
||||
{`::::`, nil},
|
||||
{`.`, nil},
|
||||
{`..`, nil},
|
||||
{`...`, nil},
|
||||
{`1.1`, nil},
|
||||
{`.1.`, nil},
|
||||
{`1.1.1`, nil},
|
||||
{`1:1`, nil},
|
||||
{`:1:`, nil},
|
||||
{`1:1:1`, nil},
|
||||
{`://`, nil},
|
||||
{`foo`, nil},
|
||||
{`foo:`, nil},
|
||||
{`mailto:`, nil},
|
||||
{`foo://`, nil},
|
||||
{`http://`, nil},
|
||||
{`http:// foo`, nil},
|
||||
{`http:// foo`, nil},
|
||||
{`:foo`, nil},
|
||||
{`://foo`, nil},
|
||||
{`foorandom:bar`, nil},
|
||||
{`foo.randombar`, nil},
|
||||
{`zzz.`, nil},
|
||||
{`.zzz`, nil},
|
||||
{`zzz.zzz`, nil},
|
||||
{`/some/path`, nil},
|
||||
{`rel/path`, nil},
|
||||
{`localhost`, nil},
|
||||
{`com`, nil},
|
||||
{`.com`, nil},
|
||||
{`com.`, nil},
|
||||
{`http`, nil},
|
||||
|
||||
{`http://foo`, true},
|
||||
{`http://FOO`, true},
|
||||
{`http://FAÀ`, true},
|
||||
{`https://localhost`, true},
|
||||
{`mailto:foo`, true},
|
||||
{`MAILTO:foo`, true},
|
||||
{`sms:123`, true},
|
||||
{`xmpp:foo@bar`, true},
|
||||
{`bitcoin:Addr23?amount=1&message=foo`, true},
|
||||
{`cid:foo-32x32.v2_fe0f1423.png`, true},
|
||||
{`mid:960830.1639@XIson.com`, true},
|
||||
{`http://foo.com`, true},
|
||||
{`http://foo.co.uk`, true},
|
||||
{`http://foo.random`, true},
|
||||
{` http://foo.com/bar `, `http://foo.com/bar`},
|
||||
{` http://foo.com/bar more`, `http://foo.com/bar`},
|
||||
{`<http://foo.com/bar>`, `http://foo.com/bar`},
|
||||
{`<http://foo.com/bar>more`, `http://foo.com/bar`},
|
||||
{`.http://foo.com/bar.`, `http://foo.com/bar`},
|
||||
{`.http://foo.com/bar.more`, `http://foo.com/bar.more`},
|
||||
{`,http://foo.com/bar,`, `http://foo.com/bar`},
|
||||
{`,http://foo.com/bar,more`, `http://foo.com/bar,more`},
|
||||
{`*http://foo.com/bar*`, `http://foo.com/bar`},
|
||||
{`*http://foo.com/bar*more`, `http://foo.com/bar*more`},
|
||||
{`_http://foo.com/bar_`, `http://foo.com/bar_`},
|
||||
{`_http://foo.com/bar_more`, `http://foo.com/bar_more`},
|
||||
{`(http://foo.com/bar)`, `http://foo.com/bar`},
|
||||
{`(http://foo.com/bar)more`, `http://foo.com/bar`},
|
||||
{`[http://foo.com/bar]`, `http://foo.com/bar`},
|
||||
{`[http://foo.com/bar]more`, `http://foo.com/bar`},
|
||||
{`'http://foo.com/bar'`, `http://foo.com/bar`},
|
||||
{`'http://foo.com/bar'more`, `http://foo.com/bar'more`},
|
||||
{`"http://foo.com/bar"`, `http://foo.com/bar`},
|
||||
{`"http://foo.com/bar"more`, `http://foo.com/bar`},
|
||||
{`{"url":"http://foo.com/bar"}`, `http://foo.com/bar`},
|
||||
{`{"before":"foo","url":"http://foo.com/bar","after":"bar"}`, `http://foo.com/bar`},
|
||||
{`http://a.b/a0/-+_&~*%=#@.,:;'?![]()a`, true},
|
||||
{`http://a.b/a0/$€¥`, true},
|
||||
{`http://✪foo.bar/pa✪th©more`, true},
|
||||
{`http://foo.bar/path/`, true},
|
||||
{`http://foo.bar/path-`, true},
|
||||
{`http://foo.bar/path+`, true},
|
||||
{`http://foo.bar/path&`, true},
|
||||
{`http://foo.bar/path~`, true},
|
||||
{`http://foo.bar/path%`, true},
|
||||
{`http://foo.bar/path=`, true},
|
||||
{`http://foo.bar/path#`, true},
|
||||
{`http://foo.bar/path.`, `http://foo.bar/path`},
|
||||
{`http://foo.bar/path,`, `http://foo.bar/path`},
|
||||
{`http://foo.bar/path:`, `http://foo.bar/path`},
|
||||
{`http://foo.bar/path;`, `http://foo.bar/path`},
|
||||
{`http://foo.bar/path'`, `http://foo.bar/path`},
|
||||
{`http://foo.bar/path?`, `http://foo.bar/path`},
|
||||
{`http://foo.bar/path!`, `http://foo.bar/path`},
|
||||
{`http://foo.bar/path@`, `http://foo.bar/path`},
|
||||
{`http://foo.bar/path|`, `http://foo.bar/path`},
|
||||
{`http://foo.bar/path|more`, `http://foo.bar/path`},
|
||||
{`http://foo.bar/path<`, `http://foo.bar/path`},
|
||||
{`http://foo.bar/path<more`, `http://foo.bar/path`},
|
||||
{`http://foo.com/path_(more)`, true},
|
||||
{`(http://foo.com/path_(more))`, `http://foo.com/path_(more)`},
|
||||
{`http://foo.com/path_(even)-(more)`, true},
|
||||
{`http://foo.com/path_(even)(more)`, true},
|
||||
{`http://foo.com/path_(even_(nested))`, true},
|
||||
{`(http://foo.com/path_(even_(nested)))`, `http://foo.com/path_(even_(nested))`},
|
||||
{`http://foo.com/path_[more]`, true},
|
||||
{`[http://foo.com/path_[more]]`, `http://foo.com/path_[more]`},
|
||||
{`http://foo.com/path_[even]-[more]`, true},
|
||||
{`http://foo.com/path_[even][more]`, true},
|
||||
{`http://foo.com/path_[even_[nested]]`, true},
|
||||
{`[http://foo.com/path_[even_[nested]]]`, `http://foo.com/path_[even_[nested]]`},
|
||||
{`http://foo.com/path_{more}`, true},
|
||||
{`{http://foo.com/path_{more}}`, `http://foo.com/path_{more}`},
|
||||
{`http://foo.com/path_{even}-{more}`, true},
|
||||
{`http://foo.com/path_{even}{more}`, true},
|
||||
{`http://foo.com/path_{even_{nested}}`, true},
|
||||
{`{http://foo.com/path_{even_{nested}}}`, `http://foo.com/path_{even_{nested}}`},
|
||||
{`http://foo.com/path#fragment`, true},
|
||||
{`http://foo.com/emptyfrag#`, true},
|
||||
{`http://foo.com/spaced%20path`, true},
|
||||
{`http://foo.com/?p=spaced%20param`, true},
|
||||
{`http://test.foo.com/`, true},
|
||||
{`http://foo.com/path`, true},
|
||||
{`http://foo.com:8080/path`, true},
|
||||
{`http://1.1.1.1/path`, true},
|
||||
{`http://1.1.1.1:8080/path`, true},
|
||||
{`http://[1080::8:800:200c:417a]/path`, true},
|
||||
{`http://[1080::8:800:200c:417a]:8080/path`, true},
|
||||
|
||||
// scheme://IPv6_addr is not valid per RFC 3987, but is supported anyway (for now).
|
||||
{`http://1080::8:800:200c:417a/path`, true},
|
||||
{`http://2001.db8:0/path`, true},
|
||||
|
||||
{`http://中国.中国/中国`, true},
|
||||
{`http://中国.中国/foo中国`, true},
|
||||
{`http://उदाहरण.परीकषा`, true},
|
||||
{`http://xn-foo.xn--p1acf/path`, true},
|
||||
{`what is http://foo.com?`, `http://foo.com`},
|
||||
{`go visit http://foo.com/path.`, `http://foo.com/path`},
|
||||
{`go visit http://foo.com/path...`, `http://foo.com/path`},
|
||||
{`what is http://foo.com/path?`, `http://foo.com/path`},
|
||||
{`the http://foo.com!`, `http://foo.com`},
|
||||
{`https://test.foo.bar/path?a=b`, `https://test.foo.bar/path?a=b`},
|
||||
{`ftp://user@foo.bar`, true},
|
||||
{`http://foo.com/base64-bCBwbGVhcw==`, true},
|
||||
{`http://foo.com/–`, true},
|
||||
{`http://foo.com/🐼`, true},
|
||||
{`https://shmibbles.me/tmp/自殺でも?.png`, true},
|
||||
{`randomtexthttp://foo.bar/etc`, "http://foo.bar/etc"},
|
||||
{`postgres://user:pass@host.com:5432/path?k=v#f`, true},
|
||||
{`postgres://user:pass@host.com:5432/path?k=v#f`, true},
|
||||
{`zoommtg://zoom.us/join?confno=1234&pwd=xxx`, true},
|
||||
{`zoomus://zoom.us/join?confno=1234&pwd=xxx`, true},
|
||||
}
|
||||
|
||||
func TestRegexes(t *testing.T) {
|
||||
doTest(t, "Relaxed", Relaxed(), constantTestCases)
|
||||
doTest(t, "Strict", Strict(), constantTestCases)
|
||||
doTest(t, "Relaxed2", Relaxed(), []testCase{
|
||||
{`foo.a`, nil},
|
||||
{`foo.com`, true},
|
||||
{`foo.com bar.com`, `foo.com`},
|
||||
{`foo.com-foo`, `foo.com`},
|
||||
{`foo.company`, true},
|
||||
{`foo.comrandom`, nil},
|
||||
{`some.guy`, nil},
|
||||
{`foo.example`, true},
|
||||
{`foo.i2p`, true},
|
||||
{`foo.local`, true},
|
||||
{`foo.onion`, true},
|
||||
{`中国.中国`, true},
|
||||
{`中国.中国/foo中国`, true},
|
||||
{`test.联通`, true},
|
||||
{`test.联通 extra`, `test.联通`},
|
||||
{`test.xn--8y0a063a`, true},
|
||||
{`test.xn--8y0a063a/foobar`, true},
|
||||
{`test.xn-foo`, nil},
|
||||
{`test.xn--`, nil},
|
||||
{`foo.com/`, true},
|
||||
{`1.1.1.1`, true},
|
||||
{`10.50.23.250`, true},
|
||||
{`121.1.1.1`, true},
|
||||
{`255.1.1.1`, true},
|
||||
{`300.1.1.1`, nil},
|
||||
{`1.1.1.300`, nil},
|
||||
{`foo@1.2.3.4`, `1.2.3.4`},
|
||||
|
||||
// https://www.iana.org/assignments/iana-ipv6-special-registry/iana-ipv6-special-registry.xhtml
|
||||
{`::1`, true},
|
||||
//{`::`, true},
|
||||
{`::ffff:0:0`, true},
|
||||
{`64:ff9b::`, true},
|
||||
{`64:ff9b:1::`, true},
|
||||
{`100::`, true},
|
||||
{`2001::`, true},
|
||||
{`2001:1::1`, true},
|
||||
{`2001:1::2`, true},
|
||||
{`2001:2::`, true},
|
||||
{`2001:3::`, true},
|
||||
{`2001:4:112::`, true},
|
||||
{`2001:10::`, true},
|
||||
{`2001:20::`, true},
|
||||
{`2001:db8::`, true},
|
||||
{`2002::`, true},
|
||||
{`2620:4f:8000::`, true},
|
||||
{`fc00::`, true},
|
||||
{`fe80::`, true},
|
||||
|
||||
// https://datatracker.ietf.org/doc/html/rfc4291#section-2.2
|
||||
{`ABCD:EF01:2345:6789:ABCD:EF01:2345:6789`, true},
|
||||
{`2001:DB8:0:0:8:800:200C:417A`, true},
|
||||
{`2001:DB8:0:0:8:800:200C:417A`, true}, // a unicast address
|
||||
{`FF01:0:0:0:0:0:0:101`, true}, // a multicast address
|
||||
{`0:0:0:0:0:0:0:1`, true}, // the loopback address
|
||||
{`0:0:0:0:0:0:0:0`, true}, // the unspecified address
|
||||
{`2001:DB8::8:800:200C:417A`, true}, // a unicast address
|
||||
{`FF01::101`, true}, // a multicast address
|
||||
{`::1`, true}, // the loopback address
|
||||
//{`::`, true}, // the unspecified address
|
||||
{`::`, nil},
|
||||
{`0:0:0:0:0:0:13.1.68.3`, true},
|
||||
{`0:0:0:0:0:FFFF:129.144.52.38`, true},
|
||||
{`::13.1.68.3`, true},
|
||||
{`::FFFF:129.144.52.38`, true},
|
||||
|
||||
// https://datatracker.ietf.org/doc/html/rfc5952#section-1
|
||||
{`2001:db8:0:0:1:0:0:1`, true},
|
||||
{`2001:0db8:0:0:1:0:0:1`, true},
|
||||
{`2001:db8::1:0:0:1`, true},
|
||||
{`2001:db8::0:1:0:0:1`, true},
|
||||
{`2001:0db8::1:0:0:1`, true},
|
||||
{`2001:db8:0:0:1::1`, true},
|
||||
{`2001:db8:0000:0:1::1`, true},
|
||||
{`2001:DB8:0:0:1::1`, true},
|
||||
|
||||
// https://datatracker.ietf.org/doc/html/rfc5952#section-2.1
|
||||
{`2001:db8:aaaa:bbbb:cccc:dddd:eeee:0001`, true},
|
||||
{`2001:db8:aaaa:bbbb:cccc:dddd:eeee:001`, true},
|
||||
{`2001:db8:aaaa:bbbb:cccc:dddd:eeee:01`, true},
|
||||
{`2001:db8:aaaa:bbbb:cccc:dddd:eeee:1`, true},
|
||||
|
||||
// https://datatracker.ietf.org/doc/html/rfc5952#section-2.2
|
||||
{`2001:db8:aaaa:bbbb:cccc:dddd::1`, true},
|
||||
{`2001:db8:aaaa:bbbb:cccc:dddd:0:1`, true},
|
||||
{`2001:db8:0:0:0::1`, true},
|
||||
{`2001:db8:0:0::1`, true},
|
||||
{`2001:db8:0::1`, true},
|
||||
{`2001:db8::1`, true},
|
||||
{`2001:db8::aaaa:0:0:1`, true},
|
||||
{`2001:db8:0:0:aaaa::1`, true},
|
||||
|
||||
// https://datatracker.ietf.org/doc/html/rfc5952#section-2.3
|
||||
{`2001:db8:aaaa:bbbb:cccc:dddd:eeee:aaaa`, true},
|
||||
{`2001:db8:aaaa:bbbb:cccc:dddd:eeee:AAAA`, true},
|
||||
{`2001:db8:aaaa:bbbb:cccc:dddd:eeee:AaAa`, true},
|
||||
|
||||
// An IP address in URI host position must be bracketed unless it is IPv4.
|
||||
// https://www.rfc-editor.org/rfc/rfc3986#section-3.2.2
|
||||
// TODO: Implement this restriction, ideally without matching the `http://1080` prefix.
|
||||
//{`http://1080::8:800:200c:417a/path`, `1080::8:800:200c:417a`},
|
||||
|
||||
{`foo.com:8080`, true},
|
||||
{`foo.com:8080/path`, true},
|
||||
{`test.foo.com`, true},
|
||||
{`test.foo.com/path`, true},
|
||||
{`test.foo.com/path/more/`, true},
|
||||
{`TEST.FOO.COM/PATH`, true},
|
||||
{`TEST.FÓO.COM/PÁTH`, true},
|
||||
{`foo.com/path_(more)`, true},
|
||||
{`foo.com/path_(even)_(more)`, true},
|
||||
{`foo.com/path_(more)/more`, true},
|
||||
{`foo.com/path_(more)/end)`, `foo.com/path_(more)/end`},
|
||||
{`www.foo.com`, true},
|
||||
{` foo.com/bar `, `foo.com/bar`},
|
||||
{` foo.com/bar more`, `foo.com/bar`},
|
||||
{`<foo.com/bar>`, `foo.com/bar`},
|
||||
{`<foo.com/bar>more`, `foo.com/bar`},
|
||||
{`,foo.com/bar.`, `foo.com/bar`},
|
||||
{`,foo.com/bar.more`, `foo.com/bar.more`},
|
||||
{`,foo.com/bar,`, `foo.com/bar`},
|
||||
{`,foo.com/bar,more`, `foo.com/bar,more`},
|
||||
{`(foo.com/bar)`, `foo.com/bar`},
|
||||
{`"foo.com/bar'`, `foo.com/bar`},
|
||||
{`"foo.com/bar'more`, `foo.com/bar'more`},
|
||||
{`"foo.com/bar"`, `foo.com/bar`},
|
||||
{`what is foo.com?`, `foo.com`},
|
||||
{`the foo.com!`, `foo.com`},
|
||||
|
||||
{`foo@bar`, nil},
|
||||
{`foo@bar.a`, nil},
|
||||
{`foo@bar.com`, true},
|
||||
{`foo@sub.bar.com`, true},
|
||||
{`foo@bar.com bar@bar.com`, `foo@bar.com`},
|
||||
{`foo@bar.onion`, true},
|
||||
{`foo@中国.中国`, true},
|
||||
{`foo@test.bar.com`, true},
|
||||
{`FOO@TEST.BAR.COM`, true},
|
||||
{`foo@bar.com/path`, `foo@bar.com`},
|
||||
{`foo+test@bar.com`, true},
|
||||
{`foo+._%-@bar.com`, true},
|
||||
})
|
||||
doTest(t, "Strict2", Strict(), []testCase{
|
||||
{`http:// foo.com`, nil},
|
||||
{`foo.a`, nil},
|
||||
{`foo.com`, nil},
|
||||
{`foo.com/`, nil},
|
||||
{`1.1.1.1`, nil},
|
||||
{`3ffe:2a00:100:7031::1`, nil},
|
||||
{`test.foo.com:8080/path`, nil},
|
||||
{`foo@bar.com`, nil},
|
||||
|
||||
// An IP address in URI host position must be bracketed unless it is IPv4.
|
||||
// https://www.rfc-editor.org/rfc/rfc3986#section-3.2.2
|
||||
// TODO: Implement this restriction, ideally without matching the `http://1080` prefix.
|
||||
//{`http://1080::8:800:200c:417a/path`, nil},
|
||||
})
|
||||
}
|
||||
|
||||
func TestStrictMatchingSchemeError(t *testing.T) {
|
||||
for _, c := range []struct {
|
||||
exp string
|
||||
wantErr bool
|
||||
}{
|
||||
{`http://`, false},
|
||||
{`https?://`, false},
|
||||
{`http://|mailto:`, false},
|
||||
{`http://(`, true},
|
||||
} {
|
||||
_, err := StrictMatchingScheme(c.exp)
|
||||
if c.wantErr && err == nil {
|
||||
t.Errorf(`StrictMatchingScheme("%s") did not error as expected`, c.exp)
|
||||
} else if !c.wantErr && err != nil {
|
||||
t.Errorf(`StrictMatchingScheme("%s") unexpectedly errored`, c.exp)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestStrictMatchingScheme(t *testing.T) {
|
||||
strictMatching, _ := StrictMatchingScheme("http://|ftps?://|mailto:")
|
||||
doTest(t, "StrictMatchingScheme", strictMatching, []testCase{
|
||||
{`foo.com`, nil},
|
||||
{`foo@bar.com`, nil},
|
||||
{`http://foo`, true},
|
||||
{`Http://foo`, true},
|
||||
{`https://foo`, nil},
|
||||
{`ftp://foo`, true},
|
||||
{`ftps://foo`, true},
|
||||
{`mailto:foo`, true},
|
||||
{`MAILTO:foo`, true},
|
||||
{`sms:123`, nil},
|
||||
})
|
||||
}
|
||||
|
||||
func TestStrictMatchingSchemeAny(t *testing.T) {
|
||||
strictMatching, _ := StrictMatchingScheme(AnyScheme)
|
||||
doTest(t, "StrictMatchingScheme", strictMatching, []testCase{
|
||||
{`http://foo`, true},
|
||||
{`git+https://foo`, true},
|
||||
{`randomtexthttp://foo.bar/etc`, true},
|
||||
{`mailto:foo`, true},
|
||||
})
|
||||
}
|
||||
|
||||
// bench is the shared body of the benchmarks in this file. It reports
// allocations, records len(str) as the number of bytes processed per
// iteration, and then, via b.RunParallel, repeatedly obtains a regexp from re
// and calls FindAllString on str with no limit on the number of matches.
//
// re is invoked on every iteration, so whatever it costs is part of the
// measurement.
func bench(b *testing.B, re func() *regexp.Regexp, str string) {
	bytesPerOp := int64(len(str))
	b.SetBytes(bytesPerOp)
	b.ReportAllocs()

	findAll := func(pb *testing.PB) {
		for pb.Next() {
			re().FindAllString(str, -1)
		}
	}
	b.RunParallel(findAll)
}
|
||||
|
||||
// inputNone is the benchmark input passed to the *_none benchmarks: a few
// lines of plain text which, as its last line says, contain no URLs.
const inputNone = `
|
||||
foo bar
|
||||
yaml: "as well"
|
||||
some more plaintext
|
||||
which does not contain any urls.
|
||||
`
|
||||
|
||||
// inputMany is the benchmark input passed to the *_many benchmarks: plain
// words mixed with several URL-like tokens (scheme-qualified URLs, an IPv4
// host, a bare domain, and scheme:opaque forms such as bitcoin: and xmpp:).
const inputMany = `
|
||||
foo bar http://foo.foo https://192.168.1.1/path
|
||||
foo.com bitcoin:address ftp://
|
||||
xmpp:foo@bar.com
|
||||
`
|
||||
|
||||
// BenchmarkStrict_none runs the shared bench helper with the Strict matcher
// over inputNone.
func BenchmarkStrict_none(b *testing.B) {
|
||||
bench(b, Strict, inputNone)
|
||||
}
|
||||
|
||||
// BenchmarkStrict_many runs the shared bench helper with the Strict matcher
// over inputMany.
func BenchmarkStrict_many(b *testing.B) {
|
||||
bench(b, Strict, inputMany)
|
||||
}
|
||||
|
||||
// BenchmarkRelaxed_none runs the shared bench helper with the Relaxed matcher
// over inputNone.
func BenchmarkRelaxed_none(b *testing.B) {
|
||||
bench(b, Relaxed, inputNone)
|
||||
}
|
||||
|
||||
// BenchmarkRelaxed_many runs the shared bench helper with the Relaxed matcher
// over inputMany.
func BenchmarkRelaxed_many(b *testing.B) {
|
||||
bench(b, Relaxed, inputMany)
|
||||
}
|
||||
|
||||
var (
|
||||
rxMatchingScheme *regexp.Regexp
|
||||
rxMatchingSchemeOnce sync.Once
|
||||
)
|
||||
|
||||
func matchingScheme() *regexp.Regexp {
|
||||
rxMatchingSchemeOnce.Do(func() {
|
||||
rx, err := StrictMatchingScheme("https?://")
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
rxMatchingScheme = rx
|
||||
})
|
||||
return rxMatchingScheme
|
||||
}
|
||||
|
||||
// BenchmarkStrictMatchingScheme_none runs the shared bench helper with the
// matcher returned by matchingScheme over inputNone.
func BenchmarkStrictMatchingScheme_none(b *testing.B) {
|
||||
bench(b, matchingScheme, inputNone)
|
||||
}
|
||||
|
||||
// BenchmarkStrictMatchingScheme_many runs the shared bench helper with the
// matcher returned by matchingScheme over inputMany.
func BenchmarkStrictMatchingScheme_many(b *testing.B) {
|
||||
bench(b, matchingScheme, inputMany)
|
||||
}
|
||||
Reference in New Issue
Block a user