whatcanGOwrong

This commit is contained in:
2024-09-19 21:38:24 -04:00
commit d0ae4d841d
17908 changed files with 4096831 additions and 0 deletions
@@ -0,0 +1,2 @@
# To prevent CRLF breakages on Windows for fragile files, like testdata.
* -text
@@ -0,0 +1 @@
github: mvdan
@@ -0,0 +1,23 @@
on: [push, pull_request]
name: Test
jobs:
test:
strategy:
matrix:
go-version: [1.19.x, 1.20.x]
os: [ubuntu-latest, macos-11, windows-latest]
runs-on: ${{ matrix.os }}
steps:
- uses: actions/setup-go@v3
with:
go-version: ${{ matrix.go-version }}
- uses: actions/checkout@v3
- run: go test ./...
- run: go test -race ./...
# Static checks from this point forward. Only run on one Go version and on
# Linux, since it's the fastest platform, and the tools behave the same.
- if: matrix.os == 'ubuntu-latest' && matrix.go-version == '1.20.x'
run: diff <(echo -n) <(gofmt -s -d .)
- if: matrix.os == 'ubuntu-latest' && matrix.go-version == '1.20.x'
run: go vet ./...
@@ -0,0 +1,3 @@
cmd/xurls/xurls
generate/tldsgen/tldsgen
generate/regexgen/regexgen
@@ -0,0 +1,27 @@
Copyright (c) 2015, Daniel Martí. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,37 @@
# xurls
[![Go Reference](https://pkg.go.dev/badge/mvdan.cc/xurls/v2.svg)](https://pkg.go.dev/mvdan.cc/xurls/v2)
Extract urls from text using regular expressions. Requires Go 1.19 or later.
```go
import "mvdan.cc/xurls/v2"
func main() {
rxRelaxed := xurls.Relaxed()
rxRelaxed.FindString("Do gophers live in golang.org?") // "golang.org"
rxRelaxed.FindString("This string does not have a URL") // ""
rxStrict := xurls.Strict()
rxStrict.FindAllString("must have scheme: http://foo.com/.", -1) // []string{"http://foo.com/"}
rxStrict.FindAllString("no scheme, no match: foo.com", -1) // []string{}
}
```
Since the API is centered around [regexp.Regexp](https://golang.org/pkg/regexp/#Regexp),
many other methods are available, such as finding the [byte indexes](https://golang.org/pkg/regexp/#Regexp.FindAllIndex)
for all matches.
The regular expressions are compiled when the API is first called.
Any subsequent calls will use the same regular expression pointers.
#### cmd/xurls
To install the tool globally:
go install mvdan.cc/xurls/v2/cmd/xurls@latest
```shell
$ echo "Do gophers live in http://golang.org?" | xurls
http://golang.org
```
@@ -0,0 +1,293 @@
// Copyright (c) 2015, Daniel Martí <mvdan@mvdan.cc>
// See LICENSE for licensing information
package main
import (
"bufio"
"bytes"
"errors"
"flag"
"fmt"
"io"
"io/ioutil"
"net/http"
"net/url"
"os"
"regexp"
"runtime/debug"
"strings"
"sync/atomic"
"time"
"golang.org/x/mod/module"
"mvdan.cc/xurls/v2"
)
// Command-line flags. Their help strings are empty because the usage text is
// written by hand in flag.Usage.
var (
// matching is the -m flag: a regexp which url schemes must match.
matching = flag.String("m", "", "")
// relaxed is the -r flag: also match urls without a scheme.
relaxed = flag.Bool("r", false, "")
// fix is the -fix flag. It is registered in init via flag.Var, and main1
// normalizes "false" to "" and "true" to "auto".
fix boolString
// version is the -version flag: print the version and exit.
version = flag.Bool("version", false, "")
)
// boolString is a string flag value which also reports itself as a boolean
// flag, so that "-fix" may be given on its own as well as "-fix=value".
type boolString string

// Set stores val verbatim as the flag's value. It never fails.
func (b *boolString) Set(val string) error {
	*b = boolString(val)
	return nil
}

// String returns the flag's current value.
func (b *boolString) String() string { return string(*b) }

// Get returns the flag's current value as a plain string.
func (b *boolString) Get() any { return b.String() }

// IsBoolFlag marks the value as usable without an explicit "=value".
func (*boolString) IsBoolFlag() bool { return true }
// init registers the -fix flag, which uses the custom boolString value rather
// than one of the flag package's constructors, and installs the usage text
// printed to standard error.
func init() {
flag.Var(&fix, "fix", "")
flag.Usage = func() {
// The raw string begins with a newline so that the text lines up in the
// source; the [1:] at the end drops that first byte.
fmt.Fprint(os.Stderr, `
Usage: xurls [-h] [files]
xurls extracts urls from text using regular expressions.
If no files are given, it reads from standard input.
-m <regexp> only match urls whose scheme matches a regexp
example: 'https?://|mailto:'
-r also match urls without a scheme (relaxed)
-version print version and exit
When the -fix or -fix=auto flag is used, xurls instead attempts to replace
any urls which result in a permanent redirect (301 or 308).
It also fails if any urls fail to load, so that they may be removed or replaced.
To replace urls which result in temporary redirect as well, use -fix=all.
`[1:])
}
}
// scanPath extracts the urls matched by re from the file at path, or from
// standard input when path is "-".
//
// When the global fix mode is empty, every match is printed to standard output
// on its own line. Otherwise each http and https url is requested, and any url
// which redirects is rewritten to its destination. The rewritten text goes to
// standard output when reading standard input; when reading a file, the file
// is overwritten and its path printed if at least one url changed.
//
// It returns an error if the input cannot be opened or read, if the file
// cannot be rewritten, or if any url is broken: it fails to parse, the request
// fails, or the response has a status code of 400 or above.
func scanPath(re *regexp.Regexp, path string) error {
	in := os.Stdin
	out := io.Writer(os.Stdout)
	var outBuf *bytes.Buffer
	if path != "-" {
		var err error
		in, err = os.Open(path)
		if err != nil {
			return err
		}
		if fix != "" {
			// Collect the fixed contents in memory, so that the file can be
			// overwritten once every line has been processed.
			outBuf = new(bytes.Buffer)
			out = outBuf
		}
		defer in.Close()
	}
	// A maximum of 32 parallel requests.
	maxWeight := int64(32)
	seq := newSequencer(maxWeight, out, os.Stderr)
	userAgent := fmt.Sprintf("mvdan.cc/xurls %s", readVersion())
	scanner := bufio.NewScanner(in)
	// Doesn't need to be part of reporterState as order doesn't matter.
	var atomicFixedCount uint32
	for scanner.Scan() {
		line := scanner.Text() + "\n"
		matches := re.FindAllStringIndex(line, -1)
		if fix == "" {
			for _, pair := range matches {
				match := line[pair[0]:pair[1]]
				fmt.Printf("%s\n", match)
			}
			continue
		}
		// Each url in the line costs one unit of weight, capped so that a
		// line with many urls can still be scheduled.
		weight := int64(len(matches))
		if weight > maxWeight {
			weight = maxWeight
		}
		seq.Add(weight, func(r *reporter) error {
			offsetWithinLine := 0
			for _, pair := range matches {
				// The indexes are based on the original line.
				pair[0] += offsetWithinLine
				pair[1] += offsetWithinLine
				match := line[pair[0]:pair[1]]
				origURL, err := url.Parse(match)
				if err != nil {
					r.appendBroken(match, err.Error())
					continue
				}
				fixed := origURL.String()
				switch origURL.Scheme {
				case "http", "https":
					// See if the URL redirects somewhere.
					client := &http.Client{
						Timeout: 10 * time.Second,
						CheckRedirect: func(req *http.Request, via []*http.Request) error {
							if len(via) >= 10 {
								return errors.New("stopped after 10 redirects")
							}
							switch req.Response.StatusCode {
							case http.StatusMovedPermanently, http.StatusPermanentRedirect:
								// "auto" and "all" fix permanent redirects.
							case http.StatusFound, http.StatusSeeOther, http.StatusTemporaryRedirect:
								// Only "all" fixes temporary redirects.
								if fix != "all" {
									return http.ErrUseLastResponse
								}
							default:
								// Any other redirects are ignored.
								return http.ErrUseLastResponse
							}
							// Inherit the fragment if empty.
							if req.URL.Fragment == "" {
								req.URL.Fragment = origURL.Fragment
							}
							fixed = req.URL.String()
							return nil
						},
					}
					// Start with HEAD to avoid downloading bodies; fall back
					// to GET if the server refuses the method.
					method := http.MethodHead
				retry:
					req, err := http.NewRequest(method, fixed, nil)
					if err != nil {
						r.appendBroken(match, err.Error())
						continue
					}
					req.Header.Set("User-Agent", userAgent)
					resp, err := client.Do(req)
					if err != nil {
						r.appendBroken(match, err.Error())
						continue
					}
					if code := resp.StatusCode; code >= 400 {
						if code == http.StatusMethodNotAllowed {
							method = http.MethodGet
							resp.Body.Close()
							goto retry
						}
						r.appendBroken(match, fmt.Sprintf("%d %s", code, http.StatusText(code)))
					}
					resp.Body.Close()
				}
				if fixed != match {
					// Replace the url, and update offsetWithinLine.
					newLine := line[:pair[0]] + fixed + line[pair[1]:]
					offsetWithinLine += len(newLine) - len(line)
					line = newLine
					atomic.AddUint32(&atomicFixedCount, 1)
				}
			}
			io.WriteString(r, line) // add the fixed line to outBuf
			return nil
		})
	}
	// Scan returns false both at the end of the input and on failure, so the
	// error can only be observed once the loop is over. Checking it inside
	// the loop would never see it, and the input would be silently truncated.
	if err := scanner.Err(); err != nil {
		return err
	}
	state := seq.finalState()
	if state.exitCode != 0 {
		panic("we aren't using sequencer for any errors")
	}
	// Note that all goroutines have stopped at this point.
	if atomicFixedCount > 0 && path != "-" {
		in.Close()
		// Overwrite the file, if we weren't reading stdin. Report its
		// path too.
		fmt.Println(path)
		if err := ioutil.WriteFile(path, outBuf.Bytes(), 0o666); err != nil {
			return err
		}
	}
	if len(state.brokenURLs) > 0 {
		var s strings.Builder
		fmt.Fprintf(&s, "found %d broken urls in %q:\n", len(state.brokenURLs), path)
		for _, broken := range state.brokenURLs {
			fmt.Fprintf(&s, " * %s - %s\n", broken.url, broken.reason)
		}
		return errors.New(s.String())
	}
	return nil
}
// main runs the tool and exits with the status code returned by main1.
func main() {
	code := main1()
	os.Exit(code)
}
// main1 implements the command and returns its exit code: 0 on success, or 1
// if the flags are invalid or any input fails to be scanned.
//
// It is separate from main so that it can be run without calling os.Exit.
func main1() int {
	flag.Parse()
	if *version {
		fmt.Println(readVersion())
		return 0
	}
	if *relaxed && *matching != "" {
		fmt.Fprintln(os.Stderr, "-r and -m at the same time don't make much sense")
		return 1
	}
	switch fix {
	case "": // disabled by default
	case "false": // disabled via -fix=false; normalize
		fix = ""
	case "auto", "all": // enabled via -fix=auto, -fix=all, etc
	case "true": // enabled via -fix; normalize
		fix = "auto"
	default:
		// Any other value would otherwise enable fixing and quietly behave
		// like "auto", hiding typos such as -fix=al.
		fmt.Fprintf(os.Stderr, "invalid -fix mode %q; use -fix, -fix=auto, or -fix=all\n", string(fix))
		return 1
	}
	var re *regexp.Regexp
	switch {
	case *relaxed:
		re = xurls.Relaxed()
	case *matching != "":
		var err error
		if re, err = xurls.StrictMatchingScheme(*matching); err != nil {
			fmt.Fprintln(os.Stderr, err)
			return 1
		}
	default:
		re = xurls.Strict()
	}
	args := flag.Args()
	if len(args) == 0 {
		// No files given; "-" makes scanPath read standard input.
		args = []string{"-"}
	}
	for _, path := range args {
		if err := scanPath(re, path); err != nil {
			fmt.Fprintln(os.Stderr, err)
			return 1
		}
	}
	return 0
}
// readVersion returns the version of the main module as recorded in the
// binary's build information, or "unknown" if there is none. A replaced
// module reports the version of its replacement.
//
// Borrowed from https://github.com/burrowers/garble.
func readVersion() string {
	info, ok := debug.ReadBuildInfo()
	if !ok {
		return "unknown"
	}
	mod := &info.Main
	if replacement := mod.Replace; replacement != nil {
		mod = replacement
	}
	if mod.Version != "(devel)" {
		return mod.Version
	}

	// Until https://github.com/golang/go/issues/50603 is implemented,
	// manually construct something like a pseudo-version.
	// TODO: remove when this code is dead, hopefully in Go 1.20.
	var (
		stamp    time.Time
		revision string
	)
	for _, kv := range info.Settings {
		switch kv.Key {
		case "vcs.time":
			// If the format is invalid, we'll print a zero timestamp.
			stamp, _ = time.Parse(time.RFC3339Nano, kv.Value)
		case "vcs.revision":
			revision = kv.Value
			if len(revision) > 12 {
				revision = revision[:12]
			}
		}
	}
	if revision == "" {
		return mod.Version
	}
	mod.Version = module.PseudoVersion("", "", stamp, revision)
	return mod.Version
}
@@ -0,0 +1,125 @@
// Copyright (c) 2019, Daniel Martí <mvdan@mvdan.cc>
// See LICENSE for licensing information
package main
import (
"context"
"fmt"
"io/ioutil"
"net"
"net/http"
"os"
"path/filepath"
"testing"
"github.com/rogpeppe/go-internal/testscript"
)
func TestMain(m *testing.M) {
os.Exit(testscript.RunMain(m, map[string]func() int{
"xurls": main1,
}))
}
// TestScript runs the scripts under testdata/script. Each script gets its own
// local HTTP server, whose base url is exposed as $SERVER, plus an "expand"
// command which expands environment variables inside files in place.
func TestScript(t *testing.T) {
	t.Parallel()

	// setup starts the HTTP server used by a single script.
	setup := func(env *testscript.Env) error {
		mux := http.NewServeMux()

		// handle registers handler at pattern, and fails the test if a
		// request arrives with a method other than the given one.
		handle := func(method, pattern string, handler http.HandlerFunc) {
			mux.HandleFunc(pattern, func(w http.ResponseWriter, r *http.Request) {
				if r.Method != method {
					t.Errorf("expected all requests to be %q, got %q", method, r.Method)
				}
				handler(w, r)
			})
		}
		// Handler factories shared by the endpoints below.
		respondOK := func(w http.ResponseWriter, r *http.Request) {
			w.WriteHeader(http.StatusOK)
		}
		redirectTo := func(target string, code int) http.HandlerFunc {
			return func(w http.ResponseWriter, r *http.Request) {
				http.Redirect(w, r, target, code)
			}
		}
		failWith := func(code int) http.HandlerFunc {
			return func(w http.ResponseWriter, r *http.Request) {
				http.Error(w, "", code)
			}
		}

		// Plain endpoints.
		handle("HEAD", "/plain-head", respondOK)
		handle("HEAD", "/redir-longtarget", respondOK)
		handle("GET", "/plain-get", func(w http.ResponseWriter, r *http.Request) {
			fmt.Fprintf(w, "plaintext")
		})

		// Permanent redirect chains and variations.
		handle("HEAD", "/redir-1", redirectTo("/plain-head", http.StatusMovedPermanently))
		handle("HEAD", "/redir-2", redirectTo("/redir-1", http.StatusMovedPermanently))
		handle("HEAD", "/redir-longer", redirectTo("/redir-longtarget", http.StatusMovedPermanently))
		handle("HEAD", "/redir-fragment", redirectTo("/plain-head#bar", http.StatusMovedPermanently))

		// One endpoint per redirect status code.
		handle("HEAD", "/redir-301", redirectTo("/plain-head", http.StatusMovedPermanently))
		handle("HEAD", "/redir-302", redirectTo("/plain-head", http.StatusFound))
		handle("HEAD", "/redir-303", redirectTo("/plain-head", http.StatusSeeOther))
		handle("HEAD", "/redir-307", redirectTo("/plain-head", http.StatusTemporaryRedirect))
		handle("HEAD", "/redir-308", redirectTo("/plain-head", http.StatusPermanentRedirect))

		// Error responses.
		handle("HEAD", "/404", failWith(http.StatusNotFound))
		handle("HEAD", "/500", failWith(http.StatusInternalServerError))

		// An endpoint which rejects every method but GET.
		mux.HandleFunc("/get-only", func(w http.ResponseWriter, r *http.Request) {
			if r.Method != "GET" {
				http.Error(w, "", http.StatusMethodNotAllowed)
				return
			}
			http.Redirect(w, r, "/plain-get", http.StatusMovedPermanently)
		})

		listener, err := net.Listen("tcp", ":0")
		if err != nil {
			return err
		}
		srv := &http.Server{Handler: mux}
		go srv.Serve(listener)

		env.Vars = append(env.Vars, "SERVER=http://"+listener.Addr().String())
		env.Defer(func() {
			if err := srv.Shutdown(context.TODO()); err != nil {
				t.Fatal(err)
			}
		})
		return nil
	}

	// expand rewrites each named file with its variable references replaced
	// by the script's environment values.
	expand := func(ts *testscript.TestScript, neg bool, args []string) {
		switch {
		case neg:
			ts.Fatalf("unsupported: ! expand")
		case len(args) == 0:
			ts.Fatalf("usage: expand file...")
		}
		for _, name := range args {
			expanded := os.Expand(ts.ReadFile(name), ts.Getenv)
			ts.Check(ioutil.WriteFile(ts.MkAbs(name), []byte(expanded), 0o666))
		}
	}

	testscript.Run(t, testscript.Params{
		Dir:                 filepath.Join("testdata", "script"),
		RequireExplicitExec: true,
		Setup:               setup,
		Cmds: map[string]func(ts *testscript.TestScript, neg bool, args []string){
			"expand": expand,
		},
	})
}
@@ -0,0 +1,156 @@
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// The code below is borrowed from Go's cmd/gofmt as of 1.18beta1.
// We tweaked it slightly to add the "broken URLs" result.
package main
import (
"context"
"go/scanner"
"io"
"golang.org/x/sync/semaphore"
)
// A sequencer performs concurrent tasks that may write output, but emits that
// output in a deterministic order.
type sequencer struct {
// maxWeight is the capacity that sem was created with; Add clamps negative
// or larger weights to it.
maxWeight int64
// sem limits the combined weight of the tasks running at any one time. The
// caller of Add decides what one unit of weight stands for.
sem *semaphore.Weighted
// prev delivers the reporter state once the most recently added task has
// released it.
prev <-chan *reporterState // 1-buffered
}
// newSequencer creates a sequencer whose concurrently running tasks may weigh
// at most maxWeight in total. Task output goes to out, and reported errors
// go to err.
func newSequencer(maxWeight int64, out, err io.Writer) *sequencer {
	// Seed the chain with a state that is immediately available, so that the
	// first task never waits for a predecessor.
	initial := make(chan *reporterState, 1)
	initial <- &reporterState{out: out, err: err}

	s := new(sequencer)
	s.maxWeight = maxWeight
	s.sem = semaphore.NewWeighted(maxWeight)
	s.prev = initial
	return s
}
// Add waits for the sequencer to have weight to spare, then schedules f to
// run concurrently with other tasks.
//
// A weight which is negative or above the sequencer's maximum is treated as
// the maximum: Add then waits for every other task to finish, and f runs on
// its own, holding up further calls to Add until it is done.
//
// f may run in its own goroutine, yet anything it sends to the reporter is
// emitted in the order the tasks were added. Calling a reporter method may
// block until the previous task has finished, so f should complete its
// parallelizable work before touching the reporter.
//
// A non-nil error returned by f is reported after f's output (if any), and
// makes the final exit code nonzero.
func (s *sequencer) Add(weight int64, f func(*reporter) error) {
	if outOfRange := weight < 0 || weight > s.maxWeight; outOfRange {
		weight = s.maxWeight
	}

	task := f
	if acquireErr := s.sem.Acquire(context.TODO(), weight); acquireErr != nil {
		// Nothing was acquired: report the failure instead of running f.
		weight = 0
		task = func(*reporter) error { return acquireErr }
	}

	// Link this task into the chain: it receives the state from the previous
	// task and hands it to the next one through done.
	done := make(chan *reporterState, 1)
	rep := &reporter{prev: s.prev}
	s.prev = done

	// The task runs in parallel until it first uses rep, at which point it
	// waits for the previous task to release the output state.
	go func() {
		if taskErr := task(rep); taskErr != nil {
			rep.Report(taskErr)
		}
		done <- rep.getState() // Release the next task.
		s.sem.Release(weight)
	}()
}
// GetExitCode blocks until every task added so far has completed, and returns
// the resulting exit code, suitable for passing to os.Exit.
func (s *sequencer) GetExitCode() int {
	result := make(chan int, 1)
	// A zero-weight task is placed last in the chain, so its reporter only
	// yields the state once all earlier tasks have released it.
	collect := func(r *reporter) error {
		result <- r.ExitCode()
		return nil
	}
	s.Add(0, collect)
	return <-result
}
// finalState blocks until every task added so far has completed, and returns
// a copy of the reporter state they left behind.
func (s *sequencer) finalState() reporterState {
	result := make(chan reporterState, 1)
	// A zero-weight task is placed last in the chain, so its reporter only
	// yields the state once all earlier tasks have released it.
	collect := func(r *reporter) error {
		st := r.getState()
		result <- *st
		return nil
	}
	s.Add(0, collect)
	return <-result
}
// A reporter reports output, warnings, and errors.
type reporter struct {
	prev  <-chan *reporterState // where the state arrives from the previous holder
	state *reporterState        // nil until getState has received it
}

// reporterState carries the state of a reporter instance.
//
// Only one reporter at a time may have access to a reporterState.
type reporterState struct {
	out, err   io.Writer
	exitCode   int
	brokenURLs []brokenURL
}

// brokenURL records a url which could not be verified, along with the reason.
type brokenURL struct {
	url    string
	reason string
}

// getState returns the shared reporter state, first waiting for any prior
// reporter to hand it over if this reporter does not hold it yet.
func (r *reporter) getState() *reporterState {
	if r.state != nil {
		return r.state
	}
	r.state = <-r.prev
	return r.state
}

// Write sends p to the reporter's output stream.
//
// Any error is returned to the caller, and does not otherwise affect the
// reporter's exit code.
func (r *reporter) Write(p []byte) (int, error) {
	st := r.getState()
	return st.out.Write(p)
}

// appendBroken records url as broken for the given reason.
func (r *reporter) appendBroken(url, reason string) {
	st := r.getState()
	entry := brokenURL{url: url, reason: reason}
	st.brokenURLs = append(st.brokenURLs, entry)
}

// Report prints a non-nil error to the reporter's error stream and sets the
// exit code to a nonzero value. It panics if err is nil.
func (r *reporter) Report(err error) {
	if err == nil {
		panic("Report with nil error")
	}
	st := r.getState()
	scanner.PrintError(st.err, err)
	st.exitCode = 2
}

// ExitCode returns the exit code recorded in the reporter state so far.
func (r *reporter) ExitCode() int {
	st := r.getState()
	return st.exitCode
}
@@ -0,0 +1,33 @@
stdin input
exec xurls
stdout 'https://foo.com'
! stdout 'bar.com'
! stdout 'custom://some-data'
! stderr .
! exec xurls missing
! stdout .
stderr 'open missing'
exec xurls input
stdout 'https://foo.com'
! stdout 'bar.com'
! stdout 'custom://some-data'
! stderr .
exec xurls -r input
stdout 'https://foo.com'
stdout 'bar.com'
! stdout 'custom://some-data'
! stderr .
exec xurls -m 'custom://' input
! stdout 'https://foo.com'
! stdout 'bar.com'
stdout 'custom://some-data'
! stderr .
-- input --
First, a link with a scheme, https://foo.com.
Then, one without a scheme, like bar.com.
Also, a link with a custom scheme, custom://some-data.
@@ -0,0 +1,120 @@
expand nothing
cp nothing nothing.orig
expand redirects
expand redirects.golden-auto
expand redirects.golden-all
cp redirects redirects.orig
expand broken
expand broken.golden
cp broken broken.orig
exec xurls -fix nothing
! stdout .
! stderr .
cmp nothing nothing.orig
stdin redirects
exec xurls -fix
cmp stdout redirects.golden-auto
cmp redirects redirects.orig
! stderr .
exec xurls -fix redirects
stdout '^redirects$'
! stderr .
cmp redirects redirects.golden-auto
cp redirects.orig redirects
exec xurls -fix=auto redirects
cmp redirects redirects.golden-auto
cp redirects.orig redirects
exec xurls -fix=all redirects
cmp redirects redirects.golden-all
cp redirects.orig redirects
! exec xurls -fix broken
stdout -count=1 '^broken$'
stderr -count=1 '5 broken urls'
stderr -count=2 '/404 - 404 Not Found'
stderr -count=2 '/500 - 500 Internal Server Error'
stderr -count=1 'totallydoesnotexist.localhost/ - Head .* dial tcp'
cmp broken broken.golden
-- nothing --
No redirect: ${SERVER}/plain-head
-- redirects --
No redirect: ${SERVER}/plain-head
One redirect: ${SERVER}/redir-1
Two redirects: ${SERVER}/redir-2
Redirect inherits fragment: ${SERVER}/redir-1#foo
Redirect replaces fragment: ${SERVER}/redir-fragment#foo
Three links in one line: ${SERVER}/redir-1 + ${SERVER}//redir-1 + ${SERVER}///redir-1
Redirect to a longer path ${SERVER}/redir-longer with trailing text
Permanent redirect codes:
* ${SERVER}/redir-301
* ${SERVER}/redir-308
Temporary redirect codes:
* ${SERVER}/redir-302
* ${SERVER}/redir-303
* ${SERVER}/redir-307
Only GET allowed, HEAD fails: ${SERVER}/get-only
-- redirects.golden-auto --
No redirect: ${SERVER}/plain-head
One redirect: ${SERVER}/plain-head
Two redirects: ${SERVER}/plain-head
Redirect inherits fragment: ${SERVER}/plain-head#foo
Redirect replaces fragment: ${SERVER}/plain-head#bar
Three links in one line: ${SERVER}/plain-head + ${SERVER}/plain-head + ${SERVER}/plain-head
Redirect to a longer path ${SERVER}/redir-longtarget with trailing text
Permanent redirect codes:
* ${SERVER}/plain-head
* ${SERVER}/plain-head
Temporary redirect codes:
* ${SERVER}/redir-302
* ${SERVER}/redir-303
* ${SERVER}/redir-307
Only GET allowed, HEAD fails: ${SERVER}/plain-get
-- redirects.golden-all --
No redirect: ${SERVER}/plain-head
One redirect: ${SERVER}/plain-head
Two redirects: ${SERVER}/plain-head
Redirect inherits fragment: ${SERVER}/plain-head#foo
Redirect replaces fragment: ${SERVER}/plain-head#bar
Three links in one line: ${SERVER}/plain-head + ${SERVER}/plain-head + ${SERVER}/plain-head
Redirect to a longer path ${SERVER}/redir-longtarget with trailing text
Permanent redirect codes:
* ${SERVER}/plain-head
* ${SERVER}/plain-head
Temporary redirect codes:
* ${SERVER}/plain-head
* ${SERVER}/plain-head
* ${SERVER}/plain-head
Only GET allowed, HEAD fails: ${SERVER}/plain-get
-- broken --
One redirect: ${SERVER}/redir-1
404 errors: ${SERVER}/404 ${SERVER}/404
500 errors: ${SERVER}/500 ${SERVER}/500
Dial error: http://totallydoesnotexist.localhost/
-- broken.golden --
One redirect: ${SERVER}/plain-head
404 errors: ${SERVER}/404 ${SERVER}/404
500 errors: ${SERVER}/500 ${SERVER}/500
Dial error: http://totallydoesnotexist.localhost/
@@ -0,0 +1,11 @@
exec xurls -h
! stderr 'flag provided but not defined'
stderr 'Usage: xurls'
! stderr 'help requested' # don't duplicate usage output
! stderr '-test\.' # don't show the test binary's usage func
! exec xurls -r -m="whatever"
stderr 'at the same time'
! exec xurls -m="bad(regexp"
stderr 'missing closing \)'
@@ -0,0 +1,5 @@
# Note that "go test" does not embed vcs information by default.
# We copied the code from another project which is tested,
# so there's no need to fully test the VCS aspect.
exec xurls -version
stdout '\(devel\)'
@@ -0,0 +1,19 @@
// Copyright (c) 2015, Daniel Martí <mvdan@mvdan.cc>
// See LICENSE for licensing information
package xurls_test
import (
"fmt"
"mvdan.cc/xurls/v2"
)
// Example finds a single url, and then every url, in a piece of text using
// the relaxed expression.
func Example() {
	relaxed := xurls.Relaxed()

	first := relaxed.FindString("Do gophers live in http://golang.org?")
	fmt.Println(first)

	all := relaxed.FindAllString("foo.com is http://foo.com/.", -1)
	fmt.Println(all)
	// Output:
	// http://golang.org
	// [foo.com http://foo.com/]
}
@@ -0,0 +1,74 @@
// Copyright (c) 2017, Shreyas Khare <skhare@rapid7.com>
// See LICENSE for licensing information
package main
import (
"encoding/csv"
"io"
"log"
"net/http"
"os"
"strings"
"text/template"
)
// path is the file which the generated Go source is written to.
const path = "schemes.go"
// schemesTmpl renders the generated file: a Schemes string slice holding one
// backquoted literal per element of the data's Schemes field.
var schemesTmpl = template.Must(template.New("schemes").Parse(`// Generated by schemesgen
package xurls
// Schemes is a sorted list of all IANA assigned schemes.
//
// Source: https://www.iana.org/assignments/uri-schemes/uri-schemes-1.csv
var Schemes = []string{
{{range $scheme := .Schemes}}` + "\t`" + `{{$scheme}}` + "`" + `,
{{end}}}
`))
// schemeList downloads the IANA uri scheme registry as CSV and returns the
// first column of every record, in the order given, skipping the header and
// any record marked as obsolete.
//
// Any failure is fatal: a request error, a response status of 400 or above,
// or malformed CSV all terminate the program via log.Fatal.
func schemeList() []string {
	const source = "https://www.iana.org/assignments/uri-schemes/uri-schemes-1.csv"

	resp, err := http.Get(source)
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()
	// Without this check, an error page would be parsed as if it were the
	// registry and end up in the generated code.
	if resp.StatusCode >= 400 {
		log.Fatalf("%s: %s", source, resp.Status)
	}

	r := csv.NewReader(resp.Body)
	// The first record holds the column headers; it must at least exist.
	if _, err := r.Read(); err != nil {
		log.Fatalf("%s: reading header: %v", source, err)
	}

	var schemes []string
	for {
		record, err := r.Read()
		if err == io.EOF {
			break
		}
		if err != nil {
			log.Fatal(err)
		}
		if strings.Contains(record[0], "OBSOLETE") {
			continue // skip obsolete schemes; note the scheme column is abused
		}
		schemes = append(schemes, record[0])
	}
	return schemes
}
// writeSchemes creates the file at path, replacing any existing one, and
// renders schemesTmpl into it with the given schemes. It returns any error
// from creating the file or executing the template.
func writeSchemes(schemes []string) error {
	out, err := os.Create(path)
	if err != nil {
		return err
	}
	defer out.Close()

	data := struct {
		Schemes []string
	}{schemes}
	return schemesTmpl.Execute(out, data)
}
// main fetches the list of schemes and writes the generated file, exiting
// with a fatal log message if the file cannot be written.
func main() {
	schemes := schemeList()
	log.Printf("Generating %s...", path)
	err := writeSchemes(schemes)
	if err == nil {
		return
	}
	log.Fatalf("Could not write path: %v", err)
}
@@ -0,0 +1,111 @@
// Copyright (c) 2015, Daniel Martí <mvdan@mvdan.cc>
// See LICENSE for licensing information
package main
import (
"bufio"
"errors"
"fmt"
"log"
"net/http"
"os"
"regexp"
"sort"
"strings"
"sync"
"text/template"
)
// path is the file which the generated Go source is written to.
const path = "tlds.go"
// tldsTmpl renders the generated file: a comment listing every element of the
// data's URLs field as a source, followed by a TLDs string slice holding one
// backquoted literal per element of the TLDs field.
var tldsTmpl = template.Must(template.New("tlds").Parse(`// Generated by tldsgen
package xurls
// TLDs is a sorted list of all public top-level domains.
//
// Sources:{{range $_, $url := .URLs}}
// - {{$url}}{{end}}
var TLDs = []string{
{{range $_, $tld := .TLDs}}` + "\t`" + `{{$tld}}` + "`" + `,
{{end}}}
`))
// cleanTld returns tld in lower case, or the empty string if the lowercased
// name starts with "xn--".
func cleanTld(tld string) string {
	lowered := strings.ToLower(tld)
	if !strings.HasPrefix(lowered, "xn--") {
		return lowered
	}
	return ""
}
// fetchFromURL downloads url and scans it line by line, taking the first
// match of the regular expression pat in each line as a candidate TLD. Every
// candidate that cleanTld does not reject is added as a key of tldSet.
//
// It calls wg.Done on return. tldSet is written to without any locking. A
// failed request, a response status of 400 or above, an invalid pattern, or a
// read error all cause a panic.
func fetchFromURL(wg *sync.WaitGroup, url, pat string, tldSet map[string]bool) {
	defer wg.Done()

	log.Printf("Fetching %s", url)
	resp, err := http.Get(url)
	switch {
	case err != nil:
		panic(fmt.Errorf("%s: %s", url, err))
	case resp.StatusCode >= 400:
		panic(fmt.Errorf("%s: %s", url, errors.New(resp.Status)))
	}
	defer resp.Body.Close()

	lines := bufio.NewScanner(resp.Body)
	re := regexp.MustCompile(pat)
	for lines.Scan() {
		if tld := cleanTld(re.FindString(lines.Text())); tld != "" {
			tldSet[tld] = true
		}
	}
	if scanErr := lines.Err(); scanErr != nil {
		panic(fmt.Errorf("%s: %s", url, scanErr))
	}
}
// tldList fetches the known top-level domains from every source concurrently.
// It returns the sorted, deduplicated list of TLDs, followed by the urls of
// the sources in the order they were requested.
func tldList() ([]string, []string) {
	var (
		urls []string
		sets []map[string]bool
		wg   sync.WaitGroup
	)
	// fromURL starts fetching one source in the background. Every source gets
	// a map of its own: fetchFromURL writes to its map without locking, so
	// sharing a single map between the goroutines would be a data race.
	fromURL := func(url, pat string) {
		set := make(map[string]bool)
		urls = append(urls, url)
		sets = append(sets, set)
		wg.Add(1)
		go fetchFromURL(&wg, url, pat, set)
	}
	fromURL("https://data.iana.org/TLD/tlds-alpha-by-domain.txt", `^[^#]+$`)
	fromURL("https://publicsuffix.org/list/effective_tld_names.dat", `^[^/.]+$`)
	wg.Wait()

	// All goroutines are done, so the maps can now be merged safely.
	merged := make(map[string]bool)
	for _, set := range sets {
		for tld := range set {
			merged[tld] = true
		}
	}
	tlds := make([]string, 0, len(merged))
	for tld := range merged {
		tlds = append(tlds, tld)
	}
	sort.Strings(tlds)
	return tlds, urls
}
// writeTlds creates the file at path, replacing any existing one, and renders
// tldsTmpl into it with the given TLDs and source urls. It returns any error
// from creating the file or executing the template.
func writeTlds(tlds, urls []string) error {
	f, err := os.Create(path)
	if err != nil {
		// Return the error like any other failure rather than panicking, so
		// that the caller handles every failure in one place.
		return err
	}
	defer f.Close()
	return tldsTmpl.Execute(f, struct {
		TLDs []string
		URLs []string
	}{
		TLDs: tlds,
		URLs: urls,
	})
}

// main fetches the list of TLDs and writes the generated file, exiting with a
// fatal log message if the file cannot be written.
func main() {
	tlds, urls := tldList()
	log.Printf("Generating %s...", path)
	// The error must not be dropped: a template failure would otherwise leave
	// a truncated file behind while the program reports success.
	if err := writeTlds(tlds, urls); err != nil {
		log.Fatalf("Could not write path: %v", err)
	}
}
@@ -0,0 +1,152 @@
// Copyright (c) 2015, Daniel Martí <mvdan@mvdan.cc>
// See LICENSE for licensing information
package main
import (
"log"
"os"
"strconv"
"strings"
"text/template"
"unicode"
)
// path is the file which the generated Go source is written to.
const path = "unicode.go"
// tmpl renders the generated file: two constants whose values are taken
// verbatim from the "withPunc" and "withoutPunc" entries of its data.
var tmpl = template.Must(template.New("tlds").Parse(`// Generated by unicodegen
package xurls
const allowedUcsChar = {{.withPunc}}
const allowedUcsCharMinusPunc = {{.withoutPunc}}
`))
// visit calls fn for every code point in rt, walking the 16-bit ranges first
// and the 32-bit ranges second, each in table order and honoring its stride.
func visit(rt *unicode.RangeTable, fn func(rune)) {
	// walk calls fn for lo, lo+stride, and so on, up to and including hi.
	walk := func(lo, hi, stride rune) {
		for cp := lo; cp <= hi; cp += stride {
			fn(cp)
		}
	}
	for i := range rt.R16 {
		span := rt.R16[i]
		walk(rune(span.Lo), rune(span.Hi), rune(span.Stride))
	}
	for i := range rt.R32 {
		span := rt.R32[i]
		walk(rune(span.Lo), rune(span.Hi), rune(span.Stride))
	}
}
// writeUnicode generates the file at path. It takes the code point ranges
// allowed by RFC 3987, removes the separators (unicode.Z) to obtain
// allowedUcsChar, additionally removes the punctuation in unicode.Po to
// obtain allowedUcsCharMinusPunc, and writes both as quoted strings holding
// the contents of a regular expression character class.
//
// It returns any error from creating the file or executing the template.
func writeUnicode() error {
	// rfc3987Ranges contains the ranges of valid code points specified by RFC 3987.
	rfc3987Ranges := [][2]rune{
		{0xA0, 0xD7FF},
		{0xF900, 0xFDCF},
		{0xFDF0, 0xFFEF},
		{0x10000, 0x1FFFD},
		{0x20000, 0x2FFFD},
		{0x30000, 0x3FFFD},
		{0x40000, 0x4FFFD},
		{0x50000, 0x5FFFD},
		{0x60000, 0x6FFFD},
		{0x70000, 0x7FFFD},
		{0x80000, 0x8FFFD},
		{0x90000, 0x9FFFD},
		{0xA0000, 0xAFFFD},
		{0xB0000, 0xBFFFD},
		{0xC0000, 0xCFFFD},
		{0xD0000, 0xDFFFD},
		{0xE1000, 0xEFFFD},
	}

	// removeRune accepts a slice of inclusive code point ranges (in ascending order)
	// and returns a new slice that is equivalent except for excluding a specified rune
	// by removing/replacing/splitting any range containing it.
	// Its linear searches over the ranges (including those added by previous invocations)
	// are inefficient, but acceptable because this code runs only at build time.
	removeRune := func(ranges [][2]rune, cp rune) [][2]rune {
		for i, r := range ranges {
			// Ranges are in ascending order. Skip any that precede `cp`,
			// and bail out upon reaching one that follows `cp`.
			if r[1] < cp {
				continue
			} else if cp < r[0] {
				break
			}

			// `cp` is in this range and must be removed from it.
			if cp == r[0] && cp == r[1] {
				// Remove this single-element range.
				return append(ranges[0:i], ranges[i+1:]...)
			} else if cp == r[0] {
				// Remove the first element of this range.
				newRange := [2]rune{r[0] + 1, r[1]}
				newTail := append([][2]rune{newRange}, ranges[i+1:]...)
				return append(ranges[0:i], newTail...)
			} else if cp == r[1] {
				// Remove the last element of this range.
				newRange := [2]rune{r[0], r[1] - 1}
				newTail := append([][2]rune{newRange}, ranges[i+1:]...)
				return append(ranges[0:i], newTail...)
			} else {
				// Split this range.
				newTail := append(
					[][2]rune{
						{r[0], cp - 1},
						{cp + 1, r[1]},
					},
					ranges[i+1:]...)
				return append(ranges[0:i], newTail...)
			}
		}
		return ranges
	}

	// sepFreeRanges excludes separators from rfc3987Ranges. It starts as a
	// copy, since removeRune modifies the slice it is given.
	sepFreeRanges := append([][2]rune{}, rfc3987Ranges...)
	visit(unicode.Z, func(cp rune) {
		sepFreeRanges = removeRune(sepFreeRanges, cp)
	})

	// puncFreeRanges excludes punctuation from sepFreeRanges.
	puncFreeRanges := append([][2]rune{}, sepFreeRanges...)
	visit(unicode.Po, func(cp rune) {
		puncFreeRanges = removeRune(puncFreeRanges, cp)
	})

	// Build the corresponding regular expression character class contents.
	// The result is returned as a string: a strings.Builder must not be
	// copied once it has been written to, which returning it by value does.
	characterClassContents := func(ranges [][2]rune) string {
		var builder strings.Builder
		for _, r := range ranges {
			// regexp.QuoteMeta is not necessary because all metacharacters are ASCII.
			// cf. https://golang.org/s/re2syntax and
			// https://cs.opensource.google/go/go/+/refs/tags/go1.17.6:src/regexp/regexp.go;l=721
			builder.WriteRune(r[0])
			if r[0] == r[1] {
				continue
			}
			builder.WriteRune('-')
			builder.WriteRune(r[1])
		}
		return builder.String()
	}
	allowedUcsChar := characterClassContents(sepFreeRanges)
	allowedUcsCharMinusPunc := characterClassContents(puncFreeRanges)

	// Write to file.
	f, err := os.Create(path)
	if err != nil {
		return err
	}
	defer f.Close()
	return tmpl.Execute(f, map[string]string{
		"withPunc":    strconv.Quote(allowedUcsChar),
		"withoutPunc": strconv.Quote(allowedUcsCharMinusPunc),
	})
}
// main writes the generated file, exiting with a fatal log message if that
// fails.
func main() {
	log.Printf("Generating %s...", path)
	err := writeUnicode()
	if err == nil {
		return
	}
	log.Fatalf("Could not write path: %v", err)
}
@@ -0,0 +1,9 @@
module mvdan.cc/xurls/v2
go 1.19
require (
github.com/rogpeppe/go-internal v1.10.0
golang.org/x/mod v0.10.0
golang.org/x/sync v0.1.0
)
@@ -0,0 +1,6 @@
github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ=
github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog=
golang.org/x/mod v0.10.0 h1:lFO9qtOdlre5W1jxS3r/4szv2/6iXxScdzjoBMXNhYk=
golang.org/x/mod v0.10.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
golang.org/x/sync v0.1.0 h1:wsuoTGHzEhffawBOhz5CYhcrV4IdKZbEyZjBMuTp12o=
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
@@ -0,0 +1,375 @@
// Generated by schemesgen
package xurls
// Schemes is a sorted list of all IANA assigned schemes.
//
// Source: https://www.iana.org/assignments/uri-schemes/uri-schemes-1.csv
var Schemes = []string{
`aaa`,
`aaas`,
`about`,
`acap`,
`acct`,
`acd`,
`acr`,
`adiumxtra`,
`adt`,
`afp`,
`afs`,
`aim`,
`amss`,
`android`,
`appdata`,
`apt`,
`ar`,
`ark`,
`attachment`,
`aw`,
`barion`,
`bb`,
`beshare`,
`bitcoin`,
`bitcoincash`,
`blob`,
`bolo`,
`browserext`,
`cabal`,
`calculator`,
`callto`,
`cap`,
`cast`,
`casts`,
`chrome`,
`chrome-extension`,
`cid`,
`coap`,
`coap+tcp`,
`coap+ws`,
`coaps`,
`coaps+tcp`,
`coaps+ws`,
`com-eventbrite-attendee`,
`content`,
`content-type`,
`crid`,
`cstr`,
`cvs`,
`dab`,
`dat`,
`data`,
`dav`,
`diaspora`,
`dict`,
`did`,
`dis`,
`dlna-playcontainer`,
`dlna-playsingle`,
`dns`,
`dntp`,
`doi`,
`dpp`,
`drm`,
`drop`,
`dtmi`,
`dtn`,
`dvb`,
`dvx`,
`dweb`,
`ed2k`,
`eid`,
`elsi`,
`embedded`,
`ens`,
`ethereum`,
`example`,
`facetime`,
`fax`,
`feed`,
`feedready`,
`fido`,
`file`,
`filesystem`,
`finger`,
`first-run-pen-experience`,
`fish`,
`fm`,
`ftp`,
`fuchsia-pkg`,
`geo`,
`gg`,
`git`,
`gitoid`,
`gizmoproject`,
`go`,
`gopher`,
`graph`,
`grd`,
`gtalk`,
`h323`,
`ham`,
`hcap`,
`hcp`,
`http`,
`https`,
`hxxp`,
`hxxps`,
`hydrazone`,
`hyper`,
`iax`,
`icap`,
`icon`,
`im`,
`imap`,
`info`,
`iotdisco`,
`ipfs`,
`ipn`,
`ipns`,
`ipp`,
`ipps`,
`irc`,
`irc6`,
`ircs`,
`iris`,
`iris.beep`,
`iris.lwz`,
`iris.xpc`,
`iris.xpcs`,
`isostore`,
`itms`,
`jabber`,
`jar`,
`jms`,
`keyparc`,
`lastfm`,
`lbry`,
`ldap`,
`ldaps`,
`leaptofrogans`,
`lorawan`,
`lpa`,
`lvlt`,
`magnet`,
`mailserver`,
`mailto`,
`maps`,
`market`,
`matrix`,
`message`,
`microsoft.windows.camera`,
`microsoft.windows.camera.multipicker`,
`microsoft.windows.camera.picker`,
`mid`,
`mms`,
`modem`,
`mongodb`,
`moz`,
`ms-access`,
`ms-appinstaller`,
`ms-browser-extension`,
`ms-calculator`,
`ms-drive-to`,
`ms-enrollment`,
`ms-excel`,
`ms-eyecontrolspeech`,
`ms-gamebarservices`,
`ms-gamingoverlay`,
`ms-getoffice`,
`ms-help`,
`ms-infopath`,
`ms-inputapp`,
`ms-lockscreencomponent-config`,
`ms-media-stream-id`,
`ms-meetnow`,
`ms-mixedrealitycapture`,
`ms-mobileplans`,
`ms-newsandinterests`,
`ms-officeapp`,
`ms-people`,
`ms-project`,
`ms-powerpoint`,
`ms-publisher`,
`ms-remotedesktop-launch`,
`ms-restoretabcompanion`,
`ms-screenclip`,
`ms-screensketch`,
`ms-search`,
`ms-search-repair`,
`ms-secondary-screen-controller`,
`ms-secondary-screen-setup`,
`ms-settings`,
`ms-settings-airplanemode`,
`ms-settings-bluetooth`,
`ms-settings-camera`,
`ms-settings-cellular`,
`ms-settings-cloudstorage`,
`ms-settings-connectabledevices`,
`ms-settings-displays-topology`,
`ms-settings-emailandaccounts`,
`ms-settings-language`,
`ms-settings-location`,
`ms-settings-lock`,
`ms-settings-nfctransactions`,
`ms-settings-notifications`,
`ms-settings-power`,
`ms-settings-privacy`,
`ms-settings-proximity`,
`ms-settings-screenrotation`,
`ms-settings-wifi`,
`ms-settings-workplace`,
`ms-spd`,
`ms-stickers`,
`ms-sttoverlay`,
`ms-transit-to`,
`ms-useractivityset`,
`ms-virtualtouchpad`,
`ms-visio`,
`ms-walk-to`,
`ms-whiteboard`,
`ms-whiteboard-cmd`,
`ms-word`,
`msnim`,
`msrp`,
`msrps`,
`mss`,
`mt`,
`mtqp`,
`mumble`,
`mupdate`,
`mvn`,
`news`,
`nfs`,
`ni`,
`nih`,
`nntp`,
`notes`,
`num`,
`ocf`,
`oid`,
`onenote`,
`onenote-cmd`,
`opaquelocktoken`,
`openpgp4fpr`,
`otpauth`,
`p1`,
`pack`,
`palm`,
`paparazzi`,
`payment`,
`payto`,
`pkcs11`,
`platform`,
`pop`,
`pres`,
`prospero`,
`proxy`,
`pwid`,
`psyc`,
`pttp`,
`qb`,
`query`,
`quic-transport`,
`redis`,
`rediss`,
`reload`,
`res`,
`resource`,
`rmi`,
`rsync`,
`rtmfp`,
`rtmp`,
`rtsp`,
`rtsps`,
`rtspu`,
`sarif`,
`secondlife`,
`secret-token`,
`service`,
`session`,
`sftp`,
`sgn`,
`shc`,
`sieve`,
`simpleledger`,
`simplex`,
`sip`,
`sips`,
`skype`,
`smb`,
`smp`,
`sms`,
`smtp`,
`snews`,
`snmp`,
`soap.beep`,
`soap.beeps`,
`soldat`,
`spiffe`,
`spotify`,
`ssb`,
`ssh`,
`starknet`,
`steam`,
`stun`,
`stuns`,
`submit`,
`svn`,
`swh`,
`swid`,
`swidpath`,
`tag`,
`taler`,
`teamspeak`,
`tel`,
`teliaeid`,
`telnet`,
`tftp`,
`things`,
`thismessage`,
`tip`,
`tn3270`,
`tool`,
`turn`,
`turns`,
`tv`,
`udp`,
`unreal`,
`upt`,
`urn`,
`ut2004`,
`uuid-in-package`,
`v-event`,
`vemmi`,
`ventrilo`,
`ves`,
`videotex`,
`vnc`,
`view-source`,
`vscode`,
`vscode-insiders`,
`vsls`,
`w3`,
`wais`,
`web3`,
`wcr`,
`webcal`,
`web+ap`,
`wifi`,
`wpid`,
`ws`,
`wss`,
`wtai`,
`wyciwyg`,
`xcon`,
`xcon-userid`,
`xfire`,
`xmlrpc.beep`,
`xmlrpc.beeps`,
`xmpp`,
`xri`,
`ymsgr`,
`z39.50`,
`z39.50r`,
`z39.50s`,
}
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,24 @@
// Copyright (c) 2015, Daniel Martí <mvdan@mvdan.cc>
// See LICENSE for licensing information
package xurls
// PseudoTLDs is a sorted list of some widely used unofficial TLDs.
//
// relaxedExp appends these to the ASCII TLDs, so they are matched
// case-insensitively and must be followed by a word boundary.
//
// Sources:
// - https://en.wikipedia.org/wiki/Pseudo-top-level_domain
// - https://en.wikipedia.org/wiki/Category:Pseudo-top-level_domains
// - https://tools.ietf.org/html/draft-grothoff-iesg-special-use-p2p-names-00
// - https://www.iana.org/assignments/special-use-domain-names/special-use-domain-names.xhtml
var PseudoTLDs = []string{
`bit`, // Namecoin
`example`, // Example domain
`exit`, // Tor exit node
`gnu`, // GNS by public key
`i2p`, // I2P network
`invalid`, // Invalid domain
`local`, // Local network
`localhost`, // Local network
`test`, // Test domain
`zkey`, // GNS domain name
}
@@ -0,0 +1,7 @@
// Generated by unicodegen
package xurls
const allowedUcsChar = "¡-ᙿᚁ-\u1fff\u200b-‧\u202a-\u202e‰-⁞\u2060-\u2fff、-\ud7ff豈-\ufdcfﷰ-\uffef𐀀-\U0001fffd𠀀-\U0002fffd𰀀-\U0003fffd\U00040000-\U0004fffd\U00050000-\U0005fffd\U00060000-\U0006fffd\U00070000-\U0007fffd\U00080000-\U0008fffd\U00090000-\U0009fffd\U000a0000-\U000afffd\U000b0000-\U000bfffd\U000c0000-\U000cfffd\U000d0000-\U000dfffd\U000e1000-\U000efffd"
const allowedUcsCharMinusPunc = "¢-¦¨-µ¸-¾À-ͽͿ-ΆΈ-ՙՠ-ֈ֊-ֿׁ-ׂׄ-ׇׅ-ײ\u05f5-؈؋؎-ؚ\u061c-\u061dؠ-٩ٮ-ۓە-ۿ\u070e-߶ߺ-\u082f\u083f-\u085d\u085f-ॣ०-९ॱ-ৼ৾-ੵ\u0a77-૯૱-\u0c76౸-ಃಅ-ෳ\u0df5-๎๐-๙\u0e5c-༃༓༕-྄྆-࿏࿕-࿘\u0fdb-၉ၐ-ჺჼ-፟፩-᙭ᙯ-ᙿᚁ-ᛪᛮ-᜴\u1737-៓ៗ៛-\u17ff᠆᠋-\u1943᥆-\u1a1dᨠ-\u1a9fᪧ\u1aae-᭙᭡-\u1bfbᰀ-\u1c3a᱀-ᱽᲀ-Ჿ\u1cc8-᳔᳒-\u1fff\u200b-―‘-‟\u202a-\u202e-›‿-⁀⁄-⁆⁒⁔\u2060-\u2cf8⳽ⴀ-ⵯ\u2d71-ⷿ⸂-⸅⸉-⸊⸌-⸍⸗⸚⸜-⸝⸠-⸩ⸯ⸺-⸻⹀⹂⹐-⹑\u2e53-\u2fff〄-〼〾-ヺー-ꓽꔀ-ꘌꘐ-꙲ꙴ-꙽ꙿ-꛱\ua6f8-ꡳ\ua878-\ua8cd꣐-ꣷꣻꣽ-꤭ꤰ-\ua95eꥠ-꧀\ua9ce-\ua9ddꧠ-\uaa5bꩠ-ꫝꫠ-ꫯꫲ-ꯪ꯬-\ud7ff豈-\ufdcfﷰ-️︗-︘\ufe1a-︯︱-﹄﹇-﹈﹍-\ufe53-﹞﹢-\ufe67﹩\ufe6c-\uff00$(-)+-0-9<->A-[]-⦆「-」ヲ-\uffef𐀀-\U000100ff\U00010103-\U0001039e𐎠-𐏏𐏑-\U0001056e\U00010570-\U00010856𐡘-\U0001091e𐤠-\U0001093e\U00010940-\U00010a4f\U00010a59-𐩾𐪀-𐫯\U00010af7-\U00010b38𐭀-\U00010b98\U00010b9d-𐽔\U00010f5a-𑁆\U0001104e-𑂺\U000110bd\U000110c2-𑄿𑅄-𑅳𑅶-𑇄𑇉-𑇌𑇎-𑇚𑇜\U000111e0-𑈷𑈾-𑊨\U000112aa-𑑊𑑐-𑑙\U0001145c𑑞-𑓅𑓇-𑗀𑗘-𑙀𑙄-\U0001165f\U0001166d-𑜻𑜿-𑠺\U0001183c-𑥃\U00011947-𑧡𑧣-𑨾𑩇-𑪙𑪝\U00011aa3-𑱀\U00011c46-\U00011c6f𑱲-𑻶\U00011ef9-\U00011ffe𒀀-\U0001246f\U00012475-\U00016a6d\U00016a70-𖫴\U00016af6-𖬶𖬼-𖭃𖭅-𖺖\U00016e9b-𖿡𖿣-𛲞\U0001bca0-𝪆\U0001da8c-\U0001e95d\U0001e960-\U0001fffd𠀀-\U0002fffd𰀀-\U0003fffd\U00040000-\U0004fffd\U00050000-\U0005fffd\U00060000-\U0006fffd\U00070000-\U0007fffd\U00080000-\U0008fffd\U00090000-\U0009fffd\U000a0000-\U000afffd\U000b0000-\U000bfffd\U000c0000-\U000cfffd\U000d0000-\U000dfffd\U000e1000-\U000efffd"
@@ -0,0 +1,200 @@
// Copyright (c) 2015, Daniel Martí <mvdan@mvdan.cc>
// See LICENSE for licensing information
// Package xurls extracts urls from plain text using regular expressions.
package xurls
import (
"regexp"
"strings"
"sync"
"unicode/utf8"
)
//go:generate go run ./generate/tldsgen
//go:generate go run ./generate/schemesgen
//go:generate go run ./generate/unicodegen
// Regular expression fragments from which the strict and relaxed expressions
// are assembled. Fragments ending in "Char" are character class contents and
// are meant to be placed inside `[...]`.
const (
// pathCont is based on https://www.rfc-editor.org/rfc/rfc3987#section-2.2
// but does not match separators anywhere or most punctuation in final position,
// to avoid creating asymmetries like
// `Did you know that **<a href="...">https://example.com/**</a> is reserved for documentation?`
// from `Did you know that **https://example.com/** is reserved for documentation?`.
//
// The "mid" classes list characters accepted anywhere in the path, while the
// "end" classes are the subsets accepted as its final character: the end
// variants drop `.`, `!`, `'`, `*`, `,`, `;`, `:`, `@`, `?` and `\`.
unreservedChar = `a-zA-Z0-9\-._~`
endUnreservedChar = `a-zA-Z0-9\-_~`
midSubDelimChar = `!$&'*+,;=`
endSubDelimChar = `$&+=`
midIPathSegmentChar = unreservedChar + `%` + midSubDelimChar + `:@` + allowedUcsChar
endIPathSegmentChar = endUnreservedChar + `%` + endSubDelimChar + allowedUcsCharMinusPunc
// iPrivateChar covers three explicit code point ranges given in \x{...} form.
iPrivateChar = `\x{E000}-\x{F8FF}\x{F0000}-\x{FFFFD}\x{100000}-\x{10FFFD}`
midIChar = `/?#\\` + midIPathSegmentChar + iPrivateChar
endIChar = `/#` + endIPathSegmentChar + iPrivateChar
// wellParen, wellBrack and wellBrace each match a balanced pair of delimiters
// whose content is mid characters and at most one further nested balanced pair.
wellParen = `\((?:[` + midIChar + `]|\([` + midIChar + `]*\))*\)`
wellBrack = `\[(?:[` + midIChar + `]|\[[` + midIChar + `]*\])*\]`
wellBrace = `\{(?:[` + midIChar + `]|\{[` + midIChar + `]*\})*\}`
wellAll = wellParen + `|` + wellBrack + `|` + wellBrace
// pathCont is one or more repetitions of: any run of mid characters followed
// by either a balanced group or a single end character.
pathCont = `(?:[` + midIChar + `]*(?:` + wellAll + `|[` + endIChar + `]))+`

// Host names: labels are made of Unicode letters, marks and numbers.
letter = `\p{L}`
mark = `\p{M}`
number = `\p{N}`
iriChar = letter + mark + number
// iri is a single label: it starts and ends with an iriChar, and may contain
// hyphens only in between.
iri = `[` + iriChar + `](?:[` + iriChar + `\-]*[` + iriChar + `])?`
// subdomain is one or more labels, each followed by a dot.
subdomain = `(?:` + iri + `\.)+`

// octet matches a decimal number from 0 to 255 without leading zeros.
octet = `(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])`
ipv4Addr = octet + `\.` + octet + `\.` + octet + `\.` + octet

// ipv6Addr is based on https://datatracker.ietf.org/doc/html/rfc4291#section-2.2
// with a specific alternative for each valid count of leading 16-bit hexadecimal "chomps"
// that have not been replaced with a `::` elision.
h4 = `[0-9a-fA-F]{1,4}`
ipv6AddrMinusEmpty = `(?:` +
// 7 colon-terminated chomps, followed by a final chomp or the rest of an elision.
`(?:` + h4 + `:){7}(?:` + h4 + `|:)|` +
// 6 chomps, followed by an IPv4 address or elision with final chomp or final elision.
`(?:` + h4 + `:){6}(?:` + ipv4Addr + `|:` + h4 + `|:)|` +
// 5 chomps, followed by an elision with optional IPv4 or up to 2 final chomps.
`(?:` + h4 + `:){5}(?::` + ipv4Addr + `|(?::` + h4 + `){1,2}|:)|` +
// 4 chomps, followed by an elision with optional IPv4 (optionally preceded by a chomp) or
// up to 3 final chomps.
`(?:` + h4 + `:){4}(?:(?::` + h4 + `){0,1}:` + ipv4Addr + `|(?::` + h4 + `){1,3}|:)|` +
// 3 chomps, followed by an elision with optional IPv4 (preceded by up to 2 chomps) or
// up to 4 final chomps.
`(?:` + h4 + `:){3}(?:(?::` + h4 + `){0,2}:` + ipv4Addr + `|(?::` + h4 + `){1,4}|:)|` +
// 2 chomps, followed by an elision with optional IPv4 (preceded by up to 3 chomps) or
// up to 5 final chomps.
`(?:` + h4 + `:){2}(?:(?::` + h4 + `){0,3}:` + ipv4Addr + `|(?::` + h4 + `){1,5}|:)|` +
// 1 chomp, followed by an elision with optional IPv4 (preceded by up to 4 chomps) or
// up to 6 final chomps.
`(?:` + h4 + `:){1}(?:(?::` + h4 + `){0,4}:` + ipv4Addr + `|(?::` + h4 + `){1,6}|:)|` +
// elision, followed by optional IPv4 (preceded by up to 5 chomps) or
// up to 7 final chomps.
// `:` is an intentionally omitted alternative, to avoid matching `::`.
`:(?:(?::` + h4 + `){0,5}:` + ipv4Addr + `|(?::` + h4 + `){1,7})` +
`)`
// ipv6Addr additionally accepts the fully elided address `::`.
ipv6Addr = `(?:` + ipv6AddrMinusEmpty + `|::)`
// ipAddrMinusEmpty is a non-empty IPv6 address or a word-bounded IPv4 address.
// NOTE(review): not referenced by the functions in this file; confirm whether
// another file in the package uses it.
ipAddrMinusEmpty = `(?:` + ipv6AddrMinusEmpty + `|\b` + ipv4Addr + `\b)`
// port is an optional colon followed by zero or more digits.
port = `(?::[0-9]*)?`
)
// AnyScheme can be passed to StrictMatchingScheme to match any possibly valid
// scheme, and not just the known ones.
//
// A scheme followed by "://" must start with an ASCII letter and may continue
// with letters, digits, ".", "-" and "+", following the scheme grammar in
// https://www.rfc-editor.org/rfc/rfc3986#section-3.1. Digits are included so
// that schemes such as "s3" or "ed2k" are matched in full. Schemes followed by
// just ":" are limited to the ones listed in SchemesNoAuthority.
var AnyScheme = `(?:[a-zA-Z][a-zA-Z0-9.\-+]*://|` + anyOf(SchemesNoAuthority...) + `:)`
// SchemesNoAuthority is a sorted list of some well-known url schemes that are
// followed by ":" instead of "://". The list includes both officially
// registered and unofficial schemes.
//
// Both strictExp and AnyScheme build their ":"-terminated alternative from
// this list.
var SchemesNoAuthority = []string{
`bitcoin`, // Bitcoin
`cid`, // Content-ID
`file`, // Files
`magnet`, // Torrent magnets
`mailto`, // Mail
`mid`, // Message-ID
`sms`, // SMS
`tel`, // Telephone
`xmpp`, // XMPP
}
// SchemesUnofficial is a sorted list of some well-known url schemes which
// aren't officially registered just yet. They tend to correspond to software.
//
// strictExp accepts these, followed by "://", in addition to Schemes.
//
// Mostly collected from https://en.wikipedia.org/wiki/List_of_URI_schemes#Unofficial_but_common_URI_schemes.
var SchemesUnofficial = []string{
`gemini`, // gemini
`jdbc`, // Java database Connectivity
`moz-extension`, // Firefox extension
`postgres`, // PostgreSQL (short form)
`postgresql`, // PostgreSQL
`slack`, // Slack
`zoommtg`, // Zoom (desktop)
`zoomus`, // Zoom (mobile)
}
// The regular expressions are compiled when the API is first called.
// Any subsequent calls will use the same regular expression pointers.
//
// We do not need to make a copy of them for each API call,
// as Copy is now only useful if one copy calls Longest but not another,
// and we always call Longest after compiling the regular expression.
var (
// strictRe is set by Strict, guarded by strictInit.
strictRe *regexp.Regexp
strictInit sync.Once
// relaxedRe is set by Relaxed, guarded by relaxedInit.
relaxedRe *regexp.Regexp
relaxedInit sync.Once
)
// anyOf returns a non-capturing group that matches any one of strs literally.
// Every string is escaped with regexp.QuoteMeta; with no arguments the result
// is the empty group `(?:)`.
func anyOf(strs ...string) string {
	quoted := make([]string, len(strs))
	for i := range strs {
		quoted[i] = regexp.QuoteMeta(strs[i])
	}
	return "(?:" + strings.Join(quoted, "|") + ")"
}
// strictExp returns the expression used by Strict: a known scheme followed by
// pathCont. Schemes from Schemes and SchemesUnofficial must be followed by
// "://", while those from SchemesNoAuthority are followed by ":" alone.
func strictExp() string {
	withAuthority := anyOf(Schemes...) + `|` + anyOf(SchemesUnofficial...)
	withoutAuthority := anyOf(SchemesNoAuthority...)
	// The (?i) flag sits at the start of the outer group, so it covers both
	// alternatives but not the path that follows the group.
	return `(?:(?i)(?:` + withAuthority + `)://|` + withoutAuthority + `:)` + pathCont
}
// relaxedExp returns the expression used by Relaxed: everything strictExp
// matches, plus scheme-less web URLs, email addresses and bare IPv6 addresses.
func relaxedExp() string {
	// Find the first TLD that starts with a non-ASCII byte; the TLDs before it
	// are treated as ASCII and the rest as unicode.
	split := -1
	for i, tld := range TLDs {
		if tld[0] >= utf8.RuneSelf {
			split = i
			break
		}
	}
	var asciiTLDs, unicodeTLDs []string
	if split >= 0 {
		// The capacity is capped so that appending below cannot write into TLDs.
		asciiTLDs = TLDs[:split:split]
		unicodeTLDs = TLDs[split:]
	}

	const punycode = `xn--[a-z0-9-]+`
	// Use \b to make sure ASCII TLDs are immediately followed by a word break.
	// We can't do that with unicode TLDs, as they don't see following
	// whitespace as a word break.
	asciiAlt := anyOf(append(asciiTLDs, PseudoTLDs...)...) + `\b`
	unicodeAlt := anyOf(unicodeTLDs...)
	tlds := `(?:(?i)` + punycode + `|` + asciiAlt + `|` + unicodeAlt + `)`

	domain := subdomain + tlds
	hostName := `(?:` + domain + `|\[` + ipv6Addr + `\]|\b` + ipv4Addr + `\b)`
	webURL := hostName + port + `(?:/` + pathCont + `|/)?`
	email := `[a-zA-Z0-9._%\-+]+@` + domain

	return strings.Join([]string{strictExp(), webURL, email, ipv6AddrMinusEmpty}, `|`)
}
// Strict produces a regexp that matches any URL with a scheme in the Schemes,
// SchemesUnofficial, or SchemesNoAuthority lists.
//
// The regexp is compiled on the first call and set to leftmost-longest
// matching; later calls return the same pointer.
func Strict() *regexp.Regexp {
	strictInit.Do(func() {
		re := regexp.MustCompile(strictExp())
		re.Longest()
		strictRe = re
	})
	return strictRe
}
// Relaxed produces a regexp that matches any URL matched by Strict, plus any
// URL with no scheme or email address.
//
// The regexp is compiled on the first call and set to leftmost-longest
// matching; later calls return the same pointer.
func Relaxed() *regexp.Regexp {
	relaxedInit.Do(func() {
		re := regexp.MustCompile(relaxedExp())
		re.Longest()
		relaxedRe = re
	})
	return relaxedRe
}
// StrictMatchingScheme produces a regexp similar to Strict, but requiring that
// the scheme match the given regular expression. See AnyScheme too.
//
// The scheme expression is matched case-insensitively, while the path that
// follows it is not. An error is returned if the combined expression does not
// compile.
func StrictMatchingScheme(exp string) (*regexp.Regexp, error) {
	re, err := regexp.Compile(`(?i)(?:` + exp + `)(?-i)` + pathCont)
	if err == nil {
		re.Longest()
		return re, nil
	}
	return nil, err
}
@@ -0,0 +1,469 @@
// Copyright (c) 2015, Daniel Martí <mvdan@mvdan.cc>
// See LICENSE for licensing information
package xurls
import (
"fmt"
"regexp"
"sync"
"testing"
)
// testCase pairs an input text with the match expected from a regexp.
type testCase struct {
// in is the text that gets searched.
in string
// want is interpreted by wantStr: a string is the exact expected match, true
// means the whole of in is expected to match, and any other value (such as
// nil or false) means that no match is expected.
want interface{}
}
// wantStr resolves the want field of a testCase into the expected match:
// a string is returned as is, true yields in, and anything else yields "".
func wantStr(in string, want interface{}) string {
	if s, isStr := want.(string); isStr {
		return s
	}
	if whole, isBool := want.(bool); isBool && whole {
		return in
	}
	return ""
}
// doTest runs every case as a subtest named "<name>/<index>". Each input is
// searched twice, once as is and once wrapped in newlines, and the first match
// found by re must equal the expectation computed by wantStr.
func doTest(t *testing.T, name string, re *regexp.Regexp, cases []testCase) {
	for idx := range cases {
		tc := cases[idx]
		label := fmt.Sprintf("%s/%03d", name, idx)
		t.Run(label, func(t *testing.T) {
			expected := wantStr(tc.in, tc.want)
			for _, pad := range [...]string{"", "\n"} {
				input := pad + tc.in + pad
				if got := re.FindString(input); got != expected {
					t.Errorf(`FindString(%q) got %q, want %q`, input, got, expected)
				}
			}
		})
	}
}
var constantTestCases = []testCase{
{``, nil},
{` `, nil},
{`:`, nil},
{`::`, nil},
{`:::`, nil},
{`::::`, nil},
{`.`, nil},
{`..`, nil},
{`...`, nil},
{`1.1`, nil},
{`.1.`, nil},
{`1.1.1`, nil},
{`1:1`, nil},
{`:1:`, nil},
{`1:1:1`, nil},
{`://`, nil},
{`foo`, nil},
{`foo:`, nil},
{`mailto:`, nil},
{`foo://`, nil},
{`http://`, nil},
{`http:// foo`, nil},
{`http:// foo`, nil},
{`:foo`, nil},
{`://foo`, nil},
{`foorandom:bar`, nil},
{`foo.randombar`, nil},
{`zzz.`, nil},
{`.zzz`, nil},
{`zzz.zzz`, nil},
{`/some/path`, nil},
{`rel/path`, nil},
{`localhost`, nil},
{`com`, nil},
{`.com`, nil},
{`com.`, nil},
{`http`, nil},
{`http://foo`, true},
{`http://FOO`, true},
{`http://FAÀ`, true},
{`https://localhost`, true},
{`mailto:foo`, true},
{`MAILTO:foo`, true},
{`sms:123`, true},
{`xmpp:foo@bar`, true},
{`bitcoin:Addr23?amount=1&message=foo`, true},
{`cid:foo-32x32.v2_fe0f1423.png`, true},
{`mid:960830.1639@XIson.com`, true},
{`http://foo.com`, true},
{`http://foo.co.uk`, true},
{`http://foo.random`, true},
{` http://foo.com/bar `, `http://foo.com/bar`},
{` http://foo.com/bar more`, `http://foo.com/bar`},
{`<http://foo.com/bar>`, `http://foo.com/bar`},
{`<http://foo.com/bar>more`, `http://foo.com/bar`},
{`.http://foo.com/bar.`, `http://foo.com/bar`},
{`.http://foo.com/bar.more`, `http://foo.com/bar.more`},
{`,http://foo.com/bar,`, `http://foo.com/bar`},
{`,http://foo.com/bar,more`, `http://foo.com/bar,more`},
{`*http://foo.com/bar*`, `http://foo.com/bar`},
{`*http://foo.com/bar*more`, `http://foo.com/bar*more`},
{`_http://foo.com/bar_`, `http://foo.com/bar_`},
{`_http://foo.com/bar_more`, `http://foo.com/bar_more`},
{`(http://foo.com/bar)`, `http://foo.com/bar`},
{`(http://foo.com/bar)more`, `http://foo.com/bar`},
{`[http://foo.com/bar]`, `http://foo.com/bar`},
{`[http://foo.com/bar]more`, `http://foo.com/bar`},
{`'http://foo.com/bar'`, `http://foo.com/bar`},
{`'http://foo.com/bar'more`, `http://foo.com/bar'more`},
{`"http://foo.com/bar"`, `http://foo.com/bar`},
{`"http://foo.com/bar"more`, `http://foo.com/bar`},
{`{"url":"http://foo.com/bar"}`, `http://foo.com/bar`},
{`{"before":"foo","url":"http://foo.com/bar","after":"bar"}`, `http://foo.com/bar`},
{`http://a.b/a0/-+_&~*%=#@.,:;'?![]()a`, true},
{`http://a.b/a0/$€¥`, true},
{`http://✪foo.bar/pa✪th©more`, true},
{`http://foo.bar/path/`, true},
{`http://foo.bar/path-`, true},
{`http://foo.bar/path+`, true},
{`http://foo.bar/path&`, true},
{`http://foo.bar/path~`, true},
{`http://foo.bar/path%`, true},
{`http://foo.bar/path=`, true},
{`http://foo.bar/path#`, true},
{`http://foo.bar/path.`, `http://foo.bar/path`},
{`http://foo.bar/path,`, `http://foo.bar/path`},
{`http://foo.bar/path:`, `http://foo.bar/path`},
{`http://foo.bar/path;`, `http://foo.bar/path`},
{`http://foo.bar/path'`, `http://foo.bar/path`},
{`http://foo.bar/path?`, `http://foo.bar/path`},
{`http://foo.bar/path!`, `http://foo.bar/path`},
{`http://foo.bar/path@`, `http://foo.bar/path`},
{`http://foo.bar/path|`, `http://foo.bar/path`},
{`http://foo.bar/path|more`, `http://foo.bar/path`},
{`http://foo.bar/path<`, `http://foo.bar/path`},
{`http://foo.bar/path<more`, `http://foo.bar/path`},
{`http://foo.com/path_(more)`, true},
{`(http://foo.com/path_(more))`, `http://foo.com/path_(more)`},
{`http://foo.com/path_(even)-(more)`, true},
{`http://foo.com/path_(even)(more)`, true},
{`http://foo.com/path_(even_(nested))`, true},
{`(http://foo.com/path_(even_(nested)))`, `http://foo.com/path_(even_(nested))`},
{`http://foo.com/path_[more]`, true},
{`[http://foo.com/path_[more]]`, `http://foo.com/path_[more]`},
{`http://foo.com/path_[even]-[more]`, true},
{`http://foo.com/path_[even][more]`, true},
{`http://foo.com/path_[even_[nested]]`, true},
{`[http://foo.com/path_[even_[nested]]]`, `http://foo.com/path_[even_[nested]]`},
{`http://foo.com/path_{more}`, true},
{`{http://foo.com/path_{more}}`, `http://foo.com/path_{more}`},
{`http://foo.com/path_{even}-{more}`, true},
{`http://foo.com/path_{even}{more}`, true},
{`http://foo.com/path_{even_{nested}}`, true},
{`{http://foo.com/path_{even_{nested}}}`, `http://foo.com/path_{even_{nested}}`},
{`http://foo.com/path#fragment`, true},
{`http://foo.com/emptyfrag#`, true},
{`http://foo.com/spaced%20path`, true},
{`http://foo.com/?p=spaced%20param`, true},
{`http://test.foo.com/`, true},
{`http://foo.com/path`, true},
{`http://foo.com:8080/path`, true},
{`http://1.1.1.1/path`, true},
{`http://1.1.1.1:8080/path`, true},
{`http://[1080::8:800:200c:417a]/path`, true},
{`http://[1080::8:800:200c:417a]:8080/path`, true},
// scheme://IPv6_addr is not valid per RFC 3987, but is supported anyway (for now).
{`http://1080::8:800:200c:417a/path`, true},
{`http://2001.db8:0/path`, true},
{`http://中国.中国/中国`, true},
{`http://中国.中国/foo中国`, true},
{`http://उदाहरण.परीकषा`, true},
{`http://xn-foo.xn--p1acf/path`, true},
{`what is http://foo.com?`, `http://foo.com`},
{`go visit http://foo.com/path.`, `http://foo.com/path`},
{`go visit http://foo.com/path...`, `http://foo.com/path`},
{`what is http://foo.com/path?`, `http://foo.com/path`},
{`the http://foo.com!`, `http://foo.com`},
{`https://test.foo.bar/path?a=b`, `https://test.foo.bar/path?a=b`},
{`ftp://user@foo.bar`, true},
{`http://foo.com/base64-bCBwbGVhcw==`, true},
{`http://foo.com/`, true},
{`http://foo.com/🐼`, true},
{`https://shmibbles.me/tmp/自殺でも?.png`, true},
{`randomtexthttp://foo.bar/etc`, "http://foo.bar/etc"},
{`postgres://user:pass@host.com:5432/path?k=v#f`, true},
{`postgres://user:pass@host.com:5432/path?k=v#f`, true},
{`zoommtg://zoom.us/join?confno=1234&pwd=xxx`, true},
{`zoomus://zoom.us/join?confno=1234&pwd=xxx`, true},
}
// TestRegexes runs constantTestCases against both Relaxed and Strict, and then
// checks expectations specific to each one: inputs without a scheme (domains,
// IP addresses, emails) that Relaxed handles, and scheme-less inputs that
// Strict must not match.
func TestRegexes(t *testing.T) {
doTest(t, "Relaxed", Relaxed(), constantTestCases)
doTest(t, "Strict", Strict(), constantTestCases)
doTest(t, "Relaxed2", Relaxed(), []testCase{
{`foo.a`, nil},
{`foo.com`, true},
{`foo.com bar.com`, `foo.com`},
{`foo.com-foo`, `foo.com`},
{`foo.company`, true},
{`foo.comrandom`, nil},
{`some.guy`, nil},
{`foo.example`, true},
{`foo.i2p`, true},
{`foo.local`, true},
{`foo.onion`, true},
{`中国.中国`, true},
{`中国.中国/foo中国`, true},
{`test.联通`, true},
{`test.联通 extra`, `test.联通`},
{`test.xn--8y0a063a`, true},
{`test.xn--8y0a063a/foobar`, true},
{`test.xn-foo`, nil},
{`test.xn--`, nil},
{`foo.com/`, true},
{`1.1.1.1`, true},
{`10.50.23.250`, true},
{`121.1.1.1`, true},
{`255.1.1.1`, true},
{`300.1.1.1`, nil},
{`1.1.1.300`, nil},
{`foo@1.2.3.4`, `1.2.3.4`},
// https://www.iana.org/assignments/iana-ipv6-special-registry/iana-ipv6-special-registry.xhtml
{`::1`, true},
//{`::`, true},
{`::ffff:0:0`, true},
{`64:ff9b::`, true},
{`64:ff9b:1::`, true},
{`100::`, true},
{`2001::`, true},
{`2001:1::1`, true},
{`2001:1::2`, true},
{`2001:2::`, true},
{`2001:3::`, true},
{`2001:4:112::`, true},
{`2001:10::`, true},
{`2001:20::`, true},
{`2001:db8::`, true},
{`2002::`, true},
{`2620:4f:8000::`, true},
{`fc00::`, true},
{`fe80::`, true},
// https://datatracker.ietf.org/doc/html/rfc4291#section-2.2
{`ABCD:EF01:2345:6789:ABCD:EF01:2345:6789`, true},
{`2001:DB8:0:0:8:800:200C:417A`, true},
{`2001:DB8:0:0:8:800:200C:417A`, true}, // a unicast address
{`FF01:0:0:0:0:0:0:101`, true}, // a multicast address
{`0:0:0:0:0:0:0:1`, true}, // the loopback address
{`0:0:0:0:0:0:0:0`, true}, // the unspecified address
{`2001:DB8::8:800:200C:417A`, true}, // a unicast address
{`FF01::101`, true}, // a multicast address
{`::1`, true}, // the loopback address
//{`::`, true}, // the unspecified address
{`::`, nil},
{`0:0:0:0:0:0:13.1.68.3`, true},
{`0:0:0:0:0:FFFF:129.144.52.38`, true},
{`::13.1.68.3`, true},
{`::FFFF:129.144.52.38`, true},
// https://datatracker.ietf.org/doc/html/rfc5952#section-1
{`2001:db8:0:0:1:0:0:1`, true},
{`2001:0db8:0:0:1:0:0:1`, true},
{`2001:db8::1:0:0:1`, true},
{`2001:db8::0:1:0:0:1`, true},
{`2001:0db8::1:0:0:1`, true},
{`2001:db8:0:0:1::1`, true},
{`2001:db8:0000:0:1::1`, true},
{`2001:DB8:0:0:1::1`, true},
// https://datatracker.ietf.org/doc/html/rfc5952#section-2.1
{`2001:db8:aaaa:bbbb:cccc:dddd:eeee:0001`, true},
{`2001:db8:aaaa:bbbb:cccc:dddd:eeee:001`, true},
{`2001:db8:aaaa:bbbb:cccc:dddd:eeee:01`, true},
{`2001:db8:aaaa:bbbb:cccc:dddd:eeee:1`, true},
// https://datatracker.ietf.org/doc/html/rfc5952#section-2.2
{`2001:db8:aaaa:bbbb:cccc:dddd::1`, true},
{`2001:db8:aaaa:bbbb:cccc:dddd:0:1`, true},
{`2001:db8:0:0:0::1`, true},
{`2001:db8:0:0::1`, true},
{`2001:db8:0::1`, true},
{`2001:db8::1`, true},
{`2001:db8::aaaa:0:0:1`, true},
{`2001:db8:0:0:aaaa::1`, true},
// https://datatracker.ietf.org/doc/html/rfc5952#section-2.3
{`2001:db8:aaaa:bbbb:cccc:dddd:eeee:aaaa`, true},
{`2001:db8:aaaa:bbbb:cccc:dddd:eeee:AAAA`, true},
{`2001:db8:aaaa:bbbb:cccc:dddd:eeee:AaAa`, true},
// An IP address in URI host position must be bracketed unless it is IPv4.
// https://www.rfc-editor.org/rfc/rfc3986#section-3.2.2
// TODO: Implement this restriction, ideally without matching the `http://1080` prefix.
//{`http://1080::8:800:200c:417a/path`, `1080::8:800:200c:417a`},
{`foo.com:8080`, true},
{`foo.com:8080/path`, true},
{`test.foo.com`, true},
{`test.foo.com/path`, true},
{`test.foo.com/path/more/`, true},
{`TEST.FOO.COM/PATH`, true},
{`TEST.FÓO.COM/PÁTH`, true},
{`foo.com/path_(more)`, true},
{`foo.com/path_(even)_(more)`, true},
{`foo.com/path_(more)/more`, true},
{`foo.com/path_(more)/end)`, `foo.com/path_(more)/end`},
{`www.foo.com`, true},
{` foo.com/bar `, `foo.com/bar`},
{` foo.com/bar more`, `foo.com/bar`},
{`<foo.com/bar>`, `foo.com/bar`},
{`<foo.com/bar>more`, `foo.com/bar`},
{`,foo.com/bar.`, `foo.com/bar`},
{`,foo.com/bar.more`, `foo.com/bar.more`},
{`,foo.com/bar,`, `foo.com/bar`},
{`,foo.com/bar,more`, `foo.com/bar,more`},
{`(foo.com/bar)`, `foo.com/bar`},
{`"foo.com/bar'`, `foo.com/bar`},
{`"foo.com/bar'more`, `foo.com/bar'more`},
{`"foo.com/bar"`, `foo.com/bar`},
{`what is foo.com?`, `foo.com`},
{`the foo.com!`, `foo.com`},
{`foo@bar`, nil},
{`foo@bar.a`, nil},
{`foo@bar.com`, true},
{`foo@sub.bar.com`, true},
{`foo@bar.com bar@bar.com`, `foo@bar.com`},
{`foo@bar.onion`, true},
{`foo@中国.中国`, true},
{`foo@test.bar.com`, true},
{`FOO@TEST.BAR.COM`, true},
{`foo@bar.com/path`, `foo@bar.com`},
{`foo+test@bar.com`, true},
{`foo+._%-@bar.com`, true},
})
doTest(t, "Strict2", Strict(), []testCase{
{`http:// foo.com`, nil},
{`foo.a`, nil},
{`foo.com`, nil},
{`foo.com/`, nil},
{`1.1.1.1`, nil},
{`3ffe:2a00:100:7031::1`, nil},
{`test.foo.com:8080/path`, nil},
{`foo@bar.com`, nil},
// An IP address in URI host position must be bracketed unless it is IPv4.
// https://www.rfc-editor.org/rfc/rfc3986#section-3.2.2
// TODO: Implement this restriction, ideally without matching the `http://1080` prefix.
//{`http://1080::8:800:200c:417a/path`, nil},
})
}
// TestStrictMatchingSchemeError checks that StrictMatchingScheme returns an
// error exactly for those scheme expressions that do not compile.
func TestStrictMatchingSchemeError(t *testing.T) {
	cases := []struct {
		exp     string
		wantErr bool
	}{
		{`http://`, false},
		{`https?://`, false},
		{`http://|mailto:`, false},
		{`http://(`, true},
	}
	for _, tc := range cases {
		_, err := StrictMatchingScheme(tc.exp)
		failed := err != nil
		switch {
		case tc.wantErr && !failed:
			t.Errorf(`StrictMatchingScheme("%s") did not error as expected`, tc.exp)
		case !tc.wantErr && failed:
			t.Errorf(`StrictMatchingScheme("%s") unexpectedly errored`, tc.exp)
		}
	}
}
// TestStrictMatchingScheme checks that a regexp built from a custom scheme
// expression matches only the listed schemes, regardless of their case.
func TestStrictMatchingScheme(t *testing.T) {
	strictMatching, err := StrictMatchingScheme("http://|ftps?://|mailto:")
	if err != nil {
		// Stop here rather than letting doTest dereference a nil regexp.
		t.Fatalf("StrictMatchingScheme returned an unexpected error: %v", err)
	}
	doTest(t, "StrictMatchingScheme", strictMatching, []testCase{
		{`foo.com`, nil},
		{`foo@bar.com`, nil},
		{`http://foo`, true},
		{`Http://foo`, true},
		{`https://foo`, nil},
		{`ftp://foo`, true},
		{`ftps://foo`, true},
		{`mailto:foo`, true},
		{`MAILTO:foo`, true},
		{`sms:123`, nil},
	})
}
// TestStrictMatchingSchemeAny checks that a regexp built from AnyScheme
// matches URLs whose schemes are not in the known lists.
func TestStrictMatchingSchemeAny(t *testing.T) {
	strictMatching, err := StrictMatchingScheme(AnyScheme)
	if err != nil {
		// Stop here rather than letting doTest dereference a nil regexp.
		t.Fatalf("StrictMatchingScheme(AnyScheme) returned an unexpected error: %v", err)
	}
	doTest(t, "StrictMatchingScheme", strictMatching, []testCase{
		{`http://foo`, true},
		{`git+https://foo`, true},
		{`randomtexthttp://foo.bar/etc`, true},
		{`mailto:foo`, true},
	})
}
// bench reports allocations and throughput while repeatedly finding all
// matches in str. The regexp is obtained by calling re on every iteration, and
// the iterations are spread over goroutines through b.RunParallel.
func bench(b *testing.B, re func() *regexp.Regexp, str string) {
	findAll := func(pb *testing.PB) {
		for pb.Next() {
			_ = re().FindAllString(str, -1)
		}
	}
	b.SetBytes(int64(len(str)))
	b.ReportAllocs()
	b.RunParallel(findAll)
}
// inputNone is the benchmark input for the case of plain text with no URLs.
const inputNone = `
foo bar
yaml: "as well"
some more plaintext
which does not contain any urls.
`
// inputMany is the benchmark input that mixes plain words with several URLs,
// with and without schemes.
const inputMany = `
foo bar http://foo.foo https://192.168.1.1/path
foo.com bitcoin:address ftp://
xmpp:foo@bar.com
`
// BenchmarkStrict_none measures Strict on inputNone.
func BenchmarkStrict_none(b *testing.B) {
bench(b, Strict, inputNone)
}
// BenchmarkStrict_many measures Strict on inputMany.
func BenchmarkStrict_many(b *testing.B) {
bench(b, Strict, inputMany)
}
// BenchmarkRelaxed_none measures Relaxed on inputNone.
func BenchmarkRelaxed_none(b *testing.B) {
bench(b, Relaxed, inputNone)
}
// BenchmarkRelaxed_many measures Relaxed on inputMany.
func BenchmarkRelaxed_many(b *testing.B) {
bench(b, Relaxed, inputMany)
}
// rxMatchingScheme caches the regexp built by matchingScheme, and
// rxMatchingSchemeOnce ensures that it is built a single time.
var (
rxMatchingScheme *regexp.Regexp
rxMatchingSchemeOnce sync.Once
)
// matchingScheme returns the regexp for StrictMatchingScheme("https?://"),
// building it on the first call and panicking if that fails. Its signature
// matches Strict and Relaxed so that it can be passed to bench.
func matchingScheme() *regexp.Regexp {
	rxMatchingSchemeOnce.Do(func() {
		var err error
		if rxMatchingScheme, err = StrictMatchingScheme("https?://"); err != nil {
			panic(err)
		}
	})
	return rxMatchingScheme
}
// BenchmarkStrictMatchingScheme_none measures the matchingScheme regexp on inputNone.
func BenchmarkStrictMatchingScheme_none(b *testing.B) {
bench(b, matchingScheme, inputNone)
}
// BenchmarkStrictMatchingScheme_many measures the matchingScheme regexp on inputMany.
func BenchmarkStrictMatchingScheme_many(b *testing.B) {
bench(b, matchingScheme, inputMany)
}