Files
2024-09-19 21:38:24 -04:00

470 lines
14 KiB
Go
Raw Permalink Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// Copyright (c) 2015, Daniel Martí <mvdan@mvdan.cc>
// See LICENSE for licensing information
package xurls
import (
"fmt"
"regexp"
"sync"
"testing"
)
// testCase pairs an input string with the match expected from a regexp.
// The want field is decoded by wantStr: a string is the exact expected match,
// true means the whole input is expected to match, and any other value
// (such as nil or false) means that no match is expected.
type testCase struct {
in string // text searched by the regexp under test
want interface{} // expected outcome; string, bool or nil as described above
}
// wantStr converts a test case's loosely typed expectation into the string
// that a match is compared against for the input in.
//
// A string expectation is returned verbatim. A true expectation means the
// entire input must match, so in itself is returned. Every other value
// (false, nil, or a value of any other type) means "no match" and yields the
// empty string.
func wantStr(in string, want interface{}) string {
	if exact, isString := want.(string); isString {
		return exact
	}
	if whole, isBool := want.(bool); isBool && whole {
		return in
	}
	return ""
}
// doTest runs every case as its own subtest named "<name>/<index>", with the
// index zero-padded to three digits.
//
// Each case is checked twice: once with the raw input and once with the input
// wrapped in newlines. In both runs re.FindString must return the string
// produced by wantStr for that case; a mismatch is reported with t.Errorf so
// the remaining cases still run.
func doTest(t *testing.T, name string, re *regexp.Regexp, cases []testCase) {
	paddings := []string{"", "\n"}
	for idx := range cases {
		tc := cases[idx]
		label := fmt.Sprintf("%s/%03d", name, idx)
		t.Run(label, func(t *testing.T) {
			expected := wantStr(tc.in, tc.want)
			for _, pad := range paddings {
				input := pad + tc.in + pad
				if found := re.FindString(input); found != expected {
					t.Errorf(`FindString(%q) got %q, want %q`, input, found, expected)
				}
			}
		})
	}
}
// constantTestCases holds the cases that TestRegexes runs against both the
// Relaxed() and the Strict() regexps, so every expectation here must hold for
// both of them. See testCase for how the second field of each entry is read.
var constantTestCases = []testCase{
{``, nil},
{` `, nil},
{`:`, nil},
{`::`, nil},
{`:::`, nil},
{`::::`, nil},
{`.`, nil},
{`..`, nil},
{`...`, nil},
{`1.1`, nil},
{`.1.`, nil},
{`1.1.1`, nil},
{`1:1`, nil},
{`:1:`, nil},
{`1:1:1`, nil},
{`://`, nil},
{`foo`, nil},
{`foo:`, nil},
{`mailto:`, nil},
{`foo://`, nil},
{`http://`, nil},
{`http:// foo`, nil},
{`http:// foo`, nil},
{`:foo`, nil},
{`://foo`, nil},
{`foorandom:bar`, nil},
{`foo.randombar`, nil},
{`zzz.`, nil},
{`.zzz`, nil},
{`zzz.zzz`, nil},
{`/some/path`, nil},
{`rel/path`, nil},
{`localhost`, nil},
{`com`, nil},
{`.com`, nil},
{`com.`, nil},
{`http`, nil},
{`http://foo`, true},
{`http://FOO`, true},
{`http://FAÀ`, true},
{`https://localhost`, true},
{`mailto:foo`, true},
{`MAILTO:foo`, true},
{`sms:123`, true},
{`xmpp:foo@bar`, true},
{`bitcoin:Addr23?amount=1&message=foo`, true},
{`cid:foo-32x32.v2_fe0f1423.png`, true},
{`mid:960830.1639@XIson.com`, true},
{`http://foo.com`, true},
{`http://foo.co.uk`, true},
{`http://foo.random`, true},
{` http://foo.com/bar `, `http://foo.com/bar`},
{` http://foo.com/bar more`, `http://foo.com/bar`},
{`<http://foo.com/bar>`, `http://foo.com/bar`},
{`<http://foo.com/bar>more`, `http://foo.com/bar`},
{`.http://foo.com/bar.`, `http://foo.com/bar`},
{`.http://foo.com/bar.more`, `http://foo.com/bar.more`},
{`,http://foo.com/bar,`, `http://foo.com/bar`},
{`,http://foo.com/bar,more`, `http://foo.com/bar,more`},
{`*http://foo.com/bar*`, `http://foo.com/bar`},
{`*http://foo.com/bar*more`, `http://foo.com/bar*more`},
{`_http://foo.com/bar_`, `http://foo.com/bar_`},
{`_http://foo.com/bar_more`, `http://foo.com/bar_more`},
{`(http://foo.com/bar)`, `http://foo.com/bar`},
{`(http://foo.com/bar)more`, `http://foo.com/bar`},
{`[http://foo.com/bar]`, `http://foo.com/bar`},
{`[http://foo.com/bar]more`, `http://foo.com/bar`},
{`'http://foo.com/bar'`, `http://foo.com/bar`},
{`'http://foo.com/bar'more`, `http://foo.com/bar'more`},
{`"http://foo.com/bar"`, `http://foo.com/bar`},
{`"http://foo.com/bar"more`, `http://foo.com/bar`},
{`{"url":"http://foo.com/bar"}`, `http://foo.com/bar`},
{`{"before":"foo","url":"http://foo.com/bar","after":"bar"}`, `http://foo.com/bar`},
{`http://a.b/a0/-+_&~*%=#@.,:;'?![]()a`, true},
{`http://a.b/a0/$€¥`, true},
{`http://✪foo.bar/pa✪th©more`, true},
{`http://foo.bar/path/`, true},
{`http://foo.bar/path-`, true},
{`http://foo.bar/path+`, true},
{`http://foo.bar/path&`, true},
{`http://foo.bar/path~`, true},
{`http://foo.bar/path%`, true},
{`http://foo.bar/path=`, true},
{`http://foo.bar/path#`, true},
{`http://foo.bar/path.`, `http://foo.bar/path`},
{`http://foo.bar/path,`, `http://foo.bar/path`},
{`http://foo.bar/path:`, `http://foo.bar/path`},
{`http://foo.bar/path;`, `http://foo.bar/path`},
{`http://foo.bar/path'`, `http://foo.bar/path`},
{`http://foo.bar/path?`, `http://foo.bar/path`},
{`http://foo.bar/path!`, `http://foo.bar/path`},
{`http://foo.bar/path@`, `http://foo.bar/path`},
{`http://foo.bar/path|`, `http://foo.bar/path`},
{`http://foo.bar/path|more`, `http://foo.bar/path`},
{`http://foo.bar/path<`, `http://foo.bar/path`},
{`http://foo.bar/path<more`, `http://foo.bar/path`},
{`http://foo.com/path_(more)`, true},
{`(http://foo.com/path_(more))`, `http://foo.com/path_(more)`},
{`http://foo.com/path_(even)-(more)`, true},
{`http://foo.com/path_(even)(more)`, true},
{`http://foo.com/path_(even_(nested))`, true},
{`(http://foo.com/path_(even_(nested)))`, `http://foo.com/path_(even_(nested))`},
{`http://foo.com/path_[more]`, true},
{`[http://foo.com/path_[more]]`, `http://foo.com/path_[more]`},
{`http://foo.com/path_[even]-[more]`, true},
{`http://foo.com/path_[even][more]`, true},
{`http://foo.com/path_[even_[nested]]`, true},
{`[http://foo.com/path_[even_[nested]]]`, `http://foo.com/path_[even_[nested]]`},
{`http://foo.com/path_{more}`, true},
{`{http://foo.com/path_{more}}`, `http://foo.com/path_{more}`},
{`http://foo.com/path_{even}-{more}`, true},
{`http://foo.com/path_{even}{more}`, true},
{`http://foo.com/path_{even_{nested}}`, true},
{`{http://foo.com/path_{even_{nested}}}`, `http://foo.com/path_{even_{nested}}`},
{`http://foo.com/path#fragment`, true},
{`http://foo.com/emptyfrag#`, true},
{`http://foo.com/spaced%20path`, true},
{`http://foo.com/?p=spaced%20param`, true},
{`http://test.foo.com/`, true},
{`http://foo.com/path`, true},
{`http://foo.com:8080/path`, true},
{`http://1.1.1.1/path`, true},
{`http://1.1.1.1:8080/path`, true},
{`http://[1080::8:800:200c:417a]/path`, true},
{`http://[1080::8:800:200c:417a]:8080/path`, true},
// scheme://IPv6_addr is not valid per RFC 3987, but is supported anyway (for now).
{`http://1080::8:800:200c:417a/path`, true},
{`http://2001.db8:0/path`, true},
{`http://中国.中国/中国`, true},
{`http://中国.中国/foo中国`, true},
{`http://उदाहरण.परीकषा`, true},
{`http://xn-foo.xn--p1acf/path`, true},
{`what is http://foo.com?`, `http://foo.com`},
{`go visit http://foo.com/path.`, `http://foo.com/path`},
{`go visit http://foo.com/path...`, `http://foo.com/path`},
{`what is http://foo.com/path?`, `http://foo.com/path`},
{`the http://foo.com!`, `http://foo.com`},
{`https://test.foo.bar/path?a=b`, `https://test.foo.bar/path?a=b`},
{`ftp://user@foo.bar`, true},
{`http://foo.com/base64-bCBwbGVhcw==`, true},
{`http://foo.com/`, true},
{`http://foo.com/🐼`, true},
{`https://shmibbles.me/tmp/自殺でも?.png`, true},
{`randomtexthttp://foo.bar/etc`, "http://foo.bar/etc"},
{`postgres://user:pass@host.com:5432/path?k=v#f`, true},
{`postgres://user:pass@host.com:5432/path?k=v#f`, true},
{`zoommtg://zoom.us/join?confno=1234&pwd=xxx`, true},
{`zoomus://zoom.us/join?confno=1234&pwd=xxx`, true},
}
// TestRegexes exercises the Relaxed() and Strict() regexps through doTest.
// Both are first run over the shared constantTestCases table; each is then
// run over its own inline table ("Relaxed2" and "Strict2") holding the inputs
// whose expected result is specific to that regexp.
func TestRegexes(t *testing.T) {
doTest(t, "Relaxed", Relaxed(), constantTestCases)
doTest(t, "Strict", Strict(), constantTestCases)
doTest(t, "Relaxed2", Relaxed(), []testCase{
{`foo.a`, nil},
{`foo.com`, true},
{`foo.com bar.com`, `foo.com`},
{`foo.com-foo`, `foo.com`},
{`foo.company`, true},
{`foo.comrandom`, nil},
{`some.guy`, nil},
{`foo.example`, true},
{`foo.i2p`, true},
{`foo.local`, true},
{`foo.onion`, true},
{`中国.中国`, true},
{`中国.中国/foo中国`, true},
{`test.联通`, true},
{`test.联通 extra`, `test.联通`},
{`test.xn--8y0a063a`, true},
{`test.xn--8y0a063a/foobar`, true},
{`test.xn-foo`, nil},
{`test.xn--`, nil},
{`foo.com/`, true},
{`1.1.1.1`, true},
{`10.50.23.250`, true},
{`121.1.1.1`, true},
{`255.1.1.1`, true},
{`300.1.1.1`, nil},
{`1.1.1.300`, nil},
{`foo@1.2.3.4`, `1.2.3.4`},
// https://www.iana.org/assignments/iana-ipv6-special-registry/iana-ipv6-special-registry.xhtml
{`::1`, true},
//{`::`, true},
{`::ffff:0:0`, true},
{`64:ff9b::`, true},
{`64:ff9b:1::`, true},
{`100::`, true},
{`2001::`, true},
{`2001:1::1`, true},
{`2001:1::2`, true},
{`2001:2::`, true},
{`2001:3::`, true},
{`2001:4:112::`, true},
{`2001:10::`, true},
{`2001:20::`, true},
{`2001:db8::`, true},
{`2002::`, true},
{`2620:4f:8000::`, true},
{`fc00::`, true},
{`fe80::`, true},
// https://datatracker.ietf.org/doc/html/rfc4291#section-2.2
{`ABCD:EF01:2345:6789:ABCD:EF01:2345:6789`, true},
{`2001:DB8:0:0:8:800:200C:417A`, true},
{`2001:DB8:0:0:8:800:200C:417A`, true}, // a unicast address
{`FF01:0:0:0:0:0:0:101`, true}, // a multicast address
{`0:0:0:0:0:0:0:1`, true}, // the loopback address
{`0:0:0:0:0:0:0:0`, true}, // the unspecified address
{`2001:DB8::8:800:200C:417A`, true}, // a unicast address
{`FF01::101`, true}, // a multicast address
{`::1`, true}, // the loopback address
//{`::`, true}, // the unspecified address
{`::`, nil},
{`0:0:0:0:0:0:13.1.68.3`, true},
{`0:0:0:0:0:FFFF:129.144.52.38`, true},
{`::13.1.68.3`, true},
{`::FFFF:129.144.52.38`, true},
// https://datatracker.ietf.org/doc/html/rfc5952#section-1
{`2001:db8:0:0:1:0:0:1`, true},
{`2001:0db8:0:0:1:0:0:1`, true},
{`2001:db8::1:0:0:1`, true},
{`2001:db8::0:1:0:0:1`, true},
{`2001:0db8::1:0:0:1`, true},
{`2001:db8:0:0:1::1`, true},
{`2001:db8:0000:0:1::1`, true},
{`2001:DB8:0:0:1::1`, true},
// https://datatracker.ietf.org/doc/html/rfc5952#section-2.1
{`2001:db8:aaaa:bbbb:cccc:dddd:eeee:0001`, true},
{`2001:db8:aaaa:bbbb:cccc:dddd:eeee:001`, true},
{`2001:db8:aaaa:bbbb:cccc:dddd:eeee:01`, true},
{`2001:db8:aaaa:bbbb:cccc:dddd:eeee:1`, true},
// https://datatracker.ietf.org/doc/html/rfc5952#section-2.2
{`2001:db8:aaaa:bbbb:cccc:dddd::1`, true},
{`2001:db8:aaaa:bbbb:cccc:dddd:0:1`, true},
{`2001:db8:0:0:0::1`, true},
{`2001:db8:0:0::1`, true},
{`2001:db8:0::1`, true},
{`2001:db8::1`, true},
{`2001:db8::aaaa:0:0:1`, true},
{`2001:db8:0:0:aaaa::1`, true},
// https://datatracker.ietf.org/doc/html/rfc5952#section-2.3
{`2001:db8:aaaa:bbbb:cccc:dddd:eeee:aaaa`, true},
{`2001:db8:aaaa:bbbb:cccc:dddd:eeee:AAAA`, true},
{`2001:db8:aaaa:bbbb:cccc:dddd:eeee:AaAa`, true},
// An IP address in URI host position must be bracketed unless it is IPv4.
// https://www.rfc-editor.org/rfc/rfc3986#section-3.2.2
// TODO: Implement this restriction, ideally without matching the `http://1080` prefix.
//{`http://1080::8:800:200c:417a/path`, `1080::8:800:200c:417a`},
{`foo.com:8080`, true},
{`foo.com:8080/path`, true},
{`test.foo.com`, true},
{`test.foo.com/path`, true},
{`test.foo.com/path/more/`, true},
{`TEST.FOO.COM/PATH`, true},
{`TEST.FÓO.COM/PÁTH`, true},
{`foo.com/path_(more)`, true},
{`foo.com/path_(even)_(more)`, true},
{`foo.com/path_(more)/more`, true},
{`foo.com/path_(more)/end)`, `foo.com/path_(more)/end`},
{`www.foo.com`, true},
{` foo.com/bar `, `foo.com/bar`},
{` foo.com/bar more`, `foo.com/bar`},
{`<foo.com/bar>`, `foo.com/bar`},
{`<foo.com/bar>more`, `foo.com/bar`},
{`,foo.com/bar.`, `foo.com/bar`},
{`,foo.com/bar.more`, `foo.com/bar.more`},
{`,foo.com/bar,`, `foo.com/bar`},
{`,foo.com/bar,more`, `foo.com/bar,more`},
{`(foo.com/bar)`, `foo.com/bar`},
{`"foo.com/bar'`, `foo.com/bar`},
{`"foo.com/bar'more`, `foo.com/bar'more`},
{`"foo.com/bar"`, `foo.com/bar`},
{`what is foo.com?`, `foo.com`},
{`the foo.com!`, `foo.com`},
{`foo@bar`, nil},
{`foo@bar.a`, nil},
{`foo@bar.com`, true},
{`foo@sub.bar.com`, true},
{`foo@bar.com bar@bar.com`, `foo@bar.com`},
{`foo@bar.onion`, true},
{`foo@中国.中国`, true},
{`foo@test.bar.com`, true},
{`FOO@TEST.BAR.COM`, true},
{`foo@bar.com/path`, `foo@bar.com`},
{`foo+test@bar.com`, true},
{`foo+._%-@bar.com`, true},
})
doTest(t, "Strict2", Strict(), []testCase{
{`http:// foo.com`, nil},
{`foo.a`, nil},
{`foo.com`, nil},
{`foo.com/`, nil},
{`1.1.1.1`, nil},
{`3ffe:2a00:100:7031::1`, nil},
{`test.foo.com:8080/path`, nil},
{`foo@bar.com`, nil},
// An IP address in URI host position must be bracketed unless it is IPv4.
// https://www.rfc-editor.org/rfc/rfc3986#section-3.2.2
// TODO: Implement this restriction, ideally without matching the `http://1080` prefix.
//{`http://1080::8:800:200c:417a/path`, nil},
})
}
// TestStrictMatchingSchemeError checks which scheme expressions
// StrictMatchingScheme accepts and which it rejects with an error. Only the
// presence or absence of an error is verified; the returned regexp is ignored.
func TestStrictMatchingSchemeError(t *testing.T) {
	for _, c := range []struct {
		exp     string // scheme expression handed to StrictMatchingScheme
		wantErr bool   // whether an error is expected for exp
	}{
		{`http://`, false},
		{`https?://`, false},
		{`http://|mailto:`, false},
		{`http://(`, true},
	} {
		_, err := StrictMatchingScheme(c.exp)
		switch {
		case c.wantErr && err == nil:
			t.Errorf(`StrictMatchingScheme(%q) did not error as expected`, c.exp)
		case !c.wantErr && err != nil:
			// Include the error itself so a failure can be diagnosed from the log.
			t.Errorf(`StrictMatchingScheme(%q) unexpectedly errored: %v`, c.exp, err)
		}
	}
}
// TestStrictMatchingScheme builds a regexp from an explicit alternation of
// schemes and checks, through doTest, that inputs using one of those schemes
// match (regardless of letter case, as the `Http://` and `MAILTO:` cases show)
// while scheme-less inputs and other schemes do not.
func TestStrictMatchingScheme(t *testing.T) {
	strictMatching, err := StrictMatchingScheme("http://|ftps?://|mailto:")
	if err != nil {
		// Without a usable regexp none of the cases below can be evaluated,
		// so stop immediately and report the real cause.
		t.Fatalf("StrictMatchingScheme returned an unexpected error: %v", err)
	}
	doTest(t, "StrictMatchingScheme", strictMatching, []testCase{
		{`foo.com`, nil},
		{`foo@bar.com`, nil},
		{`http://foo`, true},
		{`Http://foo`, true},
		{`https://foo`, nil},
		{`ftp://foo`, true},
		{`ftps://foo`, true},
		{`mailto:foo`, true},
		{`MAILTO:foo`, true},
		{`sms:123`, nil},
	})
}
// TestStrictMatchingSchemeAny builds a regexp from the AnyScheme expression
// and checks, through doTest, that each listed input is matched in full,
// including a compound scheme (`git+https://`) and a scheme glued to
// preceding text (`randomtexthttp://`).
func TestStrictMatchingSchemeAny(t *testing.T) {
	strictMatching, err := StrictMatchingScheme(AnyScheme)
	if err != nil {
		// Without a usable regexp none of the cases below can be evaluated,
		// so stop immediately and report the real cause.
		t.Fatalf("StrictMatchingScheme(AnyScheme) returned an unexpected error: %v", err)
	}
	doTest(t, "StrictMatchingScheme", strictMatching, []testCase{
		{`http://foo`, true},
		{`git+https://foo`, true},
		{`randomtexthttp://foo.bar/etc`, true},
		{`mailto:foo`, true},
	})
}
// bench is the shared body of the benchmarks in this file. It enables
// allocation reporting, records len(str) as the number of bytes processed per
// iteration, and then uses b.RunParallel to repeatedly obtain a regexp from re
// and collect every match in str with FindAllString. The matches themselves
// are discarded.
func bench(b *testing.B, re func() *regexp.Regexp, str string) {
	bytesPerOp := int64(len(str))
	b.SetBytes(bytesPerOp)
	b.ReportAllocs()

	worker := func(pb *testing.PB) {
		for pb.Next() {
			rx := re()
			_ = rx.FindAllString(str, -1)
		}
	}
	b.RunParallel(worker)
}
// inputNone is the text used by the *_none benchmarks. As its last line
// states, it is plain text that is not meant to contain any URLs.
const inputNone = `
foo bar
yaml: "as well"
some more plaintext
which does not contain any urls.
`
// inputMany is the text used by the *_many benchmarks. It mixes plain words
// with several URL-like tokens (with and without a scheme, plus a bare
// `ftp://`).
const inputMany = `
foo bar http://foo.foo https://192.168.1.1/path
foo.com bitcoin:address ftp://
xmpp:foo@bar.com
`
// BenchmarkStrict_none runs bench with Strict as the regexp provider over
// inputNone.
func BenchmarkStrict_none(b *testing.B) {
bench(b, Strict, inputNone)
}
// BenchmarkStrict_many runs bench with Strict as the regexp provider over
// inputMany.
func BenchmarkStrict_many(b *testing.B) {
bench(b, Strict, inputMany)
}
// BenchmarkRelaxed_none runs bench with Relaxed as the regexp provider over
// inputNone.
func BenchmarkRelaxed_none(b *testing.B) {
bench(b, Relaxed, inputNone)
}
// BenchmarkRelaxed_many runs bench with Relaxed as the regexp provider over
// inputMany.
func BenchmarkRelaxed_many(b *testing.B) {
bench(b, Relaxed, inputMany)
}
// Lazily initialized state used by matchingScheme.
var (
// rxMatchingScheme caches the regexp built by matchingScheme.
rxMatchingScheme *regexp.Regexp
// rxMatchingSchemeOnce ensures rxMatchingScheme is built at most once.
rxMatchingSchemeOnce sync.Once
)
func matchingScheme() *regexp.Regexp {
rxMatchingSchemeOnce.Do(func() {
rx, err := StrictMatchingScheme("https?://")
if err != nil {
panic(err)
}
rxMatchingScheme = rx
})
return rxMatchingScheme
}
// BenchmarkStrictMatchingScheme_none runs bench with matchingScheme as the
// regexp provider over inputNone.
func BenchmarkStrictMatchingScheme_none(b *testing.B) {
bench(b, matchingScheme, inputNone)
}
// BenchmarkStrictMatchingScheme_many runs bench with matchingScheme as the
// regexp provider over inputMany.
func BenchmarkStrictMatchingScheme_many(b *testing.B) {
bench(b, matchingScheme, inputMany)
}