diff --git a/go.mod b/go.mod index 914e28d3..bd6e7bce 100644 --- a/go.mod +++ b/go.mod @@ -6,7 +6,6 @@ require ( github.com/ImVexed/fasturl v0.0.0-20230304231329-4e41488060f3 github.com/MatusOllah/slogcolor v1.6.0 github.com/PuerkitoBio/goquery v1.10.3 - github.com/ada-url/goada v0.0.0-20250104020233-00cbf4dc9da1 github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc github.com/dustin/go-humanize v1.0.1 github.com/fatih/color v1.18.0 @@ -19,6 +18,7 @@ require ( github.com/internetarchive/gocrawlhq v1.2.31 github.com/internetarchive/gowarc v0.8.82 github.com/ncruces/go-sqlite3 v0.26.1 + github.com/nlnwa/whatwg-url v0.6.2 github.com/pdfcpu/pdfcpu v0.11.0 github.com/philippgille/gokv/leveldb v0.7.0 github.com/prometheus/client_golang v1.22.0 @@ -38,6 +38,7 @@ require ( github.com/andybalholm/cascadia v1.3.3 // indirect github.com/armon/go-metrics v0.4.1 // indirect github.com/beorn7/perks v1.0.1 // indirect + github.com/bits-and-blooms/bitset v1.20.0 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/cloudflare/circl v1.6.1 // indirect github.com/dolthub/maphash v0.1.0 // indirect diff --git a/go.sum b/go.sum index 8b733355..6fd486df 100644 --- a/go.sum +++ b/go.sum @@ -5,8 +5,6 @@ github.com/MatusOllah/slogcolor v1.6.0 h1:JAKer0xj5l1jYTXyQvs5ggqmJqYDuLnxgR9jfM github.com/MatusOllah/slogcolor v1.6.0/go.mod h1:5y1H50XuQIBvuYTJlmokWi+4FuPiJN5L7Z0jM4K4bYA= github.com/PuerkitoBio/goquery v1.10.3 h1:pFYcNSqHxBD06Fpj/KsbStFRsgRATgnf3LeXiUkhzPo= github.com/PuerkitoBio/goquery v1.10.3/go.mod h1:tMUX0zDMHXYlAQk6p35XxQMqMweEKB7iK7iLNd4RH4Y= -github.com/ada-url/goada v0.0.0-20250104020233-00cbf4dc9da1 h1:K54lYH7ZY/NHweMd9/R82dHaFelQQmwjEhUfwUqCqEk= -github.com/ada-url/goada v0.0.0-20250104020233-00cbf4dc9da1/go.mod h1:+D/veNwI2mA1hDYLVrYSobYcLFWm6e3DJ/H/d/dxlu8= github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= @@ -26,6 +24,8 @@ github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+Ce github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/bgentry/speakeasy v0.1.0/go.mod h1:+zsyZBPWlz7T6j88CTgSN5bM796AkVf0kBD4zp0CCIs= +github.com/bits-and-blooms/bitset v1.20.0 h1:2F+rfL86jE2d/bmw7OhqUg2Sj/1rURkBn3MdfoPyRVU= +github.com/bits-and-blooms/bitset v1.20.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= @@ -217,6 +217,8 @@ github.com/ncruces/go-sqlite3 v0.26.1 h1:lBXmbmucH1Bsj57NUQR6T84UoMN7jnNImhF+ibE github.com/ncruces/go-sqlite3 v0.26.1/go.mod h1:XFTPtFIo1DmGCh+XVP8KGn9b/o2f+z0WZuT09x2N6eo= github.com/ncruces/julianday v1.0.0 h1:fH0OKwa7NWvniGQtxdJRxAgkBMolni2BjDHaWTxqt7M= github.com/ncruces/julianday v1.0.0/go.mod h1:Dusn2KvZrrovOMJuOt0TNXL6tB7U2E8kvza5fFc9G7g= +github.com/nlnwa/whatwg-url v0.6.2 h1:jU61lU2ig4LANydbEJmA2nPrtCGiKdtgT0rmMd2VZ/Q= +github.com/nlnwa/whatwg-url v0.6.2/go.mod h1:x0FPXJzzOEieQtsBT/AKvbiBbQ46YlL6Xa7m02M1ECk= github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= github.com/onsi/ginkgo v1.7.0 h1:WSHQ+IS43OoUrWtD1/bbclrwK8TTH5hzp+umCiuxHgs= github.com/onsi/ginkgo v1.7.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= @@ -343,6 +345,7 @@ golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliY golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU= golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8= golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk= +golang.org/x/crypto v0.32.0/go.mod h1:ZnnJkOaASj8g0AjIduWNlq2NRxL0PlBrbKVyZ6V/Ugc= golang.org/x/crypto v0.39.0 h1:SHs+kF4LP+f+p14esP5jAoDpHU8Gu/v9lFRK6IT5imM= golang.org/x/crypto v0.39.0/go.mod h1:L+Xg3Wf6HoL4Bn4238Z6ft6KfEpN0tJGo53AAPC632U= golang.org/x/exp v0.0.0-20250305212735-054e65f0b394 h1:nDVHiLt8aIbd/VzvPWN6kSOPE7+F/fNFDSXLVYkE/Iw= @@ -371,6 +374,7 @@ golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk= golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4= +golang.org/x/net v0.34.0/go.mod h1:di0qlW3YNM5oh6GqDGQr92MyTozJPmybPK4Ev/Gm31k= golang.org/x/net v0.41.0 h1:vBTly1HeNPEn3wtREYfy4GZ/NECgw2Cnl+nK6Nz3uvw= golang.org/x/net v0.41.0/go.mod h1:B/K4NNqkfmg07DQYrbwvSluqCJOOXwUjeb/5lOisjbA= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= diff --git a/internal/pkg/preprocessor/url.go b/internal/pkg/preprocessor/url.go index 78a6089b..86b156c4 100644 --- a/internal/pkg/preprocessor/url.go +++ b/internal/pkg/preprocessor/url.go @@ -1,11 +1,18 @@ package preprocessor import ( + "fmt" "net/url" "strings" - "github.com/ada-url/goada" "github.com/internetarchive/Zeno/pkg/models" + wu "github.com/nlnwa/whatwg-url/url" +) + +const ( + httpPrefix = "http://" + httpsPrefix = "https://" + ftpPrefix = "ftp://" ) // Normalize the URL by removing fragments, attempting to add URL scheme if missing, @@ -15,48 +22,57 @@ func NormalizeURL(URL *models.URL, parentURL *models.URL) (err error) { // Clean the URL by removing leading and trailing quotes URL.Raw = strings.Trim(URL.Raw, `"'`) - var adaParse *goada.Url - - parsedURL, err := url.Parse(URL.Raw) - if err != nil { - return err - } + var wuParse *wu.Url - if parentURL != nil && !parsedURL.IsAbs() { - // Determine the base with the following logic: - // - always with the tag found in the HTML document, if it exists (TBI) - // - if the URL starts with a slash, use the parent URL's scheme and host - // - if the URL does not start with a slash, use the parent URL's scheme, host, and path - baseURL := parentURL.GetParsed() - if strings.HasPrefix(parsedURL.Path, "/") { - adaParse, err = goada.NewWithBase(URL.Raw, baseURL.Scheme+"://"+baseURL.Host) - if err != nil { - return err + if parentURL == nil { + wuParse, err = wu.Parse(URL.Raw) + if err != nil { + lowerURL := strings.ToLower(URL.Raw) + if !strings.HasPrefix(lowerURL, httpPrefix) && + !strings.HasPrefix(lowerURL, httpsPrefix) && + !strings.HasPrefix(lowerURL, ftpPrefix) && + !strings.Contains(lowerURL, "://") { + URL.Raw = httpPrefix + URL.Raw } - } else { - adaParse, err = goada.NewWithBase(URL.Raw, baseURL.String()) + wuParse, err = wu.Parse(URL.Raw) if err != nil { return err } } } else { - if parsedURL.Scheme == "" { - parsedURL.Scheme = "http" - } - - adaParse, err = goada.New(models.URLToString(parsedURL)) + parsedURL, err := url.Parse(URL.Raw) if err != nil { return err } + + if parsedURL.IsAbs() { + wuParse, err = wu.Parse(URL.Raw) + if err != nil { + return err + } + } else { + baseURL := parentURL.GetParsed() + if baseURL == nil { + return fmt.Errorf("invalid baseURL in parentURL: %s", parentURL.Raw) + } + + resolved := baseURL.ResolveReference(parsedURL) + wuParse, err = wu.Parse(resolved.String()) + if err != nil { + return err + } + } } - adaParse.SetHash("") - if scheme := adaParse.Protocol(); scheme != "http:" && scheme != "https:" { + wuParse.SetHash("") + + scheme := strings.ToLower(wuParse.Protocol()) + if scheme != "http:" && scheme != "https:" { return ErrUnsupportedScheme } // Check for localhost and 127.0.0.1 - host := adaParse.Hostname() + host := wuParse.Hostname() if host == "localhost" || host == "127.0.0.1" { return ErrUnsupportedHost } @@ -66,8 +82,8 @@ func NormalizeURL(URL *models.URL, parentURL *models.URL) (err error) { return ErrUnsupportedHost } - URL.Raw = adaParse.Href() - adaParse.Free() + // Update the URL with the normalized version + URL.Raw = wuParse.Href(false) return URL.Parse() }