Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ require (
github.com/ImVexed/fasturl v0.0.0-20230304231329-4e41488060f3
github.com/MatusOllah/slogcolor v1.6.0
github.com/PuerkitoBio/goquery v1.10.3
github.com/ada-url/goada v0.0.0-20250104020233-00cbf4dc9da1
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc
github.com/dustin/go-humanize v1.0.1
github.com/fatih/color v1.18.0
Expand All @@ -19,6 +18,7 @@ require (
github.com/internetarchive/gocrawlhq v1.2.31
github.com/internetarchive/gowarc v0.8.82
github.com/ncruces/go-sqlite3 v0.26.1
github.com/nlnwa/whatwg-url v0.6.2
github.com/pdfcpu/pdfcpu v0.11.0
github.com/philippgille/gokv/leveldb v0.7.0
github.com/prometheus/client_golang v1.22.0
Expand All @@ -38,6 +38,7 @@ require (
github.com/andybalholm/cascadia v1.3.3 // indirect
github.com/armon/go-metrics v0.4.1 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/bits-and-blooms/bitset v1.20.0 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/cloudflare/circl v1.6.1 // indirect
github.com/dolthub/maphash v0.1.0 // indirect
Expand Down
8 changes: 6 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@ github.com/MatusOllah/slogcolor v1.6.0 h1:JAKer0xj5l1jYTXyQvs5ggqmJqYDuLnxgR9jfM
github.com/MatusOllah/slogcolor v1.6.0/go.mod h1:5y1H50XuQIBvuYTJlmokWi+4FuPiJN5L7Z0jM4K4bYA=
github.com/PuerkitoBio/goquery v1.10.3 h1:pFYcNSqHxBD06Fpj/KsbStFRsgRATgnf3LeXiUkhzPo=
github.com/PuerkitoBio/goquery v1.10.3/go.mod h1:tMUX0zDMHXYlAQk6p35XxQMqMweEKB7iK7iLNd4RH4Y=
github.com/ada-url/goada v0.0.0-20250104020233-00cbf4dc9da1 h1:K54lYH7ZY/NHweMd9/R82dHaFelQQmwjEhUfwUqCqEk=
github.com/ada-url/goada v0.0.0-20250104020233-00cbf4dc9da1/go.mod h1:+D/veNwI2mA1hDYLVrYSobYcLFWm6e3DJ/H/d/dxlu8=
github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc=
github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc=
github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0=
Expand All @@ -26,6 +24,8 @@ github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+Ce
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
github.com/bgentry/speakeasy v0.1.0/go.mod h1:+zsyZBPWlz7T6j88CTgSN5bM796AkVf0kBD4zp0CCIs=
github.com/bits-and-blooms/bitset v1.20.0 h1:2F+rfL86jE2d/bmw7OhqUg2Sj/1rURkBn3MdfoPyRVU=
github.com/bits-and-blooms/bitset v1.20.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
Expand Down Expand Up @@ -217,6 +217,8 @@ github.com/ncruces/go-sqlite3 v0.26.1 h1:lBXmbmucH1Bsj57NUQR6T84UoMN7jnNImhF+ibE
github.com/ncruces/go-sqlite3 v0.26.1/go.mod h1:XFTPtFIo1DmGCh+XVP8KGn9b/o2f+z0WZuT09x2N6eo=
github.com/ncruces/julianday v1.0.0 h1:fH0OKwa7NWvniGQtxdJRxAgkBMolni2BjDHaWTxqt7M=
github.com/ncruces/julianday v1.0.0/go.mod h1:Dusn2KvZrrovOMJuOt0TNXL6tB7U2E8kvza5fFc9G7g=
github.com/nlnwa/whatwg-url v0.6.2 h1:jU61lU2ig4LANydbEJmA2nPrtCGiKdtgT0rmMd2VZ/Q=
github.com/nlnwa/whatwg-url v0.6.2/go.mod h1:x0FPXJzzOEieQtsBT/AKvbiBbQ46YlL6Xa7m02M1ECk=
github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE=
github.com/onsi/ginkgo v1.7.0 h1:WSHQ+IS43OoUrWtD1/bbclrwK8TTH5hzp+umCiuxHgs=
github.com/onsi/ginkgo v1.7.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE=
Expand Down Expand Up @@ -343,6 +345,7 @@ golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliY
golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU=
golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8=
golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk=
golang.org/x/crypto v0.32.0/go.mod h1:ZnnJkOaASj8g0AjIduWNlq2NRxL0PlBrbKVyZ6V/Ugc=
golang.org/x/crypto v0.39.0 h1:SHs+kF4LP+f+p14esP5jAoDpHU8Gu/v9lFRK6IT5imM=
golang.org/x/crypto v0.39.0/go.mod h1:L+Xg3Wf6HoL4Bn4238Z6ft6KfEpN0tJGo53AAPC632U=
golang.org/x/exp v0.0.0-20250305212735-054e65f0b394 h1:nDVHiLt8aIbd/VzvPWN6kSOPE7+F/fNFDSXLVYkE/Iw=
Expand Down Expand Up @@ -371,6 +374,7 @@ golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk=
golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44=
golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM=
golang.org/x/net v0.33.0/go.mod h1:HXLR5J+9DxmrqMwG9qjGCxZ+zKXxBru04zlTvWlWuN4=
golang.org/x/net v0.34.0/go.mod h1:di0qlW3YNM5oh6GqDGQr92MyTozJPmybPK4Ev/Gm31k=
golang.org/x/net v0.41.0 h1:vBTly1HeNPEn3wtREYfy4GZ/NECgw2Cnl+nK6Nz3uvw=
golang.org/x/net v0.41.0/go.mod h1:B/K4NNqkfmg07DQYrbwvSluqCJOOXwUjeb/5lOisjbA=
golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
Expand Down
74 changes: 45 additions & 29 deletions internal/pkg/preprocessor/url.go
Original file line number Diff line number Diff line change
@@ -1,11 +1,18 @@
package preprocessor

import (
"fmt"
"net/url"
"strings"

"github.com/ada-url/goada"
"github.com/internetarchive/Zeno/pkg/models"
wu "github.com/nlnwa/whatwg-url/url"
)

// Scheme prefixes that NormalizeURL looks for at the start of the lowercased
// raw URL before deciding whether a default scheme has to be added.
const (
// httpPrefix is also the default: it is prepended to a raw URL that starts
// with none of these prefixes and does not contain "://".
httpPrefix = "http://"
httpsPrefix = "https://"
ftpPrefix = "ftp://"
)

// Normalize the URL by removing fragments, attempting to add URL scheme if missing,
Expand All @@ -15,48 +22,57 @@ func NormalizeURL(URL *models.URL, parentURL *models.URL) (err error) {
// Clean the URL by removing leading and trailing quotes
URL.Raw = strings.Trim(URL.Raw, `"'`)

var adaParse *goada.Url

parsedURL, err := url.Parse(URL.Raw)
if err != nil {
return err
}
var wuParse *wu.Url

if parentURL != nil && !parsedURL.IsAbs() {
// Determine the base with the following logic:
// - always with the <base> tag found in the HTML document, if it exists (TBI)
// - if the URL starts with a slash, use the parent URL's scheme and host
// - if the URL does not start with a slash, use the parent URL's scheme, host, and path
baseURL := parentURL.GetParsed()
if strings.HasPrefix(parsedURL.Path, "/") {
adaParse, err = goada.NewWithBase(URL.Raw, baseURL.Scheme+"://"+baseURL.Host)
if err != nil {
return err
if parentURL == nil {
wuParse, err = wu.Parse(URL.Raw)
if err != nil {
lowerURL := strings.ToLower(URL.Raw)
if !strings.HasPrefix(lowerURL, httpPrefix) &&
!strings.HasPrefix(lowerURL, httpsPrefix) &&
!strings.HasPrefix(lowerURL, ftpPrefix) &&
!strings.Contains(lowerURL, "://") {
URL.Raw = httpPrefix + URL.Raw
}
} else {
adaParse, err = goada.NewWithBase(URL.Raw, baseURL.String())
wuParse, err = wu.Parse(URL.Raw)
if err != nil {
return err
}
}
} else {
if parsedURL.Scheme == "" {
parsedURL.Scheme = "http"
}

adaParse, err = goada.New(models.URLToString(parsedURL))
parsedURL, err := url.Parse(URL.Raw)
if err != nil {
return err
}

if parsedURL.IsAbs() {
wuParse, err = wu.Parse(URL.Raw)
if err != nil {
return err
}
} else {
baseURL := parentURL.GetParsed()
if baseURL == nil {
return fmt.Errorf("invalid baseURL in parentURL: %s", parentURL.Raw)
}

resolved := baseURL.ResolveReference(parsedURL)
wuParse, err = wu.Parse(resolved.String())
if err != nil {
return err
}
}
}

adaParse.SetHash("")
if scheme := adaParse.Protocol(); scheme != "http:" && scheme != "https:" {
wuParse.SetHash("")

scheme := strings.ToLower(wuParse.Protocol())
if scheme != "http:" && scheme != "https:" {
return ErrUnsupportedScheme
}

// Check for localhost and 127.0.0.1
host := adaParse.Hostname()
host := wuParse.Hostname()
if host == "localhost" || host == "127.0.0.1" {
return ErrUnsupportedHost
}
Expand All @@ -66,8 +82,8 @@ func NormalizeURL(URL *models.URL, parentURL *models.URL) (err error) {
return ErrUnsupportedHost
}

URL.Raw = adaParse.Href()
adaParse.Free()
// Update the URL with the normalized version
URL.Raw = wuParse.Href(false)

return URL.Parse()
}