From 2ea576f175325f584b871b52f418facb94b5c248 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Do=C4=9Fan=20Can=20Bak=C4=B1r?= Date: Tue, 6 Jan 2026 12:53:09 +0300 Subject: [PATCH 1/3] feat: add database output support for storing scan results Add support for storing httpx scan results directly to databases with both CLI flags and YAML config file options. Supported databases: - MongoDB - PostgreSQL - MySQL Features: - Batched writes for performance (configurable batch size) - Auto-flush with configurable interval - Individual columns for each Result field (not JSON blob) - Support for environment variable HTTPX_DB_CONNECTION_STRING - Option to omit raw request/response data (-rdbor) New CLI flags under OUTPUT group: - -rdb, -result-db: enable database storage - -rdbc, -result-db-config: path to YAML config file - -rdbt, -result-db-type: database type (mongodb, postgres, mysql) - -rdbcs, -result-db-conn: connection string - -rdbn, -result-db-name: database name (default: httpx) - -rdbtb, -result-db-table: table/collection name (default: results) - -rdbbs, -result-db-batch-size: batch size (default: 100) - -rdbor, -result-db-omit-raw: omit raw data Closes #1973 Closes #2360 Closes #2361 Closes #2362 --- README.md | 98 +++++++-------- cmd/httpx/httpx.go | 63 ++++++++++ go.mod | 10 +- go.sum | 17 +++ internal/db/config.go | 125 ++++++++++++++++++++ internal/db/db.go | 70 +++++++++++ internal/db/mongodb.go | 127 ++++++++++++++++++++ internal/db/mysql.go | 256 ++++++++++++++++++++++++++++++++++++++++ internal/db/postgres.go | 256 ++++++++++++++++++++++++++++++++++++++++ internal/db/writer.go | 148 +++++++++++++++++++++++ runner/options.go | 17 +++ 11 files changed, 1140 insertions(+), 47 deletions(-) create mode 100644 internal/db/config.go create mode 100644 internal/db/db.go create mode 100644 internal/db/mongodb.go create mode 100644 internal/db/mysql.go create mode 100644 internal/db/postgres.go create mode 100644 internal/db/writer.go diff --git a/README.md b/README.md index 283e6691..af2dd799 100644 --- a/README.md +++ b/README.md @@ -97,42 +97,40 @@ INPUT: -u, -target string[] input target host(s) to probe PROBES: - -sc, -status-code display response status-code - -cl, -content-length display response content-length - -ct, -content-type display response content-type - -location display response redirect location - -favicon display mmh3 hash for '/favicon.ico' file - -hash string display response body hash (supported: md5,mmh3,simhash,sha1,sha256,sha512) - -jarm display jarm fingerprint hash - -rt, -response-time display response time - -lc, -line-count display response body line count - -wc, -word-count display response body word count - -title display page title - -bp, -body-preview display first N characters of response body (default 100) - -server, -web-server display server name + -sc, -status-code display response status-code + -cl, -content-length display response content-length + -ct, -content-type display response content-type + -location display response redirect location + -favicon display mmh3 hash for '/favicon.ico' file + -hash string display response body hash (supported: md5,mmh3,simhash,sha1,sha256,sha512) + -jarm display jarm fingerprint hash + -rt, -response-time display response time + -lc, -line-count display response body line count + -wc, -word-count display response body word count + -title display page title + -bp, -body-preview display first N characters of response body (default 100) + -server, -web-server display server name -td, -tech-detect display technology in use 
based on wappalyzer dataset -cff, -custom-fingerprint-file string path to a custom fingerprint file for technology detection - -cpe display CPE (Common Platform Enumeration) based on awesome-search-queries - -wp, -wordpress display WordPress plugins and themes -method display http request method - -ws, -websocket display server using websocket - -ip display host ip - -cname display host cname - -extract-fqdn, -efqdn get domain and subdomains from response body and header in jsonl/csv output - -asn display host asn information - -cdn display cdn/waf in use (default true) - -probe display probe status + -ws, -websocket display server using websocket + -ip display host ip + -cname display host cname + -extract-fqdn, -efqdn get domain and subdomains from response body and header in jsonl/csv output + -asn display host asn information + -cdn display cdn/waf in use (default true) + -probe display probe status HEADLESS: -ss, -screenshot enable saving screenshot of the page using headless browser -system-chrome enable using local installed chrome for screenshot -ho, -headless-options string[] start headless chrome with additional options -esb, -exclude-screenshot-bytes enable excluding screenshot bytes from json output - -no-screenshot-full-page disable saving full page screenshot -ehb, -exclude-headless-body enable excluding headless header from json output + -no-screenshot-full-page disable saving full page screenshot -st, -screenshot-timeout value set timeout for screenshot in seconds (default 10s) -sid, -screenshot-idle value set idle time before taking screenshot in seconds (default 1s) - -jsc, -javascript-code string[] execute JavaScript code after navigation + -jsc, -javascript-code string[] execute JavaScript code after navigation MATCHERS: -mc, -match-code string match response with specified status code (-mc 200,302) @@ -142,7 +140,7 @@ MATCHERS: -mfc, -match-favicon string[] match response with specified favicon hash (-mfc 1494302000) -ms, -match-string string[] match response with specified string (-ms admin) -mr, -match-regex string[] match response with specified regex (-mr admin) - -mcdn, -match-cdn string[] match host with specified cdn provider (cloudfront, fastly, google) + -mcdn, -match-cdn string[] match host with specified cdn provider (cloudfront, fastly, google, etc.) 
-mrt, -match-response-time string match response with specified response time in seconds (-mrt '< 1') -mdc, -match-condition string match response with dsl expression condition @@ -151,19 +149,21 @@ EXTRACTOR: -ep, -extract-preset string[] display response content matched by a pre-defined regex (url,ipv4,mail) FILTERS: - -fc, -filter-code string filter response with specified status code (-fc 403,401) - -fep, -filter-error-page filter response with ML based error page detection - -fd, -filter-duplicates filter out near-duplicate responses (only first response is retained) - -fl, -filter-length string filter response with specified content length (-fl 23,33) - -flc, -filter-line-count string filter response body with specified line count (-flc 423,532) - -fwc, -filter-word-count string filter response body with specified word count (-fwc 423,532) - -ffc, -filter-favicon string[] filter response with specified favicon hash (-ffc 1494302000) - -fs, -filter-string string[] filter response with specified string (-fs admin) - -fe, -filter-regex string[] filter response with specified regex (-fe admin) - -fcdn, -filter-cdn string[] filter host with specified cdn provider (cloudfront, fastly, google) - -frt, -filter-response-time string filter response with specified response time in seconds (-frt '> 1') - -fdc, -filter-condition string filter response with dsl expression condition - -strip strips all tags in response. supported formats: html,xml (default html) + -fc, -filter-code string filter response with specified status code (-fc 403,401) + -fep, -filter-error-page filter response with ML based error page detection + -fd, -filter-duplicates filter out near-duplicate responses (only first response is retained) + -fl, -filter-length string filter response with specified content length (-fl 23,33) + -flc, -filter-line-count string filter response body with specified line count (-flc 423,532) + -fwc, -filter-word-count string filter response body with specified word count (-fwc 423,532) + -ffc, -filter-favicon string[] filter response with specified favicon hash (-ffc 1494302000) + -fs, -filter-string string[] filter response with specified string (-fs admin) + -fe, -filter-regex string[] filter response with specified regex (-fe admin) + -fcdn, -filter-cdn string[] filter host with specified cdn provider (cloudfront, fastly, google, etc.) + -frt, -filter-response-time string filter response with specified response time in seconds (-frt '> 1') + -fdc, -filter-condition string filter response with dsl expression condition + -strip strips all tags in response. 
supported formats: html,xml (default html) + -lof, -list-output-fields list of fields to output (comma separated) + -eof, -exclude-output-fields string[] exclude output fields output based on a condition RATE-LIMIT: -t, -threads int number of threads to use (default 50) @@ -201,10 +201,16 @@ OUTPUT: -include-chain include redirect http chain in JSON output (-json only) -store-chain include http redirect chain in responses (-sr only) -svrc, -store-vision-recon-cluster include visual recon clusters (-ss and -sr only) - -pr, -protocol string protocol to use (unknown, http11) + -pr, -protocol string protocol to use (unknown, http11, http2, http3) -fepp, -filter-error-page-path string path to store filtered error pages (default "filtered_error_page.json") - -lof, -list-output-fields list available output field names for filtering - -eof, -exclude-output-fields string[] exclude specified output fields from results + -rdb, -result-db store results in database + -rdbc, -result-db-config string path to database config file + -rdbt, -result-db-type string database type (mongodb, postgres, mysql) + -rdbcs, -result-db-conn string database connection string (env: HTTPX_DB_CONNECTION_STRING) + -rdbn, -result-db-name string database name (default "httpx") + -rdbtb, -result-db-table string table/collection name (default "results") + -rdbbs, -result-db-batch-size int batch size for database inserts (default 100) + -rdbor, -result-db-omit-raw omit raw request/response data from database CONFIGURATIONS: -config string path to the httpx configuration file (default $HOME/.config/httpx/config.yaml) @@ -213,9 +219,9 @@ CONFIGURATIONS: -deny string[] denied list of IP/CIDR's to process (file or comma separated) -sni, -sni-name string custom TLS SNI name -random-agent enable Random User-Agent to use (default true) - -auto-referer set the Referer header to the current URL (default false) + -auto-referer set the Referer header to the current URL -H, -header string[] custom http headers to send with request - -http-proxy, -proxy string http proxy to use (eg http://127.0.0.1:8080) + -http-proxy, -proxy string proxy (http|socks) to use (eg http://127.0.0.1:8080) -unsafe send raw requests skipping golang normalization -resume resume scan using resume.cfg -fr, -follow-redirects follow http redirects @@ -250,14 +256,14 @@ DEBUG: OPTIMIZATIONS: -nf, -no-fallback display both probed protocol (HTTPS and HTTP) - -nfs, -no-fallback-scheme probe with protocol scheme specified in input + -nfs, -no-fallback-scheme probe with protocol scheme specified in input -maxhr, -max-host-error int max error count per host before skipping remaining path/s (default 30) -e, -exclude string[] exclude host matching specified filter ('cdn', 'private-ips', cidr, ip, regex) -retries int number of retries -timeout int timeout in seconds (default 10) -delay value duration between each http request (eg: 200ms, 1s) (default -1ns) - -rsts, -response-size-to-save int max response size to save in bytes (default 2147483647) - -rstr, -response-size-to-read int max response size to read in bytes (default 2147483647) + -rsts, -response-size-to-save int max response size to save in bytes (default 512000000) + -rstr, -response-size-to-read int max response size to read in bytes (default 512000000) CLOUD: -auth configure projectdiscovery cloud (pdcp) api key (default true) diff --git a/cmd/httpx/httpx.go b/cmd/httpx/httpx.go index 77e54f4c..f7b26810 100644 --- a/cmd/httpx/httpx.go +++ b/cmd/httpx/httpx.go @@ -10,6 +10,7 @@ import ( 
"github.com/logrusorgru/aurora" "github.com/projectdiscovery/gologger" + "github.com/projectdiscovery/httpx/internal/db" "github.com/projectdiscovery/httpx/internal/pdcp" "github.com/projectdiscovery/httpx/runner" pdcpauth "github.com/projectdiscovery/utils/auth/pdcp" @@ -64,6 +65,9 @@ func main() { // setup optional asset upload _ = setupOptionalAssetUpload(options) + // setup optional database output + _ = setupDatabaseOutput(options) + httpxRunner, err := runner.New(options) if err != nil { gologger.Fatal().Msgf("Could not create runner: %s\n", err) @@ -143,3 +147,62 @@ func setupOptionalAssetUpload(opts *runner.Options) *pdcp.UploadWriter { } return writer } + +// setupDatabaseOutput sets up database output for storing results +// This is optional and only initialized when explicitly enabled via -rdb flag +func setupDatabaseOutput(opts *runner.Options) *db.Writer { + if !opts.ResultDatabase { + return nil + } + + var cfg *db.Config + var err error + + if opts.ResultDatabaseConfig != "" { + // Load configuration from file + cfg, err = db.LoadConfigFromFile(opts.ResultDatabaseConfig) + if err != nil { + gologger.Fatal().Msgf("Could not load database config: %s\n", err) + } + } else { + // Build configuration from CLI options + dbOpts := &db.Options{ + Enabled: opts.ResultDatabase, + Type: opts.ResultDatabaseType, + ConnectionString: opts.ResultDatabaseConnStr, + DatabaseName: opts.ResultDatabaseName, + TableName: opts.ResultDatabaseTable, + BatchSize: opts.ResultDatabaseBatchSize, + OmitRaw: opts.ResultDatabaseOmitRaw, + } + cfg, err = dbOpts.ToConfig() + if err != nil { + gologger.Fatal().Msgf("Invalid database configuration: %s\n", err) + } + } + + writer, err := db.NewWriter(context.Background(), cfg) + if err != nil { + gologger.Fatal().Msgf("Could not setup database output: %s\n", err) + } + + // Chain with existing OnResult callback if present + existingCallback := opts.OnResult + opts.OnResult = func(r runner.Result) { + if existingCallback != nil { + existingCallback(r) + } + writer.GetWriterCallback()(r) + } + + // Chain with existing OnClose callback if present + existingClose := opts.OnClose + opts.OnClose = func() { + writer.Close() + if existingClose != nil { + existingClose() + } + } + + return writer +} diff --git a/go.mod b/go.mod index 1ac2fdd6..97ebc341 100644 --- a/go.mod +++ b/go.mod @@ -52,13 +52,18 @@ require ( require ( github.com/JohannesKaufmann/html-to-markdown/v2 v2.5.0 github.com/dustin/go-humanize v1.0.1 + github.com/go-sql-driver/mysql v1.9.3 github.com/go-viper/mapstructure/v2 v2.4.0 github.com/gocarina/gocsv v0.0.0-20240520201108-78e41c74b4b1 + github.com/lib/pq v1.10.9 github.com/weppos/publicsuffix-go v0.50.2 + go.mongodb.org/mongo-driver v1.17.6 + gopkg.in/yaml.v3 v3.0.1 ) require ( aead.dev/minisign v0.2.0 // indirect + filippo.io/edwards25519 v1.1.0 // indirect github.com/JohannesKaufmann/dom v0.2.0 // indirect github.com/Knetic/govaluate v3.0.1-0.20171022003610-9aa49832a739+incompatible // indirect github.com/Masterminds/semver/v3 v3.2.1 // indirect @@ -121,6 +126,7 @@ require ( github.com/minio/selfupdate v0.6.1-0.20230907112617-f11e74f84ca7 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect + github.com/montanaflynn/stats v0.7.1 // indirect github.com/muesli/reflow v0.3.0 // indirect github.com/muesli/termenv v0.16.0 // indirect github.com/nfnt/resize v0.0.0-20180221191011-83c6a9932646 // indirect @@ -157,6 +163,9 @@ require ( github.com/ulikunitz/xz v0.5.15 // 
indirect github.com/vulncheck-oss/go-exploit v1.51.0 // indirect github.com/xdg-go/pbkdf2 v1.0.0 // indirect + github.com/xdg-go/scram v1.1.2 // indirect + github.com/xdg-go/stringprep v1.0.4 // indirect + github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78 // indirect github.com/ysmood/fetchup v0.2.3 // indirect github.com/ysmood/goob v0.4.0 // indirect github.com/ysmood/got v0.40.0 // indirect @@ -176,5 +185,4 @@ require ( golang.org/x/time v0.11.0 // indirect golang.org/x/tools v0.39.0 // indirect gopkg.in/ini.v1 v1.67.0 // indirect - gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/go.sum b/go.sum index 0188b44c..163c35d7 100644 --- a/go.sum +++ b/go.sum @@ -18,6 +18,8 @@ cloud.google.com/go/pubsub v1.1.0/go.mod h1:EwwdRX2sKPjnvnqCa270oGRyludottCI76h+ cloud.google.com/go/storage v1.0.0/go.mod h1:IhtSnM/ZTZV8YYJWCY8RULGVqBDmpoyjwiyrjsg+URw= cloud.google.com/go/storage v1.5.0/go.mod h1:tpKbwo567HUNpVclU5sGELwQWBDZ8gh0ZeosJ0Rtdos= dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU= +filippo.io/edwards25519 v1.1.0 h1:FNf4tywRC1HmFuKW5xopWpigGjJKiJSV0Cqo0cJWDaA= +filippo.io/edwards25519 v1.1.0/go.mod h1:BxyFTGdWcka3PhytdK4V28tE5sGfRvvvRV7EaN4VDT4= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= github.com/JohannesKaufmann/dom v0.2.0 h1:1bragmEb19K8lHAqgFgqCpiPCFEZMTXzOIEjuxkUfLQ= @@ -141,6 +143,8 @@ github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY= github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0= github.com/go-rod/rod v0.116.2 h1:A5t2Ky2A+5eD/ZJQr1EfsQSe5rms5Xof/qj296e+ZqA= github.com/go-rod/rod v0.116.2/go.mod h1:H+CMO9SCNc2TJ2WfrG+pKhITz57uGNYU43qYHh438Mg= +github.com/go-sql-driver/mysql v1.9.3 h1:U/N249h2WzJ3Ukj8SowVFjdtZKfu9vlLZxjPXV1aweo= +github.com/go-sql-driver/mysql v1.9.3/go.mod h1:qn46aNg1333BRMNU69Lq93t8du/dwxI64Gl8i5p1WMU= github.com/go-viper/mapstructure/v2 v2.4.0 h1:EBsztssimR/CONLSZZ04E8qAkxNYq4Qp9LvH92wZUgs= github.com/go-viper/mapstructure/v2 v2.4.0/go.mod h1:oJDH3BJKyqBA2TXFhDsKDGDTlndYOZ6rGS0BRZIxGhM= github.com/gobwas/httphead v0.1.0/go.mod h1:O/RXo79gxV8G+RqlR/otEwx4Q36zl9rqC5u12GKvMCM= @@ -252,6 +256,8 @@ github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80/go.mod h1:imJHygn/1yfhB7XSJJKlFZKl/J+dCPAknuiaGOshXAs= +github.com/lib/pq v1.10.9 h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw= +github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o= github.com/logrusorgru/aurora v2.0.3+incompatible h1:tOpm7WcpBTn4fjmVfgpQq0EfczGlG91VSDkswnjF5A8= github.com/logrusorgru/aurora v2.0.3+incompatible/go.mod h1:7rIyQOR62GCctdiQpZ/zOJlFyk6y+94wXzv6RNZgaR4= github.com/logrusorgru/aurora/v4 v4.0.0 h1:sRjfPpun/63iADiSvGGjgA1cAYegEWMPCJdUpJYn9JA= @@ -288,6 +294,8 @@ github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= github.com/modern-go/reflect2 v1.0.2/go.mod 
h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/montanaflynn/stats v0.7.1 h1:etflOAAHORrCC44V+aR6Ftzort912ZU+YLiSTuV8eaE= +github.com/montanaflynn/stats v0.7.1/go.mod h1:etXPPgVO6n31NxCd9KQUMvCM+ve0ruNzt6R8Bnaayow= github.com/mreiferson/go-httpclient v0.0.0-20160630210159-31f0106b4474/go.mod h1:OQA4XLvDbMgS8P0CevmM4m9Q3Jq4phKUzcocxuGJ5m8= github.com/mreiferson/go-httpclient v0.0.0-20201222173833-5e475fde3a4d/go.mod h1:OQA4XLvDbMgS8P0CevmM4m9Q3Jq4phKUzcocxuGJ5m8= github.com/muesli/reflow v0.3.0 h1:IFsN6K9NfGtjeggFP+68I4chLZV2yIKsXJFNZ+eWh6s= @@ -462,10 +470,16 @@ github.com/weppos/publicsuffix-go v0.50.2 h1:KsJFc8IEKTJovM46SRCnGNsM+rFShxcs6VE github.com/weppos/publicsuffix-go v0.50.2/go.mod h1:CbQCKDtXF8UcT7hrxeMa0MDjwhpOI9iYOU7cfq+yo8k= github.com/xdg-go/pbkdf2 v1.0.0 h1:Su7DPu48wXMwC3bs7MCNG+z4FhcyEuz5dlvchbq0B0c= github.com/xdg-go/pbkdf2 v1.0.0/go.mod h1:jrpuAogTd400dnrH08LKmI/xc1MbPOebTwRqcT5RDeI= +github.com/xdg-go/scram v1.1.2 h1:FHX5I5B4i4hKRVRBCFRxq1iQRej7WO3hhBuJf+UUySY= +github.com/xdg-go/scram v1.1.2/go.mod h1:RT/sEzTbU5y00aCK8UOx6R7YryM0iF1N2MOmC3kKLN4= +github.com/xdg-go/stringprep v1.0.4 h1:XLI/Ng3O1Atzq0oBs3TWm+5ZVgkq2aqdlvP9JtoZ6c8= +github.com/xdg-go/stringprep v1.0.4/go.mod h1:mPGuuIYwz7CmR2bT9j4GbQqutWS1zV24gijq1dTyGkM= github.com/xyproto/randomstring v1.0.5 h1:YtlWPoRdgMu3NZtP45drfy1GKoojuR7hmRcnhZqKjWU= github.com/xyproto/randomstring v1.0.5/go.mod h1:rgmS5DeNXLivK7YprL0pY+lTuhNQW3iGxZ18UQApw/E= github.com/yl2chen/cidranger v1.0.2 h1:lbOWZVCG1tCRX4u24kuM1Tb4nHqWkDxwLdoS+SevawU= github.com/yl2chen/cidranger v1.0.2/go.mod h1:9U1yz7WPYDwf0vpNWFaeRh0bjwz5RVgRy/9UEQfHl0g= +github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78 h1:ilQV1hzziu+LLM3zUTJ0trRztfwgjqKnBWNtSRkbmwM= +github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78/go.mod h1:aL8wCCfTfSfmXjznFBSZNN13rSJjlIOI1fUNAtF7rmI= github.com/ysmood/fetchup v0.2.3 h1:ulX+SonA0Vma5zUFXtv52Kzip/xe7aj4vqT5AJwQ+ZQ= github.com/ysmood/fetchup v0.2.3/go.mod h1:xhibcRKziSvol0H1/pj33dnKrYyI2ebIvz5cOOkYGns= github.com/ysmood/goob v0.4.0 h1:HsxXhyLBeGzWXnqVKtmT9qM7EuVs/XOgkX7T6r1o1AQ= @@ -502,6 +516,8 @@ github.com/zmap/zcrypto v0.0.0-20240512203510-0fef58d9a9db/go.mod h1:mo/07mo6reD github.com/zmap/zlint/v3 v3.0.0/go.mod h1:paGwFySdHIBEMJ61YjoqT4h7Ge+fdYG4sUQhnTb1lJ8= go.etcd.io/bbolt v1.4.0 h1:TU77id3TnN/zKr7CO/uk+fBCwF2jGcMuw2B/FMAzYIk= go.etcd.io/bbolt v1.4.0/go.mod h1:AsD+OCi/qPN1giOX1aiLAha3o1U8rAz65bvN4j0sRuk= +go.mongodb.org/mongo-driver v1.17.6 h1:87JUG1wZfWsr6rIz3ZmpH90rL5tea7O3IHuSwHUpsss= +go.mongodb.org/mongo-driver v1.17.6/go.mod h1:Hy04i7O2kC4RS06ZrhPRqj/u4DTYkFDAAccj+rVKqgQ= go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU= go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8= go.opencensus.io v0.22.2/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= @@ -684,6 +700,7 @@ golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.4/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.8.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= diff --git a/internal/db/config.go 
b/internal/db/config.go new file mode 100644 index 00000000..c72dcac3 --- /dev/null +++ b/internal/db/config.go @@ -0,0 +1,125 @@ +package db + +import ( + "fmt" + "os" + "time" + + "gopkg.in/yaml.v3" +) + +const ( + DefaultBatchSize = 100 + + DefaultFlushInterval = time.Minute + + DefaultTableName = "results" + + DefaultDatabaseName = "httpx" + + EnvConnectionString = "HTTPX_DB_CONNECTION_STRING" +) + +type Config struct { + Type DatabaseType `yaml:"type"` + + ConnectionString string `yaml:"connection-string"` + + DatabaseName string `yaml:"database-name"` + + TableName string `yaml:"table-name"` + + BatchSize int `yaml:"batch-size"` + + FlushInterval time.Duration `yaml:"flush-interval"` + + OmitRaw bool `yaml:"omit-raw"` +} + +func (c *Config) Validate() error { + if !c.Type.IsValid() { + return fmt.Errorf("invalid database type: %s (supported: %v)", c.Type, SupportedDatabases()) + } + + if c.ConnectionString == "" { + return fmt.Errorf("connection string is required") + } + + return nil +} + +func (c *Config) ApplyDefaults() { + if c.DatabaseName == "" { + c.DatabaseName = DefaultDatabaseName + } + + if c.TableName == "" { + c.TableName = DefaultTableName + } + + if c.BatchSize <= 0 { + c.BatchSize = DefaultBatchSize + } + + if c.FlushInterval <= 0 { + c.FlushInterval = DefaultFlushInterval + } +} + +func LoadConfigFromFile(configPath string) (*Config, error) { + data, err := os.ReadFile(configPath) + if err != nil { + return nil, fmt.Errorf("failed to read config file: %w", err) + } + + var cfg Config + if err := yaml.Unmarshal(data, &cfg); err != nil { + return nil, fmt.Errorf("failed to parse config file: %w", err) + } + + if cfg.ConnectionString == "" { + cfg.ConnectionString = os.Getenv(EnvConnectionString) + } + + cfg.ApplyDefaults() + + if err := cfg.Validate(); err != nil { + return nil, err + } + + return &cfg, nil +} + +type Options struct { + Enabled bool + ConfigFile string + Type string + ConnectionString string + DatabaseName string + TableName string + BatchSize int + OmitRaw bool +} + +func (o *Options) ToConfig() (*Config, error) { + cfg := &Config{ + Type: DatabaseType(o.Type), + ConnectionString: o.ConnectionString, + DatabaseName: o.DatabaseName, + TableName: o.TableName, + BatchSize: o.BatchSize, + OmitRaw: o.OmitRaw, + } + + if cfg.ConnectionString == "" { + cfg.ConnectionString = os.Getenv(EnvConnectionString) + } + + cfg.ApplyDefaults() + + if err := cfg.Validate(); err != nil { + return nil, err + } + + return cfg, nil +} diff --git a/internal/db/db.go b/internal/db/db.go new file mode 100644 index 00000000..a4f7e717 --- /dev/null +++ b/internal/db/db.go @@ -0,0 +1,70 @@ +package db + +import ( + "context" + "fmt" + + "github.com/projectdiscovery/httpx/runner" +) + +type DatabaseType string + +const ( + MongoDB DatabaseType = "mongodb" + PostgreSQL DatabaseType = "postgres" + MySQL DatabaseType = "mysql" +) + +func (d DatabaseType) String() string { + return string(d) +} + +func (d DatabaseType) IsValid() bool { + switch d { + case MongoDB, PostgreSQL, MySQL: + return true + default: + return false + } +} + +type Database interface { + Connect(ctx context.Context) error + + Close() error + + InsertBatch(ctx context.Context, results []runner.Result) error + + EnsureSchema(ctx context.Context) error + + Type() DatabaseType +} + +type databaseFactory func(cfg *Config) (Database, error) + +var registry = make(map[DatabaseType]databaseFactory) + +func Register(dbType DatabaseType, factory databaseFactory) { + registry[dbType] = factory +} + +func NewDatabase(cfg 
*Config) (Database, error) { + if cfg == nil { + return nil, fmt.Errorf("database configuration is required") + } + + if !cfg.Type.IsValid() { + return nil, fmt.Errorf("unsupported database type: %s", cfg.Type) + } + + factory, ok := registry[cfg.Type] + if !ok { + return nil, fmt.Errorf("database type %s is not registered", cfg.Type) + } + + return factory(cfg) +} + +func SupportedDatabases() []DatabaseType { + return []DatabaseType{MongoDB, PostgreSQL, MySQL} +} diff --git a/internal/db/mongodb.go b/internal/db/mongodb.go new file mode 100644 index 00000000..5db69729 --- /dev/null +++ b/internal/db/mongodb.go @@ -0,0 +1,127 @@ +package db + +import ( + "context" + "encoding/json" + "fmt" + "time" + + "github.com/projectdiscovery/httpx/runner" + "go.mongodb.org/mongo-driver/bson" + "go.mongodb.org/mongo-driver/mongo" + "go.mongodb.org/mongo-driver/mongo/options" +) + +func init() { + Register(MongoDB, newMongoDatabase) +} + +type mongoDatabase struct { + cfg *Config + client *mongo.Client + database *mongo.Database + collection *mongo.Collection +} + +func newMongoDatabase(cfg *Config) (Database, error) { + return &mongoDatabase{cfg: cfg}, nil +} + +func (m *mongoDatabase) Connect(ctx context.Context) error { + clientOpts := options.Client(). + ApplyURI(m.cfg.ConnectionString). + SetConnectTimeout(10 * time.Second). + SetServerSelectionTimeout(10 * time.Second) + + client, err := mongo.Connect(ctx, clientOpts) + if err != nil { + return fmt.Errorf("failed to connect to MongoDB: %w", err) + } + + if err := client.Ping(ctx, nil); err != nil { + return fmt.Errorf("failed to ping MongoDB: %w", err) + } + + m.client = client + m.database = client.Database(m.cfg.DatabaseName) + m.collection = m.database.Collection(m.cfg.TableName) + + return nil +} + +func (m *mongoDatabase) Close() error { + if m.client != nil { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + return m.client.Disconnect(ctx) + } + return nil +} + +func (m *mongoDatabase) EnsureSchema(ctx context.Context) error { + indexes := []mongo.IndexModel{ + { + Keys: bson.D{{Key: "timestamp", Value: -1}}, + }, + { + Keys: bson.D{{Key: "url", Value: 1}}, + }, + { + Keys: bson.D{{Key: "host", Value: 1}}, + }, + { + Keys: bson.D{{Key: "status_code", Value: 1}}, + }, + { + Keys: bson.D{{Key: "tech", Value: 1}}, + Options: options.Index().SetSparse(true), + }, + } + + _, err := m.collection.Indexes().CreateMany(ctx, indexes) + if err != nil { + return fmt.Errorf("failed to create indexes: %w", err) + } + + return nil +} + +func (m *mongoDatabase) InsertBatch(ctx context.Context, results []runner.Result) error { + if len(results) == 0 { + return nil + } + + documents := make([]interface{}, len(results)) + for i, r := range results { + doc, err := m.resultToDocument(r) + if err != nil { + return fmt.Errorf("failed to convert result to document: %w", err) + } + documents[i] = doc + } + + _, err := m.collection.InsertMany(ctx, documents) + if err != nil { + return fmt.Errorf("failed to insert batch: %w", err) + } + + return nil +} + +func (m *mongoDatabase) Type() DatabaseType { + return MongoDB +} + +func (m *mongoDatabase) resultToDocument(r runner.Result) (bson.M, error) { + jsonBytes, err := json.Marshal(r) + if err != nil { + return nil, fmt.Errorf("failed to marshal result to JSON: %w", err) + } + + var doc bson.M + if err := json.Unmarshal(jsonBytes, &doc); err != nil { + return nil, fmt.Errorf("failed to unmarshal JSON to BSON: %w", err) + } + + return doc, nil +} diff --git 
a/internal/db/mysql.go b/internal/db/mysql.go new file mode 100644 index 00000000..50434985 --- /dev/null +++ b/internal/db/mysql.go @@ -0,0 +1,256 @@ +package db + +import ( + "context" + "database/sql" + "encoding/json" + "fmt" + + _ "github.com/go-sql-driver/mysql" + "github.com/projectdiscovery/httpx/runner" +) + +func init() { + Register(MySQL, newMySQLDatabase) +} + +type mysqlDatabase struct { + cfg *Config + db *sql.DB +} + +func newMySQLDatabase(cfg *Config) (Database, error) { + return &mysqlDatabase{cfg: cfg}, nil +} + +func (m *mysqlDatabase) Connect(ctx context.Context) error { + db, err := sql.Open("mysql", m.cfg.ConnectionString) + if err != nil { + return fmt.Errorf("failed to open MySQL connection: %w", err) + } + + if err := db.PingContext(ctx); err != nil { + return fmt.Errorf("failed to ping MySQL: %w", err) + } + + m.db = db + return nil +} + +func (m *mysqlDatabase) Close() error { + if m.db != nil { + return m.db.Close() + } + return nil +} + +func (m *mysqlDatabase) EnsureSchema(ctx context.Context) error { + schema := fmt.Sprintf(` + CREATE TABLE IF NOT EXISTS %s ( + id BIGINT AUTO_INCREMENT PRIMARY KEY, + + -- Basic info + timestamp DATETIME(6), + url TEXT, + input TEXT, + host VARCHAR(255), + port VARCHAR(10), + scheme VARCHAR(10), + path TEXT, + method VARCHAR(10), + final_url TEXT, + + -- Response data + status_code INT, + content_length INT, + content_type VARCHAR(255), + title TEXT, + webserver VARCHAR(255), + response_time VARCHAR(50), + location TEXT, + body LONGTEXT, + body_preview TEXT, + raw_header LONGTEXT, + request LONGTEXT, + + -- Network info + host_ip VARCHAR(45), + a JSON, + aaaa JSON, + cname JSON, + resolvers JSON, + body_fqdn JSON, + body_domains JSON, + sni TEXT, + + -- Technology detection + tech JSON, + + -- Hashes and fingerprints + hash JSON, + favicon VARCHAR(100), + favicon_md5 VARCHAR(32), + favicon_path TEXT, + favicon_url TEXT, + jarm_hash VARCHAR(62), + + -- CDN info + cdn BOOLEAN, + cdn_name VARCHAR(100), + cdn_type VARCHAR(50), + + -- ASN info + asn JSON, + + -- TLS data + tls JSON, + + -- CSP data + csp JSON, + + -- Status flags + failed BOOLEAN, + error TEXT, + websocket BOOLEAN, + http2 BOOLEAN, + pipeline BOOLEAN, + vhost BOOLEAN, + + -- Metrics + words INT, + ` + "`lines`" + ` INT, + + -- Headers and extracts + header JSON, + extracts JSON, + extract_regex JSON, + + -- Chain data + chain JSON, + chain_status_codes JSON, + + -- Headless/Screenshot + headless_body LONGTEXT, + screenshot_bytes LONGBLOB, + screenshot_path TEXT, + screenshot_path_rel TEXT, + stored_response_path TEXT, + + -- Knowledge base + knowledgebase JSON, + + -- Link requests + link_request JSON, + + -- Trace + trace JSON, + + INDEX idx_timestamp (timestamp), + INDEX idx_host (host), + INDEX idx_status_code (status_code) + ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci; + `, m.cfg.TableName) + + _, err := m.db.ExecContext(ctx, schema) + if err != nil { + return fmt.Errorf("failed to create schema: %w", err) + } + + return nil +} + +func (m *mysqlDatabase) InsertBatch(ctx context.Context, results []runner.Result) error { + if len(results) == 0 { + return nil + } + + tx, err := m.db.BeginTx(ctx, nil) + if err != nil { + return fmt.Errorf("failed to begin transaction: %w", err) + } + defer func() { + _ = tx.Rollback() + }() + + query := fmt.Sprintf(` + INSERT INTO %s ( + timestamp, url, input, host, port, scheme, path, method, final_url, + status_code, content_length, content_type, title, webserver, response_time, + location, body, 
body_preview, raw_header, request, + host_ip, a, aaaa, cname, resolvers, body_fqdn, body_domains, sni, + tech, hash, favicon, favicon_md5, favicon_path, favicon_url, jarm_hash, + cdn, cdn_name, cdn_type, asn, tls, csp, + failed, error, websocket, http2, pipeline, vhost, + words, `+"`lines`"+`, header, extracts, extract_regex, + chain, chain_status_codes, + headless_body, screenshot_bytes, screenshot_path, screenshot_path_rel, stored_response_path, + knowledgebase, link_request, trace + ) VALUES ( + ?, ?, ?, ?, ?, ?, ?, ?, ?, + ?, ?, ?, ?, ?, ?, + ?, ?, ?, ?, ?, + ?, ?, ?, ?, ?, ?, ?, ?, + ?, ?, ?, ?, ?, ?, ?, + ?, ?, ?, ?, ?, ?, + ?, ?, ?, ?, ?, ?, + ?, ?, ?, ?, ?, + ?, ?, + ?, ?, ?, ?, ?, + ?, ?, ? + )`, m.cfg.TableName) + + stmt, err := tx.PrepareContext(ctx, query) + if err != nil { + return fmt.Errorf("failed to prepare statement: %w", err) + } + defer stmt.Close() + + for _, r := range results { + aJSON, _ := json.Marshal(r.A) + aaaaJSON, _ := json.Marshal(r.AAAA) + cnameJSON, _ := json.Marshal(r.CNAMEs) + resolversJSON, _ := json.Marshal(r.Resolvers) + fqdnJSON, _ := json.Marshal(r.Fqdns) + domainsJSON, _ := json.Marshal(r.Domains) + techJSON, _ := json.Marshal(r.Technologies) + hashJSON, _ := json.Marshal(r.Hashes) + asnJSON, _ := json.Marshal(r.ASN) + tlsJSON, _ := json.Marshal(r.TLSData) + cspJSON, _ := json.Marshal(r.CSPData) + headerJSON, _ := json.Marshal(r.ResponseHeaders) + extractsJSON, _ := json.Marshal(r.Extracts) + extractRegexJSON, _ := json.Marshal(r.ExtractRegex) + chainJSON, _ := json.Marshal(r.Chain) + chainStatusJSON, _ := json.Marshal(r.ChainStatusCodes) + kbJSON, _ := json.Marshal(r.KnowledgeBase) + linkReqJSON, _ := json.Marshal(r.LinkRequest) + traceJSON, _ := json.Marshal(r.Trace) + + _, err = stmt.ExecContext(ctx, + r.Timestamp, r.URL, r.Input, r.Host, r.Port, r.Scheme, r.Path, r.Method, r.FinalURL, + r.StatusCode, r.ContentLength, r.ContentType, r.Title, r.WebServer, r.ResponseTime, + r.Location, r.ResponseBody, r.BodyPreview, r.RawHeaders, r.Request, + r.HostIP, aJSON, aaaaJSON, cnameJSON, resolversJSON, fqdnJSON, domainsJSON, r.SNI, + techJSON, hashJSON, r.FavIconMMH3, r.FavIconMD5, r.FaviconPath, r.FaviconURL, r.JarmHash, + r.CDN, r.CDNName, r.CDNType, asnJSON, tlsJSON, cspJSON, + r.Failed, r.Error, r.WebSocket, r.HTTP2, r.Pipeline, r.VHost, + r.Words, r.Lines, headerJSON, extractsJSON, extractRegexJSON, + chainJSON, chainStatusJSON, + r.HeadlessBody, r.ScreenshotBytes, r.ScreenshotPath, r.ScreenshotPathRel, r.StoredResponsePath, + kbJSON, linkReqJSON, traceJSON, + ) + if err != nil { + return fmt.Errorf("failed to insert result: %w", err) + } + } + + if err := tx.Commit(); err != nil { + return fmt.Errorf("failed to commit transaction: %w", err) + } + + return nil +} + +func (m *mysqlDatabase) Type() DatabaseType { + return MySQL +} diff --git a/internal/db/postgres.go b/internal/db/postgres.go new file mode 100644 index 00000000..80ecf7dc --- /dev/null +++ b/internal/db/postgres.go @@ -0,0 +1,256 @@ +package db + +import ( + "context" + "database/sql" + "encoding/json" + "fmt" + + "github.com/lib/pq" + "github.com/projectdiscovery/httpx/runner" +) + +func init() { + Register(PostgreSQL, newPostgresDatabase) +} + +type postgresDatabase struct { + cfg *Config + db *sql.DB +} + +func newPostgresDatabase(cfg *Config) (Database, error) { + return &postgresDatabase{cfg: cfg}, nil +} + +func (p *postgresDatabase) Connect(ctx context.Context) error { + db, err := sql.Open("postgres", p.cfg.ConnectionString) + if err != nil { + return fmt.Errorf("failed to open 
PostgreSQL connection: %w", err) + } + + if err := db.PingContext(ctx); err != nil { + return fmt.Errorf("failed to ping PostgreSQL: %w", err) + } + + p.db = db + return nil +} + +func (p *postgresDatabase) Close() error { + if p.db != nil { + return p.db.Close() + } + return nil +} + +func (p *postgresDatabase) EnsureSchema(ctx context.Context) error { + schema := fmt.Sprintf(` + CREATE TABLE IF NOT EXISTS %s ( + id BIGSERIAL PRIMARY KEY, + + -- Basic info + timestamp TIMESTAMP WITH TIME ZONE, + url TEXT, + input TEXT, + host TEXT, + port TEXT, + scheme TEXT, + path TEXT, + method TEXT, + final_url TEXT, + + -- Response data + status_code INTEGER, + content_length INTEGER, + content_type TEXT, + title TEXT, + webserver TEXT, + response_time TEXT, + location TEXT, + body TEXT, + body_preview TEXT, + raw_header TEXT, + request TEXT, + + -- Network info + host_ip TEXT, + a TEXT[], + aaaa TEXT[], + cname TEXT[], + resolvers TEXT[], + body_fqdn TEXT[], + body_domains TEXT[], + sni TEXT, + + -- Technology detection + tech TEXT[], + + -- Hashes and fingerprints + hash JSONB, + favicon TEXT, + favicon_md5 TEXT, + favicon_path TEXT, + favicon_url TEXT, + jarm_hash TEXT, + + -- CDN info + cdn BOOLEAN, + cdn_name TEXT, + cdn_type TEXT, + + -- ASN info + asn JSONB, + + -- TLS data + tls JSONB, + + -- CSP data + csp JSONB, + + -- Status flags + failed BOOLEAN, + error TEXT, + websocket BOOLEAN, + http2 BOOLEAN, + pipeline BOOLEAN, + vhost BOOLEAN, + + -- Metrics + words INTEGER, + lines INTEGER, + + -- Headers and extracts + header JSONB, + extracts JSONB, + extract_regex TEXT[], + + -- Chain data + chain JSONB, + chain_status_codes INTEGER[], + + -- Headless/Screenshot + headless_body TEXT, + screenshot_bytes BYTEA, + screenshot_path TEXT, + screenshot_path_rel TEXT, + stored_response_path TEXT, + + -- Knowledge base + knowledgebase JSONB, + + -- Link requests + link_request JSONB, + + -- Trace + trace JSONB + ); + + CREATE INDEX IF NOT EXISTS idx_%s_timestamp ON %s(timestamp DESC); + CREATE INDEX IF NOT EXISTS idx_%s_url ON %s(url); + CREATE INDEX IF NOT EXISTS idx_%s_host ON %s(host); + CREATE INDEX IF NOT EXISTS idx_%s_status_code ON %s(status_code); + CREATE INDEX IF NOT EXISTS idx_%s_tech ON %s USING GIN(tech); + `, + p.cfg.TableName, + p.cfg.TableName, p.cfg.TableName, + p.cfg.TableName, p.cfg.TableName, + p.cfg.TableName, p.cfg.TableName, + p.cfg.TableName, p.cfg.TableName, + p.cfg.TableName, p.cfg.TableName, + ) + + _, err := p.db.ExecContext(ctx, schema) + if err != nil { + return fmt.Errorf("failed to create schema: %w", err) + } + + return nil +} + +func (p *postgresDatabase) InsertBatch(ctx context.Context, results []runner.Result) error { + if len(results) == 0 { + return nil + } + + tx, err := p.db.BeginTx(ctx, nil) + if err != nil { + return fmt.Errorf("failed to begin transaction: %w", err) + } + defer func() { + _ = tx.Rollback() + }() + + query := fmt.Sprintf(` + INSERT INTO %s ( + timestamp, url, input, host, port, scheme, path, method, final_url, + status_code, content_length, content_type, title, webserver, response_time, + location, body, body_preview, raw_header, request, + host_ip, a, aaaa, cname, resolvers, body_fqdn, body_domains, sni, + tech, hash, favicon, favicon_md5, favicon_path, favicon_url, jarm_hash, + cdn, cdn_name, cdn_type, asn, tls, csp, + failed, error, websocket, http2, pipeline, vhost, + words, lines, header, extracts, extract_regex, + chain, chain_status_codes, + headless_body, screenshot_bytes, screenshot_path, screenshot_path_rel, stored_response_path, + 
knowledgebase, link_request, trace + ) VALUES ( + $1, $2, $3, $4, $5, $6, $7, $8, $9, + $10, $11, $12, $13, $14, $15, + $16, $17, $18, $19, $20, + $21, $22, $23, $24, $25, $26, $27, $28, + $29, $30, $31, $32, $33, $34, $35, + $36, $37, $38, $39, $40, $41, + $42, $43, $44, $45, $46, $47, + $48, $49, $50, $51, $52, + $53, $54, + $55, $56, $57, $58, $59, + $60, $61, $62 + )`, p.cfg.TableName) + + stmt, err := tx.PrepareContext(ctx, query) + if err != nil { + return fmt.Errorf("failed to prepare statement: %w", err) + } + defer stmt.Close() + + for _, r := range results { + hashJSON, _ := json.Marshal(r.Hashes) + asnJSON, _ := json.Marshal(r.ASN) + tlsJSON, _ := json.Marshal(r.TLSData) + cspJSON, _ := json.Marshal(r.CSPData) + headerJSON, _ := json.Marshal(r.ResponseHeaders) + extractsJSON, _ := json.Marshal(r.Extracts) + chainJSON, _ := json.Marshal(r.Chain) + kbJSON, _ := json.Marshal(r.KnowledgeBase) + linkReqJSON, _ := json.Marshal(r.LinkRequest) + traceJSON, _ := json.Marshal(r.Trace) + + _, err = stmt.ExecContext(ctx, + r.Timestamp, r.URL, r.Input, r.Host, r.Port, r.Scheme, r.Path, r.Method, r.FinalURL, + r.StatusCode, r.ContentLength, r.ContentType, r.Title, r.WebServer, r.ResponseTime, + r.Location, r.ResponseBody, r.BodyPreview, r.RawHeaders, r.Request, + r.HostIP, pq.Array(r.A), pq.Array(r.AAAA), pq.Array(r.CNAMEs), pq.Array(r.Resolvers), pq.Array(r.Fqdns), pq.Array(r.Domains), r.SNI, + pq.Array(r.Technologies), hashJSON, r.FavIconMMH3, r.FavIconMD5, r.FaviconPath, r.FaviconURL, r.JarmHash, + r.CDN, r.CDNName, r.CDNType, asnJSON, tlsJSON, cspJSON, + r.Failed, r.Error, r.WebSocket, r.HTTP2, r.Pipeline, r.VHost, + r.Words, r.Lines, headerJSON, extractsJSON, pq.Array(r.ExtractRegex), + chainJSON, pq.Array(r.ChainStatusCodes), + r.HeadlessBody, r.ScreenshotBytes, r.ScreenshotPath, r.ScreenshotPathRel, r.StoredResponsePath, + kbJSON, linkReqJSON, traceJSON, + ) + if err != nil { + return fmt.Errorf("failed to insert result: %w", err) + } + } + + if err := tx.Commit(); err != nil { + return fmt.Errorf("failed to commit transaction: %w", err) + } + + return nil +} + +func (p *postgresDatabase) Type() DatabaseType { + return PostgreSQL +} diff --git a/internal/db/writer.go b/internal/db/writer.go new file mode 100644 index 00000000..ebc894a2 --- /dev/null +++ b/internal/db/writer.go @@ -0,0 +1,148 @@ +package db + +import ( + "context" + "sync" + "sync/atomic" + "time" + + "github.com/projectdiscovery/gologger" + "github.com/projectdiscovery/httpx/runner" +) + +type Writer struct { + db Database + cfg *Config + data chan runner.Result + done chan struct{} + counter atomic.Int64 + closed atomic.Bool + wg sync.WaitGroup + ctx context.Context + cancel context.CancelFunc + omitRaw bool +} + +func NewWriter(ctx context.Context, cfg *Config) (*Writer, error) { + db, err := NewDatabase(cfg) + if err != nil { + return nil, err + } + + if err := db.Connect(ctx); err != nil { + return nil, err + } + + if err := db.EnsureSchema(ctx); err != nil { + _ = db.Close() + return nil, err + } + + writerCtx, cancel := context.WithCancel(ctx) + + w := &Writer{ + db: db, + cfg: cfg, + data: make(chan runner.Result, cfg.BatchSize), + done: make(chan struct{}), + ctx: writerCtx, + cancel: cancel, + omitRaw: cfg.OmitRaw, + } + + w.wg.Add(1) + go w.run() + + gologger.Info().Msgf("Database output enabled: %s (%s/%s)", cfg.Type, cfg.DatabaseName, cfg.TableName) + + return w, nil +} + +func (w *Writer) GetWriterCallback() runner.OnResultCallback { + return func(r runner.Result) { + if r.Err != nil { + return + } + + if 
w.omitRaw { + r.Raw = "" + r.Request = "" + r.ResponseBody = "" + } + + select { + case w.data <- r: + case <-w.ctx.Done(): + } + } +} + +func (w *Writer) run() { + defer w.wg.Done() + defer close(w.done) + + batch := make([]runner.Result, 0, w.cfg.BatchSize) + ticker := time.NewTicker(w.cfg.FlushInterval) + defer ticker.Stop() + + flush := func() { + if len(batch) == 0 { + return + } + + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + + if err := w.db.InsertBatch(ctx, batch); err != nil { + gologger.Error().Msgf("Failed to insert batch to database: %v", err) + } else { + w.counter.Add(int64(len(batch))) + gologger.Verbose().Msgf("Inserted %d results to database (total: %d)", len(batch), w.counter.Load()) + } + + batch = batch[:0] + } + + for { + select { + case <-w.ctx.Done(): + flush() + return + + case <-ticker.C: + flush() + + case result, ok := <-w.data: + if !ok { + flush() + return + } + + batch = append(batch, result) + + if len(batch) >= w.cfg.BatchSize { + flush() + } + } + } +} + +func (w *Writer) Close() { + if !w.closed.CompareAndSwap(false, true) { + return + } + + w.cancel() + close(w.data) + <-w.done + + if err := w.db.Close(); err != nil { + gologger.Error().Msgf("Error closing database connection: %v", err) + } + + gologger.Info().Msgf("Database writer closed. Total results stored: %d", w.counter.Load()) +} + +func (w *Writer) Stats() int64 { + return w.counter.Load() +} diff --git a/runner/options.go b/runner/options.go index 46e9bc80..1862164b 100644 --- a/runner/options.go +++ b/runner/options.go @@ -357,6 +357,15 @@ type Options struct { Trace bool + ResultDatabase bool + ResultDatabaseConfig string + ResultDatabaseType string + ResultDatabaseConnStr string + ResultDatabaseName string + ResultDatabaseTable string + ResultDatabaseBatchSize int + ResultDatabaseOmitRaw bool + // Optional pre-created objects to reduce allocations Wappalyzer *wappalyzer.Wappalyze Networkpolicy *networkpolicy.NetworkPolicy @@ -494,6 +503,14 @@ func ParseOptions() *Options { flagSet.BoolVarP(&options.StoreVisionReconClusters, "store-vision-recon-cluster", "svrc", false, "include visual recon clusters (-ss and -sr only)"), flagSet.StringVarP(&options.Protocol, "protocol", "pr", "", "protocol to use (unknown, http11, http2 [experimental], http3 [experimental])"), flagSet.StringVarP(&options.OutputFilterErrorPagePath, "filter-error-page-path", "fepp", "filtered_error_page.json", "path to store filtered error pages"), + flagSet.BoolVarP(&options.ResultDatabase, "result-db", "rdb", false, "store results in database"), + flagSet.StringVarP(&options.ResultDatabaseConfig, "result-db-config", "rdbc", "", "path to database config file"), + flagSet.StringVarP(&options.ResultDatabaseType, "result-db-type", "rdbt", "", "database type (mongodb, postgres, mysql)"), + flagSet.StringVarP(&options.ResultDatabaseConnStr, "result-db-conn", "rdbcs", "", "database connection string (env: HTTPX_DB_CONNECTION_STRING)"), + flagSet.StringVarP(&options.ResultDatabaseName, "result-db-name", "rdbn", "httpx", "database name"), + flagSet.StringVarP(&options.ResultDatabaseTable, "result-db-table", "rdbtb", "results", "table/collection name"), + flagSet.IntVarP(&options.ResultDatabaseBatchSize, "result-db-batch-size", "rdbbs", 100, "batch size for database inserts"), + flagSet.BoolVarP(&options.ResultDatabaseOmitRaw, "result-db-omit-raw", "rdbor", false, "omit raw request/response data from database"), ) flagSet.CreateGroup("configs", "Configurations", From 
f2690f50afbf13b8b2a577606ad43ca1e1e38869 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Do=C4=9Fan=20Can=20Bak=C4=B1r?= Date: Tue, 6 Jan 2026 13:31:04 +0300 Subject: [PATCH 2/3] fix: address CodeRabbitAI review comments for database output - Fix SQL injection in postgres.go using pq.QuoteIdentifier for table/index names - Fix SQL injection in mysql.go using custom quoteIdentifier function - Fix race condition in writer.go by checking closed state before channel send - Add missing idx_url index in mysql.go for parity with PostgreSQL schema - Include RawHeaders in omitRaw check for consistency --- internal/db/mysql.go | 15 ++++++++++++--- internal/db/postgres.go | 32 ++++++++++++++++++++------------ internal/db/writer.go | 5 +++++ 3 files changed, 37 insertions(+), 15 deletions(-) diff --git a/internal/db/mysql.go b/internal/db/mysql.go index 50434985..6ebc0011 100644 --- a/internal/db/mysql.go +++ b/internal/db/mysql.go @@ -5,11 +5,16 @@ import ( "database/sql" "encoding/json" "fmt" + "strings" _ "github.com/go-sql-driver/mysql" "github.com/projectdiscovery/httpx/runner" ) +func quoteIdentifier(name string) string { + return "`" + strings.ReplaceAll(name, "`", "``") + "`" +} + func init() { Register(MySQL, newMySQLDatabase) } @@ -45,6 +50,7 @@ func (m *mysqlDatabase) Close() error { } func (m *mysqlDatabase) EnsureSchema(ctx context.Context) error { + tableName := quoteIdentifier(m.cfg.TableName) schema := fmt.Sprintf(` CREATE TABLE IF NOT EXISTS %s ( id BIGINT AUTO_INCREMENT PRIMARY KEY, @@ -118,7 +124,7 @@ func (m *mysqlDatabase) EnsureSchema(ctx context.Context) error { -- Metrics words INT, - ` + "`lines`" + ` INT, + `+"`lines`"+` INT, -- Headers and extracts header JSON, @@ -146,10 +152,11 @@ func (m *mysqlDatabase) EnsureSchema(ctx context.Context) error { trace JSON, INDEX idx_timestamp (timestamp), + INDEX idx_url (url(255)), INDEX idx_host (host), INDEX idx_status_code (status_code) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci; - `, m.cfg.TableName) + `, tableName) _, err := m.db.ExecContext(ctx, schema) if err != nil { @@ -172,6 +179,8 @@ func (m *mysqlDatabase) InsertBatch(ctx context.Context, results []runner.Result _ = tx.Rollback() }() + // Use quoteIdentifier to safely quote table name to prevent SQL injection + tableName := quoteIdentifier(m.cfg.TableName) query := fmt.Sprintf(` INSERT INTO %s ( timestamp, url, input, host, port, scheme, path, method, final_url, @@ -197,7 +206,7 @@ func (m *mysqlDatabase) InsertBatch(ctx context.Context, results []runner.Result ?, ?, ?, ?, ?, ?, ?, ?, ?, ? 
- )`, m.cfg.TableName) + )`, tableName) stmt, err := tx.PrepareContext(ctx, query) if err != nil { diff --git a/internal/db/postgres.go b/internal/db/postgres.go index 80ecf7dc..b53d5aaf 100644 --- a/internal/db/postgres.go +++ b/internal/db/postgres.go @@ -45,6 +45,13 @@ func (p *postgresDatabase) Close() error { } func (p *postgresDatabase) EnsureSchema(ctx context.Context) error { + tableName := pq.QuoteIdentifier(p.cfg.TableName) + idxTimestamp := pq.QuoteIdentifier("idx_" + p.cfg.TableName + "_timestamp") + idxURL := pq.QuoteIdentifier("idx_" + p.cfg.TableName + "_url") + idxHost := pq.QuoteIdentifier("idx_" + p.cfg.TableName + "_host") + idxStatusCode := pq.QuoteIdentifier("idx_" + p.cfg.TableName + "_status_code") + idxTech := pq.QuoteIdentifier("idx_" + p.cfg.TableName + "_tech") + schema := fmt.Sprintf(` CREATE TABLE IF NOT EXISTS %s ( id BIGSERIAL PRIMARY KEY, @@ -146,18 +153,18 @@ func (p *postgresDatabase) EnsureSchema(ctx context.Context) error { trace JSONB ); - CREATE INDEX IF NOT EXISTS idx_%s_timestamp ON %s(timestamp DESC); - CREATE INDEX IF NOT EXISTS idx_%s_url ON %s(url); - CREATE INDEX IF NOT EXISTS idx_%s_host ON %s(host); - CREATE INDEX IF NOT EXISTS idx_%s_status_code ON %s(status_code); - CREATE INDEX IF NOT EXISTS idx_%s_tech ON %s USING GIN(tech); + CREATE INDEX IF NOT EXISTS %s ON %s(timestamp DESC); + CREATE INDEX IF NOT EXISTS %s ON %s(url); + CREATE INDEX IF NOT EXISTS %s ON %s(host); + CREATE INDEX IF NOT EXISTS %s ON %s(status_code); + CREATE INDEX IF NOT EXISTS %s ON %s USING GIN(tech); `, - p.cfg.TableName, - p.cfg.TableName, p.cfg.TableName, - p.cfg.TableName, p.cfg.TableName, - p.cfg.TableName, p.cfg.TableName, - p.cfg.TableName, p.cfg.TableName, - p.cfg.TableName, p.cfg.TableName, + tableName, + idxTimestamp, tableName, + idxURL, tableName, + idxHost, tableName, + idxStatusCode, tableName, + idxTech, tableName, ) _, err := p.db.ExecContext(ctx, schema) @@ -181,6 +188,7 @@ func (p *postgresDatabase) InsertBatch(ctx context.Context, results []runner.Res _ = tx.Rollback() }() + tableName := pq.QuoteIdentifier(p.cfg.TableName) query := fmt.Sprintf(` INSERT INTO %s ( timestamp, url, input, host, port, scheme, path, method, final_url, @@ -206,7 +214,7 @@ func (p *postgresDatabase) InsertBatch(ctx context.Context, results []runner.Res $53, $54, $55, $56, $57, $58, $59, $60, $61, $62 - )`, p.cfg.TableName) + )`, tableName) stmt, err := tx.PrepareContext(ctx, query) if err != nil { diff --git a/internal/db/writer.go b/internal/db/writer.go index ebc894a2..62ba3a71 100644 --- a/internal/db/writer.go +++ b/internal/db/writer.go @@ -60,6 +60,10 @@ func NewWriter(ctx context.Context, cfg *Config) (*Writer, error) { func (w *Writer) GetWriterCallback() runner.OnResultCallback { return func(r runner.Result) { + if w.closed.Load() { + return + } + if r.Err != nil { return } @@ -68,6 +72,7 @@ func (w *Writer) GetWriterCallback() runner.OnResultCallback { r.Raw = "" r.Request = "" r.ResponseBody = "" + r.RawHeaders = "" } select { From c1c1fd0a76653985adc1297e36f9bc1d4a9784a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Do=C4=9Fan=20Can=20Bak=C4=B1r?= Date: Mon, 12 Jan 2026 12:29:38 +0300 Subject: [PATCH 3/3] refactor: use batcher utility and StringVarEnv --- internal/db/config.go | 4 -- internal/db/writer.go | 109 +++++++++++++----------------------------- runner/options.go | 2 +- 3 files changed, 34 insertions(+), 81 deletions(-) diff --git a/internal/db/config.go b/internal/db/config.go index c72dcac3..c057b80b 100644 --- a/internal/db/config.go +++ 
b/internal/db/config.go @@ -111,10 +111,6 @@ func (o *Options) ToConfig() (*Config, error) { OmitRaw: o.OmitRaw, } - if cfg.ConnectionString == "" { - cfg.ConnectionString = os.Getenv(EnvConnectionString) - } - cfg.ApplyDefaults() if err := cfg.Validate(); err != nil { diff --git a/internal/db/writer.go b/internal/db/writer.go index 62ba3a71..ec0042df 100644 --- a/internal/db/writer.go +++ b/internal/db/writer.go @@ -2,25 +2,21 @@ package db import ( "context" - "sync" "sync/atomic" "time" "github.com/projectdiscovery/gologger" "github.com/projectdiscovery/httpx/runner" + "github.com/projectdiscovery/utils/batcher" ) type Writer struct { - db Database - cfg *Config - data chan runner.Result - done chan struct{} - counter atomic.Int64 - closed atomic.Bool - wg sync.WaitGroup - ctx context.Context - cancel context.CancelFunc - omitRaw bool + db Database + cfg *Config + batcher *batcher.Batcher[runner.Result] + counter atomic.Int64 + closed atomic.Bool + omitRaw bool } func NewWriter(ctx context.Context, cfg *Config) (*Writer, error) { @@ -38,26 +34,41 @@ func NewWriter(ctx context.Context, cfg *Config) (*Writer, error) { return nil, err } - writerCtx, cancel := context.WithCancel(ctx) - w := &Writer{ db: db, cfg: cfg, - data: make(chan runner.Result, cfg.BatchSize), - done: make(chan struct{}), - ctx: writerCtx, - cancel: cancel, omitRaw: cfg.OmitRaw, } - w.wg.Add(1) - go w.run() + w.batcher = batcher.New( + batcher.WithMaxCapacity[runner.Result](cfg.BatchSize), + batcher.WithFlushInterval[runner.Result](cfg.FlushInterval), + batcher.WithFlushCallback(w.flush), + ) + + w.batcher.Run() gologger.Info().Msgf("Database output enabled: %s (%s/%s)", cfg.Type, cfg.DatabaseName, cfg.TableName) return w, nil } +func (w *Writer) flush(batch []runner.Result) { + if len(batch) == 0 { + return + } + + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + + if err := w.db.InsertBatch(ctx, batch); err != nil { + gologger.Error().Msgf("Failed to insert batch to database: %v", err) + } else { + w.counter.Add(int64(len(batch))) + gologger.Verbose().Msgf("Inserted %d results to database (total: %d)", len(batch), w.counter.Load()) + } +} + func (w *Writer) GetWriterCallback() runner.OnResultCallback { return func(r runner.Result) { if w.closed.Load() { @@ -75,60 +86,7 @@ func (w *Writer) GetWriterCallback() runner.OnResultCallback { r.RawHeaders = "" } - select { - case w.data <- r: - case <-w.ctx.Done(): - } - } -} - -func (w *Writer) run() { - defer w.wg.Done() - defer close(w.done) - - batch := make([]runner.Result, 0, w.cfg.BatchSize) - ticker := time.NewTicker(w.cfg.FlushInterval) - defer ticker.Stop() - - flush := func() { - if len(batch) == 0 { - return - } - - ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) - defer cancel() - - if err := w.db.InsertBatch(ctx, batch); err != nil { - gologger.Error().Msgf("Failed to insert batch to database: %v", err) - } else { - w.counter.Add(int64(len(batch))) - gologger.Verbose().Msgf("Inserted %d results to database (total: %d)", len(batch), w.counter.Load()) - } - - batch = batch[:0] - } - - for { - select { - case <-w.ctx.Done(): - flush() - return - - case <-ticker.C: - flush() - - case result, ok := <-w.data: - if !ok { - flush() - return - } - - batch = append(batch, result) - - if len(batch) >= w.cfg.BatchSize { - flush() - } - } + w.batcher.Append(r) } } @@ -137,9 +95,8 @@ func (w *Writer) Close() { return } - w.cancel() - close(w.data) - <-w.done + w.batcher.Stop() + w.batcher.WaitDone() if err := 
w.db.Close(); err != nil { gologger.Error().Msgf("Error closing database connection: %v", err) diff --git a/runner/options.go b/runner/options.go index 1862164b..9af8e541 100644 --- a/runner/options.go +++ b/runner/options.go @@ -506,7 +506,7 @@ func ParseOptions() *Options { flagSet.BoolVarP(&options.ResultDatabase, "result-db", "rdb", false, "store results in database"), flagSet.StringVarP(&options.ResultDatabaseConfig, "result-db-config", "rdbc", "", "path to database config file"), flagSet.StringVarP(&options.ResultDatabaseType, "result-db-type", "rdbt", "", "database type (mongodb, postgres, mysql)"), - flagSet.StringVarP(&options.ResultDatabaseConnStr, "result-db-conn", "rdbcs", "", "database connection string (env: HTTPX_DB_CONNECTION_STRING)"), + flagSet.StringVarEnv(&options.ResultDatabaseConnStr, "result-db-conn", "rdbcs", "", "HTTPX_DB_CONNECTION_STRING", "database connection string"), flagSet.StringVarP(&options.ResultDatabaseName, "result-db-name", "rdbn", "httpx", "database name"), flagSet.StringVarP(&options.ResultDatabaseTable, "result-db-table", "rdbtb", "results", "table/collection name"), flagSet.IntVarP(&options.ResultDatabaseBatchSize, "result-db-batch-size", "rdbbs", 100, "batch size for database inserts"),
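
---

For reference, a minimal sketch of the YAML file consumed by -rdbc / -result-db-config, based on the yaml tags declared on db.Config in internal/db/config.go. The file name and connection string below are placeholders, not part of the patch; connection-string may also be left empty and supplied via the HTTPX_DB_CONNECTION_STRING environment variable, and flush-interval falls back to the one-minute default when omitted.

    # db-config.yaml -- hypothetical example config for `httpx -rdb -rdbc db-config.yaml`
    type: mongodb                                   # one of: mongodb, postgres, mysql
    connection-string: "mongodb://localhost:27017"  # placeholder; or set HTTPX_DB_CONNECTION_STRING
    database-name: httpx                            # used by MongoDB; in this patch postgres/mysql take the database from the connection string
    table-name: results                             # SQL table / MongoDB collection name
    batch-size: 100                                 # results buffered before each InsertBatch call
    omit-raw: true                                  # drop body, request and raw headers before storing

A roughly equivalent CLI-only invocation, using the flags added by this series, would be: httpx -l targets.txt -rdb -rdbt mongodb -rdbcs mongodb://localhost:27017 -rdbor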