Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file.
67 changes: 67 additions & 0 deletions build/lib/paramspider/client.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import requests
import random
import json
import logging
import time
import sys



# Configure the root logger when this module is imported so that the
# warnings and errors logged by fetch_url_content are emitted (INFO and above).
logging.basicConfig(level=logging.INFO)


# Number of attempts fetch_url_content makes for a URL before giving up.
MAX_RETRIES = 3

def load_user_agents():
    """
    Return the pool of browser User-Agent strings used for outgoing requests.

    Returns:
        list: A new list of User-Agent header values on every call.
    """
    agents = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36 Edg/89.0.774.45",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; AS; rv:11.0) like Gecko",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.96 Safari/537.36 Edge/16.16299",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 OPR/45.0.2552.898",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Vivaldi/1.8.770.50",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:54.0) Gecko/20100101 Firefox/54.0",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/15.15063",
        # NOTE: identical to the previous entry, so this agent is twice as
        # likely to be picked by a uniform random choice.
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/15.15063",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36",
    )
    return list(agents)

def fetch_url_content(url, proxy, timeout=None):
    """
    Fetch a URL with a randomly chosen User-Agent, retrying on failure.

    Args:
        url (str): The URL to request.
        proxy (str or None): Proxy address used for both HTTP and HTTPS
            requests, or None to connect without an explicit proxy.
        timeout (float, tuple or None): Optional timeout forwarded to
            ``requests.get``. The default (None) keeps the previous behaviour
            of waiting indefinitely for the server.

    Returns:
        requests.Response: The first response with a successful status code.

    Raises:
        SystemExit: With status 1 when all MAX_RETRIES attempts fail, or with
            the default status when the user interrupts with Ctrl-C.
    """
    user_agents = load_user_agents()
    proxies = None
    if proxy is not None:
        proxies = {
            'http': proxy,
            'https': proxy
        }

    # The whole retry loop sits inside one KeyboardInterrupt guard so that
    # Ctrl-C during the back-off sleep is handled as well as during the request.
    try:
        for attempt in range(1, MAX_RETRIES + 1):
            # Pick a fresh User-Agent for every attempt.
            headers = {
                "User-Agent": random.choice(user_agents)
            }

            try:
                response = requests.get(url, proxies=proxies, headers=headers, timeout=timeout)
                response.raise_for_status()
                return response
            except (requests.exceptions.RequestException, ValueError):
                if attempt == MAX_RETRIES:
                    # No attempt left: do not announce or wait for a retry
                    # that will never happen.
                    break
                logging.warning(f"Error fetching URL {url}. Retrying in 5 seconds...")
                time.sleep(5)
    except KeyboardInterrupt:
        logging.warning("Keyboard Interrupt received. Exiting gracefully...")
        sys.exit()

    logging.error(f"Failed to fetch URL {url} after {MAX_RETRIES} retries.")
    # Non-zero status so shells and pipelines can detect the failure.
    sys.exit(1)
168 changes: 168 additions & 0 deletions build/lib/paramspider/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
import argparse
import os
import logging
import colorama
from colorama import Fore, Style
from . import client # Importing client from a module named "client"
from urllib.parse import urlparse, parse_qs, urlencode
import os

# Raw ANSI escape sequences (bright yellow / reset) wrapped around the
# start-up banner printed by main().
yellow_color_code = "\033[93m"
reset_color_code = "\033[0m"

colorama.init(autoreset=True) # Initialize colorama for colored terminal output

# Log records are rendered as the bare message, without level or logger name.
log_format = '%(message)s'
logging.basicConfig(format=log_format, level=logging.INFO)
# NOTE(review): importing `client` above appears to call logging.basicConfig
# first, in which case the call on the previous line does nothing and this
# explicit setFormatter is what actually applies log_format - confirm.
logging.getLogger('').handlers[0].setFormatter(logging.Formatter(log_format))

# URL path extensions treated as static assets; main() passes this list as the
# `extensions` filter, so URLs whose path ends in one of them are left out.
HARDCODED_EXTENSIONS = [
    ".jpg", ".jpeg", ".png", ".gif", ".pdf", ".svg", ".json",
    ".css", ".js", ".webp", ".woff", ".woff2", ".eot", ".ttf", ".otf", ".mp4", ".txt"
]

def has_extension(url, extensions):
    """
    Tell whether the path component of a URL ends in one of the given extensions.

    Only the path is inspected, so dots in the query string or fragment are
    ignored. The extension taken from the URL is lower-cased before the lookup.

    Args:
        url (str): The URL to check.
        extensions (list): Extensions to match against, including the leading dot.

    Returns:
        bool: True if the URL path has a matching extension, False otherwise.
    """
    _, suffix = os.path.splitext(urlparse(url).path)
    return suffix.lower() in extensions

def clean_url(url):
    """
    Clean the URL by removing redundant port information for HTTP and HTTPS URLs.

    An explicit ``:80`` on an ``http`` URL or ``:443`` on an ``https`` URL is
    dropped; every other URL is returned as re-assembled by ``urlparse``.

    Args:
        url (str): The URL to clean.

    Returns:
        str: Cleaned URL. A URL that cannot be parsed (for example a
        non-numeric or out-of-range port, or a malformed IPv6 host) is
        returned unchanged instead of raising.
    """
    try:
        parsed_url = urlparse(url)
        # Accessing .port validates it and raises ValueError on junk such as
        # "host:abc" or "host:99999", which does occur in archived URLs.
        port = parsed_url.port
    except ValueError:
        return url

    if (parsed_url.scheme, port) in (("http", 80), ("https", 443)):
        # Split on the last colon only, so userinfo ("user:pass@host") and
        # bracketed IPv6 hosts keep their own colons.
        parsed_url = parsed_url._replace(netloc=parsed_url.netloc.rsplit(":", 1)[0])

    return parsed_url.geturl()

def clean_urls(urls, extensions, placeholder):
    """
    Normalise a list of URLs and replace every query parameter value with a placeholder.

    URLs whose path has one of the given extensions are dropped, and so are
    URLs that cannot be parsed. Duplicates produced by the normalisation are
    collapsed.

    Args:
        urls (list): List of URLs to clean.
        extensions (list): List of file extensions to check against.
        placeholder (str): Value substituted for every query parameter value.

    Returns:
        list: Sorted list of unique cleaned URLs.
    """
    cleaned_urls = set()
    for url in urls:
        try:
            cleaned_url = clean_url(url)
            if has_extension(cleaned_url, extensions):
                continue
            parsed_url = urlparse(cleaned_url)
        except ValueError:
            # Archived data contains malformed URLs (bad ports, broken IPv6
            # literals); skip them rather than abort the whole run.
            continue

        # keep_blank_values=True so parameters such as "?id=" are still
        # reported; by default parse_qs silently discards them.
        query_params = parse_qs(parsed_url.query, keep_blank_values=True)
        cleaned_query = urlencode({key: placeholder for key in query_params}, doseq=True)
        cleaned_urls.add(parsed_url._replace(query=cleaned_query).geturl())

    # Sorted so that repeated runs produce the same output order.
    return sorted(cleaned_urls)

def fetch_and_clean_urls(domain, extensions, stream_output, proxy, placeholder):
    """
    Fetch and clean URLs related to a specific domain from the Wayback Machine.

    The cleaned URLs that still contain a query string are written, one per
    line, to ``results/<domain>.txt``.

    Args:
        domain (str): The domain name to fetch URLs for.
        extensions (list): List of file extensions to check against.
        stream_output (bool): True to stream URLs to the terminal.
        proxy (str or None): Proxy address passed to the HTTP client.
        placeholder (str): Value substituted for every query parameter value.

    Returns:
        None
    """
    logging.info(f"{Fore.YELLOW}[INFO]{Style.RESET_ALL} Fetching URLs for {Fore.CYAN + domain + Style.RESET_ALL}")
    wayback_uri = f"https://web.archive.org/cdx/search/cdx?url={domain}/*&output=txt&collapse=urlkey&fl=original&page=/"
    response = client.fetch_url_content(wayback_uri, proxy)
    urls = response.text.split()

    logging.info(f"{Fore.YELLOW}[INFO]{Style.RESET_ALL} Found {Fore.GREEN + str(len(urls)) + Style.RESET_ALL} URLs for {Fore.CYAN + domain + Style.RESET_ALL}")

    # Announce the cleaning step before it runs, not after it has finished.
    logging.info(f"{Fore.YELLOW}[INFO]{Style.RESET_ALL} Cleaning URLs for {Fore.CYAN + domain + Style.RESET_ALL}")
    cleaned_urls = clean_urls(urls, extensions, placeholder)
    logging.info(f"{Fore.YELLOW}[INFO]{Style.RESET_ALL} Found {Fore.GREEN + str(len(cleaned_urls)) + Style.RESET_ALL} URLs after cleaning")
    logging.info(f"{Fore.YELLOW}[INFO]{Style.RESET_ALL} Extracting URLs with parameters")

    results_dir = "results"
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(results_dir, exist_ok=True)

    result_file = os.path.join(results_dir, f"{domain}.txt")

    # Explicit UTF-8: archived URLs may contain non-ASCII characters that the
    # platform default encoding cannot represent.
    with open(result_file, "w", encoding="utf-8") as f:
        for url in cleaned_urls:
            # Only URLs that still carry a query string are of interest.
            if "?" in url:
                f.write(url + "\n")
                if stream_output:
                    print(url)

    logging.info(f"{Fore.YELLOW}[INFO]{Style.RESET_ALL} Saved cleaned URLs to {Fore.CYAN + result_file + Style.RESET_ALL}")

def _normalize_domain(raw):
    """Trim, lower-case and strip the URL scheme and trailing slashes from a domain."""
    return raw.strip().lower().replace('https://', '').replace('http://', '').rstrip('/')


def main():
    """
    Main function to handle command-line arguments and start URL mining process.

    Exactly one of ``-d`` (single domain) or ``-l`` (file with one domain per
    line) must be given. Every domain is normalised the same way (scheme and
    trailing slash removed, lower-cased) before its URLs are fetched.
    """
    # Raw string: the banner contains backslashes that are not valid escape
    # sequences and would otherwise trigger a SyntaxWarning on recent Pythons.
    log_text = r"""

_ __
___ ___ ________ ___ _ ___ ___ (_)__/ /__ ____
/ _ \/ _ `/ __/ _ `/ ' \(_-</ _ \/ / _ / -_) __/
/ .__/\_,_/_/ \_,_/_/_/_/___/ .__/_/\_,_/\__/_/
/_/ /_/

with <3 by @0xasm0d3us
"""
    colored_log_text = f"{yellow_color_code}{log_text}{reset_color_code}"
    print(colored_log_text)
    parser = argparse.ArgumentParser(description="Mining URLs from dark corners of Web Archives ")
    parser.add_argument("-d", "--domain", help="Domain name to fetch related URLs for.")
    parser.add_argument("-l", "--list", help="File containing a list of domain names.")
    parser.add_argument("-s", "--stream", action="store_true", help="Stream URLs on the terminal.")
    parser.add_argument("--proxy", help="Set the proxy address for web requests.",default=None)
    parser.add_argument("-p", "--placeholder", help="placeholder for parameter values", default="FUZZ")
    args = parser.parse_args()

    if not args.domain and not args.list:
        parser.error("Please provide either the -d option or the -l option.")

    if args.domain and args.list:
        parser.error("Please provide either the -d option or the -l option, not both.")

    if args.list:
        with open(args.list, "r") as f:
            normalized = (_normalize_domain(line) for line in f)
            # dict.fromkeys removes duplicates while keeping the file's order;
            # empty lines are filtered out first.
            domains = list(dict.fromkeys(name for name in normalized if name))
    else:
        # Apply the same normalisation as for list entries so that
        # "-d https://example.com/" does not yield a broken result path.
        domains = [_normalize_domain(args.domain)]
        if not domains[0]:
            parser.error("Please provide either the -d option or the -l option.")

    extensions = HARDCODED_EXTENSIONS

    for domain in domains:
        fetch_and_clean_urls(domain, extensions, args.stream, args.proxy, args.placeholder)


if __name__ == "__main__":
    main()
99 changes: 99 additions & 0 deletions paramspider.egg-info/PKG-INFO
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
Metadata-Version: 2.4
Name: paramspider
Version: 0.1.0
Summary: Mining parameters from dark corners of Web Archives
Author: Devansh Batham
Author-email: devanshbatham009@gmail.com
License: MIT
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: requests
Requires-Dist: colorama
Dynamic: author
Dynamic: author-email
Dynamic: description
Dynamic: description-content-type
Dynamic: license
Dynamic: license-file
Dynamic: requires-dist
Dynamic: summary

<h1 align="center">
paramspider
<br>
</h1>

<h4 align="center"> Mining URLs from dark corners of Web Archives for bug hunting/fuzzing/further probing </h4>

<p align="center">
<a href="#about">📖 About</a> •
<a href="#installation">🏗️ Installation</a> •
<a href="#usage">⛏️ Usage</a> •
<a href="#examples">🚀 Examples</a> •
<a href="#contributing">🤝 Contributing</a> •
</p>


![paramspider](https://github.com/devanshbatham/ParamSpider/blob/master/static/paramspider.png?raw=true)

## About

`paramspider` allows you to fetch URLs related to any domain or a list of domains from Wayback archives. It filters out "boring" URLs, allowing you to focus on the ones that matter the most.

## Installation

To install `paramspider`, follow these steps:

```sh
git clone https://github.com/devanshbatham/paramspider
cd paramspider
pip install .
```

## Usage

To use `paramspider`, follow these steps:

```sh
paramspider -d example.com
```

## Examples

Here are a few examples of how to use `paramspider`:

- Discover URLs for a single domain:

```sh
paramspider -d example.com
```

- Discover URLs for multiple domains from a file:

```sh
paramspider -l domains.txt
```

- Stream URLs on the terminal:

```sh
paramspider -d example.com -s
```

- Set up web request proxy:

```sh
paramspider -d example.com --proxy '127.0.0.1:7890'
```
- Adding a placeholder for URL parameter values (default: "FUZZ"):

```sh
paramspider -d example.com -p '"><h1>reflection</h1>'
```


## Star History

[![Star History Chart](https://api.star-history.com/svg?repos=devanshbatham/paramspider&type=Date)](https://star-history.com/#devanshbatham/paramspider&Date)


12 changes: 12 additions & 0 deletions paramspider.egg-info/SOURCES.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
LICENSE
README.md
setup.py
paramspider/__init__.py
paramspider/client.py
paramspider/main.py
paramspider.egg-info/PKG-INFO
paramspider.egg-info/SOURCES.txt
paramspider.egg-info/dependency_links.txt
paramspider.egg-info/entry_points.txt
paramspider.egg-info/requires.txt
paramspider.egg-info/top_level.txt
1 change: 1 addition & 0 deletions paramspider.egg-info/dependency_links.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

2 changes: 2 additions & 0 deletions paramspider.egg-info/entry_points.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[console_scripts]
paramspider = paramspider.main:main
2 changes: 2 additions & 0 deletions paramspider.egg-info/requires.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
requests
colorama
1 change: 1 addition & 0 deletions paramspider.egg-info/top_level.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
paramspider