-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathmain.py
More file actions
152 lines (140 loc) · 6.57 KB
/
main.py
File metadata and controls
152 lines (140 loc) · 6.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import xml.dom.minidom as md
import requests as rq
import time
import os
import argparse
import sys
class SitemapParser:
    """Fetch the sitemaps listed in a robots.txt, save them under ./sitemaps/,
    and collect every <loc> URL they contain.

    All work happens in __init__: it downloads the robots file, pulls each
    sitemap, parses out the URLs, optionally writes them to ``outfile``
    (``output=True``) and optionally deletes the local XML copies
    (``delete=True``), then prints a summary.

    Parameters:
        sitemapurl: robots.txt URL (a bare hostname gets an https:// prefix).
        outfile:    path the URL list is written to when ``output`` is set.
        verbosity:  truthy enables per-file progress messages.
        delete:     remove the locally saved XML files when done.
        output:     write the collected URLs to ``outfile``.
        exclude:    a single sitemap URL to skip.
    """

    def __init__(self, sitemapurl, outfile, verbosity=None, delete=False, output=False, exclude=None):
        # Default bare hostnames to HTTPS. The original test
        # (`"http" not in sitemapurl`) matched "http" anywhere in the string;
        # startswith checks the scheme, which is what was intended.
        if not sitemapurl.startswith(("http://", "https://")):
            sitemapurl = f"https://{sitemapurl}"
        self.start_time = time.time()
        if output:
            print(f"[+] Constructing new Parser at {time.ctime(self.start_time)}, this may take a few minutes.")
        else:
            print(f"[+] Constructing new Parser at {time.ctime(self.start_time)}, this may take a few minutes. "
                  f"A URL count will be displayed, but no URLs will be saved. Use -o to save URLs to a file")
        self.sitemaps = []
        self.exclude = exclude
        self.files = []
        self.urls = []
        self.outfile = outfile
        self.robots = sitemapurl
        self.delete = delete
        self.verbosity = verbosity  # leftover debug print of this value removed
        self.output = output
        # Pipeline: discover sitemap URLs -> download files -> extract URLs.
        self.__get_sitemap_urls()
        self.__get_sitemap()
        self.__get_urls()
        if self.output:
            self._write_urls()
        if self.delete is not False:
            self._delete_local_files()
        self._count_urls()

    def __get_sitemap_urls(self):
        """Scrape sitemap URLs out of the robots.txt file; exit if none found."""
        print(f'[+] Getting urls for sitemaps from {self.robots}')
        r = rq.get(self.robots)
        for line in r.text.split('\n'):
            # robots.txt conventionally capitalizes "Sitemap:"; compare
            # case-insensitively so those lines are not missed.
            if "sitemap" in line.lower():
                url = line.split(" ")[-1]
                if url not in self.sitemaps and url != self.exclude:
                    self.sitemaps.append(url)
        if len(self.sitemaps) > 0:
            print("[+] Finished. Starting next step")
        else:
            print("[-] No sitemaps found")
            sys.exit()

    def __get_sitemap(self):
        """Download each discovered sitemap into ./sitemaps/ (deduped by file name)."""
        print(f"[+] Getting files from {len(self.sitemaps)} URLs and writing locally at ./sitemaps/")
        # Hoisted out of the loop; exist_ok avoids the explicit exists() check.
        os.makedirs("./sitemaps", exist_ok=True)
        for sitemap in self.sitemaps:
            if self.verbosity:
                print(f"[+] Getting sitemap @ {sitemap}")
            filename = sitemap.split('/')[-1]
            if filename not in self.files:
                self.files.append(filename)
                r = rq.get(sitemap)
                with open(f"./sitemaps/{filename}", "w+") as f:
                    f.write(r.text)
        print("[+] Finished. Starting next step")

    def __get_urls(self):
        """Parse every saved sitemap file and collect unique <loc> URLs."""
        print(f"[+] Parsing for URLs in {len(self.files)} Files")
        for file in self.files:
            if self.verbosity is not None:
                # fixed missing space: "...{file}for urls" -> "...{file} for urls"
                print(f"[+] Parsing ./sitemaps/{file} for urls")
            dom_tree = md.parse(f'./sitemaps/{file}')
            for loc in dom_tree.documentElement.getElementsByTagName("loc"):
                url = loc.childNodes[0].data
                if url not in self.urls:
                    self.urls.append(url)
        return self.urls

    def _delete_local_files(self):
        """Remove the locally saved XML files and the ./sitemaps directory."""
        print(f'[-] Deleting locally saved xml files')
        for file in self.files:
            if self.verbosity:
                # message fixed: files are saved under ./sitemaps/, not ./sitemap/
                print(f'Removing ./sitemaps/{file}')
            os.remove(f'./sitemaps/{file}')
        os.rmdir("./sitemaps")

    def _write_urls(self):
        """Write the collected URLs to self.outfile, one per line."""
        print(f'[+] Writing URLs to {self.outfile}')
        with open(f'{self.outfile}', "w+") as f:
            for url in self.urls:
                f.write(url + "\n")

    def _count_urls(self):
        """Print a one-line run summary (count, file count, elapsed time)."""
        # Single code path replaces four near-identical prints; minutes are
        # now rounded after dividing (the old round(secs)/60 printed values
        # like 1.5166666666666666).
        elapsed = time.time() - self.start_time
        if elapsed < 60:
            duration = f"{round(elapsed)} seconds"
        else:
            duration = f"{round(elapsed / 60, 2)} minutes"
        summary = f'[+] Parser found {len(self.urls)} URLs within {len(self.files)} files in {duration}.'
        if self.delete:
            summary += f' {len(self.files)} files deleted from local storage'
        print(summary)
def parse_args():
    """Build the command-line interface and return the parsed namespace."""
    parser = argparse.ArgumentParser(description="Parse Sitemap.xml files.")

    # -u is the only mandatory flag; group it so --help shows it separately.
    required = parser.add_argument_group('Required Arguments')
    required.add_argument(
        '-u', '--url', action="store", required=True,
        help="Enter the URL that lists sitemap.xml files, ex. https://www.tiktok.com/robots.txt")

    # Optional behavior switches. -o/-f/-dl take an optional value and fall
    # back to "all_urls.txt" when given without one.
    parser.add_argument(
        '-o', '--output', action="store", nargs="?", const="all_urls.txt",
        help="Use -o/--output to write all URLs found to a file")
    parser.add_argument(
        '-d', '--delete', action="store_true",
        help="Use -d/--delete to delete all locally written xml files")
    parser.add_argument(
        '-f', '--file', action='store', nargs='?', const="all_urls.txt",
        help="Future use")
    parser.add_argument(
        '-dl', '--delete_url_file', action="store", nargs='?', const="all_urls.txt",
        help="Use to clean up a url file from previous runs, "
             "mostly was used during testing the script.")
    parser.add_argument('-v', '--verbose', action="store_true", help="Use for debugging")
    parser.add_argument(
        '-e', '--exclude', action="store", nargs="?", const=None,
        help="Use to exclude urls breaking the script")

    return parser.parse_args()
if __name__ == "__main__":
    args = parse_args()
    if args.delete_url_file:
        # -dl mode: only remove a URL file left over from a previous run.
        os.remove(args.delete_url_file)
    else:
        # The original's two constructor calls differed solely in the
        # `output` flag (True when -o was given); collapse them into one
        # call — bool(args.output) reproduces that exactly.
        SitemapParser(args.url,
                      outfile=args.output,
                      delete=args.delete,
                      output=bool(args.output),
                      verbosity=args.verbose,
                      exclude=args.exclude)