# generic_downloader.py
import requests
import bs4

urllist = []  # URLs already visited, so the same page is never downloaded twice
"""
TO-DO LIST:
1.Download all JS files on each page
2.Download all CSS files on each page
3.Get weblink,title and depth as command line parameters
4.Apply regEx to stop the code from getting swayed to other websites who's links may be present on some web page.
"""
def genricWebsiteDownloader(weblink, title, n):  # n is the remaining crawl depth
    # Stop when the depth is exhausted or this page has already been visited.
    if n == 0 or weblink in urllist:
        return
    new = n - 1
    res = requests.get(weblink)
    urllist.append(weblink)
    bsobj = bs4.BeautifulSoup(res.text, 'html.parser')

    # Download all pics on the page.
    itr = 0
    for link in bsobj.find_all('img'):
        if 'src' in link.attrs:
            try:
                # Try the src as an absolute URL first.
                res = requests.get(link['src'], stream=True)
                print("Downloading " + link['src'])
            except requests.exceptions.RequestException:
                try:
                    # Fall back to treating the src as relative to the current page.
                    res = requests.get(weblink + link['src'], stream=True)
                    urllist.append(weblink + link['src'])
                    print("Downloading " + weblink + link['src'])
                except requests.exceptions.RequestException:
                    continue
            try:
                # Save the image as <title><counter><last 4 chars of src>, e.g. "meter-down1.png".
                ext = link['src'][-4:]
                itr += 1
                file_name = title + str(itr) + ext
                with open(file_name, 'wb') as f:
                    for chunk in res.iter_content(chunk_size=8192):
                        f.write(chunk)
                # Point the saved page at the local copy instead of the remote URL.
                link['src'] = file_name
            except OSError:
                pass

    # Save a prettified copy of the page itself, after the img srcs have been
    # rewritten to the local file names.
    html = bsobj.prettify()
    with open(title + '.html', 'w', encoding='utf-8') as f:
        f.write(html)

    # Go to all links on the page and run the function recursively.
    for link in bsobj.find_all('a'):
        # link.string is reused as the file title, so skip anchors with no plain text.
        if 'href' in link.attrs and link.string is not None:
            try:
                genricWebsiteDownloader(link['href'], link.string, new)
                print("Downloading " + link['href'])
            except Exception:
                try:
                    genricWebsiteDownloader(weblink + link['href'], link.string, new)
                    print("Downloading " + weblink + link['href'])
                except Exception:
                    pass
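# Rough sketch for TO-DO items 1 and 2: the same pattern used for <img> tags
# above could be applied to <script src=...> and <link rel="stylesheet" href=...>
# tags. The helper name download_assets and its parameters are assumptions made
# here for illustration; it is not wired into genricWebsiteDownloader() yet.
def download_assets(bsobj, weblink, title):
    """Download every external JS and CSS file referenced by the parsed page."""
    itr = 0
    assets = [(tag, '.js') for tag in bsobj.find_all('script') if 'src' in tag.attrs]
    assets += [(tag, '.css') for tag in bsobj.find_all('link')
               if 'stylesheet' in tag.get('rel', []) and 'href' in tag.attrs]
    for tag, ext in assets:
        url = tag.get('src') or tag.get('href')
        try:
            # Treat URLs without a scheme as relative to the current page.
            res = requests.get(url if url.startswith('http') else weblink + url)
            itr += 1
            with open(title + str(itr) + ext, 'w', encoding='utf-8') as f:
                f.write(res.text)
        except (requests.exceptions.RequestException, OSError):
            pass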
weblink = 'http://meter-down.com'
title = "meter-down"
depth = 10
genricWebsiteDownloader(weblink, title, depth)
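# Rough sketch for TO-DO item 3: weblink, title and depth could be read from the
# command line with argparse instead of being hard-coded above. The flag names
# (--weblink, --title, --depth) are assumptions made here for illustration; if
# adopted, this block would replace the three hard-coded lines above.
#
#     import argparse
#
#     parser = argparse.ArgumentParser(description='Recursively download a website.')
#     parser.add_argument('--weblink', default='http://meter-down.com', help='start URL')
#     parser.add_argument('--title', default='meter-down', help='base name for the saved files')
#     parser.add_argument('--depth', type=int, default=10, help='maximum recursion depth')
#     args = parser.parse_args()
#     genricWebsiteDownloader(args.weblink, args.title, args.depth)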