"""Scrape documentation sites into per-page JSON files.

For every site in home_urls, same-domain links are collected and fetched one by
one (with a 3-second delay between requests). Each HTML page's text is written
to a JSON file under base/documentation/<site name>/, and visited_links.json
records which URLs have already been processed so a rerun can skip them.
"""
from bs4 import BeautifulSoup
import requests
import os
import time
from tqdm import tqdm
import json
from urllib.parse import urljoin, urlparse


def get_content(home_url, links, visited_links, headers):
    """Fetch links[0], extract its text, and queue new same-domain links.

    Always returns a (data, links, visited_links) tuple; data is None when the
    page is skipped (non-HTTP scheme, non-HTML content, or a bad status code).
    """
    if not links:
        return None, links, visited_links
    home_domain = urlparse(home_url).netloc
    # Only fetch plain http(s) URLs.
    if urlparse(links[0]).scheme not in ('http', 'https'):
        return None, links, visited_links
    response = requests.get(links[0], headers=headers)
    content_type = response.headers.get('content-type')
    if content_type is None or "text/html" not in content_type:
        visited_links.append(links[0])
        links.remove(links[0])
        return None, links, visited_links
    if response.status_code == 200:
        current_link = str(links[0])
        visited_links.append(links[0])
        links.remove(links[0])
        soup = BeautifulSoup(response.text, 'html.parser')
        if soup is None:
            # current_link has already been moved to visited_links above.
            return None, links, visited_links
        # body = soup.find('body')
        content = soup.get_text(separator="\n", strip=True)
        content = content.replace("\"", "'").replace("\n", " ").replace("\t", " ")
        if soup.title is None:
            title = home_domain
        else:
            title = soup.title.string
        # Queue every unvisited same-domain link found on this page.
        a_tags = soup.find_all("a")
        for link in a_tags:
            href = link.get("href")
            if href is not None:
                if '#' in href:
                    continue
                elif home_url not in href and urlparse(href).scheme not in ('mailto', 'javascript', 'ftp', 'tel', 'file', 'data'):
                    href = urljoin(home_url, href)
                if urlparse(href).netloc != home_domain:
                    continue
                if href not in visited_links:
                    links.append(href)
        links = list(set(links))
        data = {"title": title, "link": current_link, "content": content}
        return data, links, visited_links
    else:
        visited_links.append(links[0])
        links.remove(links[0])
        return None, links, visited_links


if __name__ == "__main__":
    home_urls = [
        "https://numpy.org/doc/stable/",
        "https://pandas.pydata.org/docs/",
        "https://www.python.org/doc/",
        "https://www.w3schools.com/",
        "https://whoosh.readthedocs.io/en/latest/",
        "https://pytorch.org/docs/stable",
        "https://docs.flutter.dev/",
        "https://go.dev/doc/",
    ]
    headers = {
        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Mobile Safari/537.36",
        "Accept-Encoding": "*",
        "Accept-Language": "en-US,en;q=0.9",
        "Connection": "keep-alive"
    }
    base_dir = "base/documentation"
    with tqdm(total=len(home_urls)) as pbar:
        for home_url in home_urls:
            pbar.set_description(f"Scraping {home_url}:")
            # Derive a folder name from the domain, skipping a leading "www" or "docs".
            parts = home_url.split("//")[1].split(".")
            name = parts[1] if parts[0] == "www" or parts[0] == "docs" else parts[0]
            path = os.path.join(base_dir, name)
            os.makedirs(path, exist_ok=True)
            try:
                response = requests.get(home_url, headers=headers)
            except (ConnectionResetError, requests.exceptions.ConnectionError):
                print("Connection error, continuing with the next home_url")
                continue
            if response.status_code != 200:
                continue
            soup = BeautifulSoup(response.text, 'html.parser')
            # Resume from an earlier run if visited_links.json already exists.
            visited_links = []
            json_file = os.path.join(path, "visited_links.json")
            if os.path.exists(json_file):
                with open(json_file, "r") as f:
                    link_data = json.load(f)
                    visited_links = link_data['visited_links']
            # Seed the crawl queue with same-domain links from the home page.
            a_tags = soup.find_all("a")
            links = []
            home_domain = urlparse(home_url).netloc
            for tag in a_tags:
                href = tag.get("href")
                if href is not None:
                    if '#' in href:
                        continue
                    elif home_url not in href:
                        href = urljoin(home_url, href)
                    if urlparse(href).netloc != home_domain:
                        continue
                    links.append(href)
            links = list(set(links))
            visited_links.append(home_url)
            if home_url in links:
                links.remove(home_url)
            # Remove any links that are already in the visited_links list
            links = [link for link in links if link not in visited_links]
            # data_list = []
            counter = 0
            while len(links) > 0:
                time.sleep(3)
                data, links, visited_links = get_content(home_url, links, visited_links, headers=headers)
                if data is not None:
                    # data_list.append(data)
                    file_name = data['link'].split("//")[-1].replace(".", "_").replace("/", "_").replace("?", "_").replace("=", "_")
                    with open(os.path.join(path, f"{counter}_{file_name}.json"), "w", encoding="utf-8") as f:
                        json.dump(data, f, indent=4, ensure_ascii=False)
                    counter += 1
                v_link = {"visited_links": visited_links}
                with open(json_file, "w", encoding='utf-8') as f:
                    json.dump(v_link, f, indent=4, ensure_ascii=False)
                # If counter is greater than 150, exit the while loop and continue with the next home_url.
                if counter > 150:
                    break
                pbar.set_postfix(total_links=len(links), visited_links=len(visited_links), count=counter)
            pbar.update(1)