# scrapingNode.py — 79 lines (63 loc) · 2.43 KB
# NOTE: this copy was scraped from a GitHub page; the UI chrome and
# line-number gutter that leaked into the text have been removed.
##https://practicaldatascience.co.uk/data-science/how-to-scrape-google-search-results-using-python
import requests
import urllib
from requests_html import HTMLSession
from trafilatura import extract
import requests
class Scraper:
    """Fetches a web page and extracts its main text content via trafilatura."""

    def __init__(self):
        pass

    def is_wikipedia_url(self, url):
        """Return True if *url* points at a Wikipedia domain."""
        return "wikipedia.org" in url

    def remove_edit_links(self, text):
        """Strip Wikipedia section-edit markers (English and German UIs)."""
        text = text.replace("[edit]", "")
        text = text.replace("[Bearbeiten | Quelltext bearbeiten]", "")
        return text

    def fetchTextFromURL(self, url):
        """Download *url* and return its extracted main text, or None on failure.

        Network errors, extraction errors, and pages with no extractable
        content all yield None rather than raising.
        """
        try:
            # Timeout prevents hanging indefinitely on an unresponsive server.
            response = requests.get(url, timeout=30)
            response.raise_for_status()  # Raise an exception for bad responses (e.g., 404)
            html_content = response.text
        except requests.exceptions.RequestException as e:
            print(f"Failed to fetch the URL: {e}")
            return None
        try:
            text = extract(html_content, favor_precision=True)  # Extract main content
        except Exception as e:
            print(f"Failed to extract text: {e}")
            return None
        # extract() returns None when it finds no main content; guard before
        # post-processing (the original crashed calling .replace on None).
        if text is not None and self.is_wikipedia_url(url):
            text = self.remove_edit_links(text)
        return text

    def saveExtractedText(self, text=None, path="test.txt"):
        """Write *text* to *path*.

        Backward compatible: called with no arguments it writes ``self.text``
        to ``test.txt`` exactly as before (note: ``self.text`` must have been
        assigned by the caller — no method in this class sets it).
        """
        if text is None:
            text = self.text
        # Context manager guarantees the file is closed even if write() fails.
        with open(path, "w") as text_file:
            text_file.write(text)
        print("DOCUMENT CLOSED<<>>> FETCHING SUCCESSFUL")
class GoogleScraper:
    """Scrapes a Google results page and returns the non-Google result links."""

    # Prefixes of Google-internal links (search chrome, caches, policies)
    # that are filtered out of the returned results.
    GOOGLE_LINK_PREFIXES = (
        'https://www.google.',
        'https://google.',
        'https://webcache.googleusercontent.',
        'http://webcache.googleusercontent.',
        'https://policies.google.',
        'https://support.google.',
        'https://maps.google.',
    )

    def get_source(self, url):
        """GET *url* with requests_html; return the response, or None on error.

        Callers must handle None — the original returned None implicitly
        after printing the error, which is now made explicit.
        """
        try:
            session = HTMLSession()
            return session.get(url)
        except requests.exceptions.RequestException as e:
            print(e)
            return None

    def _filter_google_links(self, urls):
        """Return *urls* with Google-internal links removed, order preserved."""
        # str.startswith accepts a tuple of prefixes — one call per URL.
        return [u for u in urls if not u.startswith(self.GOOGLE_LINK_PREFIXES)]

    def scrape_google(self, query):
        """Search google.de for *query* and return the external result URLs.

        Returns an empty list when the results page could not be fetched
        (the original crashed with AttributeError on a None response).
        """
        encoded = urllib.parse.quote_plus(query)
        response = self.get_source("https://www.google.de/search?q=" + encoded)
        if response is None:
            return []
        return self._filter_google_links(list(response.html.absolute_links))