-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathweb_scrape.py
More file actions
184 lines (147 loc) · 5.21 KB
/
web_scrape.py
File metadata and controls
184 lines (147 loc) · 5.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
import requests
from bs4 import BeautifulSoup
import docx
import os
import configparser
import msvcrt
import sys
import tkinter as tk
from tkinter import filedialog
# Create the Tk root window and withdraw (hide) it right away: tkinter is
# only used here for the filedialog.askdirectory() folder picker.
root = tk.Tk()
root.withdraw()
# Shared parser object used to read and write 'settings.ini'.
config = configparser.ConfigParser()
def status(status):
    """
    Print a status code in the form ``Status: <code>``.

    The output is the same for every class of status code, so there is no
    branching on the leading digit. This also means a value whose string
    form is empty is printed instead of raising an IndexError.

    :param status: The status code to report (any printable value).
    :return: None
    """
    print(f"Status: {status}")
def add_hyperlink(paragraph, url, text):
    """
    Append a blue, underlined external hyperlink to a paragraph.

    :param paragraph: The paragraph that receives the hyperlink.
    :param url: The target address of the link, as a string.
    :param text: The visible text of the link.
    :return: The ``w:hyperlink`` element that was appended.
    """
    make_element = docx.oxml.shared.OxmlElement
    qualified = docx.oxml.shared.qn

    # Register the url as an external relationship of the paragraph's part;
    # the returned relationship id is what the hyperlink element points at.
    rel_id = paragraph.part.relate_to(
        url,
        docx.opc.constants.RELATIONSHIP_TYPE.HYPERLINK,
        is_external=True,
    )

    # Run properties: the link colour followed by a single underline.
    font_color = make_element('w:color')
    font_color.set(qualified('w:val'), '0000EE')
    single_underline = make_element('w:u')
    single_underline.set(qualified('w:val'), 'single')
    run_props = make_element('w:rPr')
    for prop in (font_color, single_underline):
        run_props.append(prop)

    # The run carries the properties first, then the visible text.
    link_run = make_element('w:r')
    link_run.append(run_props)
    link_run.text = text

    # Wrap the run in the w:hyperlink element and attach it to the paragraph.
    link = make_element('w:hyperlink')
    link.set(qualified('r:id'), rel_id)
    link.append(link_run)
    paragraph._p.append(link)
    return link
def getSite():
    """
    Prompt for a URL, download the page and parse it with BeautifulSoup.

    The request is first sent with a browser-like User-Agent header; if that
    attempt fails it is retried once without the header. The status code and
    the page title (empty when the page has none) are printed.

    :return: ``[soup, url]`` when the server answered with status 200,
        otherwise the integer ``1`` (request failed or non-200 status).
    """
    url = input("URL: ")
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    # Without a timeout requests can wait forever on an unresponsive server.
    timeout = 30

    # Only request errors are caught, so Ctrl+C (KeyboardInterrupt) and
    # programming errors are no longer reported as "Invalid URL".
    try:
        try:
            page = requests.get(url, headers=headers, timeout=timeout)
        except requests.exceptions.RequestException:
            page = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException:
        print("Invalid URL")
        return 1

    print(f"Status: {page.status_code}")

    soup = BeautifulSoup(page.content, "html.parser")
    # soup.title is None for documents without a <title> element.
    title = soup.title.text if soup.title is not None else ""
    print(f"Title:\n\t{title}")

    if page.status_code != 200:
        return 1
    return [soup, url]
def parseSite(soup, url):
    """
    Write the main content of a scraped page to a .docx file.

    The user is asked whether to save into the default folder stored in
    'settings.ini' or to pick a folder in a dialog, and then for a file name.
    The document gets every ``h1`` found in ``header.entry-header`` as a
    title, a hyperlink back to ``url``, and the ``h2``/``p``/``ul`` elements
    found in ``div.entry-content`` as headings, paragraphs and bullet items.

    :param soup: The parsed page to extract content from.
    :param url: The address of the page, used for the link under the title.
    :return: None
    """
    doc = docx.Document()

    # Get location path
    use_default = input("Use default location? (y/n) > ").lower()
    location = None
    if use_default == "y":
        # config.read() silently skips a missing file, so look the value up
        # with a fallback instead of indexing, which would raise KeyError.
        config.read('settings.ini')
        location = config.get("Location", "defaultPath", fallback=None)
        if not location:
            print("No default location saved, please choose a folder")
    if not location:
        location = filedialog.askdirectory()

    # Compile full file path
    file_name = input("File name: ")
    path = os.path.join(location, f"{file_name}.docx")

    headers = soup.find_all("header", class_="entry-header")
    content = soup.find_all("div", class_="entry-content")

    # Extract main title
    for header in headers:
        for title in header.find_all(["h1"]):
            doc.add_heading(title.text, 0)

    # Add url to top of the page
    add_hyperlink(doc.add_paragraph(), url, 'Link to pattern')

    # Extract text and headings
    for section in content:
        for item in section.find_all(["h2", "p", "ul"]):
            if item.name == "h2":
                doc.add_heading(item.text)
            elif item.name == "p":
                doc.add_paragraph(item.text)
            elif item.name == "ul":
                for entry in item.find_all(["li"]):
                    doc.add_paragraph(f"- {entry.text}", style='List Bullet')

    doc.save(path)
    print("File saved")
def settings():
    """
    Show the settings menu and apply the chosen option.

    Option 1 stores a folder picked in a dialog as the default save folder,
    option 2 resets it to the ``Documents`` folder inside the user's home
    directory, and option 3 returns without changes. Any other input,
    including non-numeric input, prints "Invalid option". The chosen folder
    is written to 'settings.ini' under ``[Location] defaultPath``.

    :return: None
    """
    options = "\n 1. Set default folder \n 2. Reset settings \n 3. Back "
    try:
        choice = int(input(f"{options}\n > "))
    except ValueError:
        # Non-numeric input is handled like any other unknown option.
        print("Invalid option")
        return

    if choice == 3:
        return

    if choice == 1:
        # Get user input for directory
        default_path = filedialog.askdirectory()
        if not default_path:
            # The dialog was cancelled: keep the current setting rather
            # than overwrite it with an empty path.
            print("No folder selected")
            return
    elif choice == 2:
        # Set defaultPath to the Documents folder in the home directory
        default_path = os.path.join(os.path.expanduser("~"), "Documents")
    else:
        print("Invalid option")
        return

    # '%' starts an interpolation in ConfigParser values, so a literal one
    # must be doubled or the assignment raises ValueError; it is turned
    # back into a single '%' when the value is read.
    config['Location'] = {'defaultPath': default_path.replace('%', '%%')}
    with open('settings.ini', 'w') as configfile:
        config.write(configfile)
def wait():
    """
    Pause until the user presses a key.

    Uses ``msvcrt.getch()``, which reads a single keypress from the console
    without echoing it.

    :return: None
    """
    print("Press Enter...")
    key = msvcrt.getch()
    # Function and arrow keys arrive as two reads: a b'\x00' or b'\xe0'
    # prefix followed by the key code. Read the second part as well so it
    # is not left behind to end the next wait() immediately.
    if key in (b'\x00', b'\xe0'):
        msvcrt.getch()
def menu():
    """
    Run the main menu loop until the user chooses to exit.

    Option 1 scrapes a site (and saves it when the download succeeded),
    option 2 opens the settings menu and option 3 leaves the loop. Any other
    input, including non-numeric input, prints "Invalid option". After every
    option except exit the program waits for a keypress before the screen is
    cleared again.

    :return: None
    """
    options = " Web Scraper\n -------------------\n 1. Scrape a website \n 2. Settings \n 3. Exit "
    while True:
        sys.stderr.write("\x1b[2J\x1b[H")  # Clear screen but leave scrollback
        try:
            choice = int(input(f"{options}\n > "))
        except ValueError:
            # Non-numeric input falls through to the "Invalid option" branch.
            choice = None

        if choice == 3:
            break

        if choice == 1:
            site = getSite()
            # getSite() returns 1 on failure, otherwise [soup, url].
            if site != 1:
                parseSite(site[0], site[1])
        elif choice == 2:
            settings()
        else:
            print("Invalid option")
        wait()
# Start the interactive menu only when this file is run as a script.
if __name__ == '__main__':
    menu()