-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathstack-overflow.py
More file actions
41 lines (31 loc) · 1.48 KB
/
stack-overflow.py
File metadata and controls
41 lines (31 loc) · 1.48 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import html
import os
from urllib.parse import urlsplit

import pdfcrowd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
# NOTE(review): these credentials were committed with the original script and
# are kept only as defaults so existing usage keeps working. Prefer creating
# your own pdfcrowd account and exporting PDFCROWD_USERNAME / PDFCROWD_API_KEY.
DEFAULT_PDFCROWD_USERNAME = 'allwinraju'
DEFAULT_PDFCROWD_API_KEY = '705d1ef50bf58f16f663e3a95aa47497'

# Used when the URL has no usable last path segment (e.g. a bare domain).
FALLBACK_FILE_STEM = 'stackoverflow-question'


def pdf_file_name(url):
    """Return the output PDF file name derived from the last path segment of *url*.

    Surrounding whitespace, a trailing slash, the query string and the
    fragment are ignored, so ``".../questions/1/slug/?x=1 "`` yields
    ``"slug.pdf"``. When no path segment is left, a fixed fallback name is
    used instead of producing a bare ``".pdf"``.
    """
    path = urlsplit(url.strip()).path
    stem = path.rstrip('/').rsplit('/', 1)[-1]
    return (stem or FALLBACK_FILE_STEM) + '.pdf'


def build_html(head_html, question_title, body_html):
    """Assemble a complete HTML document from the extracted page fragments.

    Args:
        head_html: Serialized ``<head>...</head>`` element (may be empty); it
            carries the style sheets so the PDF keeps the page's styling.
        question_title: Plain-text question title; it is HTML-escaped here.
        body_html: Serialized markup of the question/answers container.

    Returns:
        The document as a string, with the head placed before the body.
    """
    heading = '<h1>' + html.escape(question_title, quote=False) + '</h1>'
    return '\n'.join([
        '<!DOCTYPE html>',
        '<html>',
        head_html,
        '<body>',
        heading,
        body_html,
        '</body>',
        '</html>',
    ])


def fetch_page_source(url):
    """Open *url* in Firefox and return the rendered page source.

    The browser session is always shut down, even when loading fails, so no
    Firefox/geckodriver process is left behind.
    """
    driver = webdriver.Firefox()
    try:
        driver.get(url)
        return driver.page_source
    finally:
        driver.quit()


def extract_question(page_source):
    """Pull the head, question title and main content out of *page_source*.

    Returns:
        A ``(head_html, question_title, main_bar_html)`` tuple of strings.

    Raises:
        ValueError: If the page has no ``a.question-hyperlink`` element or no
            ``div#mainbar`` element.
    """
    soup = BeautifulSoup(page_source, "html.parser")

    title_link = soup.find("a", {"class": "question-hyperlink"})
    # The portion of the page that holds the question, answers and comments.
    # NOTE: the answer form (#post-form) and the bottom notice are left in
    # the output; the original code that stripped them was disabled.
    main_bar = soup.find("div", {"id": "mainbar"})
    if title_link is None or main_bar is None:
        raise ValueError(
            "Could not find the question title or main content on the page."
        )

    # Keep the head so the PDF is rendered with the same style properties.
    head = soup.find('head')
    head_html = str(head) if head is not None else ''
    return head_html, title_link.get_text(strip=True), str(main_bar)


def main():
    """Prompt for a question URL and save that question as a PDF file."""
    # The prompt asks for a trailing space, so strip it before using the URL.
    url = input('Enter the url, press space and then hit Enter: ').strip()
    if not url:
        raise SystemExit("No URL entered.")

    print(url + " opening firefox...........")
    page_source = fetch_page_source(url)

    try:
        head_html, question_title, main_bar_html = extract_question(page_source)
    except ValueError as err:
        raise SystemExit(str(err))

    # Generate a new document that contains only the required data.
    document = build_html(head_html, question_title, main_bar_html)

    client = pdfcrowd.HtmlToPdfClient(
        os.environ.get('PDFCROWD_USERNAME', DEFAULT_PDFCROWD_USERNAME),
        os.environ.get('PDFCROWD_API_KEY', DEFAULT_PDFCROWD_API_KEY),
    )
    file_name = pdf_file_name(url)
    print("Converting to pdf...")
    client.convertStringToFile(document, file_name)
    print("File " + file_name + " created")


if __name__ == "__main__":
    main()