-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathch4ex.py
More file actions
38 lines (31 loc) · 1.08 KB
/
ch4ex.py
File metadata and controls
38 lines (31 loc) · 1.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
from bs4 import BeautifulSoup
import requests
class Content:
    """Container for one scraped article.

    Attributes are plain instance fields set once by the constructor:
    the page URL, the article headline, and the article body text.
    """

    def __init__(self, url, title, body):
        """Store the article's source URL, title, and body text as given."""
        self.url = url
        self.title = title
        self.body = body

    def __repr__(self):
        # The body is omitted: it can be arbitrarily long and would drown
        # out the identifying fields in logs and debugger output.
        return f"{type(self).__name__}(url={self.url!r}, title={self.title!r})"
def getPage(url, timeout=30):
    """Download *url* and return its HTML parsed as a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch with an HTTP GET.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``; without it a stalled server
            would block the caller indefinitely.

    Returns:
        A ``BeautifulSoup`` object built from the response text with the
        ``'html.parser'`` backend.

    Note:
        The HTTP status code is not checked, so an error page (e.g. a 404
        body) is parsed and returned like any other document.
    """
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.text, 'html.parser')
def scrapeNYTimes(url):
    """Scrape a New York Times article page into a ``Content`` object.

    The title is the text of the first ``<h1>``; the body is the text of
    every ``<p class="story-content">`` element joined with newlines.

    Args:
        url: Address of the article page.

    Returns:
        A ``Content`` holding the URL, title, and body text.
    """
    soup = getPage(url)
    # The headline is looked up before the paragraphs, so a page without
    # an <h1> fails here first.
    headline = soup.find("h1").text
    paragraphs = soup.find_all("p", {"class": "story-content"})
    article_text = '\n'.join(paragraph.text for paragraph in paragraphs)
    return Content(url, headline, article_text)
def scrapeBrookings(url):
    """Scrape a Brookings blog post page into a ``Content`` object.

    The title is the text of the first ``<h1>``; the body is the text of
    the first ``<div class="post-body">``.

    Args:
        url: Address of the blog post.

    Returns:
        A ``Content`` holding the URL, title, and body text.

    Raises:
        AttributeError: If the page has no ``<h1>`` or no matching
            ``<div>`` (``find`` yields ``None`` in that case, and
            ``.text`` is then accessed on ``None``).
    """
    bs = getPage(url)
    title = bs.find("h1").text
    # The attribute filter must be a dict ({attribute: value}). A set
    # literal such as {"class", "post-body"} is not an attribute mapping;
    # BeautifulSoup would read it as a collection of candidate class
    # names and could match a <div class="class"> by mistake.
    body = bs.find("div", {"class": "post-body"}).text
    return Content(url, title, body)
# Demo driver: scrape one article from each supported site and print its
# title, URL, and body. Brookings is processed first, then the NYTimes.
for _scrape, url in (
    (
        scrapeBrookings,
        'https://www.brookings.edu/blog/future-development'
        '/2018/01/26/delivering-inclusive-urban-access-3-unc'
        'omfortable-truths/',
    ),
    (
        scrapeNYTimes,
        'https://www.nytimes.com/2018/01/25/opinion/sunday/'
        'silicon-valley-immortality.html',
    ),
):
    content = _scrape(url)
    print('Title: {}'.format(content.title))
    print('URL: {}\n'.format(content.url))
    print(content.body)