# WebScrapingProjects/ch4ex.py at master · jhchang/WebScrapingProjects · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
from bs4 import BeautifulSoup
import requests

class Content:
	"""Plain record for one scraped article: its URL, title and body text."""

	def __init__(self, url, title, body):
		# Store the three fields exactly as given; nothing is validated or copied.
		self.url, self.title, self.body = url, title, body

def getPage(url, timeout=30):
	"""Download ``url`` and return its HTML parsed into a BeautifulSoup tree.

	Args:
		url: Address of the page to fetch.
		timeout: Seconds to wait for the server to respond before giving up.
			Without a timeout, ``requests.get`` can block indefinitely on an
			unresponsive host.

	Returns:
		A ``BeautifulSoup`` object built from the response text using the
		``html.parser`` backend.

	Raises:
		requests.exceptions.Timeout: If the server does not respond within
			``timeout`` seconds. Other ``requests`` errors propagate unchanged.
	"""
	req = requests.get(url, timeout=timeout)
	return BeautifulSoup(req.text, 'html.parser')

def scrapeNYTimes(url):
	"""Scrape a New York Times article page into a ``Content`` object.

	The title is the text of the first ``<h1>`` on the page, and the body is
	the text of every ``<p class="story-content">`` joined with newlines.

	Args:
		url: Address of the article to scrape.

	Returns:
		A ``Content`` holding ``url``, the title and the body. The title is
		an empty string when the page has no ``<h1>``; the body is an empty
		string when no matching paragraphs are found.
	"""
	bs = getPage(url)
	heading = bs.find("h1")
	# find() returns None when the tag is missing; use an empty title instead
	# of failing with AttributeError on ``None.text``.
	title = heading.text if heading is not None else ''
	lines = bs.find_all("p", {"class":"story-content"})
	body = '\n'.join(line.text for line in lines)
	return Content(url, title, body)

def scrapeBrookings(url):
	"""Scrape a Brookings blog post page into a ``Content`` object.

	The title is the text of the first ``<h1>`` on the page, and the body is
	the text of the first ``<div class="post-body">``.

	Args:
		url: Address of the post to scrape.

	Returns:
		A ``Content`` holding ``url``, the title and the body. The title or
		body is an empty string when its element is not present on the page.
	"""
	bs = getPage(url)
	heading = bs.find("h1")
	# find() returns None when nothing matches; fall back to '' rather than
	# raising AttributeError on ``None.text``.
	title = heading.text if heading is not None else ''
	# The attribute filter must be a dict (attribute -> value). The previous
	# ``{"class","post-body"}`` was a set literal, which does not name the
	# attribute being filtered.
	post = bs.find("div", {"class": "post-body"})
	body = post.text if post is not None else ''
	return Content(url, title, body)

# Demo run: scrape one article from each supported site and print it.
# Each entry pairs a scraper function with the article URL it is run on.
targets = (
	(
		scrapeBrookings,
		'https://www.brookings.edu/blog/future-development'
		'/2018/01/26/delivering-inclusive-urban-access-3-unc'
		'omfortable-truths/',
	),
	(
		scrapeNYTimes,
		'https://www.nytimes.com/2018/01/25/opinion/sunday/'
		'silicon-valley-immortality.html',
	),
)

for scraper, url in targets:
	content = scraper(url)
	print('Title: {}'.format(content.title))
	print('URL: {}\n'.format(content.url))
	print(content.body)