This repository was archived by the owner on Mar 7, 2024. It is now read-only.
forked from TheEquidistantProject/ml-suite
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbot.py
More file actions
109 lines (95 loc) · 3.81 KB
/
bot.py
File metadata and controls
109 lines (95 loc) · 3.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import requests
import json
from bs4 import BeautifulSoup
import time
def scrape_cnn_article(url):
    """Fetch a CNN article page and return its JSON-LD metadata as a dict.

    The returned dict is the parsed contents of the page's
    <script type="application/ld+json"> tag (headline, articleBody,
    publisher, datePublished, ...).

    Raises:
        requests.HTTPError: on a non-2xx response.
        ValueError: if the page has no ld+json script tag, or its
            contents are not valid JSON (json.JSONDecodeError subclasses
            ValueError).
    """
    # Timeout prevents one stalled server from hanging the whole crawl.
    response = requests.get(url, timeout=30)
    # Check for a valid response (HTTP Status Code 200)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    # CNN embeds structured article metadata in a JSON-LD script tag.
    tag = soup.find("script", {"type": "application/ld+json"})
    if tag is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError(f"no ld+json metadata found at {url}")
    return json.loads("".join(tag.contents))
# Example usage:
# article_text = scrape_cnn_article('https://www.cnn.com/2023/09/22/us/some-article/index.html')
def cnn():
    """Scrape CNN's news sitemap and return a list of article JSON-LD dicts.

    Best-effort: articles that fail to download or parse are skipped,
    so the returned list may be shorter than the sitemap.
    """
    # Download the sitemap XML listing current article URLs.
    sitemap_url = "https://www.cnn.com/sitemaps/cnn/news.xml"
    response = requests.get(sitemap_url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    article_urls = [loc.text for loc in soup.find_all("loc")]
    articles = []
    for article_url in article_urls:
        try:
            articles.append(scrape_cnn_article(article_url))
        except Exception:
            # Skip broken articles, but unlike a bare `except:` this no
            # longer swallows KeyboardInterrupt/SystemExit.
            continue
    return articles
def scrape_fox_article(url):
    """Fetch a Fox News article page and return its JSON-LD metadata as a dict.

    The returned dict is the parsed contents of the page's
    <script type="application/ld+json"> tag (headline, articleBody,
    publisher, datePublished, ...).

    Raises:
        requests.HTTPError: on a non-2xx response.
        ValueError: if the page has no ld+json script tag, or its
            contents are not valid JSON (json.JSONDecodeError subclasses
            ValueError).
    """
    # Timeout prevents one stalled server from hanging the whole crawl.
    response = requests.get(url, timeout=30)
    # Check for a valid response (HTTP Status Code 200)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    # Fox embeds structured article metadata in a JSON-LD script tag.
    tag = soup.find("script", {"type": "application/ld+json"})
    if tag is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError(f"no ld+json metadata found at {url}")
    return json.loads("".join(tag.contents))
# Example usage:
# article_text = scrape_cnn_article('https://www.cnn.com/2023/09/22/us/some-article/index.html')
#print(scrape_fox_article("https://www.foxnews.com/us/judge-rule-whether-9-11-defendant-deemed-psychotic-delusional-cia-torture-stand-trial-report")["articleBody"])
def fox():
    """Scrape Fox News' sitemap and return a list of article JSON-LD dicts.

    Best-effort: articles that fail to download or parse are skipped,
    so the returned list may be shorter than the sitemap.
    """
    # Download the sitemap XML listing current article URLs.
    sitemap_url = "https://www.foxnews.com/sitemap.xml?type=news"
    response = requests.get(sitemap_url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    article_urls = [loc.text for loc in soup.find_all("loc")]
    articles = []
    for article_url in article_urls:
        try:
            articles.append(scrape_fox_article(article_url))
        except Exception:
            # Skip broken articles, but unlike a bare `except:` this no
            # longer swallows KeyboardInterrupt/SystemExit.
            continue
    return articles
if __name__ == "__main__":
done = 0
while True:
fox_list = []
cnn_list = []
lst = fox()
for i in lst:
try:
urlToImage = i["image"]["url"]
title = i["headline"]
content = i["articleBody"]
source = i["publisher"]["name"]
publishedAt = i["datePublished"]
fox_list.append({"urlToImage": urlToImage, "title": title, "content": content, "source": source, "publishedAt": publishedAt})
except:
continue
lst = cnn()
for i in lst:
try:
urlToImage = i["image"][0]["contentUrl"]
title = i["headline"]
content = i["articleBody"]
source = i["publisher"]["name"]
publishedAt = i["datePublished"]
cnn_list.append({"urlToImage": urlToImage, "title": title, "content": content, "source": source, "publishedAt": publishedAt})
except:
continue
print(len(fox_list))
print(len(cnn_list))
with open("fox.json", "w") as f:
json.dump(fox_list, f)
with open("cnn.json", "w") as f:
json.dump(cnn_list, f)
done += 1
print(f"done {done}")
time.sleep(60*60*5)