Skip to content

Commit cff5167

Browse files
authored
Merge pull request #71 from 19-ayushi/news_headline_scapper
Added news headline scraper
2 parents 517e4cb + 2f856b9 commit cff5167

File tree

4 files changed

+723
-0
lines changed

4 files changed

+723
-0
lines changed
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
# News Headline Scraper (Python)
2+
3+
A simple Python script that fetches news headlines from various news outlets using their public RSS feeds and saves them to a JSON file. No API keys are required.
4+
---
5+
6+
## 🚀 Features
7+
- Saves headlines to JSON with timestamps.
8+
- Multiple trusted sources: BBC, Reuters, CNN, NYTimes, HackerNews
9+
- No API key needed (uses official RSS feeds)
10+
11+
---
12+
## Usage
13+
14+
python3 news_headline_scraper.py
15+
16+
Output:
17+
Fetching: BBC
18+
Fetching: Reuters
19+
Fetching: CNN
20+
Fetching: NYTimes
21+
Fetching: HackerNews
22+
Headlines saved to the file
23+
24+
## 🧰 Requirements
25+
26+
Install the dependencies using pip:
27+
```bash
28+
pip install feedparser
```
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
import json
2+
from datetime import datetime
3+
import feedparser
4+
import sys
5+
6+
def fetch_headlines(feed):
    """Fetch the current entries of every RSS feed in *feed*.

    Args:
        feed: Mapping of source name to RSS feed URL.

    Returns:
        A dict mapping each source name to a list of article dicts with
        the keys "title", "link" and "published". A field that the feed
        entry does not provide is stored as None; a source whose parsed
        feed has no entries maps to an empty list.
    """
    headlines = {}
    for name, url in feed.items():
        print(f"Fetching: {name}")
        # Bound to its own name so the ``feed`` argument is not shadowed
        # while its items are still being iterated.
        parsed = feedparser.parse(url)
        headlines[name] = [
            {
                # .get() keeps a single entry without a title or link from
                # aborting the whole run with an AttributeError, matching
                # how "published" was already being read.
                "title": entry.get("title"),
                "link": entry.get("link"),
                "published": entry.get("published"),
            }
            for entry in parsed.entries
        ]
    return headlines
22+
23+
24+
25+
def save_to_json(data, filename="news_headlines.json"):
    """Write *data* to *filename* as JSON, stamped with the current time.

    The file holds an object with two keys: "timestamp", the local time of
    the write in ISO 8601 format including its UTC offset, and "sources",
    which is *data* unchanged.

    Args:
        data: JSON-serialisable object to store under "sources".
        filename: Path of the file to create or overwrite.
    """
    # astimezone() attaches the local UTC offset, so the recorded time stays
    # unambiguous when the file is read on a machine in another time zone.
    timestamp = datetime.now().astimezone().isoformat()
    output = {"timestamp": timestamp, "sources": data}
    with open(filename, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII headline text readable on disk.
        json.dump(output, f, indent=4, ensure_ascii=False)
    print("Headlines saved to the file")
31+
32+
if __name__ == "__main__":
    # RSS endpoints to poll, keyed by the label used in the saved output.
    rss_sources = {
        "BBC": "https://feeds.bbci.co.uk/news/rss.xml",
        "Reuters": "https://feeds.reuters.com/reuters/topNews",
        "CNN": "http://rss.cnn.com/rss/edition.rss",
        "NYTimes": "https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml",
        "HackerNews": "https://hnrss.org/frontpage",
    }

    # Download every feed, then write the combined result to the default file.
    save_to_json(fetch_headlines(rss_sources))

0 commit comments

Comments
 (0)