-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathreddit_scraper.py
More file actions
58 lines (48 loc) · 1.63 KB
/
reddit_scraper.py
File metadata and controls
58 lines (48 loc) · 1.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import praw
import pandas as pd
import os
from dotenv import load_dotenv
# Load environment variables from .env file
load_dotenv()
# Initialize Reddit API client with credentials from .env
reddit = praw.Reddit(
client_id=os.getenv("CLIENT_ID"),
client_secret=os.getenv("CLIENT_SECRET"),
user_agent=os.getenv("USER_AGENT")
)
#print(client_id)
# Choose a subreddit
subreddit_name = "learnpython"
subreddit = reddit.subreddit(subreddit_name)
# Fetch top 100 posts
posts = []
for post in subreddit.hot(limit=100): # Change the limit as needed
posts.append({
"title": post.title,
"upvotes": post.score,
"url": post.url,
"num_comments": post.num_comments,
"id": post.id
})
# Convert to Pandas DataFrame
df_posts = pd.DataFrame(posts)
df_posts = df_posts.drop_duplicates(subset="id")
# Fetch comments for each post
comments = []
seen_comments = set() # Track unique comments
for post_id in df_posts["id"]:
submission = reddit.submission(id=post_id)
submission.comments.replace_more(limit=0) # Removes "load more" comments
for top_comment in submission.comments: # Fetches only top-level comments
if top_comment.id not in seen_comments:
seen_comments.add(top_comment.id)
comments.append({
"post_id": post_id,
"comment": top_comment.body
})
# Convert comments to DataFrame
df_comments = pd.DataFrame(comments)
# Save data to CSV for further analysis
df_posts.to_csv("reddit_posts.csv", index=False)
df_comments.to_csv("reddit_comments.csv", index=False)
print("✅ Data fetching complete. Posts and comments saved.")