# Reconstructed from a two-part git patch series by swarnimshukla
# (Sat, 21 Oct 2017), subject "reddit_scraper":
#   [PATCH 1/2] 0ef16c70e36fae8b56ebc63b52901ac7ec6cd29a
#       creates the file `reddit_scraper`
#   [PATCH 2/2] fcca1ca3b3f7a182e95e2722c31f8d709409b2c9
#       renames `reddit_scraper` to `reddit_scraper.py` (100% similarity)
"""Scrape a Quora question page or a Reddit submission into a JSON file.

``scrape_quora`` writes ``quora_output.json`` and ``scrape_reddit`` writes
``reddit_output.json``, both in the current working directory.  Running the
module as a script scrapes one hard-coded AskReddit submission.
"""

import io
import json

try:
    # Python 3 location of urlparse.
    from urllib.parse import urlparse  # noqa: F401
except ImportError:
    # Python 2 location, which is what the script originally imported.
    from urlparse import urlparse  # noqa: F401

from bs4 import BeautifulSoup
import praw
import requests

# requests has no default timeout; without one a stalled server would hang
# the script forever.
_REQUEST_TIMEOUT = 30


def _write_json(payload, path):
    """Write *payload* to *path* as 4-space-indented UTF-8 JSON.

    Non-ASCII characters are written as-is (``ensure_ascii=False``), so the
    file is opened with an explicit UTF-8 encoding instead of relying on the
    platform default.
    """
    text = json.dumps(payload, ensure_ascii=False, indent=4)
    if isinstance(text, bytes):
        # Python 2 only: json.dumps can return a byte string, but a text-mode
        # io.open handle accepts unicode only.  Assumes the bytes are UTF-8.
        text = text.decode('utf-8')
    # The context manager guarantees the handle is flushed and closed.
    with io.open(path, 'w', encoding='utf-8') as handle:
        handle.write(text)


def scrape_quora(url):
    """Save the question and answers found at a Quora *url*.

    The page is fetched over HTTP, parsed with BeautifulSoup, and written to
    ``quora_output.json`` as ``{"question": str, "answers": [{"author": str,
    "content": str}, ...]}``.  Answers with no author link are attributed to
    ``"Hidden"``; answer blocks with no rendered text are skipped.

    Raises:
        requests.RequestException: the page could not be fetched, or the
            server answered with an HTTP error status.
        ValueError: the page does not contain the expected question markup.
    """
    response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    question_box = soup.find('div', {'class': 'question_text_edit'})
    question = None
    if question_box is not None:
        question = question_box.find('span', {'class': 'rendered_qtext'})
    if question is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise ValueError('no question text found at %s' % url)

    answers = []
    for answer in soup.find_all('div', {'class': 'AnswerBase'}):
        body = answer.find('span', {'class': 'rendered_qtext'})
        if body is None:
            # One malformed answer block should not abort the whole scrape.
            continue
        author = answer.find('a', {'class': 'user'})
        answers.append({
            'author': author.get_text() if author is not None else "Hidden",
            'content': body.get_text(),
        })

    _write_json({'question': question.get_text(), 'answers': answers},
                'quora_output.json')


def scrape_reddit(url):
    """Save the title and top-level comments of the Reddit submission at *url*.

    The result is written to ``reddit_output.json`` as ``{"Submission": str,
    "Comments": [{"Author": str, "content": str}, ...]}``.  Comments whose
    author has no ``name`` are attributed to ``"Hidden"``.

    The ``client_id`` / ``client_secret`` values below are placeholders and
    must be replaced with real API credentials before this can succeed.
    """
    reddit = praw.Reddit(user_agent='Comment Extraction (by /u/USERNAME)',
                         client_id='INSERT CLIENT ID',
                         client_secret="INSERT CLIENT SECRET")
    # Use the caller's URL; the original referenced an undefined name (URL2).
    submission = reddit.submission(url=url)

    output = {'Submission': submission.title, 'Comments': []}
    # limit=0 discards every "load more comments" placeholder so the loop
    # below only sees real comment objects.
    submission.comments.replace_more(limit=0)
    for top_level_comment in submission.comments:
        try:
            author = top_level_comment.author.name
        except AttributeError:
            # The comment has no author object to read a name from.
            author = "Hidden"
        output['Comments'].append({
            'Author': author,
            'content': top_level_comment.body,
        })

    _write_json(output, 'reddit_output.json')


if __name__ == '__main__':
    # Guarded so that importing this module does not trigger a network scrape.
    scrape_reddit('https://www.reddit.com/r/AskReddit/comments/4lx5a9/serious_what_is_the_creepiest_most_blood_chilling/')