diff --git a/reddit_scraper.py b/reddit_scraper.py new file mode 100644 index 0000000..9ef6cbf --- /dev/null +++ b/reddit_scraper.py @@ -0,0 +1,45 @@ +from urlparse import urlparse +from bs4 import BeautifulSoup +import requests +import io +import json +import praw + +def scrape_quora(url): + source_code = requests.get(url).text + soup = BeautifulSoup(source_code, "html.parser") + output = dict() + output['question'] = soup.find('div', {'class': 'question_text_edit'}).find('span', {'class': 'rendered_qtext'}).get_text() + output['answers'] = list() + for answer in soup.findAll('div', {'class': 'AnswerBase'}): + ans = dict() + try: + ans['author'] = answer.find('a', {'class': 'user'}).get_text() + except: + ans['author'] = "Hidden" + ans['content'] = answer.find('span', {'class': 'rendered_qtext'}).get_text() + output['answers'].append(ans) + json_object = json.dumps(output, ensure_ascii=False, indent=4) + io.open('quora_output.json', 'w').write(json_object) + + +def scrape_reddit(url): + reddit = praw.Reddit(user_agent='Comment Extraction (by /u/USERNAME)', + client_id='INSERT CLIENT ID', client_secret="INSERT CLIENT SECRET") + submission = reddit.submission(url=URL2) + output = dict() + output['Submission'] = submission.title + output['Comments'] = list() + submission.comments.replace_more(limit=0) + for top_level_comment in submission.comments: + ans = dict() + try: + ans['Author'] = top_level_comment.author.name + except: + ans['Author'] = "Hidden" + ans['content'] = top_level_comment.body + output['Comments'].append(ans) + json_object = json.dumps(output, ensure_ascii=False, indent=4) + io.open('reddit_output.json', 'w').write(json_object) + +scrape_reddit('https://www.reddit.com/r/AskReddit/comments/4lx5a9/serious_what_is_the_creepiest_most_blood_chilling/')