-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsubreddit_parser.py
More file actions
executable file
·127 lines (121 loc) · 4.34 KB
/
subreddit_parser.py
File metadata and controls
executable file
·127 lines (121 loc) · 4.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#!/usr/bin/env python3
from bs4 import BeautifulSoup
import urllib3, getopt, sys
from collections import OrderedDict
import time
def get_results(sb, search, sort="new", flair=False):
    '''
    Scans a subreddit for a search with a certain sorting method and
    optionally prepends the flair to the beginning of each title.

    @param sb - subreddit to search; "/r/" is prepended when missing
    @param search - search term; it is URL-encoded before being sent
    @param sort - sorting method, which can be: new, top, relevance
                  (case-insensitive); anything else falls back to new
    @param flair - whether or not to prepend the flair to the title
    @return resultdict - OrderedDict mapping the search result urls to
                         their titles, in the order they appear on the page

    Prints "Invalid subreddit entered" and exits with status 2 when the
    returned page has no div with class "contents". When this module is
    run as a script, every "url: title" pair is also printed.
    '''
    # Imported here so the module-level import block does not need to change.
    from urllib.parse import quote_plus

    urllib3.disable_warnings()

    # the three sorting options Reddit search understands
    valid_sorts = ("new", "top", "relevance")
    sort = sort.lower()
    if sort not in valid_sorts:
        sort = "new"

    # properly format the subreddit for url construction
    if "/r/" not in sb:
        sb = "/r/" + sb

    # quote_plus turns spaces into "+" and escapes characters such as
    # "&" or "#" that would otherwise break the query string
    query = quote_plus(search)

    url = "https://www.reddit.com%s/" % (sb) + \
        "search?q=%s&sort=%s&restrict_sr=on&t=all" % (query, sort)

    # fetch the HTML; always release the connection pool afterwards
    pool = urllib3.connection_from_url(url)
    try:
        response = pool.urlopen('GET', url)
        html = response.data.decode("utf-8")
    finally:
        pool.close()

    page = BeautifulSoup(html, "html.parser")
    contents = page.find("div", {"class": "contents"})
    if contents is None:
        print("Invalid subreddit entered")
        sys.exit(2)

    # OrderedDict keeps page order even on Pythons older than 3.6
    resultdict = OrderedDict()
    for child in contents.children:
        # each child is re-parsed on its own so that text nodes and tags
        # are handled the same way (the original author notes this is
        # needed for subreddits with custom CSS)
        entry = BeautifulSoup(str(child), "html.parser")
        srheader = entry.find("header")
        if srheader is None:
            # not a search result (e.g. whitespace or a footer element)
            continue

        # the anchor carries both the result url and its title
        link = srheader.find('a', href=True)
        if link is None:
            continue

        title = link.text
        if flair:
            label = srheader.find("span", {"class": "linkflairlabel"})
            if label is not None:
                title = label.text + " " + title
        resultdict[link['href']] = title

    # when run as a script, print each entry in page order
    if __name__ == "__main__":
        for key, title in resultdict.items():
            print("%s: %s" % (key, title))
    return resultdict
def usage_message():
    '''
    Print a standard usage message when using the parser by itself:
    a "usage: " header, the command synopsis, and a tab-aligned table
    with one row per command line option.
    '''
    synopsis = (
        "subreddit_parser [-f] [-s subreddit] [-m sort method] "
        "[-t search term]"
    )
    option_rows = (
        "Option\t\tDefault\t\tExample\t\t\t\tDescription",
        "'-f'\t\tFalse\t\t-f\t\t\t\tAppend flair to entry title",
        "'-t'\t\tNone\t\t-t Planck\t\t\tTerm to search for",
        "'-s'\t\tNone\t\t-s mechmarket\t\t\tSubreddit to search for term",
        "'-m'\t\tnew\t\t-m new, -m relevance, "
        "-m top\tMethod to use for sorting results",
        "'-h'\t\tN/A\t\t-h\t\t\t\tPrints this usage message",
    )
    # one print call per line, in the order they should appear
    for text in ("usage: ", synopsis) + option_rows:
        print(text)
def main():
    '''
    Parse the command line and run a subreddit search.

    Recognised options:
      -s subreddit     subreddit to search (required)
      -t term          search term (required)
      -m method        sorting method, defaults to "new"
      -f               prepend the flair to each title
      -h, --help       print the usage message and exit with status 0

    Exits with status 2, after printing the usage message, when an
    option is unknown or when -s or -t is missing or empty.
    '''
    try:
        # only long options that are actually handled below are accepted,
        # so anything else is reported through GetoptError
        opts, _ = getopt.getopt(sys.argv[1:], "hfs:t:m:", ["help"])
    except getopt.GetoptError as err:
        print(err)
        usage_message()
        sys.exit(2)

    sort = "new"
    sb = ""
    search = ""
    flair = False
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            # asking for help is not an error, hence status 0
            usage_message()
            sys.exit(0)
        elif opt == "-s":
            sb = arg
        elif opt == "-t":
            search = arg
        elif opt == "-m":
            sort = arg
        elif opt == "-f":
            flair = True

    # report every missing required option before bailing out
    fail = False
    if not sb:
        fail = True
        print("Missing option for subreddit: -s")
    if not search:
        fail = True
        print("Missing option for search term: -t")
    if fail:
        usage_message()
        sys.exit(2)

    # get_results prints the entries itself when run as a script
    get_results(sb, search, sort, flair)


if __name__ == "__main__":
    main()