-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgetMostPopularThreads.py
More file actions
75 lines (65 loc) · 3.07 KB
/
getMostPopularThreads.py
File metadata and controls
75 lines (65 loc) · 3.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import sqlite3
import pandas as pd
from datetime import datetime, timedelta
#HEADERS: num, subnum, thread_num, op, timestamp, timestamp_expired, preview_orig,
# preview_w, preview_h, media_filename, media_w, media_h, media_size,
# media_hash, media_orig, spoiler, deleted, capcode, email, name, trip,
# title, comment, sticky, locked, poster_hash, poster_country, exif
# test db: 4plebs_pol_test_database
# test table: poldatabase
# full db: 4plebs_pol_18_03_2018
# full table: poldatabase_18_03_2018
#generates a csv with the most commented on threads, per day, month, or in total. Limit denotes when to stop
def getMostPopularThreads(timeframe='full', limit=25):
print('Connecting to database')
conn = sqlite3.connect("../4plebs_pol_18_03_2018.db")
print('Beginning SQL query to get most popular threads')
limit = limit
headers = ['date','comments','sicky','timestamp','thread_num','op','title','comment']
if timeframe=='full':
df_timethreads = pd.read_sql_query("SELECT COUNT(*)comments, sticky, timestamp, thread_num, title, comment, MAX(op) AS op FROM poldatabase_18_03_2018 WHERE comment LIKE '%cuck%' GROUP BY thread_num ORDER BY comments DESC, op DESC LIMIT ?;", conn, params=[limit])
li_dates = []
dateformat = '%Y-%m-%d-%H:%M:%S'
for timestamp in df_timethreads['timestamp']:
li_dates.append(datetime.fromtimestamp(timestamp).strftime(dateformat))
df_timethreads['date'] = li_dates
if timeframe == 'days' or timeframe == 'months':
if timeframe == 'days':
dateformat = '%Y-%m-%d'
elif timeframe == 'months':
dateformat = '%Y-%m'
print('Getting first and last timestamps')
dates = pd.read_sql_query("SELECT DISTINCT(timestamp) FROM poldatabase_18_03_2018;", conn)
print(dates)
li_alldates = dates['timestamp'].values.tolist()
li_alldates.sort()
firstdate = li_alldates[0]
lastdate = li_alldates[len(li_alldates) - 1]
print(firstdate, lastdate)
df_timethreads = pd.DataFrame(columns=headers)
newtime = ''
minquerydate = firstdate
currenttime = datetime.fromtimestamp(minquerydate).strftime(dateformat)
for timestamp in li_alldates:
#print(timestamp)
if timestamp != lastdate:
newtime = datetime.fromtimestamp(timestamp).strftime(dateformat)
#if there's a new date
if currenttime != newtime:
print('SQL query for ' + str(newtime))
maxquerydate = timestamp
df = pd.DataFrame
df = pd.read_sql_query("SELECT COUNT(*)comments, thread_num, title, comment, MAX(op) AS op, timestamp FROM poldatabase WHERE timestamp > ? AND timestamp < ? GROUP BY thread_num ORDER BY comments DESC, op DESC LIMIT ?;", conn, params=[minquerydate, maxquerydate, limit])
tmp_dates = []
for x in range(len(df['op'])):
tmp_dates.append(newtime)
#tmp_dates = pd.Series(tmp_dates)
df['date'] = tmp_dates
df_timethreads = df_timethreads.append(df)
minquerydate = timestamp
currenttime = newtime
print('Writing results to csv')
df_timethreads.to_csv('top_threads/top_threads_' + timeframe + '.csv', columns=headers)
print(df_timethreads)
#timeframe: days, months, full
getMostPopularThreads(timeframe='days', limit=25)