-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcodefragments.py
More file actions
164 lines (119 loc) · 4.9 KB
/
codefragments.py
File metadata and controls
164 lines (119 loc) · 4.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
import os
import sys
from keybert import KeyBERT
import pandas as pd
from pandas import json_normalize
from glom import glom
import json
def count_keys(selected_key, obj):
    """Recursively count how many times *selected_key* occurs as a dict key
    anywhere inside a nested structure of lists and dicts.

    Args:
        selected_key: the dict key to count.
        obj: arbitrarily nested combination of lists, dicts and scalars.

    Returns:
        Total number of occurrences of the key (0 for scalars).
    """
    # Lists contribute the counts of their elements.
    if isinstance(obj, list):
        return sum(count_keys(selected_key, element) for element in obj)
    # Dicts contribute one per matching key, plus whatever the values hold.
    if isinstance(obj, dict):
        total = 0
        for key, value in obj.items():
            if key == selected_key:
                total += 1
            total += count_keys(selected_key, value)
        return total
    # Scalars (str, int, None, ...) contain no keys.
    return 0
def get_all_attributes(selected_key, obj):
    """Recursively collect the string values stored under *selected_key*
    anywhere inside a nested structure of lists and dicts.

    Each matching value is appended with a leading space. Values that
    contain the Slack channel-join notice are skipped. Non-string values
    are now skipped as well: the original code filtered on
    ``str(obj[key])`` but concatenated the raw value, so a list/dict/int
    value raised TypeError.

    Args:
        selected_key: dict key whose values should be gathered.
        obj: arbitrarily nested combination of lists, dicts and scalars.

    Returns:
        One string containing ``' ' + value`` for every match, in
        traversal order (empty string when there are no matches).
    """
    all_messages = ''
    if isinstance(obj, list):
        for item in obj:
            all_messages += get_all_attributes(selected_key, item)
    elif isinstance(obj, dict):
        for key, value in obj.items():
            if key == selected_key:
                channeljoinmsg = "> has joined the channel"
                # Only plain strings are collected; auto-generated
                # channel-join notices are dropped.
                if isinstance(value, str) and channeljoinmsg not in value:
                    all_messages += ' ' + value
            # Keep descending: the key may occur again deeper down.
            all_messages += get_all_attributes(selected_key, value)
    return all_messages
def pandatest(count,attribute, path):
#sets pandas output
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', -1)
with open(path,
"r") as read_file:
jsondata= json.loads(read_file.read())
#gets datagram
pandanorm = json_normalize(jsondata['messages'])
#get only specific rows
text_subtype = pandanorm[["text", "subtype"]]
print(text_subtype)
#gets specific value as condition
print(pandanorm.loc[pandanorm['subtype'] == 'channel_join'])
#gets specific value as condition + count
print(len(pandanorm.loc[pandanorm['subtype'] == 'channel_join']))
# gets all texts without specified subtype
print(text_subtype[pd.isna(text_subtype['subtype'])])
#gets all messages via glom
glomdata = glom(jsondata, ('messages',['text']))
#goes over datalist and removes elements with given string via glom
glomdata[:] = [x for x in glomdata if "> has joined the channel" not in x]
#count messages via glom
print(len(glomdata))
#clean data from uninteded characters etc.
for i,x in enumerate(glomdata):
glomdata[i] = x.replace("\n"," ").replace("\\xa0", " ")
print(glomdata)
def messages_to_txt_pd(path):
    """Extract ordinary messages (rows without a 'subtype') from a Slack
    export JSON file and save them tab-separated under messagetext_dataset/.

    Args:
        path: path to a <channel>.json Slack export file.

    Side effects:
        Creates messagetext_dataset/ if missing and writes
        messagetext_dataset/<channel>.txt (tab-separated, no header/index).
    """
    # Display options kept from the original; -1 was removed in pandas 2.0,
    # None is the supported "no truncation" value.
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', None)
    pd.set_option('display.max_colwidth', None)
    with open(path, "r") as read_file:
        jsondata = json.load(read_file)
    # Derive the output name portably. The original split on '\\', which
    # only worked for Windows-style paths and left directory components
    # in the filename on POSIX.
    filename = os.path.splitext(os.path.basename(path))[0]
    # Flatten the message records and keep only the columns of interest.
    pandanorm = json_normalize(jsondata['messages'])
    text_subtype = pandanorm[["text", "subtype"]]
    # Rows with no subtype are regular user messages.
    messageframe = text_subtype[pd.isna(text_subtype['subtype'])]
    # Ensure the output directory exists, then save.
    os.makedirs('messagetext_dataset', exist_ok=True)
    messageframe.to_csv('messagetext_dataset/' + filename + '.txt',
                        sep='\t', index=False, header=False)
def messages_to_txt_glom(path):
    """Print the cleaned list of message texts from a Slack export JSON
    file (channel-join notices removed, newlines flattened).

    NOTE(review): despite the name, this prints the list rather than
    writing a .txt file — confirm whether file output was intended.

    Args:
        path: path to a channel export JSON file with a top-level
            'messages' list of dicts having a 'text' key.
    """
    with open(path, "r") as read_file:
        jsondata = json.load(read_file)
    # All message texts. A plain comprehension replaces the former
    # glom(jsondata, ('messages', ['text'])) call, which needed a
    # third-party package for a one-line extraction.
    texts = [message['text'] for message in jsondata['messages']]
    # Drop auto-generated channel-join notices.
    texts = [x for x in texts if "> has joined the channel" not in x]
    # Flatten newlines. NOTE(review): "\\xa0" matches the literal four
    # characters backslash-x-a-0, not a non-breaking space — confirm intent.
    texts = [x.replace("\n", " ").replace("\\xa0", " ") for x in texts]
    print(texts)
# main function
def main(count, attribute, path):
    """Load a Slack export JSON file, report how often *count* occurs as a
    dict key, and extract KeyBERT keywords from all *attribute* values.

    Args:
        count: key name whose occurrences are counted.
        attribute: key name whose values form the keyword-extraction corpus.
        path: path to the JSON file to analyse.
    """
    with open(path, "r") as read_file:
        jsondata = json.loads(read_file.read())
    print(count_keys(count, jsondata))
    # TODO need to filter out joined channel messages
    corpus = get_all_attributes(attribute, jsondata)
    extractor = KeyBERT()
    # Tune keyphrase_ngram_range / stop_words here to switch between
    # single keywords and key-sentences.
    result = extractor.extract_keywords(corpus, keyphrase_ngram_range=(1, 2), stop_words=None)
    print(result)
if __name__ == '__main__':
    # Directory containing the Slack-export .json files.
    dataset_path = sys.argv[1]
    # Create the directory messages_to_txt_pd() actually writes into.
    # The original checked the absolute path '/datasets/messagetext_dataset'
    # but created the relative 'datasets/messagetext_dataset', while the
    # writer targets 'messagetext_dataset/' — three different paths;
    # makedirs(exist_ok=True) also cannot fail on a missing parent or an
    # existing directory the way os.mkdir can.
    os.makedirs('messagetext_dataset', exist_ok=True)
    # Convert every JSON channel export in the dataset directory.
    for filename in os.listdir(dataset_path):
        if filename.endswith(".json"):
            # os.path.join works whether or not dataset_path ends with a
            # separator (the original used bare string concatenation).
            messages_to_txt_pd(os.path.join(dataset_path, filename))