import re
import string
import pickle
import os
import csv
import glob
def handle_file(filename, operation='r'):
    """
    Read a file and return its contents as a list,
    where each line is one list item.
    Arguments:
        filename {string} -- the file name with the full path
        operation {string} -- the file mode (default: 'r')
    Returns:
        list -- list of items where each item is a row
    """
    with open(filename, operation) as f:
        data = f.readlines()
    return data
def get_all_filenames_from_dir(directory, suffex, filename_allowed_list=None):
    """
    Given a directory and a suffix for the targeted file extension,
    read all the filenames inside that directory with that suffix.
    Arguments:
        directory {string} -- the full path to the directory
        suffex {string} -- the targeted file extension, such as '.csv'
    Keyword Arguments:
        filename_allowed_list {list} -- only get the files whose names match an entry in this list (default: {None})
    Returns:
        list -- the full paths to the files
    """
    files_list = list()
    if filename_allowed_list is None:
        for item in glob.glob(directory + '*' + suffex):  # Example: /datasets/Stock_dataset/Stocks/*.txt
            files_list.append(item)
    else:
        filename_allowed_list = [v.lower() for v in filename_allowed_list]  # To avoid case sensitivity
        for item in glob.glob(directory + '*' + suffex):
            # Linux is case sensitive, so the lowercased base name must match an allowed name exactly
            if item.split("/")[-1].split('.')[0].lower() in filename_allowed_list:
                files_list.append(item)
        if len(files_list) != len(filename_allowed_list):
            print('Some stock files are missing')
    return files_list
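# A minimal usage sketch for get_all_filenames_from_dir; the directory path and
# the ticker names below are hypothetical:
# stock_files = get_all_filenames_from_dir('/datasets/Stock_dataset/Stocks/', '.txt',
#                                          filename_allowed_list=['aapl', 'goog'])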
def write_dataset(new_dataset, filename="simple_sentence_twitter_dataset_2008_to_2014"):
    # Write the dataset, keeping only rows where a stock mention was detected.
    # Rows with an empty stock column are dropped; those are usually caused by
    # irregular formatting or string encoding.
    with open(filename, "w") as f:
        for item in new_dataset:
            if not item.split(";")[10] == '':
                f.write("%s\n" % item.replace("\n", ""))
def write_file(data, filename):
    # Write a list of strings to a file, stripping '#' and "'" characters
    with open(filename, "w") as f:
        for item in data:
            f.write(str(item).replace('#', '').replace("'", "") + "\n")
# --------------------------------
def unique_instance(un_data):
    """
    Find how many duplicate instances are in a list.
    Arguments:
        un_data {list} -- the list to check for duplicates
    Returns:
        int, list -- how many duplicates were found, and a list of the indexes of those duplicates
    """
    test_dict = dict()
    indexed = list()
    count = 0
    for i, item in enumerate(un_data):
        if hash(item) not in test_dict:
            test_dict[hash(item)] = 0
        else:
            count = count + 1
            indexed.append(i)
    return count, indexed
def remove_repeted(data, num_list):
    """
    Take a list of items and a list of indexes, and remove the items at all of those indexes.
    Arguments:
        data {list} -- the list of items
        num_list {list} -- the list of indexes
    Returns:
        list -- the list of items after removing the items at the specified indexes
    """
    tmp_data = list(data)
    for row_index in sorted(num_list, reverse=True):  # Delete from the end so earlier indexes stay valid
        del tmp_data[row_index]
    return tmp_data
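# A minimal sketch of the deduplication workflow using the two helpers above
# (the sample list is hypothetical):
# count, dup_indexes = unique_instance(['a', 'b', 'a'])   # count == 1, dup_indexes == [2]
# deduped = remove_repeted(['a', 'b', 'a'], dup_indexes)  # ['a', 'b']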
# --------------------------------
def remove_repeted_by_highest_value(data, hieghst_value_index, target_value_index, remove_header=True):
    """
    Remove duplicate tweets, keeping the one with the highest retweet value.
    Arguments:
        data {list} -- the list of items
        hieghst_value_index {int} -- index of the retweets column
        target_value_index {int} -- index of the target text column
    Keyword Arguments:
        remove_header {bool} -- skip the first row if it is a header (default: {True})
    Returns:
        dict -- dictionary of the items in the form of {'max': int, 'item': list}
    """
    data_clone = list(data)
    if remove_header:
        data_clone = data_clone[1:]
    output_data = dict()
    for item in data_clone:
        item = item.replace('\n', '').split(';')
        hash_value = hash(item[target_value_index])
        if hash_value in output_data:
            if int(output_data[hash_value]['max']) < int(item[hieghst_value_index]):
                output_data[hash_value]['max'] = int(item[hieghst_value_index])
                output_data[hash_value]['item'] = item
        else:
            output_data[hash_value] = dict()
            output_data[hash_value]['max'] = int(item[hieghst_value_index])
            output_data[hash_value]['item'] = item
    return output_data
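# A minimal sketch, assuming ';'-separated rows where column 0 holds the text
# and column 1 the retweet count (a hypothetical layout):
# rows = ['text;retweets', 'buy $AAPL;5', 'buy $AAPL;9']
# best = remove_repeted_by_highest_value(rows, hieghst_value_index=1, target_value_index=0)
# # best maps hash('buy $AAPL') to {'max': 9, 'item': ['buy $AAPL', '9']}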
def remove_repeted_by_highest_value_pandas(df_data, target_index, compare_col):
    """
    Remove duplicate tweets, keeping the one with the highest retweet value,
    for a pandas dataframe.
    Arguments:
        df_data {pandas dataframe} -- the pandas dataframe of items
        target_index {string} -- the column name to deduplicate on (for example the tweet text)
        compare_col {string} -- the numeric column whose largest value decides which duplicate to keep (for example the retweets)
    Returns:
        dataframe -- the dataframe after removing the repeated items by highest value
    """
    hash_tmp = {}
    for i, item in df_data[target_index].items():
        if str(item) in hash_tmp:
            if hash_tmp[str(item)]['count'] < int(df_data[compare_col][i]):  # Keep the row with the larger value
                hash_tmp[str(item)]['count'] = int(df_data[compare_col][i])
                hash_tmp[str(item)]['index'] = i
        else:
            hash_tmp[str(item)] = {}
            hash_tmp[str(item)]['count'] = int(df_data[compare_col][i])
            hash_tmp[str(item)]['index'] = i
    return df_data.iloc[[hash_tmp[key]['index'] for key in hash_tmp.keys()]]
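# A minimal sketch with a throwaway dataframe (hypothetical data):
# import pandas as pd
# df = pd.DataFrame({'text': ['buy $AAPL', 'buy $AAPL', 'sell $GOOG'],
#                    'retweets': [5, 9, 2]})
# deduped = remove_repeted_by_highest_value_pandas(df, 'text', 'retweets')
# # keeps the 'buy $AAPL' row with 9 retweets plus the 'sell $GOOG' row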
# ------------------------------------
# Text cleaning functions
remove_char_list = ["~", ",", ":", "\"", "=", "&", ";", "%", "$", "@", "^", "*", "(", ")", "{", "}",
                    "[", "]", "|", "/", "\\", ">", "<", "-", "!", "?", ".", "'", "--", "---", "#"]
def remove_special_chars(tweets):  # This also unrolls hashtags into normal words
    for remove in remove_char_list:
        tweets = tweets.replace(remove, " ")
    return tweets
def remove_by_regex(tweets, regexp):
    return re.sub(regexp, "", tweets, flags=re.MULTILINE)
def remove_urls(tweets):
    # Heuristic URL stripper: collect any token containing 'http'/'https' or
    # 'pic.twitter.com', plus the token right after a URL, and one more token
    # if it looks like a path fragment ('/' or '-'), since links are sometimes
    # split by spaces. Then blank all collected tokens out of the tweet.
    text_to_remove = list()
    url_found = False
    second_search = False
    for text in tweets.split(" "):
        if 'http' in text or 'https' in text:
            text_to_remove.append(text)
            url_found = True
        elif url_found:
            text_to_remove.append(text)
            url_found = False
            second_search = True
        elif second_search and ('/' in text or '-' in text):
            text_to_remove.append(text)
            second_search = False
        elif 'pic.twitter.com' in text:
            text_to_remove.append(text)
        else:
            second_search = False
    for remove in text_to_remove:
        tweets = tweets.replace(remove, ' ')
    return tweets
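# For example (hypothetical tweet text):
# remove_urls('check this http://t.co/xyz out')
# # -> roughly 'check this  ' -- the token after the URL is also dropped by the heuristic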
def remove_usernames(tweets):
    return remove_by_regex(tweets, r"@[^\s]+[\s]?")

def remove_numbers(tweets):
    return remove_by_regex(tweets, r"\s?[0-9]+\.?[0-9]*")

def remove_non_ascii_char(line):
    # join() is needed because filter() returns an iterator, not a string
    return ''.join(filter(lambda x: x in set(string.printable), line))
def find_stocks(words_list, stocks_d):
    stocks_list = list()
    for word in words_list:
        if word.lower() in stocks_d:
            if word.lower() not in stocks_list:  # Do not add a stock twice if the user mentions it again
                stocks_list.append(word.lower())
    return stocks_list
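# For example, with a hypothetical ticker set:
# find_stocks(['Buy', 'AAPL', 'now', 'aapl'], {'aapl'})  # -> ['aapl']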
def remove_stocks_name_from_words_list(words_list, stocks_d):
    found = find_stocks(words_list, stocks_d)
    return [a for a in words_list if a.lower() not in found]
def find_stocks_v2(words_list, stocks_fullname_dict, full_name_stocks_list):
    # Despite the name, this returns the words that remain after dropping the
    # stock prefixes (the dict keys) and the stocks' full names from the text
    stocks_list = list()
    for word in words_list:
        if str(word.replace('$', '')).lower() not in stocks_fullname_dict:
            stocks_list.append(word.lower())
    text = ' '.join(stocks_list)
    for name in full_name_stocks_list:
        text = text.lower().replace(name.lower(), ' ')
    stocks_list = [w for w in text.split(' ') if w]
    return stocks_list
def remove_stocks_name_from_words_list_v2(words_list, stocks_fullname_dict):
    full_name_stocks_list = list()
    for key in stocks_fullname_dict.keys():
        for name in stocks_fullname_dict[key]:
            full_name_stocks_list.append(name)
    return find_stocks_v2(words_list, stocks_fullname_dict, full_name_stocks_list)
def reduce_text(line):
    return remove_numbers(remove_non_ascii_char(remove_special_chars(remove_usernames(remove_urls(line)))))

def reduce_text_with_websites_data_kept(line):
    return remove_numbers(remove_non_ascii_char(remove_special_chars(remove_usernames(line))))
def reduce_w_remove_stock(line, stocks_fullname_dict):
    """
    Reduce the text by taking out special characters, hashtags, mentions, numbers, and urls.
    It also removes all the mentioned stock names and prefixes like $AAPL.
    Arguments:
        line {string} -- the text we want to reduce
        stocks_fullname_dict {dict} -- maps each stock prefix to the possible full names we want to remove
    Returns:
        string -- the tweet after reduction
    """
    full_name_stocks_list = list()
    for key in stocks_fullname_dict.keys():
        for name in stocks_fullname_dict[key]:
            full_name_stocks_list.append(name)
    line = reduce_text(line)
    return ' '.join(find_stocks_v2([w for w in line.split(' ') if w], stocks_fullname_dict, full_name_stocks_list))
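
if __name__ == '__main__':
    # A minimal, hypothetical demo of the cleaning pipeline; the example tweet
    # and the stock dictionary below are illustrative, not from any dataset.
    example_stocks = {'aapl': ['Apple', 'Apple Inc'], 'goog': ['Google']}
    tweet = "@trader Apple $AAPL is up 3 today http://t.co/abc123"
    print(reduce_text(tweet))                            # roughly: 'Apple  AAPL is up today'
    print(reduce_w_remove_stock(tweet, example_stocks))  # 'is up today'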