-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathglobal_method.py
More file actions
113 lines (107 loc) · 4.62 KB
/
global_method.py
File metadata and controls
113 lines (107 loc) · 4.62 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import os
import pandas as pd
import json
from collections import Counter
from indexer import Indexer
class GlobalMethod:
    """Build and query a term-term association matrix for query expansion.

    The matrix is built from the inverted index handed to the constructor:
    terms whose ``'tf'`` is far above the average are selected, and every
    pair of selected terms gets a normalized association score. The matrix
    is cached as JSON next to this module so it is only computed once.

    NOTE(review): posting entries are assumed to map a document (tweet?) id
    to a dict holding a ``'tf'`` value -- that is the only shape this class
    reads; confirm against ``Indexer``.
    """

    # File name of the cached matrix; it lives in this module's directory.
    MATRIX_FILE_NAME = 'Global_method_matrix.json'
    # A term is selected only when its 'tf' exceeds the average 'tf'
    # multiplied by this factor.
    FREQUENCY_THRESHOLD_FACTOR = 50 * 7

    def __init__(self, inverted_index, path):
        """Store the inverted index and the posting-files location.

        Args:
            inverted_index: mapping ``term -> details``; ``details['tf']`` is
                read as the term's total frequency.
            path: forwarded unchanged to ``Indexer`` when reading postings.
        """
        self.inverted_index = inverted_index
        self.matrix = pd.DataFrame()
        self.path = path

    @classmethod
    def _matrix_file(cls):
        """Return the absolute path of the JSON cache file."""
        module_dir = os.path.dirname(os.path.abspath(__file__))
        return os.path.join(module_dir, cls.MATRIX_FILE_NAME)

    @staticmethod
    def _co_occurrence(first_postings, second_postings):
        """Sum ``tf_first * tf_second`` over documents present in both postings."""
        total = 0
        for doc_id in first_postings:
            if doc_id not in second_postings:
                continue
            try:
                total += (int(first_postings[doc_id]['tf'])
                          * int(second_postings[doc_id]['tf']))
            except (KeyError, TypeError, ValueError):
                # Malformed posting entry: report it and skip this document
                # rather than aborting the whole matrix build.
                print("error")
        return total

    def execute_global_method_and_generate_matrix(self):
        """Return the association matrix, computing and caching it if needed.

        If the JSON cache exists it is loaded. Otherwise the matrix is built,
        written to the cache file, stored on ``self.matrix`` and returned.

        Returns:
            pandas.DataFrame: square matrix indexed by the selected terms on
            both axes.
        """
        if os.path.isfile(self._matrix_file()):
            return self.load_json_to_df()

        threshold = int(self.calculate_average_of_frequency()
                        * self.FREQUENCY_THRESHOLD_FACTOR)
        print(threshold)

        # Load the postings of every term that is frequent enough.
        postings_by_term = {}
        for term in self.inverted_index:
            if int(self.inverted_index[term]['tf']) <= threshold:
                continue
            term_details = Indexer.get_details_about_term_in_inverted_index(
                term=term, inverted_index=self.inverted_index)
            postings_by_term[term] = (
                Indexer.get_values_in_posting_file_of_dictionary_term(
                    term=term, pointer=term_details['pt'], path=self.path))

        terms = list(postings_by_term)
        df = pd.DataFrame(index=terms, columns=terms, dtype=float)

        # The score is symmetric, so compute each unordered pair once and
        # mirror it. ``.at`` is used because chained ``df[a][b] = v`` may
        # assign into a temporary copy.
        for position, first in enumerate(terms):
            first_squared = int(self.inverted_index[first]['tf']) ** 2
            for second in terms[position:]:
                second_squared = int(self.inverted_index[second]['tf']) ** 2
                shared = self._co_occurrence(postings_by_term[first],
                                             postings_by_term[second])
                score = self.calculate_frequency_and_normalize(
                    c_i_j=shared, c_i_i=first_squared, c_j_j=second_squared)
                df.at[first, second] = score
                df.at[second, first] = score

        # Write to the same location the cache check above reads from.
        df.to_json(self._matrix_file())
        self.matrix = df
        print(df)
        return df

    def calculate_frequency_and_normalize(self, c_i_j, c_i_i, c_j_j):
        """Return ``c_i_j / (c_i_i + c_j_j - c_i_j)``.

        Returns ``0.0`` when the denominator is zero instead of raising
        ``ZeroDivisionError``.
        """
        denominator = c_i_i + c_j_j - c_i_j
        if denominator == 0:
            return 0.0
        return c_i_j / denominator

    def calculate_average_of_frequency(self):
        """Return the average ``'tf'`` over all index terms, truncated to int.

        Returns ``0`` for an empty inverted index.
        """
        number_of_terms = len(self.inverted_index)
        if number_of_terms == 0:
            return 0
        total = 0
        for details in self.inverted_index.values():
            total += int(details['tf'])
        return int(total / number_of_terms)

    def load_json_to_df(self):
        """Load the cached matrix into ``self.matrix`` and return it."""
        with open(self._matrix_file(), encoding='utf-8') as matrix_file:
            data = json.load(matrix_file)
        self.matrix = pd.DataFrame.from_dict(data, orient='columns')
        return self.matrix

    def get_values_to_expand_query(self, term, top_n=1):
        """Return the terms most strongly associated with ``term``.

        Args:
            term: the query term to expand.
            top_n: how many associated terms to return (default ``1``).

        Returns:
            str: up to ``top_n`` terms joined by single spaces, strongest
            first (ties broken by the larger term), or ``""`` when ``term``
            is not a column of the matrix or has no other terms to offer.
        """
        if term not in self.matrix.columns:
            return ""
        scores = self.matrix[term]
        # Pair as (score, term) so sorting orders by score first.
        candidates = [(scores.loc[other], other)
                      for other in self.matrix.columns
                      if other != term]
        candidates.sort(reverse=True)
        return " ".join(str(other)
                        for _, other in candidates[:max(top_n, 0)])