-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathtestsimilarity.py
More file actions
117 lines (106 loc) · 4.3 KB
/
testsimilarity.py
File metadata and controls
117 lines (106 loc) · 4.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import pickle
import matplotlib.pyplot as plt
import re
# Module-level data directory. NOTE(review): generate() works from its own
# local `workingdir`, so this value is not read by the code visible here.
workingdir='./data'
# Matches every character outside the range U+4E00..U+9FA5; used to strip a
# decoded page string down to those characters only.
_NON_CJK = re.compile('[^\u4e00-\u9fa5]')


def _load_pickle(path):
    """Unpickle and return the object stored at *path*, closing the file."""
    # NOTE(security): unpickling can execute arbitrary code. Only point this
    # at files produced by a trusted pipeline.
    with open(path, 'rb') as fh:
        return pickle.load(fh)


def _bin_by_score(similarity, field, threshold=0.8):
    """Group hashes with ``similarity[h][field] >= threshold`` by ``int(score * 100)``."""
    bins = {}
    for digest, record in similarity.items():
        score = record[field]
        if score >= threshold:
            bins.setdefault(int(score * 100), []).append(digest)
    return bins


def _write_similarity_csv(path, bins, field, similarity, hashstring, hash2miniapp):
    """Write ``bin,hash,text,score,page_count`` rows to *path*; return the pages hit."""
    hit_pages = set()
    # Explicit UTF-8: the text column holds CJK characters, which a
    # locale-dependent default encoding may be unable to represent.
    with open(path, 'w', encoding='utf-8') as out:
        for score_bin in sorted(bins):
            for digest in bins[score_bin]:
                text = _NON_CJK.sub('', hashstring[digest].decode())
                pages = hash2miniapp[digest]
                # In-place update instead of rebuilding the set on every row.
                hit_pages.update(pages)
                out.write(
                    f"{score_bin},{digest},{text},"
                    f"{similarity[digest][field]},{len(pages)}\n"
                )
    return hit_pages


def _write_hit_pages(path, pages):
    """Write one ``appid,path`` row per (appid, path) pair, in sorted order."""
    with open(path, 'w', encoding='utf-8') as out:
        # Sorted so the file content is reproducible between runs.
        for appid, page_path in sorted(pages):
            out.write(f"{appid},{page_path}\n")


def generate(workingdir="./data"):
    """Export the high-similarity page strings and the mini-app pages they hit.

    Reads three pickles from *workingdir*:

    * ``sim_hash_string`` -- mapping ``hash -> record``; each record must
      provide ``'cosine_1'`` and ``'cosine_2'`` scores.
    * ``hash_string_binded`` -- mapping ``hash -> bytes`` (decoded with the
      default codec).
    * ``miniapp_hash_mappings`` -- iterable of indexable items; per the
      original author's note these are
      ``(appid, k, hashvalue, withbindhashvalue)`` and only indexes 0, 1 and 3
      are used here.

    Writes to *workingdir* (all UTF-8):

    * ``text_cos1_all.csv`` / ``text_cos2_all.csv`` -- for every hash whose
      respective cosine score is >= 0.8, a row
      ``bin,hash,text,score,page_count`` where ``bin`` is
      ``int(score * 100)``; rows are ordered by ascending bin.
    * ``miniapp_hit_pages1.csv`` / ``miniapp_hit_pages2.csv`` -- the distinct
      ``appid,path`` pairs reached by the rows of the matching file above.

    Also prints the number of distinct mapped hashes, then the number of
    distinct pages hit by each of the two score fields.

    Args:
        workingdir: Directory holding the input pickles and receiving the
            CSV output. Defaults to ``"./data"``.

    Raises:
        KeyError: If a selected hash is missing from ``hash_string_binded``
            or from the mappings.
    """
    similarity = _load_pickle(workingdir + '/sim_hash_string')
    cos1d = _bin_by_score(similarity, 'cosine_1')
    cos2d = _bin_by_score(similarity, 'cosine_2')

    hashstring = _load_pickle(workingdir + '/hash_string_binded')
    hashmap = _load_pickle(workingdir + '/miniapp_hash_mappings')

    # Pages are kept as (appid, path) tuples rather than a ";;"-joined string
    # so that a path which itself contains ";;" survives intact.
    hash2miniapp = {}
    for item in hashmap:
        hash2miniapp.setdefault(item[3], set()).add((item[0], item[1]))
    print(len(hash2miniapp))

    hitminiapp1 = _write_similarity_csv(
        workingdir + '/text_cos1_all.csv',
        cos1d, 'cosine_1', similarity, hashstring, hash2miniapp,
    )
    hitminiapp2 = _write_similarity_csv(
        workingdir + '/text_cos2_all.csv',
        cos2d, 'cosine_2', similarity, hashstring, hash2miniapp,
    )
    print(len(hitminiapp1))
    print(len(hitminiapp2))

    _write_hit_pages(workingdir + '/miniapp_hit_pages1.csv', hitminiapp1)
    _write_hit_pages(workingdir + '/miniapp_hit_pages2.csv', hitminiapp2)
# print(hitminiapp1)
# print(hitminiapp2)
# generate('data-original')
# import seaborn as sns
# sns.displot(cos1)
# plt.savefig('cos1.png')
# sns.displot(cos2)
# plt.savefig('cos2.png')