-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathexample_threshold.py
More file actions
70 lines (60 loc) · 2.82 KB
/
example_threshold.py
File metadata and controls
70 lines (60 loc) · 2.82 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
from textdetection.detection import Thresholds
import pandas as pd
import time
import pickle
import logging
import os
from matplotlib import pyplot as plt
def main():
    """Compute k/threshold values for a benign text dataset, then plot and save them.

    Reads the first ``len_df`` entries of the ``text`` column from the CSV at
    ``data_path`` and passes them to ``Thresholds.calculate_thresholds``.
    The elapsed time and the returned ``(k, threshold)`` pair are printed and
    logged.  ``thd.list_of_thresholds`` is then plotted against
    ``thd.list_of_k``; the plot (PDF) and both lists (pickled) are written
    under ``outputs/<name_file>_<MM-DD_HH-MM>/``.

    Note: the ``*_thresholds.txt`` and ``*_k.txt`` files contain binary
    pickle data despite their extension; read them back with ``pickle.load``
    on a file opened in ``"rb"`` mode.
    """
    # --- Run configuration -------------------------------------------------
    data_path = "../data/benign/imdb_cleaned_all.csv"
    # Alternative dataset left by the original author:
    #   data_path = "../data/benign/yelp_review_full_csv/mytest.csv"
    # (their commented-out loader for it read the "sentence" column instead
    # of "text" -- presumably the Yelp CSV's column name; verify before use).
    len_df = 1000  # 10000 for imdb and 15000 for yelp
    name_file = "imdb"

    df = pd.read_csv(data_path)['text'][:len_df]

    # The timestamp makes every run write into its own output folder.
    time_str = time.strftime("%m-%d_%H-%M")
    folder_name = f'{name_file}_{time_str}'
    output_dir = f"outputs/{folder_name}"
    file_prefix = f"{output_dir}/{name_file}_{len_df}_{time_str}"
    file_name_threshold = f"{file_prefix}_thresholds.txt"
    file_name_k = f"{file_prefix}_k.txt"
    file_name_plot_PDF = f"{file_prefix}_plot.pdf"

    # NOTE(review): logging is never configured in this file; with Python's
    # default root-logger level (WARNING) these INFO records are dropped
    # unless logging is configured elsewhere -- confirm.
    logging.info(data_path)
    logging.info(f"data::::: {name_file}")

    # --- Threshold computation (timed) --------------------------------------
    thd = Thresholds()
    start_time = time.time()
    k_, threshold = thd.calculate_thresholds(df, k=30, chunk=16, up_to_k=True,
                                             multiprocess="pooling", num_process=8,
                                             pair="pdist", percentile=0.1)
    elapsed_time = time.time() - start_time

    logging.info(f"time\t>>>>\t{elapsed_time} s")
    # The \033[...m sequences are ANSI colour codes for terminal output.
    print("time\t>>>>>>>>\t\033[44m\033[31m", elapsed_time, " s \033[00m")
    print("\nK:\t\033[40m \033[31m", k_, "\033[00m\t<<<>>>\tthreshold:\t\033[40m \033[31m", threshold, "\033[00m")
    logging.info(f"K:{k_} <<<>>> threshold:{threshold}")

    # Per-k results that the Thresholds object exposes after the call above.
    results_thresholds = thd.list_of_thresholds
    results_of_k = thd.list_of_k

    # --- Plot threshold against k -------------------------------------------
    fig, ax = plt.subplots(1)
    # Draw the curve once; the original plotted the same data a second time
    # through plt.plot, overlaying an identical line on these axes.
    ax.plot(results_of_k, results_thresholds)
    ax.set_ylim(ymin=0)
    ax.set_xlim(xmin=0)
    ax.set_xlabel("k# of nearest neighbors")
    ax.set_ylabel("Threshold")
    plt.show()

    # --- Persist plot and raw results ---------------------------------------
    # exist_ok=True replaces the duplicated, race-prone exists()/makedirs() pair.
    os.makedirs(output_dir, exist_ok=True)

    fig.savefig(file_name_plot_PDF, bbox_inches='tight')
    logging.info(f"The plot was saved into: {file_name_plot_PDF}")

    with open(file_name_threshold, "wb") as fp:  # Pickling
        pickle.dump(results_thresholds, fp)
    # Log the destination path (the original logged the list contents here).
    logging.info(f"The list of thresholds was saved into: {file_name_threshold}")

    with open(file_name_k, "wb") as fp:  # Pickling
        pickle.dump(results_of_k, fp)
    # Log the destination path (the original logged the list contents here).
    logging.info(f"The list of k values was saved into: {file_name_k}")
# Run the example only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()