diff --git a/Part1_TF-IDF/example.py b/Part1_TF-IDF/example.py index 8d0c7d2..8697465 100644 --- a/Part1_TF-IDF/example.py +++ b/Part1_TF-IDF/example.py @@ -29,7 +29,7 @@ def com_tf(): # 统计结果写入result.txt(字典的遍历) for (k, v) in num_dict.items(): - open('data/result.txt', 'a+').write(str(k) + ' ' + str(v) + '\n') # 将k,v转换为str类型 + open('data/result.txt', 'a+').write(f'{str(k)} {str(v)}' + '\n') if __name__ == '__main__': diff --git a/Part1_TF-IDF/src/GrobalParament.py b/Part1_TF-IDF/src/GrobalParament.py index e1169e1..febce78 100644 --- a/Part1_TF-IDF/src/GrobalParament.py +++ b/Part1_TF-IDF/src/GrobalParament.py @@ -17,8 +17,8 @@ ResultFileName = "result.txt" # 搜索结果文件名 path = '/home/kaifun/PycharmProjects/TextInfoExp/Part1_TF-IDF/' # 原始数据 -path1 = path + 'data/title_and_abs/' -newpath = path + "data/pro_keyword/" +path1 = f'{path}data/title_and_abs/' +newpath = f"{path}data/pro_keyword/" newpath2 = path # path1 = 'C:/Users/kaifun/Desktop/ass_TIP/TextInfoProcess/Test_one_TF-IDF/data_afterprocess/title_and_abs/' # 处理后的标题和摘要 diff --git a/Part1_TF-IDF/src/get_TF_IDF.py b/Part1_TF-IDF/src/get_TF_IDF.py index b6152a1..99136cd 100644 --- a/Part1_TF-IDF/src/get_TF_IDF.py +++ b/Part1_TF-IDF/src/get_TF_IDF.py @@ -8,44 +8,39 @@ def TF_IDF_Compute(file_import_url_temp): file_import_url = file_import_url_temp.replace('\\', '/') - data_source = open(file_import_url, 'r') - data = data_source.readline() - word_in_afile_stat = {} - word_in_allfiles_stat = {} - files_num = 0 - while data != "": # 对文件pro_res.txt进行处理 - data_temp_1 = data.strip("\n").split("\t") # file name and key words of a file - data_temp_2 = data_temp_1[1].split(",") # key words of a file - file_name = data_temp_1[0] - data_temp_len = len(data_temp_2) - files_num += 1 - data_dict = {} - data_dict.clear() - for word in data_temp_2: - if word not in word_in_allfiles_stat: - word_in_allfiles_stat[word] = 1 - data_dict[word] = 1 - else: - if word not in data_dict: # 如果这个单词在这个文件中之前没有出现过 + with open(file_import_url, 'r') as data_source: + data = data_source.readline() + word_in_afile_stat = {} + word_in_allfiles_stat = {} + files_num = 0 + while data != "": # 对文件pro_res.txt进行处理 + data_temp_1 = data.strip("\n").split("\t") # file name and key words of a file + data_temp_2 = data_temp_1[1].split(",") # key words of a file + file_name = data_temp_1[0] + data_temp_len = len(data_temp_2) + files_num += 1 + data_dict = {} + data_dict.clear() + for word in data_temp_2: + if word not in word_in_allfiles_stat: + word_in_allfiles_stat[word] = 1 + data_dict[word] = 1 + elif word not in data_dict: # 如果这个单词在这个文件中之前没有出现过 word_in_allfiles_stat[word] += 1 data_dict[word] = 1 - if not word_in_afile_stat.has_key(file_name): - word_in_afile_stat[file_name] = {} - if not word_in_afile_stat[file_name].has_key(word): - word_in_afile_stat[file_name][word] = [] - word_in_afile_stat[file_name][word].append(data_temp_2.count(word)) - word_in_afile_stat[file_name][word].append(data_temp_len) - data = data_source.readline() - data_source.close() - + if not word_in_afile_stat.has_key(file_name): + word_in_afile_stat[file_name] = {} + if not word_in_afile_stat[file_name].has_key(word): + word_in_afile_stat[file_name][word] = [data_temp_2.count(word), data_temp_len] + data = data_source.readline() # filelist = os.listdir(newpath2) # 取得当前路径下的所有文件 TF_IDF_last_result = [] if (word_in_afile_stat) and (word_in_allfiles_stat) and (files_num != 0): - for filename in word_in_afile_stat.keys(): + for filename, value in word_in_afile_stat.items(): TF_IDF_result = {} 
TF_IDF_result.clear() - for word in word_in_afile_stat[filename].keys(): + for word in value.keys(): word_n = word_in_afile_stat[filename][word][0] word_sum = word_in_afile_stat[filename][word][1] with_word_sum = word_in_allfiles_stat[word] @@ -56,19 +51,17 @@ def TF_IDF_Compute(file_import_url_temp): # line = f1.readline() TF_IDF_last_result.append(filename) - TF_IDF_last_result.extend(result_temp[0:10]) + TF_IDF_last_result.extend(result_temp[:10]) # TF_IDF_last_result.append(line) TF_IDF_last_result.append('\n') - f = open("results.txt", "a+") - - for s in TF_IDF_last_result: - # print s - for i in s: - f.write(str(i)) - f.write("\n") - f.close() + with open("results.txt", "a+") as f: + for s in TF_IDF_last_result: + # print s + for i in s: + f.write(str(i)) + f.write("\n") if __name__ == '__main__': diff --git a/Part1_TF-IDF/src/get_data.py b/Part1_TF-IDF/src/get_data.py index caa6200..213ba4a 100644 --- a/Part1_TF-IDF/src/get_data.py +++ b/Part1_TF-IDF/src/get_data.py @@ -14,10 +14,10 @@ # path='/home/mbtrec/mhwang/pro/computer/' base_path = GrobalParament.path -path = base_path + 'data/computer/'# 原始数据 -path1 = base_path + 'data/title_and_abs/' # 处理后的标题和摘要 -newpath = base_path + 'data/pro_keyword/' -newpath2 = base_path + 'data/keyword/' +path = f'{base_path}data/computer/' +path1 = f'{base_path}data/title_and_abs/' +newpath = f'{base_path}data/pro_keyword/' +newpath2 = f'{base_path}data/keyword/' filelist = os.listdir(path) # 取得当前路径下的所有文件 @@ -29,18 +29,15 @@ def get_text(): filename = os.path.splitext(files)[0] # 取文件名 soup = BeautifulSoup(open(path + filename + '.xml'), 'html.parser') # 解析网页 b = soup.find("p", class_="abstracts") # 取得"p", class_="abstracts"为标签的内容 - # print b if b is None or b.string is None: continue - else: - abstracts.extend(soup.title.stripped_strings) - s = b.string - abstracts.extend(s.encode('utf-8')) - f = open(path1 + filename + ".txt", "w+") # 写入txt文件 + abstracts.extend(soup.title.stripped_strings) + s = b.string + abstracts.extend(s.encode('utf-8')) + with open(path1 + filename + ".txt", "w+") as f: for i in abstracts: f.write(i) - f.close() - abstracts = [] + abstracts = [] # getPro_keyword,清洗出xml文件中dl标签中的文本信息 links = soup.find_all("dl") @@ -48,10 +45,9 @@ def get_text(): for link in links: s1 = link.get_text() # print s1 - f = open(newpath + filename + ".txt", "w+") # 将得到的未处理的文字放在pro_keyword文件夹中 - for i in s1: - f.write(i) - f.close() + with open(newpath + filename + ".txt", "w+") as f: + for i in s1: + f.write(i) # 对上一步得到的getPro_keyword文件夹中的文件进行进一步处理,得到每个文件的关键字 @@ -62,18 +58,17 @@ def get_keyword(): filename = os.path.splitext(files)[0] begin = 100000 end = 10000 - f1 = open(newpath + filename + ".txt", "r") - f2 = open(newpath2 + filename + '.txt', "w+") - for (num, value) in enumerate(f1): - if value.count("关键词") > 0: # 得到关键词的行号 - begin = num - if value.count("基金项目") > 0 or value.count("机标分类号") > 0 or value.count("机标关键词") > 0 or value.count( - "基金项目") > 0 or value.count("DOI") > 0: - end = num - if num > begin and num < end and value[:-1].strip(): - f2.write(value.strip()) - f2.write(" ") - f1.close() + with open(newpath + filename + ".txt", "r") as f1: + f2 = open(newpath2 + filename + '.txt', "w+") + for (num, value) in enumerate(f1): + if value.count("关键词") > 0: # 得到关键词的行号 + begin = num + if value.count("基金项目") > 0 or value.count("机标分类号") > 0 or value.count("机标关键词") > 0 or value.count( + "基金项目") > 0 or value.count("DOI") > 0: + end = num + if num > begin and num < end and value[:-1].strip(): + f2.write(value.strip()) + f2.write(" ") f2.close() diff 
--git a/Part1_TF-IDF/src/utils.py b/Part1_TF-IDF/src/utils.py index 17478f7..154afde 100644 --- a/Part1_TF-IDF/src/utils.py +++ b/Part1_TF-IDF/src/utils.py @@ -13,14 +13,17 @@ def fullcut(content): cut_content = jieba.cut(content, cut_all=False) word_list_temp = list(cut_content) word_list = [] - if not GrobalParament.ruler_list: + if GrobalParament.ruler_list: + word_list.extend( + word + for word in word_list_temp + if word not in GrobalParament.ruler_list + ) + + else: r = r'[^/]{2,}' temp = '/'.join(word_list_temp) word_list = re.findall(r, temp) - else: - for word in word_list_temp: - if word not in GrobalParament.ruler_list: - word_list.append(word) return word_list @@ -41,9 +44,8 @@ def halfcut(content): # print len(word_list) if (len(word_list) >= GrobalParament.n): break - else: - word_list = [] - k += 1 + word_list = [] + k += 1 return word_list @@ -59,13 +61,11 @@ def UniToStr_try(str, type_1): def UniToStr(str, *out_Format): if not out_Format: return str.encode('utf-8') - else: - for type_2 in out_Format: - if UniToStr_try(str, type_2): - return str.encode(type_2) - else: - if type_2 == out_Format[-1]: - print ("输入的目标编码格式不正确") + for type_2 in out_Format: + if UniToStr_try(str, type_2): + return str.encode(type_2) + if type_2 == out_Format[-1]: + print ("输入的目标编码格式不正确") # 多字符串替换函数,对于str_source中的某些字符(从*words传入)用char代替 @@ -86,18 +86,17 @@ def StrToUni_try(str, type_1): def StrToUni(str, *type_list): - if not type_list: - if StrToUni_try(str, 'utf-8'): - return str.decode('utf-8') - else: - print ("输入的源文件的编码格式不是utf-8") - else: + if type_list: for type_2 in type_list: if StrToUni_try(str, type_2): return str.decode(type_2) - else: - if type_2 == type_list[-1]: - print ("输入的源文件的编码格式不在您提供的格式列表中") + if type_2 == type_list[-1]: + print ("输入的源文件的编码格式不在您提供的格式列表中") + + elif StrToUni_try(str, 'utf-8'): + return str.decode('utf-8') + else: + print ("输入的源文件的编码格式不是utf-8") # 将所有文本分词,结果汇总到pro_res.txt @@ -107,20 +106,19 @@ def prepro_file(fl_in_url, re_out_url, *wd_be_del): fl_in = os.listdir(in_url) # out_file=out_url+'/'+GrobalParament.PreprocessResultName re_out = open(out_url, 'w') - i = 0 - for file in fl_in: - i += 1 + for i, file in enumerate(fl_in, start=1): print (i) - afile_url = fl_in_url + '/' + file + afile_url = f'{fl_in_url}/{file}' if os.path.isfile(afile_url): afile = open(afile_url, "r") content_temp = "".join(afile.readlines()) - if not wd_be_del: - # content=str_replace("aaiowefhaw","","\t","\n") - content = str_replace(content_temp, "", "\t", "\n", " ") # 删除某些特殊字符如\t,\n等以保证是一行的连续的 - else: - content = str_replace(content_temp, '', *wd_be_del) + content = ( + str_replace(content_temp, '', *wd_be_del) + if wd_be_del + else str_replace(content_temp, "", "\t", "\n", " ") + ) + con_unicode = StrToUni(content, *(GrobalParament.InputFormatList)) if GrobalParament.pattern == "full": cut_result = fullcut(con_unicode) @@ -140,47 +138,42 @@ def prepro_file(fl_in_url, re_out_url, *wd_be_del): def TF_IDF_Compute(file_import_url_temp): file_import_url = file_import_url_temp.replace('\\', '/') - data_source = open(file_import_url, 'r') - data = data_source.readline() - word_in_afile_stat = {} - word_in_allfiles_stat = {} - files_num = 0 - while (data != ""): # 对文件pro_res.txt进行处理 - data_temp_1 = [] - data_temp_2 = [] - data_temp_1 = data.strip("\n").split("\t") # file name and key words of a file - data_temp_2 = data_temp_1[1].split(",") # key words of a file - file_name = data_temp_1[0] - data_temp_len = len(data_temp_2) - files_num += 1 - data_dict = {} - data_dict.clear() - for word in 
data_temp_2: - if word not in word_in_allfiles_stat: - word_in_allfiles_stat[word] = 1 - data_dict[word] = 1 - else: - if word not in data_dict: # 如果这个单词在这个文件中之前没有出现过 + with open(file_import_url, 'r') as data_source: + data = data_source.readline() + word_in_afile_stat = {} + word_in_allfiles_stat = {} + files_num = 0 + while (data != ""): # 对文件pro_res.txt进行处理 + data_temp_1 = [] + data_temp_2 = [] + data_temp_1 = data.strip("\n").split("\t") # file name and key words of a file + data_temp_2 = data_temp_1[1].split(",") # key words of a file + file_name = data_temp_1[0] + data_temp_len = len(data_temp_2) + files_num += 1 + data_dict = {} + data_dict.clear() + for word in data_temp_2: + if word not in word_in_allfiles_stat: + word_in_allfiles_stat[word] = 1 + data_dict[word] = 1 + elif word not in data_dict: # 如果这个单词在这个文件中之前没有出现过 word_in_allfiles_stat[word] += 1 data_dict[word] = 1 - if not word_in_afile_stat.has_key(file_name): - word_in_afile_stat[file_name] = {} - if not word_in_afile_stat[file_name].has_key(word): - word_in_afile_stat[file_name][word] = [] - word_in_afile_stat[file_name][word].append(data_temp_2.count(word)) - word_in_afile_stat[file_name][word].append(data_temp_len) - data = data_source.readline() - data_source.close() - + if file_name not in word_in_afile_stat: # dict.has_key() 在 Python 3 中已移除,改用 in + word_in_afile_stat[file_name] = {} + if word not in word_in_afile_stat[file_name]: + word_in_afile_stat[file_name][word] = [data_temp_2.count(word), data_temp_len] + data = data_source.readline() newpath2 = GrobalParament.newpath2 # filelist = os.listdir(newpath2) # 取得当前路径下的所有文件 TF_IDF_last_result = [] if (word_in_afile_stat) and (word_in_allfiles_stat) and (files_num != 0): - for filename in word_in_afile_stat.keys(): + for filename, value in word_in_afile_stat.items(): TF_IDF_result = {} TF_IDF_result.clear() - for word in word_in_afile_stat[filename].keys(): + for word in value.keys(): word_n = word_in_afile_stat[filename][word][0] word_sum = word_in_afile_stat[filename][word][1] with_word_sum = word_in_allfiles_stat[word] @@ -192,16 +185,12 @@ def TF_IDF_Compute(file_import_url_temp): line = f1.readline() TF_IDF_last_result.append(filename) - TF_IDF_last_result.extend(result_temp[0:3]) - - TF_IDF_last_result.append(line) - TF_IDF_last_result.append('\n') - - f = open("results.txt", "a+") - - for s in TF_IDF_last_result: - # print s - for i in s: - f.write(str(i)) - f.write("\n") - f.close() + TF_IDF_last_result.extend(result_temp[:3]) + + TF_IDF_last_result.extend((line, '\n')) + with open("results.txt", "a+") as f: + for s in TF_IDF_last_result: + # print s + for i in s: + f.write(str(i)) + f.write("\n") diff --git a/Part2_Text_Classify/classifier.py b/Part2_Text_Classify/classifier.py index 728e24a..25ad8e3 100644 --- a/Part2_Text_Classify/classifier.py +++ b/Part2_Text_Classify/classifier.py @@ -25,11 +25,11 @@ def __init__(self, model_name): self.model = None def save_model(self): - with open('./model/' + self.model_name + '.pkl', 'wb') as fw: + with open(f'./model/{self.model_name}.pkl', 'wb') as fw: pickle.dump(self.model, fw) def load_model(self): - with open('./model/' + self.model_name + '.pkl', 'rb') as fr: + with open(f'./model/{self.model_name}.pkl', 'rb') as fr: - self.model = pickle.load(self.model, fr) + self.model = pickle.load(fr) # pickle.load 只接受文件对象 def eval_prob(self, X_test): diff --git a/Part2_Text_Classify/cnn-text-classification-tf-chinese/data_helpers.py b/Part2_Text_Classify/cnn-text-classification-tf-chinese/data_helpers.py index f5607f9..9f596bc 100644 ---
a/Part2_Text_Classify/cnn-text-classification-tf-chinese/data_helpers.py +++ b/Part2_Text_Classify/cnn-text-classification-tf-chinese/data_helpers.py @@ -106,7 +106,7 @@ def batch_iter(data, batch_size, num_epochs): data = np.array(data) data_size = len(data) num_batches_per_epoch = int(len(data)/batch_size) + 1 - for epoch in range(num_epochs): + for _ in range(num_epochs): # Shuffle the data at each epoch shuffle_indices = np.random.permutation(np.arange(data_size)) shuffled_data = data[shuffle_indices] diff --git a/Part2_Text_Classify/cnn-text-classification-tf-chinese/text_cnn.py b/Part2_Text_Classify/cnn-text-classification-tf-chinese/text_cnn.py index a518166..fda16fa 100644 --- a/Part2_Text_Classify/cnn-text-classification-tf-chinese/text_cnn.py +++ b/Part2_Text_Classify/cnn-text-classification-tf-chinese/text_cnn.py @@ -18,9 +18,9 @@ def linear(input_, output_size, scope=None): shape = input_.get_shape().as_list() if len(shape) != 2: - raise ValueError("Linear is expecting 2D arguments: %s" % str(shape)) + raise ValueError(f"Linear is expecting 2D arguments: {str(shape)}") if not shape[1]: - raise ValueError("Linear expects shape[1] of arguments: %s" % str(shape)) + raise ValueError(f"Linear expects shape[1] of arguments: {str(shape)}") input_size = shape[1] # Now the computation. @@ -83,7 +83,7 @@ def __init__( # Create a convolution + maxpool layer for each filter size pooled_outputs = [] for filter_size, num_filter in zip(filter_sizes, num_filters): - with tf.name_scope("conv-maxpool-%s" % filter_size): + with tf.name_scope(f"conv-maxpool-{filter_size}"): # Convolution Layer filter_shape = [filter_size, embedding_size, 1, num_filter] W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W") diff --git a/Part2_Text_Classify/cnn-text-classification-tf-chinese/train.py b/Part2_Text_Classify/cnn-text-classification-tf-chinese/train.py index 8c98793..a8e4850 100755 --- a/Part2_Text_Classify/cnn-text-classification-tf-chinese/train.py +++ b/Part2_Text_Classify/cnn-text-classification-tf-chinese/train.py @@ -33,7 +33,7 @@ FLAGS = tf.flags.FLAGS print("\nParameters:") for attr, value in sorted(FLAGS.__flags.iteritems()): - print("{}={}".format(attr.upper(), value)) + print(f"{attr.upper()}={value}") print("") @@ -43,7 +43,7 @@ # Load data print("Loading data...") x, y, vocabulary, vocabulary_inv = data_helpers.load_data() -print x,y,vocabulary,vocabulary_inv +print(x, y, vocabulary, vocabulary_inv) # Randomly shuffle data np.random.seed(10) shuffle_indices = np.random.permutation(np.arange(len(y))) diff --git a/Part2_Text_Classify/feature.py b/Part2_Text_Classify/feature.py index bee5770..3bd3dfe 100644 --- a/Part2_Text_Classify/feature.py +++ b/Part2_Text_Classify/feature.py @@ -18,8 +18,7 @@ def get_text(): data = ''.join(re.findall(u'[\u4e00-\u9fff]+', text)) # 必须为unicode类型,取出所有中文字符 也可去除停用词 text_tag.append([file.strip('.txt'), data, tag]) - df = pd.DataFrame(text_tag, columns=['id', 'text', 'tag']) - return df + return pd.DataFrame(text_tag, columns=['id', 'text', 'tag']) def get_feature(row): @@ -28,8 +27,7 @@ def get_feature(row): # 简单的写两个特征,实际中文本分类可用tf-df poi 向量等方式构造特征 text_len = len(text) isHasSH = 1 if '上海' in text else 0 - features = [text_len, isHasSH] - return features + return [text_len, isHasSH] def load_data(): @@ -39,12 +37,11 @@ def load_data(): X, Y = df.ix[:, 1:].values, df.ix[:, 0].values # print(X[0:10], Y[0:10]) X = list(map(lambda x: list(x)[0], X)) - print(X[0:10]) - print(Y[0:10]) + print(X[:10]) + print(Y[:10]) X_train, X_test, y_train, y_test =
train_test_split(X, Y, test_size=0.25, random_state=1, shuffle=True) return X_train, X_test, y_train, y_test -if __name__ == '__main__': - pass +pass diff --git a/Part2_Text_Classify/src/get_cls.py b/Part2_Text_Classify/src/get_cls.py index 5d99af3..590cbdb 100644 --- a/Part2_Text_Classify/src/get_cls.py +++ b/Part2_Text_Classify/src/get_cls.py @@ -41,10 +41,9 @@ def get_text(item): data3 = " ".join(data2) # 结果转换为字符串(列表转换为字符串) data_dict[data3] = item - f2 = open('%s.txt' % item, 'a+') - for (k, v) in data_dict.items(): - f2.write(v + ',' + k + ' ' + '\n') - f2.close() + with open(f'{item}.txt', 'a+') as f2: + for (k, v) in data_dict.items(): + f2.write(v + ',' + k + ' ' + '\n') # 获取数据和标记 @@ -80,14 +79,16 @@ def trans_text(): f3 = open('id2class2.txt', 'a') filelist = os.listdir(base_path) for files in filelist: - # print (files) - f = open(base_path + files, 'r') - text = (f.read().decode('GB2312', 'ignore').encode('utf-8')) - salt = ''.join(random.sample(string.ascii_letters + string.digits, 8)) # 产生随机数 - f2 = open("C:/Users/kaifun/Desktop/ass_TIP/TextInfoExp/Part2_Text_Classify/test3/" + salt + '.txt', 'w') - f2.write(text) - f3.write(salt + ' ' + 'e' + '\n') - f.close() + with open(base_path + files, 'r') as f: + text = (f.read().decode('GB2312', 'ignore').encode('utf-8')) + salt = ''.join(random.sample(string.ascii_letters + string.digits, 8)) # 产生随机数 + f2 = open( + f"C:/Users/kaifun/Desktop/ass_TIP/TextInfoExp/Part2_Text_Classify/test3/{salt}.txt", + 'w', + ) + + f2.write(text) + f3.write(f'{salt} e' + '\n') f2.close() @@ -121,7 +122,7 @@ def get_classify(): f = open('result2.txt', 'w') for i in range(len(test_name)): - f.write(str(test_name[i]) + ' ' + str(result[i]) + '\n') + f.write(f'{str(test_name[i])} {str(result[i])}' + '\n') print (result, len(result)) num_dict = Counter(result) diff --git a/Part3_Text_Cluster/src/TextCluster.py b/Part3_Text_Cluster/src/TextCluster.py index 2970623..9372162 100644 --- a/Part3_Text_Cluster/src/TextCluster.py +++ b/Part3_Text_Cluster/src/TextCluster.py @@ -43,9 +43,8 @@ def load_processfile(self, process_file): def output_file(self, out_file, item): try: - fw = open(out_file, "a") - fw.write('%s' % (item.encode("utf-8"))) - fw.close() + with open(out_file, "a") as fw: + fw.write(f'{item.encode("utf-8")}') except: logging.error(traceback.format_exc()) return False, "out file fail" @@ -56,13 +55,11 @@ def __del__(self): def process(self, process_file, tf_ResFileName, tfidf_ResFileName, num_clusters, cluster_ResFileName): try: - sen_seg_list = [] flag, lines = self.load_processfile(process_file) if flag == False: logging.error("load error") return False, "load error" - for line in lines: - sen_seg_list.append(self.seg_words(line)) + sen_seg_list = [self.seg_words(line) for line in lines] # 该类会将文本中的词语转换为词频矩阵,矩阵元素a[i][j] 表示j词在i类文本下的词频 tf_vectorizer = CountVectorizer() @@ -139,10 +136,10 @@ def process(self, process_file, tf_ResFileName, tfidf_ResFileName, num_clusters, while count <= len(km.labels_): clusterRes.write(str(count) + '\t' + str(km.labels_[count - 1])) clusterRes.write('\r\n') - count = count + 1 + count += 1 clusterRes.close() - # 用来评估簇的个数是否合适,距离越小说明簇分的越好,选取临界点的簇个数 958.137281791 - # print(km.inertia_) + # 用来评估簇的个数是否合适,距离越小说明簇分的越好,选取临界点的簇个数 958.137281791 + # print(km.inertia_) except: logging.error(traceback.format_exc()) return False, "process fail" diff --git a/Part3_Text_Cluster/src/get_res.py b/Part3_Text_Cluster/src/get_res.py index ad7d614..f287c66 100644 --- a/Part3_Text_Cluster/src/get_res.py +++ 
b/Part3_Text_Cluster/src/get_res.py @@ -18,36 +18,37 @@ def get_text(): base_path = "C:/Users/kaifun/Desktop/ass_TIP/TextInfoExp/Part3_Text_Cluster/data/" filelist = os.listdir(base_path) data_dict = {} - f2 = open('../data.txt', 'w') - for files in filelist: - # print (files) - f = open(base_path + files, 'r') - text = f.read().replace('\n', '') - - data_temp = text.decode('utf-8') # 转换为unicode编码形式 - data = ''.join(re.findall(u'[\u4e00-\u9fff]+', data_temp)) # 必须为unicode类型,取出所有中文字符 - f2.write(data.encode('utf-8') + '\n') - # data2 = jieba.cut(data) # 分词 - # data3 = " ".join(data2) # 结果转换为字符串(列表转换为字符串) - # data_dict[data3] = "Art" + with open('../data.txt', 'w') as f2: + for files in filelist: + # print (files) + f = open(base_path + files, 'r') + text = f.read().replace('\n', '') - f2.close() + data_temp = text.decode('utf-8') # 转换为unicode编码形式 + data = ''.join(re.findall(u'[\u4e00-\u9fff]+', data_temp)) # 必须为unicode类型,取出所有中文字符 + f2.write(data.encode('utf-8') + '\n') + # data2 = jieba.cut(data) # 分词 + # data3 = " ".join(data2) # 结果转换为字符串(列表转换为字符串) + # data_dict[data3] = "Art" def trans_text(item): - base_path = "C:/Users/kaifun/Desktop/ass_TIP/TextInfoExp/Part3_Text_Cluster/data_temp2/%s/" % item + base_path = f"C:/Users/kaifun/Desktop/ass_TIP/TextInfoExp/Part3_Text_Cluster/data_temp2/{item}/" + filelist = os.listdir(base_path) # salt = ''.join(random.sample(string.ascii_letters + string.digits, 8)) f3 = open('id2class2.txt', 'a') for files in filelist: - # print (files) - f = open(base_path + files, 'r') - text = (f.read().decode('GB2312', 'ignore').encode('utf-8')) - salt = ''.join(random.sample(string.ascii_letters + string.digits, 8)) # 产生随机数 - f2 = open("C:/Users/kaifun/Desktop/ass_TIP/TextInfoExp/Part3_Text_Cluster/data_test/" + salt + '.txt', 'w') - f2.write(text) - f3.write(salt + ' ' + item + '\n') # 添加类别 - f.close() + with open(base_path + files, 'r') as f: + text = (f.read().decode('GB2312', 'ignore').encode('utf-8')) + salt = ''.join(random.sample(string.ascii_letters + string.digits, 8)) # 产生随机数 + f2 = open( + f"C:/Users/kaifun/Desktop/ass_TIP/TextInfoExp/Part3_Text_Cluster/data_test/{salt}.txt", + 'w', + ) + + f2.write(text) + f3.write(f'{salt} ' + item + '\n') f2.close() @@ -57,49 +58,44 @@ def trans_res(): # print (data) # print (data2) res_dic = {} - f = open('result.txt', 'w') - for i in range(len(data2)): - res_dic[data.iloc[i, 0]] = data2.iloc[i, 1] - # f.write(str(data.iloc[i,0]) +' ' + str(data2.iloc[i,1]) + '\n' ) - # print (res_dic) - res = sorted(res_dic.items(), key=lambda e: e[1], reverse=False) - print (res) - print (type(res)) - - ff0 = open('data_abs0.txt', 'w') - ff1 = open('data_abs1.txt', 'w') - ff2 = open('data_abs2.txt', 'w') - for (k, v) in res: - if str(v) == '1': - fff1 = open("C:/Users/kaifun/Desktop/ass_TIP/TextInfoExp/Part3_Text_Cluster/data/%s.txt" % k, 'r') - text = fff1.read() - data_temp = text.decode('utf-8') # 转换为unicode编码形式 - data = ''.join(re.findall(u'[\u4e00-\u9fff]+', data_temp)) # 必须为unicode类型,取出所有中文字符 - fff1.close() - ff1.write(data.encode('utf-8')) - # text_list1.append(k) - if str(v) == '2': - fff2 = open("C:/Users/kaifun/Desktop/ass_TIP/TextInfoExp/Part3_Text_Cluster/data/%s.txt" % k, 'r') - text = fff2.read() - data_temp = text.decode('utf-8') # 转换为unicode编码形式 - data = ''.join(re.findall(u'[\u4e00-\u9fff]+', data_temp)) # 必须为unicode类型,取出所有中文字符 - fff2.close() - ff2.write(data.encode('utf-8')) - # text_list2.append(k) - if str(v) == '0': - fff0 = open("C:/Users/kaifun/Desktop/ass_TIP/TextInfoExp/Part3_Text_Cluster/data/%s.txt" % k, 
'r') - text = fff0.read() - data_temp = text.decode('utf-8') # 转换为unicode编码形式 - data = ''.join(re.findall(u'[\u4e00-\u9fff]+', data_temp)) # 必须为unicode类型,取出所有中文字符 - fff0.close() - ff0.write(data.encode('utf-8')) - # text_list0.append(k) - f.write(str(k) + ' ' + str(v) + '\n') - - ff0.close() - ff1.close() - ff2.close() - f.close() + with open('result.txt', 'w') as f: + for i in range(len(data2)): + res_dic[data.iloc[i, 0]] = data2.iloc[i, 1] + # f.write(str(data.iloc[i,0]) +' ' + str(data2.iloc[i,1]) + '\n' ) + # print (res_dic) + res = sorted(res_dic.items(), key=lambda e: e[1], reverse=False) + print (res) + print (type(res)) + + with open('data_abs0.txt', 'w') as ff0: + ff1 = open('data_abs1.txt', 'w') + ff2 = open('data_abs2.txt', 'w') + for (k, v) in res: + if str(v) == '1': + with open(f"C:/Users/kaifun/Desktop/ass_TIP/TextInfoExp/Part3_Text_Cluster/data/{k}.txt", 'r') as fff1: + text = fff1.read() + data_temp = text.decode('utf-8') # 转换为unicode编码形式 + data = ''.join(re.findall(u'[\u4e00-\u9fff]+', data_temp)) # 必须为unicode类型,取出所有中文字符 + ff1.write(data.encode('utf-8')) + # text_list1.append(k) + if str(v) == '2': + with open(f"C:/Users/kaifun/Desktop/ass_TIP/TextInfoExp/Part3_Text_Cluster/data/{k}.txt", 'r') as fff2: + text = fff2.read() + data_temp = text.decode('utf-8') # 转换为unicode编码形式 + data = ''.join(re.findall(u'[\u4e00-\u9fff]+', data_temp)) # 必须为unicode类型,取出所有中文字符 + ff2.write(data.encode('utf-8')) + # text_list2.append(k) + if str(v) == '0': + with open(f"C:/Users/kaifun/Desktop/ass_TIP/TextInfoExp/Part3_Text_Cluster/data/{k}.txt", 'r') as fff0: + text = fff0.read() + data_temp = text.decode('utf-8') # 转换为unicode编码形式 + data = ''.join(re.findall(u'[\u4e00-\u9fff]+', data_temp)) # 必须为unicode类型,取出所有中文字符 + ff0.write(data.encode('utf-8')) + # text_list0.append(k) + f.write(f'{str(k)} {str(v)}' + '\n') + + ff1.close() + ff2.close() def get_metrics(): diff --git a/Part5_Sentiment_Analysis/src/classifiers.py b/Part5_Sentiment_Analysis/src/classifiers.py index 31f602f..d7c8071 100644 --- a/Part5_Sentiment_Analysis/src/classifiers.py +++ b/Part5_Sentiment_Analysis/src/classifiers.py @@ -18,12 +18,24 @@ def __init__(self): # 准备情感词典词典 self.__phrase_dict = self.__get_phrase_dict() - self.__positive_dict = self.__get_dict(self.__root_filepath + "positive_dict.txt") - self.__negative_dict = self.__get_dict(self.__root_filepath + "negative_dict.txt") - self.__conjunction_dict = self.__get_dict(self.__root_filepath + "conjunction_dict.txt") - self.__punctuation_dict = self.__get_dict(self.__root_filepath + "punctuation_dict.txt") - self.__adverb_dict = self.__get_dict(self.__root_filepath + "adverb_dict.txt") - self.__denial_dict = self.__get_dict(self.__root_filepath + "denial_dict.txt") + self.__positive_dict = self.__get_dict( + f"{self.__root_filepath}positive_dict.txt" + ) + + self.__negative_dict = self.__get_dict( + f"{self.__root_filepath}negative_dict.txt" + ) + + self.__conjunction_dict = self.__get_dict( + f"{self.__root_filepath}conjunction_dict.txt" + ) + + self.__punctuation_dict = self.__get_dict( + f"{self.__root_filepath}punctuation_dict.txt" + ) + + self.__adverb_dict = self.__get_dict(f"{self.__root_filepath}adverb_dict.txt") + self.__denial_dict = self.__get_dict(f"{self.__root_filepath}denial_dict.txt") def classify(self, sentence): return self.analyse_sentence(sentence) @@ -53,7 +65,7 @@ def analyse_sentence(self, sentence, runout_filepath=None, print_show=False): comment_analysis = {"score": 0} # 将评论分句 - the_clauses = self.__divide_sentence_into_clauses(sentence + "%") + 
the_clauses = self.__divide_sentence_into_clauses(f"{sentence}%") # 对每分句进行情感分析 for i in range(len(the_clauses)): @@ -61,7 +73,7 @@ def analyse_sentence(self, sentence, runout_filepath=None, print_show=False): sub_clause = self.__analyse_clause(the_clauses[i].replace("。", "."), runout_filepath, print_show) # 将子句分析的数据结果添加到整体数据结构中 - comment_analysis["su-clause" + str(i)] = sub_clause + comment_analysis[f"su-clause{str(i)}"] = sub_clause comment_analysis['score'] += sub_clause['score'] if runout_filepath is not None: @@ -76,10 +88,7 @@ def analyse_sentence(self, sentence, runout_filepath=None, print_show=False): self.__output_analysis(comment_analysis) print(comment_analysis) - if comment_analysis["score"] > 0: - return 1 - else: - return 0 + return 1 if comment_analysis["score"] > 0 else 0 def __analyse_clause(self, the_clause, runout_filepath, print_show): sub_clause = {"score": 0, "positive": [], "negative": [], "conjunction": [], "punctuation": [], "pattern": []} @@ -112,15 +121,17 @@ def __analyse_clause(self, the_clause, runout_filepath, print_show): sub_clause["score"] += judgement["score"] if judgement["score"] >= 0: sub_clause["positive"].append(judgement) - elif judgement["score"] < 0: + else: sub_clause["negative"].append(judgement) match_result = judgement["key"].split(":")[-1] i = 0 while i < len(seg_result): - if seg_result[i].word in match_result: - if i + 1 == len(seg_result) or seg_result[i + 1].word in match_result: - del (seg_result[i]) - continue + if seg_result[i].word in match_result and ( + i + 1 == len(seg_result) + or seg_result[i + 1].word in match_result + ): + del (seg_result[i]) + continue i += 1 # 逐个分析分词 @@ -153,10 +164,7 @@ def __analyse_clause(self, the_clause, runout_filepath, print_show): def __is_clause_pattern2(the_clause): re_pattern = re.compile(r".*(如果|要是|希望).+就[\u4e00-\u9fa5]+(好|完美)了") match = re_pattern.match(the_clause) - if match is not None: - pattern = {"key": "如果…就好了", "value": 1.0} - return pattern - return "" + return {"key": "如果…就好了", "value": 1.0} if match is not None else "" def __is_clause_pattern3(self, the_clause, seg_result): for a_phrase in self.__phrase_dict: @@ -172,9 +180,12 @@ def __is_clause_pattern3(self, the_clause, seg_result): if match is not None: can_continue = True pos = [flag for word, flag in posseg.cut(match.group())] - if "between_tag" in keys: - if a_phrase["between_tag"] not in pos and len(pos) > 2: - can_continue = False + if ( + "between_tag" in keys + and a_phrase["between_tag"] not in pos + and len(pos) > 2 + ): + can_continue = False if can_continue: for i in range(len(seg_result)): @@ -208,30 +219,22 @@ def __analyse_word(self, the_word, seg_result=None, index=-1): # 判断是否是负向情感词 judgement = self.__is_word_negative(the_word, seg_result, index) - if judgement != "": - return 4, judgement - - return 0, "" + return (4, judgement) if judgement != "" else (0, "") @staticmethod def __is_clause_pattern1(the_clause): re_pattern = re.compile(r".*(要|选)的.+(送|给).*") match = re_pattern.match(the_clause) - if match is not None: - pattern = {"key": "要的是…给的是…", "value": 1} - return pattern - return "" + return {"key": "要的是…给的是…", "value": 1} if match is not None else "" def __is_word_conjunction(self, the_word): if the_word in self.__conjunction_dict: - conjunction = {"key": the_word, "value": self.__conjunction_dict[the_word]} - return conjunction + return {"key": the_word, "value": self.__conjunction_dict[the_word]} return "" def __is_word_punctuation(self, the_word): if the_word in self.__punctuation_dict: - punctuation = {"key": 
the_word, "value": self.__punctuation_dict[the_word]} - return punctuation + return {"key": the_word, "value": self.__punctuation_dict[the_word]} return "" def __is_word_positive(self, the_word, seg_result, index): @@ -329,8 +332,8 @@ def __output_analysis(self, comment_analysis, runout_filepath=None): output = "Score:" + str(comment_analysis["score"]) + "\n" for i in range(len(comment_analysis) - 1): - output += "Sub-clause" + str(i) + ": " - clause = comment_analysis["su-clause" + str(i)] + output += f"Sub-clause{str(i)}: " + clause = comment_analysis[f"su-clause{str(i)}"] if len(clause["conjunction"]) > 0: output += "conjunction:" for punctuation in clause["conjunction"]: @@ -378,29 +381,27 @@ def __divide_sentence_into_clauses(self, the_sentence): pattern = re.compile(r"([,、。%!;??,!~~.… ]*)([\u4e00-\u9fa5]*?(要|选)" r"的.+(送|给)[\u4e00-\u9fa5]+?[,。!%;、??,!~~.… ]+)") match = re.search(pattern, the_sentence.strip()) - if match is not None and len(self.__split_sentence(match.group(2))) <= 2: - to_delete = [] - for i in range(len(the_clauses)): - if the_clauses[i] in match.group(2): - to_delete.append(i) - if len(to_delete) > 0: - for i in range(len(to_delete)): + if match is not None and len(self.__split_sentence(match[2])) <= 2: + if to_delete := [ + i for i in range(len(the_clauses)) if the_clauses[i] in match[2] + ]: + for item in to_delete: the_clauses.remove(the_clauses[to_delete[0]]) - the_clauses.insert(to_delete[0], match.group(2)) + the_clauses.insert(to_delete[0], match[2]) # 识别“要是|如果……就好了”的假设句式 pattern = re.compile(r"([,%。、!;??,!~~.… ]*)([\u4e00-\u9fa5]*?(如果|要是|" r"希望).+就[\u4e00-\u9fa5]+(好|完美)了[,。;!%、??,!~~.… ]+)") match = re.search(pattern, the_sentence.strip()) - if match is not None and len(self.__split_sentence(match.group(2))) <= 3: + if match is not None and len(self.__split_sentence(match[2])) <= 3: to_delete = [] for i in range(len(the_clauses)): - if the_clauses[i] in match.group(2): + if the_clauses[i] in match[2]: to_delete.append(i) - if len(to_delete) > 0: - for i in range(len(to_delete)): + if to_delete: + for item_ in to_delete: the_clauses.remove(the_clauses[to_delete[0]]) - the_clauses.insert(to_delete[0], match.group(2)) + the_clauses.insert(to_delete[0], match[2]) the_clauses[-1] = the_clauses[-1][:-1] return the_clauses @@ -417,14 +418,12 @@ def __split_sentence(sentence): pass punctuations.append("") - clauses = [''.join(x) for x in zip(split_clauses, punctuations)] - - return clauses + return [''.join(x) for x in zip(split_clauses, punctuations)] def __get_phrase_dict(self): sentiment_dict = [] pattern = re.compile(r"\s+") - with open(self.__root_filepath + "phrase_dict.txt", "r", encoding="utf-8") as f: + with open(f"{self.__root_filepath}phrase_dict.txt", "r", encoding="utf-8") as f: for line in f: a_phrase = {} result = pattern.split(line.strip()) @@ -434,9 +433,8 @@ def __get_phrase_dict(self): for i, a_split in enumerate(result): if i < 2: continue - else: - a, b = a_split.split(":") - a_phrase[a] = b + a, b = a_split.split(":") + a_phrase[a] = b sentiment_dict.append(a_phrase) return sentiment_dict @@ -456,7 +454,7 @@ def __get_dict(path, encoding="utf-8"): @staticmethod def __write_runout_file(path, info, encoding="utf-8"): with open(path, "a") as f: - f.write("%s" % info) + f.write(f"{info}") # ################################################ @@ -535,9 +533,7 @@ def __get_sorted_distances(self, input_data): sq_diff_mat = diff_mat ** 2 sq_distances = sq_diff_mat.sum(axis=1) distances = sq_distances ** 0.5 - sorted_distances = distances.argsort() - - 
return sorted_distances + return distances.argsort() def classify(self, input_data): if isinstance(self.__k, int): @@ -573,10 +569,7 @@ def multiple_k_classify(self, input_data): else: final_record[1] += 1 - if final_record[0] > final_record[1]: - return 0 - else: - return 1 + return 0 if final_record[0] > final_record[1] else 1 def single_k_classify(self, input_data): # get the distance sorted list @@ -593,10 +586,7 @@ def single_k_classify(self, input_data): class_count[label] += 1 i += 1 - if class_count[0] > class_count[1]: - return 0 - else: - return 1 + return 0 if class_count[0] > class_count[1] else 1 # ################################################ @@ -626,18 +616,15 @@ def _train(self, train_data, train_data_labels, best_words=None): total_pos_length, total_neg_length = 0, 0 total_word = set() for i, doc in enumerate(train_data): - if train_data_labels[i] == 1: - for word in doc: - if best_words is None or word in best_words: + for word in doc: + if best_words is None or word in best_words: + if train_data_labels[i] == 1: total_pos_data[word] = total_pos_data.get(word, 0) + 1 total_pos_length += 1 - total_word.add(word) - else: - for word in doc: - if best_words is None or word in best_words: + else: total_neg_data[word] = total_neg_data.get(word, 0) + 1 total_neg_length += 1 - total_word.add(word) + total_word.add(word) self._pos_p = total_pos_length / (total_pos_length + total_neg_length) self._neg_p = total_neg_length / (total_pos_length + total_neg_length) @@ -663,10 +650,7 @@ def classify(self, input_data): neg_score += self._neg_word_p.get(word, 0.) neg_score += np.log(self._neg_p) - if pos_score > neg_score: - return 1 - else: - return 0 + return 1 if pos_score > neg_score else 0 # ################################################ @@ -689,17 +673,14 @@ def prob_weight(self, features, label): def calculate_probability(self, features): weights = [(self.prob_weight(features, label), label) for label in self.labels] try: - z = sum([weight for weight, label in weights]) + z = sum(weight for weight, label in weights) prob = [(weight / z, label) for weight, label in weights] except ZeroDivisionError: return "collapse" return prob def convergence(self, last_weight): - for w1, w2 in zip(last_weight, self.weight): - if abs(w1 - w2) >= 0.001: - return False - return True + return all(abs(w1 - w2) < 0.001 for w1, w2 in zip(last_weight, self.weight)) def train(self, train_data, train_data_labels, best_words=None): print("MaxEntClassifier is training ...... 
") @@ -716,7 +697,7 @@ def train(self, train_data, train_data_labels, best_words=None): if word in best_words: self.feats[(train_data_labels[i], word)] += 1 - the_max = max([len(record) - 1 for record in train_data]) # the_max param for GIS training algorithm + the_max = max(len(record) - 1 for record in train_data) self.weight = [0.0] * len(self.feats) # init weight for each feature ep_empirical = [0.0] * len(self.feats) # init the feature expectation on empirical distribution for i, f in enumerate(self.feats): @@ -763,7 +744,7 @@ def test(self, train_data, train_labels, best_words, test_data): if word in best_words: self.feats[(train_labels[i], word)] += 1 - the_max = max([len(record) - 1 for record in train_data]) # the_max param for GIS training algorithm + the_max = max(len(record) - 1 for record in train_data) self.weight = [0.0] * len(self.feats) # init weight for each feature ep_empirical = [0.0] * len(self.feats) # init the feature expectation on empirical distribution for i, f in enumerate(self.feats): @@ -791,9 +772,7 @@ def test(self, train_data, train_labels, best_words, test_data): self.weight[j] += delta # update weight print("MaxEntClassifier is testing ...") - classify_labels = [] - for data in test_data: - classify_labels.append(self.classify(data)) + classify_labels = [self.classify(data) for data in test_data] classify_results.append(classify_labels) # test if the algorithm is convergence @@ -808,10 +787,7 @@ def test(self, train_data, train_labels, best_words, test_data): def classify(self, the_input_features): prob = self.calculate_probability(the_input_features) prob.sort(reverse=True) - if prob[0][0] > prob[1][0]: - return prob[0][1] - else: - return prob[1][1] + return prob[0][1] if prob[0][0] > prob[1][0] else prob[1][1] # ################################################ @@ -832,9 +808,7 @@ def __init__(self, train_data, train_labels, best_words, C): def words2vector(self, all_data): vectors = [] for data in all_data: - vector = [] - for feature in self.best_words: - vector.append(data.count(feature)) + vector = [data.count(feature) for feature in self.best_words] vectors.append(vector) vectors = np.array(vectors) diff --git a/Part5_Sentiment_Analysis/src/corpus.py b/Part5_Sentiment_Analysis/src/corpus.py index 585db12..0dcc052 100644 --- a/Part5_Sentiment_Analysis/src/corpus.py +++ b/Part5_Sentiment_Analysis/src/corpus.py @@ -171,10 +171,8 @@ def test_corpus(): a = WaimaiCorpus() a = Waimai2Corpus() a = HotelCorpus() - pass if __name__ == "__main__": - pass get_movie_corpus() # get_movie2_corpus() # get_hotel_corpus() diff --git a/Part5_Sentiment_Analysis/src/feature_extraction.py b/Part5_Sentiment_Analysis/src/feature_extraction.py index 425b48d..523674a 100644 --- a/Part5_Sentiment_Analysis/src/feature_extraction.py +++ b/Part5_Sentiment_Analysis/src/feature_extraction.py @@ -33,10 +33,7 @@ def __calculate(n_ii, n_ix, n_xi, n_xx): def best_words(self, num, need_score=False): words = sorted(self.words.items(), key=lambda word_pair: word_pair[1], reverse=True) - if need_score: - return [word for word in words[:num]] - else: - return [word[0] for word in words[:num]] + return list(words[:num]) if need_score else [word[0] for word in words[:num]] diff --git a/Part5_Sentiment_Analysis/src/test.py b/Part5_Sentiment_Analysis/src/test.py index c8d3c6d..0fa112d 100644 --- a/Part5_Sentiment_Analysis/src/test.py +++ b/Part5_Sentiment_Analysis/src/test.py @@ -38,16 +38,12 @@ def set_precisions(self, precisions): def test_knn(self): from classifiers import KNNClassifier - 
if type(self.k) == int: - k = "%s" % self.k - else: - k = "-".join([str(i) for i in self.k]) - + k = f"{self.k}" if type(self.k) == int else "-".join([str(i) for i in self.k]) print("KNNClassifier") print("---" * 45) - print("Train num = %s" % self.train_num) - print("Test num = %s" % self.test_num) - print("K = %s" % k) + print(f"Train num = {self.train_num}") + print(f"Test num = {self.test_num}") + print(f"K = {k}") # print self.train_data print (self.train_labels) @@ -55,12 +51,9 @@ def test_knn(self): print (self.train_data[0]) knn = KNNClassifier(self.train_data, self.train_labels, k=self.k, best_words=self.best_words) - classify_labels = [] - print("KNNClassifiers is testing ...") - for data in self.test_data: - classify_labels.append(knn.classify(data)) + classify_labels = [knn.classify(data) for data in self.test_data] print("KNNClassifiers tests over.") filepath = "f_runout/KNN-%s-train-%d-test-%d-f-%d-k-%s-%s.xls" % \ @@ -75,16 +68,14 @@ def test_knn(self): def test_bayes(self): print("BayesClassifier") print("---" * 45) - print("Train num = %s" % self.train_num) - print("Test num = %s" % self.test_num) + print(f"Train num = {self.train_num}") + print(f"Test num = {self.test_num}") from classifiers import BayesClassifier bayes = BayesClassifier(self.train_data, self.train_labels, self.best_words) - classify_labels = [] print("BayesClassifier is testing ...") - for data in self.test_data: - classify_labels.append(bayes.classify(data)) + classify_labels = [bayes.classify(data) for data in self.test_data] print("BayesClassifier tests over.") filepath = "f_runout/Bayes-%s-train-%d-test-%d-f-%d-%s.xls" % \ @@ -106,9 +97,9 @@ def write(self, filepath, classify_labels, i=-1): def test_maxent_iteration(self): print("MaxEntClassifier iteration") print("---" * 45) - print("Train num = %s" % self.train_num) - print("Test num = %s" % self.test_num) - print("maxiter = %s" % self.max_iter) + print(f"Train num = {self.train_num}") + print(f"Test num = {self.test_num}") + print(f"maxiter = {self.max_iter}") from classifiers import MaxEntClassifier @@ -136,9 +127,9 @@ def test_maxent_iteration(self): def test_maxent(self): print("MaxEntClassifier") print("---" * 45) - print("Train num = %s" % self.train_num) - print("Test num = %s" % self.test_num) - print("maxiter = %s" % self.max_iter) + print(f"Train num = {self.train_num}") + print(f"Test num = {self.test_num}") + print(f"maxiter = {self.max_iter}") from classifiers import MaxEntClassifier @@ -146,9 +137,7 @@ def test_maxent(self): m.train(self.train_data, self.train_labels, self.best_words) print("MaxEntClassifier is testing ...") - classify_results = [] - for data in self.test_data: - classify_results.append(m.classify(data)) + classify_results = [m.classify(data) for data in self.test_data] print("MaxEntClassifier tests over.") filepath = "f_runout/MaxEnt-%s-train-%d-test-%d-f-%d-maxiter-%d-%s.xls" % \ @@ -163,17 +152,15 @@ def test_maxent(self): def test_svm(self): print("SVMClassifier") print("---" * 45) - print("Train num = %s" % self.train_num) - print("Test num = %s" % self.test_num) - print("C = %s" % self.C) + print(f"Train num = {self.train_num}") + print(f"Test num = {self.test_num}") + print(f"C = {self.C}") from classifiers import SVMClassifier svm = SVMClassifier(self.train_data, self.train_labels, self.best_words, self.C) - classify_labels = [] print("SVMClassifier is testing ...") - for data in self.test_data: - classify_labels.append(svm.classify(data)) + classify_labels = [svm.classify(data) for data in self.test_data] 
print("SVMClassifier tests over.") filepath = "f_runout/SVM-%s-train-%d-test-%d-f-%d-C-%d-%s-lin.xls" % \ @@ -346,7 +333,6 @@ def test_dict(): if __name__ == "__main__": - pass test_movie() test_movie2() test_waimai() diff --git a/Part5_Sentiment_Analysis/src/tools.py b/Part5_Sentiment_Analysis/src/tools.py index 33ec631..c1b205a 100644 --- a/Part5_Sentiment_Analysis/src/tools.py +++ b/Part5_Sentiment_Analysis/src/tools.py @@ -65,18 +65,24 @@ def write_results(origin_labels, classify_labels, filepath): def get_accuracy(origin_labels, classify_labels, parameters): assert len(origin_labels) == len(classify_labels) - print 'result' - print classify_labels - print len(classify_labels) + assert len(origin_labels) == len(classify_labels) + + assert len(origin_labels) == len(classify_labels) - print 'ori' - print origin_labels - print len(origin_labels) + assert len(origin_labels) == len(classify_labels) - xls_contents = [] + assert len(origin_labels) == len(classify_labels) + + assert len(origin_labels) == len(classify_labels) + + assert len(origin_labels) == len(classify_labels) + + xls_contents = [ + ("train num", parameters[0]), + ("test num", parameters[1]), + ("feature num", parameters[2]), + ] - xls_contents.extend([("train num", parameters[0]), ("test num", parameters[1])]) - xls_contents.append(("feature num", parameters[2])) pos_right, pos_false = 0, 0 neg_right, neg_false = 0, 0 @@ -86,13 +92,18 @@ def get_accuracy(origin_labels, classify_labels, parameters): pos_right += 1 # 负负11 else: neg_false += 1 # 负正10 + elif classify_labels[i] == 0: + neg_right += 1 # 正正00 else: - if classify_labels[i] == 0: - neg_right += 1 # 正正00 - else: - pos_false += 1 # 正负01 - xls_contents.extend([("neg-right", neg_right), ("neg-false", neg_false)]) - xls_contents.extend([("pos-right", pos_right), ("pos-false", pos_false)]) + pos_false += 1 # 正负01 + xls_contents.extend( + [ + ("neg-right", neg_right), + ("neg-false", neg_false), + ("pos-right", pos_right), + ("pos-false", pos_false), + ] + ) print (neg_right, pos_right, neg_false, pos_false) diff --git a/Part6_Relation_Extraction/feature_extract.py b/Part6_Relation_Extraction/feature_extract.py index 88854d7..c9f4085 100644 --- a/Part6_Relation_Extraction/feature_extract.py +++ b/Part6_Relation_Extraction/feature_extract.py @@ -12,11 +12,7 @@ def load_stopwords(): with open(os.path.join(dataDir, 'stopwords.txt'), 'r') as stopwords_file: - stopwords = [] - for line in stopwords_file: - stopwords.append(line.strip()) - - return stopwords + return [line.strip() for line in stopwords_file] # 构建字典:词到id、词性到id的映射 @@ -79,12 +75,9 @@ def generateDic2(sentence_filepath, save_num=15000): def align(sentence_filepath, train_filepath, peopleset_filepath): jieba.load_userdict(os.path.join(dataDir, 'people.txt')) - with open(sentence_filepath, 'r') as sentence_file, open(train_filepath, 'r') as train_r_file, \ - open(peopleset_filepath, 'r') as peopleset_file, \ - open(os.path.join(dataDir, 'train.txt'), 'w') as train_file, \ - open(os.path.join(dataDir, 'test.txt'), 'w') as test_file: + with open(sentence_filepath, 'r') as sentence_file, open(train_filepath, 'r') as train_r_file, open(peopleset_filepath, 'r') as peopleset_file, open(os.path.join(dataDir, 'train.txt'), 'w') as train_file, open(os.path.join(dataDir, 'test.txt'), 'w') as test_file: - train_r_dict = dict() + train_r_dict = {} # loading train relation # 训练集 关系对 @@ -92,7 +85,7 @@ def align(sentence_filepath, train_filepath, peopleset_filepath): line = line.strip() entry = line.split('\t') p1, p2, relation = 
entry[0], entry[1], entry[2] - train_r_dict[p1 + ',' + p2] = relation + train_r_dict[f'{p1},{p2}'] = relation peopleset = set() @@ -118,8 +111,8 @@ def align(sentence_filepath, train_filepath, peopleset_filepath): for j in range(len(peoplelist_line)): if i != j: p2 = peoplelist_line[j] - if p1 + ',' + p2 in train_r_dict: - relation = train_r_dict[p1 + ',' + p2] + if f'{p1},{p2}' in train_r_dict: + relation = train_r_dict[f'{p1},{p2}'] train_file.write(p1 + '\t' + p2 + '\t' + relation + '\t' + line + '\n') else: test_file.write(p1 + '\t' + p2 + '\tunknown\t' + line + '\n') @@ -275,8 +268,7 @@ def feature_extract2(filepath, win=3): # 根据libsvm的预测结果整理,得到预测结果 def handle_libsvm_result(predict_filepath, entitypair_filepath): - with open(predict_filepath, 'r') as predict_file, open(entitypair_filepath, 'r') as entitypair_file, \ - open(os.path.join(dataDir, 'rsl_' + os.path.split(predict_filepath)[-1]), 'w') as rsl_file: + with open(predict_filepath, 'r') as predict_file, open(entitypair_filepath, 'r') as entitypair_file, open(os.path.join(dataDir, f'rsl_{os.path.split(predict_filepath)[-1]}'), 'w') as rsl_file: line_idx = 0 line = predict_file.readline() @@ -300,10 +292,10 @@ def handle_libsvm_result(predict_filepath, entitypair_filepath): def read_relation(relation_filepath): with open(relation_filepath, 'r') as relation_file: - relation_dict = dict() + relation_dict = {} for line in relation_file: p1, p2, relation = line.strip().split('\t') - relation_dict[p1 + ',' + p2] = relation + relation_dict[f'{p1},{p2}'] = relation return relation_dict diff --git a/Part6_Relation_Extraction/libsvm-3.21/python/svm.py b/Part6_Relation_Extraction/libsvm-3.21/python/svm.py index 577160d..7f438e9 100644 --- a/Part6_Relation_Extraction/libsvm-3.21/python/svm.py +++ b/Part6_Relation_Extraction/libsvm-3.21/python/svm.py @@ -82,9 +82,7 @@ def gen_svm_nodearray(xi, feature_max=None, isKernel=None): for idx, j in enumerate(index_range): ret[idx].index = j ret[idx].value = xi[j] - max_idx = 0 - if index_range: - max_idx = index_range[-1] + max_idx = index_range[-1] if index_range else 0 return ret, max_idx class svm_problem(Structure): @@ -99,7 +97,7 @@ def __init__(self, y, x, isKernel=None): max_idx = 0 x_space = self.x_space = [] - for i, xi in enumerate(x): + for xi in x: tmp_xi, tmp_idx = gen_svm_nodearray(xi,isKernel=isKernel) x_space += [tmp_xi] max_idx = max(max_idx, tmp_idx) @@ -121,16 +119,14 @@ class svm_parameter(Structure): _fields_ = genFields(_names, _types) def __init__(self, options = None): - if options == None: + if options is None: options = '' self.parse_options(options) def __str__(self): - s = '' attrs = svm_parameter._names + list(self.__dict__.keys()) values = map(lambda attr: getattr(self, attr), attrs) - for attr, val in zip(attrs, values): - s += (' %s: %s\n' % (attr, val)) + s = ''.join((' %s: %s\n' % (attr, val)) for attr, val in zip(attrs, values)) s = s.strip() return s @@ -170,51 +166,51 @@ def parse_options(self, options): i = 0 while i < len(argv): if argv[i] == "-s": - i = i + 1 + i += 1 self.svm_type = int(argv[i]) elif argv[i] == "-t": - i = i + 1 + i += 1 self.kernel_type = int(argv[i]) elif argv[i] == "-d": - i = i + 1 + i += 1 self.degree = int(argv[i]) elif argv[i] == "-g": - i = i + 1 + i += 1 self.gamma = float(argv[i]) elif argv[i] == "-r": - i = i + 1 + i += 1 self.coef0 = float(argv[i]) elif argv[i] == "-n": - i = i + 1 + i += 1 self.nu = float(argv[i]) elif argv[i] == "-m": - i = i + 1 + i += 1 self.cache_size = float(argv[i]) elif argv[i] == "-c": - i = i + 1 + i += 1 
self.C = float(argv[i]) elif argv[i] == "-e": - i = i + 1 + i += 1 self.eps = float(argv[i]) elif argv[i] == "-p": - i = i + 1 + i += 1 self.p = float(argv[i]) elif argv[i] == "-h": - i = i + 1 + i += 1 self.shrinking = int(argv[i]) elif argv[i] == "-b": - i = i + 1 + i += 1 self.probability = int(argv[i]) elif argv[i] == "-q": self.print_func = PRINT_STRING_FUN(print_null) elif argv[i] == "-v": - i = i + 1 + i += 1 self.cross_validation = 1 self.nr_fold = int(argv[i]) if self.nr_fold < 2: raise ValueError("n-fold cross validation: n must >= 2") elif argv[i].startswith("-w"): - i = i + 1 + i += 1 self.nr_weight += 1 weight_label += [int(argv[i-1][2:])] weight += [float(argv[i])] @@ -280,7 +276,7 @@ def get_sv_coef(self): def get_SV(self): result = [] for sparse_sv in self.SV[:self.l]: - row = dict() + row = {} i = 0 while True: @@ -298,7 +294,7 @@ def toPyModel(model_ptr): Convert a ctypes POINTER(svm_model) to a Python svm_model """ - if bool(model_ptr) == False: + if not bool(model_ptr): raise ValueError("Null pointer") m = model_ptr.contents m.__createfrom__ = 'C' diff --git a/Part6_Relation_Extraction/libsvm-3.21/python/svmutil.py b/Part6_Relation_Extraction/libsvm-3.21/python/svmutil.py index d353010..5caf7c2 100644 --- a/Part6_Relation_Extraction/libsvm-3.21/python/svmutil.py +++ b/Part6_Relation_Extraction/libsvm-3.21/python/svmutil.py @@ -128,11 +128,8 @@ def svm_train(arg1, arg2=None, arg3=None): prob = svm_problem(y, x, isKernel=(param.kernel_type == PRECOMPUTED)) elif isinstance(arg1, svm_problem): prob = arg1 - if isinstance(arg2, svm_parameter): - param = arg2 - else: - param = svm_parameter(arg2) - if prob == None or param == None: + param = arg2 if isinstance(arg2, svm_parameter) else svm_parameter(arg2) + if prob is None or param is None: raise TypeError("Wrong types for the arguments") if param.kernel_type == PRECOMPUTED: @@ -146,9 +143,8 @@ def svm_train(arg1, arg2=None, arg3=None): if param.gamma == 0 and prob.n > 0: param.gamma = 1.0 / prob.n libsvm.svm_set_print_string_function(param.print_func) - err_msg = libsvm.svm_check_parameter(prob, param) - if err_msg: - raise ValueError('Error: %s' % err_msg) + if err_msg := libsvm.svm_check_parameter(prob, param): + raise ValueError(f'Error: {err_msg}') if param.cross_validation: l, nr_fold = prob.l, param.nr_fold @@ -242,10 +238,7 @@ def info(s): for xi in x: xi, idx = gen_svm_nodearray(xi, isKernel=(m.param.kernel_type == PRECOMPUTED)) label = libsvm.svm_predict_values(m, xi, dec_values) - if(nr_class == 1): - values = [1] - else: - values = dec_values[:nr_classifier] + values = [1] if (nr_class == 1) else dec_values[:nr_classifier] pred_labels += [label] pred_values += [values] diff --git a/Part6_Relation_Extraction/libsvm-3.21/tools/checkdata.py b/Part6_Relation_Extraction/libsvm-3.21/tools/checkdata.py index a1d8414..9b4ef27 100644 --- a/Part6_Relation_Extraction/libsvm-3.21/tools/checkdata.py +++ b/Part6_Relation_Extraction/libsvm-3.21/tools/checkdata.py @@ -36,9 +36,8 @@ def main(): print("dataset {0} not found".format(dataset)) exit(1) - line_no = 1 error_line_count = 0 - for line in open(dataset, 'r'): + for line_no, line in enumerate(open(dataset, 'r'), start=1): line_error = False # each line must end with a newline character @@ -51,7 +50,7 @@ def main(): # check label try: label = nodes.pop(0) - + if label.find(',') != -1: # multi-label format try: @@ -92,11 +91,9 @@ def main(): err(line_no, "feature '{0}' not an : pair, integer, real number ".format(nodes[i])) line_error = True - line_no += 1 - if line_error: 
error_line_count += 1 - + if error_line_count > 0: print("Found {0} lines with error.".format(error_line_count)) return 1 diff --git a/Part6_Relation_Extraction/libsvm-3.21/tools/easy.py b/Part6_Relation_Extraction/libsvm-3.21/tools/easy.py index 9cf4362..1ab9390 100644 --- a/Part6_Relation_Extraction/libsvm-3.21/tools/easy.py +++ b/Part6_Relation_Extraction/libsvm-3.21/tools/easy.py @@ -34,16 +34,16 @@ train_pathname = sys.argv[1] assert os.path.exists(train_pathname),"training file not found" file_name = os.path.split(train_pathname)[1] -scaled_file = file_name + ".scale" -model_file = file_name + ".model" -range_file = file_name + ".range" +scaled_file = f"{file_name}.scale" +model_file = f"{file_name}.model" +range_file = f"{file_name}.range" if len(sys.argv) > 2: test_pathname = sys.argv[2] file_name = os.path.split(test_pathname)[1] assert os.path.exists(test_pathname),"testing file not found" - scaled_test_file = file_name + ".scale" - predict_test_file = file_name + ".predict" + scaled_test_file = f"{file_name}.scale" + predict_test_file = f"{file_name}.predict" cmd = '{0} -s "{1}" "{2}" > "{3}"'.format(svmscale_exe, range_file, train_pathname, scaled_file) print('Scaling training data...') diff --git a/Part6_Relation_Extraction/libsvm-3.21/tools/grid.py b/Part6_Relation_Extraction/libsvm-3.21/tools/grid.py index 40cb082..805b813 100644 --- a/Part6_Relation_Extraction/libsvm-3.21/tools/grid.py +++ b/Part6_Relation_Extraction/libsvm-3.21/tools/grid.py @@ -42,52 +42,46 @@ def parse_options(self, options): options = options.split() i = 0 pass_through_options = [] - + while i < len(options): if options[i] == '-log2c': - i = i + 1 + i += 1 if options[i] == 'null': self.grid_with_c = False else: self.c_begin, self.c_end, self.c_step = map(float,options[i].split(',')) elif options[i] == '-log2g': - i = i + 1 + i += 1 if options[i] == 'null': self.grid_with_g = False else: self.g_begin, self.g_end, self.g_step = map(float,options[i].split(',')) elif options[i] == '-v': - i = i + 1 + i += 1 self.fold = options[i] elif options[i] in ('-c','-g'): raise ValueError('Use -log2c and -log2g.') elif options[i] == '-svmtrain': - i = i + 1 + i += 1 self.svmtrain_pathname = options[i] elif options[i] == '-gnuplot': - i = i + 1 - if options[i] == 'null': - self.gnuplot_pathname = None - else: - self.gnuplot_pathname = options[i] + i += 1 + self.gnuplot_pathname = None if options[i] == 'null' else options[i] elif options[i] == '-out': - i = i + 1 - if options[i] == 'null': - self.out_pathname = None - else: - self.out_pathname = options[i] + i += 1 + self.out_pathname = None if options[i] == 'null' else options[i] elif options[i] == '-png': - i = i + 1 + i += 1 self.png_pathname = options[i] elif options[i] == '-resume': if i == (len(options)-1) or options[i+1].startswith('-'): - self.resume_pathname = self.dataset_title + '.out' + self.resume_pathname = f'{self.dataset_title}.out' else: - i = i + 1 + i += 1 self.resume_pathname = options[i] else: pass_through_options.append(options[i]) - i = i + 1 + i += 1 self.pass_through_string = ' '.join(pass_through_options) if not os.path.exists(self.svmtrain_pathname): @@ -142,7 +136,7 @@ def redraw(db,best_param,gnuplot,options,tofile=False): " at screen 0.5,0.8 center\n".format(2**best_log2c, 2**best_log2g).encode()) gnuplot.write(b"set key at screen 0.9,0.9\n") gnuplot.write(b"splot \"-\" with lines\n") - + db.sort(key = lambda x:(x[0], -x[1])) prevc = db[0][0] @@ -161,29 +155,31 @@ def calculate_jobs(options): def range_f(begin,end,step): # like range, but 
works on non-integer too seq = [] - while True: - if step > 0 and begin > end: break - if step < 0 and begin < end: break + while ( + True + and not (step > 0 and begin > end) + and not (step < 0 and begin < end) + ): seq.append(begin) begin = begin + step return seq - + def permute_sequence(seq): n = len(seq) if n <= 1: return seq - + mid = int(n/2) left = permute_sequence(seq[:mid]) right = permute_sequence(seq[mid+1:]) - + ret = [seq[mid]] while left or right: if left: ret.append(left.pop(0)) if right: ret.append(right.pop(0)) - + return ret - + c_seq = permute_sequence(range_f(options.c_begin,options.c_end,options.c_step)) g_seq = permute_sequence(range_f(options.g_begin,options.g_end,options.g_step)) @@ -191,7 +187,7 @@ def permute_sequence(seq): c_seq = [None] if not options.grid_with_g: g_seq = [None] - + nr_c = float(len(c_seq)) nr_g = float(len(g_seq)) i, j = 0, 0 @@ -201,20 +197,20 @@ def permute_sequence(seq): if i/nr_c < j/nr_g: # increase C resolution line = [] - for k in range(0,j): + for k in range(j): line.append((c_seq[i],g_seq[k])) - i = i + 1 + i += 1 jobs.append(line) else: # increase g resolution line = [] - for k in range(0,i): + for k in range(i): line.append((c_seq[k],g_seq[j])) - j = j + 1 + j += 1 jobs.append(line) resumed_jobs = {} - + if options.resume_pathname is None: return jobs, resumed_jobs @@ -291,8 +287,8 @@ def run_one(self,c,g): cmdline = self.get_cmd(c,g) result = Popen(cmdline,shell=True,stdout=PIPE,stderr=PIPE,stdin=PIPE).stdout for line in result.readlines(): - if str(line).find('Cross') != -1: - return float(line.split()[-1][0:-1]) + if 'Cross' in str(line): + return float(line.split()[-1][:-1]) class SSHWorker(Worker): def __init__(self,name,job_queue,result_queue,host,options): @@ -304,8 +300,8 @@ def run_one(self,c,g): (self.host,self.cwd,self.get_cmd(c,g)) result = Popen(cmdline,shell=True,stdout=PIPE,stderr=PIPE,stdin=PIPE).stdout for line in result.readlines(): - if str(line).find('Cross') != -1: - return float(line.split()[-1][0:-1]) + if 'Cross' in str(line): + return float(line.split()[-1][:-1]) class TelnetWorker(Worker): def __init__(self,name,job_queue,result_queue,host,username,password,options): @@ -325,7 +321,7 @@ def run(self): tn.read_until(self.username) # print('login ok', self.host) - tn.write('cd '+os.getcwd()+'\n') + tn.write(f'cd {os.getcwd()}' + '\n') Worker.run(self) tn.write('exit\n') def run_one(self,c,g): @@ -333,11 +329,11 @@ def run_one(self,c,g): result = self.tn.write(cmdline+'\n') (idx,matchm,output) = self.tn.expect(['Cross.*\n']) for line in output.split('\n'): - if str(line).find('Cross') != -1: - return float(line.split()[-1][0:-1]) + if 'Cross' in str(line): + return float(line.split()[-1][:-1]) def find_parameters(dataset_pathname, options=''): - + def update_param(c,g,rate,best_c,best_g,best_rate,worker,resumed): if (rate > best_rate) or (rate==best_rate and g==best_g and c