From 434755c77c44b42b2c26eec7a0f124c909016f41 Mon Sep 17 00:00:00 2001 From: Nikolay Korolev Date: Thu, 3 Dec 2020 17:11:34 +0300 Subject: [PATCH 1/3] Delete unused dependency on pattern library --- preprocess/data_process.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/preprocess/data_process.py b/preprocess/data_process.py index 43efb8b..4901372 100644 --- a/preprocess/data_process.py +++ b/preprocess/data_process.py @@ -13,7 +13,7 @@ import nltk import os import pickle -from utils import symbol_filter, re_lemma, fully_part_header, group_header, partial_header, num2year, group_symbol, group_values, group_digital +from utils import symbol_filter, fully_part_header, group_header, partial_header, num2year, group_symbol, group_values, group_digital from utils import AGG, wordnet_lemmatizer from utils import load_dataSets @@ -43,29 +43,19 @@ def process_datas(datas, args): entry['question_toks'] = question_toks table_names = [] - table_names_pattern = [] for y in entry['table_names']: x = [wordnet_lemmatizer.lemmatize(x.lower()) for x in y.split(' ')] table_names.append(" ".join(x)) - x = [re_lemma(x.lower()) for x in y.split(' ')] - table_names_pattern.append(" ".join(x)) header_toks = [] header_toks_list = [] - header_toks_pattern = [] - header_toks_list_pattern = [] - for y in entry['col_set']: x = [wordnet_lemmatizer.lemmatize(x.lower()) for x in y.split(' ')] header_toks.append(" ".join(x)) header_toks_list.append(x) - x = [re_lemma(x.lower()) for x in y.split(' ')] - header_toks_pattern.append(" ".join(x)) - header_toks_list_pattern.append(x) - num_toks = len(question_toks) idx = 0 tok_concol = [] @@ -215,5 +205,3 @@ def get_concept_result(toks, graph): with open(args.output, 'w') as f: json.dump(datas, f) - - From 04f2002e733f540a6cf457a01b00319f22810854 Mon Sep 17 00:00:00 2001 From: Nikolay Korolev Date: Thu, 3 Dec 2020 17:13:20 +0300 Subject: [PATCH 2/3] Update utils.py --- preprocess/utils.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/preprocess/utils.py b/preprocess/utils.py index 166a8e8..347f03b 100644 --- a/preprocess/utils.py +++ b/preprocess/utils.py @@ -8,9 +8,7 @@ # @File : utils.py # @Software: PyCharm """ -import os import json -from pattern.en import lemma from nltk.stem import WordNetLemmatizer VALUE_FILTER = ['what', 'how', 'list', 'give', 'show', 'find', 'id', 'order', 'when'] @@ -164,10 +162,3 @@ def check_in(list_one, list_two): if check_in(toks, heads): return heads return None - -def re_lemma(string): - lema = lemma(string.lower()) - if len(lema) > 0: - return lema - else: - return string.lower() From e5c4de21c2f1feea1985b0143a4ff9e32c6f632e Mon Sep 17 00:00:00 2001 From: Nikolay Korolev Date: Thu, 3 Dec 2020 17:19:05 +0300 Subject: [PATCH 3/3] Update requirements.txt --- requirements.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index a0e23ba..83ebd68 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,6 @@ # Licensed under the MIT license. nltk==3.4 -pattern numpy==1.14.0 pytorch-pretrained-bert==0.5.1 -tqdm==4.31.1 \ No newline at end of file +tqdm==4.31.1