-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathnlp_preprocess.py
More file actions
executable file
·62 lines (54 loc) · 2.57 KB
/
nlp_preprocess.py
File metadata and controls
executable file
·62 lines (54 loc) · 2.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import argparse
import pathlib
import subprocess
from tqdm import tqdm
'''
python3 nlp_preprocess.py --input /export/usuarios_ml4ds/cggamella/RAG_tool/files/anotacion_manual/fam/df_esp_first100.xlsx
--output /export/usuarios_ml4ds/cggamella/RAG_tool/data/preprocessed/manual_fam_df_esp_first100.xlsx
--path_add_acr /export/usuarios_ml4ds/cggamella/RAG_tool/topicmodelling/data/acronyms/df_esp_first100_both_equivalences.json
'''
if __name__ == '__main__':
    # Command-line wrapper around the nlpipe.py preprocessing script: parse
    # the input/output/acronym paths, assemble the child command line, run it
    # as a subprocess and report whether it succeeded.

    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", type=str, required=False, help="Path to the input file", default="/export/usuarios_ml4ds/cggamella/NP-Search-Tool/sample_data/models/Mallet/es_Mallet_df_merged_20_topics_45_ENTREGABLE/datos_modelo.parquet")
    parser.add_argument("--output", type=str, required=False, help="Path to the output file", default="/export/usuarios_ml4ds/lbartolome/Repos/repos_con_carlos/RAG_tool/data/preprocessed/optimized/cpv_45_preproc.parquet")
    parser.add_argument("--path_add_acr", type=str, required=False, help="Path to the acr file", default="/export/usuarios_ml4ds/cggamella/RAG_tool/src/topicmodeling/data/acronyms/df_esp_first100_both_equivalences.json")
    args = parser.parse_args()

    # Fixed settings forwarded to the preprocessing script.
    preprocessing_script = "/export/usuarios_ml4ds/lbartolome/Repos/repos_con_carlos/RAG_tool/src/preprocessing/pipe/nlpipe.py"
    source_path = args.input
    # NOTE(review): the source type is always "parquet", whatever the
    # extension of --input is -- confirm other input formats are not expected.
    source_type = "parquet"
    source = "cpv45"
    destination_path = args.output
    spacy_model = "es_core_news_lg"
    lang = "es"

    # Construct the command once; the acronym file is the only optional part.
    cmd = [
        "python", preprocessing_script,
        "--source_path", source_path,
        "--source_type", source_type,
        "--source", source,
        "--destination_path", destination_path,
        "--lang", lang,
        "--spacy_model", spacy_model,
    ]
    if args.path_add_acr:
        # Only forwarded when a non-empty path was given (e.g. not when the
        # caller passes --path_add_acr "").
        cmd += ["--path_add_acr", args.path_add_acr]
    cmd.append("--do_embeddings")

    print(f'-- -- Running preprocessing command {" ".join(cmd)}')
    try:
        subprocess.check_output(cmd)
    except subprocess.CalledProcessError as e:
        print('-- -- Preprocessing failed. Revise command')
        # Captured standard output of the failed child process.
        print(e.output)
        # Propagate the child's non-zero status so that calling shells and
        # pipelines can detect the failure.
        raise SystemExit(e.returncode)
    else:
        # Reported only when the child process exited successfully.
        print("-- -- Preprocessing done")