-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathnlp_preprocess.py
More file actions
58 lines (51 loc) · 2.68 KB
/
nlp_preprocess.py
File metadata and controls
58 lines (51 loc) · 2.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import argparse
import pathlib
import subprocess
from tqdm import tqdm
# Fixed settings forwarded to the NLPipe preprocessing script on every call.
PREPROCESSING_SCRIPT = "/export/usuarios_ml4ds/lbartolome/NextProcurement/NP-Text_Object/src/preprocessing/pipe/nlpipe.py"
SOURCE_TYPE = "parquet"
SOURCE_NAME = "pliegos"
SPACY_MODEL = "es_dep_news_trf"
LANG = "es"
EMBEDDINGS_MODEL = "paraphrase-multilingual-MiniLM-L12-v2"

# Name of the staging file each input is copied to before preprocessing.
STAGING_FILENAME = "pliegos.parquet"

DEFAULT_DATA_DIR = "/export/usuarios_ml4ds/lbartolome/NextProcurement/NP-Text_Object/data/train_data/to_process"
# Alternative dataset location used previously for both --input and --output:
#   /export/usuarios_ml4ds/lbartolome/NextProcurement/sproc/place_feb_21/preprocessed


def build_nlpipe_command(source_path: str, destination_path: str) -> list:
    """Return the argument list that launches the preprocessing script.

    Parameters
    ----------
    source_path:
        Path of the file handed to the script via ``--source_path``.
    destination_path:
        Path handed to the script via ``--destination_path``.

    Returns
    -------
    list
        Arguments suitable for ``subprocess`` (no shell involved). The
        ``--do_embeddings`` and ``--no_preproc`` flags are always passed;
        their exact effect is defined by the preprocessing script itself.
    """
    return [
        "python", PREPROCESSING_SCRIPT,
        "--source_path", source_path,
        "--source_type", SOURCE_TYPE,
        "--source", SOURCE_NAME,
        "--destination_path", destination_path,
        "--lang", LANG,
        "--spacy_model", SPACY_MODEL,
        "--embeddings_model", EMBEDDINGS_MODEL,
        "--do_embeddings",
        "--no_preproc",
    ]


def main(argv=None):
    """Run the preprocessing script once for every file under ``--input``.

    Each input file is copied to a staging file named ``pliegos.parquet``
    inside the output directory, and the preprocessing script is then invoked
    with that staging file as source and ``<output>/<stem>.parquet`` as
    destination.

    Parameters
    ----------
    argv:
        Optional list of command-line arguments; ``None`` (the default) makes
        argparse read ``sys.argv``.
    """
    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", type=str, required=False,
                        help="Path to the input directory (searched recursively)",
                        default=DEFAULT_DATA_DIR)
    parser.add_argument("--output", type=str, required=False,
                        help="Path to the output directory",
                        default=DEFAULT_DATA_DIR)
    args = parser.parse_args(argv)

    print(f"Input path: {args.input}")

    input_dir = pathlib.Path(args.input)
    output_dir = pathlib.Path(args.output)

    # The staging path does not depend on the current input file, so compute
    # it once instead of deriving it by string replacement on every iteration
    # (which could also rewrite a matching directory component).
    staging_file = output_dir.joinpath(STAGING_FILENAME)

    # Snapshot the regular files up front: rglob('*') also yields directories,
    # and because the output directory may be the input directory, a lazy walk
    # could pick up files this very loop creates. A list also gives tqdm a
    # total for its progress bar.
    input_files = sorted(p for p in input_dir.rglob('*') if p.is_file())

    for el in tqdm(input_files):
        print(f"Processing {el}")
        this_out_save = output_dir.joinpath(f"{el.stem}.parquet")

        if not this_out_save.exists():
            # Copy input file to the staging file; abort this item if the copy
            # fails so a stale staging file is never preprocessed by mistake.
            try:
                subprocess.run(
                    ["cp", el.as_posix(), staging_file.as_posix()],
                    check=True,
                )
            except subprocess.CalledProcessError:
                print(f"-- -- Could not copy {el} to {staging_file}. Skipping")
                continue
        else:
            print(f"Not recreating {this_out_save} since it already exists")
            # NOTE(review): the copy is skipped here but preprocessing still
            # runs against the current staging file -- confirm whether files
            # with an existing output should be skipped entirely.

        cmd = build_nlpipe_command(
            staging_file.as_posix(), this_out_save.as_posix()
        )

        try:
            print(f'-- -- Running preprocessing command {" ".join(cmd)}')
            subprocess.check_output(cmd)
        except subprocess.CalledProcessError as e:
            print('-- -- Preprocessing failed. Revise command')
            print(e.output)
        else:
            # Only report success when the command actually succeeded.
            print("-- -- Preprocessing done")


if __name__ == '__main__':
    main()