-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathtext_normalization.py
More file actions
54 lines (44 loc) · 1.71 KB
/
text_normalization.py
File metadata and controls
54 lines (44 loc) · 1.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
from pynini.lib import pynutil
import pynini
import nemo_text_processing
from pynini.lib import pynutil
import os
import argparse
# Command-line interface: a single optional -t/--text string argument.
parser = argparse.ArgumentParser(
    description='Normalize and print a text argument provided by the user.',
)
parser.add_argument(
    '-t',
    '--text',
    type=str,
    help='The text to normalize.',
)
# Parsed as soon as the module runs; args.text is None when the flag is omitted
# (argparse's default for an optional argument without an explicit default).
args = parser.parse_args()
import re
def normalize_spaces(text):
    """Collapse every run of whitespace in ``text`` into one space character.

    The pattern matches any whitespace (spaces, tabs, newlines), not only the
    space character. Leading and trailing runs are reduced to a single space
    rather than stripped.

    Args:
        text: The string to clean up.

    Returns:
        A new string in which each whitespace run is replaced by ``' '``.
    """
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text)
def compress_consecutive_spaced_na(text):
    """Replace each run of two or more space-separated "na" tokens with " na ".

    A run is one or more ``"na "`` followed by a final ``"na"``. It must start
    at a word boundary (or right after whitespace) and end before whitespace
    or at the end of the string. A lone "na", or an "na" embedded in a longer
    word, is left alone.

    The replacement carries its own surrounding spaces, so the result may
    contain doubled, leading or trailing spaces; collapse whitespace afterwards
    if single spacing is required.

    Args:
        text: The string to scan.

    Returns:
        ``text`` with every matching run replaced by ``' na '``.
    """
    repeated_na = re.compile(r"(?:\b|(?<=\s))(na )+na(?=\s|$)")
    return repeated_na.sub(' na ', text)
def apply_fst(text, fst):
    """Run ``text`` through ``fst`` and return the shortest-path output string.

    The input is combined with the transducer using the ``@`` operator, the
    shortest path of the result is taken, and that path is read back as a
    string.

    Args:
        text: The input to push through the transducer.
        fst: The pynini FST to apply.

    Returns:
        The output string, or ``None`` when pynini raises ``FstOpError``
        (in which case an error line is printed to stdout instead).
    """
    try:
        lattice = text @ fst
        best_path = pynini.shortestpath(lattice)
        return best_path.string()
    except pynini.FstOpError:
        print(f"Error: no valid output with given input: '{text}'")
        # Failure is signalled by None rather than by re-raising.
        return None
# Directory containing the compiled grammar archives; "." is resolved against
# the current working directory.
PATH = "."
text = args.text

# Tagging grammar: rule "TOKENIZE_AND_CLASSIFY" from tokenize_and_classify.far.
classify_far_file = os.path.join(PATH, "tokenize_and_classify.far")
classify = pynini.Far(classify_far_file, mode="r")["TOKENIZE_AND_CLASSIFY"]

# Verbalization grammar: rule "ALL" from verbalize.far.
verbalize_far_file = os.path.join(PATH, "verbalize.far")
verbalize = pynini.Far(verbalize_far_file, mode="r")["ALL"]

# Pipeline: classify -> verbalize -> collapse whitespace -> compress repeated
# "na" -> collapse whitespace again (the "na" replacement adds padding spaces).
try:
    classes = apply_fst(text, classify)
    verbalized_text = apply_fst(classes, verbalize)
    normalized_text = normalize_spaces(verbalized_text)
    compressed_text = compress_consecutive_spaced_na(normalized_text)
    print(normalize_spaces(compressed_text))
except Exception:
    # Any failure in the pipeline is reported together with the raw input.
    # NOTE(review): apply_fst returns None on FstOpError; presumably a later
    # stage then raises on that None and lands here - confirm.
    print("Failed to process the string........")
    print(text)