Skip to content

Commit 5e19b4e

Browse files
RajanPutty, Rajanv307, and pre-commit-ci[bot]
authored
Hindi ITN Merge Telephone Semiotic Class (#344)
* feat(hi): Add Telephone class and all Hindi ITN updates

  Signed-off-by: RajanPutty <rputty@nvidia.com>

* refactor(hi/telephone): Load digits and context from TSV files

  Addresses review comments on PR #344 by refactoring hardcoded variables to use data loaded from TSV files.

  - The `hindi_digits` and `english_digits` variables are no longer hardcoded. They are now populated by loading and creating a pynini union of their respective TSV files (`data/numbers/digit.tsv`, `data/numbers/zero.tsv`, `data/telephone/eng_digit.tsv`, `data/telephone/eng_zero.tsv`).
  - The hardcoded `context` dictionary has been removed. Its values are now loaded from a new `data/telephone/context_cues.tsv` file, matching the existing pattern used for `cc_cues` in the 'en' implementation.

  Signed-off-by: P V RAJAN <rajanv307@gmail.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

  for more information, see https://pre-commit.ci

  Signed-off-by: P V RAJAN <rajanv307@gmail.com>

---------

Signed-off-by: RajanPutty <rputty@nvidia.com>
Signed-off-by: P V RAJAN <rajanv307@gmail.com>
Co-authored-by: P V RAJAN <rajanv307@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 9aa050b commit 5e19b4e

File tree

8 files changed

+165
-156
lines changed

8 files changed

+165
-156
lines changed
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
mobile नंबर
2+
mobile मोबाइल
3+
mobile फोन
4+
mobile कॉल
5+
landline नंबर
6+
landline मोबाइल
7+
landline फोन
8+
landline लैंडलाइन
9+
landline कॉल
10+
pincode पिन
11+
pincode कोड
12+
pincode पिनकोड
13+
credit नंबर
14+
credit कार्ड
15+
credit क्रेडिट
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
९१ नौ एक

nemo_text_processing/inverse_text_normalization/hi/data/telephone/eng_to_hindi_digit.tsv renamed to nemo_text_processing/inverse_text_normalization/hi/data/telephone/eng_digit.tsv

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
zero
21
one
32
two
43
three
@@ -7,4 +6,4 @@
76
six
87
seven
98
eight
10-
nine
9+
nine
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
zero

nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py

Lines changed: 117 additions & 114 deletions
Original file line numberDiff line numberDiff line change
@@ -15,143 +15,146 @@
1515
import pynini
1616
from pynini.lib import pynutil
1717

18-
from nemo_text_processing.inverse_text_normalization.hi.graph_utils import GraphFst, delete_space
18+
from nemo_text_processing.inverse_text_normalization.hi.graph_utils import (
19+
NEMO_CHAR,
20+
NEMO_WHITE_SPACE,
21+
GraphFst,
22+
delete_space,
23+
)
1924
from nemo_text_processing.inverse_text_normalization.hi.utils import get_abs_path
2025

26+
shunya = (
27+
pynini.string_file(get_abs_path("data/numbers/zero.tsv")).invert()
28+
| pynini.string_file(get_abs_path("data/telephone/eng_zero.tsv")).invert()
29+
)
30+
digit_without_shunya = (
31+
pynini.string_file(get_abs_path("data/numbers/digit.tsv")).invert()
32+
| pynini.string_file(get_abs_path("data/telephone/eng_digit.tsv")).invert()
33+
)
34+
digit = digit_without_shunya | shunya
2135

22-
class TelephoneFst(GraphFst):
23-
"""
24-
Finite state transducer for classifying telephone numbers, e.g.
25-
e.g. प्लस इक्यानवे नौ आठ सात छह पांच चार तीन दो एक शून्य => tokens { name: "+९१ ९८७६५ ४३२१०" }
26-
Args:
27-
Cardinal: CardinalFst
28-
"""
2936

30-
def __init__(self, cardinal: GraphFst):
31-
super().__init__(name="telephone", kind="classify")
37+
def get_context(keywords: list):
38+
keywords = pynini.union(*keywords)
3239

33-
hindi_digit_graph = pynini.string_file(get_abs_path("data/numbers/digit.tsv")).invert()
34-
hindi_digit_graph |= pynini.string_file(get_abs_path("data/numbers/zero.tsv")).invert()
40+
# Load Hindi digits from TSV files
41+
hindi_digits = (
42+
pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
43+
| pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
44+
).project("output")
3545

36-
english_digit_graph = pynini.string_file(get_abs_path("data/telephone/eng_to_hindi_digit.tsv")).invert()
46+
# Load English digits from TSV files
47+
english_digits = (
48+
pynini.string_file(get_abs_path("data/telephone/eng_digit.tsv"))
49+
| pynini.string_file(get_abs_path("data/telephone/eng_zero.tsv"))
50+
).project("output")
3751

38-
country_code_graph_single_digits = pynini.string_file(get_abs_path("data/numbers/digit.tsv")).invert()
39-
country_code_graph_single_digits |= pynini.string_file(get_abs_path("data/numbers/zero.tsv")).invert()
40-
country_code_graph_single_digits |= pynini.string_file(
41-
get_abs_path("data/telephone/eng_to_hindi_digit.tsv")
42-
).invert()
52+
all_digits = hindi_digits | english_digits
4353

44-
country_code_graph_double_digits = pynini.string_file(get_abs_path("data/numbers/teens_and_ties.tsv")).invert()
45-
country_code_graph_double_digits |= pynini.string_file(
46-
get_abs_path("data/telephone/teens_and_ties_eng_to_hin.tsv")
47-
).invert()
54+
non_digit_char = pynini.difference(NEMO_CHAR, pynini.union(all_digits, NEMO_WHITE_SPACE))
55+
word = pynini.closure(non_digit_char, 1) + NEMO_WHITE_SPACE
56+
window = pynini.closure(word, 0, 5)
57+
before = (keywords + window).optimize()
58+
after = (window + keywords).optimize()
4859

49-
self.hindi_digit = (
50-
pynutil.insert("number_part: \"")
51-
+ pynini.closure(hindi_digit_graph + delete_space, 0, 9)
52-
+ hindi_digit_graph
53-
+ pynutil.insert("\" ")
54-
)
55-
self.english_digit = (
56-
pynutil.insert("number_part: \"")
57-
+ pynini.closure(english_digit_graph + delete_space, 0, 9)
58-
+ english_digit_graph
59-
+ delete_space
60-
+ pynutil.insert("\" ")
61-
)
60+
return before, after
6261

63-
self.country_code_with_single_digits = (
64-
pynutil.insert("country_code: \"")
65-
+ pynini.closure(country_code_graph_single_digits + delete_space, 0, 2)
66-
+ pynutil.insert("\" ")
67-
)
68-
self.country_code_with_double_digits = (
69-
pynutil.insert("country_code: \"")
70-
+ pynini.closure(country_code_graph_double_digits + delete_space, 0, 1)
71-
+ pynutil.insert("\" ")
72-
)
73-
self.country_code = self.country_code_with_single_digits | self.country_code_with_double_digits
7462

75-
# two, three, four-digit extension code with zero
76-
self.city_code_hindi = (
77-
pynutil.insert("extension: \"")
78-
+ pynini.closure(hindi_digit_graph + delete_space, 2, 5)
79-
+ pynutil.insert("\" ")
80-
)
81-
self.city_code_english = (
82-
pynutil.insert("extension: \"")
83-
+ pynini.closure(english_digit_graph + delete_space, 2, 5)
84-
+ pynutil.insert("\" ")
85-
)
63+
def generate_context_graph(context_keywords, length):
64+
context_before, context_after = get_context(context_keywords)
65+
digits = pynini.closure(digit + delete_space, length - 1, length - 1) + digit
8666

87-
self.city_extension = self.city_code_hindi | self.city_code_english
67+
graph_after_context = digits + NEMO_WHITE_SPACE + context_after
68+
graph_before_context = context_before + NEMO_WHITE_SPACE + digits
69+
graph_without_context = digits
8870

89-
# 7-digit landline graph in hindi and english digits
90-
self.landline_hindi = (
91-
pynutil.insert("number_part: \"")
92-
+ pynini.closure(hindi_digit_graph + delete_space, 7, 7)
93-
+ pynutil.insert("\" ")
94-
)
95-
self.landline_english = (
96-
pynutil.insert("number_part: \"")
97-
+ pynini.closure(english_digit_graph + delete_space, 7, 7)
98-
+ pynutil.insert("\" ")
99-
)
71+
return (
72+
pynutil.insert("number_part: \"")
73+
+ (graph_before_context | graph_after_context | graph_without_context)
74+
+ pynutil.insert("\" ")
75+
).optimize()
10076

101-
self.landline = self.landline_hindi | self.landline_english
10277

103-
self.pincode_in_hindi = (
104-
pynutil.insert("number_part: \"")
105-
+ pynini.closure(hindi_digit_graph + delete_space, 0, 5)
106-
+ hindi_digit_graph
107-
+ pynutil.insert("\" ")
108-
)
109-
self.pincode_in_english = (
110-
pynutil.insert("number_part: \"")
111-
+ pynini.closure(english_digit_graph + delete_space, 0, 5)
112-
+ english_digit_graph
113-
+ pynutil.insert("\" ")
114-
)
78+
def generate_pincode(context_keywords):
79+
return generate_context_graph(context_keywords, 6)
11580

116-
self.credit_card_last_digits_hindi = (
117-
pynutil.insert("number_part: \"")
118-
+ pynini.closure(hindi_digit_graph + delete_space, 0, 3)
119-
+ hindi_digit_graph
120-
+ pynutil.insert("\" ")
121-
)
122-
self.credit_card_last_digits_english = (
123-
pynutil.insert("number_part: \"")
124-
+ pynini.closure(english_digit_graph + delete_space, 0, 3)
125-
+ english_digit_graph
126-
+ pynutil.insert("\" ")
127-
)
12881

129-
delete_plus = pynini.union(
130-
pynutil.delete("प्लस") | pynutil.delete("plus") | pynutil.delete("Plus") | pynutil.delete("PLUS")
131-
)
82+
def generate_credit(context_keywords):
83+
return generate_context_graph(context_keywords, 4)
13284

133-
delete_zero = pynini.union(
134-
pynutil.delete("शून्य") | pynutil.delete("zero") | pynutil.delete("Zero") | pynutil.delete("ZERO")
135-
)
13685

137-
graph_number_with_hindi_digit = (
138-
delete_plus + delete_space + self.country_code + delete_space + self.hindi_digit
139-
)
140-
graph_number_with_english_digit = delete_plus + delete_space + self.country_code + self.english_digit
86+
def generate_mobile(context_keywords):
87+
context_before, context_after = get_context(context_keywords)
88+
89+
country_code = pynini.cross("प्लस", "+") + pynini.closure(delete_space + digit, 2, 2) + NEMO_WHITE_SPACE
90+
graph_country_code = (
91+
pynutil.insert("country_code: \"")
92+
+ (context_before + NEMO_WHITE_SPACE) ** (0, 1)
93+
+ country_code
94+
+ pynutil.insert("\" ")
95+
)
96+
97+
number_part = digit_without_shunya + delete_space + pynini.closure(digit + delete_space, 8, 8) + digit
98+
graph_number = (
99+
pynutil.insert("number_part: \"")
100+
+ number_part
101+
+ pynini.closure(NEMO_WHITE_SPACE + context_after, 0, 1)
102+
+ pynutil.insert("\" ")
103+
)
104+
105+
graph = (graph_country_code + graph_number) | graph_number
106+
return graph.optimize()
107+
108+
109+
def generate_telephone(context_keywords):
110+
context_before, context_after = get_context(context_keywords)
111+
112+
landline = shunya + delete_space + pynini.closure(digit + delete_space, 9, 9) + digit
113+
landline_with_context_before = context_before + NEMO_WHITE_SPACE + landline
114+
landline_with_context_after = landline + NEMO_WHITE_SPACE + context_after
115+
116+
return (
117+
pynutil.insert("number_part: \"")
118+
+ (landline | landline_with_context_before | landline_with_context_after)
119+
+ pynutil.insert("\" ")
120+
)
121+
122+
123+
class TelephoneFst(GraphFst):
124+
"""
125+
Finite state transducer for classifying telephone numbers, e.g.
126+
प्लस इक्यानवे नौ आठ सात छह पांच चार तीन दो एक शून्य => tokens { name: "+९१ ९८७६५ ४३२१०" }
127+
Args:
128+
Cardinal: CardinalFst
129+
"""
130+
131+
def __init__(self, cardinal: GraphFst):
132+
super().__init__(name="telephone", kind="classify")
133+
134+
# Load context cues from TSV file
135+
context_cues = pynini.string_file(get_abs_path("data/telephone/context_cues.tsv"))
136+
137+
# Extract keywords for each category
138+
mobile_keywords = pynini.compose(pynutil.delete("mobile"), context_cues).project("output").optimize()
139+
140+
landline_keywords = pynini.compose(pynutil.delete("landline"), context_cues).project("output").optimize()
141141

142-
graph_landline_with_extension = delete_zero + delete_space + self.city_extension + delete_space + self.landline
142+
pincode_keywords = pynini.compose(pynutil.delete("pincode"), context_cues).project("output").optimize()
143143

144-
graph_pincode = self.pincode_in_hindi | self.pincode_in_english
144+
credit_keywords = pynini.compose(pynutil.delete("credit"), context_cues).project("output").optimize()
145145

146-
graph_credit_card_last_digits = self.credit_card_last_digits_hindi | self.credit_card_last_digits_english
146+
# Build the graph for each category; each keyword FST is wrapped in a one-element list because get_context() unions its list argument
147+
mobile = generate_mobile([mobile_keywords])
148+
landline = generate_telephone([landline_keywords])
149+
pincode = generate_pincode([pincode_keywords])
150+
credit = generate_credit([credit_keywords])
147151

148152
graph = (
149-
graph_number_with_hindi_digit
150-
| graph_number_with_english_digit
151-
| graph_landline_with_extension
152-
| graph_pincode
153-
| graph_credit_card_last_digits
153+
pynutil.add_weight(mobile, 0.7)
154+
| pynutil.add_weight(landline, 0.8)
155+
| pynutil.add_weight(credit, 0.9)
156+
| pynutil.add_weight(pincode, 1)
154157
)
155158

156-
final_graph = self.add_tokens(graph)
157-
self.fst = final_graph
159+
self.final = graph.optimize()
160+
self.fst = self.add_tokens(self.final)

nemo_text_processing/inverse_text_normalization/hi/verbalizers/telephone.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@ def __init__(self, cardinal: GraphFst):
3232
number_part = pynutil.delete("number_part: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
3333
optional_country_code = pynini.closure(
3434
pynutil.delete("country_code: \"")
35-
+ pynutil.insert("+")
3635
+ delete_space
3736
+ pynini.closure(NEMO_NOT_QUOTE, 1)
3837
+ pynutil.delete("\"")

0 commit comments

Comments
 (0)