|
15 | 15 | import pynini |
16 | 16 | from pynini.lib import pynutil |
17 | 17 |
|
18 | | -from nemo_text_processing.inverse_text_normalization.hi.graph_utils import GraphFst, delete_space |
| 18 | +from nemo_text_processing.inverse_text_normalization.hi.graph_utils import ( |
| 19 | + NEMO_CHAR, |
| 20 | + NEMO_WHITE_SPACE, |
| 21 | + GraphFst, |
| 22 | + delete_space, |
| 23 | +) |
19 | 24 | from nemo_text_processing.inverse_text_normalization.hi.utils import get_abs_path |
20 | 25 |
|
| 26 | +shunya = ( |
| 27 | + pynini.string_file(get_abs_path("data/numbers/zero.tsv")).invert() |
| 28 | + | pynini.string_file(get_abs_path("data/telephone/eng_zero.tsv")).invert() |
| 29 | +) |
| 30 | +digit_without_shunya = ( |
| 31 | + pynini.string_file(get_abs_path("data/numbers/digit.tsv")).invert() |
| 32 | + | pynini.string_file(get_abs_path("data/telephone/eng_digit.tsv")).invert() |
| 33 | +) |
| 34 | +digit = digit_without_shunya | shunya |
21 | 35 |
|
22 | | -class TelephoneFst(GraphFst): |
23 | | - """ |
24 | | - Finite state transducer for classifying telephone numbers, e.g. |
25 | | - e.g. प्लस इक्यानवे नौ आठ सात छह पांच चार तीन दो एक शून्य => tokens { name: "+९१ ९८७६५ ४३२१०" } |
26 | | - Args: |
27 | | - Cardinal: CardinalFst |
28 | | - """ |
29 | 36 |
|
30 | | - def __init__(self, cardinal: GraphFst): |
31 | | - super().__init__(name="telephone", kind="classify") |
| 37 | +def get_context(keywords: list): |
| 38 | + keywords = pynini.union(*keywords) |
32 | 39 |
|
33 | | - hindi_digit_graph = pynini.string_file(get_abs_path("data/numbers/digit.tsv")).invert() |
34 | | - hindi_digit_graph |= pynini.string_file(get_abs_path("data/numbers/zero.tsv")).invert() |
| 40 | + # Load Hindi digits from TSV files |
| 41 | + hindi_digits = ( |
| 42 | + pynini.string_file(get_abs_path("data/numbers/digit.tsv")) |
| 43 | + | pynini.string_file(get_abs_path("data/numbers/zero.tsv")) |
| 44 | + ).project("output") |
35 | 45 |
|
36 | | - english_digit_graph = pynini.string_file(get_abs_path("data/telephone/eng_to_hindi_digit.tsv")).invert() |
| 46 | + # Load English digits from TSV files |
| 47 | + english_digits = ( |
| 48 | + pynini.string_file(get_abs_path("data/telephone/eng_digit.tsv")) |
| 49 | + | pynini.string_file(get_abs_path("data/telephone/eng_zero.tsv")) |
| 50 | + ).project("output") |
37 | 51 |
|
38 | | - country_code_graph_single_digits = pynini.string_file(get_abs_path("data/numbers/digit.tsv")).invert() |
39 | | - country_code_graph_single_digits |= pynini.string_file(get_abs_path("data/numbers/zero.tsv")).invert() |
40 | | - country_code_graph_single_digits |= pynini.string_file( |
41 | | - get_abs_path("data/telephone/eng_to_hindi_digit.tsv") |
42 | | - ).invert() |
| 52 | + all_digits = hindi_digits | english_digits |
43 | 53 |
|
44 | | - country_code_graph_double_digits = pynini.string_file(get_abs_path("data/numbers/teens_and_ties.tsv")).invert() |
45 | | - country_code_graph_double_digits |= pynini.string_file( |
46 | | - get_abs_path("data/telephone/teens_and_ties_eng_to_hin.tsv") |
47 | | - ).invert() |
| 54 | + non_digit_char = pynini.difference(NEMO_CHAR, pynini.union(all_digits, NEMO_WHITE_SPACE)) |
| 55 | + word = pynini.closure(non_digit_char, 1) + NEMO_WHITE_SPACE |
| 56 | + window = pynini.closure(word, 0, 5) |
| 57 | + before = (keywords + window).optimize() |
| 58 | + after = (window + keywords).optimize() |
48 | 59 |
|
49 | | - self.hindi_digit = ( |
50 | | - pynutil.insert("number_part: \"") |
51 | | - + pynini.closure(hindi_digit_graph + delete_space, 0, 9) |
52 | | - + hindi_digit_graph |
53 | | - + pynutil.insert("\" ") |
54 | | - ) |
55 | | - self.english_digit = ( |
56 | | - pynutil.insert("number_part: \"") |
57 | | - + pynini.closure(english_digit_graph + delete_space, 0, 9) |
58 | | - + english_digit_graph |
59 | | - + delete_space |
60 | | - + pynutil.insert("\" ") |
61 | | - ) |
| 60 | + return before, after |
62 | 61 |
|
63 | | - self.country_code_with_single_digits = ( |
64 | | - pynutil.insert("country_code: \"") |
65 | | - + pynini.closure(country_code_graph_single_digits + delete_space, 0, 2) |
66 | | - + pynutil.insert("\" ") |
67 | | - ) |
68 | | - self.country_code_with_double_digits = ( |
69 | | - pynutil.insert("country_code: \"") |
70 | | - + pynini.closure(country_code_graph_double_digits + delete_space, 0, 1) |
71 | | - + pynutil.insert("\" ") |
72 | | - ) |
73 | | - self.country_code = self.country_code_with_single_digits | self.country_code_with_double_digits |
74 | 62 |
|
75 | | - # two, three, four-digit extension code with zero |
76 | | - self.city_code_hindi = ( |
77 | | - pynutil.insert("extension: \"") |
78 | | - + pynini.closure(hindi_digit_graph + delete_space, 2, 5) |
79 | | - + pynutil.insert("\" ") |
80 | | - ) |
81 | | - self.city_code_english = ( |
82 | | - pynutil.insert("extension: \"") |
83 | | - + pynini.closure(english_digit_graph + delete_space, 2, 5) |
84 | | - + pynutil.insert("\" ") |
85 | | - ) |
| 63 | +def generate_context_graph(context_keywords, length): |
| 64 | + context_before, context_after = get_context(context_keywords) |
| 65 | + digits = pynini.closure(digit + delete_space, length - 1, length - 1) + digit |
86 | 66 |
|
87 | | - self.city_extension = self.city_code_hindi | self.city_code_english |
| 67 | + graph_after_context = digits + NEMO_WHITE_SPACE + context_after |
| 68 | + graph_before_context = context_before + NEMO_WHITE_SPACE + digits |
| 69 | + graph_without_context = digits |
88 | 70 |
|
89 | | - # 7-digit landline graph in hindi and english digits |
90 | | - self.landline_hindi = ( |
91 | | - pynutil.insert("number_part: \"") |
92 | | - + pynini.closure(hindi_digit_graph + delete_space, 7, 7) |
93 | | - + pynutil.insert("\" ") |
94 | | - ) |
95 | | - self.landline_english = ( |
96 | | - pynutil.insert("number_part: \"") |
97 | | - + pynini.closure(english_digit_graph + delete_space, 7, 7) |
98 | | - + pynutil.insert("\" ") |
99 | | - ) |
| 71 | + return ( |
| 72 | + pynutil.insert("number_part: \"") |
| 73 | + + (graph_before_context | graph_after_context | graph_without_context) |
| 74 | + + pynutil.insert("\" ") |
| 75 | + ).optimize() |
100 | 76 |
|
101 | | - self.landline = self.landline_hindi | self.landline_english |
102 | 77 |
|
103 | | - self.pincode_in_hindi = ( |
104 | | - pynutil.insert("number_part: \"") |
105 | | - + pynini.closure(hindi_digit_graph + delete_space, 0, 5) |
106 | | - + hindi_digit_graph |
107 | | - + pynutil.insert("\" ") |
108 | | - ) |
109 | | - self.pincode_in_english = ( |
110 | | - pynutil.insert("number_part: \"") |
111 | | - + pynini.closure(english_digit_graph + delete_space, 0, 5) |
112 | | - + english_digit_graph |
113 | | - + pynutil.insert("\" ") |
114 | | - ) |
| 78 | +def generate_pincode(context_keywords): |
| 79 | + return generate_context_graph(context_keywords, 6) |
115 | 80 |
|
116 | | - self.credit_card_last_digits_hindi = ( |
117 | | - pynutil.insert("number_part: \"") |
118 | | - + pynini.closure(hindi_digit_graph + delete_space, 0, 3) |
119 | | - + hindi_digit_graph |
120 | | - + pynutil.insert("\" ") |
121 | | - ) |
122 | | - self.credit_card_last_digits_english = ( |
123 | | - pynutil.insert("number_part: \"") |
124 | | - + pynini.closure(english_digit_graph + delete_space, 0, 3) |
125 | | - + english_digit_graph |
126 | | - + pynutil.insert("\" ") |
127 | | - ) |
128 | 81 |
|
129 | | - delete_plus = pynini.union( |
130 | | - pynutil.delete("प्लस") | pynutil.delete("plus") | pynutil.delete("Plus") | pynutil.delete("PLUS") |
131 | | - ) |
| 82 | +def generate_credit(context_keywords): |
| 83 | + return generate_context_graph(context_keywords, 4) |
132 | 84 |
|
133 | | - delete_zero = pynini.union( |
134 | | - pynutil.delete("शून्य") | pynutil.delete("zero") | pynutil.delete("Zero") | pynutil.delete("ZERO") |
135 | | - ) |
136 | 85 |
|
137 | | - graph_number_with_hindi_digit = ( |
138 | | - delete_plus + delete_space + self.country_code + delete_space + self.hindi_digit |
139 | | - ) |
140 | | - graph_number_with_english_digit = delete_plus + delete_space + self.country_code + self.english_digit |
| 86 | +def generate_mobile(context_keywords): |
| 87 | + context_before, context_after = get_context(context_keywords) |
| 88 | + |
| 89 | + country_code = pynini.cross("प्लस", "+") + pynini.closure(delete_space + digit, 2, 2) + NEMO_WHITE_SPACE |
| 90 | + graph_country_code = ( |
| 91 | + pynutil.insert("country_code: \"") |
| 92 | + + (context_before + NEMO_WHITE_SPACE) ** (0, 1) |
| 93 | + + country_code |
| 94 | + + pynutil.insert("\" ") |
| 95 | + ) |
| 96 | + |
| 97 | + number_part = digit_without_shunya + delete_space + pynini.closure(digit + delete_space, 8, 8) + digit |
| 98 | + graph_number = ( |
| 99 | + pynutil.insert("number_part: \"") |
| 100 | + + number_part |
| 101 | + + pynini.closure(NEMO_WHITE_SPACE + context_after, 0, 1) |
| 102 | + + pynutil.insert("\" ") |
| 103 | + ) |
| 104 | + |
| 105 | + graph = (graph_country_code + graph_number) | graph_number |
| 106 | + return graph.optimize() |
| 107 | + |
| 108 | + |
| 109 | +def generate_telephone(context_keywords): |
| 110 | + context_before, context_after = get_context(context_keywords) |
| 111 | + |
| 112 | + landline = shunya + delete_space + pynini.closure(digit + delete_space, 9, 9) + digit |
| 113 | + landline_with_context_before = context_before + NEMO_WHITE_SPACE + landline |
| 114 | + landline_with_context_after = landline + NEMO_WHITE_SPACE + context_after |
| 115 | + |
| 116 | + return ( |
| 117 | + pynutil.insert("number_part: \"") |
| 118 | + + (landline | landline_with_context_before | landline_with_context_after) |
| 119 | + + pynutil.insert("\" ") |
| 120 | + ) |
| 121 | + |
| 122 | + |
| 123 | +class TelephoneFst(GraphFst): |
| 124 | + """ |
| 125 | + Finite state transducer for classifying telephone numbers, e.g. |
| 126 | + प्लस इक्यानवे नौ आठ सात छह पांच चार तीन दो एक शून्य => tokens { name: "+९१ ९८७६५ ४३२१०" } |
| 127 | + Args: |
| 128 | + cardinal: CardinalFst |
| 129 | + """ |
| 130 | + |
| 131 | + def __init__(self, cardinal: GraphFst): |
| 132 | + super().__init__(name="telephone", kind="classify") |
| 133 | + |
| 134 | + # Load context cues from TSV file |
| 135 | + context_cues = pynini.string_file(get_abs_path("data/telephone/context_cues.tsv")) |
| 136 | + |
| 137 | + # Extract keywords for each category |
| 138 | + mobile_keywords = pynini.compose(pynutil.delete("mobile"), context_cues).project("output").optimize() |
| 139 | + |
| 140 | + landline_keywords = pynini.compose(pynutil.delete("landline"), context_cues).project("output").optimize() |
141 | 141 |
|
142 | | - graph_landline_with_extension = delete_zero + delete_space + self.city_extension + delete_space + self.landline |
| 142 | + pincode_keywords = pynini.compose(pynutil.delete("pincode"), context_cues).project("output").optimize() |
143 | 143 |
|
144 | | - graph_pincode = self.pincode_in_hindi | self.pincode_in_english |
| 144 | + credit_keywords = pynini.compose(pynutil.delete("credit"), context_cues).project("output").optimize() |
145 | 145 |
|
146 | | - graph_credit_card_last_digits = self.credit_card_last_digits_hindi | self.credit_card_last_digits_english |
| 146 | + # Wrap each keyword FST in a single-element list, the form the generate_* functions take |
| 147 | + mobile = generate_mobile([mobile_keywords]) |
| 148 | + landline = generate_telephone([landline_keywords]) |
| 149 | + pincode = generate_pincode([pincode_keywords]) |
| 150 | + credit = generate_credit([credit_keywords]) |
147 | 151 |
|
148 | 152 | graph = ( |
149 | | - graph_number_with_hindi_digit |
150 | | - | graph_number_with_english_digit |
151 | | - | graph_landline_with_extension |
152 | | - | graph_pincode |
153 | | - | graph_credit_card_last_digits |
| 153 | + pynutil.add_weight(mobile, 0.7) |
| 154 | + | pynutil.add_weight(landline, 0.8) |
| 155 | + | pynutil.add_weight(credit, 0.9) |
| 156 | + | pynutil.add_weight(pincode, 1) |
154 | 157 | ) |
155 | 158 |
|
156 | | - final_graph = self.add_tokens(graph) |
157 | | - self.fst = final_graph |
| 159 | + self.final = graph.optimize() |
| 160 | + self.fst = self.add_tokens(self.final) |
0 commit comments