Hindi ITN Merge Telephone Semiotic Class (#344)

RajanPutty · Rajanv307 · pre-commit-ci[bot] · web-flow · commit 5e19b4e8ac58 · 2025-10-28T17:50:08.000-04:00
* feat(hi): Add Telephone class and all Hindi ITN updates Signed-off-by: RajanPutty <rputty@nvidia.com> * refactor(hi/telephone): Load digits and context from TSV files Addresses review comments on PR #344 by refactoring hardcoded variables to use data loaded from TSV files. - The `hindi_digits` and `english_digits` variables are no longer hardcoded. They are now populated by loading and creating a pynini union of their respective TSV files (`data/numbers/digit.tsv`, `data/numbers/zero.tsv`, `data/telephone/eng_digit.tsv`, `data/telephone/eng_zero.tsv`). - The hardcoded `context` dictionary has been removed. Its values are now loaded from a new `data/telephone/context_cues.tsv` file, matching the existing pattern used for `cc_cues` in the 'en' implementation. Signed-off-by: P V RAJAN <rajanv307@gmail.com> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Signed-off-by: P V RAJAN <rajanv307@gmail.com> --------- Signed-off-by: RajanPutty <rputty@nvidia.com> Signed-off-by: P V RAJAN <rajanv307@gmail.com> Co-authored-by: P V RAJAN <rajanv307@gmail.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/telephone/context_cues.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/telephone/context_cues.tsv
@@ -0,0 +1,15 @@
+mobile	नंबर
+mobile	मोबाइल
+mobile	फोन
+mobile	कॉल
+landline	नंबर
+landline	मोबाइल
+landline	फोन
+landline	लैंडलाइन
+landline	कॉल
+pincode	पिन
+pincode	कोड
+pincode	पिनकोड
+credit	नंबर
+credit	कार्ड
+credit	क्रेडिट
diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/telephone/country_codes.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/telephone/country_codes.tsv
@@ -0,0 +1 @@
+९१	नौ एक
diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/telephone/eng_digit.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/telephone/eng_digit.tsv
@@ -1,4 +1,3 @@
-०	zero
 १	one
 २	two
 ३	three
@@ -7,4 +6,4 @@
 ६	six
 ७	seven
 ८	eight
-९	nine
+९	nine
diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/telephone/eng_zero.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/telephone/eng_zero.tsv
@@ -0,0 +1 @@
+०	zero
diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/telephone.py
@@ -15,143 +15,146 @@
 import pynini
 from pynini.lib import pynutil
 
-from nemo_text_processing.inverse_text_normalization.hi.graph_utils import GraphFst, delete_space
+from nemo_text_processing.inverse_text_normalization.hi.graph_utils import (
+    NEMO_CHAR,
+    NEMO_WHITE_SPACE,
+    GraphFst,
+    delete_space,
+)
 from nemo_text_processing.inverse_text_normalization.hi.utils import get_abs_path
 
+# Word -> digit maps for zero. eng_zero.tsv rows are "<digit>\t<word>", so each
+# file is inverted to read the spoken word and emit the digit
+# (data/numbers/zero.tsv is presumably laid out the same way -- confirm).
+shunya = pynini.union(
+    pynini.string_file(get_abs_path("data/numbers/zero.tsv")).invert(),
+    pynini.string_file(get_abs_path("data/telephone/eng_zero.tsv")).invert(),
+)
+# Word -> digit maps for the remaining digits (eng_digit.tsv lists one..nine),
+# Hindi and English spellings side by side.
+digit_without_shunya = pynini.union(
+    pynini.string_file(get_abs_path("data/numbers/digit.tsv")).invert(),
+    pynini.string_file(get_abs_path("data/telephone/eng_digit.tsv")).invert(),
+)
+# Any single spoken digit, zero included.
+digit = pynini.union(digit_without_shunya, shunya)
 
-class TelephoneFst(GraphFst):
-    """
-    Finite state transducer for classifying telephone numbers, e.g.
-    e.g. प्लस इक्यानवे नौ आठ सात छह पांच चार तीन दो एक शून्य => tokens { name: "+९१ ९८७६५ ४३२१०" }
-    Args:
-        Cardinal: CardinalFst
-    """
 
-    def __init__(self, cardinal: GraphFst):
-        super().__init__(name="telephone", kind="classify")
+def get_context(keywords: list):
+    """
+    Build the context graphs that may surround a digit sequence.
+
+    Args:
+        keywords: list of FST-like cue words; they are unioned into one graph
+
+    Returns:
+        (before, after): `before` is a cue followed by up to five words, each
+        introduced by one whitespace; `after` is up to five words, each ending
+        in one whitespace, followed by a cue. Both are optimized.
+    """
+    keywords = pynini.union(*keywords)
 
-        hindi_digit_graph = pynini.string_file(get_abs_path("data/numbers/digit.tsv")).invert()
-        hindi_digit_graph |= pynini.string_file(get_abs_path("data/numbers/zero.tsv")).invert()
+    # Load Hindi digits from TSV files
+    hindi_digits = (
+        pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
+        | pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
+    ).project("output")
 
-        english_digit_graph = pynini.string_file(get_abs_path("data/telephone/eng_to_hindi_digit.tsv")).invert()
+    # Load English digits from TSV files
+    english_digits = (
+        pynini.string_file(get_abs_path("data/telephone/eng_digit.tsv"))
+        | pynini.string_file(get_abs_path("data/telephone/eng_zero.tsv"))
+    ).project("output")
 
-        country_code_graph_single_digits = pynini.string_file(get_abs_path("data/numbers/digit.tsv")).invert()
-        country_code_graph_single_digits |= pynini.string_file(get_abs_path("data/numbers/zero.tsv")).invert()
-        country_code_graph_single_digits |= pynini.string_file(
-            get_abs_path("data/telephone/eng_to_hindi_digit.tsv")
-        ).invert()
+    all_digits = hindi_digits | english_digits
 
-        country_code_graph_double_digits = pynini.string_file(get_abs_path("data/numbers/teens_and_ties.tsv")).invert()
-        country_code_graph_double_digits |= pynini.string_file(
-            get_abs_path("data/telephone/teens_and_ties_eng_to_hin.tsv")
-        ).invert()
+    # NOTE(review): the "output" side of eng_digit.tsv holds spelled-out words
+    # ("one", "two", ...). Assuming NEMO_CHAR matches one character, this
+    # difference may only be removing whitespace -- confirm what all_digits is
+    # meant to exclude.
+    non_digit_char = pynini.difference(NEMO_CHAR, pynini.union(all_digits, NEMO_WHITE_SPACE))
+    word = pynini.closure(non_digit_char, 1)
+    # The separator sits between the cue and each following word, so `before`
+    # reads "cue w1 w2"; the whitespace the generate_* callers append then joins
+    # it to the digits. With a trailing separator on each word the first word
+    # would be glued to the cue and the digits would need two whitespaces.
+    before = (keywords + pynini.closure(NEMO_WHITE_SPACE + word, 0, 5)).optimize()
+    # Here every word carries its trailing separator: "w1 w2 cue".
+    after = (pynini.closure(word + NEMO_WHITE_SPACE, 0, 5) + keywords).optimize()
 
-        self.hindi_digit = (
-            pynutil.insert("number_part: \"")
-            + pynini.closure(hindi_digit_graph + delete_space, 0, 9)
-            + hindi_digit_graph
-            + pynutil.insert("\" ")
-        )
-        self.english_digit = (
-            pynutil.insert("number_part: \"")
-            + pynini.closure(english_digit_graph + delete_space, 0, 9)
-            + english_digit_graph
-            + delete_space
-            + pynutil.insert("\" ")
-        )
+    return before, after
 
-        self.country_code_with_single_digits = (
-            pynutil.insert("country_code: \"")
-            + pynini.closure(country_code_graph_single_digits + delete_space, 0, 2)
-            + pynutil.insert("\" ")
-        )
-        self.country_code_with_double_digits = (
-            pynutil.insert("country_code: \"")
-            + pynini.closure(country_code_graph_double_digits + delete_space, 0, 1)
-            + pynutil.insert("\" ")
-        )
-        self.country_code = self.country_code_with_single_digits | self.country_code_with_double_digits
 
-        # two, three, four-digit extension code with zero
-        self.city_code_hindi = (
-            pynutil.insert("extension: \"")
-            + pynini.closure(hindi_digit_graph + delete_space, 2, 5)
-            + pynutil.insert("\" ")
-        )
-        self.city_code_english = (
-            pynutil.insert("extension: \"")
-            + pynini.closure(english_digit_graph + delete_space, 2, 5)
-            + pynutil.insert("\" ")
-        )
+def generate_context_graph(context_keywords, length):
+    """
+    Build a `number_part` tagger for a fixed-length run of spoken digits.
+
+    Args:
+        context_keywords: list of cue-word FSTs, forwarded to get_context
+        length: exact number of digits to accept
+
+    Returns:
+        An optimized FST that accepts the digits alone, a context phrase
+        followed by the digits, or the digits followed by a context phrase,
+        with the result wrapped between `number_part: "` and `" `.
+    """
+    context_before, context_after = get_context(context_keywords)
+    # length - 1 digits, each followed by delete_space, then a final digit with no trailing separator
+    digits = pynini.closure(digit + delete_space, length - 1, length - 1) + digit
 
-        self.city_extension = self.city_code_hindi | self.city_code_english
+    # One NEMO_WHITE_SPACE joins the digits to the context phrase on either side
+    graph_after_context = digits + NEMO_WHITE_SPACE + context_after
+    graph_before_context = context_before + NEMO_WHITE_SPACE + digits
+    graph_without_context = digits
 
-        # 7-digit landline graph in hindi and english digits
-        self.landline_hindi = (
-            pynutil.insert("number_part: \"")
-            + pynini.closure(hindi_digit_graph + delete_space, 7, 7)
-            + pynutil.insert("\" ")
-        )
-        self.landline_english = (
-            pynutil.insert("number_part: \"")
-            + pynini.closure(english_digit_graph + delete_space, 7, 7)
-            + pynutil.insert("\" ")
-        )
+    # The context phrase sits inside the quotes, so it ends up in the number_part value
+    return (
+        pynutil.insert("number_part: \"")
+        + (graph_before_context | graph_after_context | graph_without_context)
+        + pynutil.insert("\" ")
+    ).optimize()
 
-        self.landline = self.landline_hindi | self.landline_english
 
-        self.pincode_in_hindi = (
-            pynutil.insert("number_part: \"")
-            + pynini.closure(hindi_digit_graph + delete_space, 0, 5)
-            + hindi_digit_graph
-            + pynutil.insert("\" ")
-        )
-        self.pincode_in_english = (
-            pynutil.insert("number_part: \"")
-            + pynini.closure(english_digit_graph + delete_space, 0, 5)
-            + english_digit_graph
-            + pynutil.insert("\" ")
-        )
+def generate_pincode(context_keywords):
+    """Return the context-aware `number_part` tagger for a six-digit pincode."""
+    pincode_length = 6
+    return generate_context_graph(context_keywords, pincode_length)
 
-        self.credit_card_last_digits_hindi = (
-            pynutil.insert("number_part: \"")
-            + pynini.closure(hindi_digit_graph + delete_space, 0, 3)
-            + hindi_digit_graph
-            + pynutil.insert("\" ")
-        )
-        self.credit_card_last_digits_english = (
-            pynutil.insert("number_part: \"")
-            + pynini.closure(english_digit_graph + delete_space, 0, 3)
-            + english_digit_graph
-            + pynutil.insert("\" ")
-        )
 
-        delete_plus = pynini.union(
-            pynutil.delete("प्लस") | pynutil.delete("plus") | pynutil.delete("Plus") | pynutil.delete("PLUS")
-        )
+def generate_credit(context_keywords):
+    """Return the context-aware `number_part` tagger for a four-digit card number fragment."""
+    credit_length = 4
+    return generate_context_graph(context_keywords, credit_length)
 
-        delete_zero = pynini.union(
-            pynutil.delete("शून्य") | pynutil.delete("zero") | pynutil.delete("Zero") | pynutil.delete("ZERO")
-        )
 
-        graph_number_with_hindi_digit = (
-            delete_plus + delete_space + self.country_code + delete_space + self.hindi_digit
-        )
-        graph_number_with_english_digit = delete_plus + delete_space + self.country_code + self.english_digit
+def generate_mobile(context_keywords):
+    """
+    Build the tagger for a ten-digit mobile number with an optional country code.
+
+    Args:
+        context_keywords: list of cue-word FSTs, forwarded to get_context
+
+    Returns:
+        An optimized FST producing an optional `country_code` field followed by
+        a `number_part` field.
+    """
+    cue_before, cue_after = get_context(context_keywords)
+
+    # "प्लस" is rewritten to "+", then exactly two digits and one kept whitespace follow.
+    plus_and_code = pynini.cross("प्लस", "+") + pynini.closure(delete_space + digit, 2, 2) + NEMO_WHITE_SPACE
+    optional_cue_before = pynini.closure(cue_before + NEMO_WHITE_SPACE, 0, 1)
+    country_code_field = (
+        pynutil.insert("country_code: \"") + optional_cue_before + plus_and_code + pynutil.insert("\" ")
+    )
+
+    # Ten digits in total; the first one may not be zero.
+    ten_digits = digit_without_shunya + delete_space + pynini.closure(digit + delete_space, 8, 8) + digit
+    optional_cue_after = pynini.closure(NEMO_WHITE_SPACE + cue_after, 0, 1)
+    number_field = pynutil.insert("number_part: \"") + ten_digits + optional_cue_after + pynutil.insert("\" ")
+
+    return pynini.union(country_code_field + number_field, number_field).optimize()
+
+
+def generate_telephone(context_keywords):
+    """
+    Build the `number_part` tagger for a landline number.
+
+    The number is a spoken zero followed by ten more digits, optionally
+    preceded or followed by a context phrase from get_context.
+
+    Args:
+        context_keywords: list of cue-word FSTs, forwarded to get_context
+
+    Returns:
+        The optimized FST, matching what the other generate_* helpers return.
+    """
+    context_before, context_after = get_context(context_keywords)
+
+    # Leading zero, nine digits each followed by delete_space, and a final digit: eleven in total
+    landline = shunya + delete_space + pynini.closure(digit + delete_space, 9, 9) + digit
+    landline_with_context_before = context_before + NEMO_WHITE_SPACE + landline
+    landline_with_context_after = landline + NEMO_WHITE_SPACE + context_after
+
+    return (
+        pynutil.insert("number_part: \"")
+        + (landline | landline_with_context_before | landline_with_context_after)
+        + pynutil.insert("\" ")
+    ).optimize()
+
+
+class TelephoneFst(GraphFst):
+    """
+    Finite state transducer for classifying telephone numbers, e.g.
+    प्लस इक्यानवे नौ आठ सात छह पांच चार तीन दो एक शून्य => tokens { name: "+९१ ९८७६५ ४३२१०" }
+    NOTE(review): the example spells the country code as a single two-digit word,
+    while the mobile graph reads it as two separate digits and emits
+    country_code / number_part fields -- confirm and refresh the example.
+    Args:
+        cardinal: CardinalFst; accepted for interface compatibility, not used here
+    """
+
+    def __init__(self, cardinal: GraphFst):
+        super().__init__(name="telephone", kind="classify")
+
+        # Load context cues from TSV file; each row is "<category>\t<cue word>"
+        context_cues = pynini.string_file(get_abs_path("data/telephone/context_cues.tsv"))
+
+        # Extract keywords for each category. The category name must be an
+        # acceptor: pynutil.delete(...) outputs the empty string, which matches
+        # no row of context_cues and would leave every keyword set empty.
+        mobile_keywords = pynini.compose(pynini.accep("mobile"), context_cues).project("output").optimize()
+
+        landline_keywords = pynini.compose(pynini.accep("landline"), context_cues).project("output").optimize()
 
-        graph_landline_with_extension = delete_zero + delete_space + self.city_extension + delete_space + self.landline
+        pincode_keywords = pynini.compose(pynini.accep("pincode"), context_cues).project("output").optimize()
 
-        graph_pincode = self.pincode_in_hindi | self.pincode_in_english
+        credit_keywords = pynini.compose(pynini.accep("credit"), context_cues).project("output").optimize()
 
-        graph_credit_card_last_digits = self.credit_card_last_digits_hindi | self.credit_card_last_digits_english
+        # The generate_* functions take a list of FSTs, so wrap each keyword graph in one
+        mobile = generate_mobile([mobile_keywords])
+        landline = generate_telephone([landline_keywords])
+        pincode = generate_pincode([pincode_keywords])
+        credit = generate_credit([credit_keywords])
 
+        # Weights rank the alternatives when several match
+        # (presumably the lowest weight is preferred -- confirm)
         graph = (
-            graph_number_with_hindi_digit
-            | graph_number_with_english_digit
-            | graph_landline_with_extension
-            | graph_pincode
-            | graph_credit_card_last_digits
+            pynutil.add_weight(mobile, 0.7)
+            | pynutil.add_weight(landline, 0.8)
+            | pynutil.add_weight(credit, 0.9)
+            | pynutil.add_weight(pincode, 1)
         )
 
-        final_graph = self.add_tokens(graph)
-        self.fst = final_graph
+        self.final = graph.optimize()
+        self.fst = self.add_tokens(self.final)
diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/telephone.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/telephone.py
@@ -32,7 +32,6 @@ def __init__(self, cardinal: GraphFst):
         number_part = pynutil.delete("number_part: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")
         optional_country_code = pynini.closure(
             pynutil.delete("country_code: \"")
-            + pynutil.insert("+")
             + delete_space
             + pynini.closure(NEMO_NOT_QUOTE, 1)
             + pynutil.delete("\"")
diff --git a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_telephone.txt b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_telephone.txt
diff --git a/tests/nemo_text_processing/utils.py b/tests/nemo_text_processing/utils.py