class SearchRegex(BaseParallelProcessor):
    """Flags data entries whose text matches at least one regex pattern.

    Every pattern is tried against the text with ``re.search``, so a match
    anywhere in the string counts. The patterns are OR-ed together: a single
    matching pattern is enough to mark the entry.

    Args:
        search_patterns (list[str]): List of regex patterns to search for.
        text_key (str): Key in the data entry containing the text to search.
            Defaults to "text".
        output_key (str): Key in the data entry under which the boolean
            result is stored (``True`` if any pattern has been found).
            Defaults to "pattern_found".

    Returns:
        The same data as in the input manifest with an additional boolean
        field stored under ``output_key``.
    """

    def __init__(
        self,
        search_patterns: List[str],
        text_key: str = "text",
        output_key: str = "pattern_found",
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.search_patterns = search_patterns
        self.text_key = text_key
        self.output_key = output_key

    def process_dataset_entry(self, data_entry) -> List:
        """Searches the entry's text and records whether any pattern matched.

        The entry is updated in place: ``data_entry[self.output_key]`` is set
        to the boolean result. The same boolean is passed on as the entry's
        metric so that ``finalize`` can count matching entries.
        """
        text_in = data_entry[self.text_key]

        # any() stops at the first matching pattern; only the overall boolean
        # is needed, so per-pattern results are not collected.
        pattern_found = any(re.search(pattern, text_in) for pattern in self.search_patterns)

        data_entry[self.output_key] = pattern_found

        return [DataEntry(data=data_entry, metrics=pattern_found)]

    def finalize(self, metrics):
        """Reports counts of how many data entries contained patterns."""
        # Each metric is the per-entry boolean, so the sum is the match count.
        found_count = sum(metrics)

        # Use the module logger, like the other processors in this file,
        # instead of printing straight to stdout.
        logger.info(f"Samples amount which contain patterns: {found_count}")
        logger.info(f"Samples amount which don't contain patterns: {len(metrics) - found_count}")

        super().finalize(metrics)
# SearchRegex: cover both outcomes of the search. The pattern matches any run
# of characters that are neither ASCII letters nor whitespace.
test_params_list.extend(
    [
        # Punctuation (",", ".") is present -> the pattern is found.
        (
            SearchRegex,
            {"search_patterns": ["[^a-zA-Z\\s]+"]},
            {"text": "Hola, bienvenido seas a este Canal de Ministerio Latino por Cristo."},
            {"text": "Hola, bienvenido seas a este Canal de Ministerio Latino por Cristo.", "pattern_found": True},
        ),
        # Only letters and spaces -> nothing matches, the flag must be False.
        (
            SearchRegex,
            {"search_patterns": ["[^a-zA-Z\\s]+"]},
            {"text": "Hola bienvenido seas a este Canal de Ministerio Latino por Cristo"},
            {"text": "Hola bienvenido seas a este Canal de Ministerio Latino por Cristo", "pattern_found": False},
        ),
    ]
)