def _extract_regex_group(self, series: pd.Series, regex_pattern: str) -> pd.Series:
    """
    Extract the first capturing group of ``regex_pattern`` from each value.

    The pattern is applied with ``re.search`` to ``str(value)``. It is used
    exactly as received: no additional backslash unescaping is performed
    here, so the caller must pass the final regular expression.

    Args:
        series: Values to extract from.
        regex_pattern: Regular expression that should contain at least one
            capturing group; only the first group is used.

    Returns:
        Series aligned with ``series``. When every captured value parses as
        a number the result is numeric, so zero-padded and non-padded
        values compare consistently ("01" == "1", "10" > "9"). Otherwise
        the captured strings are returned unchanged, keeping the column
        homogeneous so it can still be sorted. Entries are NaN when the
        input is missing or an empty string, when the pattern does not
        match, when the pattern has no capturing group, or when the first
        group did not participate in the match.

    Raises:
        re.error: If ``regex_pattern`` is not a valid regular expression.
    """
    # Compile once instead of looking the pattern up again for every row.
    pattern = re.compile(regex_pattern)

    def first_group(value):
        if pd.isna(value) or value == "":
            return np.nan
        match = pattern.search(str(value))
        if match is None or not match.groups():
            return np.nan
        captured = match.group(1)
        # An optional group that did not participate yields None.
        return np.nan if captured is None else captured

    extracted = series.apply(first_group)

    # Convert the column as a whole. Converting value by value and falling
    # back to the raw string would mix numbers and strings in one column,
    # which cannot be sorted.
    numeric = pd.to_numeric(extracted, errors="coerce")
    if numeric.notna().sum() == extracted.notna().sum():
        return numeric
    return extracted
target_for_sorting], + ascending=[True] * (len(within_columns) + 1), + ) + else: + working_df = self.value[selected_columns] + target_for_sorting = target + sorted_df = working_df.sort_values( + by=[*within_columns, target], + ascending=[True] * (len(within_columns) + 1), + ) grouped_df = sorted_df.groupby(within_columns, sort=False) target_check = grouped_df.apply( - lambda x: self.check_target_ascending_in_sorted_group( - x, target, comparator, ascending, na_pos + lambda x: self.check_target_ascending_in_sorted_group_with_regex( + x, target_for_sorting, comparator, ascending, na_pos ) ) target_check = self._process_grouped_result( @@ -1833,20 +1882,22 @@ def target_is_sorted_by(self, other_value: dict): grouped_df, within_columns, sorted_df, - lambda group: self.check_target_ascending_in_sorted_group( - group, target, comparator, ascending, na_pos + lambda group: self.check_target_ascending_in_sorted_group_with_regex( + group, target_for_sorting, comparator, ascending, na_pos ), ) date_overlap_check = grouped_df.apply( - lambda x: self.check_date_overlaps(x, target, comparator) + lambda x: self.check_date_overlaps(x, target_for_sorting, comparator) ) date_overlap_check = self._process_grouped_result( date_overlap_check, grouped_df, within_columns, sorted_df, - lambda group: self.check_date_overlaps(group, target, comparator), + lambda group: self.check_date_overlaps( + group, target_for_sorting, comparator + ), ) combined_check = target_check & date_overlap_check diff --git a/resources/schema/rule-merged/Operator.json b/resources/schema/rule-merged/Operator.json index 80c62ac56..1c1f13a64 100644 --- a/resources/schema/rule-merged/Operator.json +++ b/resources/schema/rule-merged/Operator.json @@ -850,7 +850,7 @@ "properties": { "operator": { "const": "target_is_sorted_by", - "markdownDescription": "\nTrue if the values in name are ordered according to the values specified by value in ascending/descending order, grouped by the values in within. 
Each value entry requires a variable name, a sort_order of asc or desc, and an optional null_position of first or last (defaults to last) which controls where null/empty comparator values are placed in the expected ordering. Within accepts either a single column or an ordered list of columns. Columns can be either number or Char Dates in ISO8601 YYYY-MM-DD format. Date value(s) with different precisions that overlap (e.g. 2005-10, 2005-10-3 and 2005-10-08) are all flagged as not sorted as their order cannot be inferred.\n\n```yaml\nCheck:\n all:\n - name: --SEQ\n within:\n - USUBJID\n - MIDSTYPE\n operator: target_is_sorted_by\n value:\n - name: --STDTC\n sort_order: asc\n null_position: last\n```\n" + "markdownDescription": "\nTrue if the values in name are ordered according to the values specified by value\nin ascending/descending order, grouped by the values in within. Each value entry\nrequires a variable name, a sort_order of asc or desc, and an optional\nnull_position of first or last (defaults to last) which controls where null/empty\ncomparator values are placed in the expected ordering. Within accepts either a\nsingle column or an ordered list of columns. Columns can be either number or Char\nDates in ISO8601 YYYY-MM-DD format. Date value(s) with different precisions that\noverlap (e.g. 2005-10, 2005-10-3 and 2005-10-08) are all flagged as not sorted as\ntheir order cannot be inferred.\n\nOptionally supports a `regex` parameter that extracts a portion of the target\nvalue for sorting. The regex must contain at least one capturing group. The first\ncaptured group is extracted and converted to numeric if possible, allowing proper\nsorting of sequence numbers (e.g., \"MIDS1\", \"MIDS2\", ..., \"MIDS10\" with regex\n`.*?(\\d+)$`, written as `\".*?(\\\\d+)$\"` in a double-quoted YAML\nstring). 
This is particularly useful for variables that end with sequence\nnumbers that may or may not be zero-padded.\n\n```yaml\nCheck:\n all:\n - name: --SEQ\n within:\n - USUBJID\n - MIDSTYPE\n operator: target_is_sorted_by\n value:\n - name: --STDTC\n sort_order: asc\n null_position: last\n```\n\nExample with regex for extracting sequence numbers:\n\n```yaml\nCheck:\n all:\n - name: MIDS\n operator: target_is_sorted_by\n regex: \".*?(\\\\d+)$\" # Extract trailing digits, convert to numeric\n value:\n - name: SMSTDTC\n sort_order: asc\n within:\n - USUBJID\n - MIDSTYPE\n```\n" } }, "required": ["operator", "value", "within"], diff --git a/resources/schema/rule/Operator.md b/resources/schema/rule/Operator.md index 2ba4b3015..4685cac0f 100644 --- a/resources/schema/rule/Operator.md +++ b/resources/schema/rule/Operator.md @@ -1127,7 +1127,22 @@ Complement of `is_ordered_by` ### target_is_sorted_by -True if the values in name are ordered according to the values specified by value in ascending/descending order, grouped by the values in within. Each value entry requires a variable name, a sort_order of asc or desc, and an optional null_position of first or last (defaults to last) which controls where null/empty comparator values are placed in the expected ordering. Within accepts either a single column or an ordered list of columns. Columns can be either number or Char Dates in ISO8601 YYYY-MM-DD format. Date value(s) with different precisions that overlap (e.g. 2005-10, 2005-10-3 and 2005-10-08) are all flagged as not sorted as their order cannot be inferred. +True if the values in name are ordered according to the values specified by value +in ascending/descending order, grouped by the values in within. Each value entry +requires a variable name, a sort_order of asc or desc, and an optional +null_position of first or last (defaults to last) which controls where null/empty +comparator values are placed in the expected ordering. 
Within accepts either a +single column or an ordered list of columns. Columns can be either number or Char +Dates in ISO8601 YYYY-MM-DD format. Date value(s) with different precisions that +overlap (e.g. 2005-10, 2005-10-3 and 2005-10-08) are all flagged as not sorted as +their order cannot be inferred. + +Optionally supports a `regex` parameter that extracts a portion of the target +value for sorting. The regex must contain at least one capturing group. The first +captured group is extracted and converted to numeric if possible, allowing proper +sorting of sequence numbers (e.g., "MIDS1", "MIDS2", ..., "MIDS10" with regex +`.*?(\d+)$`, written as `".*?(\\d+)$"` in a double-quoted YAML string). This is +particularly useful for variables that end with sequence numbers that may or +may not be zero-padded. ```yaml Check: @@ -1143,6 +1158,22 @@ Check: null_position: last ``` +Example with regex for extracting sequence numbers: + +```yaml +Check: + all: + - name: MIDS + operator: target_is_sorted_by + regex: ".*?(\\d+)$" # Extract trailing digits, convert to numeric + value: + - name: SMSTDTC + sort_order: asc + within: + - USUBJID + - MIDSTYPE +``` + ### target_is_not_sorted_by Complement of `target_is_sorted_by` diff --git a/tests/unit/test_check_operators/test_target_is_sorted_by_regex.py b/tests/unit/test_check_operators/test_target_is_sorted_by_regex.py new file mode 100644 index 000000000..0ff2989af --- /dev/null +++ b/tests/unit/test_check_operators/test_target_is_sorted_by_regex.py @@ -0,0 +1,185 @@ +""" +Tests for target_is_sorted_by operator with regex support +""" + +import pytest +import pandas as pd +from cdisc_rules_engine.check_operators.dataframe_operators import DataframeType +from cdisc_rules_engine.models.dataset.pandas_dataset import PandasDataset +from cdisc_rules_engine.models.dataset.dask_dataset import DaskDataset + + +@pytest.mark.parametrize("dataset_class", [PandasDataset, DaskDataset]) +def test_target_is_sorted_by_with_regex_non_padded(dataset_class): + """ + Test target_is_sorted_by with regex 
extraction for non-zero-padded sequence numbers. + Example: lalala1, lalala2, ..., lalala9, lalala10 + """ + df = dataset_class.from_dict( + { + "USUBJID": ["001", "001", "001", "001", "002", "002", "002"], + "MIDSTYPE": ["A", "A", "A", "A", "B", "B", "B"], + "MIDS": [ + "lalala1", + "lalala2", + "lalala9", + "lalala10", + "test1", + "test2", + "test10", + ], + "SMSTDTC": [ + "2020-01-01", + "2020-01-02", + "2020-01-09", + "2020-01-10", + "2020-02-01", + "2020-02-02", + "2020-02-10", + ], + } + ) + + other_value = { + "target": "MIDS", + "regex": ".*?(\\d+)$", # Non-greedy to correctly extract multi-digit numbers + "within": ["USUBJID", "MIDSTYPE"], + "comparator": [ + {"name": "SMSTDTC", "sort_order": "ASC", "null_position": "first"} + ], + } + + result = DataframeType({"value": df}).target_is_sorted_by(other_value) + # All should be True - sorted correctly by chronological order + assert result.equals(pd.Series([True, True, True, True, True, True, True])) + + +@pytest.mark.parametrize("dataset_class", [PandasDataset, DaskDataset]) +def test_target_is_sorted_by_with_regex_zero_padded(dataset_class): + """ + Test target_is_sorted_by with regex extraction for zero-padded sequence numbers. 
+ Example: lalala01, lalala02, ..., lalala09, lalala10 + """ + df = dataset_class.from_dict( + { + "USUBJID": ["001", "001", "001", "001"], + "MIDSTYPE": ["A", "A", "A", "A"], + "MIDS": ["lalala01", "lalala02", "lalala09", "lalala10"], + "SMSTDTC": [ + "2020-01-01", + "2020-01-02", + "2020-01-09", + "2020-01-10", + ], + } + ) + + other_value = { + "target": "MIDS", + "regex": ".*?(\\d+)$", # Non-greedy to correctly extract multi-digit numbers + "within": ["USUBJID", "MIDSTYPE"], + "comparator": [ + {"name": "SMSTDTC", "sort_order": "ASC", "null_position": "first"} + ], + } + + result = DataframeType({"value": df}).target_is_sorted_by(other_value) + # All should be True - numeric conversion handles zero-padding + assert result.equals(pd.Series([True, True, True, True])) + + +@pytest.mark.parametrize("dataset_class", [PandasDataset, DaskDataset]) +def test_target_is_sorted_by_with_regex_invalid_order(dataset_class): + """ + Test that invalid order is detected even with regex extraction. + """ + df = dataset_class.from_dict( + { + "USUBJID": ["001", "001", "001", "001"], + "MIDSTYPE": ["A", "A", "A", "A"], + "MIDS": ["lalala1", "lalala10", "lalala2", "lalala9"], + "SMSTDTC": [ + "2020-01-01", + "2020-01-02", + "2020-01-09", + "2020-01-10", + ], + } + ) + + other_value = { + "target": "MIDS", + "regex": ".*?(\\d+)$", # Non-greedy to correctly extract multi-digit numbers + "within": ["USUBJID", "MIDSTYPE"], + "comparator": [ + {"name": "SMSTDTC", "sort_order": "ASC", "null_position": "first"} + ], + } + + result = DataframeType({"value": df}).target_is_sorted_by(other_value) + # After sorting by extracted MIDS (1, 2, 9, 10), dates should be: + # MIDS=1 (2020-01-01) -> MIDS=2 (should be 2020-01-02) -> MIDS=9 (should be 2020-01-09) -> MIDS=10 (should be 2020-01-10) + # Actual dates: 2020-01-01, 2020-01-09, 2020-01-10, 2020-01-02 + # Only MIDS=1 is in correct chronological position + assert result.equals(pd.Series([True, False, False, False])) + + 
@pytest.mark.parametrize("dataset_class", [PandasDataset, DaskDataset])
def test_target_is_sorted_by_with_regex_multiple_groups(dataset_class):
    """
    Regex-based ordering holds across several USUBJID / MIDSTYPE groups.
    """
    records = {
        "USUBJID": ["001", "001", "001", "002", "002", "002"],
        "MIDSTYPE": ["A", "A", "A", "A", "A", "A"],
        "MIDS": ["M1", "M2", "M3", "M1", "M2", "M3"],
        "SMSTDTC": [
            "2020-01-01",
            "2020-01-02",
            "2020-01-03",
            "2020-02-01",
            "2020-02-02",
            "2020-02-03",
        ],
    }
    dataset = dataset_class.from_dict(records)

    date_ordering = {
        "name": "SMSTDTC",
        "sort_order": "ASC",
        "null_position": "first",
    }
    operator_params = {
        "target": "MIDS",
        # Non-greedy prefix so the whole multi-digit suffix is captured
        "regex": ".*?(\\d+)$",
        "within": ["USUBJID", "MIDSTYPE"],
        "comparator": [date_ordering],
    }

    operator = DataframeType({"value": dataset})
    outcome = operator.target_is_sorted_by(operator_params)

    expected = pd.Series([True] * 6)
    assert outcome.equals(expected)


@pytest.mark.parametrize("dataset_class", [PandasDataset, DaskDataset])
def test_target_is_sorted_by_without_regex_still_works(dataset_class):
    """
    Backward compatibility: the operator gives the same answer when no
    regex is supplied.
    """
    records = {
        "USUBJID": ["001", "001", "001"],
        "SESEQ": [1, 2, 3],
        "SESTDTC": [
            "2020-01-01",
            "2020-01-02",
            "2020-01-03",
        ],
    }
    dataset = dataset_class.from_dict(records)

    operator_params = {
        "target": "SESEQ",
        "within": "USUBJID",
        "comparator": [{"name": "SESTDTC", "sort_order": "ASC"}],
    }

    operator = DataframeType({"value": dataset})
    outcome = operator.target_is_sorted_by(operator_params)

    expected = pd.Series([True] * 3)
    assert outcome.equals(expected)