Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 63 additions & 12 deletions cdisc_rules_engine/check_operators/dataframe_operators.py
Original file line number Diff line number Diff line change
Expand Up @@ -1664,12 +1664,12 @@ def _verify_neighbor_consistency(

return is_valid

def check_target_ascending_in_sorted_group(
def check_target_ascending_in_sorted_group_with_regex(
self, group, target, comparator, ascending, na_pos
):
"""
Check if target values are in ascending order within a group
already sorted by comparator.
already sorted by comparator. Supports regex extraction.
"""
is_valid = pd.Series(True, index=group.index)
is_numeric_comparator = pd.api.types.is_numeric_dtype(group[comparator])
Expand Down Expand Up @@ -1798,12 +1798,45 @@ def _process_grouped_result(
grouped_result = pd.Series(result_list, index=index_list)
return grouped_result.reindex(sorted_df.index, fill_value=True)

def _extract_regex_group(self, series: pd.Series, regex_pattern: str) -> pd.Series:
"""
Extract the first capturing group from a regex pattern and convert to numeric if possible.
Handles zero-padded numbers by converting to numeric.

Args:
series: Pandas series with string values
regex_pattern: Regex pattern with capturing group(s)

Returns:
Series with extracted and converted values
"""

def extract_and_convert(value):
if pd.isna(value) or value == "":
return np.nan

# YAML escapes backslashes, so we receive ".*\\d+$" which Python interprets as raw \
# We need to convert this to the actual regex pattern by replacing \\ with \
# However, since strings from YAML come already unescaped, we just use as-is
match = re.search(regex_pattern, str(value))
if match and match.groups():
extracted = match.group(1) # First capturing group
# Try to convert to numeric to handle both padded and non-padded numbers
try:
return pd.to_numeric(extracted)
except (ValueError, TypeError):
return extracted
return np.nan

return series.apply(extract_and_convert)

@log_operator_execution
@type_operator(FIELD_DATAFRAME)
def target_is_sorted_by(self, other_value: dict):
target = other_value.get("target")
within_columns = self._normalize_grouping_columns(other_value.get("within"))
columns = other_value["comparator"]
target_regex = other_value.get("regex") # parameter for regex extraction

result = pd.Series([True] * len(self.value), index=self.value.index)

Expand All @@ -1816,37 +1849,55 @@ def target_is_sorted_by(self, other_value: dict):
dict.fromkeys([target, comparator, *within_columns])
)

sorted_df = self.value[selected_columns].sort_values(
by=[*within_columns, target],
ascending=[True] * (len(within_columns) + 1),
)
# If regex is provided, extract and convert target values
if target_regex:
working_df = self.value[selected_columns].copy()
# Create a temporary column with extracted regex values
working_df[f"{target}_extracted"] = self._extract_regex_group(
working_df[target], target_regex
)
target_for_sorting = f"{target}_extracted"
# Sort by within columns AND extracted target
sorted_df = working_df.sort_values(
by=[*within_columns, target_for_sorting],
ascending=[True] * (len(within_columns) + 1),
)
else:
working_df = self.value[selected_columns]
target_for_sorting = target
sorted_df = working_df.sort_values(
by=[*within_columns, target],
ascending=[True] * (len(within_columns) + 1),
)

grouped_df = sorted_df.groupby(within_columns, sort=False)

target_check = grouped_df.apply(
lambda x: self.check_target_ascending_in_sorted_group(
x, target, comparator, ascending, na_pos
lambda x: self.check_target_ascending_in_sorted_group_with_regex(
x, target_for_sorting, comparator, ascending, na_pos
)
)
target_check = self._process_grouped_result(
target_check,
grouped_df,
within_columns,
sorted_df,
lambda group: self.check_target_ascending_in_sorted_group(
group, target, comparator, ascending, na_pos
lambda group: self.check_target_ascending_in_sorted_group_with_regex(
group, target_for_sorting, comparator, ascending, na_pos
),
)

date_overlap_check = grouped_df.apply(
lambda x: self.check_date_overlaps(x, target, comparator)
lambda x: self.check_date_overlaps(x, target_for_sorting, comparator)
)
date_overlap_check = self._process_grouped_result(
date_overlap_check,
grouped_df,
within_columns,
sorted_df,
lambda group: self.check_date_overlaps(group, target, comparator),
lambda group: self.check_date_overlaps(
group, target_for_sorting, comparator
),
)

combined_check = target_check & date_overlap_check
Expand Down
2 changes: 1 addition & 1 deletion resources/schema/rule-merged/Operator.json
Original file line number Diff line number Diff line change
Expand Up @@ -850,7 +850,7 @@
"properties": {
"operator": {
"const": "target_is_sorted_by",
"markdownDescription": "\nTrue if the values in name are ordered according to the values specified by value in ascending/descending order, grouped by the values in within. Each value entry requires a variable name, a sort_order of asc or desc, and an optional null_position of first or last (defaults to last) which controls where null/empty comparator values are placed in the expected ordering. Within accepts either a single column or an ordered list of columns. Columns can be either number or Char Dates in ISO8601 YYYY-MM-DD format. Date value(s) with different precisions that overlap (e.g. 2005-10, 2005-10-3 and 2005-10-08) are all flagged as not sorted as their order cannot be inferred.\n\n```yaml\nCheck:\n all:\n - name: --SEQ\n within:\n - USUBJID\n - MIDSTYPE\n operator: target_is_sorted_by\n value:\n - name: --STDTC\n sort_order: asc\n null_position: last\n```\n"
"markdownDescription": "\nTrue if the values in name are ordered according to the values specified by value\nin ascending/descending order, grouped by the values in within. Each value entry\nrequires a variable name, a sort_order of asc or desc, and an optional\nnull_position of first or last (defaults to last) which controls where null/empty\ncomparator values are placed in the expected ordering. Within accepts either a\nsingle column or an ordered list of columns. Columns can be either number or Char\nDates in ISO8601 YYYY-MM-DD format. Date value(s) with different precisions that\noverlap (e.g. 2005-10, 2005-10-3 and 2005-10-08) are all flagged as not sorted as\ntheir order cannot be inferred.\n\nOptionally supports a `regex` parameter that extracts a portion of the target\nvalue for sorting. The regex must contain at least one capturing group. The first\ncaptured group is extracted and converted to numeric if possible, allowing proper\nsorting of sequence numbers (e.g., \"MIDS1\", \"MIDS2\", ..., \"MIDS10\" with regex\n`.*?(\\\\d+)$`). This is particularly useful for variables that end with sequence\nnumbers that may or may not be zero-padded.\n\n```yaml\nCheck:\n all:\n - name: --SEQ\n within:\n - USUBJID\n - MIDSTYPE\n operator: target_is_sorted_by\n value:\n - name: --STDTC\n sort_order: asc\n null_position: last\n```\n\nExample with regex for extracting sequence numbers:\n\n```yaml\nCheck:\n all:\n - name: MIDS\n operator: target_is_sorted_by\n regex: \".*?(\\\\d+)$\" # Extract trailing digits, convert to numeric\n value:\n - name: SMSTDTC\n sort_order: asc\n within:\n - USUBJID\n - MIDSTYPE\n```\n"
}
},
"required": ["operator", "value", "within"],
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should add the new regex property here.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It’s not very visible here, but I actually added information about the regex. It’s easier to view it in the editor.

Expand Down
33 changes: 32 additions & 1 deletion resources/schema/rule/Operator.md
Original file line number Diff line number Diff line change
Expand Up @@ -1127,7 +1127,22 @@ Complement of `is_ordered_by`

### target_is_sorted_by

True if the values in name are ordered according to the values specified by value in ascending/descending order, grouped by the values in within. Each value entry requires a variable name, a sort_order of asc or desc, and an optional null_position of first or last (defaults to last) which controls where null/empty comparator values are placed in the expected ordering. Within accepts either a single column or an ordered list of columns. Columns can be either number or Char Dates in ISO8601 YYYY-MM-DD format. Date value(s) with different precisions that overlap (e.g. 2005-10, 2005-10-3 and 2005-10-08) are all flagged as not sorted as their order cannot be inferred.
True if the values in name are ordered according to the values specified by value
in ascending/descending order, grouped by the values in within. Each value entry
requires a variable name, a sort_order of asc or desc, and an optional
null_position of first or last (defaults to last) which controls where null/empty
comparator values are placed in the expected ordering. Within accepts either a
single column or an ordered list of columns. Columns can be either number or Char
Dates in ISO8601 YYYY-MM-DD format. Date value(s) with different precisions that
overlap (e.g. 2005-10, 2005-10-3 and 2005-10-08) are all flagged as not sorted as
their order cannot be inferred.

Optionally supports a `regex` parameter that extracts a portion of the target
value for sorting. The regex must contain at least one capturing group. The first
captured group is extracted and converted to numeric if possible, allowing proper
sorting of sequence numbers (e.g., "MIDS1", "MIDS2", ..., "MIDS10" with regex
`.*?(\d+)$`, written as `".*?(\\d+)$"` inside a double-quoted YAML string). This is
particularly useful for variables that end with sequence numbers that may or may
not be zero-padded.

```yaml
Check:
Expand All @@ -1143,6 +1158,22 @@ Check:
null_position: last
```

Example with regex for extracting sequence numbers:

```yaml
Check:
all:
- name: MIDS
operator: target_is_sorted_by
regex: ".*?(\\d+)$" # Extract trailing digits, convert to numeric
value:
- name: SMSTDTC
sort_order: asc
within:
- USUBJID
- MIDSTYPE
```

### target_is_not_sorted_by

Complement of `target_is_sorted_by`
Expand Down
185 changes: 185 additions & 0 deletions tests/unit/test_check_operators/test_target_is_sorted_by_regex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
"""
Tests for target_is_sorted_by operator with regex support
"""

import pytest
import pandas as pd
from cdisc_rules_engine.check_operators.dataframe_operators import DataframeType
from cdisc_rules_engine.models.dataset.pandas_dataset import PandasDataset
from cdisc_rules_engine.models.dataset.dask_dataset import DaskDataset


@pytest.mark.parametrize("dataset_class", [PandasDataset, DaskDataset])
def test_target_is_sorted_by_with_regex_non_padded(dataset_class):
    """
    Regex extraction on sequence numbers that are not zero-padded,
    e.g. lalala1, lalala2, ..., lalala9, lalala10.
    """
    sort_spec = {
        "target": "MIDS",
        # Lazy prefix, so the capturing group takes every trailing digit
        "regex": ".*?(\\d+)$",
        "within": ["USUBJID", "MIDSTYPE"],
        "comparator": [
            {"name": "SMSTDTC", "sort_order": "ASC", "null_position": "first"}
        ],
    }
    records = {
        "USUBJID": ["001", "001", "001", "001", "002", "002", "002"],
        "MIDSTYPE": ["A", "A", "A", "A", "B", "B", "B"],
        "MIDS": [
            "lalala1",
            "lalala2",
            "lalala9",
            "lalala10",
            "test1",
            "test2",
            "test10",
        ],
        "SMSTDTC": [
            "2020-01-01",
            "2020-01-02",
            "2020-01-09",
            "2020-01-10",
            "2020-02-01",
            "2020-02-02",
            "2020-02-10",
        ],
    }
    dataset = dataset_class.from_dict(records)

    outcome = DataframeType({"value": dataset}).target_is_sorted_by(sort_spec)

    # Numbers and dates rise together in both groups: every row passes
    expected = pd.Series([True] * 7)
    assert outcome.equals(expected)


@pytest.mark.parametrize("dataset_class", [PandasDataset, DaskDataset])
def test_target_is_sorted_by_with_regex_zero_padded(dataset_class):
    """
    Regex extraction on zero-padded sequence numbers,
    e.g. lalala01, lalala02, ..., lalala09, lalala10.
    """
    sort_spec = {
        "target": "MIDS",
        # Lazy prefix, so the capturing group takes every trailing digit
        "regex": ".*?(\\d+)$",
        "within": ["USUBJID", "MIDSTYPE"],
        "comparator": [
            {"name": "SMSTDTC", "sort_order": "ASC", "null_position": "first"}
        ],
    }
    records = {
        "USUBJID": ["001", "001", "001", "001"],
        "MIDSTYPE": ["A", "A", "A", "A"],
        "MIDS": ["lalala01", "lalala02", "lalala09", "lalala10"],
        "SMSTDTC": [
            "2020-01-01",
            "2020-01-02",
            "2020-01-09",
            "2020-01-10",
        ],
    }
    dataset = dataset_class.from_dict(records)

    outcome = DataframeType({"value": dataset}).target_is_sorted_by(sort_spec)

    # Padded values are expected to behave like their numeric counterparts
    expected = pd.Series([True] * 4)
    assert outcome.equals(expected)


@pytest.mark.parametrize("dataset_class", [PandasDataset, DaskDataset])
def test_target_is_sorted_by_with_regex_invalid_order(dataset_class):
    """
    A wrong ordering must still be reported when regex extraction is used.
    """
    sort_spec = {
        "target": "MIDS",
        # Lazy prefix, so the capturing group takes every trailing digit
        "regex": ".*?(\\d+)$",
        "within": ["USUBJID", "MIDSTYPE"],
        "comparator": [
            {"name": "SMSTDTC", "sort_order": "ASC", "null_position": "first"}
        ],
    }
    records = {
        "USUBJID": ["001", "001", "001", "001"],
        "MIDSTYPE": ["A", "A", "A", "A"],
        "MIDS": ["lalala1", "lalala10", "lalala2", "lalala9"],
        "SMSTDTC": [
            "2020-01-01",
            "2020-01-02",
            "2020-01-09",
            "2020-01-10",
        ],
    }
    dataset = dataset_class.from_dict(records)

    outcome = DataframeType({"value": dataset}).target_is_sorted_by(sort_spec)

    # Ordered by the extracted number (1, 2, 9, 10) the rows carry the dates
    # 2020-01-01, 2020-01-09, 2020-01-10, 2020-01-02, which is not ascending.
    # Only the row holding lalala1 is expected to pass.
    expected = pd.Series([True, False, False, False])
    assert outcome.equals(expected)


@pytest.mark.parametrize("dataset_class", [PandasDataset, DaskDataset])
def test_target_is_sorted_by_with_regex_multiple_groups(dataset_class):
    """
    Regex based sorting when the data holds more than one USUBJID/MIDSTYPE group.
    """
    sort_spec = {
        "target": "MIDS",
        # Lazy prefix, so the capturing group takes every trailing digit
        "regex": ".*?(\\d+)$",
        "within": ["USUBJID", "MIDSTYPE"],
        "comparator": [
            {"name": "SMSTDTC", "sort_order": "ASC", "null_position": "first"}
        ],
    }
    records = {
        "USUBJID": ["001", "001", "001", "002", "002", "002"],
        "MIDSTYPE": ["A", "A", "A", "A", "A", "A"],
        "MIDS": ["M1", "M2", "M3", "M1", "M2", "M3"],
        "SMSTDTC": [
            "2020-01-01",
            "2020-01-02",
            "2020-01-03",
            "2020-02-01",
            "2020-02-02",
            "2020-02-03",
        ],
    }
    dataset = dataset_class.from_dict(records)

    outcome = DataframeType({"value": dataset}).target_is_sorted_by(sort_spec)

    expected = pd.Series([True] * 6)
    assert outcome.equals(expected)


@pytest.mark.parametrize("dataset_class", [PandasDataset, DaskDataset])
def test_target_is_sorted_by_without_regex_still_works(dataset_class):
    """
    Backward compatibility: the operator is called without a regex entry.
    """
    sort_spec = {
        "target": "SESEQ",
        "within": "USUBJID",
        "comparator": [{"name": "SESTDTC", "sort_order": "ASC"}],
    }
    records = {
        "USUBJID": ["001", "001", "001"],
        "SESEQ": [1, 2, 3],
        "SESTDTC": [
            "2020-01-01",
            "2020-01-02",
            "2020-01-03",
        ],
    }
    dataset = dataset_class.from_dict(records)

    outcome = DataframeType({"value": dataset}).target_is_sorted_by(sort_spec)

    expected = pd.Series([True] * 3)
    assert outcome.equals(expected)
Loading