def _set_difference_order_sensitive(list_a: list, list_b: list) -> list:
    """
    Compute an order-aware difference A \\ B.

    The items of ``list_b`` are processed one at a time, in order. Each item
    is searched for only in the part of ``list_a`` that lies after the
    previous match:

    * if it is found, every element of A skipped on the way to the match is
      kept, and the match plus any directly following repeats of the same
      value are removed;
    * if it is not found in the remaining part of A, it is ignored.

    Whatever is left of A after the last match is kept. The relative order
    of the kept elements is the order they have in ``list_a``.

    Both arguments are passed through ``_normalize_to_list`` first.

    Example: A = ["A", "C", "D", "B"], B = ["B", "D"] gives ["A", "C", "D"].
    "B" is matched at the end of A, so "D" (which precedes it in A) can no
    longer be matched and is therefore kept.

    :param list_a: elements to subtract from (minuend).
    :param list_b: elements to subtract, in the order they must be matched.
    :return: new list with the elements of A that were not matched.
    """
    # list() gives us list.index() and slicing regardless of the concrete
    # sequence type returned by the normalizer.
    items_a = list(_normalize_to_list(list_a))
    total = len(items_a)

    result = []
    cursor = 0  # index of the first element of A that is not consumed yet
    for b_item in _normalize_to_list(list_b):
        try:
            # One forward scan replaces "build a set of the tail, then loop".
            # It needs no hashing and uses a single notion of equality for
            # both finding and consuming the match, so an element can never
            # be emitted twice.
            match_index = items_a.index(b_item, cursor)
        except ValueError:
            # b_item is absent from the rest of A: nothing to subtract.
            continue

        # Elements passed over before the match stay in the result.
        result.extend(items_a[cursor:match_index])
        cursor = match_index + 1

        # Directly following duplicates of the matched value are removed too.
        while cursor < total and items_a[cursor] == b_item:
            cursor += 1

    # Keep everything after the last matched position.
    result.extend(items_a[cursor:])
    return result
@@ -43,4 +80,7 @@ def _execute_operation(self): if not subtract_ref or subtract_ref not in self.evaluation_dataset.columns: return _normalize_to_list(list_a) list_b = self.evaluation_dataset[subtract_ref].iloc[0] - return _set_difference_preserve_order(list_a, list_b) + if self.params.order_insensitive: + return _set_difference_order_insensitive(list_a, list_b) + else: + return _set_difference_order_sensitive(list_a, list_b) diff --git a/cdisc_rules_engine/utilities/rule_processor.py b/cdisc_rules_engine/utilities/rule_processor.py index 119be5724..cc664a00a 100644 --- a/cdisc_rules_engine/utilities/rule_processor.py +++ b/cdisc_rules_engine/utilities/rule_processor.py @@ -429,6 +429,7 @@ def perform_rule_operations( term_pref_term=operation.get("term_pref_term"), term_value=operation.get("term_value"), value_is_reference=operation.get("value_is_reference", False), + order_insensitive=operation.get("order_insensitive", True), ) try: # execute operation diff --git a/resources/schema/rule-merged/Operations.json b/resources/schema/rule-merged/Operations.json index a93003e72..ed7cce0f9 100644 --- a/resources/schema/rule-merged/Operations.json +++ b/resources/schema/rule-merged/Operations.json @@ -267,7 +267,7 @@ "properties": { "operator": { "const": "minus", - "markdownDescription": "\nComputes set difference: elements in `name` that are not in `subtract`. Uses [set difference]() semantics (A \u2216 B). Preserves order from the first list. Both `name` and `subtract` must reference other operation results (e.g., `$expected_variables`, `$dataset_variables`). When `subtract` is empty or missing, returns all elements from `name`. 
Can be computed and added to output variables to display missing elements in error results.\n\n```yaml\nOperations:\n - id: $expected_variables\n operator: expected_variables\n - id: $dataset_variables\n operator: get_column_order_from_dataset\n - id: $expected_minus_dataset\n name: $expected_variables\n operator: minus\n subtract: $dataset_variables\n```\n" + "markdownDescription": "\nComputes set difference: elements in `name` that are not in `subtract`. By default, standard [set difference]() semantics (A \u2216 B) are applied. When the optional `order_insensitive` property is set to `false`, element order is taken into account and only those `name` elements that follow the same order as in `subtract` are removed. Preserves order from the first list. Both `name` and `subtract` must reference other operation results (e.g., `$expected_variables`, `$dataset_variables`). When `subtract` is empty or missing, returns all elements from `name`. Can be computed and added to output variables to display missing elements in error results.\n\n```yaml\nOperations:\n - id: $expected_variables\n operator: expected_variables\n - id: $dataset_variables\n operator: get_column_order_from_dataset\n - id: $expected_minus_dataset\n name: $expected_variables\n operator: minus\n subtract: $dataset_variables\n order_insensitive: false\n```\n" } }, "required": ["id", "operator", "name", "subtract"], @@ -697,6 +697,9 @@ "value_is_reference": { "type": "boolean" }, + "order_insensitive": { + "type": "boolean" + }, "version": { "type": "string" } diff --git a/resources/schema/rule/Operations.json b/resources/schema/rule/Operations.json index 0a8879851..e375fa0af 100644 --- a/resources/schema/rule/Operations.json +++ b/resources/schema/rule/Operations.json @@ -644,6 +644,9 @@ "value_is_reference": { "type": "boolean" }, + "order_insensitive": { + "type": "boolean" + }, "version": { "type": "string" } diff --git a/resources/schema/rule/Operations.md b/resources/schema/rule/Operations.md 
index 0d9b24bac..4063cd73f 100644 --- a/resources/schema/rule/Operations.md +++ b/resources/schema/rule/Operations.md @@ -779,7 +779,7 @@ Operations: ### minus -Computes set difference: elements in `name` that are not in `subtract`. Uses [set difference]() semantics (A ∖ B). Preserves order from the first list. Both `name` and `subtract` must reference other operation results (e.g., `$expected_variables`, `$dataset_variables`). When `subtract` is empty or missing, returns all elements from `name`. Can be computed and added to output variables to display missing elements in error results. +Computes set difference: elements in `name` that are not in `subtract`. By default, standard [set difference]() semantics (A ∖ B) are applied. When the optional `order_insensitive` property is set to `false`, element order is taken into account and only those `name` elements that follow the same order as in `subtract` are removed. Preserves order from the first list. Both `name` and `subtract` must reference other operation results (e.g., `$expected_variables`, `$dataset_variables`). When `subtract` is empty or missing, returns all elements from `name`. Can be computed and added to output variables to display missing elements in error results. ```yaml Operations: @@ -791,6 +791,7 @@ Operations: name: $expected_variables operator: minus subtract: $dataset_variables + order_insensitive: false ``` ### label_referenced_variable_metadata diff --git a/resources/schema/rule/check_parameter.md b/resources/schema/rule/check_parameter.md index 8eb7c2984..d80fb4294 100644 --- a/resources/schema/rule/check_parameter.md +++ b/resources/schema/rule/check_parameter.md @@ -277,6 +277,10 @@ Reference to another operation result, used as the second operand in operations subtract: $dataset_variables ``` +### order_insensitive + +Optional boolean parameter for the `minus` operator. 
@pytest.mark.parametrize(
    ("list_a", "list_b", "expected"),
    [
        (["c", "b", "a"], ["b"], ["c", "a"]),
        (["A", "C", "D", "B"], ["B", "D"], ["A", "C"]),
        (["A", "C", "D", "B"], ["A", "B", "D"], ["C"]),
    ],
)
def test_set_difference_order_insensitive(list_a, list_b, expected):
    """Every element of list_b is removed from list_a, wherever it occurs."""
    remaining = _set_difference_order_insensitive(list_a, list_b)
    assert remaining == expected
def _build_misordered_eval_dataset(dataset_type):
    """Build a two-row dataset whose actual variables swap DOMAIN and AESEQ."""
    expected_row = ["STUDYID", "DOMAIN", "AESEQ", "AETERM", "AEDECOD"]
    dataset_row = ["STUDYID", "AESEQ", "DOMAIN", "AETERM"]
    return dataset_type.from_dict(
        {
            "$expected_variables": [list(expected_row), list(expected_row)],
            "$dataset_variables": [list(dataset_row), list(dataset_row)],
        }
    )


@pytest.mark.parametrize("dataset_type", [PandasDataset, DaskDataset])
def test_minus_operation_incorrect_order(minus_params: OperationParams, dataset_type):
    """With order taken into account, the out-of-order DOMAIN is reported too."""
    eval_dataset = _build_misordered_eval_dataset(dataset_type)
    minus_params.order_insensitive = False

    result = Minus(minus_params, eval_dataset, MagicMock(), MagicMock()).execute()

    first_row = result[minus_params.operation_id].iloc[0]
    assert list(first_row) == ["DOMAIN", "AEDECOD"]


@pytest.mark.parametrize("dataset_type", [PandasDataset, DaskDataset])
def test_minus_operation_order_insensitive(minus_params: OperationParams, dataset_type):
    """With order ignored, only the variable missing from the dataset remains."""
    eval_dataset = _build_misordered_eval_dataset(dataset_type)
    minus_params.order_insensitive = True

    result = Minus(minus_params, eval_dataset, MagicMock(), MagicMock()).execute()

    first_row = result[minus_params.operation_id].iloc[0]
    assert list(first_row) == ["AEDECOD"]