From ad0893eccd0203aac4aed79ccff5cf38bf0d4570 Mon Sep 17 00:00:00 2001 From: Dmitry Date: Tue, 14 Apr 2026 23:50:26 -0300 Subject: [PATCH 1/4] Updating minus operator --- cdisc_rules_engine/models/operation_params.py | 1 + cdisc_rules_engine/operations/minus.py | 43 +++++++++++- .../utilities/rule_processor.py | 1 + resources/schema/rule/Operations.json | 3 + resources/schema/rule/Operations.md | 3 +- resources/schema/rule/check_parameter.md | 4 ++ tests/unit/test_operations/test_minus.py | 65 ++++++++++++++++++- 7 files changed, 114 insertions(+), 6 deletions(-) diff --git a/cdisc_rules_engine/models/operation_params.py b/cdisc_rules_engine/models/operation_params.py index 3cc8b938a..16fd975f2 100644 --- a/cdisc_rules_engine/models/operation_params.py +++ b/cdisc_rules_engine/models/operation_params.py @@ -59,6 +59,7 @@ class OperationParams: source: str = None target: str = None subtract: str = None + order_insensitive: bool = False value_is_reference: bool = False namespace: str = None delimiter: str = None diff --git a/cdisc_rules_engine/operations/minus.py b/cdisc_rules_engine/operations/minus.py index 71061b3f1..067b7ad8e 100644 --- a/cdisc_rules_engine/operations/minus.py +++ b/cdisc_rules_engine/operations/minus.py @@ -17,7 +17,7 @@ def _normalize_to_list(val): return [val] -def _set_difference_preserve_order(list_a: list, list_b: list) -> list: +def _set_difference_order_insensitive(list_a: list, list_b: list) -> list: """ Compute set difference A \\ B (elements in A not in B). Preserves order from list_a. @@ -26,6 +26,42 @@ def _set_difference_preserve_order(list_a: list, list_b: list) -> list: return [x for x in _normalize_to_list(list_a) if x not in set_b] +def _set_difference_order_sensitive(list_a: list, list_b: list) -> list: + """ + Compute set difference A \\ B (elements in A not in B). + Take into account order of elements + Preserves order from list_a. 
+ """ + result = [] + a_start_index = 0 + for b_item in _normalize_to_list(list_b): + # Check if b_item is in the remaining part of A + set_a = set(_normalize_to_list(list_a[a_start_index:])) + if b_item in set_a: + match_found = False + # Iterate through A starting from last matched index + for i in range(a_start_index, len(list_a)): + a_item = list_a[i] + if a_item != b_item: + if match_found: + break + else: + result.append(a_item) + else: + # Move start index to next position after matched item + a_start_index = i + 1 + # We have to continue checking for duplicates of b_item in A, so we don't break here + match_found = True + else: + # If B item is not in A, ignore it since there is nothing to subtract from A + continue + + # Add any remaining items in A after last matched index + result.extend(list_a[a_start_index:]) + + return result + + class Minus(BaseOperation): """ Operation that computes set difference: name minus subtract. @@ -43,4 +79,7 @@ def _execute_operation(self): if not subtract_ref or subtract_ref not in self.evaluation_dataset.columns: return _normalize_to_list(list_a) list_b = self.evaluation_dataset[subtract_ref].iloc[0] - return _set_difference_preserve_order(list_a, list_b) + if self.params.order_insensitive: + return _set_difference_order_insensitive(list_a, list_b) + else: + return _set_difference_order_sensitive(list_a, list_b) diff --git a/cdisc_rules_engine/utilities/rule_processor.py b/cdisc_rules_engine/utilities/rule_processor.py index 0d289d107..5e8139001 100644 --- a/cdisc_rules_engine/utilities/rule_processor.py +++ b/cdisc_rules_engine/utilities/rule_processor.py @@ -413,6 +413,7 @@ def perform_rule_operations( term_pref_term=operation.get("term_pref_term"), term_value=operation.get("term_value"), value_is_reference=operation.get("value_is_reference", False), + order_insensitive=operation.get("order_insensitive", False), ) try: # execute operation diff --git a/resources/schema/rule/Operations.json 
b/resources/schema/rule/Operations.json index 0a8879851..e375fa0af 100644 --- a/resources/schema/rule/Operations.json +++ b/resources/schema/rule/Operations.json @@ -644,6 +644,9 @@ "value_is_reference": { "type": "boolean" }, + "order_insensitive": { + "type": "boolean" + }, "version": { "type": "string" } diff --git a/resources/schema/rule/Operations.md b/resources/schema/rule/Operations.md index 0d9b24bac..c9e1a1d43 100644 --- a/resources/schema/rule/Operations.md +++ b/resources/schema/rule/Operations.md @@ -779,7 +779,7 @@ Operations: ### minus -Computes set difference: elements in `name` that are not in `subtract`. Uses [set difference]() semantics (A ∖ B). Preserves order from the first list. Both `name` and `subtract` must reference other operation results (e.g., `$expected_variables`, `$dataset_variables`). When `subtract` is empty or missing, returns all elements from `name`. Can be computed and added to output variables to display missing elements in error results. +Computes set difference: elements in `name` that are not in `subtract`. By default the element order is taken into consideration and only those `name` elements are removed which follow the same order as in `subtract`. Optional `order_insensitive` property allows to have a standard [set difference]() semantics (A ∖ B). Preserves order from the first list. Both `name` and `subtract` must reference other operation results (e.g., `$expected_variables`, `$dataset_variables`). When `subtract` is empty or missing, returns all elements from `name`. Can be computed and added to output variables to display missing elements in error results. 
```yaml Operations: @@ -791,6 +791,7 @@ Operations: name: $expected_variables operator: minus subtract: $dataset_variables + order_insensitive: true ``` ### label_referenced_variable_metadata diff --git a/resources/schema/rule/check_parameter.md b/resources/schema/rule/check_parameter.md index 8eb7c2984..fcc525ef1 100644 --- a/resources/schema/rule/check_parameter.md +++ b/resources/schema/rule/check_parameter.md @@ -277,6 +277,10 @@ Reference to another operation result, used as the second operand in operations subtract: $dataset_variables ``` +### order_insensitive + +Optional boolean parameter for the `minus` operator. When set to `true`, `minus` performs a standard [set difference]() operation, ignoring the order of elements. When `false` or not specified, `minus` considers element order and only removes elements from `name` that follow the same order as in `subtract`. + ### term_code Terminology code value used in controlled terminology operations for code-based lookups. diff --git a/tests/unit/test_operations/test_minus.py b/tests/unit/test_operations/test_minus.py index 6969ea99b..6181c240b 100644 --- a/tests/unit/test_operations/test_minus.py +++ b/tests/unit/test_operations/test_minus.py @@ -4,7 +4,11 @@ from cdisc_rules_engine.models.dataset.pandas_dataset import PandasDataset from cdisc_rules_engine.models.operation_params import OperationParams -from cdisc_rules_engine.operations.minus import Minus, _set_difference_preserve_order +from cdisc_rules_engine.operations.minus import ( + Minus, + _set_difference_order_insensitive, + _set_difference_order_sensitive, +) import pytest @@ -32,10 +36,24 @@ def minus_params(operation_params: OperationParams) -> OperationParams: (["a", "", "b"], [""], ["a", "b"]), (["a", "", "b"], ["c"], ["a", "", "b"]), ([""], [""], []), + (["A", "C", "D", "B"], ["B", "D"], ["A", "C", "D"]), + (["A", "C", "D", "B"], ["A", "B", "D"], ["C", "D"]), ], ) -def test_set_difference_preserve_order(list_a, list_b, expected): - assert 
_set_difference_preserve_order(list_a, list_b) == expected +def test_set_difference_order_sensitive(list_a, list_b, expected): + assert _set_difference_order_sensitive(list_a, list_b) == expected + + +@pytest.mark.parametrize( + "list_a,list_b,expected", + [ + (["c", "b", "a"], ["b"], ["c", "a"]), + (["A", "C", "D", "B"], ["B", "D"], ["A", "C"]), + (["A", "C", "D", "B"], ["A", "B", "D"], ["C"]), + ], +) +def test_set_difference_order_insensitive(list_a, list_b, expected): + assert _set_difference_order_insensitive(list_a, list_b) == expected @pytest.mark.parametrize("dataset_type", [PandasDataset, DaskDataset]) @@ -58,6 +76,47 @@ def test_minus_operation(minus_params: OperationParams, dataset_type): assert list(result[minus_params.operation_id].iloc[0]) == ["AEDECOD"] +@pytest.mark.parametrize("dataset_type", [PandasDataset, DaskDataset]) +def test_minus_operation_incorrect_order(minus_params: OperationParams, dataset_type): + eval_dataset = dataset_type.from_dict( + { + "$expected_variables": [ + ["STUDYID", "DOMAIN", "AESEQ", "AETERM", "AEDECOD"], + ["STUDYID", "DOMAIN", "AESEQ", "AETERM", "AEDECOD"], + ], + "$dataset_variables": [ + ["STUDYID", "AESEQ", "DOMAIN", "AETERM"], + ["STUDYID", "AESEQ", "DOMAIN", "AETERM"], + ], + } + ) + + operation = Minus(minus_params, eval_dataset, MagicMock(), MagicMock()) + result = operation.execute() + assert list(result[minus_params.operation_id].iloc[0]) == ["DOMAIN", "AEDECOD"] + + +@pytest.mark.parametrize("dataset_type", [PandasDataset, DaskDataset]) +def test_minus_operation_order_insensitive(minus_params: OperationParams, dataset_type): + eval_dataset = dataset_type.from_dict( + { + "$expected_variables": [ + ["STUDYID", "DOMAIN", "AESEQ", "AETERM", "AEDECOD"], + ["STUDYID", "DOMAIN", "AESEQ", "AETERM", "AEDECOD"], + ], + "$dataset_variables": [ + ["STUDYID", "AESEQ", "DOMAIN", "AETERM"], + ["STUDYID", "AESEQ", "DOMAIN", "AETERM"], + ], + } + ) + + minus_params.order_insensitive = True + operation = Minus(minus_params, 
eval_dataset, MagicMock(), MagicMock()) + result = operation.execute() + assert list(result[minus_params.operation_id].iloc[0]) == ["AEDECOD"] + + @pytest.mark.parametrize("dataset_type", [PandasDataset, DaskDataset]) def test_minus_empty_subtract_returns_all_of_name( minus_params: OperationParams, dataset_type From 184d53e88eff2dbd8abe2c5886cbbaf1ba95af00 Mon Sep 17 00:00:00 2001 From: Dmitry Date: Mon, 11 May 2026 00:00:37 -0300 Subject: [PATCH 2/4] Addressing comments, changing default value of order_insensitive to True --- cdisc_rules_engine/models/operation_params.py | 2 +- cdisc_rules_engine/operations/minus.py | 9 +++++---- cdisc_rules_engine/utilities/rule_processor.py | 2 +- resources/schema/rule/Operations.md | 4 ++-- resources/schema/rule/check_parameter.md | 2 +- 5 files changed, 10 insertions(+), 9 deletions(-) diff --git a/cdisc_rules_engine/models/operation_params.py b/cdisc_rules_engine/models/operation_params.py index 16fd975f2..8e319e134 100644 --- a/cdisc_rules_engine/models/operation_params.py +++ b/cdisc_rules_engine/models/operation_params.py @@ -60,7 +60,7 @@ class OperationParams: target: str = None subtract: str = None order_insensitive: bool = False - value_is_reference: bool = False + value_is_reference: bool = True namespace: str = None delimiter: str = None define_xml_path: str = None diff --git a/cdisc_rules_engine/operations/minus.py b/cdisc_rules_engine/operations/minus.py index 067b7ad8e..8755ec074 100644 --- a/cdisc_rules_engine/operations/minus.py +++ b/cdisc_rules_engine/operations/minus.py @@ -34,14 +34,15 @@ def _set_difference_order_sensitive(list_a: list, list_b: list) -> list: """ result = [] a_start_index = 0 + list_a_normalized = _normalize_to_list(list_a) for b_item in _normalize_to_list(list_b): # Check if b_item is in the remaining part of A - set_a = set(_normalize_to_list(list_a[a_start_index:])) + set_a = set(list_a_normalized[a_start_index:]) if b_item in set_a: match_found = False # Iterate through A starting from 
last matched index - for i in range(a_start_index, len(list_a)): - a_item = list_a[i] + for i in range(a_start_index, len(list_a_normalized)): + a_item = list_a_normalized[i] if a_item != b_item: if match_found: break @@ -57,7 +58,7 @@ def _set_difference_order_sensitive(list_a: list, list_b: list) -> list: continue # Add any remaining items in A after last matched index - result.extend(list_a[a_start_index:]) + result.extend(list_a_normalized[a_start_index:]) return result diff --git a/cdisc_rules_engine/utilities/rule_processor.py b/cdisc_rules_engine/utilities/rule_processor.py index 5e8139001..d8d3dcc26 100644 --- a/cdisc_rules_engine/utilities/rule_processor.py +++ b/cdisc_rules_engine/utilities/rule_processor.py @@ -413,7 +413,7 @@ def perform_rule_operations( term_pref_term=operation.get("term_pref_term"), term_value=operation.get("term_value"), value_is_reference=operation.get("value_is_reference", False), - order_insensitive=operation.get("order_insensitive", False), + order_insensitive=operation.get("order_insensitive", True), ) try: # execute operation diff --git a/resources/schema/rule/Operations.md b/resources/schema/rule/Operations.md index c9e1a1d43..4063cd73f 100644 --- a/resources/schema/rule/Operations.md +++ b/resources/schema/rule/Operations.md @@ -779,7 +779,7 @@ Operations: ### minus -Computes set difference: elements in `name` that are not in `subtract`. By default the element order is taken into consideration and only those `name` elements are removed which follow the same order as in `subtract`. Optional `order_insensitive` property allows to have a standard [set difference]() semantics (A ∖ B). Preserves order from the first list. Both `name` and `subtract` must reference other operation results (e.g., `$expected_variables`, `$dataset_variables`). When `subtract` is empty or missing, returns all elements from `name`. Can be computed and added to output variables to display missing elements in error results. 
+Computes set difference: elements in `name` that are not in `subtract`. By default, standard [set difference]() semantics (A ∖ B) are applied. Setting the optional `order_insensitive` property to `false` makes the operation order-sensitive: only those `name` elements are removed which follow the same order as in `subtract`. Preserves order from the first list. Both `name` and `subtract` must reference other operation results (e.g., `$expected_variables`, `$dataset_variables`). When `subtract` is empty or missing, returns all elements from `name`. Can be computed and added to output variables to display missing elements in error results. ```yaml Operations: @@ -791,7 +791,7 @@ Operations: name: $expected_variables operator: minus subtract: $dataset_variables - order_insensitive: true + order_insensitive: false ``` ### label_referenced_variable_metadata diff --git a/resources/schema/rule/check_parameter.md b/resources/schema/rule/check_parameter.md index fcc525ef1..d80fb4294 100644 --- a/resources/schema/rule/check_parameter.md +++ b/resources/schema/rule/check_parameter.md @@ -279,7 +279,7 @@ Reference to another operation result, used as the second operand in operations ### order_insensitive -Optional boolean parameter for the `minus` operator. When set to `true`, `minus` performs a standard [set difference]() operation, ignoring the order of elements. When `false` or not specified, `minus` considers element order and only removes elements from `name` that follow the same order as in `subtract`. +Optional boolean parameter for the `minus` operator. When set to `true` or not specified, `minus` performs a standard [set difference]() operation, ignoring the order of elements. When set to `false`, `minus` considers element order and only removes elements from `name` that follow the same order as in `subtract`. 
### term_code From d56add487c08ab705f56e7b97a2da07ed7fc5c0d Mon Sep 17 00:00:00 2001 From: Dmitry Date: Mon, 11 May 2026 15:13:51 -0300 Subject: [PATCH 3/4] Fixed an error, as I have updated incorrect default value --- cdisc_rules_engine/models/operation_params.py | 4 ++-- tests/unit/test_operations/test_minus.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/cdisc_rules_engine/models/operation_params.py b/cdisc_rules_engine/models/operation_params.py index 8e319e134..34329030e 100644 --- a/cdisc_rules_engine/models/operation_params.py +++ b/cdisc_rules_engine/models/operation_params.py @@ -59,8 +59,8 @@ class OperationParams: source: str = None target: str = None subtract: str = None - order_insensitive: bool = False - value_is_reference: bool = True + order_insensitive: bool = True + value_is_reference: bool = False namespace: str = None delimiter: str = None define_xml_path: str = None diff --git a/tests/unit/test_operations/test_minus.py b/tests/unit/test_operations/test_minus.py index 6181c240b..4dcae53b3 100644 --- a/tests/unit/test_operations/test_minus.py +++ b/tests/unit/test_operations/test_minus.py @@ -91,6 +91,7 @@ def test_minus_operation_incorrect_order(minus_params: OperationParams, dataset_ } ) + minus_params.order_insensitive = False operation = Minus(minus_params, eval_dataset, MagicMock(), MagicMock()) result = operation.execute() assert list(result[minus_params.operation_id].iloc[0]) == ["DOMAIN", "AEDECOD"] From 2c4a4850ac14399b23acc9b19ecd4b2dbce23f40 Mon Sep 17 00:00:00 2001 From: github-actions Date: Tue, 12 May 2026 18:24:14 +0000 Subject: [PATCH 4/4] Update merged schema files with markdown descriptions --- resources/schema/rule-merged/Operations.json | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/resources/schema/rule-merged/Operations.json b/resources/schema/rule-merged/Operations.json index a93003e72..ed7cce0f9 100644 --- a/resources/schema/rule-merged/Operations.json +++ 
b/resources/schema/rule-merged/Operations.json @@ -267,7 +267,7 @@ "properties": { "operator": { "const": "minus", - "markdownDescription": "\nComputes set difference: elements in `name` that are not in `subtract`. Uses [set difference]() semantics (A \u2216 B). Preserves order from the first list. Both `name` and `subtract` must reference other operation results (e.g., `$expected_variables`, `$dataset_variables`). When `subtract` is empty or missing, returns all elements from `name`. Can be computed and added to output variables to display missing elements in error results.\n\n```yaml\nOperations:\n - id: $expected_variables\n operator: expected_variables\n - id: $dataset_variables\n operator: get_column_order_from_dataset\n - id: $expected_minus_dataset\n name: $expected_variables\n operator: minus\n subtract: $dataset_variables\n```\n" + "markdownDescription": "\nComputes set difference: elements in `name` that are not in `subtract`. By default a standard [set difference]() semantics (A \u2216 B) is applied. Optional `order_insensitive` property allows to have the element order to be taken into consideration and only those `name` elements are removed which follow the same order as in `subtract` . Preserves order from the first list. Both `name` and `subtract` must reference other operation results (e.g., `$expected_variables`, `$dataset_variables`). When `subtract` is empty or missing, returns all elements from `name`. 
Can be computed and added to output variables to display missing elements in error results.\n\n```yaml\nOperations:\n - id: $expected_variables\n operator: expected_variables\n - id: $dataset_variables\n operator: get_column_order_from_dataset\n - id: $expected_minus_dataset\n name: $expected_variables\n operator: minus\n subtract: $dataset_variables\n order_insensitive: false\n```\n" } }, "required": ["id", "operator", "name", "subtract"], @@ -697,6 +697,9 @@ "value_is_reference": { "type": "boolean" }, + "order_insensitive": { + "type": "boolean" + }, "version": { "type": "string" }