Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cdisc_rules_engine/models/operation_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ class OperationParams:
source: str = None
target: str = None
subtract: str = None
order_insensitive: bool = True
value_is_reference: bool = False
namespace: str = None
delimiter: str = None
Expand Down
44 changes: 42 additions & 2 deletions cdisc_rules_engine/operations/minus.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def _normalize_to_list(val):
return [val]


def _set_difference_preserve_order(list_a: list, list_b: list) -> list:
def _set_difference_order_insensitive(list_a: list, list_b: list) -> list:
"""
Compute set difference A \\ B (elements in A not in B).
Preserves order from list_a.
Expand All @@ -26,6 +26,43 @@ def _set_difference_preserve_order(list_a: list, list_b: list) -> list:
return [x for x in _normalize_to_list(list_a) if x not in set_b]


def _set_difference_order_sensitive(list_a: list, list_b: list) -> list:
    """
    Subtract list_b from list_a while honouring the order of list_b.

    The items of list_b are processed in sequence against the part of
    list_a that has not been consumed yet:

    * an item that does not occur in that remaining part is ignored;
    * otherwise every element of list_a located before its first
      occurrence is kept, the occurrence itself (plus any duplicates
      directly following it) is dropped, and the remaining part of list_a
      now starts right after the dropped run.

    Elements of list_a left over after the last dropped run are kept as
    well. The result preserves the order of list_a.

    Example: ["A", "C", "D", "B"] minus ["B", "D"] gives ["A", "C", "D"],
    because no "D" is left in list_a after "B" has been matched.
    """
    source = _normalize_to_list(list_a)
    total = len(source)
    kept = []
    # Index of the first element of A that no subtract item has consumed yet.
    cursor = 0

    for wanted in _normalize_to_list(list_b):
        # A subtract item absent from the unconsumed tail of A removes nothing.
        if wanted not in set(source[cursor:]):
            continue

        position = cursor
        # Everything skipped on the way to the first match stays in the result.
        while position < total and source[position] != wanted:
            kept.append(source[position])
            position += 1

        # Drop the match and any duplicates directly following it; the next
        # subtract item can only match elements located after this run.
        while position < total and source[position] == wanted:
            position += 1
            cursor = position

    # Whatever follows the last consumed match was never subtracted.
    kept.extend(source[cursor:])
    return kept


Comment thread
DmitryMK marked this conversation as resolved.
class Minus(BaseOperation):
"""
Operation that computes set difference: name minus subtract.
Expand All @@ -43,4 +80,7 @@ def _execute_operation(self):
if not subtract_ref or subtract_ref not in self.evaluation_dataset.columns:
return _normalize_to_list(list_a)
list_b = self.evaluation_dataset[subtract_ref].iloc[0]
return _set_difference_preserve_order(list_a, list_b)
if self.params.order_insensitive:
return _set_difference_order_insensitive(list_a, list_b)
else:
return _set_difference_order_sensitive(list_a, list_b)
1 change: 1 addition & 0 deletions cdisc_rules_engine/utilities/rule_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -429,6 +429,7 @@ def perform_rule_operations(
term_pref_term=operation.get("term_pref_term"),
term_value=operation.get("term_value"),
value_is_reference=operation.get("value_is_reference", False),
order_insensitive=operation.get("order_insensitive", True),
)
try:
# execute operation
Expand Down
5 changes: 4 additions & 1 deletion resources/schema/rule-merged/Operations.json
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,7 @@
"properties": {
"operator": {
"const": "minus",
"markdownDescription": "\nComputes set difference: elements in `name` that are not in `subtract`. Uses [set difference](<https://en.wikipedia.org/wiki/Set_(mathematics)#Set_difference>) semantics (A \u2216 B). Preserves order from the first list. Both `name` and `subtract` must reference other operation results (e.g., `$expected_variables`, `$dataset_variables`). When `subtract` is empty or missing, returns all elements from `name`. Can be computed and added to output variables to display missing elements in error results.\n\n```yaml\nOperations:\n - id: $expected_variables\n operator: expected_variables\n - id: $dataset_variables\n operator: get_column_order_from_dataset\n - id: $expected_minus_dataset\n name: $expected_variables\n operator: minus\n subtract: $dataset_variables\n```\n"
"markdownDescription": "\nComputes set difference: elements in `name` that are not in `subtract`. By default, standard [set difference](<https://en.wikipedia.org/wiki/Set_(mathematics)#Set_difference>) semantics (A \u2216 B) are applied. Setting the optional `order_insensitive` property to `false` takes element order into account: only those `name` elements that appear in the same relative order as in `subtract` are removed. Preserves order from the first list. Both `name` and `subtract` must reference other operation results (e.g., `$expected_variables`, `$dataset_variables`). When `subtract` is empty or missing, returns all elements from `name`. Can be computed and added to output variables to display missing elements in error results.\n\n```yaml\nOperations:\n - id: $expected_variables\n operator: expected_variables\n - id: $dataset_variables\n operator: get_column_order_from_dataset\n - id: $expected_minus_dataset\n name: $expected_variables\n operator: minus\n subtract: $dataset_variables\n order_insensitive: false\n```\n"
}
},
"required": ["id", "operator", "name", "subtract"],
Expand Down Expand Up @@ -697,6 +697,9 @@
"value_is_reference": {
"type": "boolean"
},
"order_insensitive": {
"type": "boolean"
},
"version": {
"type": "string"
}
Expand Down
3 changes: 3 additions & 0 deletions resources/schema/rule/Operations.json
Original file line number Diff line number Diff line change
Expand Up @@ -644,6 +644,9 @@
"value_is_reference": {
"type": "boolean"
},
"order_insensitive": {
"type": "boolean"
},
"version": {
"type": "string"
}
Expand Down
3 changes: 2 additions & 1 deletion resources/schema/rule/Operations.md
Original file line number Diff line number Diff line change
Expand Up @@ -779,7 +779,7 @@ Operations:

### minus

Computes set difference: elements in `name` that are not in `subtract`. Uses [set difference](<https://en.wikipedia.org/wiki/Set_(mathematics)#Set_difference>) semantics (A ∖ B). Preserves order from the first list. Both `name` and `subtract` must reference other operation results (e.g., `$expected_variables`, `$dataset_variables`). When `subtract` is empty or missing, returns all elements from `name`. Can be computed and added to output variables to display missing elements in error results.
Computes set difference: elements in `name` that are not in `subtract`. By default, standard [set difference](<https://en.wikipedia.org/wiki/Set_(mathematics)#Set_difference>) semantics (A ∖ B) are applied. Setting the optional `order_insensitive` property to `false` takes element order into account: only those `name` elements that appear in the same relative order as in `subtract` are removed. Preserves order from the first list. Both `name` and `subtract` must reference other operation results (e.g., `$expected_variables`, `$dataset_variables`). When `subtract` is empty or missing, returns all elements from `name`. Can be computed and added to output variables to display missing elements in error results.

```yaml
Operations:
Expand All @@ -791,6 +791,7 @@ Operations:
name: $expected_variables
operator: minus
subtract: $dataset_variables
order_insensitive: false
```

### label_referenced_variable_metadata
Expand Down
4 changes: 4 additions & 0 deletions resources/schema/rule/check_parameter.md
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,10 @@ Reference to another operation result, used as the second operand in operations
subtract: $dataset_variables
```

### order_insensitive

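Optional boolean parameter for the `minus` operator. When set to `true` or not specified, `minus` performs a standard [set difference](<https://en.wikipedia.org/wiki/Set_(mathematics)#Set_difference>) operation, ignoring the order of elements. When set to `false`, `minus` considers element order and only removes elements from `name` that appear in the same relative order as in `subtract`. For example, with `name` = `[A, C, D, B]` and `subtract` = `[B, D]`, the result is `[A, C]` when `order_insensitive` is `true` and `[A, C, D]` when it is `false`, because `D` precedes `B` in `name` but follows it in `subtract`.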

### term_code

Terminology code value used in controlled terminology operations for code-based lookups.
Expand Down
66 changes: 63 additions & 3 deletions tests/unit/test_operations/test_minus.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,11 @@
from cdisc_rules_engine.models.dataset.pandas_dataset import PandasDataset

from cdisc_rules_engine.models.operation_params import OperationParams
from cdisc_rules_engine.operations.minus import Minus, _set_difference_preserve_order
from cdisc_rules_engine.operations.minus import (
Minus,
_set_difference_order_insensitive,
_set_difference_order_sensitive,
)
import pytest


Expand Down Expand Up @@ -32,10 +36,24 @@ def minus_params(operation_params: OperationParams) -> OperationParams:
(["a", "", "b"], [""], ["a", "b"]),
(["a", "", "b"], ["c"], ["a", "", "b"]),
([""], [""], []),
(["A", "C", "D", "B"], ["B", "D"], ["A", "C", "D"]),
(["A", "C", "D", "B"], ["A", "B", "D"], ["C", "D"]),
],
)
def test_set_difference_preserve_order(list_a, list_b, expected):
assert _set_difference_preserve_order(list_a, list_b) == expected
def test_set_difference_order_sensitive(list_a, list_b, expected):
assert _set_difference_order_sensitive(list_a, list_b) == expected


@pytest.mark.parametrize(
    ("list_a", "list_b", "expected"),
    [
        (["c", "b", "a"], ["b"], ["c", "a"]),
        (["A", "C", "D", "B"], ["B", "D"], ["A", "C"]),
        (["A", "C", "D", "B"], ["A", "B", "D"], ["C"]),
    ],
)
def test_set_difference_order_insensitive(list_a, list_b, expected):
    """Every element of list_a that occurs in list_b is removed, wherever it sits."""
    remaining = _set_difference_order_insensitive(list_a, list_b)
    assert remaining == expected


@pytest.mark.parametrize("dataset_type", [PandasDataset, DaskDataset])
Expand All @@ -58,6 +76,48 @@ def test_minus_operation(minus_params: OperationParams, dataset_type):
assert list(result[minus_params.operation_id].iloc[0]) == ["AEDECOD"]


@pytest.mark.parametrize("dataset_type", [PandasDataset, DaskDataset])
def test_minus_operation_incorrect_order(minus_params: OperationParams, dataset_type):
    """With order_insensitive off, a variable found out of order stays in the result."""
    expected_row = ["STUDYID", "DOMAIN", "AESEQ", "AETERM", "AEDECOD"]
    # DOMAIN and AESEQ are swapped relative to the expected order.
    dataset_row = ["STUDYID", "AESEQ", "DOMAIN", "AETERM"]
    eval_dataset = dataset_type.from_dict(
        {
            "$expected_variables": [list(expected_row), list(expected_row)],
            "$dataset_variables": [list(dataset_row), list(dataset_row)],
        }
    )
    minus_params.order_insensitive = False

    result = Minus(minus_params, eval_dataset, MagicMock(), MagicMock()).execute()

    first_row = result[minus_params.operation_id].iloc[0]
    assert list(first_row) == ["DOMAIN", "AEDECOD"]


@pytest.mark.parametrize("dataset_type", [PandasDataset, DaskDataset])
def test_minus_operation_order_insensitive(minus_params: OperationParams, dataset_type):
    """With order_insensitive on, only the variable absent from the dataset remains."""
    expected_row = ["STUDYID", "DOMAIN", "AESEQ", "AETERM", "AEDECOD"]
    # Same variables as expected (minus AEDECOD), with DOMAIN and AESEQ swapped.
    dataset_row = ["STUDYID", "AESEQ", "DOMAIN", "AETERM"]
    eval_dataset = dataset_type.from_dict(
        {
            "$expected_variables": [list(expected_row), list(expected_row)],
            "$dataset_variables": [list(dataset_row), list(dataset_row)],
        }
    )
    minus_params.order_insensitive = True

    result = Minus(minus_params, eval_dataset, MagicMock(), MagicMock()).execute()

    first_row = result[minus_params.operation_id].iloc[0]
    assert list(first_row) == ["AEDECOD"]


@pytest.mark.parametrize("dataset_type", [PandasDataset, DaskDataset])
def test_minus_empty_subtract_returns_all_of_name(
minus_params: OperationParams, dataset_type
Expand Down