Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -97,3 +97,11 @@ Example:
]
}
```

### NormalizeBoolean defaults

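If you use the `normalize_boolean` primitive without specifying `truthy` or
`falsy` lists, the following defaults are applied. String values are matched
case-insensitively after trimming surrounding whitespace (for example,
`" Yes "` matches `"yes"`):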

- truthy: `["true", "t", "yes", "y", "1", 1, true, "on"]`
- falsy: `["false", "f", "no", "n", "0", 0, false, "off", ""]`
4 changes: 3 additions & 1 deletion src/harmonization_framework/harmonization_rule.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from typing import Any, List
from .element import DataElement
from .primitives.base import PrimitiveOperation
from .primitives import PrimitiveVocabulary, Bin, Cast, ConvertDate, ConvertUnits, DoNothing, EnumToEnum, FormatNumber, NormalizeText, Offset, Reduce, Round, Scale, Substitute, Threshold, Truncate
from .primitives import PrimitiveVocabulary, Bin, Cast, ConvertDate, ConvertUnits, DoNothing, EnumToEnum, FormatNumber, NormalizeBoolean, NormalizeText, Offset, Reduce, Round, Scale, Substitute, Threshold, Truncate

import json

Expand Down Expand Up @@ -63,6 +63,8 @@ def from_serialization(cls, serialization):
primitive = EnumToEnum.from_serialization(operation)
case PrimitiveVocabulary.FORMAT_NUMBER.value:
primitive = FormatNumber.from_serialization(operation)
case PrimitiveVocabulary.NORMALIZE_BOOLEAN.value:
primitive = NormalizeBoolean.from_serialization(operation)
case PrimitiveVocabulary.NORMALIZE_TEXT.value:
primitive = NormalizeText.from_serialization(operation)
case PrimitiveVocabulary.OFFSET.value:
Expand Down
1 change: 1 addition & 0 deletions src/harmonization_framework/primitives/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from .donothing import DoNothing
from .enum2enum import EnumToEnum
from .format_number import FormatNumber
from .normalize_boolean import NormalizeBoolean
from .normalize import NormalizeText
from .offset import Offset
from .reduce import Reduce
Expand Down
107 changes: 107 additions & 0 deletions src/harmonization_framework/primitives/normalize_boolean.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
from .base import PrimitiveOperation, support_iterable
from typing import Any, Iterable, List, Optional


class NormalizeBoolean(PrimitiveOperation):
    """
    Normalize common truthy/falsy representations to booleans.

    This primitive is intended for datasets that encode booleans as strings
    or numeric flags (e.g., "Yes", "y", "1", "no", "0").

    Defaults:
        truthy: ["true", "t", "yes", "y", "1", 1, True, "on"]
        falsy: ["false", "f", "no", "n", "0", 0, False, "off", ""]

    If truthy/falsy are not provided, these defaults are used.

    Matching notes:
        - Strings are compared after strip().lower(); every other value is
          compared unchanged.
        - Lookups rely on hashing and equality. Python defines 1 == True and
          0 == False, so 1, 1.0 and True are interchangeable in the lookup,
          as are 0, 0.0 and False.
        - The truthy mapping is consulted before the falsy one, so a value
          configured in both lists normalizes to True.
    """

    DEFAULT_TRUTHY: List[Any] = ["true", "t", "yes", "y", "1", 1, True, "on"]
    DEFAULT_FALSY: List[Any] = ["false", "f", "no", "n", "0", 0, False, "off", ""]

    def __init__(
        self,
        truthy: Optional[Iterable[Any]] = None,
        falsy: Optional[Iterable[Any]] = None,
        strict: bool = True,
        default: Optional[bool] = None,
    ):
        """
        Create a boolean normalization operation.

        Args:
            truthy: Iterable of values treated as True.
                Strings are normalized via strip().lower().
                Defaults to DEFAULT_TRUTHY when None.
            falsy: Iterable of values treated as False.
                Strings are normalized via strip().lower().
                Defaults to DEFAULT_FALSY when None.
            strict: If True, unknown values raise ValueError.
                If False, unknown values return `default`.
            default: Fallback value when strict=False. Often None.
        """
        # Materialize into fresh lists so the instance never aliases the
        # caller's iterable or the class-level default lists.
        self.truthy = list(truthy) if truthy is not None else list(self.DEFAULT_TRUTHY)
        self.falsy = list(falsy) if falsy is not None else list(self.DEFAULT_FALSY)
        self.strict = strict
        self.default = default

        # Precomputed, normalized lookup sets used by transform().
        self._truthy_set = {self._normalize_token(v) for v in self.truthy}
        self._falsy_set = {self._normalize_token(v) for v in self.falsy}

    def __str__(self) -> str:
        return "Normalize boolean-like values"

    def to_dict(self):
        """
        Serialize this operation to a JSON-friendly dict.

        Includes the configured truthy/falsy lists and the strict flag.
        `default` is emitted only when it is not None.

        The lists in the result are copies, so mutating the returned dict
        cannot desynchronize this operation from its lookup sets.
        """
        output = {
            "operation": "normalize_boolean",
            "truthy": list(self.truthy),
            "falsy": list(self.falsy),
            "strict": self.strict,
        }
        if self.default is not None:
            output["default"] = self.default
        return output

    @support_iterable
    def transform(self, value: Any) -> Optional[bool]:
        """
        Convert a single value to True/False based on configured mappings.

        The method is wrapped by `support_iterable`; presumably that is what
        lets a list input yield a list of results (verify in `.base`).

        Returns:
            True or False for recognized values; `default` for unrecognized
            values when strict=False.

        Raises:
            ValueError if strict=True and the value is not recognized.
        """
        token = self._normalize_token(value)
        try:
            if token in self._truthy_set:
                return True
            if token in self._falsy_set:
                return False
        except TypeError:
            # Unhashable inputs (e.g. a dict) can never be members of the
            # lookup sets; treat them as unrecognized so the strict/default
            # contract applies instead of leaking a TypeError.
            pass
        if self.strict:
            raise ValueError(f"Unknown boolean-like value: {value!r}")
        return self.default

    @classmethod
    def from_serialization(cls, serialization):
        """
        Reconstruct a NormalizeBoolean operation from a serialized dict.

        Missing "truthy"/"falsy" keys fall back to the class defaults,
        a missing "strict" key means strict=True, and a missing "default"
        key means default=None. Instantiates `cls`, so subclasses
        round-trip to their own type.
        """
        return cls(
            truthy=serialization.get("truthy"),
            falsy=serialization.get("falsy"),
            strict=bool(serialization.get("strict", True)),
            default=serialization.get("default"),
        )

    @staticmethod
    def _normalize_token(value: Any) -> Any:
        """
        Normalize a token for set membership checks.

        - Strings are trimmed and lowercased.
        - Non-strings are passed through unchanged.
        """
        if isinstance(value, str):
            return value.strip().lower()
        return value
1 change: 1 addition & 0 deletions src/harmonization_framework/primitives/vocabulary.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ class PrimitiveVocabulary(Enum):
DO_NOTHING = "do_nothing"
ENUM_TO_ENUM = "enum_to_enum"
FORMAT_NUMBER = "format_number"
NORMALIZE_BOOLEAN = "normalize_boolean"
NORMALIZE_TEXT = "normalize_text"
OFFSET = "offset"
REDUCE = "reduce"
Expand Down
28 changes: 28 additions & 0 deletions tests/test_primitives_serialization.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
DoNothing,
EnumToEnum,
FormatNumber,
NormalizeBoolean,
NormalizeText,
Offset,
Reduce,
Expand Down Expand Up @@ -137,6 +138,33 @@ def test_enum_to_enum_strict_raises_for_missing_value():
primitive.transform(2)


def test_normalize_boolean_serialization_and_transform():
    """Round-trip a serialized payload and spot-check string normalization."""
    truthy_tokens = ["true", "t", "yes", "y", "1", "on"]
    falsy_tokens = ["false", "f", "no", "n", "0", "off", ""]
    payload = {
        "operation": "normalize_boolean",
        "truthy": truthy_tokens,
        "falsy": falsy_tokens,
        "strict": True,
    }

    roundtrip = NormalizeBoolean.from_serialization(payload)

    # Serialization must reproduce the input payload exactly.
    assert roundtrip.to_dict() == payload

    # Scalar inputs: case and surrounding whitespace are ignored.
    scalar_cases = (("Yes", True), (" n ", False), ("1", True))
    for raw, expected in scalar_cases:
        assert roundtrip.transform(raw) is expected

    # A list input is transformed element by element.
    assert roundtrip.transform(["true", "false"]) == [True, False]


def test_normalize_boolean_strict_raises_on_unknown():
    """With default construction, an unrecognized value raises ValueError."""
    strict_primitive = NormalizeBoolean()
    expect_unknown_error = pytest.raises(
        ValueError, match="Unknown boolean-like value"
    )
    with expect_unknown_error:
        strict_primitive.transform("maybe")


def test_normalize_boolean_non_strict_default():
    """With strict=False and default=None, an unrecognized value maps to None."""
    lenient = NormalizeBoolean(strict=False, default=None)
    outcome = lenient.transform("maybe")
    assert outcome is None


def test_format_number_serialization_and_transform():
payload = {"operation": "format_number", "precision": 2}

Expand Down