From eae04c37297a20915e45368106cc435387063c3f Mon Sep 17 00:00:00 2001 From: Matthew Horridge Date: Tue, 27 Jan 2026 12:41:53 -0800 Subject: [PATCH] Add format_number primitive for fixed decimal formatting Introduce FormatNumber to format numeric values as strings with a fixed number of decimal places. Expose the primitive in the vocabulary and rule deserializer. Add tests for round-trip serialization, formatting, and validation. Update the harmonize demo to use format_number for weight_kg output. Closes #76. --- demo/harmonize_example/README.md | 2 +- demo/harmonize_example/output.csv | 2 +- demo/harmonize_example/rules.json | 7 +--- .../harmonization_rule.py | 4 +- .../primitives/__init__.py | 1 + .../primitives/format_number.py | 38 +++++++++++++++++++ .../primitives/vocabulary.py | 1 + tests/test_primitives_serialization.py | 24 ++++++++++++ 8 files changed, 70 insertions(+), 9 deletions(-) create mode 100644 src/harmonization_framework/primitives/format_number.py diff --git a/demo/harmonize_example/README.md b/demo/harmonize_example/README.md index a408d57..bc932d3 100644 --- a/demo/harmonize_example/README.md +++ b/demo/harmonize_example/README.md @@ -11,7 +11,7 @@ It includes a small input CSV, a rules JSON file, and a Python script that perfo ## What the example does - Renames `age` to `age_years` (pass-through). -- Converts `weight_lbs` to `weight_kg` (multiply by 0.453592). +- Converts `weight_lbs` to `weight_kg` (multiply by 0.453592) and formats to two decimals. 
- Splits `name` (stored as `"Last, First"`) into two new columns: - `given_name` - `family_name` diff --git a/demo/harmonize_example/output.csv b/demo/harmonize_example/output.csv index cf0fc3c..a400265 100644 --- a/demo/harmonize_example/output.csv +++ b/demo/harmonize_example/output.csv @@ -1,4 +1,4 @@ given_name,family_name,age_years,weight_kg,visit_type_label,source dataset,original_id -Alice,Smith,10,35.5,baseline,demo,0 +Alice,Smith,10,35.50,baseline,demo,0 Bob,Jones,5,20.18,follow_up,demo,1 Carol,Nguyen,8,41.82,screening,demo,2 diff --git a/demo/harmonize_example/rules.json b/demo/harmonize_example/rules.json index b67bbae..3637cc4 100644 --- a/demo/harmonize_example/rules.json +++ b/demo/harmonize_example/rules.json @@ -16,13 +16,8 @@ "scaling_factor": 0.453592 }, { - "operation": "round", + "operation": "format_number", "precision": 2 - }, - { - "operation": "cast", - "source": "float", - "target": "text" } ] } diff --git a/src/harmonization_framework/harmonization_rule.py b/src/harmonization_framework/harmonization_rule.py index c9dfba5..8527b10 100644 --- a/src/harmonization_framework/harmonization_rule.py +++ b/src/harmonization_framework/harmonization_rule.py @@ -1,7 +1,7 @@ from typing import Any, List from .element import DataElement from .primitives.base import PrimitiveOperation -from .primitives import PrimitiveVocabulary, Bin, Cast, ConvertDate, ConvertUnits, DoNothing, EnumToEnum, NormalizeText, Offset, Reduce, Round, Scale, Substitute, Threshold, Truncate +from .primitives import PrimitiveVocabulary, Bin, Cast, ConvertDate, ConvertUnits, DoNothing, EnumToEnum, FormatNumber, NormalizeText, Offset, Reduce, Round, Scale, Substitute, Threshold, Truncate import json @@ -61,6 +61,8 @@ def from_serialization(cls, serialization): primitive = DoNothing.from_serialization(operation) case PrimitiveVocabulary.ENUM_TO_ENUM.value: primitive = EnumToEnum.from_serialization(operation) + case PrimitiveVocabulary.FORMAT_NUMBER.value: + primitive = 
FormatNumber.from_serialization(operation) case PrimitiveVocabulary.NORMALIZE_TEXT.value: primitive = NormalizeText.from_serialization(operation) case PrimitiveVocabulary.OFFSET.value: diff --git a/src/harmonization_framework/primitives/__init__.py b/src/harmonization_framework/primitives/__init__.py index 7937ad4..6aefb5b 100644 --- a/src/harmonization_framework/primitives/__init__.py +++ b/src/harmonization_framework/primitives/__init__.py @@ -4,6 +4,7 @@ from .dates import ConvertDate from .donothing import DoNothing from .enum2enum import EnumToEnum +from .format_number import FormatNumber from .normalize import NormalizeText from .offset import Offset from .reduce import Reduce diff --git a/src/harmonization_framework/primitives/format_number.py b/src/harmonization_framework/primitives/format_number.py new file mode 100644 index 0000000..c7f2943 --- /dev/null +++ b/src/harmonization_framework/primitives/format_number.py @@ -0,0 +1,38 @@ +from .base import PrimitiveOperation, support_iterable +from typing import Union + +class FormatNumber(PrimitiveOperation): + """ + Format numeric values to a fixed number of decimal places. + + Output is a string, intended for stable presentation (e.g., CSV output). 
+ """ + def __init__(self, precision: int): + if not isinstance(precision, int): + raise TypeError(f"Precision must be an integer, got {type(precision).__name__}") + if precision < 0: + raise ValueError("Precision must be non-negative") + self.precision = precision + + def __str__(self): + return f"Format number to {self.precision} decimal places" + + def to_dict(self): + """Serialize this operation to a JSON-friendly dict.""" + return { + "operation": "format_number", + "precision": self.precision, + } + + @support_iterable + def transform(self, value: Union[int, float]) -> str: + """Format the numeric value to the configured decimal precision.""" + if not isinstance(value, (int, float)) or isinstance(value, bool): + raise TypeError(f"FormatNumber expects a numeric value, got {type(value).__name__}") + return f"{value:.{self.precision}f}" + + @classmethod + def from_serialization(cls, serialization): + """Reconstruct a FormatNumber operation from a serialized dict.""" + precision = int(serialization["precision"]) + return FormatNumber(precision) diff --git a/src/harmonization_framework/primitives/vocabulary.py b/src/harmonization_framework/primitives/vocabulary.py index b6e6a80..cdc8005 100644 --- a/src/harmonization_framework/primitives/vocabulary.py +++ b/src/harmonization_framework/primitives/vocabulary.py @@ -7,6 +7,7 @@ class PrimitiveVocabulary(Enum): CONVERT_UNITS = "convert_units" DO_NOTHING = "do_nothing" ENUM_TO_ENUM = "enum_to_enum" + FORMAT_NUMBER = "format_number" NORMALIZE_TEXT = "normalize_text" OFFSET = "offset" REDUCE = "reduce" diff --git a/tests/test_primitives_serialization.py b/tests/test_primitives_serialization.py index 6edda9d..e236eab 100644 --- a/tests/test_primitives_serialization.py +++ b/tests/test_primitives_serialization.py @@ -7,6 +7,7 @@ ConvertUnits, DoNothing, EnumToEnum, + FormatNumber, NormalizeText, Offset, Reduce, @@ -136,6 +137,29 @@ def test_enum_to_enum_strict_raises_for_missing_value(): primitive.transform(2) +def 
test_format_number_serialization_and_transform(): + payload = {"operation": "format_number", "precision": 2} + + roundtrip = FormatNumber.from_serialization(payload) + assert roundtrip.to_dict() == payload + assert roundtrip.transform(35.5) == "35.50" + assert roundtrip.transform(3) == "3.00" + assert roundtrip.transform([1.234, 2]) == ["1.23", "2.00"] + + +def test_format_number_rejects_invalid_precision(): + with pytest.raises(TypeError, match="Precision must be an integer"): + FormatNumber("2") # type: ignore[arg-type] + with pytest.raises(ValueError, match="non-negative"): + FormatNumber(-1) + + +def test_format_number_rejects_non_numeric(): + primitive = FormatNumber(2) + with pytest.raises(TypeError, match="numeric"): + primitive.transform("nope") + + def test_round_serialization_and_transform(): primitive = Round(2) payload = primitive.to_dict()