diff --git a/README.md b/README.md index 6acbd31..d50c87b 100644 --- a/README.md +++ b/README.md @@ -135,6 +135,7 @@ All settings are provided in the operation dict for rule serialization. | `normalize_boolean` | Normalize truthy/falsy values to booleans. | `truthy` (list, optional; defaults below)
`falsy` (list, optional; defaults below)
`strict` (bool, default `true`)
`default` (optional; used when `strict=false`) | | `normalize_text` | Apply a single text normalization. | `normalization` (`strip`, `lower`, `upper`, `remove_accents`, `remove_punctuation`, `remove_special_characters`) | | `offset` | Add an offset to numeric values. | `offset` (number) | +| `parse_array` | Parse array-like values into a list for downstream operations. | `format` (`json` default, `delimiter`)
`delimiter` (string; used for `delimiter` format, default `\|`; write `\\n` in JSON to split on newlines)
`item_type` (`auto`, `string`, `integer`, `float`, `boolean`)
`strict` (bool, default `true`)
`default` (optional; used when `strict=false`)
`allow_singleton` (bool, default `false`) | | `reduce` | Reduce a list of values to one value. | `reduction` (`any`, `none`, `all`, `one-hot`, `sum`); expects a list/tuple input; one-hot returns index or None | | `round` | Round numeric values to a given precision. | `precision` (int, >=0); uses Python `round` semantics | | `scale` | Multiply numeric values by a factor. | `scaling_factor` (number) | @@ -162,6 +163,7 @@ Each operation is represented by a JSON-friendly dict. Examples: | `normalize_boolean` | `{"operation":"normalize_boolean","truthy":["yes","y","1"],"falsy":["no","n","0"],"strict":true}` | | `normalize_text` | `{"operation":"normalize_text","normalization":"lower"}` | | `offset` | `{"operation":"offset","offset":2.5}` | +| `parse_array` | `{"operation":"parse_array","format":"json","item_type":"integer","strict":true}` | | `reduce` | `{"operation":"reduce","reduction":"one-hot"}` | | `round` | `{"operation":"round","precision":2}` | | `scale` | `{"operation":"scale","scaling_factor":0.453592}` | @@ -176,3 +178,31 @@ If you use the `normalize_boolean` primitive without specifying `truthy` or - truthy: `["true", "t", "yes", "y", "1", 1, true, "on"]` - falsy: `["false", "f", "no", "n", "0", 0, false, "off", ""]` + +### ParseArray + Reduce for CSV data + +When arrays are serialized as text in CSV (for example `"[8,8,8,8,6]"` or +`"8|8|8|8|6"`), chain `parse_array` before `reduce`: + +```json +{ + "source": "week_hours", + "target": "total_hours", + "operations": [ + {"operation": "parse_array", "format": "json", "item_type": "integer", "strict": true}, + {"operation": "reduce", "reduction": "sum"} + ] +} +``` + +For delimiter input, use: + +```json +{"operation": "parse_array", "format": "delimiter", "delimiter": "|", "item_type": "integer"} +``` + +For newline-separated input, use: + +```json +{"operation": "parse_array", "format": "delimiter", "delimiter": "\\n", "item_type": "integer"} +``` diff --git a/demo/primitives_ui/input.csv 
b/demo/primitives_ui/input.csv new file mode 100644 index 0000000..77f690d --- /dev/null +++ b/demo/primitives_ui/input.csv @@ -0,0 +1,4 @@ +age_years,zip_code_text,visit_date_iso,weight_kg,record_id,bmi,smoker_response,city_raw,thermometer_c,week_hours,medication_dose_mg,price_usd,name_last_first,pulse_rate,username +34,02139,2026-02-17,70.5,REC-0001,27.345,Yes," New York ",36.6,"[8,8,8,8,6]",2.675,19.99,"DOE, Jane",220,alexandria +12,60614,2025-12-31,82.0,REC-0002,18.0,No," San Francisco ",37.1,"[10,10,10,10,5]",1.005,3.5,"SMITH, John",35,bo +70,98101,2024-07-04,95.2,REC-0003,31.889,unknown," Austin ",36.2,"[0,0,0,0,0]",0.999,120.0,"LEE, Ada",88,charlie diff --git a/demo/primitives_ui/rules.json b/demo/primitives_ui/rules.json new file mode 100644 index 0000000..1682d60 --- /dev/null +++ b/demo/primitives_ui/rules.json @@ -0,0 +1,219 @@ +{ + "age_years": { + "age_group": { + "source": "age_years", + "target": "age_group", + "operations": [ + { + "operation": "bin", + "bins": [ + {"label": 1, "start": 0, "end": 12}, + {"label": 2, "start": 13, "end": 17}, + {"label": 3, "start": 18, "end": 64}, + {"label": 4, "start": 65, "end": 120} + ] + } + ] + } + }, + "age_group": { + "age_group_label": { + "source": "age_group", + "target": "age_group_label", + "operations": [ + { + "operation": "enum_to_enum", + "mapping": { + "1": "child", + "2": "teen", + "3": "adult", + "4": "senior" + }, + "default": "unknown", + "strict": false + } + ] + } + }, + "zip_code_text": { + "zip_code": { + "source": "zip_code_text", + "target": "zip_code", + "operations": [ + { + "operation": "cast", + "source": "text", + "target": "integer" + } + ] + } + }, + "visit_date_iso": { + "visit_date_us": { + "source": "visit_date_iso", + "target": "visit_date_us", + "operations": [ + { + "operation": "convert_date", + "source_format": "%Y-%m-%d", + "target_format": "%m/%d/%Y" + } + ] + } + }, + "weight_kg": { + "weight_lb": { + "source": "weight_kg", + "target": "weight_lb", + "operations": [ + { + 
"operation": "convert_units", + "source_unit": "kg", + "target_unit": "lb" + } + ] + } + }, + "record_id": { + "record_id_copy": { + "source": "record_id", + "target": "record_id_copy", + "operations": [ + { + "operation": "do_nothing" + } + ] + } + }, + "bmi": { + "bmi_formatted": { + "source": "bmi", + "target": "bmi_formatted", + "operations": [ + { + "operation": "format_number", + "precision": 1 + } + ] + } + }, + "smoker_response": { + "is_smoker": { + "source": "smoker_response", + "target": "is_smoker", + "operations": [ + { + "operation": "normalize_boolean", + "truthy": ["true", "t", "yes", "y", "1", 1, true, "on"], + "falsy": ["false", "f", "no", "n", "0", 0, false, "off", ""], + "strict": false, + "default": null + } + ] + } + }, + "city_raw": { + "city_normalized": { + "source": "city_raw", + "target": "city_normalized", + "operations": [ + { + "operation": "normalize_text", + "normalization": "lower" + } + ] + } + }, + "thermometer_c": { + "calibrated_c": { + "source": "thermometer_c", + "target": "calibrated_c", + "operations": [ + { + "operation": "offset", + "offset": 0.5 + } + ] + } + }, + "week_hours": { + "total_hours": { + "source": "week_hours", + "target": "total_hours", + "operations": [ + { + "operation": "parse_array", + "format": "json", + "item_type": "integer", + "strict": true + }, + { + "operation": "reduce", + "reduction": "sum" + } + ] + } + }, + "medication_dose_mg": { + "medication_dose_mg_rounded": { + "source": "medication_dose_mg", + "target": "medication_dose_mg_rounded", + "operations": [ + { + "operation": "round", + "precision": 2 + } + ] + } + }, + "price_usd": { + "price_cents": { + "source": "price_usd", + "target": "price_cents", + "operations": [ + { + "operation": "scale", + "scaling_factor": 100 + } + ] + } + }, + "name_last_first": { + "name_first_last": { + "source": "name_last_first", + "target": "name_first_last", + "operations": [ + { + "operation": "substitute", + "expression": "^\\s*([^,]+),\\s*(.+)$", + 
"substitution": "\\2 \\1" + } + ] + } + }, + "pulse_rate": { + "pulse_rate_clamped": { + "source": "pulse_rate", + "target": "pulse_rate_clamped", + "operations": [ + { + "operation": "threshold", + "lower": 40, + "upper": 200 + } + ] + } + }, + "username": { + "username_short": { + "source": "username", + "target": "username_short", + "operations": [ + { + "operation": "truncate", + "length": 8 + } + ] + } + } +} diff --git a/src/harmonization_framework/harmonization_rule.py b/src/harmonization_framework/harmonization_rule.py index f2168b1..2926227 100644 --- a/src/harmonization_framework/harmonization_rule.py +++ b/src/harmonization_framework/harmonization_rule.py @@ -1,7 +1,7 @@ from typing import Any, List from .element import DataElement from .primitives.base import PrimitiveOperation -from .primitives import PrimitiveVocabulary, Bin, Cast, ConvertDate, ConvertUnits, DoNothing, EnumToEnum, FormatNumber, NormalizeBoolean, NormalizeText, Offset, Reduce, Round, Scale, Substitute, Threshold, Truncate +from .primitives import PrimitiveVocabulary, Bin, Cast, ConvertDate, ConvertUnits, DoNothing, EnumToEnum, FormatNumber, NormalizeBoolean, NormalizeText, Offset, ParseArray, Reduce, Round, Scale, Substitute, Threshold, Truncate import json @@ -69,6 +69,8 @@ def from_serialization(cls, serialization): primitive = NormalizeText.from_serialization(operation) case PrimitiveVocabulary.OFFSET.value: primitive = Offset.from_serialization(operation) + case PrimitiveVocabulary.PARSE_ARRAY.value: + primitive = ParseArray.from_serialization(operation) case PrimitiveVocabulary.REDUCE.value: primitive = Reduce.from_serialization(operation) case PrimitiveVocabulary.ROUND.value: diff --git a/src/harmonization_framework/primitives/__init__.py b/src/harmonization_framework/primitives/__init__.py index 16acdb4..fb0616a 100644 --- a/src/harmonization_framework/primitives/__init__.py +++ b/src/harmonization_framework/primitives/__init__.py @@ -8,6 +8,7 @@ from .normalize_boolean import 
NormalizeBoolean from .normalize import NormalizeText from .offset import Offset +from .parse_array import ParseArray from .reduce import Reduce from .round_decimal import Round from .scale import Scale diff --git a/src/harmonization_framework/primitives/parse_array.py b/src/harmonization_framework/primitives/parse_array.py new file mode 100644 index 0000000..beae9ac --- /dev/null +++ b/src/harmonization_framework/primitives/parse_array.py @@ -0,0 +1,184 @@
+import copy
+import json
+from typing import Any, List
+
+from .base import PrimitiveOperation
+from .normalize_boolean import NormalizeBoolean
+
+
+class ParseArray(PrimitiveOperation):
+    """
+    Parse array-like values into Python lists.
+
+    Supported formats:
+    - json: parse JSON arrays from strings (default)
+    - delimiter: split strings by a configured delimiter
+
+    Lists and tuples are accepted as-is (copied into a new list) for either
+    format. Each parsed item is then coerced according to ``item_type``.
+
+    When ``strict`` is true, any parsing or coercion failure is re-raised as
+    ``ValueError``; otherwise a copy of ``default`` is returned.
+    """
+
+    SUPPORTED_FORMATS = {"json", "delimiter"}
+    SUPPORTED_ITEM_TYPES = {"auto", "string", "integer", "float", "boolean"}
+
+    def __init__(self, format: str = "json", delimiter: str = "|", item_type: str = "auto", strict: bool = True,
+                 default: Any = None, allow_singleton: bool = False):
+        """
+        Configure the parser.
+
+        Args:
+            format: One of ``SUPPORTED_FORMATS``.
+            delimiter: Separator for the ``delimiter`` format. The
+                two-character sequence backslash + "n" is treated as a newline.
+            item_type: One of ``SUPPORTED_ITEM_TYPES``; ``auto`` leaves items
+                unchanged.
+            strict: Raise ``ValueError`` on failure when true; return
+                ``default`` when false.
+            default: Value returned on failure when ``strict`` is false.
+            allow_singleton: Wrap a non-array input in a one-item list instead
+                of rejecting it (JSON objects are still rejected).
+
+        Raises:
+            ValueError: If ``format`` or ``item_type`` is unsupported, or the
+                delimiter is empty for the ``delimiter`` format.
+            TypeError: If ``delimiter`` is not a string.
+        """
+        super().__init__()
+        if format not in self.SUPPORTED_FORMATS:
+            raise ValueError(f"Unsupported parse_array format: {format}")
+        if item_type not in self.SUPPORTED_ITEM_TYPES:
+            raise ValueError(f"Unsupported parse_array item_type: {item_type}")
+        if not isinstance(delimiter, str):
+            raise TypeError("Delimiter must be a string")
+        if format == "delimiter" and delimiter == "":
+            raise ValueError("Delimiter must be non-empty for delimiter format")
+
+        self.format = format
+        self.delimiter = delimiter
+        self.item_type = item_type
+        self.strict = strict
+        self.default = default
+        self.allow_singleton = allow_singleton
+        # Keep boolean coercion semantics aligned with NormalizeBoolean defaults.
+        self._boolean_normalizer = NormalizeBoolean(strict=True)
+
+    def __str__(self):
+        return f"Parse array using {self.format} format"
+
+    def to_dict(self):
+        """Return the JSON-friendly dict form accepted by ``from_serialization``."""
+        output = {
+            "operation": "parse_array",
+            "format": self.format,
+            "item_type": self.item_type,
+            "strict": self.strict,
+            "allow_singleton": self.allow_singleton,
+        }
+        # ``delimiter`` and ``default`` are emitted only when they carry information.
+        if self.format == "delimiter":
+            output["delimiter"] = self.delimiter
+        if self.default is not None:
+            output["default"] = self.default
+        return output
+
+    def transform(self, value: Any) -> Any:
+        """
+        Parse ``value`` into a list and coerce each item to ``item_type``.
+
+        Returns:
+            The list of coerced items, or a copy of ``default`` when parsing
+            fails and ``strict`` is false.
+
+        Raises:
+            ValueError: If parsing or coercion fails and ``strict`` is true; the
+                underlying error is chained as the cause.
+        """
+        try:
+            items = self._parse_items(value)
+            return [self._coerce_item(item) for item in items]
+        except Exception as exc:
+            # Broad on purpose: JSON decoding, type conversion and the boolean
+            # normalizer can each fail, and all of them count as parse failures.
+            if self.strict:
+                raise ValueError(
+                    f"Failed to parse array from value={value!r} "
+                    f"with format={self.format!r}"
+                ) from exc
+            # Return a copy so a mutable default (e.g. ``[]``) is not shared
+            # between results and cannot be corrupted by downstream mutation.
+            return copy.deepcopy(self.default)
+
+    @classmethod
+    def from_serialization(cls, serialization):
+        """Build an instance from a dict as produced by ``to_dict``; missing keys use defaults."""
+        # Instantiate ``cls`` rather than ``ParseArray`` so subclasses round-trip.
+        return cls(
+            format=serialization.get("format", "json"),
+            delimiter=serialization.get("delimiter", "|"),
+            item_type=serialization.get("item_type", "auto"),
+            strict=bool(serialization.get("strict", True)),
+            default=serialization.get("default"),
+            allow_singleton=bool(serialization.get("allow_singleton", False)),
+        )
+
+    def _parse_items(self, value: Any) -> List[Any]:
+        """Split ``value`` into raw, uncoerced items."""
+        if isinstance(value, (list, tuple)):
+            return list(value)
+
+        if isinstance(value, str):
+            if self.format == "json":
+                return self._parse_json(value)
+            return self._parse_delimiter(value)
+
+        if self.allow_singleton:
+            return [value]
+
+        raise TypeError(f"parse_array expects list/tuple/string input, got {type(value).__name__}")
+
+    def _parse_json(self, value: str) -> List[Any]:
+        """Decode a JSON array; scalars are wrapped only when ``allow_singleton`` is set."""
+        parsed = json.loads(value)
+        if isinstance(parsed, list):
+            return parsed
+        if self.allow_singleton and not isinstance(parsed, dict):
+            return [parsed]
+        raise ValueError("JSON payload is not an array")
+
+    def _parse_delimiter(self, value: str) -> List[str]:
+        """Split on the configured delimiter and strip whitespace around each item."""
+        delimiter = self._resolve_delimiter(self.delimiter)
+        if value == "":
+            return []
+        if delimiter == "\n":
+            # Normalize Windows-style newlines for predictable splitting.
+            value = value.replace("\r\n", "\n")
+        return [item.strip() for item in value.split(delimiter)]
+
+    def _coerce_item(self, item: Any) -> Any:
+        """Convert one item to ``item_type``; conversion errors propagate to ``transform``."""
+        if self.item_type == "auto":
+            return item
+        if self.item_type == "string":
+            return str(item)
+        if self.item_type == "integer":
+            # ``int(8.5)`` would silently truncate to 8; treat that as a failure
+            # instead of losing data. Integral floats such as 8.0 still convert.
+            if isinstance(item, float) and not item.is_integer():
+                raise ValueError(f"Cannot convert non-integral value {item!r} to integer")
+            return int(item)
+        if self.item_type == "float":
+            return float(item)
+        if self.item_type == "boolean":
+            return self._boolean_normalizer.transform(item)
+        return item
+
+    @staticmethod
+    def _resolve_delimiter(delimiter: str) -> str:
+        """Map the escaped two-character form of a newline to a real newline."""
+        if delimiter == "\\n":
+            return "\n"
+        return delimiter
diff --git a/src/harmonization_framework/primitives/vocabulary.py b/src/harmonization_framework/primitives/vocabulary.py index cb95b18..8d65cf3 100644 --- a/src/harmonization_framework/primitives/vocabulary.py +++ b/src/harmonization_framework/primitives/vocabulary.py @@ -11,6 +11,7 @@ class PrimitiveVocabulary(Enum): NORMALIZE_BOOLEAN = "normalize_boolean" NORMALIZE_TEXT = "normalize_text" OFFSET = "offset" + PARSE_ARRAY = "parse_array" REDUCE = "reduce" ROUND = "round" SCALE = "scale" diff --git a/tests/test_cli.py b/tests/test_cli.py index 70dd916..c67a19a 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -137,14 +137,15 @@ def test_cli_missing_behavior(tmp_path, capsys): assert set(reader.fieldnames) == {"c"} assert out_rows == [{"c": "1"}] - # error should raise - with pytest.raises(ValueError, match="Missing source columns"): + # error should surface as argparse failure (SystemExit code 2) + with pytest.raises(SystemExit) as exc: cli.main([ "--rules", str(rules_path), "--input", str(input_path), "--output", str(output_path), "--on-missing", "error", ]) + assert exc.value.code == 2 def test_cli_tsv_autodetect(tmp_path): diff --git a/tests/test_primitives_serialization.py b/tests/test_primitives_serialization.py index 8454c96..d3b0b4c 100644 --- a/tests/test_primitives_serialization.py +++
b/tests/test_primitives_serialization.py @@ -11,6 +11,7 @@ NormalizeBoolean, NormalizeText, Offset, + ParseArray, Reduce, Round, Scale, @@ -352,6 +353,53 @@ def test_reduce_serialization_and_transform(): assert primitive.transform([1, 2, 3]) == 6 +def test_parse_array_json_default_serialization_and_transform(): + primitive = ParseArray() + payload = primitive.to_dict() + + assert payload == { + "operation": "parse_array", + "format": "json", + "item_type": "auto", + "strict": True, + "allow_singleton": False, + } + + roundtrip = ParseArray.from_serialization(payload) + assert roundtrip.to_dict() == payload + assert primitive.transform("[1, 2, 3]") == [1, 2, 3] + assert primitive.transform((1, 2, 3)) == [1, 2, 3] + + +def test_parse_array_delimiter_pipe_and_newline_supported(): + pipe_parser = ParseArray(format="delimiter", delimiter="|", item_type="integer") + newline_parser = ParseArray(format="delimiter", delimiter="\\n", item_type="integer") + + assert pipe_parser.transform("8|8|8|8|6") == [8, 8, 8, 8, 6] + assert pipe_parser.transform(" 8 | 8 | 8 | 8 | 6 ") == [8, 8, 8, 8, 6] + assert newline_parser.transform("1\n2\n3") == [1, 2, 3] + assert newline_parser.transform("1\r\n2\r\n3") == [1, 2, 3] + + +def test_parse_array_non_strict_default_on_failure(): + primitive = ParseArray(strict=False, default=[]) + assert primitive.transform("not valid json") == [] + + +def test_parse_array_boolean_item_type_matches_normalize_boolean_defaults(): + primitive = ParseArray(item_type="boolean") + assert primitive.transform("[\" yes \", \"off\", \"1\", \"0\"]") == [True, False, True, False] + + +def test_parse_array_boolean_item_type_rejects_unknown_like_normalize_boolean(): + strict_primitive = ParseArray(item_type="boolean") + non_strict_primitive = ParseArray(item_type="boolean", strict=False, default=[]) + + with pytest.raises(ValueError, match="Failed to parse array"): + strict_primitive.transform("[2]") + assert non_strict_primitive.transform("[2]") == [] + + def 
test_reduce_rejects_non_list(): primitive = Reduce(Reduction.SUM) with pytest.raises(TypeError, match="list or tuple"): diff --git a/tests/test_rule_serialization.py b/tests/test_rule_serialization.py index 5285da1..7c724f5 100644 --- a/tests/test_rule_serialization.py +++ b/tests/test_rule_serialization.py @@ -2,7 +2,8 @@ import pytest from harmonization_framework.harmonization_rule import HarmonizationRule -from harmonization_framework.primitives import Cast, DoNothing, Round +from harmonization_framework.primitives import Cast, DoNothing, ParseArray, Reduce, Round +from harmonization_framework.primitives.reduce import Reduction def test_rule_serializes_with_empty_operations(): @@ -50,3 +51,15 @@ def test_rule_from_serialization_unknown_operation_raises(): def test_rule_transform_with_do_nothing(): rule = HarmonizationRule("x", "y", [DoNothing()]) assert rule.transform("abc") == "abc" + + +def test_rule_with_parse_array_then_reduce_sum(): + rule = HarmonizationRule( + "week_hours", + "total_hours", + [ParseArray(), Reduce(Reduction.SUM)], + ) + payload = rule.serialize() + + roundtrip = HarmonizationRule.from_serialization(payload) + assert roundtrip.transform("[8, 8, 8, 8, 6]") == 38