diff --git a/README.md b/README.md
index 6acbd31..d50c87b 100644
--- a/README.md
+++ b/README.md
@@ -135,6 +135,7 @@ All settings are provided in the operation dict for rule serialization.
| `normalize_boolean` | Normalize truthy/falsy values to booleans. | `truthy` (list, optional; defaults below)
`falsy` (list, optional; defaults below)
`strict` (bool, default `true`)
`default` (optional; used when `strict=false`) |
| `normalize_text` | Apply a single text normalization. | `normalization` (`strip`, `lower`, `upper`, `remove_accents`, `remove_punctuation`, `remove_special_characters`) |
| `offset` | Add an offset to numeric values. | `offset` (number) |
+| `parse_array` | Parse array-like values into a list for downstream operations. | `format` (`json` default, `delimiter`)
`delimiter` (string; used for `delimiter` format, default `\|`; the escaped two-character sequence `\\n`, as written in JSON, is treated as a newline)
`item_type` (`auto` default, `string`, `integer`, `float`, `boolean`)
`strict` (bool, default `true`)
`default` (optional; used when `strict=false`)
`allow_singleton` (bool, default `false`) |
| `reduce` | Reduce a list of values to one value. | `reduction` (`any`, `none`, `all`, `one-hot`, `sum`); expects a list/tuple input; one-hot returns index or None |
| `round` | Round numeric values to a given precision. | `precision` (int, >=0); uses Python `round` semantics |
| `scale` | Multiply numeric values by a factor. | `scaling_factor` (number) |
@@ -162,6 +163,7 @@ Each operation is represented by a JSON-friendly dict. Examples:
| `normalize_boolean` | `{"operation":"normalize_boolean","truthy":["yes","y","1"],"falsy":["no","n","0"],"strict":true}` |
| `normalize_text` | `{"operation":"normalize_text","normalization":"lower"}` |
| `offset` | `{"operation":"offset","offset":2.5}` |
+| `parse_array` | `{"operation":"parse_array","format":"json","item_type":"integer","strict":true}` |
| `reduce` | `{"operation":"reduce","reduction":"one-hot"}` |
| `round` | `{"operation":"round","precision":2}` |
| `scale` | `{"operation":"scale","scaling_factor":0.453592}` |
@@ -176,3 +178,31 @@ If you use the `normalize_boolean` primitive without specifying `truthy` or
- truthy: `["true", "t", "yes", "y", "1", 1, true, "on"]`
- falsy: `["false", "f", "no", "n", "0", 0, false, "off", ""]`
+
+### ParseArray + Reduce for CSV data
+
+When arrays are serialized as text in CSV (for example `"[8,8,8,8,6]"` or
+`"8|8|8|8|6"`), chain `parse_array` before `reduce`:
+
+```json
+{
+ "source": "week_hours",
+ "target": "total_hours",
+ "operations": [
+ {"operation": "parse_array", "format": "json", "item_type": "integer", "strict": true},
+ {"operation": "reduce", "reduction": "sum"}
+ ]
+}
+```
+
+For delimiter input, use:
+
+```json
+{"operation": "parse_array", "format": "delimiter", "delimiter": "|", "item_type": "integer"}
+```
+
+For newline-separated input, use:
+
+```json
+{"operation": "parse_array", "format": "delimiter", "delimiter": "\\n", "item_type": "integer"}
+```
diff --git a/demo/primitives_ui/input.csv b/demo/primitives_ui/input.csv
new file mode 100644
index 0000000..77f690d
--- /dev/null
+++ b/demo/primitives_ui/input.csv
@@ -0,0 +1,4 @@
+age_years,zip_code_text,visit_date_iso,weight_kg,record_id,bmi,smoker_response,city_raw,thermometer_c,week_hours,medication_dose_mg,price_usd,name_last_first,pulse_rate,username
+34,02139,2026-02-17,70.5,REC-0001,27.345,Yes," New York ",36.6,"[8,8,8,8,6]",2.675,19.99,"DOE, Jane",220,alexandria
+12,60614,2025-12-31,82.0,REC-0002,18.0,No," San Francisco ",37.1,"[10,10,10,10,5]",1.005,3.5,"SMITH, John",35,bo
+70,98101,2024-07-04,95.2,REC-0003,31.889,unknown," Austin ",36.2,"[0,0,0,0,0]",0.999,120.0,"LEE, Ada",88,charlie
diff --git a/demo/primitives_ui/rules.json b/demo/primitives_ui/rules.json
new file mode 100644
index 0000000..1682d60
--- /dev/null
+++ b/demo/primitives_ui/rules.json
@@ -0,0 +1,219 @@
+{
+ "age_years": {
+ "age_group": {
+ "source": "age_years",
+ "target": "age_group",
+ "operations": [
+ {
+ "operation": "bin",
+ "bins": [
+ {"label": 1, "start": 0, "end": 12},
+ {"label": 2, "start": 13, "end": 17},
+ {"label": 3, "start": 18, "end": 64},
+ {"label": 4, "start": 65, "end": 120}
+ ]
+ }
+ ]
+ }
+ },
+ "age_group": {
+ "age_group_label": {
+ "source": "age_group",
+ "target": "age_group_label",
+ "operations": [
+ {
+ "operation": "enum_to_enum",
+ "mapping": {
+ "1": "child",
+ "2": "teen",
+ "3": "adult",
+ "4": "senior"
+ },
+ "default": "unknown",
+ "strict": false
+ }
+ ]
+ }
+ },
+ "zip_code_text": {
+ "zip_code": {
+ "source": "zip_code_text",
+ "target": "zip_code",
+ "operations": [
+ {
+ "operation": "cast",
+ "source": "text",
+ "target": "integer"
+ }
+ ]
+ }
+ },
+ "visit_date_iso": {
+ "visit_date_us": {
+ "source": "visit_date_iso",
+ "target": "visit_date_us",
+ "operations": [
+ {
+ "operation": "convert_date",
+ "source_format": "%Y-%m-%d",
+ "target_format": "%m/%d/%Y"
+ }
+ ]
+ }
+ },
+ "weight_kg": {
+ "weight_lb": {
+ "source": "weight_kg",
+ "target": "weight_lb",
+ "operations": [
+ {
+ "operation": "convert_units",
+ "source_unit": "kg",
+ "target_unit": "lb"
+ }
+ ]
+ }
+ },
+ "record_id": {
+ "record_id_copy": {
+ "source": "record_id",
+ "target": "record_id_copy",
+ "operations": [
+ {
+ "operation": "do_nothing"
+ }
+ ]
+ }
+ },
+ "bmi": {
+ "bmi_formatted": {
+ "source": "bmi",
+ "target": "bmi_formatted",
+ "operations": [
+ {
+ "operation": "format_number",
+ "precision": 1
+ }
+ ]
+ }
+ },
+ "smoker_response": {
+ "is_smoker": {
+ "source": "smoker_response",
+ "target": "is_smoker",
+ "operations": [
+ {
+ "operation": "normalize_boolean",
+ "truthy": ["true", "t", "yes", "y", "1", 1, true, "on"],
+ "falsy": ["false", "f", "no", "n", "0", 0, false, "off", ""],
+ "strict": false,
+ "default": null
+ }
+ ]
+ }
+ },
+ "city_raw": {
+ "city_normalized": {
+ "source": "city_raw",
+ "target": "city_normalized",
+ "operations": [
+ {
+ "operation": "normalize_text",
+ "normalization": "lower"
+ }
+ ]
+ }
+ },
+ "thermometer_c": {
+ "calibrated_c": {
+ "source": "thermometer_c",
+ "target": "calibrated_c",
+ "operations": [
+ {
+ "operation": "offset",
+ "offset": 0.5
+ }
+ ]
+ }
+ },
+ "week_hours": {
+ "total_hours": {
+ "source": "week_hours",
+ "target": "total_hours",
+ "operations": [
+ {
+ "operation": "parse_array",
+ "format": "json",
+ "item_type": "integer",
+ "strict": true
+ },
+ {
+ "operation": "reduce",
+ "reduction": "sum"
+ }
+ ]
+ }
+ },
+ "medication_dose_mg": {
+ "medication_dose_mg_rounded": {
+ "source": "medication_dose_mg",
+ "target": "medication_dose_mg_rounded",
+ "operations": [
+ {
+ "operation": "round",
+ "precision": 2
+ }
+ ]
+ }
+ },
+ "price_usd": {
+ "price_cents": {
+ "source": "price_usd",
+ "target": "price_cents",
+ "operations": [
+ {
+ "operation": "scale",
+ "scaling_factor": 100
+ }
+ ]
+ }
+ },
+ "name_last_first": {
+ "name_first_last": {
+ "source": "name_last_first",
+ "target": "name_first_last",
+ "operations": [
+ {
+ "operation": "substitute",
+ "expression": "^\\s*([^,]+),\\s*(.+)$",
+ "substitution": "\\2 \\1"
+ }
+ ]
+ }
+ },
+ "pulse_rate": {
+ "pulse_rate_clamped": {
+ "source": "pulse_rate",
+ "target": "pulse_rate_clamped",
+ "operations": [
+ {
+ "operation": "threshold",
+ "lower": 40,
+ "upper": 200
+ }
+ ]
+ }
+ },
+ "username": {
+ "username_short": {
+ "source": "username",
+ "target": "username_short",
+ "operations": [
+ {
+ "operation": "truncate",
+ "length": 8
+ }
+ ]
+ }
+ }
+}
diff --git a/src/harmonization_framework/harmonization_rule.py b/src/harmonization_framework/harmonization_rule.py
index f2168b1..2926227 100644
--- a/src/harmonization_framework/harmonization_rule.py
+++ b/src/harmonization_framework/harmonization_rule.py
@@ -1,7 +1,7 @@
from typing import Any, List
from .element import DataElement
from .primitives.base import PrimitiveOperation
-from .primitives import PrimitiveVocabulary, Bin, Cast, ConvertDate, ConvertUnits, DoNothing, EnumToEnum, FormatNumber, NormalizeBoolean, NormalizeText, Offset, Reduce, Round, Scale, Substitute, Threshold, Truncate
+from .primitives import PrimitiveVocabulary, Bin, Cast, ConvertDate, ConvertUnits, DoNothing, EnumToEnum, FormatNumber, NormalizeBoolean, NormalizeText, Offset, ParseArray, Reduce, Round, Scale, Substitute, Threshold, Truncate
import json
@@ -69,6 +69,8 @@ def from_serialization(cls, serialization):
primitive = NormalizeText.from_serialization(operation)
case PrimitiveVocabulary.OFFSET.value:
primitive = Offset.from_serialization(operation)
+ case PrimitiveVocabulary.PARSE_ARRAY.value:
+ primitive = ParseArray.from_serialization(operation)
case PrimitiveVocabulary.REDUCE.value:
primitive = Reduce.from_serialization(operation)
case PrimitiveVocabulary.ROUND.value:
diff --git a/src/harmonization_framework/primitives/__init__.py b/src/harmonization_framework/primitives/__init__.py
index 16acdb4..fb0616a 100644
--- a/src/harmonization_framework/primitives/__init__.py
+++ b/src/harmonization_framework/primitives/__init__.py
@@ -8,6 +8,7 @@
from .normalize_boolean import NormalizeBoolean
from .normalize import NormalizeText
from .offset import Offset
+from .parse_array import ParseArray
from .reduce import Reduce
from .round_decimal import Round
from .scale import Scale
diff --git a/src/harmonization_framework/primitives/parse_array.py b/src/harmonization_framework/primitives/parse_array.py
new file mode 100644
index 0000000..beae9ac
--- /dev/null
+++ b/src/harmonization_framework/primitives/parse_array.py
@@ -0,0 +1,161 @@
+import copy
+import json
+from typing import Any, List
+
+from .base import PrimitiveOperation
+from .normalize_boolean import NormalizeBoolean
+
+
+class ParseArray(PrimitiveOperation):
+    """
+    Parse array-like values into Python lists.
+
+    Supported formats:
+    - json: parse JSON arrays from strings (default)
+    - delimiter: split strings by a configured delimiter
+
+    Lists and tuples are accepted as-is whatever the format. Each item is then
+    coerced according to ``item_type`` (``auto`` leaves items untouched).
+    With ``strict=False`` any parsing or coercion failure yields ``default``.
+    """
+
+    SUPPORTED_FORMATS = {"json", "delimiter"}
+    SUPPORTED_ITEM_TYPES = {"auto", "string", "integer", "float", "boolean"}
+
+    def __init__(self, format: str = "json", delimiter: str = "|", item_type: str = "auto", strict: bool = True,
+                 default: Any = None, allow_singleton: bool = False):
+        """
+        Validate and store the parsing configuration.
+
+        Raises ValueError for an unsupported ``format`` or ``item_type`` or an
+        empty delimiter in delimiter format, and TypeError when ``delimiter``
+        is not a string.
+        """
+        super().__init__()
+        if format not in self.SUPPORTED_FORMATS:
+            raise ValueError(f"Unsupported parse_array format: {format}")
+        if item_type not in self.SUPPORTED_ITEM_TYPES:
+            raise ValueError(f"Unsupported parse_array item_type: {item_type}")
+        if not isinstance(delimiter, str):
+            raise TypeError("Delimiter must be a string")
+        if format == "delimiter" and delimiter == "":
+            raise ValueError("Delimiter must be non-empty for delimiter format")
+
+        self.format = format
+        self.delimiter = delimiter
+        self.item_type = item_type
+        self.strict = strict
+        self.default = default
+        self.allow_singleton = allow_singleton
+        # Keep boolean coercion semantics aligned with NormalizeBoolean defaults.
+        self._boolean_normalizer = NormalizeBoolean(strict=True)
+
+    def __str__(self):
+        return f"Parse array using {self.format} format"
+
+    def to_dict(self):
+        """Return the JSON-friendly rule dict; optional keys appear only when relevant."""
+        output = {
+            "operation": "parse_array",
+            "format": self.format,
+            "item_type": self.item_type,
+            "strict": self.strict,
+            "allow_singleton": self.allow_singleton,
+        }
+        if self.format == "delimiter":
+            output["delimiter"] = self.delimiter
+        if self.default is not None:
+            output["default"] = self.default
+        return output
+
+    def transform(self, value: Any) -> Any:
+        """
+        Parse ``value`` into a list and coerce every item to ``item_type``.
+
+        In strict mode any failure is re-raised as ValueError (chained to the
+        original error); otherwise a copy of ``default`` is returned.
+        """
+        try:
+            items = self._parse_items(value)
+            return [self._coerce_item(item) for item in items]
+        except Exception as exc:
+            if self.strict:
+                raise ValueError(
+                    f"Failed to parse array from value={value!r} "
+                    f"with format={self.format!r}"
+                ) from exc
+            # Return a copy: a mutable default (e.g. []) would otherwise be the
+            # same object on every failure and could be mutated downstream.
+            return copy.deepcopy(self.default)
+
+    @classmethod
+    def from_serialization(cls, serialization):
+        """Build an instance from a rule dict, applying defaults for missing keys."""
+        # Use cls rather than the hard-coded class name so subclasses round-trip.
+        return cls(
+            format=serialization.get("format", "json"),
+            delimiter=serialization.get("delimiter", "|"),
+            item_type=serialization.get("item_type", "auto"),
+            strict=bool(serialization.get("strict", True)),
+            default=serialization.get("default"),
+            allow_singleton=bool(serialization.get("allow_singleton", False)),
+        )
+
+    def _parse_items(self, value: Any) -> List[Any]:
+        """Return the raw (uncoerced) items for a list, tuple, or string input."""
+        if isinstance(value, (list, tuple)):
+            return list(value)
+
+        if isinstance(value, str):
+            if self.format == "json":
+                return self._parse_json(value)
+            return self._parse_delimiter(value)
+
+        if self.allow_singleton:
+            return [value]
+
+        raise TypeError(f"parse_array expects list/tuple/string input, got {type(value).__name__}")
+
+    def _parse_json(self, value: str) -> List[Any]:
+        """Decode a JSON array; other non-object values are wrapped only if allow_singleton is set."""
+        parsed = json.loads(value)
+        if isinstance(parsed, list):
+            return parsed
+        if self.allow_singleton and not isinstance(parsed, dict):
+            return [parsed]
+        raise ValueError("JSON payload is not an array")
+
+    def _parse_delimiter(self, value: str) -> List[str]:
+        """Split on the configured delimiter and strip whitespace around items."""
+        delimiter = self._resolve_delimiter(self.delimiter)
+        if value == "":
+            return []
+        if delimiter == "\n":
+            # Normalize Windows-style newlines for predictable splitting.
+            value = value.replace("\r\n", "\n")
+        return [item.strip() for item in value.split(delimiter)]
+
+    def _coerce_item(self, item: Any) -> Any:
+        """Convert one item to ``item_type``; conversion errors propagate to transform."""
+        if self.item_type == "auto":
+            return item
+        if self.item_type == "string":
+            return str(item)
+        if self.item_type == "integer":
+            # int() silently truncates floats (int(7.9) == 7); refuse lossy
+            # conversions rather than corrupt the data.
+            if isinstance(item, float) and not item.is_integer():
+                raise ValueError(f"Cannot convert non-integral value {item!r} to integer")
+            return int(item)
+        if self.item_type == "float":
+            return float(item)
+        if self.item_type == "boolean":
+            return self._boolean_normalizer.transform(item)
+        return item
+
+    @staticmethod
+    def _resolve_delimiter(delimiter: str) -> str:
+        """Map the escaped two-character sequence backslash-n to a real newline."""
+        if delimiter == "\\n":
+            return "\n"
+        return delimiter
diff --git a/src/harmonization_framework/primitives/vocabulary.py b/src/harmonization_framework/primitives/vocabulary.py
index cb95b18..8d65cf3 100644
--- a/src/harmonization_framework/primitives/vocabulary.py
+++ b/src/harmonization_framework/primitives/vocabulary.py
@@ -11,6 +11,7 @@ class PrimitiveVocabulary(Enum):
NORMALIZE_BOOLEAN = "normalize_boolean"
NORMALIZE_TEXT = "normalize_text"
OFFSET = "offset"
+ PARSE_ARRAY = "parse_array"
REDUCE = "reduce"
ROUND = "round"
SCALE = "scale"
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 70dd916..c67a19a 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -137,14 +137,15 @@ def test_cli_missing_behavior(tmp_path, capsys):
assert set(reader.fieldnames) == {"c"}
assert out_rows == [{"c": "1"}]
- # error should raise
- with pytest.raises(ValueError, match="Missing source columns"):
+ # error should surface as argparse failure (SystemExit code 2)
+ with pytest.raises(SystemExit) as exc:
cli.main([
"--rules", str(rules_path),
"--input", str(input_path),
"--output", str(output_path),
"--on-missing", "error",
])
+ assert exc.value.code == 2
def test_cli_tsv_autodetect(tmp_path):
diff --git a/tests/test_primitives_serialization.py b/tests/test_primitives_serialization.py
index 8454c96..d3b0b4c 100644
--- a/tests/test_primitives_serialization.py
+++ b/tests/test_primitives_serialization.py
@@ -11,6 +11,7 @@
NormalizeBoolean,
NormalizeText,
Offset,
+ ParseArray,
Reduce,
Round,
Scale,
@@ -352,6 +353,53 @@ def test_reduce_serialization_and_transform():
assert primitive.transform([1, 2, 3]) == 6
+def test_parse_array_json_default_serialization_and_transform():
+    """A default ParseArray serializes to the JSON-format payload and round-trips."""
+    parser = ParseArray()
+    expected_payload = dict(
+        operation="parse_array",
+        format="json",
+        item_type="auto",
+        strict=True,
+        allow_singleton=False,
+    )
+
+    serialized = parser.to_dict()
+    assert serialized == expected_payload
+    assert ParseArray.from_serialization(serialized).to_dict() == serialized
+
+    # JSON text and an already-materialized tuple both come back as a list.
+    for raw in ("[1, 2, 3]", (1, 2, 3)):
+        assert parser.transform(raw) == [1, 2, 3]
+
+
+def test_parse_array_delimiter_pipe_and_newline_supported():
+    """Pipe- and newline-delimited strings split into integer lists."""
+    by_pipe = ParseArray(format="delimiter", delimiter="|", item_type="integer")
+    by_newline = ParseArray(format="delimiter", delimiter="\\n", item_type="integer")
+
+    # Surrounding whitespace around items must not affect the result.
+    for raw in ("8|8|8|8|6", " 8 | 8 | 8 | 8 | 6 "):
+        assert by_pipe.transform(raw) == [8, 8, 8, 8, 6]
+
+    # Unix and Windows line endings are both accepted.
+    for raw in ("1\n2\n3", "1\r\n2\r\n3"):
+        assert by_newline.transform(raw) == [1, 2, 3]
+
+
+def test_parse_array_non_strict_default_on_failure():
+    """With strict disabled, unparseable input yields the configured default."""
+    lenient = ParseArray(strict=False, default=[])
+    result = lenient.transform("not valid json")
+    assert result == []
+
+
+def test_parse_array_boolean_item_type_matches_normalize_boolean_defaults():
+    """Boolean item coercion maps truthy/falsy JSON strings to real booleans."""
+    parser = ParseArray(item_type="boolean")
+    raw = '[" yes ", "off", "1", "0"]'
+    assert parser.transform(raw) == [True, False, True, False]
+
+
+def test_parse_array_boolean_item_type_rejects_unknown_like_normalize_boolean():
+    """An unrecognized boolean item fails in strict mode and falls back to default otherwise."""
+    unknown_item = "[2]"
+
+    with pytest.raises(ValueError, match="Failed to parse array"):
+        ParseArray(item_type="boolean").transform(unknown_item)
+
+    lenient = ParseArray(item_type="boolean", strict=False, default=[])
+    assert lenient.transform(unknown_item) == []
+
+
def test_reduce_rejects_non_list():
primitive = Reduce(Reduction.SUM)
with pytest.raises(TypeError, match="list or tuple"):
diff --git a/tests/test_rule_serialization.py b/tests/test_rule_serialization.py
index 5285da1..7c724f5 100644
--- a/tests/test_rule_serialization.py
+++ b/tests/test_rule_serialization.py
@@ -2,7 +2,8 @@
import pytest
from harmonization_framework.harmonization_rule import HarmonizationRule
-from harmonization_framework.primitives import Cast, DoNothing, Round
+from harmonization_framework.primitives import Cast, DoNothing, ParseArray, Reduce, Round
+from harmonization_framework.primitives.reduce import Reduction
def test_rule_serializes_with_empty_operations():
@@ -50,3 +51,15 @@ def test_rule_from_serialization_unknown_operation_raises():
def test_rule_transform_with_do_nothing():
rule = HarmonizationRule("x", "y", [DoNothing()])
assert rule.transform("abc") == "abc"
+
+
+def test_rule_with_parse_array_then_reduce_sum():
+    """A parse_array -> reduce(sum) rule survives serialization and still sums the array."""
+    operations = [ParseArray(), Reduce(Reduction.SUM)]
+    original = HarmonizationRule("week_hours", "total_hours", operations)
+
+    restored = HarmonizationRule.from_serialization(original.serialize())
+    assert restored.transform("[8, 8, 8, 8, 6]") == 38