Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ All settings are provided in the operation dict for rule serialization.
| `normalize_boolean` | Normalize truthy/falsy values to booleans. | `truthy` (list, optional; defaults below)<br>`falsy` (list, optional; defaults below)<br>`strict` (bool, default `true`)<br>`default` (optional; used when `strict=false`) |
| `normalize_text` | Apply a single text normalization. | `normalization` (`strip`, `lower`, `upper`, `remove_accents`, `remove_punctuation`, `remove_special_characters`) |
| `offset` | Add an offset to numeric values. | `offset` (number) |
| `parse_array` | Parse array-like values into a list for downstream operations. | `format` (`json` default, `delimiter`)<br>`delimiter` (string; used for `delimiter` format, default `\|`, supports `\\n` for newline)<br>`item_type` (`auto`, `string`, `integer`, `float`, `boolean`)<br>`strict` (bool, default `true`)<br>`default` (optional; used when `strict=false`)<br>`allow_singleton` (bool, default `false`) |
| `reduce` | Reduce a list of values to one value. | `reduction` (`any`, `none`, `all`, `one-hot`, `sum`); expects a list/tuple input; one-hot returns index or None |
| `round` | Round numeric values to a given precision. | `precision` (int, >=0); uses Python `round` semantics |
| `scale` | Multiply numeric values by a factor. | `scaling_factor` (number) |
Expand Down Expand Up @@ -162,6 +163,7 @@ Each operation is represented by a JSON-friendly dict. Examples:
| `normalize_boolean` | `{"operation":"normalize_boolean","truthy":["yes","y","1"],"falsy":["no","n","0"],"strict":true}` |
| `normalize_text` | `{"operation":"normalize_text","normalization":"lower"}` |
| `offset` | `{"operation":"offset","offset":2.5}` |
| `parse_array` | `{"operation":"parse_array","format":"json","item_type":"integer","strict":true}` |
| `reduce` | `{"operation":"reduce","reduction":"one-hot"}` |
| `round` | `{"operation":"round","precision":2}` |
| `scale` | `{"operation":"scale","scaling_factor":0.453592}` |
Expand All @@ -176,3 +178,31 @@ If you use the `normalize_boolean` primitive without specifying `truthy` or

- truthy: `["true", "t", "yes", "y", "1", 1, true, "on"]`
- falsy: `["false", "f", "no", "n", "0", 0, false, "off", ""]`

### ParseArray + Reduce for CSV data

When arrays are serialized as text in CSV (for example `"[8,8,8,8,6]"` or
`"8|8|8|8|6"`), chain `parse_array` before `reduce`:

```json
{
"source": "week_hours",
"target": "total_hours",
"operations": [
{"operation": "parse_array", "format": "json", "item_type": "integer", "strict": true},
{"operation": "reduce", "reduction": "sum"}
]
}
```

For delimiter-separated input (for example `"8|8|8|8|6"`), use:

```json
{"operation": "parse_array", "format": "delimiter", "delimiter": "|", "item_type": "integer"}
```

For newline-separated input, use:

```json
{"operation": "parse_array", "format": "delimiter", "delimiter": "\\n", "item_type": "integer"}
```
4 changes: 4 additions & 0 deletions demo/primitives_ui/input.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
age_years,zip_code_text,visit_date_iso,weight_kg,record_id,bmi,smoker_response,city_raw,thermometer_c,week_hours,medication_dose_mg,price_usd,name_last_first,pulse_rate,username
34,02139,2026-02-17,70.5,REC-0001,27.345,Yes," New York ",36.6,"[8,8,8,8,6]",2.675,19.99,"DOE, Jane",220,alexandria
12,60614,2025-12-31,82.0,REC-0002,18.0,No," San Francisco ",37.1,"[10,10,10,10,5]",1.005,3.5,"SMITH, John",35,bo
70,98101,2024-07-04,95.2,REC-0003,31.889,unknown," Austin ",36.2,"[0,0,0,0,0]",0.999,120.0,"LEE, Ada",88,charlie
219 changes: 219 additions & 0 deletions demo/primitives_ui/rules.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,219 @@
{
"age_years": {
"age_group": {
"source": "age_years",
"target": "age_group",
"operations": [
{
"operation": "bin",
"bins": [
{"label": 1, "start": 0, "end": 12},
{"label": 2, "start": 13, "end": 17},
{"label": 3, "start": 18, "end": 64},
{"label": 4, "start": 65, "end": 120}
]
}
]
}
},
"age_group": {
"age_group_label": {
"source": "age_group",
"target": "age_group_label",
"operations": [
{
"operation": "enum_to_enum",
"mapping": {
"1": "child",
"2": "teen",
"3": "adult",
"4": "senior"
},
"default": "unknown",
"strict": false
}
]
}
},
"zip_code_text": {
"zip_code": {
"source": "zip_code_text",
"target": "zip_code",
"operations": [
{
"operation": "cast",
"source": "text",
"target": "integer"
}
]
}
},
"visit_date_iso": {
"visit_date_us": {
"source": "visit_date_iso",
"target": "visit_date_us",
"operations": [
{
"operation": "convert_date",
"source_format": "%Y-%m-%d",
"target_format": "%m/%d/%Y"
}
]
}
},
"weight_kg": {
"weight_lb": {
"source": "weight_kg",
"target": "weight_lb",
"operations": [
{
"operation": "convert_units",
"source_unit": "kg",
"target_unit": "lb"
}
]
}
},
"record_id": {
"record_id_copy": {
"source": "record_id",
"target": "record_id_copy",
"operations": [
{
"operation": "do_nothing"
}
]
}
},
"bmi": {
"bmi_formatted": {
"source": "bmi",
"target": "bmi_formatted",
"operations": [
{
"operation": "format_number",
"precision": 1
}
]
}
},
"smoker_response": {
"is_smoker": {
"source": "smoker_response",
"target": "is_smoker",
"operations": [
{
"operation": "normalize_boolean",
"truthy": ["true", "t", "yes", "y", "1", 1, true, "on"],
"falsy": ["false", "f", "no", "n", "0", 0, false, "off", ""],
"strict": false,
"default": null
}
]
}
},
"city_raw": {
"city_normalized": {
"source": "city_raw",
"target": "city_normalized",
"operations": [
{
"operation": "normalize_text",
"normalization": "lower"
}
]
}
},
"thermometer_c": {
"calibrated_c": {
"source": "thermometer_c",
"target": "calibrated_c",
"operations": [
{
"operation": "offset",
"offset": 0.5
}
]
}
},
"week_hours": {
"total_hours": {
"source": "week_hours",
"target": "total_hours",
"operations": [
{
"operation": "parse_array",
"format": "json",
"item_type": "integer",
"strict": true
},
{
"operation": "reduce",
"reduction": "sum"
}
]
}
},
"medication_dose_mg": {
"medication_dose_mg_rounded": {
"source": "medication_dose_mg",
"target": "medication_dose_mg_rounded",
"operations": [
{
"operation": "round",
"precision": 2
}
]
}
},
"price_usd": {
"price_cents": {
"source": "price_usd",
"target": "price_cents",
"operations": [
{
"operation": "scale",
"scaling_factor": 100
}
]
}
},
"name_last_first": {
"name_first_last": {
"source": "name_last_first",
"target": "name_first_last",
"operations": [
{
"operation": "substitute",
"expression": "^\\s*([^,]+),\\s*(.+)$",
"substitution": "\\2 \\1"
}
]
}
},
"pulse_rate": {
"pulse_rate_clamped": {
"source": "pulse_rate",
"target": "pulse_rate_clamped",
"operations": [
{
"operation": "threshold",
"lower": 40,
"upper": 200
}
]
}
},
"username": {
"username_short": {
"source": "username",
"target": "username_short",
"operations": [
{
"operation": "truncate",
"length": 8
}
]
}
}
}
4 changes: 3 additions & 1 deletion src/harmonization_framework/harmonization_rule.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from typing import Any, List
from .element import DataElement
from .primitives.base import PrimitiveOperation
from .primitives import PrimitiveVocabulary, Bin, Cast, ConvertDate, ConvertUnits, DoNothing, EnumToEnum, FormatNumber, NormalizeBoolean, NormalizeText, Offset, Reduce, Round, Scale, Substitute, Threshold, Truncate
from .primitives import PrimitiveVocabulary, Bin, Cast, ConvertDate, ConvertUnits, DoNothing, EnumToEnum, FormatNumber, NormalizeBoolean, NormalizeText, Offset, ParseArray, Reduce, Round, Scale, Substitute, Threshold, Truncate

import json

Expand Down Expand Up @@ -69,6 +69,8 @@ def from_serialization(cls, serialization):
primitive = NormalizeText.from_serialization(operation)
case PrimitiveVocabulary.OFFSET.value:
primitive = Offset.from_serialization(operation)
case PrimitiveVocabulary.PARSE_ARRAY.value:
primitive = ParseArray.from_serialization(operation)
case PrimitiveVocabulary.REDUCE.value:
primitive = Reduce.from_serialization(operation)
case PrimitiveVocabulary.ROUND.value:
Expand Down
1 change: 1 addition & 0 deletions src/harmonization_framework/primitives/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from .normalize_boolean import NormalizeBoolean
from .normalize import NormalizeText
from .offset import Offset
from .parse_array import ParseArray
from .reduce import Reduce
from .round_decimal import Round
from .scale import Scale
Expand Down
Loading