Skip to content

Commit 2892ef3

Browse files
Added example
1 parent 48fce1b commit 2892ef3

5 files changed

Lines changed: 204 additions & 0 deletions

File tree

demo/harmonize_example/README.md

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
# Harmonization Example
2+
3+
This folder contains a **minimal, self-contained** example of how to use the harmonization framework.
4+
It includes a small input CSV, a rules JSON file, and a Python script that performs the harmonization.
5+
6+
## Files
7+
- `input.csv` — The raw input data.
8+
- `rules.json` — Harmonization rules saved in the framework’s JSON format.
9+
- `run_example.py` — Loads the rules, applies them, and writes `output.csv`.
10+
- `output.csv` — Generated when you run the script.
11+
12+
## What the example does
13+
- Renames `age` to `age_years` (pass-through).
14+
- Converts `weight_lbs` to `weight_kg` (multiply by 0.453592, round to 2 decimal places, then cast to text).
15+
- Splits `name` (stored as `"Last, First"`) into two new columns:
16+
  - `given_name`
17+
  - `family_name`
18+
- Maps `visit_type_code` (numeric codes) to `visit_type_label` using an enum-to-enum rule.
19+
20+
## Run it
21+
From the repository root:
22+
23+
```bash
24+
python demo/harmonize_example/run_example.py
25+
```
26+
27+
This writes `demo/harmonize_example/output.csv`.
28+
29+
## Expected output columns
30+
The output CSV includes:
31+
- `age_years`
32+
- `weight_kg`
33+
- `given_name`
34+
- `family_name`
35+
- `visit_type_label`
36+
- `source dataset` (set to `"demo"` for each row)
37+
- `original_id` (the original row index)
38+
39+
## Notes
40+
- The rules in `rules.json` are loaded into a `RuleRegistry` via `rules.load(...)`.
41+
- The `(source, target)` pairs in `run_example.py` must match the keys in `rules.json`.

demo/harmonize_example/input.csv

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
age,weight_lbs,name,visit_type_code
2+
10,78.26,"Smith, Alice",1
3+
5,44.5,"Jones, Bob",2
4+
8,92.2,"Nguyen, Carol",3

demo/harmonize_example/output.csv

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
family_name,given_name,age_years,weight_kg,visit_type_label,source dataset,original_id
2+
Smith,Alice,10,35.50,baseline,demo,0
3+
Jones,Bob,5,20.18,follow_up,demo,1
4+
Nguyen,Carol,8,41.82,screening,demo,2

demo/harmonize_example/rules.json

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
{
2+
"age": {
3+
"age_years": {
4+
"source": "age",
5+
"target": "age_years",
6+
"operations": []
7+
}
8+
},
9+
"weight_lbs": {
10+
"weight_kg": {
11+
"source": "weight_lbs",
12+
"target": "weight_kg",
13+
"operations": [
14+
{
15+
"operation": "scale",
16+
"scaling_factor": 0.453592
17+
},
18+
{
19+
"operation": "round",
20+
"precision": 2
21+
},
22+
{
23+
"operation": "cast",
24+
"source": "float",
25+
"target": "text"
26+
}
27+
]
28+
}
29+
},
30+
"name": {
31+
"given_name": {
32+
"source": "name",
33+
"target": "given_name",
34+
"operations": [
35+
{
36+
"operation": "substitute",
37+
"expression": "^\\s*([^,]+),\\s*(.+)$",
38+
"substitution": "\\2"
39+
},
40+
{
41+
"operation": "normalize_text",
42+
"normalization": "strip"
43+
}
44+
]
45+
},
46+
"family_name": {
47+
"source": "name",
48+
"target": "family_name",
49+
"operations": [
50+
{
51+
"operation": "substitute",
52+
"expression": "^\\s*([^,]+),\\s*(.+)$",
53+
"substitution": "\\1"
54+
},
55+
{
56+
"operation": "normalize_text",
57+
"normalization": "strip"
58+
}
59+
]
60+
}
61+
},
62+
"visit_type_code": {
63+
"visit_type_label": {
64+
"source": "visit_type_code",
65+
"target": "visit_type_label",
66+
"operations": [
67+
{
68+
"operation": "enum_to_enum",
69+
"mapping": {
70+
"1": "baseline",
71+
"2": "follow_up",
72+
"3": "screening"
73+
},
74+
"default": "unknown",
75+
"strict": false
76+
}
77+
]
78+
}
79+
}
80+
}
demo/harmonize_example/run_example.py

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
from pathlib import Path
2+
3+
# Example script: load rules + data, run harmonization, and write output.
4+
#
5+
# This file is intentionally verbose and commented so a new user can
6+
# understand the minimal steps required to use the framework.
7+
8+
from harmonization_framework.harmonize import harmonize_file
9+
from harmonization_framework.rule_registry import RuleRegistry
10+
11+
12+
def main() -> None:
    """Run the demo: load the rules, harmonize ``input.csv``, write ``output.csv``.

    Every file is resolved relative to the directory containing this script,
    so the paths are absolute and do not depend on the working directory.
    """
    demo_dir = Path(__file__).resolve().parent
    input_path = demo_dir / "input.csv"
    rules_path = demo_dir / "rules.json"
    output_path = demo_dir / "output.csv"

    # Show the resolved locations so a new user can see which files are used.
    print("input_path: ", input_path)
    print("rules_path: ", rules_path)
    print("output_path: ", output_path)

    # RuleRegistry is the in-memory container for harmonization rules; loading
    # the JSON file populates it with the source/target rules used below.
    registry = RuleRegistry()
    registry.load(str(rules_path), clean=True)

    # (source_column, target_column) pairs. Each pair must have a matching
    # rule in rules.json; the framework renames the column to the target
    # name and applies that rule to every value.
    pairs = [
        ("age", "age_years"),
        ("weight_lbs", "weight_kg"),
        ("name", "given_name"),
        ("name", "family_name"),
        ("visit_type_code", "visit_type_label"),
    ]

    # Harmonize and save the output CSV. dataset_name becomes the value of
    # the "source dataset" column.
    frame = harmonize_file(
        input_path=str(input_path),
        output_path=str(output_path),
        harmonization_pairs=pairs,
        rules=registry,
        dataset_name="demo",
    )

    # Put the most readable columns first; any column not named here keeps
    # its existing relative order after them.
    leading = [
        "family_name",
        "given_name",
        "age_years",
        "weight_kg",
        "visit_type_label",
        "source dataset",
        "original_id",
    ]
    front = [name for name in leading if name in frame.columns]
    remainder = [name for name in frame.columns if name not in front]
    frame = frame[front + remainder]
    frame.to_csv(output_path, index=False)

    # Besides the transformed columns, the file contains:
    #   - "source dataset": the dataset_name value for each row
    #   - "original_id": the original row index from the input file
    print(f"Wrote harmonized CSV to: {output_path}")


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)