Added example

matthewhorridge · matthewhorridge · commit 2892ef349aee · 2026-01-26T18:03:26.000-08:00
diff --git a/demo/harmonize_example/README.md b/demo/harmonize_example/README.md
@@ -0,0 +1,41 @@
+# Harmonization Example
+
+This folder contains a **minimal, self‑contained** example of how to use the harmonization framework.
+It includes a small input CSV, a rules JSON file, and a Python script that performs the harmonization.
+
+## Files
+- `input.csv` — The raw input data.
+- `rules.json` — Harmonization rules saved in the framework’s JSON format.
+- `run_example.py` — Loads the rules, applies them, and writes `output.csv`.
+- `output.csv` — Generated when you run the script.
+
+## What the example does
+- Renames `age` to `age_years` (pass-through).
+- Converts `weight_lbs` to `weight_kg` (multiply by 0.453592, then round to 2 decimal places).
+- Splits `name` (stored as `"Last, First"`) into two new columns:
+  - `given_name`
+  - `family_name`
+- Maps `visit_type_code` (numeric codes) to `visit_type_label` using an enum-to-enum rule.
+
+## Run it
+From the repository root:
+
+```bash
+python demo/harmonize_example/run_example.py
+```
+
+This writes `demo/harmonize_example/output.csv`.
+
+## Expected output columns
+The output CSV includes:
+- `age_years`
+- `weight_kg`
+- `given_name`
+- `family_name`
+- `visit_type_label`
+- `source dataset` (set to `"demo"` for each row)
+- `original_id` (the zero-based index of the row in the input file)
+
+## Notes
+- The rules in `rules.json` are loaded into a `RuleRegistry` via `rules.load(...)`.
+- Each `(source, target)` pair in `run_example.py` must match a source key and a nested target key in `rules.json`.
diff --git a/demo/harmonize_example/input.csv b/demo/harmonize_example/input.csv
@@ -0,0 +1,4 @@
+age,weight_lbs,name,visit_type_code
+10,78.26,"Smith, Alice",1
+5,44.5,"Jones, Bob",2
+8,92.2,"Nguyen, Carol",3
diff --git a/demo/harmonize_example/output.csv b/demo/harmonize_example/output.csv
@@ -0,0 +1,4 @@
+family_name,given_name,age_years,weight_kg,visit_type_label,source dataset,original_id
+Smith,Alice,10,35.50,baseline,demo,0
+Jones,Bob,5,20.18,follow_up,demo,1
+Nguyen,Carol,8,41.82,screening,demo,2
diff --git a/demo/harmonize_example/rules.json b/demo/harmonize_example/rules.json
@@ -0,0 +1,80 @@
+{
+  "age": {
+    "age_years": {
+      "source": "age",
+      "target": "age_years",
+      "operations": []
+    }
+  },
+  "weight_lbs": {
+    "weight_kg": {
+      "source": "weight_lbs",
+      "target": "weight_kg",
+      "operations": [
+        {
+          "operation": "scale",
+          "scaling_factor": 0.453592
+        },
+        {
+          "operation": "round",
+          "precision": 2
+        },
+        {
+          "operation": "cast",
+          "source": "float",
+          "target": "text"
+        }
+      ]
+    }
+  },
+  "name": {
+    "given_name": {
+      "source": "name",
+      "target": "given_name",
+      "operations": [
+        {
+          "operation": "substitute",
+          "expression": "^\\s*([^,]+),\\s*(.+)$",
+          "substitution": "\\2"
+        },
+        {
+          "operation": "normalize_text",
+          "normalization": "strip"
+        }
+      ]
+    },
+    "family_name": {
+      "source": "name",
+      "target": "family_name",
+      "operations": [
+        {
+          "operation": "substitute",
+          "expression": "^\\s*([^,]+),\\s*(.+)$",
+          "substitution": "\\1"
+        },
+        {
+          "operation": "normalize_text",
+          "normalization": "strip"
+        }
+      ]
+    }
+  },
+  "visit_type_code": {
+    "visit_type_label": {
+      "source": "visit_type_code",
+      "target": "visit_type_label",
+      "operations": [
+        {
+          "operation": "enum_to_enum",
+          "mapping": {
+            "1": "baseline",
+            "2": "follow_up",
+            "3": "screening"
+          },
+          "default": "unknown",
+          "strict": false
+        }
+      ]
+    }
+  }
+}
diff --git a/demo/harmonize_example/run_example.py b/demo/harmonize_example/run_example.py
@@ -0,0 +1,75 @@
+from pathlib import Path
+
+# Example script: load rules + data, run harmonization, and write output.
+#
+# This file is intentionally verbose and commented so a new user can
+# understand the minimal steps required to use the framework.
+
+from harmonization_framework.harmonize import harmonize_file
+from harmonization_framework.rule_registry import RuleRegistry
+
+
+def main() -> None:
+    # Find this demo directory so all paths are absolute and stable.
+    base_dir = Path(__file__).resolve().parent
+
+    input_path = base_dir / "input.csv"
+    print("input_path: ", input_path)
+
+    rules_path = base_dir / "rules.json"
+    print("rules_path: ", rules_path)
+
+    output_path = base_dir / "output.csv"
+    print("output_path: ", output_path)
+
+    # The RuleRegistry is the in-memory container for all harmonization rules.
+    # Loading the JSON file builds a registry of source/target rules that
+    # harmonize values from the source column into the target column.
+    rules = RuleRegistry()
+    rules.load(str(rules_path), clean=True)
+
+    # Each pair is (source_column, target_column). These names must match
+    # the rules in rules.json. The framework will:
+    # 1) rename the column to the target name
+    # 2) apply the rule for that source/target to each value
+    harmonization_pairs = [
+        ("age", "age_years"),
+        ("weight_lbs", "weight_kg"),
+        ("name", "given_name"),
+        ("name", "family_name"),
+        ("visit_type_code", "visit_type_label"),
+    ]
+
+    # Run the harmonization and save the output CSV.
+    # dataset_name becomes the value of the "source dataset" column.
+    harmonized = harmonize_file(
+        input_path=str(input_path),
+        output_path=str(output_path),
+        harmonization_pairs=harmonization_pairs,
+        rules=rules,
+        dataset_name="demo",
+    )
+
+    # Reorder columns for readability in the demo output.
+    preferred_order = [
+        "family_name",
+        "given_name",
+        "age_years",
+        "weight_kg",
+        "visit_type_label",
+        "source dataset",
+        "original_id",
+    ]
+    ordered = [col for col in preferred_order if col in harmonized.columns]
+    ordered += [col for col in harmonized.columns if col not in ordered]
+    harmonized = harmonized[ordered]
+    harmonized.to_csv(output_path, index=False)
+
+    # The output file contains the transformed columns plus:
+    # - "source dataset": the dataset_name value for each row
+    # - "original_id": the original row index from the input file
+    print(f"Wrote harmonized CSV to: {output_path}")
+
+
+if __name__ == "__main__":
+    main()