diff --git a/.gitignore b/.gitignore index a1c9df5..0d8ca03 100644 --- a/.gitignore +++ b/.gitignore @@ -15,7 +15,10 @@ coverage.xml coverage.json !input/rodent_dataset.xlsx input/* +PHIX Reference*.xlsx +PHIX Reference*.xls +*PHIX*Reference*List*.xlsx phu_templates/* !phu_templates/README.md !phu_templates/.gitkeep -.gitmodules \ No newline at end of file +.gitmodules diff --git a/config/parameters.yaml b/config/parameters.yaml index 5a76b33..5296686 100644 --- a/config/parameters.yaml +++ b/config/parameters.yaml @@ -1,6 +1,27 @@ bundling: bundle_size: 100 group_by: null + +# Step 2: Preprocessing - PHIX Reference Validation +# Validates school/daycare names against the official PHIX reference list +phix_validation: + enabled: true + # Path to PHIX reference Excel file (relative to project root) + # IMPORTANT: We cannot redistribute the PHIX workbook. Provide your own copy + # and update this path (absolute path or relative to repo root). + reference_file: BYO_PHIX_REFERENCE.xlsx + # Optional mapping file that converts PHIX PHU column names to canonical + # template codes. Required when target_phu_code or --template is provided. + phu_mapping_file: config/phu_aliases.yaml + # Optional default PHU scope when running without --template. + # Accepts a single code or list of codes. Leave null to accept all PHUs. + target_phu_code: null + # How to handle unmatched facilities: 'warn', 'error', or 'skip' + # - warn: Log warning, continue processing all records + # - error: Fail pipeline if any facilities don't match + # - skip: Filter out records with unmatched facilities + unmatched_behavior: warn + chart_diseases_header: - Diphtheria - Tetanus diff --git a/config/phu_aliases.yaml b/config/phu_aliases.yaml new file mode 100644 index 0000000..bbf1c4c --- /dev/null +++ b/config/phu_aliases.yaml @@ -0,0 +1,30 @@ +# Maps canonical PHU acronyms (matching template folder names) to every +# alias that may appear in the PHIX reference workbook. 
Customize this file +# so each Public Health Unit is represented with all historic spellings, +# ensuring PHIX validation can restrict facilities to the correct PHU. +# +# Example usage: +# phu_aliases: +# wdgph: +# display_name: Wellington-Dufferin-Guelph Public Health +# aliases: +# - Wellington Dufferin Guelph Public Health +# - WDGPH + +phu_aliases: + wdgph: + display_name: Wellington-Dufferin-Guelph Public Health + aliases: + - Wellington Dufferin Guelph Public Health + - Wellington-Dufferin-Guelph Health Unit + - WDGPH + peel: + display_name: Peel Public Health + aliases: + - Region of Peel Public Health + - PEEL HEALTH UNIT + ottawa: + display_name: Ottawa Public Health + aliases: + - City of Ottawa Health Unit + - OTTAWA PHU diff --git a/docs/PDF_VALIDATION.md b/docs/PDF_VALIDATION.md index ec1b8db..5b72ea5 100644 --- a/docs/PDF_VALIDATION.md +++ b/docs/PDF_VALIDATION.md @@ -151,7 +151,8 @@ This markerless approach is also suitable for checks like: In step 6 (validation), the orchestrator: 1. Loads `preprocessed_clients_{run_id}.json` from `output/artifacts/`. 2. Builds a mapping: `filename -> expected_value` (e.g., client ID, sequence number). -3. Passes this mapping to `validate_pdfs.main(..., client_id_map=client_id_map)`. +3. Passes this mapping to `validate_pdfs.main(..., client_id_map=client_id_map, client_metadata_map=client_metadata_map)`. +4. `client_metadata_map` currently carries PHIX validation context (target PHU, matched PHU, school name) so every per-PDF log records which facility/PHU was validated upstream. Rules then validate against the mapping using artifact data as the source of truth. @@ -165,6 +166,22 @@ Current rule: Searches for any 10-digit number in the PDF text and compares to t This ensures every generated PDF contains the correct client ID, catching generation errors or data drift early. 
+### Example: PHIX facility scope tracking + +- Preprocessing stores PHIX validation metadata (`phix_validation`) in each client's artifact entry. +- The orchestrator passes this data via `client_metadata_map`. +- `validate_pdf_layout` records `phix_target_phu_code`, `phix_matched_phu_code`, and `phix_match_confidence` in each PDF's `measurements`. +- If a PDF's matched PHU does not align with the template's target PHU, the validator emits a `phix_target_phu` warning per file. + +This gives auditors a traceable link from every generated PDF back to the PHIX reference data used during preprocessing. + +### PHIX reference workbook is BYO + +- The official PHIX reference workbook is licensed and cannot be redistributed in this repository. +- `.gitignore` explicitly ignores `PHIX Reference*.xls*` so accidental copies never enter history. +- `config/parameters.yaml` ships with the placeholder `BYO_PHIX_REFERENCE.xlsx`; operators must point it at their local copy before running Step 2. +- Document the location internally (outside git) and ensure CI/CD environments mount the workbook securely (e.g., secrets storage or mounted volume). + ## Why we prefer template‑emitted measurements over PDF distance math We strongly prefer emitting precise measurements from the Typst template (via `measure()` and `MEASURE_...` markers) instead of inferring sizes by computing distances between two markers in extracted PDF text. Reasons: diff --git a/pipeline/orchestrator.py b/pipeline/orchestrator.py index 9517c6f..a04df34 100755 --- a/pipeline/orchestrator.py +++ b/pipeline/orchestrator.py @@ -39,9 +39,10 @@ import traceback from datetime import datetime, timezone from pathlib import Path +from typing import Optional # Import pipeline steps -from . import bundle_pdfs, cleanup, compile_notices, validate_pdfs +from . import bundle_pdfs, cleanup, compile_notices, validate_pdfs, validate_phix from . 
import ( encrypt_notice, generate_notices, @@ -50,6 +51,7 @@ preprocess, ) from .config_loader import load_config +from .data_models import PreprocessResult from .enums import Language SCRIPT_DIR = Path(__file__).resolve().parent @@ -123,6 +125,7 @@ def validate_args(args: argparse.Namespace) -> None: f"Input file not found: {args.input_dir / args.input_file}" ) + args.template_key = None # Resolve template directory if args.template_dir is None: # No custom template specified; use default @@ -136,6 +139,7 @@ def validate_args(args: argparse.Namespace) -> None: f"Expected a simple name like 'wdgph' or 'my_phu', not a path." ) + args.template_key = args.template_dir.strip() phu_template_path = DEFAULT_PHU_TEMPLATES_DIR / args.template_dir if not phu_template_path.exists(): raise FileNotFoundError( @@ -209,6 +213,8 @@ def run_step_2_preprocess( output_dir: Path, language: str, run_id: str, + config_dir: Path, + template_code: Optional[str] = None, ) -> int: """Step 2: Preprocessing. @@ -220,6 +226,10 @@ def run_step_2_preprocess( # Configure logging log_path = preprocess.configure_logging(output_dir, run_id) + # Load configuration for PHIX validation + config = load_config(config_dir / "parameters.yaml") + project_root = config_dir.parent + # Load and process input data input_path = input_dir / input_file df_raw = preprocess.read_input(input_path) @@ -230,6 +240,51 @@ def run_step_2_preprocess( # Check that addresses are complete, return only complete rows df = preprocess.check_addresses_complete(df) + # Validate schools/daycares against PHIX reference list + phix_config = config.get("phix_validation", {}) + phix_warnings: list[str] = [] + if phix_config.get("enabled", False): + reference_file = phix_config.get("reference_file", "") + mapping_file = phix_config.get("phu_mapping_file") + mapping_path: Optional[Path] = None + if mapping_file: + mapping_path = Path(mapping_file) + if not mapping_path.is_absolute(): + mapping_path = (project_root / 
mapping_file).resolve() + if not mapping_path.exists(): + raise FileNotFoundError(f"PHU alias mapping file not found: {mapping_path}") + + target_phu_codes: set[str] = set() + configured_target = phix_config.get("target_phu_code") + if isinstance(configured_target, str): + configured_target = [configured_target] + if isinstance(configured_target, (list, tuple, set)): + for code in configured_target: + if code and str(code).strip(): + target_phu_codes.add(str(code)) + if template_code: + target_phu_codes.add(template_code) + + if reference_file: + reference_path = Path(reference_file) + # If relative path, resolve from project root + if not reference_path.is_absolute(): + reference_path = (project_root / reference_file).resolve() + if reference_path.exists(): + df, phix_warnings = validate_phix.validate_facilities( + df=df, + reference_path=reference_path, + output_dir=output_dir, + unmatched_behavior=phix_config.get("unmatched_behavior", "warn"), + target_phu_codes=target_phu_codes or None, + phu_mapping_path=mapping_path, + ) + print(f"🏫 PHIX validation complete: {len(df)} records validated") + else: + print(f"⚠️ PHIX reference file not found: {reference_path}") + else: + print("⚠️ PHIX validation enabled but no reference_file configured") + # Load configuration vaccine_reference_path = preprocess.VACCINE_REFERENCE_PATH vaccine_reference = json.loads(vaccine_reference_path.read_text(encoding="utf-8")) @@ -239,6 +294,14 @@ def run_step_2_preprocess( df, language, vaccine_reference, preprocess.REPLACE_UNSPECIFIED ) + # Merge PHIX validation warnings into result + if phix_warnings: + combined_warnings = list(result.warnings) + phix_warnings + result = PreprocessResult( + clients=result.clients, + warnings=combined_warnings, + ) + # Write artifact artifact_path = preprocess.write_artifact( output_dir / "artifacts", language, run_id, result @@ -397,6 +460,7 @@ def run_step_6_validate_pdfs( # Load preprocessed clients to build client ID mapping client_id_map = {} + 
client_metadata_map = {} import json with open(preprocessed_json, "r", encoding="utf-8") as f: @@ -406,11 +470,20 @@ def run_step_6_validate_pdfs( # Filename format: {language}_notice_{sequence:05d}_{client_id}.pdf for idx, client in enumerate(clients, start=1): client_id = str(client.get("client_id", "")) + metadata = client.get("metadata", {}) or {} + phix_meta = metadata.get("phix_validation", {}) or {} + school = client.get("school", {}) or {} # Try to match any expected filename format for ext in [".pdf"]: for lang_prefix in ["en", "fr"]: filename = f"{lang_prefix}_notice_{idx:05d}_{client_id}{ext}" client_id_map[filename] = client_id + client_metadata_map[filename] = { + "phix_validation": phix_meta, + "phix_target_phu_code": metadata.get("phix_target_phu_code"), + "phix_target_phu_label": metadata.get("phix_target_phu_label"), + "school_name": school.get("name"), + } # Validate PDFs (module loads validation rules from config_dir) validate_pdfs.main( @@ -419,6 +492,7 @@ def run_step_6_validate_pdfs( json_output=validation_json, client_id_map=client_id_map, config_dir=config_dir, + client_metadata_map=client_metadata_map, ) @@ -574,6 +648,8 @@ def main() -> int: output_dir, args.language, run_id, + config_dir, + args.template_key, ) step_duration = time.time() - step_start step_times.append(("Preprocessing", step_duration)) diff --git a/pipeline/preprocess.py b/pipeline/preprocess.py index 6209d54..c0033b2 100644 --- a/pipeline/preprocess.py +++ b/pipeline/preprocess.py @@ -730,6 +730,16 @@ def build_preprocess_result( warnings: set[str] = set() working = normalize_dataframe(df) + def clean_optional(value: Any) -> Any: + """Convert pandas NA or empty values to None.""" + if value is None: + return None + if isinstance(value, (float, int)) and pd.isna(value): + return None + if pd.isna(value): + return None + return value + # Load parameters for date_notice_delivery and chart_diseases_header params = {} if PARAMETERS_PATH.exists(): @@ -832,6 +842,34 @@ def 
build_preprocess_result( "postal_code": postal_code, } + phix_id = clean_optional(getattr(row, "PHIX_ID", None)) + phix_match_type = getattr(row, "PHIX_MATCH_TYPE", "none") + phix_match_conf = getattr(row, "PHIX_MATCH_CONFIDENCE", 0) + if pd.isna(phix_match_conf): + phix_match_conf = 0 + phix_match_conf = int(phix_match_conf) + phix_phu_name = clean_optional(getattr(row, "PHIX_MATCHED_PHU", None)) + phix_phu_code = clean_optional(getattr(row, "PHIX_MATCHED_PHU_CODE", None)) + phix_target_code = clean_optional(getattr(row, "PHIX_TARGET_PHU_CODE", None)) + phix_target_label = clean_optional(getattr(row, "PHIX_TARGET_PHU_LABEL", None)) + + metadata: Dict[str, Any] = { + "unique_id": row.UNIQUE_ID or None, # type: ignore[attr-defined] + } + metadata["phix_validation"] = { + "id": phix_id, + "match_type": phix_match_type or "none", + "confidence": phix_match_conf, + "phu_name": phix_phu_name, + "phu_code": phix_phu_code, + "target_phu_code": phix_target_code, + "target_phu_label": phix_target_label, + } + if phix_target_code: + metadata["phix_target_phu_code"] = phix_target_code + if phix_target_label: + metadata["phix_target_phu_label"] = phix_target_label + client = ClientRecord( sequence=sequence, client_id=client_id, @@ -843,9 +881,7 @@ def build_preprocess_result( vaccines_due=vaccines_due if vaccines_due else None, vaccines_due_list=vaccines_due_list if vaccines_due_list else None, received=received if received else None, - metadata={ - "unique_id": row.UNIQUE_ID or None, # type: ignore[attr-defined] - }, + metadata=metadata, ) clients.append(client) diff --git a/pipeline/validate_pdfs.py b/pipeline/validate_pdfs.py index 8e15d67..527a26b 100644 --- a/pipeline/validate_pdfs.py +++ b/pipeline/validate_pdfs.py @@ -242,6 +242,7 @@ def validate_pdf_layout( reader: PdfReader, enabled_rules: dict[str, str], client_id_map: dict[str, str] | None = None, + client_metadata_map: dict[str, dict[str, object]] | None = None, ) -> tuple[List[str], dict[str, float]]: """Check PDF 
for layout issues using invisible markers and metadata. @@ -256,6 +257,9 @@ def validate_pdf_layout( client_id_map : dict[str, str], optional Mapping of PDF filename (without path) to expected client ID. If provided, client_id_presence validation uses this as source of truth. + client_metadata_map : dict[str, dict], optional + Additional per-client metadata (PHIX validation scope, school name, etc.) + injected into per-PDF measurements for audit trails. Returns ------- @@ -343,6 +347,39 @@ def validate_pdf_layout( # If client ID check fails, skip silently (parsing error) pass + if client_metadata_map: + meta = client_metadata_map.get(pdf_path.name, {}) + phix_meta = meta.get("phix_validation") or {} + target_phu_code = meta.get("phix_target_phu_code") or phix_meta.get("target_phu_code") + target_phu_label = meta.get("phix_target_phu_label") or phix_meta.get("target_phu_label") + matched_phu_code = phix_meta.get("phu_code") + matched_phu_name = phix_meta.get("phu_name") + match_type = phix_meta.get("match_type") + match_confidence = phix_meta.get("confidence", 0) + facility_name = meta.get("school_name") + + measurements["phix_target_phu_code"] = target_phu_code or "" + measurements["phix_target_phu_label"] = target_phu_label or "" + measurements["phix_matched_phu_code"] = matched_phu_code or "" + measurements["phix_matched_phu_name"] = matched_phu_name or "" + measurements["phix_match_type"] = match_type or "" + try: + measurements["phix_match_confidence"] = int(match_confidence or 0) + except (TypeError, ValueError): + measurements["phix_match_confidence"] = 0 + if facility_name: + measurements["phix_facility_school"] = facility_name + + if ( + target_phu_code + and matched_phu_code + and target_phu_code != matched_phu_code + ): + warnings.append( + "phix_target_phu: Facility PHU code " + f"{matched_phu_code} does not match template PHU {target_phu_code}" + ) + return warnings, measurements @@ -350,6 +387,7 @@ def validate_pdf_structure( pdf_path: Path, 
enabled_rules: dict[str, str] | None = None, client_id_map: dict[str, str] | None = None, + client_metadata_map: dict[str, dict[str, object]] | None = None, ) -> ValidationResult: """Validate a single PDF file for structure and layout. @@ -361,6 +399,9 @@ def validate_pdf_structure( Validation rules configuration (rule_name -> "disabled"/"warn"/"error"). client_id_map : dict[str, str], optional Mapping of PDF filename to expected client ID (from preprocessed_clients.json). + client_metadata_map : dict[str, dict], optional + Per-PDF metadata (PHIX validation scope, school name, etc.) to include in + measurements and warnings. Returns ------- @@ -390,7 +431,11 @@ def validate_pdf_structure( # Validate layout using markers layout_warnings, layout_measurements = validate_pdf_layout( - pdf_path, reader, enabled_rules, client_id_map=client_id_map + pdf_path, + reader, + enabled_rules, + client_id_map=client_id_map, + client_metadata_map=client_metadata_map, ) warnings.extend(layout_warnings) measurements.update(layout_measurements) @@ -449,6 +494,7 @@ def validate_pdfs( files: List[Path], enabled_rules: dict[str, str] | None = None, client_id_map: dict[str, str] | None = None, + client_metadata_map: dict[str, dict[str, object]] | None = None, ) -> ValidationSummary: """Validate all PDF files and generate summary. @@ -460,6 +506,8 @@ def validate_pdfs( Validation rules configuration (rule_name -> "disabled"/"warn"/"error"). client_id_map : dict[str, str], optional Mapping of PDF filename to expected client ID (from preprocessed_clients.json). + client_metadata_map : dict[str, dict], optional + Per-PDF metadata (PHIX validation scope, school name, etc.). 
Returns ------- @@ -477,7 +525,10 @@ def validate_pdfs( for pdf_path in files: result = validate_pdf_structure( - pdf_path, enabled_rules=enabled_rules, client_id_map=client_id_map + pdf_path, + enabled_rules=enabled_rules, + client_id_map=client_id_map, + client_metadata_map=client_metadata_map, ) results.append(result) page_count = int(result.measurements.get("page_count", 0)) @@ -592,6 +643,7 @@ def main( json_output: Path | None = None, client_id_map: dict[str, str] | None = None, config_dir: Path | None = None, + client_metadata_map: dict[str, dict[str, object]] | None = None, ) -> ValidationSummary: """Main entry point for PDF validation. @@ -612,6 +664,9 @@ def main( Path to config directory containing parameters.yaml. Used to load enabled_rules if not explicitly provided. If not provided, uses default location (config/parameters.yaml in project root). + client_metadata_map : dict[str, dict], optional + Additional metadata (PHIX validation scope, school name, etc.) to inject + into per-PDF measurements for downstream auditing. Returns ------- @@ -632,11 +687,16 @@ def main( if client_id_map is None: client_id_map = {} + if client_metadata_map is None: + client_metadata_map = {} files = discover_pdfs(target) filtered = filter_by_language(files, language) summary = validate_pdfs( - filtered, enabled_rules=enabled_rules, client_id_map=client_id_map + filtered, + enabled_rules=enabled_rules, + client_id_map=client_id_map, + client_metadata_map=client_metadata_map, ) summary.language = language diff --git a/pipeline/validate_phix.py b/pipeline/validate_phix.py new file mode 100644 index 0000000..5232dee --- /dev/null +++ b/pipeline/validate_phix.py @@ -0,0 +1,583 @@ +"""Validate schools and daycares against PHIX Reference List. + +This module provides validation of school/daycare names against a canonical +PHIX reference list. It supports strict exact matching plus PHU alias +scoping to prevent cross-jurisdiction matches. 
+ +**Input Contract:** +- PHIX reference Excel file must exist at configured path +- Reference file must contain 'Schools & Day Cares' sheet +- Each column is a PHU, values are "FACILITY NAME - ID" format + +**Output Contract:** +- Returns validation results with matched PHIX IDs and confidence scores +- Writes unmatched facilities to CSV for PHU review +- Raises or warns based on configured `unmatched_behavior` + +**Usage:** + Called from preprocess.py after address validation, before building results. +""" + +from __future__ import annotations + +import logging +import re +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Dict, Iterable, List, Optional, Set, Tuple + +import pandas as pd +import yaml + +LOG = logging.getLogger(__name__) + +# Cache for loaded PHIX reference data and PHU alias mappings +_PHIX_REFERENCE_CACHE: Optional[Dict[str, Any]] = None +_PHU_MAPPING_CACHE: Optional[Dict[str, Any]] = None + + +@dataclass +class PHIXFacility: + """A facility entry from the PHIX reference list.""" + + phix_id: str + name: str + phu: str # Public Health Unit that owns this facility + + def __hash__(self) -> int: + return hash((self.phix_id, self.name, self.phu)) + + +@dataclass +class PHIXMatchResult: + """Result of matching a facility against PHIX reference.""" + + input_name: str + matched: bool + phix_id: Optional[str] = None + phix_name: Optional[str] = None + phu_name: Optional[str] = None + phu_code: Optional[str] = None + confidence: int = 0 + match_type: str = "none" # exact or none + + +def parse_facility_entry(entry: str, phu: str) -> Optional[PHIXFacility]: + """Parse a PHIX facility entry in 'NAME - ID' format. 
+ + Parameters + ---------- + entry : str + Raw entry from Excel like "ANNA MCCREA PUBLIC SCHOOL - 019186" + phu : str + Name of the Public Health Unit column + + Returns + ------- + PHIXFacility or None + Parsed facility, or None if entry is empty/invalid + """ + if not entry or pd.isna(entry): + return None + + entry = str(entry).strip() + if not entry: + return None + + # Parse "NAME - ID" format, where ID is the last segment after " - " + # Some names contain " - " so we split from the right + parts = entry.rsplit(" - ", maxsplit=1) + if len(parts) == 2: + name = parts[0].strip() + phix_id = parts[1].strip() + else: + # No ID separator found, use entire string as name + name = entry + phix_id = "" + + return PHIXFacility(phix_id=phix_id, name=name, phu=phu) + + +def load_phix_reference( + reference_path: Path, + sheet_name: str = "Schools & Day Cares", +) -> Dict[str, Any]: + """Load and parse the PHIX reference Excel file. + + Caches the result for subsequent calls with the same path. 
+ + Parameters + ---------- + reference_path : Path + Path to the PHIX reference Excel file + sheet_name : str + Name of the sheet containing school/daycare data + + Returns + ------- + Dict with keys: + - facilities: List[PHIXFacility] - all parsed facilities + - by_name: Dict[str, PHIXFacility] - lookup by normalized name + - by_name_phu: Dict[str, Dict[str, PHIXFacility]] - lookup by PHU column + - phus: List[str] - list of PHU column names + """ + global _PHIX_REFERENCE_CACHE + + cache_key = str(reference_path.resolve()) + if _PHIX_REFERENCE_CACHE is not None: + cached_path = _PHIX_REFERENCE_CACHE.get("_cache_path") + if cached_path == cache_key: + return _PHIX_REFERENCE_CACHE + + if not reference_path.exists(): + raise FileNotFoundError(f"PHIX reference file not found: {reference_path}") + + LOG.info("Loading PHIX reference from %s", reference_path) + df = pd.read_excel(reference_path, sheet_name=sheet_name) + + facilities: List[PHIXFacility] = [] + by_name: Dict[str, PHIXFacility] = {} + by_name_phu: Dict[str, Dict[str, PHIXFacility]] = {} + seen_names: Set[str] = set() + + for phu_column in df.columns: + for entry in df[phu_column].dropna(): + facility = parse_facility_entry(entry, phu_column) + if facility: + facilities.append(facility) + # Use normalized name as key for exact matching + normalized = normalize_facility_name(facility.name) + by_name_phu.setdefault(normalized, {})[phu_column] = facility + if normalized not in seen_names: + by_name[normalized] = facility + seen_names.add(normalized) + + LOG.info( + "Loaded %d facilities from %d PHUs", len(facilities), len(df.columns) + ) + + _PHIX_REFERENCE_CACHE = { + "_cache_path": cache_key, + "facilities": facilities, + "by_name": by_name, + "by_name_phu": by_name_phu, + "phus": list(df.columns), + } + return _PHIX_REFERENCE_CACHE + + +def normalize_phu_code(code: str) -> str: + """Normalize a PHU acronym/template key for comparison.""" + if not code: + return "" + normalized = re.sub(r"[^a-z0-9]+", "_", 
code.strip().lower()) + return normalized.strip("_") + + +def normalize_phu_label(label: str) -> str: + """Normalize PHU names/aliases for lookup.""" + if not label: + return "" + return re.sub(r"\s+", " ", str(label).strip().upper()) + + +def load_phu_aliases(mapping_path: Path) -> Dict[str, Any]: + """Load canonical PHU aliases from YAML mapping file.""" + global _PHU_MAPPING_CACHE + + cache_key = str(mapping_path.resolve()) + if _PHU_MAPPING_CACHE is not None: + cached_path = _PHU_MAPPING_CACHE.get("_cache_path") + if cached_path == cache_key: + return _PHU_MAPPING_CACHE + + if not mapping_path.exists(): + raise FileNotFoundError(f"PHU alias mapping file not found: {mapping_path}") + + raw = yaml.safe_load(mapping_path.read_text(encoding="utf-8")) or {} + entries = raw.get("phu_aliases") + if not isinstance(entries, dict): + raise ValueError( + f"Invalid PHU alias mapping format in {mapping_path}. " + "Expected top-level 'phu_aliases' dictionary." + ) + + alias_to_code: Dict[str, str] = {} + code_to_display: Dict[str, str] = {} + code_to_aliases: Dict[str, Set[str]] = {} + + for code, meta in entries.items(): + normalized_code = normalize_phu_code(code) + if not normalized_code: + continue + + meta = meta or {} + display_name = str(meta.get("display_name") or code).strip() + code_to_display[normalized_code] = display_name + + aliases = meta.get("aliases", []) + if isinstance(aliases, str): + aliases = [aliases] + + alias_candidates = set( + filter( + None, + [ + *[str(alias) for alias in aliases], + code, + display_name, + ], + ) + ) + + for alias in alias_candidates: + normalized_alias = normalize_phu_label(alias) + if not normalized_alias: + continue + alias_to_code[normalized_alias] = normalized_code + code_to_aliases.setdefault(normalized_code, set()).add(str(alias).strip()) + + _PHU_MAPPING_CACHE = { + "_cache_path": cache_key, + "alias_to_code": alias_to_code, + "code_to_display": code_to_display, + "code_to_aliases": code_to_aliases, + } + return 
_PHU_MAPPING_CACHE + + +def normalize_facility_name(name: str) -> str: + """Normalize a facility name for comparison. + + Converts to uppercase, removes extra whitespace, and standardizes + common abbreviations. + + Parameters + ---------- + name : str + Raw facility name + + Returns + ------- + str + Normalized name for comparison + """ + if not name: + return "" + + # Uppercase and strip + normalized = name.upper().strip() + + # Collapse multiple spaces + normalized = re.sub(r"\s+", " ", normalized) + + return normalized + + +def match_facility( + input_name: str, + reference: Dict[str, Any], + facility_phu_codes: Optional[Dict[str, Optional[str]]] = None, +) -> PHIXMatchResult: + """Match a single facility name against PHIX reference using exact match.""" + if not input_name or not input_name.strip(): + return PHIXMatchResult( + input_name=input_name or "", + matched=False, + match_type="none", + ) + + normalized_input = normalize_facility_name(input_name) + by_name = reference["by_name"] + + # Try exact match first + if normalized_input in by_name: + facility = by_name[normalized_input] + phu_code = facility_phu_codes.get(normalized_input) if facility_phu_codes else None + return PHIXMatchResult( + input_name=input_name, + matched=True, + phix_id=facility.phix_id, + phix_name=facility.name, + phu_name=facility.phu, + phu_code=phu_code, + confidence=100, + match_type="exact", + ) + + return PHIXMatchResult( + input_name=input_name, + matched=False, + match_type="none", + ) + + +def validate_facilities( + df: pd.DataFrame, + reference_path: Path, + output_dir: Path, + unmatched_behavior: str = "warn", + school_column: str = "SCHOOL_NAME", + target_phu_codes: Optional[Iterable[str]] = None, + phu_mapping_path: Optional[Path] = None, +) -> Tuple[pd.DataFrame, List[str]]: + """Validate all facilities in DataFrame against PHIX reference. 
+ + Parameters + ---------- + df : pd.DataFrame + Input DataFrame with facility names + reference_path : Path + Path to PHIX reference Excel file + output_dir : Path + Directory to write unmatched facilities CSV + unmatched_behavior : str + How to handle unmatched facilities: + - 'warn': Log warning, continue processing all records + - 'error': Raise ValueError if any unmatched + - 'skip': Filter out unmatched records + school_column : str + Name of column containing facility names + target_phu_codes : Iterable[str], optional + Canonical PHU codes (matching template folders) to restrict validation. + When provided, only facilities that belong to these PHUs are considered. + phu_mapping_path : Path, optional + Path to YAML mapping file that links PHIX column names to canonical PHU + codes. Required when target_phu_codes are provided. + + Returns + ------- + Tuple[pd.DataFrame, List[str]] + - DataFrame with PHIX validation columns added + - List of warning messages + + Raises + ------ + ValueError + If unmatched_behavior is 'error' and unmatched facilities exist + """ + warnings: List[str] = [] + + if school_column not in df.columns: + LOG.warning("Column '%s' not found, skipping PHIX validation", school_column) + return df, warnings + + # Load reference + reference = load_phix_reference(reference_path) + + phu_mapping_data: Optional[Dict[str, Any]] = None + alias_lookup: Dict[str, str] = {} + if phu_mapping_path: + phu_mapping_data = load_phu_aliases(phu_mapping_path) + alias_lookup = phu_mapping_data.get("alias_to_code", {}) + + normalized_target_codes: Set[str] = set() + if target_phu_codes: + for code in target_phu_codes: + normalized = normalize_phu_code(code) + if normalized: + normalized_target_codes.add(normalized) + if normalized_target_codes and not phu_mapping_data: + raise ValueError( + "Target PHU codes provided but phu_mapping_file is not configured. " + "Update phix_validation.phu_mapping_file to scope validation." 
+ ) + + if normalized_target_codes and phu_mapping_data: + code_to_display = phu_mapping_data.get("code_to_display", {}) + missing_codes = sorted( + code for code in normalized_target_codes if code not in code_to_display + ) + if missing_codes: + raise ValueError( + f"Template PHU codes not defined in {phu_mapping_path}: " + f"{', '.join(missing_codes)}. Update config/phu_aliases.yaml." + ) + + phu_column_codes: Dict[str, str] = {} + if alias_lookup: + for phu_column in reference["phus"]: + canonical_code = alias_lookup.get(normalize_phu_label(phu_column)) + if canonical_code: + phu_column_codes[phu_column] = canonical_code + + reference_for_matching = reference + target_display: Optional[str] = None + target_codes_str: Optional[str] = None + + if normalized_target_codes: + allowed_phu_columns = [ + column + for column, code in phu_column_codes.items() + if code in normalized_target_codes + ] + if not allowed_phu_columns: + raise ValueError( + "No PHIX columns mapped to the requested PHU codes. " + f"Template codes: {', '.join(sorted(normalized_target_codes))}. " + "Confirm config/phu_aliases.yaml contains the PHIX column names." + ) + + by_name_phu: Dict[str, Dict[str, PHIXFacility]] = reference.get( + "by_name_phu", {} + ) + filtered_by_name: Dict[str, PHIXFacility] = {} + filtered_name_list: List[str] = [] + for normalized_name, facilities_by_phu in by_name_phu.items(): + for phu_label in allowed_phu_columns: + facility = facilities_by_phu.get(phu_label) + if facility: + filtered_by_name[normalized_name] = facility + filtered_name_list.append(normalized_name) + break + + if not filtered_by_name: + raise ValueError( + "No PHIX reference entries found for the requested PHU columns. " + f"Columns: {', '.join(sorted(allowed_phu_columns))}. " + "Verify the mapping file includes every PHIX alias." 
+ ) + + reference_for_matching = dict(reference) + reference_for_matching["by_name"] = filtered_by_name + reference_for_matching["name_list"] = filtered_name_list + + code_to_display = phu_mapping_data.get("code_to_display", {}) if phu_mapping_data else {} + target_display = ", ".join( + code_to_display.get(code, code).strip() for code in sorted(normalized_target_codes) + ) + target_codes_str = ",".join(sorted(normalized_target_codes)) + LOG.info( + "Restricting PHIX validation to PHU(s): %s", + target_display or target_codes_str, + ) + + facility_phu_codes_for_matching: Dict[str, Optional[str]] = {} + if alias_lookup: + for normalized_name, facility in reference_for_matching["by_name"].items(): + canonical_code = alias_lookup.get(normalize_phu_label(facility.phu)) + if canonical_code: + facility_phu_codes_for_matching[normalized_name] = canonical_code + + # Match each unique facility + unique_facilities = df[school_column].dropna().unique() + match_results: Dict[str, PHIXMatchResult] = {} + + for facility_name in unique_facilities: + result = match_facility( + str(facility_name), + reference_for_matching, + facility_phu_codes=( + facility_phu_codes_for_matching + if facility_phu_codes_for_matching + else None + ), + ) + match_results[str(facility_name)] = result + + # Add validation columns to DataFrame + df = df.copy() + df["PHIX_ID"] = df[school_column].apply( + lambda x: match_results.get(str(x), PHIXMatchResult(str(x), False)).phix_id + if pd.notna(x) + else None + ) + df["PHIX_MATCH_CONFIDENCE"] = df[school_column].apply( + lambda x: match_results.get(str(x), PHIXMatchResult(str(x), False)).confidence + if pd.notna(x) + else 0 + ) + df["PHIX_MATCH_TYPE"] = df[school_column].apply( + lambda x: match_results.get( + str(x), PHIXMatchResult(str(x), False) + ).match_type + if pd.notna(x) + else "none" + ) + df["PHIX_MATCHED_PHU"] = df[school_column].apply( + lambda x: match_results.get( + str(x), PHIXMatchResult(str(x), False) + ).phu_name + if pd.notna(x) + else 
None + ) + df["PHIX_MATCHED_PHU_CODE"] = df[school_column].apply( + lambda x: match_results.get( + str(x), PHIXMatchResult(str(x), False) + ).phu_code + if pd.notna(x) + else None + ) + df["PHIX_TARGET_PHU_CODE"] = target_codes_str + df["PHIX_TARGET_PHU_LABEL"] = target_display + + # Identify unmatched facilities + unmatched = [r for r in match_results.values() if not r.matched] + + if unmatched: + unmatched_names = sorted(set(r.input_name for r in unmatched)) + LOG.warning( + "%d facilities could not be matched to PHIX reference: %s", + len(unmatched_names), + unmatched_names[:5], # Log first 5 + ) + + # Write unmatched to CSV + output_dir.mkdir(parents=True, exist_ok=True) + unmatched_path = output_dir / "unmatched_facilities.csv" + unmatched_df = pd.DataFrame( + [ + { + "facility_name": r.input_name, + "match_type": r.match_type, + "confidence": r.confidence, + "target_phu_code": target_codes_str or "", + "target_phu_label": target_display or "", + } + for r in unmatched + ] + ) + unmatched_df.to_csv(unmatched_path, index=False) + LOG.info("Wrote %d unmatched facilities to %s", len(unmatched), unmatched_path) + warnings.append( + f"{len(unmatched_names)} facilities not found in PHIX reference. " + f"See {unmatched_path} for details." + ) + + if unmatched_behavior == "error": + raise ValueError( + f"{len(unmatched_names)} facilities not found in PHIX reference: " + f"{', '.join(unmatched_names[:10])}" + + (f" (and {len(unmatched_names) - 10} more)" if len(unmatched_names) > 10 else "") + ) + elif unmatched_behavior == "skip": + # Filter out rows with unmatched facilities + matched_names = {r.input_name for r in match_results.values() if r.matched} + original_count = len(df) + df = df[df[school_column].isin(matched_names)] + filtered_count = original_count - len(df) + LOG.info( + "Filtered %d records with unmatched facilities, %d remaining", + filtered_count, + len(df), + ) + warnings.append( + f"Filtered {filtered_count} records with unmatched facilities." 
+ ) + + # Log summary + matched_count = sum(1 for r in match_results.values() if r.matched) + LOG.info( + "PHIX validation complete: %d matched, %d unmatched", + matched_count, + len(unmatched), + ) + + return df, warnings + + +def clear_cache() -> None: + """Clear the PHIX reference cache. Useful for testing.""" + global _PHIX_REFERENCE_CACHE, _PHU_MAPPING_CACHE + _PHIX_REFERENCE_CACHE = None + _PHU_MAPPING_CACHE = None diff --git a/plans/001-validate-phix-reference.md b/plans/001-validate-phix-reference.md new file mode 100644 index 0000000..e6bcdd1 --- /dev/null +++ b/plans/001-validate-phix-reference.md @@ -0,0 +1,186 @@ +# Plan: Validate Schools and Daycares Against PHIX Reference List + +**Status:** βœ… Implemented +**Date:** 2026-01-14 + +--- + +## Problem Statement + +Input data contains school/daycare names that may not match the official PHIX (Public Health Information Exchange) reference list. This causes: +- Data quality issues in reports +- Difficulty linking records to official facility IDs +- No validation that facilities exist in the PHU's jurisdiction + +## Solution + +Add a validation step in preprocessing that matches input school/daycare names against the PHIX reference list using strict exact comparisons with configurable behavior for unmatched facilities. 
+ +--- + +## Implementation + +### Files Created + +| File | Purpose | +|------|---------| +| `pipeline/validate_phix.py` | Validation module with loading, matching, and batch validation | +| `tests/unit/test_validate_phix.py` | 26 unit tests covering all functionality | + +### Files Modified + +| File | Changes | +|------|---------| +| `config/parameters.yaml` | Added `phix_validation` configuration section | +| `pipeline/orchestrator.py` | Integrated validation into Step 2 (Preprocessing) | + +### Configuration Options + +```yaml +phix_validation: + enabled: true + reference_file: BYO_PHIX_REFERENCE.xlsx # Placeholder; operators supply actual PHIX workbook path + phu_mapping_file: config/phu_aliases.yaml # Maps PHIX column names -> template codes + target_phu_code: null # Optional default scope when no template is provided + unmatched_behavior: warn # warn | error | skip +``` + +**Behavior modes:** +- `warn` - Log warning, continue processing all records +- `error` - Fail pipeline if any facilities don't match +- `skip` - Filter out records with unmatched facilities + +--- + +## How It Works + +### Pipeline Flow + +``` +Step 2: Preprocessing +β”œβ”€β”€ read_input() +β”œβ”€β”€ map_columns() +β”œβ”€β”€ filter_columns() +β”œβ”€β”€ ensure_required_columns() +β”œβ”€β”€ check_addresses_complete() +β”œβ”€β”€ validate_facilities() ← NEW: PHIX validation +β”‚ β”œβ”€β”€ Load PHIX Excel reference +β”‚ β”œβ”€β”€ Normalize facility names +β”‚ β”œβ”€β”€ Map PHIX PHU columns to canonical template codes +β”‚ β”œβ”€β”€ Restrict matching to template/config PHU scope (if configured) +β”‚ β”œβ”€β”€ Try exact match (100% confidence) +β”‚ β”œβ”€β”€ Write unmatched to CSV +β”‚ └── Enrich DataFrame/metadata with PHIX IDs + PHU scope +β”œβ”€β”€ build_preprocess_result() +└── write_artifact() +``` + +### PHIX Reference Format + +The Excel file has one column per PHU, with values in `"FACILITY NAME - ID"` format: + +``` +| Algoma PHU | Brant PHU | 
+|----------------------------------------|------------------------------|
+| ANNA MCCREA PUBLIC SCHOOL - 019186     | BRANTFORD ELEMENTARY - 12345 |
+| SUNSHINE DAYCARE - AL-0003561          | MAPLE CHILDCARE - 67890      |
+```
+
+### Output
+
+**Console:**
+```
+Step 2: Preprocessing
+...
+🏫 PHIX validation complete: 1,247 records validated
+⚠️ 3 facilities not found in PHIX reference. See output/unmatched_facilities.csv
+```
+
+**Enriched data (per record):**
+- `PHIX_ID` - Official facility identifier
+- `PHIX_MATCH_CONFIDENCE` - Match score (0-100)
+- `PHIX_MATCH_TYPE` - "exact" or "none"
+- `PHIX_MATCHED_PHU` / `_CODE` - PHU column + canonical code for the matched facility
+- `PHIX_TARGET_PHU_CODE` / `_LABEL` - Template/config PHU scope copied to metadata
+- Artifact metadata includes `phix_validation` payload so Step 6 can log PHIX scope per PDF
+
+**Unmatched report (`output/unmatched_facilities.csv`):**
+```csv
+facility_name,match_type,confidence,target_phu_code,target_phu_label
+Lincon Elementary School,none,0,,
+New Daycare Centre,none,0,,
+```
+
+The `target_phu_code`/`target_phu_label` columns are empty when no PHU scope is
+configured; otherwise they record the scope that was in effect for the run.
+
+---
+
+## Design Decisions
+
+### 1. Excel vs JSON for reference data
+
+**Decision:** Keep Excel as source (official PHIX format PHUs receive)
+
+**Rationale:**
+- PHU staff can update without technical knowledge
+- No manual conversion step to forget
+- Caching makes load time acceptable
+
+**Trade-off:** Slightly slower initial load (~0.5s vs ~50ms)
+
+### 2. Integration point
+
+**Decision:** After `check_addresses_complete()`, before `build_preprocess_result()`
+
+**Rationale:**
+- Validates after basic data normalization
+- Can filter records before building client objects
+- Follows existing validation pattern (addresses)
+
+### 3. 
Matching algorithm (exact-only)
+
+**Decision:** Require exact facility name matches (case-insensitive)
+
+**Rationale:**
+- Eliminates the risk of picking the wrong facility due to similar spellings
+- Aligns with Panorama usage where facility folders/templates must match canonical names
+- Encourages upstream data normalization (alias mapping now solved by PHU mapping, not name similarity)
+- No similarity threshold is involved; near-misses surface in `unmatched_facilities.csv`
+  for manual correction rather than silently matching the wrong facility
+
+### 4. PHU alias mapping and PDF auditing
+
+**Decision:** Require a YAML mapping (`config/phu_aliases.yaml`) that links PHIX column headers
+to template acronyms; propagate PHIX scope into PDF validation logs.
+
+**Rationale:**
+- Prevents cross-PHU matches when multiple units share similarly named schools.
+- Supports PHU mergers by allowing multiple PHIX aliases to map to one template code.
+- PDF validation now records `phix_target_phu_code`/`phix_matched_phu_code` per notice and
+  emits a `phix_target_phu` warning if the PHU codes diverge, giving auditors a traceable log.
+
+---
+
+## Testing
+
+```bash
+# Run PHIX validation tests only
+uv run pytest tests/unit/test_validate_phix.py -v
+
+# Run all unit tests
+uv run pytest -m unit
+```
+
+**Test coverage:**
+- Parsing PHIX entries ("NAME - ID" format)
+- Exact match behavior (case-insensitive, typo rejection)
+- All three `unmatched_behavior` modes
+- Edge cases (empty data, missing columns)
+- Caching behavior
+
+---
+
+## Future Enhancements
+
+1. **JSON cache layer** - Auto-generate JSON cache for faster loads
+2. **CLI command** - `viper convert-phix` to pre-generate JSON
+3. **PHU filtering at load time** - Only *load* facilities for the configured PHU
+   (matching is already scoped to the target PHU; loading still reads the full workbook)
+4. **Facility alias support** - Allow manual alias mappings for known facility-name
+   variations (PHU-level aliases are already handled via `config/phu_aliases.yaml`)
+
+**Note:** BYO PHIX file — keep the licensed workbook outside git (it is ignored by
+pattern) and update `reference_file` locally before running.
diff --git a/plans/README.md b/plans/README.md new file mode 100644 index 0000000..0ccdec0 --- /dev/null +++ b/plans/README.md @@ -0,0 +1,35 @@ +# Plans Directory + +This directory contains implementation plans for features developed with AI coding assistance. + +## Purpose + +- Document the reasoning behind implementations +- Provide context for code reviewers +- Help future contributors understand design decisions +- Track what has been implemented and what is planned + +## Naming Convention + +``` +NNN-feature-name.md +``` + +Where `NNN` is a zero-padded sequence number. + +## Plan Structure + +Each plan should include: + +1. **Problem Statement** - What problem does this solve? +2. **Solution** - High-level approach +3. **Implementation** - Files created/modified +4. **Design Decisions** - Key choices and rationale +5. **Testing** - How to verify it works +6. **Future Enhancements** - What could be added later + +## Current Plans + +| # | Feature | Status | Date | +|---|---------|--------|------| +| 001 | [Validate PHIX Reference](001-validate-phix-reference.md) | βœ… Implemented | 2026-01-14 | diff --git a/tests/unit/test_run_pipeline.py b/tests/unit/test_run_pipeline.py index dff22a8..f3cdd99 100644 --- a/tests/unit/test_run_pipeline.py +++ b/tests/unit/test_run_pipeline.py @@ -220,7 +220,8 @@ def test_run_step_2_preprocess(self, tmp_test_dir: Path, tmp_output_structure: d mock_result.clients = [client1, client2] mock_result.warnings = [] - with patch("pipeline.orchestrator.preprocess.read_input", return_value=mock_df), \ + with patch("pipeline.orchestrator.load_config", return_value={"phix_validation": {"enabled": False}}), \ + patch("pipeline.orchestrator.preprocess.read_input", return_value=mock_df), \ patch("pipeline.orchestrator.preprocess.map_columns", return_value=(mock_mapped_df, {})), \ patch("pipeline.orchestrator.preprocess.filter_columns", return_value=mock_filtered_df), \ patch("pipeline.orchestrator.preprocess.ensure_required_columns", 
return_value=mock_final_df), \ @@ -236,6 +237,8 @@ def test_run_step_2_preprocess(self, tmp_test_dir: Path, tmp_output_structure: d output_dir=tmp_output_structure["root"], language="en", run_id="test_20250101_120000", + config_dir=tmp_test_dir, + template_code=None, ) assert total == 2 diff --git a/tests/unit/test_validate_pdfs.py b/tests/unit/test_validate_pdfs.py index 3eabb86..91538df 100644 --- a/tests/unit/test_validate_pdfs.py +++ b/tests/unit/test_validate_pdfs.py @@ -769,3 +769,69 @@ def test_client_id_presence_disabled(self, tmp_path: Path) -> None: # Should have no warnings because all rules are disabled assert len(result.warnings) == 0 + + +@pytest.mark.unit +class TestPhixMetadataLogging: + """Tests for PHIX metadata measurements in PDF validation.""" + + def test_measurements_include_phix_scope(self, tmp_path: Path) -> None: + """PHIX metadata is propagated into validation measurements.""" + pdf_path = tmp_path / "en_notice_00001_1009876543.pdf" + writer = PdfWriter() + writer.add_blank_page(width=612, height=792) + with open(pdf_path, "wb") as f: + writer.write(f) + + metadata_map = { + pdf_path.name: { + "phix_validation": { + "phu_code": "test_phu_1", + "phu_name": "Test PHU 1", + "match_type": "exact", + "confidence": 100, + }, + "phix_target_phu_code": "test_phu_1", + "phix_target_phu_label": "Test PHU 1", + "school_name": "Lincoln Elementary School", + } + } + + result = validate_pdfs.validate_pdf_structure( + pdf_path, + enabled_rules={"exactly_two_pages": "disabled"}, + client_metadata_map=metadata_map, + ) + + assert result.measurements["phix_target_phu_code"] == "test_phu_1" + assert result.measurements["phix_matched_phu_code"] == "test_phu_1" + assert "phix_target_phu" not in "".join(result.warnings) + + def test_mismatch_emits_warning(self, tmp_path: Path) -> None: + """PHU mismatch triggers per-PDF warning.""" + pdf_path = tmp_path / "en_notice_00001_1009876543.pdf" + writer = PdfWriter() + writer.add_blank_page(width=612, height=792) + 
with open(pdf_path, "wb") as f: + writer.write(f) + + metadata_map = { + pdf_path.name: { + "phix_validation": { + "phu_code": "test_phu_2", + "phu_name": "Test PHU 2", + "match_type": "exact", + "confidence": 100, + }, + "phix_target_phu_code": "test_phu_1", + "phix_target_phu_label": "Test PHU 1", + } + } + + result = validate_pdfs.validate_pdf_structure( + pdf_path, + enabled_rules={"exactly_two_pages": "disabled"}, + client_metadata_map=metadata_map, + ) + + assert any("phix_target_phu" in warning for warning in result.warnings) diff --git a/tests/unit/test_validate_phix.py b/tests/unit/test_validate_phix.py new file mode 100644 index 0000000..4083b45 --- /dev/null +++ b/tests/unit/test_validate_phix.py @@ -0,0 +1,441 @@ +"""Unit tests for PHIX validation module. + +Tests cover: +- Loading and parsing PHIX reference Excel files +- Exact matching against reference list (no fuzzy fallback) +- Different unmatched_behavior modes (warn, error, skip) +- Edge cases (empty data, missing columns) +""" + +from __future__ import annotations + +import tempfile +from pathlib import Path +from unittest.mock import patch + +import pandas as pd +import pytest + +from pipeline.validate_phix import ( + PHIXFacility, + PHIXMatchResult, + clear_cache, + load_phix_reference, + match_facility, + normalize_facility_name, + parse_facility_entry, + validate_facilities, +) + + +@pytest.fixture(autouse=True) +def reset_cache(): + """Clear PHIX reference cache before and after each test.""" + clear_cache() + yield + clear_cache() + + +@pytest.fixture +def sample_phix_excel(tmp_path: Path) -> Path: + """Create a sample PHIX reference Excel file for testing.""" + data = { + "Test PHU 1": [ + "Lincoln Elementary School - SCH001", + "Maple High School - SCH002", + "Sunshine Childcare Centre - DAY001", + None, # Empty row + ], + "Test PHU 2": [ + "Oak Valley Public School - SCH003", + "Bright Futures Daycare - DAY002", + None, + None, + ], + } + df = pd.DataFrame(data) + excel_path = tmp_path 
/ "test_phix_reference.xlsx" + df.to_excel(excel_path, sheet_name="Schools & Day Cares", index=False) + return excel_path + + +@pytest.fixture +def sample_phu_mapping(tmp_path: Path) -> Path: + """Create a sample PHU alias mapping file.""" + mapping_path = tmp_path / "phu_aliases.yaml" + mapping_path.write_text( + """ +phu_aliases: + test_phu_1: + display_name: Test PHU 1 + aliases: + - Test PHU 1 + test_phu_2: + display_name: Test PHU 2 + aliases: + - Test PHU 2 + """.strip(), + encoding="utf-8", + ) + return mapping_path + + +class TestParseFacilityEntry: + """Tests for parse_facility_entry function.""" + + def test_parse_standard_entry(self): + """Parse standard 'NAME - ID' format.""" + facility = parse_facility_entry( + "ANNA MCCREA PUBLIC SCHOOL - 019186", "Test PHU" + ) + assert facility is not None + assert facility.name == "ANNA MCCREA PUBLIC SCHOOL" + assert facility.phix_id == "019186" + assert facility.phu == "Test PHU" + + def test_parse_entry_with_multiple_dashes(self): + """Parse entry where name contains dashes.""" + facility = parse_facility_entry( + "ST. MARY'S CO-OP - PRE-SCHOOL - DAY123", "PHU" + ) + assert facility is not None + # Should split on last " - " + assert facility.name == "ST. 
MARY'S CO-OP - PRE-SCHOOL" + assert facility.phix_id == "DAY123" + + def test_parse_entry_no_id(self): + """Parse entry without ID separator.""" + facility = parse_facility_entry("Some School Name", "PHU") + assert facility is not None + assert facility.name == "Some School Name" + assert facility.phix_id == "" + + def test_parse_empty_entry(self): + """Empty entries return None.""" + assert parse_facility_entry("", "PHU") is None + assert parse_facility_entry(None, "PHU") is None # type: ignore[arg-type] + assert parse_facility_entry(" ", "PHU") is None + + def test_parse_nan_entry(self): + """NaN values return None.""" + assert parse_facility_entry(float("nan"), "PHU") is None # type: ignore[arg-type] + + +class TestNormalizeFacilityName: + """Tests for normalize_facility_name function.""" + + def test_uppercase_conversion(self): + """Names are converted to uppercase.""" + assert normalize_facility_name("Lincoln School") == "LINCOLN SCHOOL" + + def test_whitespace_normalization(self): + """Extra whitespace is collapsed.""" + assert normalize_facility_name(" Lincoln School ") == "LINCOLN SCHOOL" + + def test_empty_string(self): + """Empty string returns empty.""" + assert normalize_facility_name("") == "" + assert normalize_facility_name(" ") == "" + + +class TestLoadPhixReference: + """Tests for load_phix_reference function.""" + + def test_load_valid_file(self, sample_phix_excel: Path): + """Load and parse valid PHIX reference file.""" + ref = load_phix_reference(sample_phix_excel) + + assert "facilities" in ref + assert "by_name" in ref + assert "phus" in ref + assert "by_name_phu" in ref + + # Should have 5 facilities (excluding None rows) + assert len(ref["facilities"]) == 5 + assert len(ref["phus"]) == 2 + + def test_caching(self, sample_phix_excel: Path): + """Second load returns cached data.""" + ref1 = load_phix_reference(sample_phix_excel) + ref2 = load_phix_reference(sample_phix_excel) + + # Should be same object (cached) + assert ref1 is ref2 + + def 
test_file_not_found(self, tmp_path: Path): + """Missing file raises FileNotFoundError.""" + with pytest.raises(FileNotFoundError, match="PHIX reference file not found"): + load_phix_reference(tmp_path / "nonexistent.xlsx") + + +class TestMatchFacility: + """Tests for match_facility function.""" + + def test_exact_match(self, sample_phix_excel: Path): + """Exact match returns 100% confidence.""" + ref = load_phix_reference(sample_phix_excel) + + result = match_facility("Lincoln Elementary School", ref) + assert result.matched is True + assert result.phix_id == "SCH001" + assert result.confidence == 100 + assert result.match_type == "exact" + assert result.phu_name == "Test PHU 1" + + def test_exact_match_case_insensitive(self, sample_phix_excel: Path): + """Exact match is case-insensitive.""" + ref = load_phix_reference(sample_phix_excel) + + result = match_facility("LINCOLN ELEMENTARY SCHOOL", ref) + assert result.matched is True + assert result.confidence == 100 + + def test_similar_name_not_matched(self, sample_phix_excel: Path): + """Slight typos do not match (exact-only policy).""" + ref = load_phix_reference(sample_phix_excel) + + # Slight typo that would match fuzzy + result = match_facility("Lincoln Elementry School", ref) + assert result.matched is False + + def test_empty_input(self, sample_phix_excel: Path): + """Empty input returns no match.""" + ref = load_phix_reference(sample_phix_excel) + + result = match_facility("", ref) + assert result.matched is False + + result = match_facility(" ", ref) + assert result.matched is False + + +class TestValidateFacilities: + """Tests for validate_facilities function.""" + + def test_validate_all_matched(self, sample_phix_excel: Path, tmp_path: Path): + """All facilities matched returns no warnings.""" + df = pd.DataFrame({ + "SCHOOL_NAME": ["Lincoln Elementary School", "Maple High School"], + "OTHER_COL": ["A", "B"], + }) + + result_df, warnings = validate_facilities(df, sample_phix_excel, tmp_path) + + assert 
len(result_df) == 2 + assert "PHIX_ID" in result_df.columns + assert "PHIX_MATCH_CONFIDENCE" in result_df.columns + assert result_df.iloc[0]["PHIX_ID"] == "SCH001" + assert "PHIX_MATCHED_PHU" in result_df.columns + assert result_df.iloc[0]["PHIX_MATCHED_PHU"] == "Test PHU 1" + assert "PHIX_TARGET_PHU_CODE" in result_df.columns + assert pd.isna(result_df["PHIX_TARGET_PHU_CODE"]).all() + assert len(warnings) == 0 + + def test_validate_scoped_to_target_phu( + self, + sample_phix_excel: Path, + sample_phu_mapping: Path, + tmp_path: Path, + ): + """Facilities outside target PHU remain unmatched.""" + df = pd.DataFrame( + { + "SCHOOL_NAME": [ + "Lincoln Elementary School", # PHU 1 + "Oak Valley Public School", # PHU 2 + ], + } + ) + + result_df, warnings = validate_facilities( + df, + sample_phix_excel, + tmp_path, + target_phu_codes=["test_phu_2"], + phu_mapping_path=sample_phu_mapping, + ) + + assert result_df.iloc[0]["PHIX_MATCH_TYPE"] == "none" + assert result_df.iloc[0]["PHIX_ID"] is None + assert result_df.iloc[1]["PHIX_ID"] == "SCH003" + assert result_df.iloc[1]["PHIX_MATCHED_PHU_CODE"] == "test_phu_2" + assert result_df.iloc[1]["PHIX_TARGET_PHU_CODE"] == "test_phu_2" + assert len(warnings) == 1 # unmatched facility warning + unmatched_csv = tmp_path / "unmatched_facilities.csv" + assert unmatched_csv.exists() + + def test_validate_missing_mapping_for_target( + self, sample_phix_excel: Path, tmp_path: Path + ): + """Target PHU requires mapping file.""" + df = pd.DataFrame({"SCHOOL_NAME": ["Lincoln Elementary School"]}) + + with pytest.raises(ValueError, match="phu_mapping_file"): + validate_facilities( + df, + sample_phix_excel, + tmp_path, + target_phu_codes=["test_phu_1"], + ) + + def test_validate_unknown_template_code( + self, sample_phix_excel: Path, sample_phu_mapping: Path, tmp_path: Path + ): + """Unknown template code raises descriptive error.""" + df = pd.DataFrame({"SCHOOL_NAME": ["Lincoln Elementary School"]}) + + with pytest.raises(ValueError, 
match="not defined"): + validate_facilities( + df, + sample_phix_excel, + tmp_path, + target_phu_codes=["unknown_code"], + phu_mapping_path=sample_phu_mapping, + ) + + def test_validate_missing_phu_alias( + self, sample_phix_excel: Path, tmp_path: Path + ): + """Missing PHU alias in mapping is surfaced.""" + df = pd.DataFrame({"SCHOOL_NAME": ["Lincoln Elementary School"]}) + mapping_path = tmp_path / "phu_aliases_partial.yaml" + mapping_path.write_text( + """ +phu_aliases: + test_phu_1: + display_name: Example Display + aliases: + - Unknown Alias + """.strip(), + encoding="utf-8", + ) + + with pytest.raises(ValueError, match="No PHIX columns mapped"): + validate_facilities( + df, + sample_phix_excel, + tmp_path, + target_phu_codes=["test_phu_1"], + phu_mapping_path=mapping_path, + ) + + def test_validate_with_unmatched_warn( + self, sample_phix_excel: Path, tmp_path: Path + ): + """Unmatched facilities with warn behavior logs warning.""" + df = pd.DataFrame({ + "SCHOOL_NAME": ["Lincoln Elementary School", "Unknown School XYZ"], + }) + + result_df, warnings = validate_facilities( + df, + sample_phix_excel, + tmp_path, + unmatched_behavior="warn", + ) + + assert len(result_df) == 2 # All records kept + assert len(warnings) == 1 + assert "not found in PHIX reference" in warnings[0] + + # Check unmatched CSV was written + unmatched_csv = tmp_path / "unmatched_facilities.csv" + assert unmatched_csv.exists() + + def test_validate_with_unmatched_error( + self, sample_phix_excel: Path, tmp_path: Path + ): + """Unmatched facilities with error behavior raises.""" + df = pd.DataFrame({ + "SCHOOL_NAME": ["Lincoln Elementary School", "Unknown School XYZ"], + }) + + with pytest.raises(ValueError, match="not found in PHIX reference"): + validate_facilities( + df, + sample_phix_excel, + tmp_path, + unmatched_behavior="error", + ) + + def test_validate_with_unmatched_skip( + self, sample_phix_excel: Path, tmp_path: Path + ): + """Unmatched facilities with skip behavior filters them 
out.""" + df = pd.DataFrame({ + "SCHOOL_NAME": ["Lincoln Elementary School", "Unknown School XYZ"], + }) + + result_df, warnings = validate_facilities( + df, + sample_phix_excel, + tmp_path, + unmatched_behavior="skip", + ) + + assert len(result_df) == 1 # Unknown filtered out + assert result_df.iloc[0]["SCHOOL_NAME"] == "Lincoln Elementary School" + assert len(warnings) == 2 # unmatched warning + filtered warning + + def test_validate_missing_column( + self, sample_phix_excel: Path, tmp_path: Path + ): + """Missing school column skips validation gracefully.""" + df = pd.DataFrame({ + "OTHER_COL": ["A", "B"], + }) + + result_df, warnings = validate_facilities( + df, sample_phix_excel, tmp_path, + school_column="SCHOOL_NAME", + ) + + assert len(result_df) == 2 + assert "PHIX_ID" not in result_df.columns + assert len(warnings) == 0 + + def test_validate_empty_dataframe( + self, sample_phix_excel: Path, tmp_path: Path + ): + """Empty DataFrame returns empty with no errors.""" + df = pd.DataFrame({"SCHOOL_NAME": []}) + + result_df, warnings = validate_facilities( + df, sample_phix_excel, tmp_path + ) + + assert len(result_df) == 0 + assert len(warnings) == 0 + + +class TestPHIXFacilityDataclass: + """Tests for PHIXFacility dataclass.""" + + def test_hash_equality(self): + """Two facilities with same data have same hash.""" + f1 = PHIXFacility(phix_id="123", name="Test", phu="PHU1") + f2 = PHIXFacility(phix_id="123", name="Test", phu="PHU1") + + assert hash(f1) == hash(f2) + + def test_hash_difference(self): + """Different facilities have different hashes.""" + f1 = PHIXFacility(phix_id="123", name="Test", phu="PHU1") + f2 = PHIXFacility(phix_id="456", name="Test", phu="PHU1") + + assert hash(f1) != hash(f2) + + +class TestPHIXMatchResultDataclass: + """Tests for PHIXMatchResult dataclass.""" + + def test_default_values(self): + """Verify default values.""" + result = PHIXMatchResult(input_name="Test", matched=False) + + assert result.phix_id is None + assert 
result.phix_name is None + assert result.phu_name is None + assert result.phu_code is None + assert result.confidence == 0 + assert result.match_type == "none"