diff --git a/.gitignore b/.gitignore index a1c9df5..0d8ca03 100644 --- a/.gitignore +++ b/.gitignore @@ -15,7 +15,10 @@ coverage.xml coverage.json !input/rodent_dataset.xlsx input/* +PHIX Reference*.xlsx +PHIX Reference*.xls +*PHIX*Reference*List*.xlsx phu_templates/* !phu_templates/README.md !phu_templates/.gitkeep -.gitmodules \ No newline at end of file +.gitmodules diff --git a/config/parameters.yaml b/config/parameters.yaml index 5a76b33..5296686 100644 --- a/config/parameters.yaml +++ b/config/parameters.yaml @@ -1,6 +1,27 @@ bundling: bundle_size: 100 group_by: null + +# Step 2: Preprocessing - PHIX Reference Validation +# Validates school/daycare names against the official PHIX reference list +phix_validation: + enabled: true + # Path to PHIX reference Excel file (relative to project root) + # IMPORTANT: We cannot redistribute the PHIX workbook. Provide your own copy + # and update this path (absolute path or relative to repo root). + reference_file: BYO_PHIX_REFERENCE.xlsx + # Optional mapping file that converts PHIX PHU column names to canonical + # template codes. Required when target_phu_code or --template is provided. + phu_mapping_file: config/phu_aliases.yaml + # Optional default PHU scope when running without --template. + # Accepts a single code or list of codes. Leave null to accept all PHUs. + target_phu_code: null + # How to handle unmatched facilities: 'warn', 'error', or 'skip' + # - warn: Log warning, continue processing all records + # - error: Fail pipeline if any facilities don't match + # - skip: Filter out records with unmatched facilities + unmatched_behavior: warn + chart_diseases_header: - Diphtheria - Tetanus diff --git a/config/phu_aliases.yaml b/config/phu_aliases.yaml new file mode 100644 index 0000000..bbf1c4c --- /dev/null +++ b/config/phu_aliases.yaml @@ -0,0 +1,30 @@ +# Maps canonical PHU acronyms (matching template folder names) to every +# alias that may appear in the PHIX reference workbook. 
Customize this file +# so each Public Health Unit is represented with all historic spellings, +# ensuring PHIX validation can restrict facilities to the correct PHU. +# +# Example usage: +# phu_aliases: +# wdgph: +# display_name: Wellington-Dufferin-Guelph Public Health +# aliases: +# - Wellington Dufferin Guelph Public Health +# - WDGPH + +phu_aliases: + wdgph: + display_name: Wellington-Dufferin-Guelph Public Health + aliases: + - Wellington Dufferin Guelph Public Health + - Wellington-Dufferin-Guelph Health Unit + - WDGPH + peel: + display_name: Peel Public Health + aliases: + - Region of Peel Public Health + - PEEL HEALTH UNIT + ottawa: + display_name: Ottawa Public Health + aliases: + - City of Ottawa Health Unit + - OTTAWA PHU diff --git a/docs/PDF_VALIDATION.md b/docs/PDF_VALIDATION.md index ec1b8db..5b72ea5 100644 --- a/docs/PDF_VALIDATION.md +++ b/docs/PDF_VALIDATION.md @@ -151,7 +151,8 @@ This markerless approach is also suitable for checks like: In step 6 (validation), the orchestrator: 1. Loads `preprocessed_clients_{run_id}.json` from `output/artifacts/`. 2. Builds a mapping: `filename -> expected_value` (e.g., client ID, sequence number). -3. Passes this mapping to `validate_pdfs.main(..., client_id_map=client_id_map)`. +3. Passes this mapping to `validate_pdfs.main(..., client_id_map=client_id_map, client_metadata_map=client_metadata_map)`. +4. `client_metadata_map` currently carries PHIX validation context (target PHU, matched PHU, school name) so every per-PDF log records which facility/PHU was validated upstream. Rules then validate against the mapping using artifact data as the source of truth. @@ -165,6 +166,22 @@ Current rule: Searches for any 10-digit number in the PDF text and compares to t This ensures every generated PDF contains the correct client ID, catching generation errors or data drift early. 
+### Example: PHIX facility scope tracking + +- Preprocessing stores PHIX validation metadata (`phix_validation`) in each client's artifact entry. +- The orchestrator passes this data via `client_metadata_map`. +- `validate_pdf_layout` records `phix_target_phu_code`, `phix_matched_phu_code`, and `phix_match_confidence` in each PDF's `measurements`. +- If a PDF's matched PHU does not align with the template's target PHU, the validator emits a `phix_target_phu` warning per file. + +This gives auditors a traceable link from every generated PDF back to the PHIX reference data used during preprocessing. + +### PHIX reference workbook is BYO + +- The official PHIX reference workbook is licensed and cannot be redistributed in this repository. +- `.gitignore` explicitly ignores `PHIX Reference*.xls*` so accidental copies never enter history. +- `config/parameters.yaml` ships with the placeholder `BYO_PHIX_REFERENCE.xlsx`; operators must point it at their local copy before running Step 2. +- Document the location internally (outside git) and ensure CI/CD environments mount the workbook securely (e.g., secrets storage or mounted volume). + ## Why we prefer template‑emitted measurements over PDF distance math We strongly prefer emitting precise measurements from the Typst template (via `measure()` and `MEASURE_...` markers) instead of inferring sizes by computing distances between two markers in extracted PDF text. Reasons: diff --git a/pipeline/orchestrator.py b/pipeline/orchestrator.py index 9517c6f..a04df34 100755 --- a/pipeline/orchestrator.py +++ b/pipeline/orchestrator.py @@ -39,9 +39,10 @@ import traceback from datetime import datetime, timezone from pathlib import Path +from typing import Optional # Import pipeline steps -from . import bundle_pdfs, cleanup, compile_notices, validate_pdfs +from . import bundle_pdfs, cleanup, compile_notices, validate_pdfs, validate_phix from . 
import ( encrypt_notice, generate_notices, @@ -50,6 +51,7 @@ preprocess, ) from .config_loader import load_config +from .data_models import PreprocessResult from .enums import Language SCRIPT_DIR = Path(__file__).resolve().parent @@ -123,6 +125,7 @@ def validate_args(args: argparse.Namespace) -> None: f"Input file not found: {args.input_dir / args.input_file}" ) + args.template_key = None # Resolve template directory if args.template_dir is None: # No custom template specified; use default @@ -136,6 +139,7 @@ def validate_args(args: argparse.Namespace) -> None: f"Expected a simple name like 'wdgph' or 'my_phu', not a path." ) + args.template_key = args.template_dir.strip() phu_template_path = DEFAULT_PHU_TEMPLATES_DIR / args.template_dir if not phu_template_path.exists(): raise FileNotFoundError( @@ -209,6 +213,8 @@ def run_step_2_preprocess( output_dir: Path, language: str, run_id: str, + config_dir: Path, + template_code: Optional[str] = None, ) -> int: """Step 2: Preprocessing. @@ -220,6 +226,10 @@ def run_step_2_preprocess( # Configure logging log_path = preprocess.configure_logging(output_dir, run_id) + # Load configuration for PHIX validation + config = load_config(config_dir / "parameters.yaml") + project_root = config_dir.parent + # Load and process input data input_path = input_dir / input_file df_raw = preprocess.read_input(input_path) @@ -230,6 +240,51 @@ def run_step_2_preprocess( # Check that addresses are complete, return only complete rows df = preprocess.check_addresses_complete(df) + # Validate schools/daycares against PHIX reference list + phix_config = config.get("phix_validation", {}) + phix_warnings: list[str] = [] + if phix_config.get("enabled", False): + reference_file = phix_config.get("reference_file", "") + mapping_file = phix_config.get("phu_mapping_file") + mapping_path: Optional[Path] = None + if mapping_file: + mapping_path = Path(mapping_file) + if not mapping_path.is_absolute(): + mapping_path = (project_root / 
mapping_file).resolve() + if not mapping_path.exists(): + raise FileNotFoundError(f"PHU alias mapping file not found: {mapping_path}") + + target_phu_codes: set[str] = set() + configured_target = phix_config.get("target_phu_code") + if isinstance(configured_target, str): + configured_target = [configured_target] + if isinstance(configured_target, (list, tuple, set)): + for code in configured_target: + if code and str(code).strip(): + target_phu_codes.add(str(code)) + if template_code: + target_phu_codes.add(template_code) + + if reference_file: + reference_path = Path(reference_file) + # If relative path, resolve from project root + if not reference_path.is_absolute(): + reference_path = (project_root / reference_file).resolve() + if reference_path.exists(): + df, phix_warnings = validate_phix.validate_facilities( + df=df, + reference_path=reference_path, + output_dir=output_dir, + unmatched_behavior=phix_config.get("unmatched_behavior", "warn"), + target_phu_codes=target_phu_codes or None, + phu_mapping_path=mapping_path, + ) + print(f"🏫 PHIX validation complete: {len(df)} records validated") + else: + print(f"⚠️ PHIX reference file not found: {reference_path}") + else: + print("⚠️ PHIX validation enabled but no reference_file configured") + # Load configuration vaccine_reference_path = preprocess.VACCINE_REFERENCE_PATH vaccine_reference = json.loads(vaccine_reference_path.read_text(encoding="utf-8")) @@ -239,6 +294,14 @@ def run_step_2_preprocess( df, language, vaccine_reference, preprocess.REPLACE_UNSPECIFIED ) + # Merge PHIX validation warnings into result + if phix_warnings: + combined_warnings = list(result.warnings) + phix_warnings + result = PreprocessResult( + clients=result.clients, + warnings=combined_warnings, + ) + # Write artifact artifact_path = preprocess.write_artifact( output_dir / "artifacts", language, run_id, result @@ -397,6 +460,7 @@ def run_step_6_validate_pdfs( # Load preprocessed clients to build client ID mapping client_id_map = {} + 
client_metadata_map = {} import json with open(preprocessed_json, "r", encoding="utf-8") as f: @@ -406,11 +470,20 @@ def run_step_6_validate_pdfs( # Filename format: {language}_notice_{sequence:05d}_{client_id}.pdf for idx, client in enumerate(clients, start=1): client_id = str(client.get("client_id", "")) + metadata = client.get("metadata", {}) or {} + phix_meta = metadata.get("phix_validation", {}) or {} + school = client.get("school", {}) or {} # Try to match any expected filename format for ext in [".pdf"]: for lang_prefix in ["en", "fr"]: filename = f"{lang_prefix}_notice_{idx:05d}_{client_id}{ext}" client_id_map[filename] = client_id + client_metadata_map[filename] = { + "phix_validation": phix_meta, + "phix_target_phu_code": metadata.get("phix_target_phu_code"), + "phix_target_phu_label": metadata.get("phix_target_phu_label"), + "school_name": school.get("name"), + } # Validate PDFs (module loads validation rules from config_dir) validate_pdfs.main( @@ -419,6 +492,7 @@ def run_step_6_validate_pdfs( json_output=validation_json, client_id_map=client_id_map, config_dir=config_dir, + client_metadata_map=client_metadata_map, ) @@ -574,6 +648,8 @@ def main() -> int: output_dir, args.language, run_id, + config_dir, + args.template_key, ) step_duration = time.time() - step_start step_times.append(("Preprocessing", step_duration)) diff --git a/pipeline/preprocess.py b/pipeline/preprocess.py index 6209d54..c0033b2 100644 --- a/pipeline/preprocess.py +++ b/pipeline/preprocess.py @@ -730,6 +730,16 @@ def build_preprocess_result( warnings: set[str] = set() working = normalize_dataframe(df) + def clean_optional(value: Any) -> Any: + """Convert pandas NA or empty values to None.""" + if value is None: + return None + if isinstance(value, (float, int)) and pd.isna(value): + return None + if pd.isna(value): + return None + return value + # Load parameters for date_notice_delivery and chart_diseases_header params = {} if PARAMETERS_PATH.exists(): @@ -832,6 +842,34 @@ def 
build_preprocess_result( "postal_code": postal_code, } + phix_id = clean_optional(getattr(row, "PHIX_ID", None)) + phix_match_type = getattr(row, "PHIX_MATCH_TYPE", "none") + phix_match_conf = getattr(row, "PHIX_MATCH_CONFIDENCE", 0) + if pd.isna(phix_match_conf): + phix_match_conf = 0 + phix_match_conf = int(phix_match_conf) + phix_phu_name = clean_optional(getattr(row, "PHIX_MATCHED_PHU", None)) + phix_phu_code = clean_optional(getattr(row, "PHIX_MATCHED_PHU_CODE", None)) + phix_target_code = clean_optional(getattr(row, "PHIX_TARGET_PHU_CODE", None)) + phix_target_label = clean_optional(getattr(row, "PHIX_TARGET_PHU_LABEL", None)) + + metadata: Dict[str, Any] = { + "unique_id": row.UNIQUE_ID or None, # type: ignore[attr-defined] + } + metadata["phix_validation"] = { + "id": phix_id, + "match_type": phix_match_type or "none", + "confidence": phix_match_conf, + "phu_name": phix_phu_name, + "phu_code": phix_phu_code, + "target_phu_code": phix_target_code, + "target_phu_label": phix_target_label, + } + if phix_target_code: + metadata["phix_target_phu_code"] = phix_target_code + if phix_target_label: + metadata["phix_target_phu_label"] = phix_target_label + client = ClientRecord( sequence=sequence, client_id=client_id, @@ -843,9 +881,7 @@ def build_preprocess_result( vaccines_due=vaccines_due if vaccines_due else None, vaccines_due_list=vaccines_due_list if vaccines_due_list else None, received=received if received else None, - metadata={ - "unique_id": row.UNIQUE_ID or None, # type: ignore[attr-defined] - }, + metadata=metadata, ) clients.append(client) diff --git a/pipeline/validate_pdfs.py b/pipeline/validate_pdfs.py index 8e15d67..527a26b 100644 --- a/pipeline/validate_pdfs.py +++ b/pipeline/validate_pdfs.py @@ -242,6 +242,7 @@ def validate_pdf_layout( reader: PdfReader, enabled_rules: dict[str, str], client_id_map: dict[str, str] | None = None, + client_metadata_map: dict[str, dict[str, object]] | None = None, ) -> tuple[List[str], dict[str, float]]: """Check PDF 
for layout issues using invisible markers and metadata. @@ -256,6 +257,9 @@ def validate_pdf_layout( client_id_map : dict[str, str], optional Mapping of PDF filename (without path) to expected client ID. If provided, client_id_presence validation uses this as source of truth. + client_metadata_map : dict[str, dict], optional + Additional per-client metadata (PHIX validation scope, school name, etc.) + injected into per-PDF measurements for audit trails. Returns ------- @@ -343,6 +347,39 @@ def validate_pdf_layout( # If client ID check fails, skip silently (parsing error) pass + if client_metadata_map: + meta = client_metadata_map.get(pdf_path.name, {}) + phix_meta = meta.get("phix_validation") or {} + target_phu_code = meta.get("phix_target_phu_code") or phix_meta.get("target_phu_code") + target_phu_label = meta.get("phix_target_phu_label") or phix_meta.get("target_phu_label") + matched_phu_code = phix_meta.get("phu_code") + matched_phu_name = phix_meta.get("phu_name") + match_type = phix_meta.get("match_type") + match_confidence = phix_meta.get("confidence", 0) + facility_name = meta.get("school_name") + + measurements["phix_target_phu_code"] = target_phu_code or "" + measurements["phix_target_phu_label"] = target_phu_label or "" + measurements["phix_matched_phu_code"] = matched_phu_code or "" + measurements["phix_matched_phu_name"] = matched_phu_name or "" + measurements["phix_match_type"] = match_type or "" + try: + measurements["phix_match_confidence"] = int(match_confidence or 0) + except (TypeError, ValueError): + measurements["phix_match_confidence"] = 0 + if facility_name: + measurements["phix_facility_school"] = facility_name + + if ( + target_phu_code + and matched_phu_code + and target_phu_code != matched_phu_code + ): + warnings.append( + "phix_target_phu: Facility PHU code " + f"{matched_phu_code} does not match template PHU {target_phu_code}" + ) + return warnings, measurements @@ -350,6 +387,7 @@ def validate_pdf_structure( pdf_path: Path, 
enabled_rules: dict[str, str] | None = None, client_id_map: dict[str, str] | None = None, + client_metadata_map: dict[str, dict[str, object]] | None = None, ) -> ValidationResult: """Validate a single PDF file for structure and layout. @@ -361,6 +399,9 @@ def validate_pdf_structure( Validation rules configuration (rule_name -> "disabled"/"warn"/"error"). client_id_map : dict[str, str], optional Mapping of PDF filename to expected client ID (from preprocessed_clients.json). + client_metadata_map : dict[str, dict], optional + Per-PDF metadata (PHIX validation scope, school name, etc.) to include in + measurements and warnings. Returns ------- @@ -390,7 +431,11 @@ def validate_pdf_structure( # Validate layout using markers layout_warnings, layout_measurements = validate_pdf_layout( - pdf_path, reader, enabled_rules, client_id_map=client_id_map + pdf_path, + reader, + enabled_rules, + client_id_map=client_id_map, + client_metadata_map=client_metadata_map, ) warnings.extend(layout_warnings) measurements.update(layout_measurements) @@ -449,6 +494,7 @@ def validate_pdfs( files: List[Path], enabled_rules: dict[str, str] | None = None, client_id_map: dict[str, str] | None = None, + client_metadata_map: dict[str, dict[str, object]] | None = None, ) -> ValidationSummary: """Validate all PDF files and generate summary. @@ -460,6 +506,8 @@ def validate_pdfs( Validation rules configuration (rule_name -> "disabled"/"warn"/"error"). client_id_map : dict[str, str], optional Mapping of PDF filename to expected client ID (from preprocessed_clients.json). + client_metadata_map : dict[str, dict], optional + Per-PDF metadata (PHIX validation scope, school name, etc.). 
Returns ------- @@ -477,7 +525,10 @@ def validate_pdfs( for pdf_path in files: result = validate_pdf_structure( - pdf_path, enabled_rules=enabled_rules, client_id_map=client_id_map + pdf_path, + enabled_rules=enabled_rules, + client_id_map=client_id_map, + client_metadata_map=client_metadata_map, ) results.append(result) page_count = int(result.measurements.get("page_count", 0)) @@ -592,6 +643,7 @@ def main( json_output: Path | None = None, client_id_map: dict[str, str] | None = None, config_dir: Path | None = None, + client_metadata_map: dict[str, dict[str, object]] | None = None, ) -> ValidationSummary: """Main entry point for PDF validation. @@ -612,6 +664,9 @@ def main( Path to config directory containing parameters.yaml. Used to load enabled_rules if not explicitly provided. If not provided, uses default location (config/parameters.yaml in project root). + client_metadata_map : dict[str, dict], optional + Additional metadata (PHIX validation scope, school name, etc.) to inject + into per-PDF measurements for downstream auditing. Returns ------- @@ -632,11 +687,16 @@ def main( if client_id_map is None: client_id_map = {} + if client_metadata_map is None: + client_metadata_map = {} files = discover_pdfs(target) filtered = filter_by_language(files, language) summary = validate_pdfs( - filtered, enabled_rules=enabled_rules, client_id_map=client_id_map + filtered, + enabled_rules=enabled_rules, + client_id_map=client_id_map, + client_metadata_map=client_metadata_map, ) summary.language = language diff --git a/pipeline/validate_phix.py b/pipeline/validate_phix.py new file mode 100644 index 0000000..5232dee --- /dev/null +++ b/pipeline/validate_phix.py @@ -0,0 +1,583 @@ +"""Validate schools and daycares against PHIX Reference List. + +This module provides validation of school/daycare names against a canonical +PHIX reference list. It supports strict exact matching plus PHU alias +scoping to prevent cross-jurisdiction matches. 
+ +**Input Contract:** +- PHIX reference Excel file must exist at configured path +- Reference file must contain 'Schools & Day Cares' sheet +- Each column is a PHU, values are "FACILITY NAME - ID" format + +**Output Contract:** +- Returns validation results with matched PHIX IDs and confidence scores +- Writes unmatched facilities to CSV for PHU review +- Raises or warns based on configured `unmatched_behavior` + +**Usage:** + Called from preprocess.py after address validation, before building results. +""" + +from __future__ import annotations + +import logging +import re +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Dict, Iterable, List, Optional, Set, Tuple + +import pandas as pd +import yaml + +LOG = logging.getLogger(__name__) + +# Cache for loaded PHIX reference data and PHU alias mappings +_PHIX_REFERENCE_CACHE: Optional[Dict[str, Any]] = None +_PHU_MAPPING_CACHE: Optional[Dict[str, Any]] = None + + +@dataclass +class PHIXFacility: + """A facility entry from the PHIX reference list.""" + + phix_id: str + name: str + phu: str # Public Health Unit that owns this facility + + def __hash__(self) -> int: + return hash((self.phix_id, self.name, self.phu)) + + +@dataclass +class PHIXMatchResult: + """Result of matching a facility against PHIX reference.""" + + input_name: str + matched: bool + phix_id: Optional[str] = None + phix_name: Optional[str] = None + phu_name: Optional[str] = None + phu_code: Optional[str] = None + confidence: int = 0 + match_type: str = "none" # exact or none + + +def parse_facility_entry(entry: str, phu: str) -> Optional[PHIXFacility]: + """Parse a PHIX facility entry in 'NAME - ID' format. 
+ + Parameters + ---------- + entry : str + Raw entry from Excel like "ANNA MCCREA PUBLIC SCHOOL - 019186" + phu : str + Name of the Public Health Unit column + + Returns + ------- + PHIXFacility or None + Parsed facility, or None if entry is empty/invalid + """ + if not entry or pd.isna(entry): + return None + + entry = str(entry).strip() + if not entry: + return None + + # Parse "NAME - ID" format, where ID is the last segment after " - " + # Some names contain " - " so we split from the right + parts = entry.rsplit(" - ", maxsplit=1) + if len(parts) == 2: + name = parts[0].strip() + phix_id = parts[1].strip() + else: + # No ID separator found, use entire string as name + name = entry + phix_id = "" + + return PHIXFacility(phix_id=phix_id, name=name, phu=phu) + + +def load_phix_reference( + reference_path: Path, + sheet_name: str = "Schools & Day Cares", +) -> Dict[str, Any]: + """Load and parse the PHIX reference Excel file. + + Caches the result for subsequent calls with the same path. 
+ + Parameters + ---------- + reference_path : Path + Path to the PHIX reference Excel file + sheet_name : str + Name of the sheet containing school/daycare data + + Returns + ------- + Dict with keys: + - facilities: List[PHIXFacility] - all parsed facilities + - by_name: Dict[str, PHIXFacility] - lookup by normalized name + - by_name_phu: Dict[str, Dict[str, PHIXFacility]] - lookup by PHU column + - phus: List[str] - list of PHU column names + """ + global _PHIX_REFERENCE_CACHE + + cache_key = str(reference_path.resolve()) + if _PHIX_REFERENCE_CACHE is not None: + cached_path = _PHIX_REFERENCE_CACHE.get("_cache_path") + if cached_path == cache_key: + return _PHIX_REFERENCE_CACHE + + if not reference_path.exists(): + raise FileNotFoundError(f"PHIX reference file not found: {reference_path}") + + LOG.info("Loading PHIX reference from %s", reference_path) + df = pd.read_excel(reference_path, sheet_name=sheet_name) + + facilities: List[PHIXFacility] = [] + by_name: Dict[str, PHIXFacility] = {} + by_name_phu: Dict[str, Dict[str, PHIXFacility]] = {} + seen_names: Set[str] = set() + + for phu_column in df.columns: + for entry in df[phu_column].dropna(): + facility = parse_facility_entry(entry, phu_column) + if facility: + facilities.append(facility) + # Use normalized name as key for exact matching + normalized = normalize_facility_name(facility.name) + by_name_phu.setdefault(normalized, {})[phu_column] = facility + if normalized not in seen_names: + by_name[normalized] = facility + seen_names.add(normalized) + + LOG.info( + "Loaded %d facilities from %d PHUs", len(facilities), len(df.columns) + ) + + _PHIX_REFERENCE_CACHE = { + "_cache_path": cache_key, + "facilities": facilities, + "by_name": by_name, + "by_name_phu": by_name_phu, + "phus": list(df.columns), + } + return _PHIX_REFERENCE_CACHE + + +def normalize_phu_code(code: str) -> str: + """Normalize a PHU acronym/template key for comparison.""" + if not code: + return "" + normalized = re.sub(r"[^a-z0-9]+", "_", 
code.strip().lower()) + return normalized.strip("_") + + +def normalize_phu_label(label: str) -> str: + """Normalize PHU names/aliases for lookup.""" + if not label: + return "" + return re.sub(r"\s+", " ", str(label).strip().upper()) + + +def load_phu_aliases(mapping_path: Path) -> Dict[str, Any]: + """Load canonical PHU aliases from YAML mapping file.""" + global _PHU_MAPPING_CACHE + + cache_key = str(mapping_path.resolve()) + if _PHU_MAPPING_CACHE is not None: + cached_path = _PHU_MAPPING_CACHE.get("_cache_path") + if cached_path == cache_key: + return _PHU_MAPPING_CACHE + + if not mapping_path.exists(): + raise FileNotFoundError(f"PHU alias mapping file not found: {mapping_path}") + + raw = yaml.safe_load(mapping_path.read_text(encoding="utf-8")) or {} + entries = raw.get("phu_aliases") + if not isinstance(entries, dict): + raise ValueError( + f"Invalid PHU alias mapping format in {mapping_path}. " + "Expected top-level 'phu_aliases' dictionary." + ) + + alias_to_code: Dict[str, str] = {} + code_to_display: Dict[str, str] = {} + code_to_aliases: Dict[str, Set[str]] = {} + + for code, meta in entries.items(): + normalized_code = normalize_phu_code(code) + if not normalized_code: + continue + + meta = meta or {} + display_name = str(meta.get("display_name") or code).strip() + code_to_display[normalized_code] = display_name + + aliases = meta.get("aliases", []) + if isinstance(aliases, str): + aliases = [aliases] + + alias_candidates = set( + filter( + None, + [ + *[str(alias) for alias in aliases], + code, + display_name, + ], + ) + ) + + for alias in alias_candidates: + normalized_alias = normalize_phu_label(alias) + if not normalized_alias: + continue + alias_to_code[normalized_alias] = normalized_code + code_to_aliases.setdefault(normalized_code, set()).add(str(alias).strip()) + + _PHU_MAPPING_CACHE = { + "_cache_path": cache_key, + "alias_to_code": alias_to_code, + "code_to_display": code_to_display, + "code_to_aliases": code_to_aliases, + } + return 
_PHU_MAPPING_CACHE + + +def normalize_facility_name(name: str) -> str: + """Normalize a facility name for comparison. + + Converts to uppercase, removes extra whitespace, and standardizes + common abbreviations. + + Parameters + ---------- + name : str + Raw facility name + + Returns + ------- + str + Normalized name for comparison + """ + if not name: + return "" + + # Uppercase and strip + normalized = name.upper().strip() + + # Collapse multiple spaces + normalized = re.sub(r"\s+", " ", normalized) + + return normalized + + +def match_facility( + input_name: str, + reference: Dict[str, Any], + facility_phu_codes: Optional[Dict[str, Optional[str]]] = None, +) -> PHIXMatchResult: + """Match a single facility name against PHIX reference using exact match.""" + if not input_name or not input_name.strip(): + return PHIXMatchResult( + input_name=input_name or "", + matched=False, + match_type="none", + ) + + normalized_input = normalize_facility_name(input_name) + by_name = reference["by_name"] + + # Try exact match first + if normalized_input in by_name: + facility = by_name[normalized_input] + phu_code = facility_phu_codes.get(normalized_input) if facility_phu_codes else None + return PHIXMatchResult( + input_name=input_name, + matched=True, + phix_id=facility.phix_id, + phix_name=facility.name, + phu_name=facility.phu, + phu_code=phu_code, + confidence=100, + match_type="exact", + ) + + return PHIXMatchResult( + input_name=input_name, + matched=False, + match_type="none", + ) + + +def validate_facilities( + df: pd.DataFrame, + reference_path: Path, + output_dir: Path, + unmatched_behavior: str = "warn", + school_column: str = "SCHOOL_NAME", + target_phu_codes: Optional[Iterable[str]] = None, + phu_mapping_path: Optional[Path] = None, +) -> Tuple[pd.DataFrame, List[str]]: + """Validate all facilities in DataFrame against PHIX reference. 
+ + Parameters + ---------- + df : pd.DataFrame + Input DataFrame with facility names + reference_path : Path + Path to PHIX reference Excel file + output_dir : Path + Directory to write unmatched facilities CSV + unmatched_behavior : str + How to handle unmatched facilities: + - 'warn': Log warning, continue processing all records + - 'error': Raise ValueError if any unmatched + - 'skip': Filter out unmatched records + school_column : str + Name of column containing facility names + target_phu_codes : Iterable[str], optional + Canonical PHU codes (matching template folders) to restrict validation. + When provided, only facilities that belong to these PHUs are considered. + phu_mapping_path : Path, optional + Path to YAML mapping file that links PHIX column names to canonical PHU + codes. Required when target_phu_codes are provided. + + Returns + ------- + Tuple[pd.DataFrame, List[str]] + - DataFrame with PHIX validation columns added + - List of warning messages + + Raises + ------ + ValueError + If unmatched_behavior is 'error' and unmatched facilities exist + """ + warnings: List[str] = [] + + if school_column not in df.columns: + LOG.warning("Column '%s' not found, skipping PHIX validation", school_column) + return df, warnings + + # Load reference + reference = load_phix_reference(reference_path) + + phu_mapping_data: Optional[Dict[str, Any]] = None + alias_lookup: Dict[str, str] = {} + if phu_mapping_path: + phu_mapping_data = load_phu_aliases(phu_mapping_path) + alias_lookup = phu_mapping_data.get("alias_to_code", {}) + + normalized_target_codes: Set[str] = set() + if target_phu_codes: + for code in target_phu_codes: + normalized = normalize_phu_code(code) + if normalized: + normalized_target_codes.add(normalized) + if normalized_target_codes and not phu_mapping_data: + raise ValueError( + "Target PHU codes provided but phu_mapping_file is not configured. " + "Update phix_validation.phu_mapping_file to scope validation." 
+ ) + + if normalized_target_codes and phu_mapping_data: + code_to_display = phu_mapping_data.get("code_to_display", {}) + missing_codes = sorted( + code for code in normalized_target_codes if code not in code_to_display + ) + if missing_codes: + raise ValueError( + f"Template PHU codes not defined in {phu_mapping_path}: " + f"{', '.join(missing_codes)}. Update config/phu_aliases.yaml." + ) + + phu_column_codes: Dict[str, str] = {} + if alias_lookup: + for phu_column in reference["phus"]: + canonical_code = alias_lookup.get(normalize_phu_label(phu_column)) + if canonical_code: + phu_column_codes[phu_column] = canonical_code + + reference_for_matching = reference + target_display: Optional[str] = None + target_codes_str: Optional[str] = None + + if normalized_target_codes: + allowed_phu_columns = [ + column + for column, code in phu_column_codes.items() + if code in normalized_target_codes + ] + if not allowed_phu_columns: + raise ValueError( + "No PHIX columns mapped to the requested PHU codes. " + f"Template codes: {', '.join(sorted(normalized_target_codes))}. " + "Confirm config/phu_aliases.yaml contains the PHIX column names." + ) + + by_name_phu: Dict[str, Dict[str, PHIXFacility]] = reference.get( + "by_name_phu", {} + ) + filtered_by_name: Dict[str, PHIXFacility] = {} + filtered_name_list: List[str] = [] + for normalized_name, facilities_by_phu in by_name_phu.items(): + for phu_label in allowed_phu_columns: + facility = facilities_by_phu.get(phu_label) + if facility: + filtered_by_name[normalized_name] = facility + filtered_name_list.append(normalized_name) + break + + if not filtered_by_name: + raise ValueError( + "No PHIX reference entries found for the requested PHU columns. " + f"Columns: {', '.join(sorted(allowed_phu_columns))}. " + "Verify the mapping file includes every PHIX alias." 
+ ) + + reference_for_matching = dict(reference) + reference_for_matching["by_name"] = filtered_by_name + reference_for_matching["name_list"] = filtered_name_list + + code_to_display = phu_mapping_data.get("code_to_display", {}) if phu_mapping_data else {} + target_display = ", ".join( + code_to_display.get(code, code).strip() for code in sorted(normalized_target_codes) + ) + target_codes_str = ",".join(sorted(normalized_target_codes)) + LOG.info( + "Restricting PHIX validation to PHU(s): %s", + target_display or target_codes_str, + ) + + facility_phu_codes_for_matching: Dict[str, Optional[str]] = {} + if alias_lookup: + for normalized_name, facility in reference_for_matching["by_name"].items(): + canonical_code = alias_lookup.get(normalize_phu_label(facility.phu)) + if canonical_code: + facility_phu_codes_for_matching[normalized_name] = canonical_code + + # Match each unique facility + unique_facilities = df[school_column].dropna().unique() + match_results: Dict[str, PHIXMatchResult] = {} + + for facility_name in unique_facilities: + result = match_facility( + str(facility_name), + reference_for_matching, + facility_phu_codes=( + facility_phu_codes_for_matching + if facility_phu_codes_for_matching + else None + ), + ) + match_results[str(facility_name)] = result + + # Add validation columns to DataFrame + df = df.copy() + df["PHIX_ID"] = df[school_column].apply( + lambda x: match_results.get(str(x), PHIXMatchResult(str(x), False)).phix_id + if pd.notna(x) + else None + ) + df["PHIX_MATCH_CONFIDENCE"] = df[school_column].apply( + lambda x: match_results.get(str(x), PHIXMatchResult(str(x), False)).confidence + if pd.notna(x) + else 0 + ) + df["PHIX_MATCH_TYPE"] = df[school_column].apply( + lambda x: match_results.get( + str(x), PHIXMatchResult(str(x), False) + ).match_type + if pd.notna(x) + else "none" + ) + df["PHIX_MATCHED_PHU"] = df[school_column].apply( + lambda x: match_results.get( + str(x), PHIXMatchResult(str(x), False) + ).phu_name + if pd.notna(x) + else 
None + ) + df["PHIX_MATCHED_PHU_CODE"] = df[school_column].apply( + lambda x: match_results.get( + str(x), PHIXMatchResult(str(x), False) + ).phu_code + if pd.notna(x) + else None + ) + df["PHIX_TARGET_PHU_CODE"] = target_codes_str + df["PHIX_TARGET_PHU_LABEL"] = target_display + + # Identify unmatched facilities + unmatched = [r for r in match_results.values() if not r.matched] + + if unmatched: + unmatched_names = sorted(set(r.input_name for r in unmatched)) + LOG.warning( + "%d facilities could not be matched to PHIX reference: %s", + len(unmatched_names), + unmatched_names[:5], # Log first 5 + ) + + # Write unmatched to CSV + output_dir.mkdir(parents=True, exist_ok=True) + unmatched_path = output_dir / "unmatched_facilities.csv" + unmatched_df = pd.DataFrame( + [ + { + "facility_name": r.input_name, + "match_type": r.match_type, + "confidence": r.confidence, + "target_phu_code": target_codes_str or "", + "target_phu_label": target_display or "", + } + for r in unmatched + ] + ) + unmatched_df.to_csv(unmatched_path, index=False) + LOG.info("Wrote %d unmatched facilities to %s", len(unmatched), unmatched_path) + warnings.append( + f"{len(unmatched_names)} facilities not found in PHIX reference. " + f"See {unmatched_path} for details." + ) + + if unmatched_behavior == "error": + raise ValueError( + f"{len(unmatched_names)} facilities not found in PHIX reference: " + f"{', '.join(unmatched_names[:10])}" + + (f" (and {len(unmatched_names) - 10} more)" if len(unmatched_names) > 10 else "") + ) + elif unmatched_behavior == "skip": + # Filter out rows with unmatched facilities + matched_names = {r.input_name for r in match_results.values() if r.matched} + original_count = len(df) + df = df[df[school_column].isin(matched_names)] + filtered_count = original_count - len(df) + LOG.info( + "Filtered %d records with unmatched facilities, %d remaining", + filtered_count, + len(df), + ) + warnings.append( + f"Filtered {filtered_count} records with unmatched facilities." 
+ ) + + # Log summary + matched_count = sum(1 for r in match_results.values() if r.matched) + LOG.info( + "PHIX validation complete: %d matched, %d unmatched", + matched_count, + len(unmatched), + ) + + return df, warnings + + +def clear_cache() -> None: + """Clear the PHIX reference cache. Useful for testing.""" + global _PHIX_REFERENCE_CACHE, _PHU_MAPPING_CACHE + _PHIX_REFERENCE_CACHE = None + _PHU_MAPPING_CACHE = None diff --git a/plans/001-validate-phix-reference.md b/plans/001-validate-phix-reference.md new file mode 100644 index 0000000..e6bcdd1 --- /dev/null +++ b/plans/001-validate-phix-reference.md @@ -0,0 +1,186 @@ +# Plan: Validate Schools and Daycares Against PHIX Reference List + +**Status:** βœ… Implemented +**Date:** 2026-01-14 + +--- + +## Problem Statement + +Input data contains school/daycare names that may not match the official PHIX (Public Health Information Exchange) reference list. This causes: +- Data quality issues in reports +- Difficulty linking records to official facility IDs +- No validation that facilities exist in the PHU's jurisdiction + +## Solution + +Add a validation step in preprocessing that matches input school/daycare names against the PHIX reference list using strict exact comparisons with configurable behavior for unmatched facilities. 
+ +--- + +## Implementation + +### Files Created + +| File | Purpose | +|------|---------| +| `pipeline/validate_phix.py` | Validation module with loading, matching, and batch validation | +| `tests/unit/test_validate_phix.py` | 26 unit tests covering all functionality | + +### Files Modified + +| File | Changes | +|------|---------| +| `config/parameters.yaml` | Added `phix_validation` configuration section | +| `pipeline/orchestrator.py` | Integrated validation into Step 2 (Preprocessing) | + +### Configuration Options + +```yaml +phix_validation: + enabled: true + reference_file: BYO_PHIX_REFERENCE.xlsx # Placeholder; operators supply actual PHIX workbook path + phu_mapping_file: config/phu_aliases.yaml # Maps PHIX column names -> template codes + target_phu_code: null # Optional default scope when no template is provided + unmatched_behavior: warn # warn | error | skip +``` + +**Behavior modes:** +- `warn` - Log warning, continue processing all records +- `error` - Fail pipeline if any facilities don't match +- `skip` - Filter out records with unmatched facilities + +--- + +## How It Works + +### Pipeline Flow + +``` +Step 2: Preprocessing +β”œβ”€β”€ read_input() +β”œβ”€β”€ map_columns() +β”œβ”€β”€ filter_columns() +β”œβ”€β”€ ensure_required_columns() +β”œβ”€β”€ check_addresses_complete() +β”œβ”€β”€ validate_facilities() ← NEW: PHIX validation +β”‚ β”œβ”€β”€ Load PHIX Excel reference +β”‚ β”œβ”€β”€ Normalize facility names +β”‚ β”œβ”€β”€ Map PHIX PHU columns to canonical template codes +β”‚ β”œβ”€β”€ Restrict matching to template/config PHU scope (if configured) +β”‚ β”œβ”€β”€ Try exact match (100% confidence) +β”‚ β”œβ”€β”€ Write unmatched to CSV +β”‚ └── Enrich DataFrame/metadata with PHIX IDs + PHU scope +β”œβ”€β”€ build_preprocess_result() +└── write_artifact() +``` + +### PHIX Reference Format + +The Excel file has one column per PHU, with values in `"FACILITY NAME - ID"` format: + +``` +| Algoma PHU | Brant PHU | 
+|----------------------------------------|------------------------------|
+| ANNA MCCREA PUBLIC SCHOOL - 019186     | BRANTFORD ELEMENTARY - 12345 |
+| SUNSHINE DAYCARE - AL-0003561          | MAPLE CHILDCARE - 67890      |
+```
+
+### Output
+
+**Console:**
+```
+Step 2: Preprocessing
+...
+🏫 PHIX validation complete: 1,247 records validated
+⚠️ 3 facilities not found in PHIX reference. See output/unmatched_facilities.csv
+```
+
+**Enriched data (per record):**
+- `PHIX_ID` - Official facility identifier
+- `PHIX_MATCH_CONFIDENCE` - Match score (0-100)
+- `PHIX_MATCH_TYPE` - "exact" or "none"
+- `PHIX_MATCHED_PHU` / `_CODE` - PHU column + canonical code for the matched facility
+- `PHIX_TARGET_PHU_CODE` / `_LABEL` - Template/config PHU scope copied to metadata
+- Artifact metadata includes `phix_validation` payload so Step 6 can log PHIX scope per PDF
+
+**Unmatched report (`output/unmatched_facilities.csv`):**
+```csv
+facility_name,match_type,confidence,target_phu_code,target_phu_label
+Lincon Elementary School,none,0,,
+New Daycare Centre,none,0,,
+```
+
+The `target_phu_code`/`target_phu_label` columns are empty when no PHU scope is
+configured; otherwise they record the scope that was in effect for the run.
+
+---
+
+## Design Decisions
+
+### 1. Excel vs JSON for reference data
+
+**Decision:** Keep Excel as source (official PHIX format PHUs receive)
+
+**Rationale:**
+- PHU staff can update without technical knowledge
+- No manual conversion step to forget
+- Caching makes load time acceptable
+
+**Trade-off:** Slightly slower initial load (~0.5s vs ~50ms)
+
+### 2. Integration point
+
+**Decision:** After `check_addresses_complete()`, before `build_preprocess_result()`
+
+**Rationale:**
+- Validates after basic data normalization
+- Can filter records before building client objects
+- Follows existing validation pattern (addresses)
+
+### 3. 
Matching algorithm (exact-only)
+
+**Decision:** Require exact facility name matches (case-insensitive)
+
+**Rationale:**
+- Eliminates the risk of picking the wrong facility due to similar spellings
+- Aligns with Panorama usage where facility folders/templates must match canonical names
+- Encourages upstream data normalization (alias mapping now solved by PHU mapping, not name similarity)
+- No similarity threshold is involved; near-misses surface in `unmatched_facilities.csv`
+  for manual correction rather than silently matching the wrong facility
+
+### 4. PHU alias mapping and PDF auditing
+
+**Decision:** Require a YAML mapping (`config/phu_aliases.yaml`) that links PHIX column headers
+to template acronyms; propagate PHIX scope into PDF validation logs.
+
+**Rationale:**
+- Prevents cross-PHU matches when multiple units share similarly named schools.
+- Supports PHU mergers by allowing multiple PHIX aliases to map to one template code.
+- PDF validation now records `phix_target_phu_code`/`phix_matched_phu_code` per notice and
+  emits a `phix_target_phu` warning if the PHU codes diverge, giving auditors a traceable log.
+
+---
+
+## Testing
+
+```bash
+# Run PHIX validation tests only
+uv run pytest tests/unit/test_validate_phix.py -v
+
+# Run all unit tests
+uv run pytest -m unit
+```
+
+**Test coverage:**
+- Parsing PHIX entries ("NAME - ID" format)
+- Exact match behavior (case-insensitive, typo rejection)
+- All three `unmatched_behavior` modes
+- Edge cases (empty data, missing columns)
+- Caching behavior
+
+---
+
+## Future Enhancements
+
+1. **JSON cache layer** - Auto-generate JSON cache for faster loads
+2. **CLI command** - `viper convert-phix` to pre-generate JSON
+3. **PHU filtering at load time** - Only *load* facilities for the configured PHU
+   (matching is already scoped to the target PHU; loading still reads the full workbook)
+4. **Facility alias support** - Allow manual alias mappings for known facility-name
+   variations (PHU-level aliases are already handled via `config/phu_aliases.yaml`)
+
+**Note:** BYO PHIX file — keep the licensed workbook outside git (it is ignored by
+pattern) and update `reference_file` locally before running.
diff --git a/plans/README.md b/plans/README.md new file mode 100644 index 0000000..0ccdec0 --- /dev/null +++ b/plans/README.md @@ -0,0 +1,35 @@ +# Plans Directory + +This directory contains implementation plans for features developed with AI coding assistance. + +## Purpose + +- Document the reasoning behind implementations +- Provide context for code reviewers +- Help future contributors understand design decisions +- Track what has been implemented and what is planned + +## Naming Convention + +``` +NNN-feature-name.md +``` + +Where `NNN` is a zero-padded sequence number. + +## Plan Structure + +Each plan should include: + +1. **Problem Statement** - What problem does this solve? +2. **Solution** - High-level approach +3. **Implementation** - Files created/modified +4. **Design Decisions** - Key choices and rationale +5. **Testing** - How to verify it works +6. **Future Enhancements** - What could be added later + +## Current Plans + +| # | Feature | Status | Date | +|---|---------|--------|------| +| 001 | [Validate PHIX Reference](001-validate-phix-reference.md) | βœ… Implemented | 2026-01-14 | diff --git a/tests/unit/test_run_pipeline.py b/tests/unit/test_run_pipeline.py index dff22a8..f3cdd99 100644 --- a/tests/unit/test_run_pipeline.py +++ b/tests/unit/test_run_pipeline.py @@ -220,7 +220,8 @@ def test_run_step_2_preprocess(self, tmp_test_dir: Path, tmp_output_structure: d mock_result.clients = [client1, client2] mock_result.warnings = [] - with patch("pipeline.orchestrator.preprocess.read_input", return_value=mock_df), \ + with patch("pipeline.orchestrator.load_config", return_value={"phix_validation": {"enabled": False}}), \ + patch("pipeline.orchestrator.preprocess.read_input", return_value=mock_df), \ patch("pipeline.orchestrator.preprocess.map_columns", return_value=(mock_mapped_df, {})), \ patch("pipeline.orchestrator.preprocess.filter_columns", return_value=mock_filtered_df), \ patch("pipeline.orchestrator.preprocess.ensure_required_columns", 
return_value=mock_final_df), \ @@ -236,6 +237,8 @@ def test_run_step_2_preprocess(self, tmp_test_dir: Path, tmp_output_structure: d output_dir=tmp_output_structure["root"], language="en", run_id="test_20250101_120000", + config_dir=tmp_test_dir, + template_code=None, ) assert total == 2 diff --git a/tests/unit/test_validate_pdfs.py b/tests/unit/test_validate_pdfs.py index 3eabb86..91538df 100644 --- a/tests/unit/test_validate_pdfs.py +++ b/tests/unit/test_validate_pdfs.py @@ -769,3 +769,69 @@ def test_client_id_presence_disabled(self, tmp_path: Path) -> None: # Should have no warnings because all rules are disabled assert len(result.warnings) == 0 + + +@pytest.mark.unit +class TestPhixMetadataLogging: + """Tests for PHIX metadata measurements in PDF validation.""" + + def test_measurements_include_phix_scope(self, tmp_path: Path) -> None: + """PHIX metadata is propagated into validation measurements.""" + pdf_path = tmp_path / "en_notice_00001_1009876543.pdf" + writer = PdfWriter() + writer.add_blank_page(width=612, height=792) + with open(pdf_path, "wb") as f: + writer.write(f) + + metadata_map = { + pdf_path.name: { + "phix_validation": { + "phu_code": "test_phu_1", + "phu_name": "Test PHU 1", + "match_type": "exact", + "confidence": 100, + }, + "phix_target_phu_code": "test_phu_1", + "phix_target_phu_label": "Test PHU 1", + "school_name": "Lincoln Elementary School", + } + } + + result = validate_pdfs.validate_pdf_structure( + pdf_path, + enabled_rules={"exactly_two_pages": "disabled"}, + client_metadata_map=metadata_map, + ) + + assert result.measurements["phix_target_phu_code"] == "test_phu_1" + assert result.measurements["phix_matched_phu_code"] == "test_phu_1" + assert "phix_target_phu" not in "".join(result.warnings) + + def test_mismatch_emits_warning(self, tmp_path: Path) -> None: + """PHU mismatch triggers per-PDF warning.""" + pdf_path = tmp_path / "en_notice_00001_1009876543.pdf" + writer = PdfWriter() + writer.add_blank_page(width=612, height=792) + 
with open(pdf_path, "wb") as f: + writer.write(f) + + metadata_map = { + pdf_path.name: { + "phix_validation": { + "phu_code": "test_phu_2", + "phu_name": "Test PHU 2", + "match_type": "exact", + "confidence": 100, + }, + "phix_target_phu_code": "test_phu_1", + "phix_target_phu_label": "Test PHU 1", + } + } + + result = validate_pdfs.validate_pdf_structure( + pdf_path, + enabled_rules={"exactly_two_pages": "disabled"}, + client_metadata_map=metadata_map, + ) + + assert any("phix_target_phu" in warning for warning in result.warnings) diff --git a/tests/unit/test_validate_phix.py b/tests/unit/test_validate_phix.py new file mode 100644 index 0000000..4083b45 --- /dev/null +++ b/tests/unit/test_validate_phix.py @@ -0,0 +1,441 @@ +"""Unit tests for PHIX validation module. + +Tests cover: +- Loading and parsing PHIX reference Excel files +- Exact matching against reference list (no fuzzy fallback) +- Different unmatched_behavior modes (warn, error, skip) +- Edge cases (empty data, missing columns) +""" + +from __future__ import annotations + +import tempfile +from pathlib import Path +from unittest.mock import patch + +import pandas as pd +import pytest + +from pipeline.validate_phix import ( + PHIXFacility, + PHIXMatchResult, + clear_cache, + load_phix_reference, + match_facility, + normalize_facility_name, + parse_facility_entry, + validate_facilities, +) + + +@pytest.fixture(autouse=True) +def reset_cache(): + """Clear PHIX reference cache before and after each test.""" + clear_cache() + yield + clear_cache() + + +@pytest.fixture +def sample_phix_excel(tmp_path: Path) -> Path: + """Create a sample PHIX reference Excel file for testing.""" + data = { + "Test PHU 1": [ + "Lincoln Elementary School - SCH001", + "Maple High School - SCH002", + "Sunshine Childcare Centre - DAY001", + None, # Empty row + ], + "Test PHU 2": [ + "Oak Valley Public School - SCH003", + "Bright Futures Daycare - DAY002", + None, + None, + ], + } + df = pd.DataFrame(data) + excel_path = tmp_path 
/ "test_phix_reference.xlsx" + df.to_excel(excel_path, sheet_name="Schools & Day Cares", index=False) + return excel_path + + +@pytest.fixture +def sample_phu_mapping(tmp_path: Path) -> Path: + """Create a sample PHU alias mapping file.""" + mapping_path = tmp_path / "phu_aliases.yaml" + mapping_path.write_text( + """ +phu_aliases: + test_phu_1: + display_name: Test PHU 1 + aliases: + - Test PHU 1 + test_phu_2: + display_name: Test PHU 2 + aliases: + - Test PHU 2 + """.strip(), + encoding="utf-8", + ) + return mapping_path + + +class TestParseFacilityEntry: + """Tests for parse_facility_entry function.""" + + def test_parse_standard_entry(self): + """Parse standard 'NAME - ID' format.""" + facility = parse_facility_entry( + "ANNA MCCREA PUBLIC SCHOOL - 019186", "Test PHU" + ) + assert facility is not None + assert facility.name == "ANNA MCCREA PUBLIC SCHOOL" + assert facility.phix_id == "019186" + assert facility.phu == "Test PHU" + + def test_parse_entry_with_multiple_dashes(self): + """Parse entry where name contains dashes.""" + facility = parse_facility_entry( + "ST. MARY'S CO-OP - PRE-SCHOOL - DAY123", "PHU" + ) + assert facility is not None + # Should split on last " - " + assert facility.name == "ST. 
MARY'S CO-OP - PRE-SCHOOL" + assert facility.phix_id == "DAY123" + + def test_parse_entry_no_id(self): + """Parse entry without ID separator.""" + facility = parse_facility_entry("Some School Name", "PHU") + assert facility is not None + assert facility.name == "Some School Name" + assert facility.phix_id == "" + + def test_parse_empty_entry(self): + """Empty entries return None.""" + assert parse_facility_entry("", "PHU") is None + assert parse_facility_entry(None, "PHU") is None # type: ignore[arg-type] + assert parse_facility_entry(" ", "PHU") is None + + def test_parse_nan_entry(self): + """NaN values return None.""" + assert parse_facility_entry(float("nan"), "PHU") is None # type: ignore[arg-type] + + +class TestNormalizeFacilityName: + """Tests for normalize_facility_name function.""" + + def test_uppercase_conversion(self): + """Names are converted to uppercase.""" + assert normalize_facility_name("Lincoln School") == "LINCOLN SCHOOL" + + def test_whitespace_normalization(self): + """Extra whitespace is collapsed.""" + assert normalize_facility_name(" Lincoln School ") == "LINCOLN SCHOOL" + + def test_empty_string(self): + """Empty string returns empty.""" + assert normalize_facility_name("") == "" + assert normalize_facility_name(" ") == "" + + +class TestLoadPhixReference: + """Tests for load_phix_reference function.""" + + def test_load_valid_file(self, sample_phix_excel: Path): + """Load and parse valid PHIX reference file.""" + ref = load_phix_reference(sample_phix_excel) + + assert "facilities" in ref + assert "by_name" in ref + assert "phus" in ref + assert "by_name_phu" in ref + + # Should have 5 facilities (excluding None rows) + assert len(ref["facilities"]) == 5 + assert len(ref["phus"]) == 2 + + def test_caching(self, sample_phix_excel: Path): + """Second load returns cached data.""" + ref1 = load_phix_reference(sample_phix_excel) + ref2 = load_phix_reference(sample_phix_excel) + + # Should be same object (cached) + assert ref1 is ref2 + + def 
test_file_not_found(self, tmp_path: Path): + """Missing file raises FileNotFoundError.""" + with pytest.raises(FileNotFoundError, match="PHIX reference file not found"): + load_phix_reference(tmp_path / "nonexistent.xlsx") + + +class TestMatchFacility: + """Tests for match_facility function.""" + + def test_exact_match(self, sample_phix_excel: Path): + """Exact match returns 100% confidence.""" + ref = load_phix_reference(sample_phix_excel) + + result = match_facility("Lincoln Elementary School", ref) + assert result.matched is True + assert result.phix_id == "SCH001" + assert result.confidence == 100 + assert result.match_type == "exact" + assert result.phu_name == "Test PHU 1" + + def test_exact_match_case_insensitive(self, sample_phix_excel: Path): + """Exact match is case-insensitive.""" + ref = load_phix_reference(sample_phix_excel) + + result = match_facility("LINCOLN ELEMENTARY SCHOOL", ref) + assert result.matched is True + assert result.confidence == 100 + + def test_similar_name_not_matched(self, sample_phix_excel: Path): + """Slight typos do not match (exact-only policy).""" + ref = load_phix_reference(sample_phix_excel) + + # Slight typo that would match fuzzy + result = match_facility("Lincoln Elementry School", ref) + assert result.matched is False + + def test_empty_input(self, sample_phix_excel: Path): + """Empty input returns no match.""" + ref = load_phix_reference(sample_phix_excel) + + result = match_facility("", ref) + assert result.matched is False + + result = match_facility(" ", ref) + assert result.matched is False + + +class TestValidateFacilities: + """Tests for validate_facilities function.""" + + def test_validate_all_matched(self, sample_phix_excel: Path, tmp_path: Path): + """All facilities matched returns no warnings.""" + df = pd.DataFrame({ + "SCHOOL_NAME": ["Lincoln Elementary School", "Maple High School"], + "OTHER_COL": ["A", "B"], + }) + + result_df, warnings = validate_facilities(df, sample_phix_excel, tmp_path) + + assert 
len(result_df) == 2 + assert "PHIX_ID" in result_df.columns + assert "PHIX_MATCH_CONFIDENCE" in result_df.columns + assert result_df.iloc[0]["PHIX_ID"] == "SCH001" + assert "PHIX_MATCHED_PHU" in result_df.columns + assert result_df.iloc[0]["PHIX_MATCHED_PHU"] == "Test PHU 1" + assert "PHIX_TARGET_PHU_CODE" in result_df.columns + assert pd.isna(result_df["PHIX_TARGET_PHU_CODE"]).all() + assert len(warnings) == 0 + + def test_validate_scoped_to_target_phu( + self, + sample_phix_excel: Path, + sample_phu_mapping: Path, + tmp_path: Path, + ): + """Facilities outside target PHU remain unmatched.""" + df = pd.DataFrame( + { + "SCHOOL_NAME": [ + "Lincoln Elementary School", # PHU 1 + "Oak Valley Public School", # PHU 2 + ], + } + ) + + result_df, warnings = validate_facilities( + df, + sample_phix_excel, + tmp_path, + target_phu_codes=["test_phu_2"], + phu_mapping_path=sample_phu_mapping, + ) + + assert result_df.iloc[0]["PHIX_MATCH_TYPE"] == "none" + assert result_df.iloc[0]["PHIX_ID"] is None + assert result_df.iloc[1]["PHIX_ID"] == "SCH003" + assert result_df.iloc[1]["PHIX_MATCHED_PHU_CODE"] == "test_phu_2" + assert result_df.iloc[1]["PHIX_TARGET_PHU_CODE"] == "test_phu_2" + assert len(warnings) == 1 # unmatched facility warning + unmatched_csv = tmp_path / "unmatched_facilities.csv" + assert unmatched_csv.exists() + + def test_validate_missing_mapping_for_target( + self, sample_phix_excel: Path, tmp_path: Path + ): + """Target PHU requires mapping file.""" + df = pd.DataFrame({"SCHOOL_NAME": ["Lincoln Elementary School"]}) + + with pytest.raises(ValueError, match="phu_mapping_file"): + validate_facilities( + df, + sample_phix_excel, + tmp_path, + target_phu_codes=["test_phu_1"], + ) + + def test_validate_unknown_template_code( + self, sample_phix_excel: Path, sample_phu_mapping: Path, tmp_path: Path + ): + """Unknown template code raises descriptive error.""" + df = pd.DataFrame({"SCHOOL_NAME": ["Lincoln Elementary School"]}) + + with pytest.raises(ValueError, 
match="not defined"): + validate_facilities( + df, + sample_phix_excel, + tmp_path, + target_phu_codes=["unknown_code"], + phu_mapping_path=sample_phu_mapping, + ) + + def test_validate_missing_phu_alias( + self, sample_phix_excel: Path, tmp_path: Path + ): + """Missing PHU alias in mapping is surfaced.""" + df = pd.DataFrame({"SCHOOL_NAME": ["Lincoln Elementary School"]}) + mapping_path = tmp_path / "phu_aliases_partial.yaml" + mapping_path.write_text( + """ +phu_aliases: + test_phu_1: + display_name: Example Display + aliases: + - Unknown Alias + """.strip(), + encoding="utf-8", + ) + + with pytest.raises(ValueError, match="No PHIX columns mapped"): + validate_facilities( + df, + sample_phix_excel, + tmp_path, + target_phu_codes=["test_phu_1"], + phu_mapping_path=mapping_path, + ) + + def test_validate_with_unmatched_warn( + self, sample_phix_excel: Path, tmp_path: Path + ): + """Unmatched facilities with warn behavior logs warning.""" + df = pd.DataFrame({ + "SCHOOL_NAME": ["Lincoln Elementary School", "Unknown School XYZ"], + }) + + result_df, warnings = validate_facilities( + df, + sample_phix_excel, + tmp_path, + unmatched_behavior="warn", + ) + + assert len(result_df) == 2 # All records kept + assert len(warnings) == 1 + assert "not found in PHIX reference" in warnings[0] + + # Check unmatched CSV was written + unmatched_csv = tmp_path / "unmatched_facilities.csv" + assert unmatched_csv.exists() + + def test_validate_with_unmatched_error( + self, sample_phix_excel: Path, tmp_path: Path + ): + """Unmatched facilities with error behavior raises.""" + df = pd.DataFrame({ + "SCHOOL_NAME": ["Lincoln Elementary School", "Unknown School XYZ"], + }) + + with pytest.raises(ValueError, match="not found in PHIX reference"): + validate_facilities( + df, + sample_phix_excel, + tmp_path, + unmatched_behavior="error", + ) + + def test_validate_with_unmatched_skip( + self, sample_phix_excel: Path, tmp_path: Path + ): + """Unmatched facilities with skip behavior filters them 
out.""" + df = pd.DataFrame({ + "SCHOOL_NAME": ["Lincoln Elementary School", "Unknown School XYZ"], + }) + + result_df, warnings = validate_facilities( + df, + sample_phix_excel, + tmp_path, + unmatched_behavior="skip", + ) + + assert len(result_df) == 1 # Unknown filtered out + assert result_df.iloc[0]["SCHOOL_NAME"] == "Lincoln Elementary School" + assert len(warnings) == 2 # unmatched warning + filtered warning + + def test_validate_missing_column( + self, sample_phix_excel: Path, tmp_path: Path + ): + """Missing school column skips validation gracefully.""" + df = pd.DataFrame({ + "OTHER_COL": ["A", "B"], + }) + + result_df, warnings = validate_facilities( + df, sample_phix_excel, tmp_path, + school_column="SCHOOL_NAME", + ) + + assert len(result_df) == 2 + assert "PHIX_ID" not in result_df.columns + assert len(warnings) == 0 + + def test_validate_empty_dataframe( + self, sample_phix_excel: Path, tmp_path: Path + ): + """Empty DataFrame returns empty with no errors.""" + df = pd.DataFrame({"SCHOOL_NAME": []}) + + result_df, warnings = validate_facilities( + df, sample_phix_excel, tmp_path + ) + + assert len(result_df) == 0 + assert len(warnings) == 0 + + +class TestPHIXFacilityDataclass: + """Tests for PHIXFacility dataclass.""" + + def test_hash_equality(self): + """Two facilities with same data have same hash.""" + f1 = PHIXFacility(phix_id="123", name="Test", phu="PHU1") + f2 = PHIXFacility(phix_id="123", name="Test", phu="PHU1") + + assert hash(f1) == hash(f2) + + def test_hash_difference(self): + """Different facilities have different hashes.""" + f1 = PHIXFacility(phix_id="123", name="Test", phu="PHU1") + f2 = PHIXFacility(phix_id="456", name="Test", phu="PHU1") + + assert hash(f1) != hash(f2) + + +class TestPHIXMatchResultDataclass: + """Tests for PHIXMatchResult dataclass.""" + + def test_default_values(self): + """Verify default values.""" + result = PHIXMatchResult(input_name="Test", matched=False) + + assert result.phix_id is None + assert 
result.phix_name is None + assert result.phu_name is None + assert result.phu_code is None + assert result.confidence == 0 + assert result.match_type == "none"