|
| 1 | +"""Qid-level acceptance harness for the narrow P3.F JOIN-path work. |
| 2 | +
|
| 3 | +This script checks a finished eval/voting report. It does not call providers, |
| 4 | +does not run a broad residue sweep, and does not implement the JOIN linker. |
| 5 | +
|
| 6 | +Usage: |
| 7 | + uv run python scripts/p3f_acceptance.py \ |
| 8 | + --report eval/reports/2026-05-22/v20-kimi-k2-thinking-merged.json |
| 9 | + uv run python scripts/p3f_acceptance.py --report <candidate>.json --require-pass |
| 10 | +""" |
| 11 | + |
| 12 | +from __future__ import annotations |
| 13 | + |
| 14 | +import argparse |
| 15 | +import json |
| 16 | +import sys |
| 17 | +from collections.abc import Mapping |
| 18 | +from dataclasses import dataclass |
| 19 | +from pathlib import Path |
| 20 | +from typing import Any |
| 21 | + |
| 22 | +from sqlglot import exp, parse_one |
| 23 | +from sqlglot.errors import ParseError |
| 24 | + |
# A (table, column) reference; this module stores both names in lower case.
ColumnRef = tuple[str, str]
| 26 | + |
| 27 | + |
@dataclass(frozen=True)
class AcceptanceTarget:
    """Column-usage expectations for one question id in the report."""

    # Question id; matched against each record's "question_id".
    qid: int
    # Human-readable description printed next to the PASS/FAIL flag.
    label: str
    # (table, column) pairs that must all appear in the predicted SQL.
    required_columns: tuple[ColumnRef, ...]
    # (table, column) pairs that must not appear in the predicted SQL.
    forbidden_columns: tuple[ColumnRef, ...] = ()
| 34 | + |
| 35 | + |
@dataclass(frozen=True)
class AcceptanceResult:
    """Outcome of checking one report record against its acceptance target."""

    # Question id, copied from the target.
    qid: int
    # Description, copied from the target.
    label: str
    # True only when no rejection reason was recorded.
    accepted: bool
    # Truthiness of the record's "match" field.
    match: bool
    # Every reason the record was rejected; empty when accepted.
    reasons: tuple[str, ...]
    # The record's "pred_sql" as a string ("" when absent or falsy).
    pred_sql: str
| 44 | + |
| 45 | + |
# Question ids this harness gates on, each with its column expectations.
# Column pairs are written in lower case because the references extracted
# from the predicted SQL are lower-cased before they are compared.
TARGETS: tuple[AcceptanceTarget, ...] = (
    AcceptanceTarget(
        qid=1404,
        label="student_club expense type must come from event.type",
        required_columns=(("event", "type"),),
        forbidden_columns=(("expense", "expense_description"), ("expense", "type")),
    ),
    AcceptanceTarget(
        qid=207,
        label="toxicology double bond path must not shortcut through connected.bond_id",
        required_columns=(("connected", "atom_id"),),
        forbidden_columns=(("connected", "bond_id"),),
    ),
)
| 60 | + |
| 61 | + |
def evaluate_report(report: Mapping[str, Any]) -> list[AcceptanceResult]:
    """Check every entry of ``TARGETS`` against a loaded report.

    Args:
        report: Parsed report; its records are indexed by question id.

    Returns:
        One ``AcceptanceResult`` per target, in ``TARGETS`` order.

    Raises:
        ValueError: If the report has no record for one or more target qids.
            No target is evaluated in that case.
    """
    by_qid = _records_by_qid(report)

    # Collect every absent qid first so the error names all of them at once.
    absent = []
    for target in TARGETS:
        if target.qid not in by_qid:
            absent.append(target.qid)
    if absent:
        raise ValueError(f"missing target qids: {absent}")

    outcomes = []
    for target in TARGETS:
        outcomes.append(_evaluate_record(by_qid[target.qid], target))
    return outcomes
| 68 | + |
| 69 | + |
def main(argv: list[str] | None = None) -> int:
    """Run the acceptance check from the command line.

    Args:
        argv: Arguments to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        ``0`` when the report was evaluated and, if ``--require-pass`` was
        given, every target was accepted; ``1`` when ``--require-pass`` was
        given and at least one target was rejected; ``3`` when the report
        cannot be read, is not valid JSON, is not a JSON object, or lacks a
        target qid.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--report", type=Path, required=True)
    parser.add_argument(
        "--require-pass",
        action="store_true",
        help="return exit code 1 unless every P3.F target is accepted",
    )
    args = parser.parse_args(argv)

    try:
        report = json.loads(args.report.read_text(encoding="utf-8"))
    except (OSError, ValueError) as exc:
        # OSError covers a missing/unreadable file; json.JSONDecodeError and
        # UnicodeDecodeError are both ValueError subclasses. Report them the
        # same way as other unusable-report errors instead of a traceback.
        print(f"[error] cannot load report {args.report}: {exc}", file=sys.stderr)
        return 3
    if not isinstance(report, Mapping):
        # A JSON list/number/string at top level has no "records" to look up.
        print(
            f"[error] report must be a JSON object, got {type(report).__name__}",
            file=sys.stderr,
        )
        return 3

    try:
        results = evaluate_report(report)
    except ValueError as exc:
        print(f"[error] {exc}", file=sys.stderr)
        return 3

    print(f"Report: {args.report}")
    for result in results:
        flag = "PASS" if result.accepted else "FAIL"
        print(f"{flag} qid={result.qid} match={result.match} - {result.label}")
        for reason in result.reasons:
            print(f" - {reason}")

    if args.require_pass and any(not result.accepted for result in results):
        return 1
    return 0
| 97 | + |
| 98 | + |
def _evaluate_record(
    record: Mapping[str, Any],
    target: AcceptanceTarget,
) -> AcceptanceResult:
    """Judge a single report record against its acceptance target.

    A record is accepted only when its ``match`` field is truthy, its
    ``pred_sql`` parses, every required column is referenced and no forbidden
    column is referenced. Each violated condition adds one reason.

    Args:
        record: Report record; ``pred_sql`` and ``match`` are read from it.
        target: Expectations for this record's question id.

    Returns:
        The result carrying the verdict, the reasons and the SQL that was
        inspected.
    """
    sql_text = str(record.get("pred_sql") or "")
    is_match = bool(record.get("match"))
    seen, parse_problem = _qualified_columns(sql_text)

    problems: list[str] = []
    if not is_match:
        problems.append("EA match is false")
    if parse_problem:
        problems.append(parse_problem)
    problems.extend(
        f"missing required column {table}.{column}"
        for table, column in target.required_columns
        if (table, column) not in seen
    )
    problems.extend(
        f"forbidden column {table}.{column} is present"
        for table, column in target.forbidden_columns
        if (table, column) in seen
    )

    return AcceptanceResult(
        qid=target.qid,
        label=target.label,
        accepted=len(problems) == 0,
        match=is_match,
        reasons=tuple(problems),
        pred_sql=sql_text,
    )
| 125 | + |
| 126 | + |
| 127 | +def _records_by_qid(report: Mapping[str, Any]) -> dict[int, Mapping[str, Any]]: |
| 128 | + raw_records = report.get("records") or [] |
| 129 | + records: dict[int, Mapping[str, Any]] = {} |
| 130 | + for raw_record in raw_records: |
| 131 | + if not isinstance(raw_record, Mapping): |
| 132 | + continue |
| 133 | + qid = raw_record.get("question_id") |
| 134 | + if isinstance(qid, int): |
| 135 | + records[qid] = raw_record |
| 136 | + return records |
| 137 | + |
| 138 | + |
| 139 | +def _qualified_columns(sql: str) -> tuple[set[ColumnRef], str | None]: |
| 140 | + if not sql.strip(): |
| 141 | + return set(), None |
| 142 | + try: |
| 143 | + tree = parse_one(sql, read="sqlite") |
| 144 | + except ParseError as exc: |
| 145 | + return set(), f"SQL parse failed: {exc}" |
| 146 | + |
| 147 | + alias_to_table: dict[str, str] = {} |
| 148 | + for table in tree.find_all(exp.Table): |
| 149 | + table_name = _lower(table.name) |
| 150 | + if not table_name: |
| 151 | + continue |
| 152 | + alias_to_table[table_name] = table_name |
| 153 | + alias_to_table[_lower(table.alias_or_name)] = table_name |
| 154 | + |
| 155 | + columns: set[ColumnRef] = set() |
| 156 | + for column in tree.find_all(exp.Column): |
| 157 | + column_name = _lower(column.name) |
| 158 | + table_name = _lower(column.table) |
| 159 | + if not column_name: |
| 160 | + continue |
| 161 | + resolved_table = alias_to_table.get(table_name, table_name) |
| 162 | + columns.add((resolved_table, column_name)) |
| 163 | + return columns, None |
| 164 | + |
| 165 | + |
| 166 | +def _lower(value: str) -> str: |
| 167 | + return value.lower() |
| 168 | + |
| 169 | + |
# Script entry point: exit the process with the code returned by main().
if __name__ == "__main__":
    raise SystemExit(main())
0 commit comments