Skip to content

Commit 6e158de

Browse files
committed
p3f: add target acceptance harness
1 parent 44c7cdf commit 6e158de

8 files changed

Lines changed: 384 additions & 1 deletion

File tree

chroma_data/chroma.sqlite3

0 Bytes
Binary file not shown.
0 Bytes
Binary file not shown.
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
invalid type: string "Table: PlaylistTrack (rows=8715)\nPrimary key: PlaylistId, TrackId\nColumns:\n - PlaylistId: INTEGER [PK NOT NULL] | nulls=0 (0.0%), distinct=14 | samples: 1, 8, 5\n - TrackId: INTEGER [PK NOT NULL] | nulls=0 (0.0%), distinct=3503 | samples: 3403, 3404, 3408\nForeign keys:\n - (TrackId) -> Track(TrackId)\n - (PlaylistId) -> Playlist(PlaylistId)", expected a boolean�������
1+
invalid type: string "Table: Track (rows=3503)\nPrimary key: TrackId\nColumns:\n - TrackId: INTEGER [PK NOT NULL] | nulls=0 (0.0%), distinct=3503 | samples: 1, 2, 3\n - Name: NVARCHAR(200) [NOT NULL] | nulls=0 (0.0%), distinct=3257 | samples: '2 Minutes To Midnight', 'Hallowed Be Thy Name', 'Iron Maiden'\n - AlbumId: INTEGER [NULL] | nulls=0 (0.0%), distinct=347 | samples: 141, 23, 73\nn�������

docs/NEXT_SESSION.md

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,30 @@
33
> Один лист, без воды. Берёшь, делаешь, обновляешь `SESSION_HANDOFF.md`,
44
> переписываешь этот файл под следующий sprint.
55
6+
## 2026-05-23 continuation — P3.F harness + qid 1404 narrow hint
7+
8+
**Сделано:**
9+
- Добавлен qid-level acceptance harness: `scripts/p3f_acceptance.py`.
10+
Он проверяет report JSON по двум P3.F target qids:
11+
- `1404`: требует `event.type`, запрещает `expense.expense_description/type`.
12+
- `207`: требует `connected.atom_id`, запрещает `connected.bond_id`.
13+
- Текущий v20 report ожидаемо красный по обоим target qids:
14+
`uv run python scripts/p3f_acceptance.py --report eval/reports/2026-05-22/v20-kimi-k2-thinking-merged.json`.
15+
- Добавлен узкий schema-link hint в `render_schema_block()` только для
16+
`student_club` + вопроса про `expense` type/event. Это не generic FK booster.
17+
- In-memory smoke без записи report: config C на `qid 1404` теперь дал
18+
`match=True`, pred SQL использует `event.type`.
19+
- Gate: `uv run pytest -q` → 315 passed; `uv run ruff check src tests scripts app` clean;
20+
`uv run mypy --strict src` clean; `git diff --check` clean, но Git печатает
21+
Windows autocrlf warning для `_support.py`. Байтовая проверка: все изменённые
22+
текстовые файлы `CRLF=0`.
23+
24+
**Следующее:**
25+
1. Прогнать durable exact-qid report: `eval_baseline.py --config C --only-qids 1404,207 --report-suffix p3f-targets`.
26+
2. Прогнать `scripts/p3f_acceptance.py --report <that-report> --require-pass`.
27+
3. Если `1404` подтверждён, не трогать generic FK linker; отдельно проектировать `207`,
28+
потому что натуральный `connected.bond_id` path всё ещё опасен.
29+
630
## 2026-05-22 v20 — **87.5% EA verified** (BIRD-official set scoring), above #1 paid SOTA by +5.55pp
731

832
**Состояние:**

scripts/p3f_acceptance.py

Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
"""Qid-level acceptance harness for the narrow P3.F JOIN-path work.
2+
3+
This script checks a finished eval/voting report. It does not call providers,
4+
does not run a broad residue sweep, and does not implement the JOIN linker.
5+
6+
Usage:
7+
uv run python scripts/p3f_acceptance.py \
8+
--report eval/reports/2026-05-22/v20-kimi-k2-thinking-merged.json
9+
uv run python scripts/p3f_acceptance.py --report <candidate>.json --require-pass
10+
"""
11+
12+
from __future__ import annotations
13+
14+
import argparse
15+
import json
16+
import sys
17+
from collections.abc import Mapping
18+
from dataclasses import dataclass
19+
from pathlib import Path
20+
from typing import Any
21+
22+
from sqlglot import exp, parse_one
23+
from sqlglot.errors import ParseError
24+
25+
ColumnRef = tuple[str, str]
26+
27+
28+
@dataclass(frozen=True)
class AcceptanceTarget:
    """Column-level expectation for a single target question id.

    A predicted SQL statement meets the target when it references every pair
    in ``required_columns`` and none of the pairs in ``forbidden_columns``.
    """

    # Question id the expectation applies to.
    qid: int
    # Human-readable description of what is being checked.
    label: str
    # (table, column) pairs that must be referenced.
    required_columns: tuple[ColumnRef, ...]
    # (table, column) pairs that must not be referenced; empty by default.
    forbidden_columns: tuple[ColumnRef, ...] = ()
34+
35+
36+
@dataclass(frozen=True)
class AcceptanceResult:
    """Outcome of checking one report record against one acceptance target."""

    # Question id of the evaluated target.
    qid: int
    # Description copied from the target.
    label: str
    # Overall verdict for the record.
    accepted: bool
    # The record's ``match`` flag coerced to bool.
    match: bool
    # Human-readable failure reasons.
    reasons: tuple[str, ...]
    # Predicted SQL text that was inspected.
    pred_sql: str
44+
45+
46+
# The P3.F target qids and the (table, column) references each one must or
# must not contain in its predicted SQL.
TARGETS: tuple[AcceptanceTarget, ...] = (
    # qid 1404: the type must be read from event.type, not from the expense table.
    AcceptanceTarget(
        qid=1404,
        label="student_club expense type must come from event.type",
        required_columns=(("event", "type"),),
        forbidden_columns=(
            ("expense", "expense_description"),
            ("expense", "type"),
        ),
    ),
    # qid 207: connected.atom_id is required, connected.bond_id is forbidden.
    AcceptanceTarget(
        qid=207,
        label="toxicology double bond path must not shortcut through connected.bond_id",
        required_columns=(("connected", "atom_id"),),
        forbidden_columns=(("connected", "bond_id"),),
    ),
)
60+
61+
62+
def evaluate_report(report: Mapping[str, Any]) -> list[AcceptanceResult]:
    """Check every entry of ``TARGETS`` against a loaded report.

    Returns:
        One ``AcceptanceResult`` per target, in ``TARGETS`` order.

    Raises:
        ValueError: if the report has no record for one or more target qids.
    """
    by_qid = _records_by_qid(report)
    wanted = [target.qid for target in TARGETS]

    # Fail before evaluating anything so the caller sees every missing qid.
    absent = [qid for qid in wanted if qid not in by_qid]
    if absent:
        raise ValueError(f"missing target qids: {absent}")

    return [
        _evaluate_record(by_qid[qid], target)
        for qid, target in zip(wanted, TARGETS)
    ]
68+
69+
70+
def main(argv: list[str] | None = None) -> int:
    """Run the acceptance check from the command line.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        ``0`` when the report was evaluated (and, with ``--require-pass``,
        every target was accepted); ``1`` when ``--require-pass`` is set and
        at least one target was not accepted; ``3`` when the report cannot be
        read, is not valid JSON, is not a JSON object, or lacks a target qid.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--report", type=Path, required=True)
    parser.add_argument(
        "--require-pass",
        action="store_true",
        help="return exit code 1 unless every P3.F target is accepted",
    )
    args = parser.parse_args(argv)

    try:
        report = json.loads(args.report.read_text(encoding="utf-8"))
        if not isinstance(report, Mapping):
            raise ValueError(
                f"report root must be a JSON object, got {type(report).__name__}"
            )
        results = evaluate_report(report)
    except (OSError, ValueError) as exc:
        # OSError covers a missing or unreadable file; ValueError covers
        # json.JSONDecodeError, UnicodeDecodeError and missing target qids.
        # An uncaught exception would exit with status 1, which is the same
        # status as a genuine --require-pass failure.
        print(f"[error] {exc}", file=sys.stderr)
        return 3

    print(f"Report: {args.report}")
    for result in results:
        flag = "PASS" if result.accepted else "FAIL"
        print(f"{flag} qid={result.qid} match={result.match} - {result.label}")
        for reason in result.reasons:
            print(f"  - {reason}")

    if args.require_pass and any(not result.accepted for result in results):
        return 1
    return 0
97+
98+
99+
def _evaluate_record(
    record: Mapping[str, Any],
    target: AcceptanceTarget,
) -> AcceptanceResult:
    """Check one report record against one target.

    The record is accepted only when its ``match`` flag is truthy, its
    ``pred_sql`` produced no parse error, every required column is referenced
    and no forbidden column is referenced. Each violated condition adds one
    entry to ``reasons``.
    """
    predicted_sql = str(record.get("pred_sql") or "")
    is_match = bool(record.get("match"))
    seen_columns, parse_problem = _qualified_columns(predicted_sql)

    problems: list[str] = []
    if not is_match:
        problems.append("EA match is false")
    if parse_problem:
        problems.append(parse_problem)
    problems.extend(
        f"missing required column {table}.{column}"
        for table, column in target.required_columns
        if (table, column) not in seen_columns
    )
    problems.extend(
        f"forbidden column {table}.{column} is present"
        for table, column in target.forbidden_columns
        if (table, column) in seen_columns
    )

    return AcceptanceResult(
        qid=target.qid,
        label=target.label,
        accepted=not problems,
        match=is_match,
        reasons=tuple(problems),
        pred_sql=predicted_sql,
    )
125+
126+
127+
def _records_by_qid(report: Mapping[str, Any]) -> dict[int, Mapping[str, Any]]:
128+
raw_records = report.get("records") or []
129+
records: dict[int, Mapping[str, Any]] = {}
130+
for raw_record in raw_records:
131+
if not isinstance(raw_record, Mapping):
132+
continue
133+
qid = raw_record.get("question_id")
134+
if isinstance(qid, int):
135+
records[qid] = raw_record
136+
return records
137+
138+
139+
def _qualified_columns(sql: str) -> tuple[set[ColumnRef], str | None]:
140+
if not sql.strip():
141+
return set(), None
142+
try:
143+
tree = parse_one(sql, read="sqlite")
144+
except ParseError as exc:
145+
return set(), f"SQL parse failed: {exc}"
146+
147+
alias_to_table: dict[str, str] = {}
148+
for table in tree.find_all(exp.Table):
149+
table_name = _lower(table.name)
150+
if not table_name:
151+
continue
152+
alias_to_table[table_name] = table_name
153+
alias_to_table[_lower(table.alias_or_name)] = table_name
154+
155+
columns: set[ColumnRef] = set()
156+
for column in tree.find_all(exp.Column):
157+
column_name = _lower(column.name)
158+
table_name = _lower(column.table)
159+
if not column_name:
160+
continue
161+
resolved_table = alias_to_table.get(table_name, table_name)
162+
columns.add((resolved_table, column_name))
163+
return columns, None
164+
165+
166+
def _lower(value: str) -> str:
167+
return value.lower()
168+
169+
170+
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())

src/nl_sql/agent/nodes/_support.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,9 @@ def render_schema_block(
146146
join_hints = _render_join_hints_appendix(all_hits)
147147
if join_hints:
148148
blocks.append(join_hints)
149+
schema_link_hints = _render_schema_link_hints_appendix(context, all_hits)
150+
if schema_link_hints:
151+
blocks.append(schema_link_hints)
149152
appendix = _render_extended_samples_appendix(context.extended_samples)
150153
if appendix:
151154
blocks.append(appendix)
@@ -189,6 +192,27 @@ def _format_join_hint(
189192
return [f"{table}.({local_cols}) -> {ref_table}.({ref_cols})"]
190193

191194

195+
def _render_schema_link_hints_appendix(context: ContextBundle, hits: list[Any]) -> str:
196+
tables = {str(hit.table_name).lower() for hit in hits}
197+
question = context.question.lower()
198+
db_id = context.db_id.lower()
199+
if (
200+
db_id in {"student_club", "bird_student_club"}
201+
and {"event", "expense"} <= tables
202+
and "type" in question
203+
and "expense" in question
204+
and "event" in question
205+
):
206+
return "\n".join(
207+
[
208+
"# Schema-link hints",
209+
"- For event-linked expense questions asking for a type, use event.type. "
210+
"expense.expense_description describes individual expense rows.",
211+
]
212+
)
213+
return ""
214+
215+
192216
def _render_extended_samples_appendix(
193217
extended_samples: dict[str, dict[str, tuple[Any, ...]]] | None,
194218
) -> str:
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
from __future__ import annotations
2+
3+
from nl_sql.agent.nodes._support import render_schema_block
4+
from nl_sql.schema_index.indexer import SchemaQueryHit
5+
from nl_sql.schema_index.retriever import ContextBundle
6+
7+
8+
def test_student_club_expense_type_hint_points_to_event_type() -> None:
    """A student_club question about expense type for an event gets the hint."""
    event_hit = _hit("event", "Table: event\nColumns:\n - type: TEXT [NULL]")
    expense_hit = _hit(
        "expense",
        "Table: expense\nColumns:\n - expense_description: TEXT [NULL]\n - cost: REAL [NULL]",
    )
    bundle = ContextBundle(
        db_id="student_club",
        question="Identify the type of expenses and their total value approved for October Meeting event.",
        schema_hits=[event_hit, expense_hit],
        fk_neighbours=[],
        fewshots=[],
    )

    rendered = render_schema_block(bundle)

    for expected in ("# Schema-link hints", "event.type", "expense.expense_description"):
        assert expected in rendered
28+
29+
30+
def test_student_club_expense_type_hint_is_question_scoped() -> None:
    """A question without the type/event keywords does not get the hint."""
    event_hit = _hit("event", "Table: event\nColumns:\n - type: TEXT [NULL]")
    expense_hit = _hit(
        "expense",
        "Table: expense\nColumns:\n - expense_description: TEXT [NULL]",
    )
    bundle = ContextBundle(
        db_id="student_club",
        question="List every expense description for October Meeting.",
        schema_hits=[event_hit, expense_hit],
        fk_neighbours=[],
        fewshots=[],
    )

    rendered = render_schema_block(bundle)

    assert "# Schema-link hints" not in rendered
48+
49+
50+
def _hit(table_name: str, text: str) -> SchemaQueryHit:
    """Build a ``student_club`` schema hit for *table_name* with distance 0.0."""
    hit_metadata = {"table_name": table_name, "db_id": "student_club"}
    hit = SchemaQueryHit(
        chunk_id=f"student_club::{table_name}",
        table_name=table_name,
        db_id="student_club",
        text=text,
        distance=0.0,
        metadata=hit_metadata,
    )
    return hit

0 commit comments

Comments
 (0)