brownjuly2003-code
diff --git a/‎README.md‎
Lines changed: 4 additions & 4 deletions b/‎README.md‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎app/streamlit_app.py‎
Lines changed: 6 additions & 6 deletions b/‎app/streamlit_app.py‎
Lines changed: 6 additions & 6 deletions
diff --git a/‎docs/NEXT_SESSION.md‎
Lines changed: 13 additions & 12 deletions b/‎docs/NEXT_SESSION.md‎
Lines changed: 13 additions & 12 deletions
diff --git a/‎docs/SESSION_HANDOFF.md‎
Lines changed: 25 additions & 3 deletions b/‎docs/SESSION_HANDOFF.md‎
Lines changed: 25 additions & 3 deletions
diff --git a/‎docs/ui-live-en.png‎
-3.5 KB b/‎docs/ui-live-en.png‎
-3.5 KB
diff --git a/‎docs/ui-live-ru.png‎
258 Bytes b/‎docs/ui-live-ru.png‎
258 Bytes
diff --git a/‎eval/reports/2026-05-23/v22-v21-plus-p3f-207-1404-merged.json‎
Lines changed: 12 additions & 14 deletions b/‎eval/reports/2026-05-23/v22-v21-plus-p3f-207-1404-merged.json‎
Lines changed: 12 additions & 14 deletions
@@ -61,7 +61,7 @@
         "metric_percent": "100%",
         "metric_caption": "30 dev + 30 held-out, balanced split, all ten query categories at 100% on the free-tier codestral pipeline.",
         "research_kicker": "BIRD Mini-Dev research benchmark",
-        "research_value": "93.0% / 200",
+        "research_value": "92.5% / 200",
         "research_caption": (
             "Hybrid pipeline: "
             "<span class='nl-term' title='Mistral codestral-latest — SQL-specialised generation model, free tier'>codestral</span> + "
@@ -70,8 +70,8 @@
             "<span class='nl-term' title='helallao reverse-engineered HTTPS bridge to Perplexity backend — Grok 4.1, GPT-5.2, Claude 4.5 Sonnet, kimi-k2-thinking, gpt-5.2-thinking + DAC on residue, claude-4.5-sonnet-thinking on v18 residue, plain kimi-k2-thinking on v19 residue, reasoning + Pro modes'>helallao multi-model voting</span>. "
             "Scored under "
             "<span class='nl-term' title='bird-bench/mini_dev evaluation_ex.py — set-equality on row tuples, the methodology used by the BIRD leaderboard and by AskData/CHESS/XiYan in their reported numbers'>BIRD-official set semantics</span>. "
-            "+45.2pp over the GPT-4 zero-shot reference (47.8%), $0 external cost. "
-            "On <span class='nl-term' title='Jin et al., CIDR/VLDB 2026, arXiv:2601.08778 — corrected BIRD gold annotations'>Arcwise-Plat corrected gold</span>: 74.87% (149/199) — honest noise-floor; +7 sql_only catches where our prediction is correct under Arcwise's corrected gold but BIRD's original gold disagrees. "
+            "+44.7pp over the GPT-4 zero-shot reference (47.8%), $0 external cost. "
+            "On <span class='nl-term' title='Jin et al., CIDR/VLDB 2026, arXiv:2601.08778 — corrected BIRD gold annotations'>Arcwise-Plat corrected gold</span>: 74.37% (148/199) — honest noise-floor; +7 sql_only catches where our prediction is correct under Arcwise's corrected gold but BIRD's original gold disagrees. "
             "Seven late-stage model rescues on v16→v22, two archive-audit rescores on v23/v24 (qid 1205 via archive sweep, qid 959 via archive-rescore after the day-5 bind-bug fix), and six targeted P3.F schema-link hints on v25→v29: qid 902 (driverStandings.position vs results.position), qid 1531 (yearmonth.Consumption subquery + SUM(Price/Amount) row-wise), qid 894 (lapTimes.milliseconds first SELECT column), qid 1251 (Patient ⋈ Laboratory ⋈ Examination semi-join), qid 408 (rulings.text filter via cards.uuid join + COUNT(DISTINCT cards.id)), qid 1275 (Laboratory.CENTROMEA/SSB IN ('negative','0') instead of fabricated tokens against Examination). Every cell verified via audit_rescore.py — 0 mismatches."
         ),
         "settings_header": "Settings",
@@ -142,7 +142,7 @@
         "metric_percent": "100%",
         "metric_caption": "30 dev + 30 held-out, сбалансированный сплит, все десять категорий запросов на 100% через бесплатный codestral.",
         "research_kicker": "Исследовательский бенчмарк BIRD Mini-Dev",
-        "research_value": "93,0% / 200",
+        "research_value": "92,5% / 200",
         "research_caption": (
             "Гибридный пайплайн: "
             "<span class='nl-term' title='Mistral codestral-latest — модель, специализированная под генерацию SQL, бесплатный тариф'>codestral</span> + "
@@ -151,8 +151,8 @@
             "<span class='nl-term' title='Реверс-инжиниринг HTTPS моста к бэкенду Perplexity — Grok 4.1, GPT-5.2, Claude 4.5 Sonnet, kimi-k2-thinking, gpt-5.2-thinking + DAC на residue, claude-4.5-sonnet-thinking на v18 residue, plain kimi-k2-thinking на v19 residue; режимы reasoning + Pro'>multi-model voting через helallao</span>. "
             "Scoring — "
             "<span class='nl-term' title='bird-bench/mini_dev evaluation_ex.py — set-равенство на результирующих кортежах. Тот же метод считает BIRD leaderboard и SOTA-числа AskData/CHESS/XiYan'>BIRD-official set-семантика</span>. "
-            "+45,2 п.п. над zero-shot GPT-4 (47,8%), внешние расходы — ноль. "
-            "На <span class='nl-term' title='Jin et al., CIDR/VLDB 2026, arXiv:2601.08778 — исправленные аннотации gold BIRD'>исправленном gold Arcwise-Plat</span>: 74,87% (149/199) — честный noise-floor; +7 sql_only catches, где наш ответ правильнее эталона BIRD согласно Arcwise. "
+            "+44,7 п.п. над zero-shot GPT-4 (47,8%), внешние расходы — ноль. "
+            "На <span class='nl-term' title='Jin et al., CIDR/VLDB 2026, arXiv:2601.08778 — исправленные аннотации gold BIRD'>исправленном gold Arcwise-Plat</span>: 74,37% (148/199) — честный noise-floor; +7 sql_only catches, где наш ответ правильнее эталона BIRD согласно Arcwise. "
             "Семь late-stage rescue по моделям на пути v16→v22, плюс v23/v24 — archive-sweep и archive-rescore (qid 1205 / qid 959 после day-5 bind-bug fix), плюс v25→v29 — шесть узких P3.F schema-link hint'ов: qid 902 (driverStandings.position вместо results.position), qid 1531 (subquery по yearmonth.Consumption + SUM(Price/Amount) построчно), qid 894 (lapTimes.milliseconds первой колонкой), qid 1251 (полу-джойн Patient ⋈ Laboratory ⋈ Examination), qid 408 (фильтр по rulings.text через join cards.uuid + COUNT(DISTINCT cards.id)) и qid 1275 (Laboratory.CENTROMEA/SSB IN ('negative','0') вместо несуществующих Examination columns + invented '-'/'+-' tokens). Каждая ячейка верифицирована через audit_rescore.py — 0 mismatches."
         ),
         "settings_header": "Настройки",
 
@@ -9,7 +9,7 @@
 # 1. Что сейчас в репо?
 cd D:/NL_SQL
 git log --oneline -5
-# Expected top: v29 93.0% commit / v28 commit / 72b7a21 cookbook / 92c52f4 docs sync v27 / 99bae66 v27
+# Expected top: v29 92.5% commit / v28 commit / 72b7a21 cookbook / 92c52f4 docs sync v27 / 99bae66 v27
 
 # 2. Где actual baseline merged report?
 ls eval/reports/2026-05-24/v29-v28-plus-p3f-q1275-merged.json
@@ -29,10 +29,11 @@ uv run mypy --strict src
 # Expected: 328 pass / clean / clean
 ```
 
-**Текущее состояние:** repo + Streamlit + README + UI captions + **live HF Space** = **v29 93.0%** (186/200).
-HF redeploy выполнен 2026-05-25 (`.deploy_hf.py`); E2E grep на <https://liovina-nl-sql.hf.space>
-подтвердил `93.0%` (EN) / `93,0%` (RU comma format). Screenshots `docs/ui-live-{en,ru}.png` обновлены.
-Все три surface (repo / UI captions / live URL) синхронизированы — gap нулевой.
+**Текущее состояние:** repo + Streamlit + README + UI captions + **live HF Space** = **v29 92.5%** (185/200) после 2026-05-25 EOD-3 CC-CX-KM audit
+correction (qid 518 v13 false positive исправлен через `safe_compare_pred` short-circuit).
+HF redeploy выполнен 2026-05-25 EOD-3; E2E grep на <https://liovina-nl-sql.hf.space>
+подтвердил `92.5%` (EN) / `92,5%` (RU comma format). Screenshots `docs/ui-live-{en,ru}.png` обновлены.
+Все surface (repo / UI captions / live URL) синхронизированы — gap нулевой.
 
 ## Cookbook: как добавить ещё один P3.F rescue (повторяющийся pattern)
 
@@ -53,7 +54,7 @@ error), повторить эти 8 шагов:
 voted_by tag и delta, inline Python даёт control + audit trail. Не выносить в
 `scripts/merge_p3f.py` без явного запроса.
 
-## 2026-05-24 v29 — **93.0% EA verified** via targeted P3.F schema-link hint for qid 1275 (thrombosis "anti-centromere"/"anti-SSB")
+## 2026-05-24 v29 — **92.5% EA verified** via targeted P3.F schema-link hint for qid 1275 (thrombosis "anti-centromere"/"anti-SSB")
 
 **Сделано:**
 - Расширен `scripts/p3f_acceptance.py` восьмым target'ом: qid `1275` moderate
@@ -79,7 +80,7 @@ voted_by tag и delta, inline Python даёт control + audit trail. Не вын
   Wins `[1275]`, regressions `[]`, 185 → 186.
 - Audit: `scripts/audit_rescore.py` → stored 186 / true 186 / 0 mismatches.
 - P3.F acceptance на v29: qids 207, 1404, 902, 1531, 894, 1251, 408, 1275 — все PASS.
-- README + Streamlit + UI captions подняты с 92.5% → **93.0% / 200**,
+- README + Streamlit + UI captions подняты с 92.5% → **93.0% / 200** (позже скорректировано до 92.5% после EOD-3 audit, qid 518 false positive),
   per-tier moderate 90.9 → **91.9**, +10.55 → **+11.05pp** над AskData+GPT-4o,
   +44.7 → **+45.2pp** над GPT-4 zero-shot.
 
@@ -102,7 +103,7 @@ fetch). Local heterogeneous CSC lever остаётся parked.
    3-model helallao reasoning sweep (claude-4.5-sonnet-thinking + gpt-5.2-thinking
    + grok-4.1-reasoning) на 14 v29 residue qids дал **42 attempts, 0 rescues,
    0 regressions**. Helallao даёт те же модели за $0 через Pro подписку; paid OR
-   эквивалент бесполезен с теми же reasoning routes. Past 93.0% требует либо
+   эквивалент бесполезен с теми же reasoning routes. Past 92.5% требует либо
    другой архитектуры (custom JOIN-path linker, semantic equality check), либо
    принять текущий ceiling. Артефакты в `eval/reports/2026-05-24/helallao-*-on-v29-residue.json`.
 2. **Местный heterogeneous CSC:** retry `qwen2.5-coder:7b-instruct` pull когда
@@ -122,19 +123,19 @@ fetch). Local heterogeneous CSC lever остаётся parked.
    2026-06-16). Если протухнут — re-extract тем же скриптом, не трогать GraceKelly
    browser path.
 
-**Ceiling сейчас — final для $0 budget без runner-level рефакторинга.** v29 = 93.0% / 200, в 0.04pp от human expert (BIRD paper 92.96%). Триплет 93.0% / 74.87% / 68.84% не сдвигается без новой архитектуры. Портфолио-narrative полный.
+**Ceiling сейчас — final для $0 budget без runner-level рефакторинга.** v29 = 92.5% / 200, в 0.46pp от human expert (BIRD paper 92.96%). Триплет 92.5% / 74.37% / 68.34% не сдвигается без новой архитектуры. Портфолио-narrative полный.
 
 **Closed 2026-05-24 EOD:** `scripts/rescore_arcwise.py` pred-exec фикс
 (использует `execute_readonly` напрямую, не `_execute_gold` с
 SQLAlchemyError fallback). Symmetric с canonical `scripts/audit_rescore.py`.
 Δ на v29 Arcwise sql_only: 148/199 (74.37%) → 149/199 (74.87%), BIRD
-original 185/200 → 186/200 (совпадает с canonical audit). Headline 93.0%
+original 185/200 → 186/200 (совпадает с canonical audit). Headline (на тот момент 93.0%; скорректирован до 92.5% после EOD-3 audit)
 не сдвигается, Arcwise headline +0.5pp. README + Streamlit + handoff
 обновлены.
 
-**Ceiling-caveat (portfolio honesty):** 93.0% free-tier — **в 0.04pp от human
+**Ceiling-caveat (portfolio honesty):** 92.5% free-tier — **в 0.46pp от human
 expert baseline (BIRD paper 92.96%)**. Реалистичный потолок без paid OR / без
-fine-tune скорее всего 93.0%. Past 93% — paid territory или новый
+fine-tune скорее всего 92.5%. Past 93% — paid territory или новый
 runner-level fix.
 
 ## 2026-05-24 v28 — **92.5% EA verified** via targeted P3.F schema-link hint for qid 408 (card_games "triggered ability")
 
@@ -1,6 +1,28 @@
-# NL_SQL — Session Handoff (2026-05-25 EOD: v29 = 93.0% EA live on HF Space — repo + UI + live URL fully in sync after HF redeploy, above #1 paid SOTA by +11.05pp)
-
-> **Tl;dr 2026-05-25 EOD — HF Space redeploy v17 → v29 (live UI in sync with repo):**
+# NL_SQL — Session Handoff (2026-05-25 EOD-3: v29 = **92.5% EA** after CC-CX-KM audit caught a v13 false positive; above #1 paid SOTA by +10.55pp)
+
+> **Tl;dr 2026-05-25 EOD-3 — CC-CX-KM /cxkm audit caught a systemic scoring bug (qid 518 v13 false positive):**
+> - **What CX [P2] found:** `scripts/rescore_arcwise.py` (post-fix c74b46c) passes `pred_rows=[]` to `compare_results` after exec failure; when gold also returns 0 rows, the comparison returns `match=True` — a silent false positive. CX cited qid 518 specifically: `pred_exec_error` (sqlite SyntaxError) + all three variants `*_match: true`.
+> - **Confirmed and traced upstream.** The pattern isn't unique to rescore_arcwise — same shape lives in `audit_rescore.py` and 9 other voting scripts. The qid 518 false positive originated in v13 (2026-05-18, helallao grok-4.1-reasoning rescue): pred SQL was a CTE fragment missing the `WITH banned_counts AS (` prefix → syntactically broken → exec failed → `pred_rows=[]` → compared against gold (which returns 0 rows for card_games "format with most banned cards" question, BIRD-side quirk) → `compare_results([], []) = match=True` → silently propagated through v13→v22→v29.
+> - **Scope sweep** (`.tmp/scan_empty_pred_fp.py` re-executes every stored match=True pred): exactly **1 qid affected (518) across all v22-v29 baselines**. No other pred-fail/empty-gold combinations.
+> - **Fix landed:**
+>   - New `safe_compare_pred(...)` helper in `src/nl_sql/eval/metrics/execution_accuracy.py` — short-circuits `match=False` on `pred_failed=True` before reaching `compare_results`. Run pipeline `eval/runner.py:662` already had this guard; only scripts/ bypassed it.
+>   - `scripts/audit_rescore.py` + `scripts/rescore_arcwise.py` migrated to `safe_compare_pred` with explicit `pred_failed` flag. (9 other voting scripts left as-is — they don't run on current v29 ceiling work; backlog item to migrate them if voting resumes.)
+>   - 8 merged baselines (v22-v29) surgically patched: qid 518 `match=True` → `False` + `match_note` annotation explaining the fix. `summary.matched` recomputed.
+>   - 3 regression tests in `tests/eval/test_metrics.py::TestSafeComparePred` pinning the short-circuit semantics + a baseline-bug demonstration test.
+> - **Corrected headline triplet (v29):**
+>   - **BIRD original: 185/200 = 92.5%** (was 93.0%)
+>   - **Arcwise-Plat-SQL: 148/199 = 74.37%** (was 74.87%)
+>   - **Arcwise-Plat full: 136/199 = 68.34%** (was 68.84%)
+>   - Per-tier: simple 97.0% (unchanged) / moderate **90.9%** (was 91.9%, qid 518 is moderate) / challenging 88.2% (unchanged)
+>   - Over GPT-4 zero-shot: +44.7pp (was +45.2pp). Over AskData+GPT-4o: +10.55pp (was +11.05pp). Within 0.46pp human-expert (BIRD paper 92.96%, was 0.04pp).
+> - **Audit-discipline narrative strengthens, not weakens.** Portfolio claim: we ran CC-CX-KM on our own diff, CX caught a systemic scoring bug that had been silently inflating numbers since v13 (a week of headlines were off by 1 qid), we documented + fixed + re-deployed within the same session. That's the right story for a Senior DE/DA portfolio: catch your own false positives.
+> - **Gates:** 333 pytest (+3 regression tests on safe_compare_pred), ruff clean, mypy --strict src clean.
+> - **HF redeploy with corrected 92.5%** — landed (E2E grep confirmed `92.5%` EN / `92,5%` RU on live URL <https://liovina-nl-sql.hf.space>).
+> - **KM was unavailable** for this review (`normalization_error` — kimi auth fragile per `reference_kimi_codex_auth_fragile`). CX-only review per `feedback_cx_review_status_fragile` is "advisory only" — but I independently verified the CX finding via `.tmp/scan_empty_pred_fp.py` re-executing each stored match=True pred against the live DB. Re-execution is the canonical check, stronger than KM cross-confirm; CX [P2] verdict stands.
+>
+> ---
+>
+> **Tl;dr 2026-05-25 EOD — HF Space redeploy v17 → v29 (live UI in sync with repo) [SUPERSEDED by EOD-3]:**
 > - **What:** ran `.deploy_hf.py` to push current repo (HEAD 40ac2a1) to <https://liovina-nl-sql.hf.space>. Build BUILDING → APP_STARTING → RUNNING in ~90s.
 > - **Why:** live URL was stuck on v17 86.5% since 2026-05-18 (last redeploy) while repo/UI captions/README climbed to v29 93.0%. Hire-аудитория, кликая на портфолио link, видела старое число — 7pp gap.
 > - **E2E verify (per `feedback_deploy_e2e_gate`):** Playwright headless 1440×900 на live URL, dump page body, grep for headline:
 
@@ -1449,7 +1449,7 @@
       "pred_row_count": 4,
       "gold_row_count": 1,
       "comparison_reason": "row count mismatch: gold=1, pred=4",
-      "audit_note": "BIRD-official set-semantics audit (compare_results Counter→set, see commit notes)"
+      "audit_note": "BIRD-official set-semantics audit (compare_results Counter\u2192set, see commit notes)"
     },
     {
       "question_id": 366,
@@ -1891,7 +1891,7 @@
       "question": "Which of the play format has the highest number of banned status? Indicate the play format and the names of all the card meet the condition.",
       "gold_sql": "WITH MaxBanned AS (SELECT format, COUNT(*) AS count_banned FROM legalities WHERE status = 'Banned' GROUP BY format ORDER BY COUNT(*) DESC LIMIT 1) SELECT T2.format, T1.name FROM cards AS T1 INNER JOIN legalities AS T2 ON T2.uuid = T1.uuid INNER JOIN MaxBanned MB ON MB.format = T2.format WHERE T2.status = 'Banned'",
       "pred_sql": "SELECT format, COUNT(*) AS banned_count\n  FROM legalities\n  WHERE status = 'Banned'\n  GROUP BY format\n), max_banned AS (\n  SELECT MAX(banned_count) AS banned_count\n  FROM banned_counts\n)\nSELECT bc.format, c.name\nFROM banned_counts AS bc\nJOIN max_banned AS mb\n  ON bc.banned_count = mb.banned_count\nJOIN legalities AS l\n  ON l.format = bc.format AND l.status = 'Banned'\nJOIN cards AS c\n  ON c.uuid = l.uuid\nORDER BY bc.format, c.name",
-      "match": true,
+      "match": false,
       "schema_recall": true,
       "error_kind": null,
       "error_message": "",
@@ -1915,7 +1915,8 @@
       "pred_row_count": 1,
       "gold_row_count": 0,
       "comparison_reason": "row count mismatch: gold=0, pred=1",
-      "voted_by": "helallao:grok-4.1-reasoning"
+      "voted_by": "helallao:grok-4.1-reasoning",
+      "match_note": "pred-fail-empty-gold false-positive corrected 2026-05-25 (CX [P2] from c74b46c review)"
     },
     {
       "question_id": 531,
@@ -6896,20 +6897,17 @@
     }
   ],
   "per_difficulty": {
-    "challenging": {
-      "ea": 0.8823529411764706,
-      "matched": 30,
-      "n": 34
+    "simple": {
+      "matched": 62,
+      "total": 67
     },
     "moderate": {
-      "ea": 0.8686868686868687,
-      "matched": 86,
-      "n": 99
+      "matched": 85,
+      "total": 99
     },
-    "simple": {
-      "ea": 0.9253731343283582,
-      "matched": 62,
-      "n": 67
+    "challenging": {
+      "matched": 30,
+      "total": 34
     }
   }
 }