diff --git a/.github/workflows/code_checks.yml b/.github/workflows/code_checks.yml index 4759817..5652446 100644 --- a/.github/workflows/code_checks.yml +++ b/.github/workflows/code_checks.yml @@ -35,7 +35,7 @@ jobs: uses: astral-sh/setup-uv@v7.6.0 with: # Install a specific version of uv. - version: "0.9.11" + version: "0.11.15" enable-cache: true - name: "Set up Python" @@ -58,3 +58,11 @@ jobs: ignore-vulns: | GHSA-4xh5-x5gv-qwph CVE-2026-4539 + CVE-2026-45829 + CVE-2025-69872 + PYSEC-2026-139 + CVE-2025-3000 + CVE-2025-3001 + PYSEC-2025-217 + CVE-2026-1839 + PYSEC-2026-161 diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index a005697..9a91759 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -45,7 +45,7 @@ jobs: uses: astral-sh/setup-uv@v7.6.0 with: # Install a specific version of uv. - version: "0.9.11" + version: "0.11.15" enable-cache: true - name: "Set up Python" diff --git a/.gitignore b/.gitignore index f5e1e38..1c86e1a 100644 --- a/.gitignore +++ b/.gitignore @@ -24,3 +24,32 @@ wheels/ # macos *.DS_Store site/ + +# Media files +*.jpg +*.jpeg +*.png +*.gif + +# CSV and JSON files +*.csv +*.json +*.jsonl + +# Log files +*.log +logs/ +**/logs/ + +# Bash and SLURM scripts +*.slrm +*.backup + +# Run artifacts +output/ +meps/ +logs/ + +# Editor copies +*copy*.ipynb +vllm_endpoint.txt diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index fdb843a..04004dd 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,23 +1,23 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v6.0.0 # Use the ref you want to point at + rev: v6.0.0 hooks: - - id: trailing-whitespace - - id: check-ast - - id: check-builtin-literals - - id: check-docstring-first - - id: check-executables-have-shebangs - - id: debug-statements - - id: end-of-file-fixer - - id: mixed-line-ending - args: [--fix=lf] - - id: fix-byte-order-marker - - id: check-merge-conflict - - id: 
check-symlinks - - id: detect-private-key - - id: check-yaml - args: [--unsafe] - - id: check-toml + - id: trailing-whitespace + - id: check-ast + - id: check-builtin-literals + - id: check-docstring-first + - id: check-executables-have-shebangs + - id: debug-statements + - id: end-of-file-fixer + - id: mixed-line-ending + args: [--fix=lf] + - id: fix-byte-order-marker + - id: check-merge-conflict + - id: check-symlinks + - id: detect-private-key + - id: check-yaml + args: [--unsafe] + - id: check-toml - repo: https://github.com/astral-sh/uv-pre-commit rev: 0.10.12 @@ -25,52 +25,61 @@ repos: - id: uv-lock - repo: https://github.com/astral-sh/ruff-pre-commit - rev: 'v0.15.7' + rev: v0.15.7 hooks: - - id: ruff - args: [--fix, --exit-non-zero-on-fix] - types_or: [python, jupyter] - - id: ruff-format - types_or: [python, jupyter] + - id: ruff + args: [--fix, --exit-non-zero-on-fix] + types_or: [python, jupyter] - - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.19.1 + - repo: local hooks: - - id: mypy - entry: python3 -m mypy --config-file pyproject.toml - language: system - types: [python] - exclude: "tests" + - id: mypy + name: mypy + entry: .venv/bin/mypy + language: system + types: [python] + exclude: "tests" + args: [--config-file, pyproject.toml] + pass_filenames: false + always_run: true - repo: https://github.com/crate-ci/typos rev: v1.44.0 hooks: - id: typos - args: [] - repo: https://github.com/nbQA-dev/nbQA rev: 1.9.1 hooks: - - id: nbqa-ruff - args: [--fix, --exit-non-zero-on-fix] + - id: nbqa-ruff + args: [--fix, --exit-non-zero-on-fix] - repo: local hooks: - - id: pytest - name: pytest - entry: python3 -m pytest -m "not integration_test" - language: system - pass_filenames: false - always_run: true + - id: pytest + name: pytest + # Bare `python` often hits conda/site-packages without project deps (e.g. langfuse, httpx). + # `uv run` uses the repo's Python and lockfile; agentic-xai-eval pulls crewai for vision tests. 
+ entry: uv run --group agentic-xai-eval python -m pytest -m "not integration_test" + language: system + pass_filenames: false + always_run: true + + # Run after pytest so formatter edits do not confuse later hooks in the same run. + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.15.7 + hooks: + - id: ruff-format + types_or: [python, jupyter] ci: - autofix_commit_msg: | - [pre-commit.ci] Add auto fixes from pre-commit.com hooks + autofix_commit_msg: | + [pre-commit.ci] Add auto fixes from pre-commit.com hooks - for more information, see https://pre-commit.ci - autofix_prs: true - autoupdate_branch: '' - autoupdate_commit_msg: '[pre-commit.ci] pre-commit autoupdate' - autoupdate_schedule: weekly - skip: [pytest,doctest,mypy] - submodules: false + for more information, see https://pre-commit.ci + autofix_prs: true + autoupdate_branch: '' + autoupdate_commit_msg: '[pre-commit.ci] pre-commit autoupdate' + autoupdate_schedule: weekly + skip: [pytest, doctest, mypy] + submodules: false diff --git a/README.md b/README.md index 84adf9b..a689635 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,16 @@ [![codecov](https://codecov.io/github/VectorInstitute/AgentFinVQA/graph/badge.svg?token=83MYFZ3UPA)](https://codecov.io/github/VectorInstitute/AgentFinVQA) ![GitHub License](https://img.shields.io/github/license/VectorInstitute/AgentFinVQA) -A multi-agent evaluation framework for Visual Question Answering on financial charts, built on the [ChartQAPro](https://huggingface.co/datasets/ahmed-masry/ChartQAPro) dataset. The framework decomposes chart QA into an explicit **Plan → Inspect → Explain** loop, producing fully traceable evaluation artifacts for each sample. +A multi-agent evaluation framework for Visual Question Answering on financial charts, supporting both [ChartQAPro](https://huggingface.co/datasets/ahmed-masry/ChartQAPro) and [FinMME](https://huggingface.co/datasets/luojunyu/FinMME). 
The framework decomposes chart QA into an explicit **Plan → Inspect → Explain** loop, producing fully traceable evaluation artifacts for each sample. + +## Supported datasets + +| Dataset | HF handle | Notes | +| --- | --- | --- | +| ChartQAPro | `ahmed-masry/ChartQAPro` | Multi-turn chart QA with factoid/mcq/unanswerable tasks. Images cached under `data/chartqapro_images/` by default. | +| FinMME | `luojunyu/FinMME` | Financial multi-modal evaluation benchmark (only a `train` split on HF; use slicing like `train[:1000]`). Images cached under `data/finmme_images/` by default. | + +Select the dataset at runtime with `--dataset {chartqapro|finmme}`; all downstream tooling (Langfuse registration, output directories) key off the same slug. ## Overview @@ -30,22 +39,37 @@ Input Sample (question, chart image, expected answer) ▼ PlannerAgent (text-only LLM) • Produces a structured JSON inspection plan + • MCQ-aware: checks/eliminates each choice; multi-select guidance • Does NOT see the image │ plan.steps ▼ - OcrReaderTool (optional) + OcrReaderTool • Single VLM call focused on text transcription - • Produces structured JSON of all visible text - │ ocr_text + • Produces structured chart metadata (axes, legend, data labels) + │ ocr_text + chart_type + ▼ + LegendGrounderTool (conditional) + • Triggered for line/bar/scatter/area/pie/donut charts + • Maps legend labels → color descriptions + RGB + line style + • Compliance check: re-runs if legend entries are missing + │ legend_map ▼ - VisionAgent (CrewAI + tools) - • Executes the plan step-by-step - • Produces answer + explanation + VisionAgent (CrewAI + VisionQATool) + • Executes the plan using OCR text and legend map as ground truth + • Single-select MCQ / multi-select MCQ / open-ended answer paths + • Produces answer + explanation + per-choice confidence analysis │ draft_answer ▼ - VerifierAgent (single VLM call) + Forced-Choice Retry (conditional) + • If vision returns UNANSWERABLE and MCQ choices exist: + re-runs vision 
with explicit "FORCED CHOICE" instruction + │ draft_answer (revised if retry triggered) + ▼ + VerifierAgent (CrewAI + VerifierTool) • Reviews draft answer against chart image - • Verdict: CONFIRM or REVISE + • Adds reluctance hint when vision confidence is high (≥ 0.85) + • Verdict: CONFIRM or REVISE + self-reported confidence + • Confidence gate: downgrades low-confidence revisions (< 0.75) │ ▼ MEP (Model Evaluation Packet) @@ -53,6 +77,37 @@ Input Sample (question, chart image, expected answer) • Optionally traced in Langfuse ``` +## Results + +### FinMME (250-sample train slice; v8 / v9 / v10 scale-up to 1,250) + +| Run | Accuracy | Δ vs baseline | Key change | +|-----|----------|---------------|------------| +| `no_legend_grounding` | 48.0% | — | Baseline | +| `fixes_v1` | 50.4% | +2.4 pp | Legend grounding, caption injection, token limits | +| `fixes_v2` | 51.6% | +3.6 pp | Disable thinking tokens, MCQ choices to verifier | +| `fixes_v3` | 51.6% | +3.6 pp | Thinking budget = 512 | +| `fixes_v4_g3flash` | 56.0% | +8.0 pp | Gemini 3 Flash, forced-choice retry, MCQ-aware planner | +| `fixes_v5_multiselect` | **69.4%** | +21.4 pp | Full multi-select MCQ support | +| `fixes_v7_g3flash_conf_gate` | **69.6%** | +21.6 pp | Confidence gate fix, fresh g3flash run | +| `fixes_v8_g3flash_color_area` | **71.2%** *(n = 1,250)* | +23.2 pp | Color-area OpenCV pre-hint; see `results.md` §8b | +| `fixes_v9_g3flash_related_sents` | **71.3%** *(n = 1,250)* | +23.3 pp | Verifier + `related_sentences` + caption cross-check; **~2.4× tighter latency tail** vs v8 (p95 87 s vs 209 s) | +| `fixes_v10_g3flash_choice_conflict` | **71.1%** *(n = 1,250)* | +23.1 pp | v9 + high-confidence **choice-conflict** flag for verifier | + +**vs. FinMME paper (Table 3, Gemini Flash 2.0 = 51.85%):** our best **250-ID** ladder run achieves **+17.8 pp** (v7 mean `answer_accuracy` vs paper headline — metric families differ). 
+ +**Fair same-model baseline (Gemini-3 Flash Preview structured zero-shot vs agent):** + +- **Primary (matched n = 1,250 train IDs):** zero-shot mean `answer_accuracy` **63.56%** vs agents — **v8** **71.24%** (**+7.68 pp**, exact **+8.72 pp**, McNemar χ² = 68.21, p ≈ 1.1×10⁻¹⁶); **v9** **71.28%** (**+7.72 pp**, exact **+8.16 pp**, χ² = 61.45, p ≈ 4.5×10⁻¹⁵); **v10** **71.08%** (**+7.52 pp**, exact **+7.84 pp**, χ² = 57.37, p ≈ 3.6×10⁻¹⁴). All three crush zero-shot; pairwise between agents nothing is significant (v9 vs v8 p = 0.56; v10 vs v8 p = 0.34; v9 vs v10 p = 0.75). v9's distinctive contribution is **latency-tail tightening**, not extra accuracy. Full zero-shot train file: **11,099** rows — always join on `sample_id` before comparing. +- **Legacy 250-ID snapshot (strict exact, ablation era):** zero-shot **52.8%** vs agent v7 **62.8%** → **+10.0 pp** — useful historically; see `results.md` §8b for context. + +> Note: the initial zero-shot Gemini-3 export had parser-related empty predictions; robust extraction + repair recovered many rows before the full 11k re-run. + +Detailed per-run analysis, per-type breakdowns, and paper comparison are in [`notebooks/results_analysis.ipynb`](notebooks/results_analysis.ipynb). +For camera-ready citation numbers, see [`markdown/camera_ready_metrics.md`](markdown/camera_ready_metrics.md). + +--- + ## Installation The development environment is managed with [uv](https://github.com/astral-sh/uv). @@ -110,20 +165,199 @@ uv run --env-file .env -m agentfinvqa.runner.run_generate_meps \ --out meps/ ``` +To target FinMME, switch `--dataset finmme`. The loader automatically writes FinMME charts to `data/finmme_images/` unless you override `--image_dir`. Note: Hugging Face only exposes a `train` split for `luojunyu/FinMME`. Any request for `test` is remapped to `train` internally, so use slicing (e.g. `--split train[:200]`) to simulate held-out subsets. 
+ +### Sample selection (`--split` and `--n`) + +- **`--split`** — Hugging Face split and optional row slice (e.g. `test`, `test[1000:]`, `train[:500]`). This selects *which rows* of the dataset are loaded. +- **`--n`** — Maximum number of **perceived samples** to process after that slice. Use **`0` or a negative value for no cap** (process the entire loaded slice). Positive `n` stops early once enough samples are materialized. + +So “run the whole `test` split” is typically: + +```bash +--split test --n 0 +``` + +A partial slice with no further cap: + +```bash +--split 'test[1000:]' --n 0 +``` + +### Batch helpers + +The recommended entrypoints for all datasets are `scripts/run_batch.py` and its bash wrapper `scripts/run_batch.sh`. These are dataset-agnostic and run generation + post-evaluation in a single MEP pass (metrics, traces, failure taxonomy, and summary in one go): + +```bash +scripts/run_batch.sh \ + --dataset chartqapro \ + --split test \ + --n 500 \ + --config gemini_gemini \ + --workers 8 \ + --post_eval \ + --use_judge \ + --langfuse \ + --resume \ + --eval_label chartqapro_test_n500 +``` + +To skip generation and run post-eval on existing MEPs only: + +```bash +scripts/run_batch.sh \ + --dataset chartqapro \ + --split test \ + --config gemini_gemini \ + --eval_only \ + --use_judge \ + --langfuse \ + --eval_label chartqapro_test_n500 +``` + +Both commands default to loading `.env` from the repo root. `--langfuse` pushes all numeric eval scores (accuracy, judge rubric scores) back to the originating Langfuse traces. + +**Verifier ablation:** pass `--no_verifier` to skip the VerifierAgent (Pass 2.5); the pipeline keeps the planner/vision draft without a revise step. Supported by `scripts/run_batch.py`, `scripts/run_finmme_batch.py`, and `scripts/submit_pipeline.sh` (see below). 
+ +### SLURM — single job (generation + eval) + +Submit a complete run (generation and post-eval) as one SLURM job: + +```bash +sbatch scripts/slurm_run_batch.slrm +``` + +Environment variables (`DATASET`, `SPLIT`, `N`, `CONFIG`, `WORKERS`, `LANGFUSE`, `RESUME`, `NO_VERIFIER`, and model overrides) are passed through from the environment or from `submit_pipeline.sh` via `--export`. Set `NO_VERIFIER=1` before `sbatch` if you call `slurm_run_batch.slrm` without the submit helper. + +### SLURM — eval only + +To run post-eval on MEPs that already exist: + +```bash +scripts/submit_eval.sh \ + --dataset chartqapro \ + --split test \ + --use_judge \ + --langfuse \ + --out_label chartqapro_test_n500 +``` + +This submits `slurm_eval_only.slrm` as a single SLURM job. You can chain it after a generation job: + +```bash +scripts/submit_eval.sh \ + --dataset chartqapro \ + --split test \ + --use_judge \ + --langfuse \ + --after <JOB_ID> +``` + +### SLURM — two-stage pipeline (async judge, recommended for large runs) + +For large runs, use the chained pipeline that separates MEP generation from LLM judge evaluation. This uses the [Gemini Batch API](https://ai.google.dev/gemini-api/docs/batch) for judge scoring (50% cost reduction, async): + +```bash +scripts/submit_pipeline.sh \ + --dataset finmme \ + --split "train[3000:5000]" \ + --n 2000 \ + --workers 8 \ + --langfuse \ + --resume \ + --planner_model gemini-2.5-flash \ + --vision_model gemini-2.5-flash \ + --ocr_model gemini-2.5-flash-lite \ + --verifier_model gemini-2.5-flash \ + --judge_model gemini-2.5-flash-lite +``` + +**Defaults (you usually do not need to repeat model flags)** — `submit_pipeline.sh` already defaults to `--config gemini_gemini`, `--workers 8`, and the same planner/vision/OCR/verifier/judge models as above. Override only what you change. Add `--langfuse` and/or `--resume` when you want tracing or skip-existing MEPs. For a **verifier-off ablation**, add `--no_verifier`. 
+ +**Full split without counting rows** — use `--n 0` (see the *Sample selection* subsection above): + +```bash +scripts/submit_pipeline.sh \ + --dataset chartqapro \ + --split test \ + --n 0 \ + --no_verifier \ + --resume +``` + +**Not the same as `run_batch.sh --post_eval`** — Job 1 in this chain runs **MEP generation only** (via `slurm_run_batch.slrm` → `run_batch.sh` **without** `--post_eval`). Job 2 submits prompts to the **Gemini Batch API** for async judge scoring. For **local / threaded** post-eval in one process (metrics, traces, taxonomy, summary written immediately), use `scripts/run_batch.sh` with `--post_eval` (and `--use_judge` if you want the LLM judge path during that step) instead of this two-stage pipeline. + +This submits two SLURM jobs chained with `--dependency=afterok`: + +| Job | Script | What it does | +|---|---|---| +| 1 | `slurm_run_batch.slrm` | MEP generation | +| 2 | `slurm_submit_judge_batch.slrm` | Uploads all judge prompts to Gemini Batch API and exits immediately | + +**Where MEPs and batch metrics go** + +- **MEP directory** (generation output): `meps/<config>/<dataset>/<split>/` when the verifier is on (default). With **`--no_verifier`**, MEPs go under **`meps/<config>/<dataset>/no_verifier/<split>/`** so verifier-on and verifier-off runs do not overwrite each other. Example: `meps/gemini_gemini/chartqapro/test/` vs `meps/gemini_gemini/chartqapro/no_verifier/test/`. +- **Batch judge file** (job 2): `output/metrics_<out_label>.jsonl` plus `output/metrics_<out_label>.jsonl.batch_state.json`. If you omit `--out_label`, the script sets `<out_label>` to `{dataset}_{sanitized_split}` and appends `_no_verifier` when `--no_verifier` is set (e.g. `chartqapro_test_no_verifier`), so different runs do not overwrite `metrics_test.jsonl`. + +Job 2 only runs if job 1 succeeds. When job 2 completes it prints the commands to check status and retrieve results: + +```bash +# Check if Gemini batch job is done +python3 -m agentfinvqa.eval.eval_outputs_batch status \ + --state output/metrics_