From 6b46e1166ee1a14179ea28c5dde713531e7848bc Mon Sep 17 00:00:00 2001 From: Aravind N Date: Fri, 27 Mar 2026 18:34:00 -0400 Subject: [PATCH 01/15] Add updated code for FinMME --- .gitignore | 20 ++ README.md | 123 ++++++- pyproject.toml | 3 +- scripts/run_finmme_batch.py | 231 +++++++++++++ scripts/run_finmme_batch.sh | 22 ++ scripts/submit_pipeline.sh | 110 ++++++ src/agentfinvqa/agents/planner_agent.py | 3 +- src/agentfinvqa/agents/prompts/planner.txt | 12 +- src/agentfinvqa/agents/prompts/vision.txt | 25 +- src/agentfinvqa/agents/verifier_agent.py | 322 +++++------------- src/agentfinvqa/agents/vision_agent.py | 106 +++--- src/agentfinvqa/datasets/chartqapro_loader.py | 29 +- src/agentfinvqa/datasets/finmme_loader.py | 228 +++++++++++++ src/agentfinvqa/datasets/image_utils.py | 58 ++++ src/agentfinvqa/eval/dashboard.py | 117 ++++++- src/agentfinvqa/eval/error_taxonomy.py | 21 +- src/agentfinvqa/eval/eval_outputs.py | 66 ++++ src/agentfinvqa/eval/eval_outputs_batch.py | 190 +++++++++++ src/agentfinvqa/eval/eval_topk.py | 10 +- src/agentfinvqa/eval/judge.py | 3 +- src/agentfinvqa/eval/judge_batch.py | 248 ++++++++++++++ .../langfuse_integration/client.py | 46 ++- .../langfuse_integration/dataset.py | 55 ++- .../langfuse_integration/tracing.py | 5 +- src/agentfinvqa/mep/schema.py | 1 + src/agentfinvqa/runner/run_generate_meps.py | 213 +++++++++--- src/agentfinvqa/tools/ocr_reader_tool.py | 5 +- src/agentfinvqa/tools/verifier_tool.py | 221 ++++++++++++ src/agentfinvqa/tools/vision_qa_tool.py | 14 +- src/agentfinvqa/utils/model_compat.py | 23 ++ tests/agentfinvqa/test_finmme_loader.py | 64 ++++ tests/conftest.py | 12 +- uv.lock | 4 +- 33 files changed, 2185 insertions(+), 425 deletions(-) create mode 100644 scripts/run_finmme_batch.py create mode 100755 scripts/run_finmme_batch.sh create mode 100755 scripts/submit_pipeline.sh create mode 100644 src/agentfinvqa/datasets/finmme_loader.py create mode 100644 src/agentfinvqa/datasets/image_utils.py create mode 100644 
src/agentfinvqa/eval/eval_outputs_batch.py create mode 100644 src/agentfinvqa/eval/judge_batch.py create mode 100644 src/agentfinvqa/tools/verifier_tool.py create mode 100644 src/agentfinvqa/utils/model_compat.py create mode 100644 tests/agentfinvqa/test_finmme_loader.py diff --git a/.gitignore b/.gitignore index f5e1e38..c3955d2 100644 --- a/.gitignore +++ b/.gitignore @@ -24,3 +24,23 @@ wheels/ # macos *.DS_Store site/ + +# Media files +*.jpg +*.jpeg +*.png +*.gif + +# CSV and JSON files +*.csv +*.json +*.jsonl + +# Log files +*.log +logs/ +**/logs/ + +# Bash and SLURM scripts +*.slrm +*.backup diff --git a/README.md b/README.md index 84adf9b..5d0ea32 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,16 @@ [![codecov](https://codecov.io/github/VectorInstitute/AgentFinVQA/graph/badge.svg?token=83MYFZ3UPA)](https://codecov.io/github/VectorInstitute/AgentFinVQA) ![GitHub License](https://img.shields.io/github/license/VectorInstitute/AgentFinVQA) -A multi-agent evaluation framework for Visual Question Answering on financial charts, built on the [ChartQAPro](https://huggingface.co/datasets/ahmed-masry/ChartQAPro) dataset. The framework decomposes chart QA into an explicit **Plan → Inspect → Explain** loop, producing fully traceable evaluation artifacts for each sample. +A multi-agent evaluation framework for Visual Question Answering on financial charts, supporting both [ChartQAPro](https://huggingface.co/datasets/ahmed-masry/ChartQAPro) and [FinMME](https://huggingface.co/datasets/luojunyu/FinMME). The framework decomposes chart QA into an explicit **Plan → Inspect → Explain** loop, producing fully traceable evaluation artifacts for each sample. + +## Supported datasets + +| Dataset | HF handle | Notes | +| --- | --- | --- | +| ChartQAPro | `ahmed-masry/ChartQAPro` | Multi-turn chart QA with factoid/mcq/unanswerable tasks. Images cached under `data/chartqapro_images/` by default. 
| +| FinMME | `luojunyu/FinMME` | Financial multi-modal evaluation benchmark (only a `train` split on HF; use slicing like `train[:1000]`). Images cached under `data/finmme_images/` by default. | + +Select the dataset at runtime with `--dataset {chartqapro|finmme}`; all downstream tooling (Langfuse registration, output directories) keys off the same slug. ## Overview @@ -110,20 +119,122 @@ uv run --env-file .env -m agentfinvqa.runner.run_generate_meps \ --out meps/ ``` +To target FinMME, pass `--dataset finmme`. The loader automatically writes FinMME charts to `data/finmme_images/` unless you override `--image_dir`. Note: Hugging Face only exposes a `train` split for `luojunyu/FinMME`. Any request for `test` is remapped to `train` internally, so use slicing (e.g. `--split train[:200]`) to simulate held-out subsets. + +### Batch helpers + +For headless or batch jobs, use the helper scripts: + +- Python entrypoint: `python scripts/run_finmme_batch.py --n 500 --split train[:500] --config gemini_gemini --workers 8` +- Bash wrapper (convenient for schedulers): `scripts/run_finmme_batch.sh --n 500 --split train[:500]` + +Both commands default to loading `.env` from the repo root; override by setting `ENV_FILE=/path/to/.env` before calling the bash script or by passing `--env_file` to the Python script. + +### SLURM batch job template + +If you need to submit a single FinMME run to SLURM (MEP generation + post-eval in one job), use the monolithic template: + +```bash +sbatch scripts/slurm_run_finmme_batch.slrm +``` + +This script sets a time limit (`--time=0-04:00:00`), runs the existing bash helper, and logs output to `logs/slurm_finmme_.out/err`. Edit the SBATCH directives and the `run_finmme_batch.sh` arguments inside the file to match your workload. + +### SLURM two-stage pipeline (recommended for large runs) + +For large runs, use the chained pipeline that separates MEP generation from LLM judge evaluation. 
This uses the [Gemini Batch API](https://ai.google.dev/gemini-api/docs/batch) for judge scoring (50% cost reduction, async): + +```bash +scripts/submit_pipeline.sh \ + --split "train[3000:5000]" \ + --n 2000 \ + --workers 8 \ + --langfuse \ + --resume \ + --planner_model gemini-2.5-flash \ + --vision_model gemini-2.5-flash \ + --ocr_model gemini-2.5-flash-lite \ + --verifier_model gemini-2.5-flash \ + --judge_model gemini-2.5-flash-lite +``` + +This submits two SLURM jobs chained with `--dependency=afterok`: + +| Job | Script | What it does | +|---|---|---| +| 1 | `slurm_generate_meps.slrm` | MEP generation only | +| 2 | `slurm_submit_judge_batch.slrm` | Uploads all judge prompts to Gemini Batch API and exits immediately | + +Job 2 only runs if job 1 succeeds. When job 2 completes it prints the commands to check status and retrieve results: + +```bash +# Check if Gemini batch job is done +python3 -m agentfinvqa.eval.eval_outputs_batch status \ + --state output/metrics_