From 8201912eca67e71d4dc99bb4e4090263228335e5 Mon Sep 17 00:00:00 2001 From: Arya Hariharan Date: Sat, 11 Oct 2025 13:08:10 +0530 Subject: [PATCH 01/12] Add careqa mcq eval environment --- environments/careqa_mcq/README.md | 60 ++++++++++++++++++++ environments/careqa_mcq/careqa_mcq.py | 78 ++++++++++++++++++++++++++ environments/careqa_mcq/pyproject.toml | 22 ++++++++ 3 files changed, 160 insertions(+) create mode 100644 environments/careqa_mcq/README.md create mode 100644 environments/careqa_mcq/careqa_mcq.py create mode 100644 environments/careqa_mcq/pyproject.toml diff --git a/environments/careqa_mcq/README.md b/environments/careqa_mcq/README.md new file mode 100644 index 00000000..4d936e88 --- /dev/null +++ b/environments/careqa_mcq/README.md @@ -0,0 +1,60 @@ +# careqa + +Evaluation environment for the [HPAI-BSC/CareQA](https://huggingface.co/datasets/HPAI-BSC/CareQA) multiple-choice dataset. + +--- + +### Overview +- **Environment ID**: `careqa` +- **Short description**: CareQA is a healthcare QA dataset with **multiple-choice** and **open-ended clinical reasoning questions**. This environment is for the MCQs only. +- **Tags**: healthcare, medical QA, clinical reasoning, MCQ, open-ended, single-turn + +### Datasets +- **Primary dataset(s)**: + - `CareQA_en` – multiple-choice clinical questions with 4 options and correct answer labels. +- **Source links**: + - [Hugging Face CareQA dataset](https://huggingface.co/datasets/HPAI-BSC/CareQA) + +### Task +- **Type**: single-turn +- **Parser**: custom prompt mapping (no structured markup) +- **Rubric overview**: +**MCQ (`closed_mcq`)**: `vf.Rubric()` measuring **accuracy** (letter match). + +--- + +### Quickstart +Run an evaluation with default settings: + +```bash +uv run vf-eval careqa +``` + +Configure model and sampling: + +```bash +uv run vf-eval careqa -m gpt-4.1-mini -n 20 -r 3 -t 1024 -T 0.7 -a '{"max_examples": 50}' +``` + +Notes: +- Use `-a` / `--env-args` to pass environment-specific configuration as a JSON object. + + +### Environment Arguments + +| Arg | Type | Default | Description | +|----------------|------|---------|-------------| +| `max_examples` | int | `-1` | Maximum number of examples to evaluate; use `-1` for full dataset | +| `split` | str | `"test"` | Dataset split to use: `train`, `validation`, or `test` | +| `verbose` | bool | `False` | Print prompt/answer samples during evaluation | + +--- + +### Metrics + +| Metric | Meaning | +|---------------|---------| +| `reward` | Main scalar reward (weighted sum of rubric criteria) | +| `accuracy` | Exact match on target MCQ answer (letter A–D) | + + diff --git a/environments/careqa_mcq/careqa_mcq.py b/environments/careqa_mcq/careqa_mcq.py new file mode 100644 index 00000000..b670fcd8 --- /dev/null +++ b/environments/careqa_mcq/careqa_mcq.py @@ -0,0 +1,78 @@ +from __future__ import annotations +from typing import Any, Optional +from datasets import load_dataset +import verifiers as vf + + +# Helper Functions + +def _get_text_from_completion(completion: Any) -> str: + """Extract plain text from completion.""" + if isinstance(completion, str): + return completion.strip() + if isinstance(completion, list) and completion: + last = completion[-1] + if isinstance(last, dict): + return str(last.get("content", "")).strip() + return str(last).strip() + return str(completion).strip() + + +def _first_letter(text: str) -> Optional[str]: + """Extract the first uppercase A–Z letter.""" + for ch in (text or "").upper(): + if "A" <= ch <= "Z": + return ch + return None + +# Prompt Construction + +def _build_prompt(question: str, options: dict[str, str]) -> str: + """Create a polished clinical MCQ prompt.""" + formatted_opts = "\n".join(f"{k}. {v}" for k, v in options.items()) + letters = ", ".join(options.keys()) + return ( + "You are a board-certified clinician taking a medical reasoning test.\n" + "Read the following question carefully and choose the most appropriate answer.\n\n" + f"Question:\n{question.strip()}\n\n" + f"Options:\n{formatted_opts}\n\n" + f"Respond with only the option letter ({letters}), nothing else." + ) + +# Main Environment + +def load_environment(split: str = "test") -> vf.Environment: + """ + CareQA multiple-choice evaluation environment. + Uses vf.SingleTurnEnv + MCQ accuracy rubric. + """ + ds = load_dataset("HPAI-BSC/CareQA",'CareQA_en', split=split) + + def _map(ex): + options = {"A": ex["op1"], "B": ex["op2"], "C": ex["op3"], "D": ex["op4"]} + gold_letter = ["A", "B", "C", "D"][ex["cop"] - 1] + # The key change is here: format the single prompt string as a list of dicts (ChatML format) + return { + "prompt": [ + { + "role": "user", + "content": _build_prompt(ex["question"], options) + } + ], + "answer": gold_letter, + } + + mapped = ds.map(_map, remove_columns=ds.column_names) + + def mcq_accuracy(completion, answer): + pred = _first_letter(_get_text_from_completion(completion)) + return 1.0 if pred == str(answer).upper() else 0.0 + + rubric = vf.Rubric(funcs=[mcq_accuracy], weights=[1.0]) + + return vf.SingleTurnEnv( + dataset=mapped, + eval_dataset=mapped, + rubric=rubric, + system_prompt=None, + ) diff --git a/environments/careqa_mcq/pyproject.toml b/environments/careqa_mcq/pyproject.toml new file mode 100644 index 00000000..fa298d5a --- /dev/null +++ b/environments/careqa_mcq/pyproject.toml @@ -0,0 +1,22 @@ +[project] +name = "careqa_mcq" +description = "Evaluation environment for the HPAI-BSC/CareQA MCQ dataset" +tags = ["healthcare", "medical-qa", "mcq", "open-ended", "clinical", "single-turn"] +version = "0.1.0" +requires-python = ">=3.11" +dependencies = [ + "verifiers>=0.1.4", + "datasets>=2.13.0" +] + +[tool.prime.environment] +loader = "careqa_mcq:load_environment" +display_name = "CareQA" +visibility = "PUBLIC" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build] +include = ["careqa_mcq.py"] \ No newline at end of file From a19ba978e0f76ea7d2d140999b22de49b1e94102 Mon Sep 17 00:00:00 2001 From: Arya Hariharan Date: Sat, 11 Oct 2025 16:19:31 +0530 Subject: [PATCH 02/12] add careqa open-ended env --- environments/careqa_mcq/README.md | 15 ++---- environments/careqa_mcq/careqa_mcq.py | 2 +- environments/careqa_mcq/pyproject.toml | 2 +- environments/careqa_openended/README.md | 51 +++++++++++++++++++ .../careqa_openended/careqa_openended.py | 48 +++++++++++++++++ environments/careqa_openended/pyproject.toml | 22 ++++++++ 6 files changed, 127 insertions(+), 13 deletions(-) create mode 100644 environments/careqa_openended/README.md create mode 100644 environments/careqa_openended/careqa_openended.py create mode 100644 environments/careqa_openended/pyproject.toml diff --git a/environments/careqa_mcq/README.md b/environments/careqa_mcq/README.md index 4d936e88..89dc1c04 100644 --- a/environments/careqa_mcq/README.md +++ b/environments/careqa_mcq/README.md @@ -2,12 +2,10 @@ Evaluation environment for the [HPAI-BSC/CareQA](https://huggingface.co/datasets/HPAI-BSC/CareQA) multiple-choice dataset. ---- - ### Overview -- **Environment ID**: `careqa` +- **Environment ID**: `careqa_mcq` - **Short description**: CareQA is a healthcare QA dataset with **multiple-choice** and **open-ended clinical reasoning questions**. This environment is for the MCQs only. -- **Tags**: healthcare, medical QA, clinical reasoning, MCQ, open-ended, single-turn +- **Tags**: healthcare, medical QA, clinical reasoning, MCQ, single-turn ### Datasets - **Primary dataset(s)**: @@ -21,8 +19,6 @@ Evaluation environment for the [HPAI-BSC/CareQA](https://huggingface.co/datasets - **Rubric overview**: **MCQ (`closed_mcq`)**: `vf.Rubric()` measuring **accuracy** (letter match). ---- - ### Quickstart Run an evaluation with default settings: @@ -33,11 +29,8 @@ uv run vf-eval careqa Configure model and sampling: ```bash -uv run vf-eval careqa -m gpt-4.1-mini -n 20 -r 3 -t 1024 -T 0.7 -a '{"max_examples": 50}' -``` - -Notes: -- Use `-a` / `--env-args` to pass environment-specific configuration as a JSON object. +uv run vf-eval careqa_mcq --model gpt-4.1-mini --num-examples 3 -s +``` ### Environment Arguments diff --git a/environments/careqa_mcq/careqa_mcq.py b/environments/careqa_mcq/careqa_mcq.py index b670fcd8..6f62367d 100644 --- a/environments/careqa_mcq/careqa_mcq.py +++ b/environments/careqa_mcq/careqa_mcq.py @@ -28,7 +28,7 @@ def _first_letter(text: str) -> Optional[str]: # Prompt Construction def _build_prompt(question: str, options: dict[str, str]) -> str: - """Create a polished clinical MCQ prompt.""" + """Create an MCQ prompt.""" formatted_opts = "\n".join(f"{k}. {v}" for k, v in options.items()) letters = ", ".join(options.keys()) return ( diff --git a/environments/careqa_mcq/pyproject.toml b/environments/careqa_mcq/pyproject.toml index fa298d5a..8fbbc8cc 100644 --- a/environments/careqa_mcq/pyproject.toml +++ b/environments/careqa_mcq/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "careqa_mcq" description = "Evaluation environment for the HPAI-BSC/CareQA MCQ dataset" -tags = ["healthcare", "medical-qa", "mcq", "open-ended", "clinical", "single-turn"] +tags = ["healthcare", "medical-qa", "mcq", "clinical", "single-turn"] version = "0.1.0" requires-python = ">=3.11" dependencies = [ diff --git a/environments/careqa_openended/README.md b/environments/careqa_openended/README.md new file mode 100644 index 00000000..ffa0f78a --- /dev/null +++ b/environments/careqa_openended/README.md @@ -0,0 +1,51 @@ +# careqa-openended + +> Replace the placeholders below, then remove this callout. + +### Overview +- **Environment ID**: `careqa-openended` +- **Short description**: +- **Tags**: + +### Datasets +- **Primary dataset(s)**: +- **Source links**: +- **Split sizes**: + +### Task +- **Type**: +- **Parser**: +- **Rubric overview**: + +### Quickstart +Run an evaluation with default settings: + +```bash +uv run vf-eval careqa-openended +``` + +Configure model and sampling: + +```bash +uv run vf-eval careqa-openended -m gpt-4.1-mini -n 20 -r 3 -t 1024 -T 0.7 -a '{"key": "value"}' # env-specific args as JSON +``` + +Notes: +- Use `-a` / `--env-args` to pass environment-specific configuration as a JSON object. + +### Environment Arguments +Document any supported environment arguments and their meaning. Example: + +| Arg | Type | Default | Description | +| --- | ---- | ------- | ----------- | +| `foo` | str | `"bar"` | What this controls | +| `max_examples` | int | `-1` | Limit on dataset size (use -1 for all) | + +### Metrics +Summarize key metrics your rubric emits and how theyre interpreted. + +| Metric | Meaning | +| ------ | ------- | +| `reward` | Main scalar reward (weighted sum of criteria) | +| `accuracy` | Exact match on target answer | + diff --git a/environments/careqa_openended/careqa_openended.py b/environments/careqa_openended/careqa_openended.py new file mode 100644 index 00000000..a09aa731 --- /dev/null +++ b/environments/careqa_openended/careqa_openended.py @@ -0,0 +1,48 @@ +from __future__ import annotations +from typing import Any, Optional +from datasets import load_dataset +import verifiers as vf + +# Prompt Construction + +def _build_open_prompt(question: str) -> str: + """Create an open-ended clinical QA prompt.""" + return ( + "You are an expert clinician answering medical questions.\n" + "Read the following question carefully and provide a detailed, concise answer.\n\n" + f"Question:\n{question.strip()}\n\n" + "Answer:" + ) + +# Load Open-Ended Environment + +def load_environment(split: str = "test") -> vf.SingleTurnEnv: + ds = load_dataset("HPAI-BSC/CareQA", 'CareQA_en_open', split=split) + + def _map(ex): + system_content = "You are an expert clinician answering medical questions." + + user_content = ( + "Read the following question carefully and provide a detailed, concise answer.\n\n" + f"Question:\n{ex['question'].strip()}\n\n" + "Answer:" + ) + + return { + "prompt": [ + {"role": "system", "content": system_content}, + {"role": "user", "content": user_content}, + ], + "answer": ex.get("answer_explanation", ex.get("answer", "")), + } + + mapped = ds.map(_map, remove_columns=ds.column_names) + + rubric = vf.JudgeRubric() + + return vf.SingleTurnEnv( + dataset=mapped, + eval_dataset=mapped, + rubric=rubric, + system_prompt=None, + ) diff --git a/environments/careqa_openended/pyproject.toml b/environments/careqa_openended/pyproject.toml new file mode 100644 index 00000000..80e4e765 --- /dev/null +++ b/environments/careqa_openended/pyproject.toml @@ -0,0 +1,22 @@ +[project] +name = "careqa_openended" +description = "Evaluation environment for the HPAI-BSC/CareQA open-ended dataset" +tags = ["healthcare", "medical-qa", "open-ended", "clinical", "single-turn"] +version = "0.1.0" +requires-python = ">=3.11" +dependencies = [ + "verifiers>=0.1.4", + "datasets>=2.13.0" +] + +[tool.prime.environment] +loader = "careqa_openended:load_environment" +display_name = "CareQA" +visibility = "PUBLIC" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build] +include = ["careqa_openended.py"] \ No newline at end of file From ec93a0fb012c55e085ab8cd76f214d37eb83f1bf Mon Sep 17 00:00:00 2001 From: Arya Hariharan Date: Sat, 11 Oct 2025 16:22:55 +0530 Subject: [PATCH 03/12] add careqa open-ended env --- environments/careqa_mcq/README.md | 11 ------ environments/careqa_openended/README.md | 51 ++++++++++--------------- 2 files changed, 21 insertions(+), 41 deletions(-) diff --git a/environments/careqa_mcq/README.md b/environments/careqa_mcq/README.md index 89dc1c04..2797a8dd 100644 --- a/environments/careqa_mcq/README.md +++ b/environments/careqa_mcq/README.md @@ -32,17 +32,6 @@ Configure model and sampling: uv run vf-eval careqa_mcq --model gpt-4.1-mini --num-examples 3 -s ``` - -### Environment Arguments - -| Arg | Type | Default | Description | -|----------------|------|---------|-------------| -| `max_examples` | int | `-1` | Maximum number of examples to evaluate; use `-1` for full dataset | -| `split` | str | `"test"` | Dataset split to use: `train`, `validation`, or `test` | -| `verbose` | bool | `False` | Print prompt/answer samples during evaluation | - ---- - ### Metrics | Metric | Meaning | diff --git a/environments/careqa_openended/README.md b/environments/careqa_openended/README.md index ffa0f78a..f08d0250 100644 --- a/environments/careqa_openended/README.md +++ b/environments/careqa_openended/README.md @@ -1,51 +1,42 @@ -# careqa-openended +# careqa_openended -> Replace the placeholders below, then remove this callout. +Evaluation environment for the [HPAI-BSC/CareQA](https://huggingface.co/datasets/HPAI-BSC/CareQA) openended dataset. ### Overview -- **Environment ID**: `careqa-openended` -- **Short description**: -- **Tags**: +- **Environment ID**: `careqa_openended` +- **Short description**: CareQA is a healthcare QA dataset with **multiple-choice** and **open-ended clinical reasoning questions**. This environment is for the open-ended questions only. +- **Tags**: healthcare, medical QA, clinical reasoning, single-turn ### Datasets -- **Primary dataset(s)**: -- **Source links**: -- **Split sizes**: +- **Primary dataset(s)**: + - `CareQA_en_open` – open-ended clinical questions with reference answers. +- **Source links**: + - [Hugging Face CareQA dataset](https://huggingface.co/datasets/HPAI-BSC/CareQA) ### Task -- **Type**: -- **Parser**: -- **Rubric overview**: +- **Type**: single-turn +- **Parser**: custom prompt mapping (no structured markup) +- **Rubric overview**: +**Open-ended (`open_clinical`)**: `vf.JudgeRubric()` using an LLM-as-judge to score free-text answers for correctness and clinical reasoning. ### Quickstart Run an evaluation with default settings: ```bash -uv run vf-eval careqa-openended +uv run vf-eval careqa ``` Configure model and sampling: ```bash -uv run vf-eval careqa-openended -m gpt-4.1-mini -n 20 -r 3 -t 1024 -T 0.7 -a '{"key": "value"}' # env-specific args as JSON -``` - -Notes: -- Use `-a` / `--env-args` to pass environment-specific configuration as a JSON object. - -### Environment Arguments -Document any supported environment arguments and their meaning. Example: - -| Arg | Type | Default | Description | -| --- | ---- | ------- | ----------- | -| `foo` | str | `"bar"` | What this controls | -| `max_examples` | int | `-1` | Limit on dataset size (use -1 for all) | +uv run vf-eval careqa_openended --model gpt-4.1-mini --num-examples 3 -s +``` ### Metrics -Summarize key metrics your rubric emits and how theyre interpreted. -| Metric | Meaning | -| ------ | ------- | -| `reward` | Main scalar reward (weighted sum of criteria) | -| `accuracy` | Exact match on target answer | +| Metric | Meaning | +|---------------|---------| +| `reward` | Main scalar reward (weighted sum of rubric criteria) | +| `judge_score` | For open-ended questions, LLM-assigned score evaluating answer quality, correctness, and clinical reasoning | + From 61eed341585e41da02fb782573f4de9666f8d6bf Mon Sep 17 00:00:00 2001 From: Arya Hariharan Date: Sat, 11 Oct 2025 16:26:34 +0530 Subject: [PATCH 04/12] resolving issues --- environments/careqa_openended/careqa_openended.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/environments/careqa_openended/careqa_openended.py b/environments/careqa_openended/careqa_openended.py index a09aa731..6e560f3d 100644 --- a/environments/careqa_openended/careqa_openended.py +++ b/environments/careqa_openended/careqa_openended.py @@ -2,17 +2,6 @@ from typing import Any, Optional from datasets import load_dataset import verifiers as vf - -# Prompt Construction - -def _build_open_prompt(question: str) -> str: - """Create an open-ended clinical QA prompt.""" - return ( - "You are an expert clinician answering medical questions.\n" - "Read the following question carefully and provide a detailed, concise answer.\n\n" - f"Question:\n{question.strip()}\n\n" - "Answer:" - ) # Load Open-Ended Environment From f311014e26dec64a2e54e84cea36a15799a26cf3 Mon Sep 17 00:00:00 2001 From: Arya Hariharan Date: Sat, 11 Oct 2025 16:27:14 +0530 Subject: [PATCH 05/12] removing redundant imports --- environments/careqa_openended/careqa_openended.py | 1 - 1 file changed, 1 deletion(-) diff --git a/environments/careqa_openended/careqa_openended.py b/environments/careqa_openended/careqa_openended.py index 6e560f3d..a69125ee 100644 --- a/environments/careqa_openended/careqa_openended.py +++ b/environments/careqa_openended/careqa_openended.py @@ -1,5 +1,4 @@ from __future__ import annotations -from typing import Any, Optional from datasets import load_dataset import verifiers as vf From f9f321a9e94a511cb7cfd61b4f5148b06168bae7 Mon Sep 17 00:00:00 2001 From: Arya Hariharan Date: Fri, 17 Oct 2025 13:18:33 +0530 Subject: [PATCH 06/12] resolving comments --- environments/careqa_mcq/README.md | 2 +- environments/careqa_mcq/careqa_mcq.py | 61 ++++++++++----------------- 2 files changed, 24 insertions(+), 39 deletions(-) diff --git a/environments/careqa_mcq/README.md b/environments/careqa_mcq/README.md index 2797a8dd..9f455715 100644 --- a/environments/careqa_mcq/README.md +++ b/environments/careqa_mcq/README.md @@ -23,7 +23,7 @@ Evaluation environment for the [HPAI-BSC/CareQA](https://huggingface.co/datasets Run an evaluation with default settings: ```bash -uv run vf-eval careqa +uv run vf-eval careqa_mcq ``` Configure model and sampling: diff --git a/environments/careqa_mcq/careqa_mcq.py b/environments/careqa_mcq/careqa_mcq.py index 6f62367d..fb10d9fe 100644 --- a/environments/careqa_mcq/careqa_mcq.py +++ b/environments/careqa_mcq/careqa_mcq.py @@ -2,46 +2,32 @@ from typing import Any, Optional from datasets import load_dataset import verifiers as vf - - -# Helper Functions - -def _get_text_from_completion(completion: Any) -> str: - """Extract plain text from completion.""" - if isinstance(completion, str): - return completion.strip() - if isinstance(completion, list) and completion: - last = completion[-1] - if isinstance(last, dict): - return str(last.get("content", "")).strip() - return str(last).strip() - return str(completion).strip() - - -def _first_letter(text: str) -> Optional[str]: - """Extract the first uppercase A–Z letter.""" - for ch in (text or "").upper(): - if "A" <= ch <= "Z": - return ch - return None +from verifiers.utils.data_utils import ( + extract_boxed_answer, + BOXED_SYSTEM_PROMPT, + THINK_BOXED_SYSTEM_PROMPT, +) # Prompt Construction def _build_prompt(question: str, options: dict[str, str]) -> str: """Create an MCQ prompt.""" formatted_opts = "\n".join(f"{k}. {v}" for k, v in options.items()) - letters = ", ".join(options.keys()) - return ( - "You are a board-certified clinician taking a medical reasoning test.\n" - "Read the following question carefully and choose the most appropriate answer.\n\n" - f"Question:\n{question.strip()}\n\n" - f"Options:\n{formatted_opts}\n\n" - f"Respond with only the option letter ({letters}), nothing else." - ) + return f"Question:{question}\n{formatted_opts}\nAnswer:" + +def exact_match(parser: vf.Parser, completion: str, answer: str, **kwargs) -> float: + """Reward exact matches.""" + response = parser.parse_answer(completion).strip().upper() + return 1.0 if response == answer.strip().upper() else 0.0 # Main Environment -def load_environment(split: str = "test") -> vf.Environment: +def load_environment( + split: str = "test", + use_think: bool = False, + system_prompt: Optional[str] = None + ) -> vf.Environment: + """ CareQA multiple-choice evaluation environment. Uses vf.SingleTurnEnv + MCQ accuracy rubric. @@ -51,7 +37,6 @@ def load_environment(split: str = "test") -> vf.Environment: def _map(ex): options = {"A": ex["op1"], "B": ex["op2"], "C": ex["op3"], "D": ex["op4"]} gold_letter = ["A", "B", "C", "D"][ex["cop"] - 1] - # The key change is here: format the single prompt string as a list of dicts (ChatML format) return { "prompt": [ { @@ -63,16 +48,16 @@ def _map(ex): } mapped = ds.map(_map, remove_columns=ds.column_names) + + parser = vf.ThinkParser(extract_boxed_answer) if use_think else vf.Parser(extract_boxed_answer) + system_prompt = system_prompt or (THINK_BOXED_SYSTEM_PROMPT if use_think else BOXED_SYSTEM_PROMPT) - def mcq_accuracy(completion, answer): - pred = _first_letter(_get_text_from_completion(completion)) - return 1.0 if pred == str(answer).upper() else 0.0 - - rubric = vf.Rubric(funcs=[mcq_accuracy], weights=[1.0]) + rubric = vf.Rubric(funcs=[exact_match], weights=[1.0], parser=parser) return vf.SingleTurnEnv( dataset=mapped, eval_dataset=mapped, rubric=rubric, - system_prompt=None, + parser = parser, + system_prompt=system_prompt, ) From 8a04ba90491ea6a99480fd7d09549fb8e8384560 Mon Sep 17 00:00:00 2001 From: Arya Hariharan Date: Sat, 18 Oct 2025 11:41:13 +0530 Subject: [PATCH 07/12] resolving commits --- environments/careqa_mcq/careqa_mcq.py | 2 +- .../careqa_openended/careqa_openended.py | 123 +++++++++++++++--- 2 files changed, 103 insertions(+), 22 deletions(-) diff --git a/environments/careqa_mcq/careqa_mcq.py b/environments/careqa_mcq/careqa_mcq.py index fb10d9fe..16bcce88 100644 --- a/environments/careqa_mcq/careqa_mcq.py +++ b/environments/careqa_mcq/careqa_mcq.py @@ -13,7 +13,7 @@ def _build_prompt(question: str, options: dict[str, str]) -> str: """Create an MCQ prompt.""" formatted_opts = "\n".join(f"{k}. {v}" for k, v in options.items()) - return f"Question:{question}\n{formatted_opts}\nAnswer:" + return f"Question:{question}\nChoices:{formatted_opts}\nAnswer:" def exact_match(parser: vf.Parser, completion: str, answer: str, **kwargs) -> float: """Reward exact matches.""" diff --git a/environments/careqa_openended/careqa_openended.py b/environments/careqa_openended/careqa_openended.py index a69125ee..08b0f0c4 100644 --- a/environments/careqa_openended/careqa_openended.py +++ b/environments/careqa_openended/careqa_openended.py @@ -1,36 +1,117 @@ from __future__ import annotations +import os +import re from datasets import load_dataset +from openai import AsyncOpenAI import verifiers as vf - -# Load Open-Ended Environment -def load_environment(split: str = "test") -> vf.SingleTurnEnv: - ds = load_dataset("HPAI-BSC/CareQA", 'CareQA_en_open', split=split) - def _map(ex): - system_content = "You are an expert clinician answering medical questions." +def load_environment( + judge_model: str = "gpt-4o-mini", + judge_base_url: str | None = None, + judge_api_key: str | None = None, +) -> vf.Environment: + """ + CareQA Open-Ended QA environment using LLM-as-a-Judge evaluation. + + This environment loads the open-ended subset of the CareQA dataset and + uses an LLM judge to assess whether a model's response matches or aligns + medically with the reference answer. + """ - user_content = ( - "Read the following question carefully and provide a detailed, concise answer.\n\n" - f"Question:\n{ex['question'].strip()}\n\n" - "Answer:" - ) + # --- Load Dataset --- + ds = load_dataset("HPAI-BSC/CareQA", "CareQA_en_open") + train_dataset = ds["train"] if "train" in ds else None + eval_dataset = ds["test"] + def _map(ex): return { - "prompt": [ - {"role": "system", "content": system_content}, - {"role": "user", "content": user_content}, - ], + "question": ex["question"].strip(), "answer": ex.get("answer_explanation", ex.get("answer", "")), + "task": "careqa_open", } - mapped = ds.map(_map, remove_columns=ds.column_names) + if train_dataset: + train_dataset = train_dataset.map(_map, remove_columns=train_dataset.column_names) + eval_dataset = eval_dataset.map(_map, remove_columns=eval_dataset.column_names) + + # System Prompt + system_prompt = ( + "You are an expert clinician answering open-ended medical questions. " + "Read the question carefully, reason step by step, and provide a precise, " + "clinically sound answer enclosed in tags, followed by your conclusion." + ) + + # Judge Prompt Template + JUDGE_TEMPLATE = """\ +You are a clinical reasoning evaluator tasked with determining whether a model's medical answer +is equivalent in meaning and correctness to the reference (ground truth) answer. + +You will be given: +1. A clinical question. +2. The ground truth answer. +3. A model's predicted answer. + +Judge whether the predicted answer is *medically equivalent* to the ground truth. +Equivalence means that both answers express the same medical reasoning or correct clinical interpretation, +even if the wording differs. + +Guidelines: +- Equivalent if the same diagnosis, reasoning, or recommendation is conveyed. +- Accept synonyms (e.g., “heart attack” vs “myocardial infarction”). +- Ignore trivial stylistic differences or additional context. +- Not equivalent if the model changes the diagnosis, key mechanism, or recommendation. + +Question: {question} - rubric = vf.JudgeRubric() +Ground Truth Answer: {answer} - return vf.SingleTurnEnv( - dataset=mapped, - eval_dataset=mapped, +Predicted Answer: {response} + +Is the predicted answer medically equivalent to the ground truth? +Respond strictly with "EQUIVALENT" or "NOT_EQUIVALENT". +""".strip() + + # Judge Client Setup + api_key = judge_api_key or os.getenv("OPENAI_API_KEY") + judge_client = AsyncOpenAI(base_url=judge_base_url, api_key=api_key) if api_key else None + + # Reward Extraction + def extract_answer_section(completion_text: str) -> str: + """Extract final answer after think tags.""" + if not completion_text: + return "" + if "" in completion_text and "" in completion_text: + return re.sub(r".*?", "", completion_text, flags=re.DOTALL).strip() + return completion_text.strip() + + async def careqa_reward_func(judge, prompt, completion, answer, state, **kwargs) -> float: + """Evaluate medical equivalence using LLM-as-judge.""" + completion_text = completion if isinstance(completion, str) else str(completion) + response = extract_answer_section(completion_text) + + judge_response = await judge(prompt, response, answer, state, **kwargs) + decision = judge_response.strip().upper() + + if "EQUIVALENT" in decision and "NOT_EQUIVALENT" not in decision: + return 1.0 + else: + return 0.0 + + # Judge Rubric + rubric = vf.JudgeRubric( + judge_client=judge_client, + judge_model=judge_model, + judge_prompt=JUDGE_TEMPLATE, + ) + rubric.add_reward_func(careqa_reward_func, weight=1.0) + + # Environment Construction + vf_env = vf.SingleTurnEnv( + dataset=train_dataset, + eval_dataset=eval_dataset, + system_prompt=system_prompt, rubric=rubric, - system_prompt=None, ) + + return vf_env From f0018abdc2091ef51841930bb60d6d80b4e866f4 Mon Sep 17 00:00:00 2001 From: Arya Hariharan Date: Tue, 21 Oct 2025 21:28:51 +0530 Subject: [PATCH 08/12] resolving comments --- .../careqa_openended/careqa_openended.py | 32 ++++++------------- 1 file changed, 9 insertions(+), 23 deletions(-) diff --git a/environments/careqa_openended/careqa_openended.py b/environments/careqa_openended/careqa_openended.py index 08b0f0c4..ff3d369e 100644 --- a/environments/careqa_openended/careqa_openended.py +++ b/environments/careqa_openended/careqa_openended.py @@ -43,33 +43,19 @@ def _map(ex): ) # Judge Prompt Template - JUDGE_TEMPLATE = """\ -You are a clinical reasoning evaluator tasked with determining whether a model's medical answer -is equivalent in meaning and correctness to the reference (ground truth) answer. - -You will be given: -1. A clinical question. -2. The ground truth answer. -3. A model's predicted answer. - -Judge whether the predicted answer is *medically equivalent* to the ground truth. -Equivalence means that both answers express the same medical reasoning or correct clinical interpretation, -even if the wording differs. - -Guidelines: -- Equivalent if the same diagnosis, reasoning, or recommendation is conveyed. -- Accept synonyms (e.g., “heart attack” vs “myocardial infarction”). -- Ignore trivial stylistic differences or additional context. -- Not equivalent if the model changes the diagnosis, key mechanism, or recommendation. - + JUDGE_TEMPLATE = """You are a clinical fact verifier. +Given: Question: {question} +Reference (ground truth) answer: {answer} +Model’s answer: {response} -Ground Truth Answer: {answer} +Determine if the model’s answer is medically equivalent to the reference. +- Consider medical synonyms and abbreviations equivalent. +- Ignore minor wording differences (e.g., “high blood pressure” ≈ “hypertension”). +- If the model’s answer is more general or specific but still correct, consider it equivalent. -Predicted Answer: {response} +Respond with one word only: "EQUIVALENT" or "NOT_EQUIVALENT". -Is the predicted answer medically equivalent to the ground truth? -Respond strictly with "EQUIVALENT" or "NOT_EQUIVALENT". """.strip() # Judge Client Setup From b5da66a2f512aefb4f16b07bf2e948fc653d79f0 Mon Sep 17 00:00:00 2001 From: Arya Hariharan <84255987+Arya-Hari@users.noreply.github.com> Date: Mon, 27 Oct 2025 20:50:16 +0530 Subject: [PATCH 09/12] Update careqa_openended.py --- environments/careqa_openended/careqa_openended.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/environments/careqa_openended/careqa_openended.py b/environments/careqa_openended/careqa_openended.py index ff3d369e..f96973f5 100644 --- a/environments/careqa_openended/careqa_openended.py +++ b/environments/careqa_openended/careqa_openended.py @@ -37,9 +37,7 @@ def _map(ex): # System Prompt system_prompt = ( - "You are an expert clinician answering open-ended medical questions. " - "Read the question carefully, reason step by step, and provide a precise, " - "clinically sound answer enclosed in tags, followed by your conclusion." + "Instructions: The following text is a medical question. Answer it in the most factual, concise and informative way possible" ) # Judge Prompt Template From 013b0d63fd3aef5986d094b25a7d2b05fc5c9028 Mon Sep 17 00:00:00 2001 From: Arya Hariharan <84255987+Arya-Hari@users.noreply.github.com> Date: Mon, 27 Oct 2025 20:51:57 +0530 Subject: [PATCH 10/12] Update careqa_openended.py --- environments/careqa_openended/careqa_openended.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/careqa_openended/careqa_openended.py b/environments/careqa_openended/careqa_openended.py index f96973f5..fc6d93d2 100644 --- a/environments/careqa_openended/careqa_openended.py +++ b/environments/careqa_openended/careqa_openended.py @@ -37,7 +37,7 @@ def _map(ex): # System Prompt system_prompt = ( - "Instructions: The following text is a medical question. Answer it in the most factual, concise and informative way possible" + "Instructions: The question that will be given to you is a medical question. Answer it in the most factual, concise and informative way possible" ) # Judge Prompt Template From d76f8607ee8a9dcf188852f8eed907470b2b41cf Mon Sep 17 00:00:00 2001 From: Arya Hariharan <84255987+Arya-Hari@users.noreply.github.com> Date: Mon, 27 Oct 2025 20:56:05 +0530 Subject: [PATCH 11/12] Update careqa_openended.py --- environments/careqa_openended/careqa_openended.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/environments/careqa_openended/careqa_openended.py b/environments/careqa_openended/careqa_openended.py index fc6d93d2..ff3d369e 100644 --- a/environments/careqa_openended/careqa_openended.py +++ b/environments/careqa_openended/careqa_openended.py @@ -37,7 +37,9 @@ def _map(ex): # System Prompt system_prompt = ( - "Instructions: The question that will be given to you is a medical question. Answer it in the most factual, concise and informative way possible" + "You are an expert clinician answering open-ended medical questions. " + "Read the question carefully, reason step by step, and provide a precise, " + "clinically sound answer enclosed in tags, followed by your conclusion." ) # Judge Prompt Template From 3682c6087b253acb6ba0bf5922803ac59ca7046e Mon Sep 17 00:00:00 2001 From: Benjamin Warner Date: Fri, 12 Dec 2025 17:45:19 -0500 Subject: [PATCH 12/12] update careqa to use No Free Labels style prompt --- environments/careqa/README.md | 89 +++++++ environments/careqa/careqa.py | 251 ++++++++++++++++++ .../{careqa_mcq => careqa}/pyproject.toml | 14 +- environments/careqa_mcq/README.md | 42 --- environments/careqa_mcq/careqa_mcq.py | 63 ----- environments/careqa_openended/README.md | 42 --- .../careqa_openended/careqa_openended.py | 103 ------- environments/careqa_openended/pyproject.toml | 22 -- 8 files changed, 349 insertions(+), 277 deletions(-) create mode 100644 environments/careqa/README.md create mode 100644 environments/careqa/careqa.py rename environments/{careqa_mcq => careqa}/pyproject.toml (62%) delete mode 100644 environments/careqa_mcq/README.md delete mode 100644 environments/careqa_mcq/careqa_mcq.py delete mode 100644 environments/careqa_openended/README.md delete mode 100644 environments/careqa_openended/careqa_openended.py delete mode 100644 environments/careqa_openended/pyproject.toml diff --git a/environments/careqa/README.md b/environments/careqa/README.md new file mode 100644 index 00000000..65d593e1 --- /dev/null +++ b/environments/careqa/README.md @@ -0,0 +1,89 @@ +# careqa + +Evaluation environment for the [HPAI-BSC/CareQA](https://huggingface.co/datasets/HPAI-BSC/CareQA) dataset. + +### Overview +- **Environment ID**: `careqa` +- **Short description**: CareQA is a healthcare QA dataset with **multiple-choice** and **open-ended clinical reasoning questions**. This environment supports both modes through the `mode` parameter. +- **Tags**: healthcare, medical QA, clinical reasoning, MCQ, single-turn + +### Datasets +- **Primary dataset(s)**: + - `CareQA_en` – multiple-choice clinical questions with 4 options and correct answer labels + - `CareQA_en_open` – open-ended clinical questions with reference answers +- **Source links**: + - [Hugging Face CareQA dataset](https://huggingface.co/datasets/HPAI-BSC/CareQA) + +### Task +- **Type**: single-turn +- **Parser**: + - MCQ mode: `vf.Parser()` or `vf.ThinkParser()` for extracting boxed answers + - Open-ended mode: `XMLParser()` for judge responses +- **Rubric overview**: + - **MCQ mode (`en`)**: `vf.Rubric()` measuring **accuracy** (letter match A–D) + - **Open-ended mode (`open`)**: `vf.JudgeRubric()` using an LLM-as-judge to score free-text answers for correctness and clinical reasoning + +### Quickstart + +**Multiple-choice evaluation:** +```bash +medarc-eval careqa --mode en --model gpt-4.1-mini --num-examples 10 -s +``` + +**Open-ended evaluation:** +```bash +medarc-eval careqa --mode open --model gpt-4.1-mini --num-examples 10 -s +``` + +**With think-mode prompting (MCQ only):** +```bash +medarc-eval careqa --mode en --use-think --model gpt-4.1-mini --num-examples 10 -s +``` + +**With shuffled answer options (MCQ only):** +```bash +medarc-eval careqa --mode en --shuffle-answers --shuffle-seed 42 --model gpt-4.1-mini -n 10 -s +``` + +### Configuration Options + +#### Common Parameters +- `--mode`: Select mode: `en` (multiple-choice) or `open` (open-ended). Default: `open` +- `--split`: Dataset split to use. Default: `test` +- `--system-prompt`: Custom system prompt (uses mode-appropriate default if not specified) + +#### MCQ-Specific Parameters +- `--use-think`: Enable think-style prompting with boxed answers +- `--shuffle-answers`: Randomly shuffle answer options +- `--shuffle-seed`: Seed for answer shuffling (default: 1618) + +#### Open-Ended-Specific Parameters +- `--judge-model`: Model for LLM-as-judge evaluation (default: `gpt-4o-mini`) +- `--judge-base-url`: Base URL for judge API +- `--judge-api-key`: API key for judge (falls back to `OPENAI_API_KEY` env var) + +### Metrics + +#### MCQ Mode +| Metric | Meaning | +|---------------|---------| +| `reward` | Main scalar reward (weighted sum of rubric criteria) | +| `accuracy` | Exact match on target MCQ answer (letter A–D) | + +#### Open-Ended Mode +| Metric | Meaning | +|---------------|---------| +| `reward` | Main scalar reward (weighted sum of rubric criteria) | +| `judge_score` | LLM-assigned score evaluating answer quality, correctness, and clinical reasoning | + +### Example Usage + +```python +import verifiers as vf + +# Load MCQ environment +env_mcq = vf.load_environment("careqa", mode="en", shuffle_answers=True) + +# Load open-ended environment +env_open = vf.load_environment("careqa", mode="open", judge_model="gpt-4o-mini") +``` diff --git a/environments/careqa/careqa.py b/environments/careqa/careqa.py new file mode 100644 index 00000000..e67178af --- /dev/null +++ b/environments/careqa/careqa.py @@ -0,0 +1,251 @@ +import re +from enum import Enum +from typing import Optional + +from datasets import load_dataset +from openai import AsyncOpenAI +import verifiers as vf +from medarc_verifiers.rewards.multiple_choice_accuracy import multiple_choice_accuracy +from medarc_verifiers.utils.randomize_multiple_choice import randomize_multiple_choice +from medarc_verifiers.parsers.xml_parser import XMLParser +from verifiers.types import Info, State +from verifiers.utils.data_utils import extract_boxed_answer, BOXED_SYSTEM_PROMPT + + +class CareQASplit(Enum): + """Mode selector for CareQA environment.""" + + EN = "en" + OPEN = "open" + + +# --- MCQ Helpers --- + + +def _build_mcq_prompt(question: str, options: dict[str, str]) -> str: + """Create an MCQ prompt.""" + formatted_opts = "\n".join(f"{k}. {v}" for k, v in options.items()) + return f"Question: {question}\nChoices:\n{formatted_opts}\nAnswer:" + + +def accuracy(completion, answer: str, parser: vf.Parser, info: dict | None = None, **kwargs) -> float: + """Reward based on shared multiple-choice accuracy grading.""" + parsed = parser.parse_answer(completion) or "" + answer_text = info.get("answer_text", None) if info else None + is_correct = multiple_choice_accuracy(llm_answer=parsed, answer_letter=answer, answer_text=answer_text) + return 1.0 if is_correct else 0.0 + + +# --- Open-Ended Helpers --- + + +JUDGE_TEMPLATE = """You are grading an AI assistant's answer to a medical/science exam questions. + +Input: +- : The exam question. +- : The correct answer. +- : The AI's response to grade. + +Task: Determine if the assistant's answer is correct or incorrect by comparing it to the reference answer and output your grade in ... tags. + +Grading Rules: +- Assume the reference answer is correct and reflects the expected exam solution. +- Focus on factual content and meaning, not style, length, or confidence. + +Correct if the assistant's answer conveys the same essential fact(s) as the reference, including: +- Synonyms, acronyms (expanded or abbreviated), or rephrasing with equivalent meaning +- Slightly more general/specific phrasing that captures the key concept +- Shorter or longer answers that express the tested fact without contradictions +- Additional supporting details that don't contradict the reference + +Incorrect if any of these apply: +- Different main concept, mechanism, structure, or relationship +- Contradicts the reference on key points (wrong organ, drug, effect, process, etc.) +- Contains clearly incorrect information +- Too vague/incomplete to match the reference +- Merely repeats question words without the core information from the reference + +Be strict: clear mismatches on main concepts or incorrect claims = Incorrect. + +{question} +{answer} +{response} + +Briefly explain whether the assistant's answer matches or conflicts with the reference. Then output your grade as: + +[Correct or Incorrect] +""".strip() + + +def extract_answer_section(completion_text: str) -> str: + """Extract final answer after think tags.""" + if not completion_text: + return "" + if "" in completion_text and "" in completion_text: + return re.sub(r".*?", "", completion_text, flags=re.DOTALL).strip() + return completion_text.strip() + + +def load_environment( + split: str | CareQASplit, + system_prompt: Optional[str] = None, + # MCQ-specific options + shuffle_answers: bool = False, + shuffle_seed: int | None = 1618, + # Open-ended specific options + judge_model: str = "gpt-4o-mini", + judge_base_url: str | None = None, + judge_api_key: str | None = None, + **kwargs, +) -> vf.Environment: + """ + CareQA evaluation environment supporting both MCQ and Open-Ended modes. + + Args: + split: CareQASplit.EN for multiple-choice or CareQASplit.OPEN for open-ended QA. + system_prompt: Custom system prompt (uses mode-appropriate default if None). + shuffle_answers: Shuffle MCQ answer options (MCQ mode only). + shuffle_seed: Seed for answer shuffling (MCQ mode only). + judge_model: Model to use for LLM-as-judge evaluation (Open-ended mode only). + judge_base_url: Base URL for judge API (Open-ended mode only). + judge_api_key: API key for judge (Open-ended mode only). + + Returns: + A vf.Environment configured for the selected mode. + """ + split = CareQASplit(split) if isinstance(split, str) else split + if split == CareQASplit.EN: + return _load_mcq_environment( + system_prompt=system_prompt, + shuffle_answers=shuffle_answers, + shuffle_seed=shuffle_seed, + ) + elif split == CareQASplit.OPEN: + return _load_open_ended_environment( + system_prompt=system_prompt, + judge_model=judge_model, + judge_base_url=judge_base_url, + judge_api_key=judge_api_key, + ) + else: + raise ValueError(f"Invalid mode: {split}") + + +def _load_mcq_environment( + system_prompt: Optional[str], + shuffle_answers: bool, + shuffle_seed: int | None, +) -> vf.Environment: + """Load CareQA multiple-choice environment.""" + eval_dataset = load_dataset("HPAI-BSC/CareQA", "CareQA_en", split="test") + + def _map(ex, idx=None): + options = {"A": ex["op1"], "B": ex["op2"], "C": ex["op3"], "D": ex["op4"]} + gold_letter = ["A", "B", "C", "D"][ex["cop"] - 1] + + if shuffle_answers and gold_letter in options: + options, gold_letter, _ = randomize_multiple_choice( + options=options, + answer_choice=gold_letter, + seed=shuffle_seed, + row_id=ex.get("id", idx), + ) + + return { + "question": _build_mcq_prompt(ex["question"], options), + "answer": gold_letter, + "info": { + "answer_text": options.get(gold_letter, None), + **({"options": options} if shuffle_answers else {}), + }, + } + + load_from_cache_file = not shuffle_answers + eval_dataset = eval_dataset.map( + _map, + with_indices=True, + remove_columns=eval_dataset.column_names, + load_from_cache_file=load_from_cache_file, + ) + + parser = vf.Parser(extract_boxed_answer) + final_system_prompt = BOXED_SYSTEM_PROMPT or system_prompt + + rubric = vf.Rubric(funcs=[accuracy], weights=[1.0], parser=parser) + + return vf.SingleTurnEnv( + eval_dataset=eval_dataset, + rubric=rubric, + parser=parser, + system_prompt=final_system_prompt, + ) + + +def _load_open_ended_environment( + system_prompt: Optional[str], + judge_model: str, + judge_base_url: str | None, + judge_api_key: str | None, +) -> vf.Environment: + """Load CareQA open-ended environment with LLM-as-judge evaluation.""" + eval_dataset = load_dataset("HPAI-BSC/CareQA", "CareQA_en_open", split="test") + + def _map(ex): + info = {} + info["question"] = ex["question"].strip() + return { + "question": ex["question"].strip(), + "answer": ex.get("answer_explanation", ex.get("answer", "")), + "task": "careqa_open", + "info": info, + } + + eval_dataset = eval_dataset.map(_map, remove_columns=eval_dataset.column_names) + + final_system_prompt = system_prompt or ( + "Instructions: The following text is a medical question. Answer it in the most factual, concise, and informative way possible." + ) + + # Judge client setup + judge_client = AsyncOpenAI(base_url=judge_base_url, api_key=judge_api_key) + judge_parser = XMLParser(fields=["grade"], answer_field="grade") + + judge_rubric = vf.JudgeRubric( + parser=judge_parser, + judge_client=judge_client, + judge_model=judge_model, + judge_prompt="{question}", + ) + + async def accuracy(judge, prompt, completion, answer, state: State, info: Info) -> float: + """Evaluate medical equivalence using LLM-as-judge.""" + completion_text = completion if isinstance(completion, str) else str(completion) + response = extract_answer_section(completion_text) + + try: + judge_prompt = JUDGE_TEMPLATE.format(question=info.get("question", ""), answer=answer, response=response) + judge_response = await judge_rubric.judge(judge_prompt, "", "", state) + grade = judge_parser.parse_answer(judge_response).strip().lower() + except AttributeError: + judge_response = await judge_rubric.judge(judge_prompt, "", "", state) + grade = judge_parser.parse_answer(judge_response).strip().lower() + + info.setdefault("judge_feedback", []).append( + { + "grade": grade, + "raw_judge": str(judge_response), + } + ) + + if "correct" in grade and "incorrect" not in grade: + return 1.0 + else: + return 0.0 + + judge_rubric.add_reward_func(accuracy, weight=1.0) + + return vf.SingleTurnEnv( + eval_dataset=eval_dataset, + system_prompt=final_system_prompt, + rubric=judge_rubric, + ) diff --git a/environments/careqa_mcq/pyproject.toml b/environments/careqa/pyproject.toml similarity index 62% rename from environments/careqa_mcq/pyproject.toml rename to environments/careqa/pyproject.toml index 8fbbc8cc..a875b5ff 100644 --- a/environments/careqa_mcq/pyproject.toml +++ b/environments/careqa/pyproject.toml @@ -1,16 +1,17 @@ [project] -name = "careqa_mcq" +name = "careqa" description = "Evaluation environment for the HPAI-BSC/CareQA MCQ dataset" -tags = ["healthcare", "medical-qa", "mcq", "clinical", "single-turn"] +tags = ["healthcare", "medical-qa", "mcq", "clinical", "single-turn", "open-ended"] version = "0.1.0" requires-python = ">=3.11" dependencies = [ "verifiers>=0.1.4", - "datasets>=2.13.0" + "datasets>=2.13.0", + "medarc_verifiers>=0.1.0", ] [tool.prime.environment] -loader = "careqa_mcq:load_environment" +loader = "careqa:load_environment" display_name = "CareQA" visibility = "PUBLIC" @@ -19,4 +20,7 @@ requires = ["hatchling"] build-backend = "hatchling.build" [tool.hatch.build] -include = ["careqa_mcq.py"] \ No newline at end of file +include = ["careqa.py"] + +[tool.uv.sources] +medarc_verifiers = { git = "https://github.com/MedARC-AI/med-lm-envs" } \ No newline at end of file diff --git a/environments/careqa_mcq/README.md b/environments/careqa_mcq/README.md deleted file mode 100644 index 9f455715..00000000 --- a/environments/careqa_mcq/README.md +++ /dev/null @@ -1,42 +0,0 @@ -# careqa - -Evaluation environment for the [HPAI-BSC/CareQA](https://huggingface.co/datasets/HPAI-BSC/CareQA) multiple-choice dataset. - -### Overview -- **Environment ID**: `careqa_mcq` -- **Short description**: CareQA is a healthcare QA dataset with **multiple-choice** and **open-ended clinical reasoning questions**. This environment is for the MCQs only. -- **Tags**: healthcare, medical QA, clinical reasoning, MCQ, single-turn - -### Datasets -- **Primary dataset(s)**: - - `CareQA_en` – multiple-choice clinical questions with 4 options and correct answer labels. -- **Source links**: - - [Hugging Face CareQA dataset](https://huggingface.co/datasets/HPAI-BSC/CareQA) - -### Task -- **Type**: single-turn -- **Parser**: custom prompt mapping (no structured markup) -- **Rubric overview**: -**MCQ (`closed_mcq`)**: `vf.Rubric()` measuring **accuracy** (letter match). - -### Quickstart -Run an evaluation with default settings: - -```bash -uv run vf-eval careqa_mcq -``` - -Configure model and sampling: - -```bash -uv run vf-eval careqa_mcq --model gpt-4.1-mini --num-examples 3 -s -``` - -### Metrics - -| Metric | Meaning | -|---------------|---------| -| `reward` | Main scalar reward (weighted sum of rubric criteria) | -| `accuracy` | Exact match on target MCQ answer (letter A–D) | - - diff --git a/environments/careqa_mcq/careqa_mcq.py b/environments/careqa_mcq/careqa_mcq.py deleted file mode 100644 index 16bcce88..00000000 --- a/environments/careqa_mcq/careqa_mcq.py +++ /dev/null @@ -1,63 +0,0 @@ -from __future__ import annotations -from typing import Any, Optional -from datasets import load_dataset -import verifiers as vf -from verifiers.utils.data_utils import ( - extract_boxed_answer, - BOXED_SYSTEM_PROMPT, - THINK_BOXED_SYSTEM_PROMPT, -) - -# Prompt Construction - -def _build_prompt(question: str, options: dict[str, str]) -> str: - """Create an MCQ prompt.""" - formatted_opts = "\n".join(f"{k}. {v}" for k, v in options.items()) - return f"Question:{question}\nChoices:{formatted_opts}\nAnswer:" - -def exact_match(parser: vf.Parser, completion: str, answer: str, **kwargs) -> float: - """Reward exact matches.""" - response = parser.parse_answer(completion).strip().upper() - return 1.0 if response == answer.strip().upper() else 0.0 - -# Main Environment - -def load_environment( - split: str = "test", - use_think: bool = False, - system_prompt: Optional[str] = None - ) -> vf.Environment: - - """ - CareQA multiple-choice evaluation environment. - Uses vf.SingleTurnEnv + MCQ accuracy rubric. - """ - ds = load_dataset("HPAI-BSC/CareQA",'CareQA_en', split=split) - - def _map(ex): - options = {"A": ex["op1"], "B": ex["op2"], "C": ex["op3"], "D": ex["op4"]} - gold_letter = ["A", "B", "C", "D"][ex["cop"] - 1] - return { - "prompt": [ - { - "role": "user", - "content": _build_prompt(ex["question"], options) - } - ], - "answer": gold_letter, - } - - mapped = ds.map(_map, remove_columns=ds.column_names) - - parser = vf.ThinkParser(extract_boxed_answer) if use_think else vf.Parser(extract_boxed_answer) - system_prompt = system_prompt or (THINK_BOXED_SYSTEM_PROMPT if use_think else BOXED_SYSTEM_PROMPT) - - rubric = vf.Rubric(funcs=[exact_match], weights=[1.0], parser=parser) - - return vf.SingleTurnEnv( - dataset=mapped, - eval_dataset=mapped, - rubric=rubric, - parser = parser, - system_prompt=system_prompt, - ) diff --git a/environments/careqa_openended/README.md b/environments/careqa_openended/README.md deleted file mode 100644 index f08d0250..00000000 --- a/environments/careqa_openended/README.md +++ /dev/null @@ -1,42 +0,0 @@ -# careqa_openended - -Evaluation environment for the [HPAI-BSC/CareQA](https://huggingface.co/datasets/HPAI-BSC/CareQA) openended dataset. - -### Overview -- **Environment ID**: `careqa_openended` -- **Short description**: CareQA is a healthcare QA dataset with **multiple-choice** and **open-ended clinical reasoning questions**. This environment is for the open-ended questions only. -- **Tags**: healthcare, medical QA, clinical reasoning, single-turn - -### Datasets -- **Primary dataset(s)**: - - `CareQA_en_open` – open-ended clinical questions with reference answers. -- **Source links**: - - [Hugging Face CareQA dataset](https://huggingface.co/datasets/HPAI-BSC/CareQA) - -### Task -- **Type**: single-turn -- **Parser**: custom prompt mapping (no structured markup) -- **Rubric overview**: -**Open-ended (`open_clinical`)**: `vf.JudgeRubric()` using an LLM-as-judge to score free-text answers for correctness and clinical reasoning. - -### Quickstart -Run an evaluation with default settings: - -```bash -uv run vf-eval careqa -``` - -Configure model and sampling: - -```bash -uv run vf-eval careqa_openended --model gpt-4.1-mini --num-examples 3 -s -``` - -### Metrics - -| Metric | Meaning | -|---------------|---------| -| `reward` | Main scalar reward (weighted sum of rubric criteria) | -| `judge_score` | For open-ended questions, LLM-assigned score evaluating answer quality, correctness, and clinical reasoning | - - diff --git a/environments/careqa_openended/careqa_openended.py b/environments/careqa_openended/careqa_openended.py deleted file mode 100644 index ff3d369e..00000000 --- a/environments/careqa_openended/careqa_openended.py +++ /dev/null @@ -1,103 +0,0 @@ -from __future__ import annotations -import os -import re -from datasets import load_dataset -from openai import AsyncOpenAI -import verifiers as vf - - -def load_environment( - judge_model: str = "gpt-4o-mini", - judge_base_url: str | None = None, - judge_api_key: str | None = None, -) -> vf.Environment: - """ - CareQA Open-Ended QA environment using LLM-as-a-Judge evaluation. - - This environment loads the open-ended subset of the CareQA dataset and - uses an LLM judge to assess whether a model's response matches or aligns - medically with the reference answer. - """ - - # --- Load Dataset --- - ds = load_dataset("HPAI-BSC/CareQA", "CareQA_en_open") - train_dataset = ds["train"] if "train" in ds else None - eval_dataset = ds["test"] - - def _map(ex): - return { - "question": ex["question"].strip(), - "answer": ex.get("answer_explanation", ex.get("answer", "")), - "task": "careqa_open", - } - - if train_dataset: - train_dataset = train_dataset.map(_map, remove_columns=train_dataset.column_names) - eval_dataset = eval_dataset.map(_map, remove_columns=eval_dataset.column_names) - - # System Prompt - system_prompt = ( - "You are an expert clinician answering open-ended medical questions. " - "Read the question carefully, reason step by step, and provide a precise, " - "clinically sound answer enclosed in tags, followed by your conclusion." - ) - - # Judge Prompt Template - JUDGE_TEMPLATE = """You are a clinical fact verifier. -Given: -Question: {question} -Reference (ground truth) answer: {answer} -Model’s answer: {response} - -Determine if the model’s answer is medically equivalent to the reference. -- Consider medical synonyms and abbreviations equivalent. -- Ignore minor wording differences (e.g., “high blood pressure” ≈ “hypertension”). -- If the model’s answer is more general or specific but still correct, consider it equivalent. - -Respond with one word only: "EQUIVALENT" or "NOT_EQUIVALENT". - -""".strip() - - # Judge Client Setup - api_key = judge_api_key or os.getenv("OPENAI_API_KEY") - judge_client = AsyncOpenAI(base_url=judge_base_url, api_key=api_key) if api_key else None - - # Reward Extraction - def extract_answer_section(completion_text: str) -> str: - """Extract final answer after think tags.""" - if not completion_text: - return "" - if "" in completion_text and "" in completion_text: - return re.sub(r".*?", "", completion_text, flags=re.DOTALL).strip() - return completion_text.strip() - - async def careqa_reward_func(judge, prompt, completion, answer, state, **kwargs) -> float: - """Evaluate medical equivalence using LLM-as-judge.""" - completion_text = completion if isinstance(completion, str) else str(completion) - response = extract_answer_section(completion_text) - - judge_response = await judge(prompt, response, answer, state, **kwargs) - decision = judge_response.strip().upper() - - if "EQUIVALENT" in decision and "NOT_EQUIVALENT" not in decision: - return 1.0 - else: - return 0.0 - - # Judge Rubric - rubric = vf.JudgeRubric( - judge_client=judge_client, - judge_model=judge_model, - judge_prompt=JUDGE_TEMPLATE, - ) - rubric.add_reward_func(careqa_reward_func, weight=1.0) - - # Environment Construction - vf_env = vf.SingleTurnEnv( - dataset=train_dataset, - eval_dataset=eval_dataset, - system_prompt=system_prompt, - rubric=rubric, - ) - - return vf_env diff --git a/environments/careqa_openended/pyproject.toml b/environments/careqa_openended/pyproject.toml deleted file mode 100644 index 80e4e765..00000000 --- a/environments/careqa_openended/pyproject.toml +++ /dev/null @@ -1,22 +0,0 @@ -[project] -name = "careqa_openended" -description = "Evaluation environment for the HPAI-BSC/CareQA open-ended dataset" -tags = ["healthcare", "medical-qa", "open-ended", "clinical", "single-turn"] -version = "0.1.0" -requires-python = ">=3.11" -dependencies = [ - "verifiers>=0.1.4", - "datasets>=2.13.0" -] - -[tool.prime.environment] -loader = "careqa_openended:load_environment" -display_name = "CareQA" -visibility = "PUBLIC" - -[build-system] -requires = ["hatchling"] -build-backend = "hatchling.build" - -[tool.hatch.build] -include = ["careqa_openended.py"] \ No newline at end of file