From 8201912eca67e71d4dc99bb4e4090263228335e5 Mon Sep 17 00:00:00 2001
From: Arya Hariharan <aryahariharan08@gmail.com>
Date: Sat, 11 Oct 2025 13:08:10 +0530
Subject: [PATCH 01/12] Add careqa mcq eval environment

---
 environments/careqa_mcq/README.md      | 60 ++++++++++++++++++++
 environments/careqa_mcq/careqa_mcq.py  | 78 ++++++++++++++++++++++++++
 environments/careqa_mcq/pyproject.toml | 22 ++++++++
 3 files changed, 160 insertions(+)
 create mode 100644 environments/careqa_mcq/README.md
 create mode 100644 environments/careqa_mcq/careqa_mcq.py
 create mode 100644 environments/careqa_mcq/pyproject.toml

diff --git a/environments/careqa_mcq/README.md b/environments/careqa_mcq/README.md
new file mode 100644
index 00000000..4d936e88
--- /dev/null
+++ b/environments/careqa_mcq/README.md
@@ -0,0 +1,60 @@
+# careqa
+
+Evaluation environment for the [HPAI-BSC/CareQA](https://huggingface.co/datasets/HPAI-BSC/CareQA) multiple-choice dataset.
+
+---
+
+### Overview
+- **Environment ID**: `careqa`  
+- **Short description**: CareQA is a healthcare QA dataset with **multiple-choice** and **open-ended clinical reasoning questions**. This environment is for the MCQs only.  
+- **Tags**: healthcare, medical QA, clinical reasoning, MCQ, open-ended, single-turn
+
+### Datasets
+- **Primary dataset(s)**:  
+  - `CareQA_en` – multiple-choice clinical questions with 4 options and correct answer labels.  
+- **Source links**:  
+  - [Hugging Face CareQA dataset](https://huggingface.co/datasets/HPAI-BSC/CareQA)
+
+### Task
+- **Type**: single-turn  
+- **Parser**: custom prompt mapping (no structured markup)  
+- **Rubric overview**:  
+**MCQ (`closed_mcq`)**: `vf.Rubric()` measuring **accuracy** (letter match).  
+
+---
+
+### Quickstart
+Run an evaluation with default settings:
+
+```bash
+uv run vf-eval careqa
+```
+
+Configure model and sampling:
+
+```bash
+uv run vf-eval careqa     -m gpt-4.1-mini     -n 20 -r 3 -t 1024 -T 0.7     -a '{"max_examples": 50}'
+```
+
+Notes:  
+- Use `-a` / `--env-args` to pass environment-specific configuration as a JSON object.  
+
+
+### Environment Arguments
+
+| Arg            | Type | Default | Description |
+|----------------|------|---------|-------------|
+| `max_examples` | int  | `-1`    | Maximum number of examples to evaluate; use `-1` for full dataset |
+| `split`        | str  | `"test"` | Dataset split to use: `train`, `validation`, or `test` |
+| `verbose`      | bool | `False` | Print prompt/answer samples during evaluation |
+
+---
+
+### Metrics
+
+| Metric        | Meaning |
+|---------------|---------|
+| `reward`      | Main scalar reward (weighted sum of rubric criteria) |
+| `accuracy`    | Exact match on target MCQ answer (letter A–D) |
+
+
diff --git a/environments/careqa_mcq/careqa_mcq.py b/environments/careqa_mcq/careqa_mcq.py
new file mode 100644
index 00000000..b670fcd8
--- /dev/null
+++ b/environments/careqa_mcq/careqa_mcq.py
@@ -0,0 +1,78 @@
+from __future__ import annotations
+from typing import Any, Optional
+from datasets import load_dataset
+import verifiers as vf
+
+
+# Helper Functions
+
+def _get_text_from_completion(completion: Any) -> str:
+    """Extract plain text from completion."""
+    if isinstance(completion, str):
+        return completion.strip()
+    if isinstance(completion, list) and completion:
+        last = completion[-1]
+        if isinstance(last, dict):
+            return str(last.get("content", "")).strip()
+        return str(last).strip()
+    return str(completion).strip()
+
+
+def _first_letter(text: str) -> Optional[str]:
+    """Extract the first uppercase A–Z letter."""
+    for ch in (text or "").upper():
+        if "A" <= ch <= "Z":
+            return ch
+    return None
+
+# Prompt Construction
+
+def _build_prompt(question: str, options: dict[str, str]) -> str:
+    """Create a polished clinical MCQ prompt."""
+    formatted_opts = "\n".join(f"{k}. {v}" for k, v in options.items())
+    letters = ", ".join(options.keys())
+    return (
+        "You are a board-certified clinician taking a medical reasoning test.\n"
+        "Read the following question carefully and choose the most appropriate answer.\n\n"
+        f"Question:\n{question.strip()}\n\n"
+        f"Options:\n{formatted_opts}\n\n"
+        f"Respond with only the option letter ({letters}), nothing else."
+    )
+
+# Main Environment
+
+def load_environment(split: str = "test") -> vf.Environment:
+    """
+    CareQA multiple-choice evaluation environment.
+    Uses vf.SingleTurnEnv + MCQ accuracy rubric.
+    """
+    ds = load_dataset("HPAI-BSC/CareQA",'CareQA_en', split=split)
+
+    def _map(ex):
+        options = {"A": ex["op1"], "B": ex["op2"], "C": ex["op3"], "D": ex["op4"]}
+        gold_letter = ["A", "B", "C", "D"][ex["cop"] - 1] 
+        # The key change is here: format the single prompt string as a list of dicts (ChatML format)
+        return {
+            "prompt": [
+                {
+                    "role": "user", 
+                    "content": _build_prompt(ex["question"], options)
+                }
+            ],
+            "answer": gold_letter,
+        }
+
+    mapped = ds.map(_map, remove_columns=ds.column_names)
+
+    def mcq_accuracy(completion, answer):
+        pred = _first_letter(_get_text_from_completion(completion))
+        return 1.0 if pred == str(answer).upper() else 0.0
+
+    rubric = vf.Rubric(funcs=[mcq_accuracy], weights=[1.0])
+
+    return vf.SingleTurnEnv(
+        dataset=mapped,
+        eval_dataset=mapped,
+        rubric=rubric,
+        system_prompt=None,
+    )
diff --git a/environments/careqa_mcq/pyproject.toml b/environments/careqa_mcq/pyproject.toml
new file mode 100644
index 00000000..fa298d5a
--- /dev/null
+++ b/environments/careqa_mcq/pyproject.toml
@@ -0,0 +1,22 @@
+[project]
+name = "careqa_mcq"
+description = "Evaluation environment for the HPAI-BSC/CareQA MCQ dataset"
+tags = ["healthcare", "medical-qa", "mcq", "open-ended", "clinical", "single-turn"]
+version = "0.1.0"
+requires-python = ">=3.11"
+dependencies = [
+    "verifiers>=0.1.4",
+    "datasets>=2.13.0"
+]
+
+[tool.prime.environment]
+loader = "careqa_mcq:load_environment"
+display_name = "CareQA"
+visibility = "PUBLIC"
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.hatch.build]
+include = ["careqa_mcq.py"]
\ No newline at end of file

From a19ba978e0f76ea7d2d140999b22de49b1e94102 Mon Sep 17 00:00:00 2001
From: Arya Hariharan <aryahariharan08@gmail.com>
Date: Sat, 11 Oct 2025 16:19:31 +0530
Subject: [PATCH 02/12] add careqa open-ended env

---
 environments/careqa_mcq/README.md             | 15 ++----
 environments/careqa_mcq/careqa_mcq.py         |  2 +-
 environments/careqa_mcq/pyproject.toml        |  2 +-
 environments/careqa_openended/README.md       | 51 +++++++++++++++++++
 .../careqa_openended/careqa_openended.py      | 48 +++++++++++++++++
 environments/careqa_openended/pyproject.toml  | 22 ++++++++
 6 files changed, 127 insertions(+), 13 deletions(-)
 create mode 100644 environments/careqa_openended/README.md
 create mode 100644 environments/careqa_openended/careqa_openended.py
 create mode 100644 environments/careqa_openended/pyproject.toml

diff --git a/environments/careqa_mcq/README.md b/environments/careqa_mcq/README.md
index 4d936e88..89dc1c04 100644
--- a/environments/careqa_mcq/README.md
+++ b/environments/careqa_mcq/README.md
@@ -2,12 +2,10 @@
 
 Evaluation environment for the [HPAI-BSC/CareQA](https://huggingface.co/datasets/HPAI-BSC/CareQA) multiple-choice dataset.
 
----
-
 ### Overview
-- **Environment ID**: `careqa`  
+- **Environment ID**: `careqa_mcq`  
 - **Short description**: CareQA is a healthcare QA dataset with **multiple-choice** and **open-ended clinical reasoning questions**. This environment is for the MCQs only.  
-- **Tags**: healthcare, medical QA, clinical reasoning, MCQ, open-ended, single-turn
+- **Tags**: healthcare, medical QA, clinical reasoning, MCQ, single-turn
 
 ### Datasets
 - **Primary dataset(s)**:  
@@ -21,8 +19,6 @@ Evaluation environment for the [HPAI-BSC/CareQA](https://huggingface.co/datasets
 - **Rubric overview**:  
 **MCQ (`closed_mcq`)**: `vf.Rubric()` measuring **accuracy** (letter match).  
 
----
-
 ### Quickstart
 Run an evaluation with default settings:
 
@@ -33,11 +29,8 @@ uv run vf-eval careqa
 Configure model and sampling:
 
 ```bash
-uv run vf-eval careqa     -m gpt-4.1-mini     -n 20 -r 3 -t 1024 -T 0.7     -a '{"max_examples": 50}'
-```
-
-Notes:  
-- Use `-a` / `--env-args` to pass environment-specific configuration as a JSON object.  
+uv run vf-eval careqa_mcq --model gpt-4.1-mini --num-examples 3 -s
+``` 
 
 
 ### Environment Arguments
diff --git a/environments/careqa_mcq/careqa_mcq.py b/environments/careqa_mcq/careqa_mcq.py
index b670fcd8..6f62367d 100644
--- a/environments/careqa_mcq/careqa_mcq.py
+++ b/environments/careqa_mcq/careqa_mcq.py
@@ -28,7 +28,7 @@ def _first_letter(text: str) -> Optional[str]:
 # Prompt Construction
 
 def _build_prompt(question: str, options: dict[str, str]) -> str:
-    """Create a polished clinical MCQ prompt."""
+    """Create an MCQ prompt."""
     formatted_opts = "\n".join(f"{k}. {v}" for k, v in options.items())
     letters = ", ".join(options.keys())
     return (
diff --git a/environments/careqa_mcq/pyproject.toml b/environments/careqa_mcq/pyproject.toml
index fa298d5a..8fbbc8cc 100644
--- a/environments/careqa_mcq/pyproject.toml
+++ b/environments/careqa_mcq/pyproject.toml
@@ -1,7 +1,7 @@
 [project]
 name = "careqa_mcq"
 description = "Evaluation environment for the HPAI-BSC/CareQA MCQ dataset"
-tags = ["healthcare", "medical-qa", "mcq", "open-ended", "clinical", "single-turn"]
+tags = ["healthcare", "medical-qa", "mcq", "clinical", "single-turn"]
 version = "0.1.0"
 requires-python = ">=3.11"
 dependencies = [
diff --git a/environments/careqa_openended/README.md b/environments/careqa_openended/README.md
new file mode 100644
index 00000000..ffa0f78a
--- /dev/null
+++ b/environments/careqa_openended/README.md
@@ -0,0 +1,51 @@
+# careqa-openended
+
+> Replace the placeholders below, then remove this callout.
+
+### Overview
+- **Environment ID**: `careqa-openended`
+- **Short description**: <one-sentence description>
+- **Tags**: <comma-separated tags>
+
+### Datasets
+- **Primary dataset(s)**: <name(s) and brief description>
+- **Source links**: <links>
+- **Split sizes**: <train/eval counts>
+
+### Task
+- **Type**: <single-turn | multi-turn | tool use>
+- **Parser**: <e.g., ThinkParser, XMLParser, custom>
+- **Rubric overview**: <briefly list reward functions and key metrics>
+
+### Quickstart
+Run an evaluation with default settings:
+
+```bash
+uv run vf-eval careqa-openended
+```
+
+Configure model and sampling:
+
+```bash
+uv run vf-eval careqa-openended   -m gpt-4.1-mini   -n 20 -r 3 -t 1024 -T 0.7   -a '{"key": "value"}'  # env-specific args as JSON
+```
+
+Notes:
+- Use `-a` / `--env-args` to pass environment-specific configuration as a JSON object.
+
+### Environment Arguments
+Document any supported environment arguments and their meaning. Example:
+
+| Arg | Type | Default | Description |
+| --- | ---- | ------- | ----------- |
+| `foo` | str | `"bar"` | What this controls |
+| `max_examples` | int | `-1` | Limit on dataset size (use -1 for all) |
+
+### Metrics
+Summarize key metrics your rubric emits and how they�re interpreted.
+
+| Metric | Meaning |
+| ------ | ------- |
+| `reward` | Main scalar reward (weighted sum of criteria) |
+| `accuracy` | Exact match on target answer |
+
diff --git a/environments/careqa_openended/careqa_openended.py b/environments/careqa_openended/careqa_openended.py
new file mode 100644
index 00000000..a09aa731
--- /dev/null
+++ b/environments/careqa_openended/careqa_openended.py
@@ -0,0 +1,48 @@
+from __future__ import annotations
+from typing import Any, Optional
+from datasets import load_dataset
+import verifiers as vf
+
+# Prompt Construction
+
+def _build_open_prompt(question: str) -> str:
+    """Create an open-ended clinical QA prompt."""
+    return (
+        "You are an expert clinician answering medical questions.\n"
+        "Read the following question carefully and provide a detailed, concise answer.\n\n"
+        f"Question:\n{question.strip()}\n\n"
+        "Answer:"
+    )
+    
+# Load Open-Ended Environment
+
+def load_environment(split: str = "test") -> vf.SingleTurnEnv:
+    ds = load_dataset("HPAI-BSC/CareQA", 'CareQA_en_open', split=split)
+
+    def _map(ex):
+        system_content = "You are an expert clinician answering medical questions."
+
+        user_content = (
+            "Read the following question carefully and provide a detailed, concise answer.\n\n"
+            f"Question:\n{ex['question'].strip()}\n\n"
+            "Answer:"
+        )
+
+        return {
+            "prompt": [
+                {"role": "system", "content": system_content},
+                {"role": "user", "content": user_content},
+            ],
+            "answer": ex.get("answer_explanation", ex.get("answer", "")),
+        }
+
+    mapped = ds.map(_map, remove_columns=ds.column_names)
+
+    rubric = vf.JudgeRubric()
+
+    return vf.SingleTurnEnv(
+        dataset=mapped,
+        eval_dataset=mapped,
+        rubric=rubric,
+        system_prompt=None,
+    )
diff --git a/environments/careqa_openended/pyproject.toml b/environments/careqa_openended/pyproject.toml
new file mode 100644
index 00000000..80e4e765
--- /dev/null
+++ b/environments/careqa_openended/pyproject.toml
@@ -0,0 +1,22 @@
+[project]
+name = "careqa_openended"
+description = "Evaluation environment for the HPAI-BSC/CareQA open-ended dataset"
+tags = ["healthcare", "medical-qa", "open-ended", "clinical", "single-turn"]
+version = "0.1.0"
+requires-python = ">=3.11"
+dependencies = [
+    "verifiers>=0.1.4",
+    "datasets>=2.13.0"
+]
+
+[tool.prime.environment]
+loader = "careqa_openended:load_environment"
+display_name = "CareQA"
+visibility = "PUBLIC"
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.hatch.build]
+include = ["careqa_openended.py"]
\ No newline at end of file

From ec93a0fb012c55e085ab8cd76f214d37eb83f1bf Mon Sep 17 00:00:00 2001
From: Arya Hariharan <aryahariharan08@gmail.com>
Date: Sat, 11 Oct 2025 16:22:55 +0530
Subject: [PATCH 03/12] add careqa open-ended env

---
 environments/careqa_mcq/README.md       | 11 ------
 environments/careqa_openended/README.md | 51 ++++++++++---------------
 2 files changed, 21 insertions(+), 41 deletions(-)

diff --git a/environments/careqa_mcq/README.md b/environments/careqa_mcq/README.md
index 89dc1c04..2797a8dd 100644
--- a/environments/careqa_mcq/README.md
+++ b/environments/careqa_mcq/README.md
@@ -32,17 +32,6 @@ Configure model and sampling:
 uv run vf-eval careqa_mcq --model gpt-4.1-mini --num-examples 3 -s
 ``` 
 
-
-### Environment Arguments
-
-| Arg            | Type | Default | Description |
-|----------------|------|---------|-------------|
-| `max_examples` | int  | `-1`    | Maximum number of examples to evaluate; use `-1` for full dataset |
-| `split`        | str  | `"test"` | Dataset split to use: `train`, `validation`, or `test` |
-| `verbose`      | bool | `False` | Print prompt/answer samples during evaluation |
-
----
-
 ### Metrics
 
 | Metric        | Meaning |
diff --git a/environments/careqa_openended/README.md b/environments/careqa_openended/README.md
index ffa0f78a..f08d0250 100644
--- a/environments/careqa_openended/README.md
+++ b/environments/careqa_openended/README.md
@@ -1,51 +1,42 @@
-# careqa-openended
+# careqa_openended
 
-> Replace the placeholders below, then remove this callout.
+Evaluation environment for the [HPAI-BSC/CareQA](https://huggingface.co/datasets/HPAI-BSC/CareQA) openended dataset.
 
 ### Overview
-- **Environment ID**: `careqa-openended`
-- **Short description**: <one-sentence description>
-- **Tags**: <comma-separated tags>
+- **Environment ID**: `careqa_openended`  
+- **Short description**: CareQA is a healthcare QA dataset with **multiple-choice** and **open-ended clinical reasoning questions**. This environment is for the open-ended questions only.  
+- **Tags**: healthcare, medical QA, clinical reasoning, single-turn
 
 ### Datasets
-- **Primary dataset(s)**: <name(s) and brief description>
-- **Source links**: <links>
-- **Split sizes**: <train/eval counts>
+- **Primary dataset(s)**:  
+  - `CareQA_en_open` – open-ended clinical questions with reference answers.
+- **Source links**:  
+  - [Hugging Face CareQA dataset](https://huggingface.co/datasets/HPAI-BSC/CareQA)
 
 ### Task
-- **Type**: <single-turn | multi-turn | tool use>
-- **Parser**: <e.g., ThinkParser, XMLParser, custom>
-- **Rubric overview**: <briefly list reward functions and key metrics>
+- **Type**: single-turn  
+- **Parser**: custom prompt mapping (no structured markup)  
+- **Rubric overview**:  
+**Open-ended (`open_clinical`)**: `vf.JudgeRubric()` using an LLM-as-judge to score free-text answers for correctness and clinical reasoning. 
 
 ### Quickstart
 Run an evaluation with default settings:
 
 ```bash
-uv run vf-eval careqa-openended
+uv run vf-eval careqa_openended
 ```
 
 Configure model and sampling:
 
 ```bash
-uv run vf-eval careqa-openended   -m gpt-4.1-mini   -n 20 -r 3 -t 1024 -T 0.7   -a '{"key": "value"}'  # env-specific args as JSON
-```
-
-Notes:
-- Use `-a` / `--env-args` to pass environment-specific configuration as a JSON object.
-
-### Environment Arguments
-Document any supported environment arguments and their meaning. Example:
-
-| Arg | Type | Default | Description |
-| --- | ---- | ------- | ----------- |
-| `foo` | str | `"bar"` | What this controls |
-| `max_examples` | int | `-1` | Limit on dataset size (use -1 for all) |
+uv run vf-eval careqa_openended --model gpt-4.1-mini --num-examples 3 -s
+``` 
 
 ### Metrics
-Summarize key metrics your rubric emits and how they�re interpreted.
 
-| Metric | Meaning |
-| ------ | ------- |
-| `reward` | Main scalar reward (weighted sum of criteria) |
-| `accuracy` | Exact match on target answer |
+| Metric        | Meaning |
+|---------------|---------|
+| `reward`      | Main scalar reward (weighted sum of rubric criteria) |
+|  `judge_score` | For open-ended questions, LLM-assigned score evaluating answer quality, correctness, and clinical reasoning |
+
 

From 61eed341585e41da02fb782573f4de9666f8d6bf Mon Sep 17 00:00:00 2001
From: Arya Hariharan <aryahariharan08@gmail.com>
Date: Sat, 11 Oct 2025 16:26:34 +0530
Subject: [PATCH 04/12] resolving issues

---
 environments/careqa_openended/careqa_openended.py | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/environments/careqa_openended/careqa_openended.py b/environments/careqa_openended/careqa_openended.py
index a09aa731..6e560f3d 100644
--- a/environments/careqa_openended/careqa_openended.py
+++ b/environments/careqa_openended/careqa_openended.py
@@ -2,17 +2,6 @@
 from typing import Any, Optional
 from datasets import load_dataset
 import verifiers as vf
-
-# Prompt Construction
-
-def _build_open_prompt(question: str) -> str:
-    """Create an open-ended clinical QA prompt."""
-    return (
-        "You are an expert clinician answering medical questions.\n"
-        "Read the following question carefully and provide a detailed, concise answer.\n\n"
-        f"Question:\n{question.strip()}\n\n"
-        "Answer:"
-    )
     
 # Load Open-Ended Environment
 

From f311014e26dec64a2e54e84cea36a15799a26cf3 Mon Sep 17 00:00:00 2001
From: Arya Hariharan <aryahariharan08@gmail.com>
Date: Sat, 11 Oct 2025 16:27:14 +0530
Subject: [PATCH 05/12] removing redundant imports

---
 environments/careqa_openended/careqa_openended.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/environments/careqa_openended/careqa_openended.py b/environments/careqa_openended/careqa_openended.py
index 6e560f3d..a69125ee 100644
--- a/environments/careqa_openended/careqa_openended.py
+++ b/environments/careqa_openended/careqa_openended.py
@@ -1,5 +1,4 @@
 from __future__ import annotations
-from typing import Any, Optional
 from datasets import load_dataset
 import verifiers as vf
     

From f9f321a9e94a511cb7cfd61b4f5148b06168bae7 Mon Sep 17 00:00:00 2001
From: Arya Hariharan <aryahariharan08@gmail.com>
Date: Fri, 17 Oct 2025 13:18:33 +0530
Subject: [PATCH 06/12] resolving comments

---
 environments/careqa_mcq/README.md     |  2 +-
 environments/careqa_mcq/careqa_mcq.py | 61 ++++++++++-----------------
 2 files changed, 24 insertions(+), 39 deletions(-)

diff --git a/environments/careqa_mcq/README.md b/environments/careqa_mcq/README.md
index 2797a8dd..9f455715 100644
--- a/environments/careqa_mcq/README.md
+++ b/environments/careqa_mcq/README.md
@@ -23,7 +23,7 @@ Evaluation environment for the [HPAI-BSC/CareQA](https://huggingface.co/datasets
 Run an evaluation with default settings:
 
 ```bash
-uv run vf-eval careqa
+uv run vf-eval careqa_mcq
 ```
 
 Configure model and sampling:
diff --git a/environments/careqa_mcq/careqa_mcq.py b/environments/careqa_mcq/careqa_mcq.py
index 6f62367d..fb10d9fe 100644
--- a/environments/careqa_mcq/careqa_mcq.py
+++ b/environments/careqa_mcq/careqa_mcq.py
@@ -2,46 +2,32 @@
 from typing import Any, Optional
 from datasets import load_dataset
 import verifiers as vf
-
-
-# Helper Functions
-
-def _get_text_from_completion(completion: Any) -> str:
-    """Extract plain text from completion."""
-    if isinstance(completion, str):
-        return completion.strip()
-    if isinstance(completion, list) and completion:
-        last = completion[-1]
-        if isinstance(last, dict):
-            return str(last.get("content", "")).strip()
-        return str(last).strip()
-    return str(completion).strip()
-
-
-def _first_letter(text: str) -> Optional[str]:
-    """Extract the first uppercase A–Z letter."""
-    for ch in (text or "").upper():
-        if "A" <= ch <= "Z":
-            return ch
-    return None
+from verifiers.utils.data_utils import (
+    extract_boxed_answer,
+    BOXED_SYSTEM_PROMPT,
+    THINK_BOXED_SYSTEM_PROMPT,
+)
 
 # Prompt Construction
 
 def _build_prompt(question: str, options: dict[str, str]) -> str:
     """Create an MCQ prompt."""
     formatted_opts = "\n".join(f"{k}. {v}" for k, v in options.items())
-    letters = ", ".join(options.keys())
-    return (
-        "You are a board-certified clinician taking a medical reasoning test.\n"
-        "Read the following question carefully and choose the most appropriate answer.\n\n"
-        f"Question:\n{question.strip()}\n\n"
-        f"Options:\n{formatted_opts}\n\n"
-        f"Respond with only the option letter ({letters}), nothing else."
-    )
+    return f"Question:{question}\n{formatted_opts}\nAnswer:"
+    
+def exact_match(parser: vf.Parser, completion: str, answer: str, **kwargs) -> float:
+    """Reward exact matches."""
+    response = (parser.parse_answer(completion) or "").strip().upper()  # parse_answer may return None; score it as a miss
+    return 1.0 if response == answer.strip().upper() else 0.0
 
 # Main Environment
 
-def load_environment(split: str = "test") -> vf.Environment:
+def load_environment(
+    split: str = "test",
+    use_think: bool = False,
+    system_prompt: Optional[str] = None
+    ) -> vf.Environment:
+    
     """
     CareQA multiple-choice evaluation environment.
     Uses vf.SingleTurnEnv + MCQ accuracy rubric.
@@ -51,7 +37,6 @@ def load_environment(split: str = "test") -> vf.Environment:
     def _map(ex):
         options = {"A": ex["op1"], "B": ex["op2"], "C": ex["op3"], "D": ex["op4"]}
         gold_letter = ["A", "B", "C", "D"][ex["cop"] - 1] 
-        # The key change is here: format the single prompt string as a list of dicts (ChatML format)
         return {
             "prompt": [
                 {
@@ -63,16 +48,16 @@ def _map(ex):
         }
 
     mapped = ds.map(_map, remove_columns=ds.column_names)
+    
+    parser = vf.ThinkParser(extract_boxed_answer) if use_think else vf.Parser(extract_boxed_answer)
+    system_prompt = system_prompt or (THINK_BOXED_SYSTEM_PROMPT if use_think else BOXED_SYSTEM_PROMPT)
 
-    def mcq_accuracy(completion, answer):
-        pred = _first_letter(_get_text_from_completion(completion))
-        return 1.0 if pred == str(answer).upper() else 0.0
-
-    rubric = vf.Rubric(funcs=[mcq_accuracy], weights=[1.0])
+    rubric = vf.Rubric(funcs=[exact_match], weights=[1.0], parser=parser)
 
     return vf.SingleTurnEnv(
         dataset=mapped,
         eval_dataset=mapped,
         rubric=rubric,
-        system_prompt=None,
+        parser = parser,
+        system_prompt=system_prompt,
     )

From 8a04ba90491ea6a99480fd7d09549fb8e8384560 Mon Sep 17 00:00:00 2001
From: Arya Hariharan <aryahariharan08@gmail.com>
Date: Sat, 18 Oct 2025 11:41:13 +0530
Subject: [PATCH 07/12] resolving commits

---
 environments/careqa_mcq/careqa_mcq.py         |   2 +-
 .../careqa_openended/careqa_openended.py      | 123 +++++++++++++++---
 2 files changed, 103 insertions(+), 22 deletions(-)

diff --git a/environments/careqa_mcq/careqa_mcq.py b/environments/careqa_mcq/careqa_mcq.py
index fb10d9fe..16bcce88 100644
--- a/environments/careqa_mcq/careqa_mcq.py
+++ b/environments/careqa_mcq/careqa_mcq.py
@@ -13,7 +13,7 @@
 def _build_prompt(question: str, options: dict[str, str]) -> str:
     """Create an MCQ prompt."""
     formatted_opts = "\n".join(f"{k}. {v}" for k, v in options.items())
-    return f"Question:{question}\n{formatted_opts}\nAnswer:"
+    return f"Question:{question}\nChoices:{formatted_opts}\nAnswer:"
     
 def exact_match(parser: vf.Parser, completion: str, answer: str, **kwargs) -> float:
     """Reward exact matches."""
diff --git a/environments/careqa_openended/careqa_openended.py b/environments/careqa_openended/careqa_openended.py
index a69125ee..08b0f0c4 100644
--- a/environments/careqa_openended/careqa_openended.py
+++ b/environments/careqa_openended/careqa_openended.py
@@ -1,36 +1,117 @@
 from __future__ import annotations
+import os
+import re
 from datasets import load_dataset
+from openai import AsyncOpenAI
 import verifiers as vf
-    
-# Load Open-Ended Environment
 
-def load_environment(split: str = "test") -> vf.SingleTurnEnv:
-    ds = load_dataset("HPAI-BSC/CareQA", 'CareQA_en_open', split=split)
 
-    def _map(ex):
-        system_content = "You are an expert clinician answering medical questions."
+def load_environment(
+    judge_model: str = "gpt-4o-mini",
+    judge_base_url: str | None = None,
+    judge_api_key: str | None = None,
+) -> vf.Environment:
+    """
+    CareQA Open-Ended QA environment using LLM-as-a-Judge evaluation.
+    
+    This environment loads the open-ended subset of the CareQA dataset and
+    uses an LLM judge to assess whether a model's response matches or aligns
+    medically with the reference answer.
+    """
 
-        user_content = (
-            "Read the following question carefully and provide a detailed, concise answer.\n\n"
-            f"Question:\n{ex['question'].strip()}\n\n"
-            "Answer:"
-        )
+    # --- Load Dataset ---
+    ds = load_dataset("HPAI-BSC/CareQA", "CareQA_en_open")
+    train_dataset = ds["train"] if "train" in ds else None
+    eval_dataset = ds["test"]
 
+    def _map(ex):
         return {
-            "prompt": [
-                {"role": "system", "content": system_content},
-                {"role": "user", "content": user_content},
-            ],
+            "question": ex["question"].strip(),
             "answer": ex.get("answer_explanation", ex.get("answer", "")),
+            "task": "careqa_open",
         }
 
-    mapped = ds.map(_map, remove_columns=ds.column_names)
+    if train_dataset:
+        train_dataset = train_dataset.map(_map, remove_columns=train_dataset.column_names)
+    eval_dataset = eval_dataset.map(_map, remove_columns=eval_dataset.column_names)
+
+    # System Prompt
+    system_prompt = (
+        "You are an expert clinician answering open-ended medical questions. "
+        "Read the question carefully, reason step by step, and provide a precise, "
+        "clinically sound answer enclosed in <think></think> tags, followed by your conclusion."
+    )
+
+    # Judge Prompt Template
+    JUDGE_TEMPLATE = """\
+You are a clinical reasoning evaluator tasked with determining whether a model's medical answer
+is equivalent in meaning and correctness to the reference (ground truth) answer.
+
+You will be given:
+1. A clinical question.
+2. The ground truth answer.
+3. A model's predicted answer.
+
+Judge whether the predicted answer is *medically equivalent* to the ground truth. 
+Equivalence means that both answers express the same medical reasoning or correct clinical interpretation,
+even if the wording differs.
+
+Guidelines:
+- Equivalent if the same diagnosis, reasoning, or recommendation is conveyed.
+- Accept synonyms (e.g., “heart attack” vs “myocardial infarction”).
+- Ignore trivial stylistic differences or additional context.
+- Not equivalent if the model changes the diagnosis, key mechanism, or recommendation.
+
+Question: {question}
 
-    rubric = vf.JudgeRubric()
+Ground Truth Answer: {answer}
 
-    return vf.SingleTurnEnv(
-        dataset=mapped,
-        eval_dataset=mapped,
+Predicted Answer: {response}
+
+Is the predicted answer medically equivalent to the ground truth?
+Respond strictly with "EQUIVALENT" or "NOT_EQUIVALENT".
+""".strip()
+
+    # Judge Client Setup
+    api_key = judge_api_key or os.getenv("OPENAI_API_KEY")
+    judge_client = AsyncOpenAI(base_url=judge_base_url, api_key=api_key) if api_key else None
+
+    # Reward Extraction
+    def extract_answer_section(completion_text: str) -> str:
+        """Extract final answer after think tags."""
+        if not completion_text:
+            return ""
+        if "<think>" in completion_text and "</think>" in completion_text:
+            return re.sub(r".*?</think>", "", completion_text, flags=re.DOTALL).strip()
+        return completion_text.strip()
+
+    async def careqa_reward_func(judge, prompt, completion, answer, state, **kwargs) -> float:
+        """Evaluate medical equivalence using LLM-as-judge."""
+        completion_text = completion if isinstance(completion, str) else str(completion)
+        response = extract_answer_section(completion_text)
+
+        judge_response = await judge(prompt, response, answer, state, **kwargs)
+        decision = judge_response.strip().upper()
+
+        if "EQUIVALENT" in decision and "NOT_EQUIVALENT" not in decision:
+            return 1.0
+        else:
+            return 0.0
+
+    # Judge Rubric
+    rubric = vf.JudgeRubric(
+        judge_client=judge_client,
+        judge_model=judge_model,
+        judge_prompt=JUDGE_TEMPLATE,
+    )
+    rubric.add_reward_func(careqa_reward_func, weight=1.0)
+
+    # Environment Construction
+    vf_env = vf.SingleTurnEnv(
+        dataset=train_dataset,
+        eval_dataset=eval_dataset,
+        system_prompt=system_prompt,
         rubric=rubric,
-        system_prompt=None,
     )
+
+    return vf_env

From f0018abdc2091ef51841930bb60d6d80b4e866f4 Mon Sep 17 00:00:00 2001
From: Arya Hariharan <aryahariharan08@gmail.com>
Date: Tue, 21 Oct 2025 21:28:51 +0530
Subject: [PATCH 08/12] resolving comments

---
 .../careqa_openended/careqa_openended.py      | 32 ++++++-------------
 1 file changed, 9 insertions(+), 23 deletions(-)

diff --git a/environments/careqa_openended/careqa_openended.py b/environments/careqa_openended/careqa_openended.py
index 08b0f0c4..ff3d369e 100644
--- a/environments/careqa_openended/careqa_openended.py
+++ b/environments/careqa_openended/careqa_openended.py
@@ -43,33 +43,19 @@ def _map(ex):
     )
 
     # Judge Prompt Template
-    JUDGE_TEMPLATE = """\
-You are a clinical reasoning evaluator tasked with determining whether a model's medical answer
-is equivalent in meaning and correctness to the reference (ground truth) answer.
-
-You will be given:
-1. A clinical question.
-2. The ground truth answer.
-3. A model's predicted answer.
-
-Judge whether the predicted answer is *medically equivalent* to the ground truth. 
-Equivalence means that both answers express the same medical reasoning or correct clinical interpretation,
-even if the wording differs.
-
-Guidelines:
-- Equivalent if the same diagnosis, reasoning, or recommendation is conveyed.
-- Accept synonyms (e.g., “heart attack” vs “myocardial infarction”).
-- Ignore trivial stylistic differences or additional context.
-- Not equivalent if the model changes the diagnosis, key mechanism, or recommendation.
-
+    JUDGE_TEMPLATE = """You are a clinical fact verifier.
+Given:
 Question: {question}
+Reference (ground truth) answer: {answer}
+Model’s answer: {response}
 
-Ground Truth Answer: {answer}
+Determine if the model’s answer is medically equivalent to the reference.
+- Consider medical synonyms and abbreviations equivalent.
+- Ignore minor wording differences (e.g., “high blood pressure” ≈ “hypertension”).
+- If the model’s answer is more general or specific but still correct, consider it equivalent.
 
-Predicted Answer: {response}
+Respond with one word only: "EQUIVALENT" or "NOT_EQUIVALENT".
 
-Is the predicted answer medically equivalent to the ground truth?
-Respond strictly with "EQUIVALENT" or "NOT_EQUIVALENT".
 """.strip()
 
     # Judge Client Setup

From b5da66a2f512aefb4f16b07bf2e948fc653d79f0 Mon Sep 17 00:00:00 2001
From: Arya Hariharan <84255987+Arya-Hari@users.noreply.github.com>
Date: Mon, 27 Oct 2025 20:50:16 +0530
Subject: [PATCH 09/12] Update careqa_openended.py

---
 environments/careqa_openended/careqa_openended.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/environments/careqa_openended/careqa_openended.py b/environments/careqa_openended/careqa_openended.py
index ff3d369e..f96973f5 100644
--- a/environments/careqa_openended/careqa_openended.py
+++ b/environments/careqa_openended/careqa_openended.py
@@ -37,9 +37,7 @@ def _map(ex):
 
     # System Prompt
     system_prompt = (
-        "You are an expert clinician answering open-ended medical questions. "
-        "Read the question carefully, reason step by step, and provide a precise, "
-        "clinically sound answer enclosed in <think></think> tags, followed by your conclusion."
+        "Instructions: The following text is a medical question. Answer it in the most factual, concise and informative way possible"
     )
 
     # Judge Prompt Template

From 013b0d63fd3aef5986d094b25a7d2b05fc5c9028 Mon Sep 17 00:00:00 2001
From: Arya Hariharan <84255987+Arya-Hari@users.noreply.github.com>
Date: Mon, 27 Oct 2025 20:51:57 +0530
Subject: [PATCH 10/12] Update careqa_openended.py

---
 environments/careqa_openended/careqa_openended.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/environments/careqa_openended/careqa_openended.py b/environments/careqa_openended/careqa_openended.py
index f96973f5..fc6d93d2 100644
--- a/environments/careqa_openended/careqa_openended.py
+++ b/environments/careqa_openended/careqa_openended.py
@@ -37,7 +37,7 @@ def _map(ex):
 
     # System Prompt
     system_prompt = (
-        "Instructions: The following text is a medical question. Answer it in the most factual, concise and informative way possible"
+        "Instructions: The question that will be given to you is a medical question. Answer it in the most factual, concise and informative way possible"
     )
 
     # Judge Prompt Template

From d76f8607ee8a9dcf188852f8eed907470b2b41cf Mon Sep 17 00:00:00 2001
From: Arya Hariharan <84255987+Arya-Hari@users.noreply.github.com>
Date: Mon, 27 Oct 2025 20:56:05 +0530
Subject: [PATCH 11/12] Update careqa_openended.py

---
 environments/careqa_openended/careqa_openended.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/environments/careqa_openended/careqa_openended.py b/environments/careqa_openended/careqa_openended.py
index fc6d93d2..ff3d369e 100644
--- a/environments/careqa_openended/careqa_openended.py
+++ b/environments/careqa_openended/careqa_openended.py
@@ -37,7 +37,9 @@ def _map(ex):
 
     # System Prompt
     system_prompt = (
-        "Instructions: The question that will be given to you is a medical question. Answer it in the most factual, concise and informative way possible"
+        "You are an expert clinician answering open-ended medical questions. "
+        "Read the question carefully, reason step by step, and provide a precise, "
+        "clinically sound answer enclosed in <think></think> tags, followed by your conclusion."
     )
 
     # Judge Prompt Template

From 3682c6087b253acb6ba0bf5922803ac59ca7046e Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Fri, 12 Dec 2025 17:45:19 -0500
Subject: [PATCH 12/12] update careqa to use No Free Labels style prompt

---
 environments/careqa/README.md                 |  89 +++++++
 environments/careqa/careqa.py                 | 251 ++++++++++++++++++
 .../{careqa_mcq => careqa}/pyproject.toml     |  14 +-
 environments/careqa_mcq/README.md             |  42 ---
 environments/careqa_mcq/careqa_mcq.py         |  63 -----
 environments/careqa_openended/README.md       |  42 ---
 .../careqa_openended/careqa_openended.py      | 103 -------
 environments/careqa_openended/pyproject.toml  |  22 --
 8 files changed, 349 insertions(+), 277 deletions(-)
 create mode 100644 environments/careqa/README.md
 create mode 100644 environments/careqa/careqa.py
 rename environments/{careqa_mcq => careqa}/pyproject.toml (62%)
 delete mode 100644 environments/careqa_mcq/README.md
 delete mode 100644 environments/careqa_mcq/careqa_mcq.py
 delete mode 100644 environments/careqa_openended/README.md
 delete mode 100644 environments/careqa_openended/careqa_openended.py
 delete mode 100644 environments/careqa_openended/pyproject.toml

diff --git a/environments/careqa/README.md b/environments/careqa/README.md
new file mode 100644
index 00000000..65d593e1
--- /dev/null
+++ b/environments/careqa/README.md
@@ -0,0 +1,89 @@
+# careqa
+
+Evaluation environment for the [HPAI-BSC/CareQA](https://huggingface.co/datasets/HPAI-BSC/CareQA) dataset.
+
+### Overview
+- **Environment ID**: `careqa`  
+- **Short description**: CareQA is a healthcare QA dataset with **multiple-choice** and **open-ended clinical reasoning questions**. This environment supports both modes through the `mode` parameter.  
+- **Tags**: healthcare, medical QA, clinical reasoning, MCQ, single-turn
+
+### Datasets
+- **Primary dataset(s)**:
+  - `CareQA_en` – multiple-choice clinical questions with 4 options and correct answer labels
+  - `CareQA_en_open` – open-ended clinical questions with reference answers
+- **Source links**:
+  - [Hugging Face CareQA dataset](https://huggingface.co/datasets/HPAI-BSC/CareQA)
+
+### Task
+- **Type**: single-turn
+- **Parser**:
+  - MCQ mode: `vf.Parser(extract_boxed_answer)` for extracting boxed answers
+  - Open-ended mode: `XMLParser()` for judge responses
+- **Rubric overview**:
+  - **MCQ mode (`en`)**: `vf.Rubric()` measuring **accuracy** (letter match A–D)
+  - **Open-ended mode (`open`)**: `vf.JudgeRubric()` using an LLM-as-judge to score free-text answers for correctness and clinical reasoning
+
+### Quickstart
+
+**Multiple-choice evaluation:**
+```bash
+medarc-eval careqa --split en --model gpt-4.1-mini --num-examples 10 -s
+```
+
+**Open-ended evaluation:**
+```bash
+medarc-eval careqa --split open --model gpt-4.1-mini --num-examples 10 -s
+```
+
+**With think-mode prompting (MCQ only):**
+```bash
+medarc-eval careqa --split en --use-think --model gpt-4.1-mini --num-examples 10 -s
+```
+
+**With shuffled answer options (MCQ only):**
+```bash
+medarc-eval careqa --split en --shuffle-answers --shuffle-seed 42 --model gpt-4.1-mini -n 10 -s
+```
+
+### Configuration Options
+
+#### Common Parameters
+- `--split`: Select mode: `en` (multiple-choice) or `open` (open-ended). Required; `load_environment` defines no default.
+- The underlying Hugging Face dataset split is not configurable: both modes always load the `test` split.
+- `--system-prompt`: Custom system prompt (uses mode-appropriate default if not specified)
+
+#### MCQ-Specific Parameters
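+- `--use-think`: Not implemented by `careqa.py` in this version: `load_environment` has no `use_think` parameter and ignores unknown keyword arguments, so MCQ mode always uses `vf.Parser(extract_boxed_answer)`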
+- `--shuffle-answers`: Randomly shuffle answer options
+- `--shuffle-seed`: Seed for answer shuffling (default: 1618)
+
+#### Open-Ended-Specific Parameters
+- `--judge-model`: Model for LLM-as-judge evaluation (default: `gpt-4o-mini`)
+- `--judge-base-url`: Base URL for judge API
+- `--judge-api-key`: API key for judge (falls back to `OPENAI_API_KEY` env var)
+
+### Metrics
+
+#### MCQ Mode
+| Metric        | Meaning |
+|---------------|---------|
+| `reward`      | Main scalar reward (weighted sum of rubric criteria) |
+| `accuracy`    | Exact match on target MCQ answer (letter A–D) |
+
+#### Open-Ended Mode
+| Metric        | Meaning |
+|---------------|---------|
+| `reward`      | Main scalar reward (weighted sum of rubric criteria) |
+| `accuracy`    | 1.0 when the LLM judge grades the answer `Correct`, otherwise 0.0 |
+
+### Example Usage
+
+```python
+import verifiers as vf
+
+# Load MCQ environment (the selector argument is named `split`)
+env_mcq = vf.load_environment("careqa", split="en", shuffle_answers=True)
+
+# Load open-ended environment
+env_open = vf.load_environment("careqa", split="open", judge_model="gpt-4o-mini")
+```
diff --git a/environments/careqa/careqa.py b/environments/careqa/careqa.py
new file mode 100644
index 00000000..e67178af
--- /dev/null
+++ b/environments/careqa/careqa.py
@@ -0,0 +1,251 @@
+import re
+from enum import Enum
+from typing import Optional
+
+from datasets import load_dataset
+from openai import AsyncOpenAI
+import verifiers as vf
+from medarc_verifiers.rewards.multiple_choice_accuracy import multiple_choice_accuracy
+from medarc_verifiers.utils.randomize_multiple_choice import randomize_multiple_choice
+from medarc_verifiers.parsers.xml_parser import XMLParser
+from verifiers.types import Info, State
+from verifiers.utils.data_utils import extract_boxed_answer, BOXED_SYSTEM_PROMPT
+
+
+class CareQASplit(Enum):
+    """Selects which CareQA subset ``load_environment`` builds: multiple-choice or open-ended."""
+
+    EN = "en"  # multiple-choice questions (loaded from the ``CareQA_en`` config)
+    OPEN = "open"  # open-ended questions (loaded from the ``CareQA_en_open`` config)
+
+
+# --- MCQ Helpers ---
+
+
+def _build_mcq_prompt(question: str, options: dict[str, str]) -> str:
+    """Render the question, one ``<key>. <text>`` line per option (in dict order), and a closing ``Answer:`` cue."""
+    choice_lines = [f"{label}. {text}" for label, text in options.items()]
+    return f"Question: {question}\nChoices:\n" + "\n".join(choice_lines) + "\nAnswer:"
+
+
+def accuracy(completion, answer: str, parser: vf.Parser, info: dict | None = None, **kwargs) -> float:
+    """Return 1.0 if the shared multiple-choice grader accepts the parsed completion for ``answer``, else 0.0."""
+    predicted = parser.parse_answer(completion) or ""
+    gold_text = info.get("answer_text") if info else None
+    matched = multiple_choice_accuracy(llm_answer=predicted, answer_letter=answer, answer_text=gold_text)
+    return float(bool(matched))
+
+
+# --- Open-Ended Helpers ---
+
+# Judge prompt, filled via str.format with {question}, {answer} and {response}; the verdict is read from <grade> tags.
+JUDGE_TEMPLATE = """You are grading an AI assistant's answer to a medical/science exam questions.
+
+Input:
+- <question>: The exam question.
+- <reference_answer>: The correct answer.
+- <assistant_answer>: The AI's response to grade.
+
+Task: Determine if the assistant's answer is correct or incorrect by comparing it to the reference answer and output your grade in <grade>...</grade> tags.
+
+Grading Rules:
+- Assume the reference answer is correct and reflects the expected exam solution.
+- Focus on factual content and meaning, not style, length, or confidence.
+
+Correct if the assistant's answer conveys the same essential fact(s) as the reference, including:
+- Synonyms, acronyms (expanded or abbreviated), or rephrasing with equivalent meaning
+- Slightly more general/specific phrasing that captures the key concept
+- Shorter or longer answers that express the tested fact without contradictions
+- Additional supporting details that don't contradict the reference
+
+Incorrect if any of these apply:
+- Different main concept, mechanism, structure, or relationship
+- Contradicts the reference on key points (wrong organ, drug, effect, process, etc.)
+- Contains clearly incorrect information
+- Too vague/incomplete to match the reference
+- Merely repeats question words without the core information from the reference
+
+Be strict: clear mismatches on main concepts or incorrect claims = Incorrect.
+
+<question>{question}</question>
+<reference_answer>{answer}</reference_answer>
+<assistant_answer>{response}</assistant_answer>
+
+Briefly explain whether the assistant's answer matches or conflicts with the reference. Then output your grade as:
+
+<grade>[Correct or Incorrect]</grade>
+""".strip()
+
+
+def extract_answer_section(completion_text: str) -> str:
+    """Return the stripped text, dropping everything up to each ``</think>`` when both think tags occur."""
+    if not completion_text:
+        return ""
+    has_reasoning = all(tag in completion_text for tag in ("<think>", "</think>"))
+    visible = re.sub(r".*?</think>", "", completion_text, flags=re.DOTALL) if has_reasoning else completion_text
+    return visible.strip()
+
+
+def load_environment(
+    split: str | CareQASplit,
+    system_prompt: Optional[str] = None,
+    # Used only by the multiple-choice loader
+    shuffle_answers: bool = False,
+    shuffle_seed: int | None = 1618,
+    # Used only by the open-ended loader
+    judge_model: str = "gpt-4o-mini",
+    judge_base_url: str | None = None,
+    judge_api_key: str | None = None,
+    **kwargs,
+) -> vf.Environment:
+    """
+    Build the CareQA environment for either the multiple-choice or the open-ended subset.
+
+    Args:
+        split: ``"en"`` / ``CareQASplit.EN`` for multiple-choice, ``"open"`` / ``CareQASplit.OPEN`` for open-ended.
+        system_prompt: Optional system prompt forwarded to the selected loader.
+        shuffle_answers: Whether to shuffle the answer options (multiple-choice only).
+        shuffle_seed: Seed used when shuffling the options (multiple-choice only).
+        judge_model: Name of the judge model (open-ended only).
+        judge_base_url: Base URL of the judge endpoint (open-ended only).
+        judge_api_key: API key for the judge endpoint (open-ended only).
+        **kwargs: Accepted so extra keyword arguments do not fail the call; not used by this function.
+    Returns: the ``vf.Environment`` produced by the loader matching ``split``.
+    Raises: ``ValueError`` when ``split`` does not name a ``CareQASplit`` member.
+    """
+    # Strings are coerced to the enum; anything else is compared as given.
+    mode = split if not isinstance(split, str) else CareQASplit(split)
+    if mode == CareQASplit.OPEN:
+        return _load_open_ended_environment(
+            system_prompt=system_prompt,
+            judge_model=judge_model,
+            judge_base_url=judge_base_url,
+            judge_api_key=judge_api_key,
+        )
+    if mode == CareQASplit.EN:
+        return _load_mcq_environment(
+            system_prompt=system_prompt,
+            shuffle_answers=shuffle_answers,
+            shuffle_seed=shuffle_seed,
+        )
+    raise ValueError(f"Invalid mode: {mode}")
+
+
+def _load_mcq_environment(
+    system_prompt: Optional[str],
+    shuffle_answers: bool,
+    shuffle_seed: int | None,
+) -> vf.Environment:
+    """Build the multiple-choice env from the ``CareQA_en`` test split; ``system_prompt`` overrides the boxed default."""
+    eval_dataset = load_dataset("HPAI-BSC/CareQA", "CareQA_en", split="test")
+
+    def _map(ex, idx=None):
+        options = {"A": ex["op1"], "B": ex["op2"], "C": ex["op3"], "D": ex["op4"]}
+        gold_letter = ["A", "B", "C", "D"][ex["cop"] - 1]  # ``cop`` is used as a 1-based option index
+        # Optionally permute the options; the helper hands back the new options and the new gold letter.
+        if shuffle_answers and gold_letter in options:
+            options, gold_letter, _ = randomize_multiple_choice(
+                options=options,
+                answer_choice=gold_letter,
+                seed=shuffle_seed,
+                row_id=ex.get("id", idx),
+            )
+
+        return {
+            "question": _build_mcq_prompt(ex["question"], options),
+            "answer": gold_letter,
+            "info": {
+                "answer_text": options.get(gold_letter, None),
+                **({"options": options} if shuffle_answers else {}),
+            },
+        }
+    # Skip the datasets map cache when shuffling (presumably so a new seed is not served a stale mapping).
+    load_from_cache_file = not shuffle_answers
+    eval_dataset = eval_dataset.map(
+        _map,
+        with_indices=True,
+        remove_columns=eval_dataset.column_names,
+        load_from_cache_file=load_from_cache_file,
+    )
+
+    parser = vf.Parser(extract_boxed_answer)
+    final_system_prompt = system_prompt or BOXED_SYSTEM_PROMPT  # caller override first, boxed default otherwise
+
+    rubric = vf.Rubric(funcs=[accuracy], weights=[1.0], parser=parser)
+
+    return vf.SingleTurnEnv(
+        eval_dataset=eval_dataset,
+        rubric=rubric,
+        parser=parser,
+        system_prompt=final_system_prompt,
+    )
+
+
+def _load_open_ended_environment(
+    system_prompt: Optional[str],
+    judge_model: str,
+    judge_base_url: str | None,
+    judge_api_key: str | None,
+) -> vf.Environment:
+    """Build the open-ended env from the ``CareQA_en_open`` test split, scored by an LLM judge via ``JUDGE_TEMPLATE``."""
+    eval_dataset = load_dataset("HPAI-BSC/CareQA", "CareQA_en_open", split="test")
+
+    def _map(ex):
+        info = {}
+        info["question"] = ex["question"].strip()  # read back by the reward function to fill the judge prompt
+        return {
+            "question": ex["question"].strip(),
+            "answer": ex.get("answer_explanation", ex.get("answer", "")),
+            "task": "careqa_open",
+            "info": info,
+        }
+
+    eval_dataset = eval_dataset.map(_map, remove_columns=eval_dataset.column_names)
+
+    final_system_prompt = system_prompt or (
+        "Instructions: The following text is a medical question. Answer it in the most factual, concise, and informative way possible."
+    )
+
+    # Judge client setup
+    judge_client = AsyncOpenAI(base_url=judge_base_url, api_key=judge_api_key)
+    judge_parser = XMLParser(fields=["grade"], answer_field="grade")
+
+    judge_rubric = vf.JudgeRubric(
+        parser=judge_parser,
+        judge_client=judge_client,
+        judge_model=judge_model,
+        judge_prompt="{question}",
+    )
+
+    async def accuracy(judge, prompt, completion, answer, state: State, info: Info) -> float:
+        """Return 1.0 when the judge's ``<grade>`` is Correct, else 0.0; the verdict is also appended to ``info``."""
+        completion_text = completion if isinstance(completion, str) else str(completion)
+        response = extract_answer_section(completion_text)
+
+        # Format the full grading prompt here; the rubric's own template is the bare "{question}" placeholder,
+        # so the rubric is expected to forward this text to the judge as-is.
+        judge_prompt = JUDGE_TEMPLATE.format(question=info.get("question", ""), answer=answer, response=response)
+        judge_response = await judge_rubric.judge(judge_prompt, "", "", state)
+        # parse_answer may yield None when the reply has no <grade> tag (TODO confirm); fall back to "" so the
+        # answer is scored as incorrect instead of raising AttributeError on ``None.strip()``.
+        grade = (judge_parser.parse_answer(judge_response) or "").strip().lower()
+
+        info.setdefault("judge_feedback", []).append(
+            {
+                "grade": grade,
+                "raw_judge": str(judge_response),
+            }
+        )
+
+        if "correct" in grade and "incorrect" not in grade:
+            return 1.0
+        else:
+            return 0.0
+
+    judge_rubric.add_reward_func(accuracy, weight=1.0)
+
+    return vf.SingleTurnEnv(
+        eval_dataset=eval_dataset,
+        system_prompt=final_system_prompt,
+        rubric=judge_rubric,
+    )
diff --git a/environments/careqa_mcq/pyproject.toml b/environments/careqa/pyproject.toml
similarity index 62%
rename from environments/careqa_mcq/pyproject.toml
rename to environments/careqa/pyproject.toml
index 8fbbc8cc..a875b5ff 100644
--- a/environments/careqa_mcq/pyproject.toml
+++ b/environments/careqa/pyproject.toml
@@ -1,16 +1,17 @@
 [project]
-name = "careqa_mcq"
+name = "careqa"
 description = "Evaluation environment for the HPAI-BSC/CareQA MCQ dataset"
-tags = ["healthcare", "medical-qa", "mcq", "clinical", "single-turn"]
+tags = ["healthcare", "medical-qa", "mcq", "clinical", "single-turn", "open-ended"]
 version = "0.1.0"
 requires-python = ">=3.11"
 dependencies = [
     "verifiers>=0.1.4",
-    "datasets>=2.13.0"
+    "datasets>=2.13.0",
+    "medarc_verifiers>=0.1.0",
 ]
 
 [tool.prime.environment]
-loader = "careqa_mcq:load_environment"
+loader = "careqa:load_environment"
 display_name = "CareQA"
 visibility = "PUBLIC"
 
@@ -19,4 +20,7 @@ requires = ["hatchling"]
 build-backend = "hatchling.build"
 
 [tool.hatch.build]
-include = ["careqa_mcq.py"]
\ No newline at end of file
+include = ["careqa.py"]
+
+[tool.uv.sources]
+medarc_verifiers = { git = "https://github.com/MedARC-AI/med-lm-envs" }
\ No newline at end of file
diff --git a/environments/careqa_mcq/README.md b/environments/careqa_mcq/README.md
deleted file mode 100644
index 9f455715..00000000
--- a/environments/careqa_mcq/README.md
+++ /dev/null
@@ -1,42 +0,0 @@
-# careqa
-
-Evaluation environment for the [HPAI-BSC/CareQA](https://huggingface.co/datasets/HPAI-BSC/CareQA) multiple-choice dataset.
-
-### Overview
-- **Environment ID**: `careqa_mcq`  
-- **Short description**: CareQA is a healthcare QA dataset with **multiple-choice** and **open-ended clinical reasoning questions**. This environment is for the MCQs only.  
-- **Tags**: healthcare, medical QA, clinical reasoning, MCQ, single-turn
-
-### Datasets
-- **Primary dataset(s)**:  
-  - `CareQA_en` – multiple-choice clinical questions with 4 options and correct answer labels.  
-- **Source links**:  
-  - [Hugging Face CareQA dataset](https://huggingface.co/datasets/HPAI-BSC/CareQA)
-
-### Task
-- **Type**: single-turn  
-- **Parser**: custom prompt mapping (no structured markup)  
-- **Rubric overview**:  
-**MCQ (`closed_mcq`)**: `vf.Rubric()` measuring **accuracy** (letter match).  
-
-### Quickstart
-Run an evaluation with default settings:
-
-```bash
-uv run vf-eval careqa_mcq
-```
-
-Configure model and sampling:
-
-```bash
-uv run vf-eval careqa_mcq --model gpt-4.1-mini --num-examples 3 -s
-``` 
-
-### Metrics
-
-| Metric        | Meaning |
-|---------------|---------|
-| `reward`      | Main scalar reward (weighted sum of rubric criteria) |
-| `accuracy`    | Exact match on target MCQ answer (letter A–D) |
-
-
diff --git a/environments/careqa_mcq/careqa_mcq.py b/environments/careqa_mcq/careqa_mcq.py
deleted file mode 100644
index 16bcce88..00000000
--- a/environments/careqa_mcq/careqa_mcq.py
+++ /dev/null
@@ -1,63 +0,0 @@
-from __future__ import annotations
-from typing import Any, Optional
-from datasets import load_dataset
-import verifiers as vf
-from verifiers.utils.data_utils import (
-    extract_boxed_answer,
-    BOXED_SYSTEM_PROMPT,
-    THINK_BOXED_SYSTEM_PROMPT,
-)
-
-# Prompt Construction
-
-def _build_prompt(question: str, options: dict[str, str]) -> str:
-    """Create an MCQ prompt."""
-    formatted_opts = "\n".join(f"{k}. {v}" for k, v in options.items())
-    return f"Question:{question}\nChoices:{formatted_opts}\nAnswer:"
-    
-def exact_match(parser: vf.Parser, completion: str, answer: str, **kwargs) -> float:
-    """Reward exact matches."""
-    response = parser.parse_answer(completion).strip().upper()
-    return 1.0 if response == answer.strip().upper() else 0.0
-
-# Main Environment
-
-def load_environment(
-    split: str = "test",
-    use_think: bool = False,
-    system_prompt: Optional[str] = None
-    ) -> vf.Environment:
-    
-    """
-    CareQA multiple-choice evaluation environment.
-    Uses vf.SingleTurnEnv + MCQ accuracy rubric.
-    """
-    ds = load_dataset("HPAI-BSC/CareQA",'CareQA_en', split=split)
-
-    def _map(ex):
-        options = {"A": ex["op1"], "B": ex["op2"], "C": ex["op3"], "D": ex["op4"]}
-        gold_letter = ["A", "B", "C", "D"][ex["cop"] - 1] 
-        return {
-            "prompt": [
-                {
-                    "role": "user", 
-                    "content": _build_prompt(ex["question"], options)
-                }
-            ],
-            "answer": gold_letter,
-        }
-
-    mapped = ds.map(_map, remove_columns=ds.column_names)
-    
-    parser = vf.ThinkParser(extract_boxed_answer) if use_think else vf.Parser(extract_boxed_answer)
-    system_prompt = system_prompt or (THINK_BOXED_SYSTEM_PROMPT if use_think else BOXED_SYSTEM_PROMPT)
-
-    rubric = vf.Rubric(funcs=[exact_match], weights=[1.0], parser=parser)
-
-    return vf.SingleTurnEnv(
-        dataset=mapped,
-        eval_dataset=mapped,
-        rubric=rubric,
-        parser = parser,
-        system_prompt=system_prompt,
-    )
diff --git a/environments/careqa_openended/README.md b/environments/careqa_openended/README.md
deleted file mode 100644
index f08d0250..00000000
--- a/environments/careqa_openended/README.md
+++ /dev/null
@@ -1,42 +0,0 @@
-# careqa_openended
-
-Evaluation environment for the [HPAI-BSC/CareQA](https://huggingface.co/datasets/HPAI-BSC/CareQA) openended dataset.
-
-### Overview
-- **Environment ID**: `careqa_openended`  
-- **Short description**: CareQA is a healthcare QA dataset with **multiple-choice** and **open-ended clinical reasoning questions**. This environment is for the open-ended questions only.  
-- **Tags**: healthcare, medical QA, clinical reasoning, single-turn
-
-### Datasets
-- **Primary dataset(s)**:  
-  - `CareQA_en_open` – open-ended clinical questions with reference answers.
-- **Source links**:  
-  - [Hugging Face CareQA dataset](https://huggingface.co/datasets/HPAI-BSC/CareQA)
-
-### Task
-- **Type**: single-turn  
-- **Parser**: custom prompt mapping (no structured markup)  
-- **Rubric overview**:  
-**Open-ended (`open_clinical`)**: `vf.JudgeRubric()` using an LLM-as-judge to score free-text answers for correctness and clinical reasoning. 
-
-### Quickstart
-Run an evaluation with default settings:
-
-```bash
-uv run vf-eval careqa
-```
-
-Configure model and sampling:
-
-```bash
-uv run vf-eval careqa_openended --model gpt-4.1-mini --num-examples 3 -s
-``` 
-
-### Metrics
-
-| Metric        | Meaning |
-|---------------|---------|
-| `reward`      | Main scalar reward (weighted sum of rubric criteria) |
-|  `judge_score` | For open-ended questions, LLM-assigned score evaluating answer quality, correctness, and clinical reasoning |
-
-
diff --git a/environments/careqa_openended/careqa_openended.py b/environments/careqa_openended/careqa_openended.py
deleted file mode 100644
index ff3d369e..00000000
--- a/environments/careqa_openended/careqa_openended.py
+++ /dev/null
@@ -1,103 +0,0 @@
-from __future__ import annotations
-import os
-import re
-from datasets import load_dataset
-from openai import AsyncOpenAI
-import verifiers as vf
-
-
-def load_environment(
-    judge_model: str = "gpt-4o-mini",
-    judge_base_url: str | None = None,
-    judge_api_key: str | None = None,
-) -> vf.Environment:
-    """
-    CareQA Open-Ended QA environment using LLM-as-a-Judge evaluation.
-    
-    This environment loads the open-ended subset of the CareQA dataset and
-    uses an LLM judge to assess whether a model's response matches or aligns
-    medically with the reference answer.
-    """
-
-    # --- Load Dataset ---
-    ds = load_dataset("HPAI-BSC/CareQA", "CareQA_en_open")
-    train_dataset = ds["train"] if "train" in ds else None
-    eval_dataset = ds["test"]
-
-    def _map(ex):
-        return {
-            "question": ex["question"].strip(),
-            "answer": ex.get("answer_explanation", ex.get("answer", "")),
-            "task": "careqa_open",
-        }
-
-    if train_dataset:
-        train_dataset = train_dataset.map(_map, remove_columns=train_dataset.column_names)
-    eval_dataset = eval_dataset.map(_map, remove_columns=eval_dataset.column_names)
-
-    # System Prompt
-    system_prompt = (
-        "You are an expert clinician answering open-ended medical questions. "
-        "Read the question carefully, reason step by step, and provide a precise, "
-        "clinically sound answer enclosed in <think></think> tags, followed by your conclusion."
-    )
-
-    # Judge Prompt Template
-    JUDGE_TEMPLATE = """You are a clinical fact verifier.
-Given:
-Question: {question}
-Reference (ground truth) answer: {answer}
-Model’s answer: {response}
-
-Determine if the model’s answer is medically equivalent to the reference.
-- Consider medical synonyms and abbreviations equivalent.
-- Ignore minor wording differences (e.g., “high blood pressure” ≈ “hypertension”).
-- If the model’s answer is more general or specific but still correct, consider it equivalent.
-
-Respond with one word only: "EQUIVALENT" or "NOT_EQUIVALENT".
-
-""".strip()
-
-    # Judge Client Setup
-    api_key = judge_api_key or os.getenv("OPENAI_API_KEY")
-    judge_client = AsyncOpenAI(base_url=judge_base_url, api_key=api_key) if api_key else None
-
-    # Reward Extraction
-    def extract_answer_section(completion_text: str) -> str:
-        """Extract final answer after think tags."""
-        if not completion_text:
-            return ""
-        if "<think>" in completion_text and "</think>" in completion_text:
-            return re.sub(r".*?</think>", "", completion_text, flags=re.DOTALL).strip()
-        return completion_text.strip()
-
-    async def careqa_reward_func(judge, prompt, completion, answer, state, **kwargs) -> float:
-        """Evaluate medical equivalence using LLM-as-judge."""
-        completion_text = completion if isinstance(completion, str) else str(completion)
-        response = extract_answer_section(completion_text)
-
-        judge_response = await judge(prompt, response, answer, state, **kwargs)
-        decision = judge_response.strip().upper()
-
-        if "EQUIVALENT" in decision and "NOT_EQUIVALENT" not in decision:
-            return 1.0
-        else:
-            return 0.0
-
-    # Judge Rubric
-    rubric = vf.JudgeRubric(
-        judge_client=judge_client,
-        judge_model=judge_model,
-        judge_prompt=JUDGE_TEMPLATE,
-    )
-    rubric.add_reward_func(careqa_reward_func, weight=1.0)
-
-    # Environment Construction
-    vf_env = vf.SingleTurnEnv(
-        dataset=train_dataset,
-        eval_dataset=eval_dataset,
-        system_prompt=system_prompt,
-        rubric=rubric,
-    )
-
-    return vf_env
diff --git a/environments/careqa_openended/pyproject.toml b/environments/careqa_openended/pyproject.toml
deleted file mode 100644
index 80e4e765..00000000
--- a/environments/careqa_openended/pyproject.toml
+++ /dev/null
@@ -1,22 +0,0 @@
-[project]
-name = "careqa_openended"
-description = "Evaluation environment for the HPAI-BSC/CareQA open-ended dataset"
-tags = ["healthcare", "medical-qa", "open-ended", "clinical", "single-turn"]
-version = "0.1.0"
-requires-python = ">=3.11"
-dependencies = [
-    "verifiers>=0.1.4",
-    "datasets>=2.13.0"
-]
-
-[tool.prime.environment]
-loader = "careqa_openended:load_environment"
-display_name = "CareQA"
-visibility = "PUBLIC"
-
-[build-system]
-requires = ["hatchling"]
-build-backend = "hatchling.build"
-
-[tool.hatch.build]
-include = ["careqa_openended.py"]
\ No newline at end of file