From 185d3540f088cc283d150fde7c3758527d8fcaef Mon Sep 17 00:00:00 2001
From: mkieffer1107 <mkieffer1107@gmail.com>
Date: Sun, 14 Sep 2025 17:27:39 -0400
Subject: [PATCH 1/8] medbullets-env added

---
 .gitignore                             |   4 +
 .python-version                        |   1 +
 environments/medbullets/README.md      |  65 ++++++++++++
 environments/medbullets/medbullets.py  | 139 +++++++++++++++++++++++++
 environments/medbullets/pyproject.toml |  16 +++
 pyproject.toml                         |   9 ++
 6 files changed, 234 insertions(+)
 create mode 100644 .python-version
 create mode 100644 environments/medbullets/README.md
 create mode 100644 environments/medbullets/medbullets.py
 create mode 100644 environments/medbullets/pyproject.toml
 create mode 100644 pyproject.toml

diff --git a/.gitignore b/.gitignore
index b7faf403..b9cb397d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,7 @@
+# Project-specific 
+**/outputs/**
+.DS_Store
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[codz]
diff --git a/.python-version b/.python-version
new file mode 100644
index 00000000..e4fba218
--- /dev/null
+++ b/.python-version
@@ -0,0 +1 @@
+3.12
diff --git a/environments/medbullets/README.md b/environments/medbullets/README.md
new file mode 100644
index 00000000..4cf2c609
--- /dev/null
+++ b/environments/medbullets/README.md
@@ -0,0 +1,65 @@
+# medbullets
+
+### Overview
+- **Environment ID**: `medbullets`
+- **Short description**: USMLE-style multiple-choice questions from Medbullets.
+- **Tags**: medical, clinical, single-turn, multiple-choice, USMLE, train, evaluation
+
+### Datasets
+- **Primary dataset(s)**: `Medbullets-4` and `Medbullets-5`
+- **Source links**: [Paper](https://arxiv.org/pdf/2402.18060), [Github](https://github.com/HanjieChen/ChallengeClinicalQA), [HF Dataset](https://huggingface.co/datasets/mkieffer/Medbullets)
+- **Split sizes**: 80/20 train/eval splits 
+
+    | Split       | Choices         | Count   |
+    | ----------- | --------------- | ------- |
+    | `op4_train` | {A, B, C, D}    | **246** |
+    | `op4_eval`  | {A, B, C, D}    | **62**  |
+    | `op5_train` | {A, B, C, D, E} | **246** |
+    | `op5_eval`  | {A, B, C, D, E} | **62**  |
+
+    `op5` splits contain the same questions as `op4` splits, but with one additional answer choice to increase difficulty. Note that while the content and answer texts are identical, the correct answer letter may differ between `op4` and `op5`.
+
+### Task
+- **Type**: single-turn
+- **Parser**: `Parser` or `ThinkParser`, with `extract_fn=extract_boxed_answer` for strict letter-in-\boxed{}-format parsing
+- **Rubric overview**: Binary scoring based on correctly boxed letter choice and optional think tag formatting
+
+### Quickstart
+Run an evaluation with default settings:
+
+```bash
+uv run vf-eval medbullets
+```
+
+Configure model and sampling:
+
+```bash
+uv run vf-eval medbullets \
+    -m gpt-4.1-mini   \
+    -n -1 -r 3 -t 1024 -T 0.7  \
+    -a '{"num_train_examples": -1, "num_eval_examples": -1, "num_options": 4, "use_think": true}'
+
+```
+
+Notes:
+- Use `-a` / `--env-args` to pass environment-specific configuration as a JSON object.
+
+### Environment Arguments
+Document any supported environment arguments and their meaning. Example:
+
+| Arg                  | Type | Default | Description                                                                                                                                                                          |
+| -------------------- | ---- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `num_train_examples` | int  | `-1`    | Limit the number of training examples (`-1` for all)                                                                                                                            |
+| `num_eval_examples`  | int  | `-1`    | Limit the number of eval examples (`-1` for all)                                                                                                                                |
+| `num_options`        | int  | `4`     | Number of options: `4` → {A, B, C, D}; `5` → {A, B, C, D, E}                                                |
+| `use_think`          | bool | `False` | Whether to check for `<think>...</think>` formatting with `ThinkParser`|
+
+### Metrics
+Summarize key metrics your rubric emits and how they’re interpreted.
+
+| Metric | Meaning |
+| ------ | ------- |
+| `correct_answer_reward_func` | (weight 1.0): 1.0 if parsed letter is correct, else 0.0|
+| `parser.get_format_reward_func()` | (weight 0.0): optional format adherence (not counted) |
+
+
diff --git a/environments/medbullets/medbullets.py b/environments/medbullets/medbullets.py
new file mode 100644
index 00000000..e1773000
--- /dev/null
+++ b/environments/medbullets/medbullets.py
@@ -0,0 +1,139 @@
+import verifiers as vf
+from verifiers.utils.data_utils import extract_boxed_answer
+from datasets import load_dataset, Dataset, concatenate_datasets
+from datasets.utils.logging import disable_progress_bar
+disable_progress_bar() # suppress 'Generating...' messages from Dataset.from_generator
+
+def _strip_E(split):
+    for ex in split:
+        ex = dict(ex)
+        ex["options"] = {k: v for k, v in ex["options"].items() if k != "E"}
+        yield ex
+
+def _build_question_str(question: str, options: dict) -> str:
+    s = f"Question: {question}\n"
+    for k, v in options.items():
+        # skip null values of v (for the combined dataset where E opt for 4op is null)
+        if v is not None and v != "":
+            s += f"\n{k}: {v}"
+    return s
+
+def _to_vf_format(ds: Dataset, split: str) -> Dataset:
+    """
+    Shape each row for SingleTurnEnv's defaults:
+      - 'question': string the env will turn into chat messages
+      - 'answer':   top-level gold letter (A/B/C/D[/E])
+      - 'info':     keep all original fields for bookkeeping
+    """
+    VALID = {"A","B","C","D","E"}
+
+    def gen():
+        for row in ds:
+            row = dict(row)
+            # build the user-visible question string (stem + options)
+            q = row.get("question", "") or ""
+            opts = row.get("options", {}) or {}
+            question_str = _build_question_str(q, opts)
+
+            # lift the answer to top-level, normalize to a single letter
+            ans = (row.get("answer") or "").strip().upper()
+            if ans not in VALID:
+                # if op4 split sometimes stores 'E' or empty, coerce safely
+                if ans == "" and "answer_letter" in row:
+                    ans = str(row["answer_letter"]).strip().upper()
+                if ans not in VALID:
+                    # final guard: drop anything unexpected
+                    ans = ""
+
+            # keep full original example under 'info'
+            info = dict(row)
+
+            yield {
+                "question": question_str,
+                "answer": ans,
+                "info": info,
+            }
+
+    return Dataset.from_generator(gen, split=split)
+
+def load_environment(
+        num_train_examples: int = -1, 
+        num_eval_examples: int = -1,
+        num_options: int = 4,
+        use_think: bool = False,
+        **kwargs
+    ) -> vf.Environment:
+    """
+    Single-turn Medbullets environment using HuggingFace `mkieffer/Medbullets` dataset
+    
+    Each example is normalized to the fields expected by `vf.SingleTurnEnv`:
+        {
+            "question": "<stem + formatted options>",      # string used as the user prompt
+            "answer":   "<A|B|C|D|E>",                     # top-level gold letter
+            "info":     { ...original example fields... }  # full source row for debugging
+        }
+
+    - num_options=4 : loads splits `op4_train` / `op4_eval` and drops option "E"
+    - num_options=5 : loads splits `op5_train` / `op5_eval`
+
+    - Parser extracts \\boxed{A|B|C|D|E} from completions
+
+    - Reward looks for exact match between parsed letter and answer letter
+    """
+
+    # -------- load dataset --------
+    if num_options == 4:
+        # 4 options: {"A", "B", "C", "D"}
+        train_raw, eval_raw = load_dataset("mkieffer/Medbullets", split=["op4_train", "op4_eval"])
+        # remove the "E" option from op4 splits
+        train_raw = Dataset.from_generator(lambda: _strip_E(train_raw), split="op4_train")
+        eval_raw  = Dataset.from_generator(lambda: _strip_E(eval_raw), split="op4_eval")
+    elif num_options == 5:
+        # 5 options: {"A", "B", "C", "D", "E"}
+        train_raw, eval_raw = load_dataset("mkieffer/Medbullets", split=["op5_train", "op5_eval"])
+    else: 
+        raise ValueError("'num_options' must be 4 or 5")
+
+    # -------- limit number of examples if specified --------
+    if num_train_examples != -1:
+        train_raw = train_raw.select(range(min(num_train_examples, len(train_raw))))
+    if num_eval_examples != -1:
+        eval_raw = eval_raw.select(range(min(num_eval_examples, len(eval_raw))))
+
+    # -------- reshape to {'prompt', 'info'} --------
+    rng_seed = 12345
+    train_ds = _to_vf_format(train_raw, split="train").shuffle(seed=rng_seed)
+    eval_ds  = _to_vf_format(eval_raw, split="eval").shuffle(seed=rng_seed)
+
+    # -------- construct prompts and questions --------
+    options = "(A, B, C, or D)" if num_options == 4 else "(A, B, C, D, or E)"
+
+    if use_think:
+        system_prompt = f"""Think step-by-step inside <think>...</think> tags, then give only the letter of the correct answer inside \\boxed{{...}} {options}. Do not include option text in the box; only the letter."""
+        parser = vf.ThinkParser(extract_fn=extract_boxed_answer)
+    else:
+        system_prompt = f"""Give only the letter of the correct answer inside \\boxed{{...}} {options}. Do not include option text in the box; only the letter. /no_think"""
+        parser = vf.Parser(extract_fn=extract_boxed_answer)
+
+    # -------- rubric --------
+    def correct_answer_reward_func(parser, completion, answer, **kwargs) -> float:
+        response = parser.parse_answer(completion) or ""
+        return 1.0 if response == answer else 0.0
+
+    rubric = vf.Rubric(
+        funcs=[
+            correct_answer_reward_func,
+            parser.get_format_reward_func()
+        ],
+        weights=[1.0, 0.0],
+        parser=parser,
+    )
+
+    return vf.SingleTurnEnv(
+        dataset=train_ds,
+        eval_dataset=eval_ds,
+        system_prompt=system_prompt,
+        parser=parser,
+        rubric=rubric,
+        **kwargs
+    )
\ No newline at end of file
diff --git a/environments/medbullets/pyproject.toml b/environments/medbullets/pyproject.toml
new file mode 100644
index 00000000..d4d9d690
--- /dev/null
+++ b/environments/medbullets/pyproject.toml
@@ -0,0 +1,16 @@
+[project]
+name = "medbullets"
+description = "Single-turn medical MCQ"
+tags = ["medical", "clinical", "single-turn", "multiple-choice", "usmle", "train", "evaluation"]
+version = "0.1.0"
+requires-python = ">=3.11"
+dependencies = [
+    "verifiers>=0.1.3.post0",
+]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.hatch.build]
+include = ["medbullets.py"]
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..deb917d4
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,9 @@
+[project]
+name = "med-lm-eval"
+version = "0.1.0"
+description = "Automated LLM evaluation suite for medical tasks"
+readme = "README.md"
+requires-python = ">=3.12"
+dependencies = [
+    "verifiers>=0.1.3.post0",
+]

From 1a3d00a596e75d00f8807dbe0bd55efaef307db4 Mon Sep 17 00:00:00 2001
From: mkieffer1107 <mkieffer1107@gmail.com>
Date: Wed, 1 Oct 2025 20:26:34 -0400
Subject: [PATCH 2/8] Fixed problems

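Removed the repository-level pyproject.toml, .python-version, and .gitignore. Switched to the system prompts provided by verifiers (`BOXED_SYSTEM_PROMPT` / `THINK_BOXED_SYSTEM_PROMPT`). Added a `shuffle` arg to shuffle answer choices.

Note: this revision still contains temporary debugging output and an `exit(0)` call in `load_environment`; both are removed in the next patch.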
---
 .gitignore                             | 211 -------------------------
 .python-version                        |   1 -
 environments/medbullets/README.md      |   5 +-
 environments/medbullets/medbullets.py  | 122 ++++++++------
 environments/medbullets/pyproject.toml |   5 +
 pyproject.toml                         |   9 --
 6 files changed, 83 insertions(+), 270 deletions(-)
 delete mode 100644 .gitignore
 delete mode 100644 .python-version
 delete mode 100644 pyproject.toml

diff --git a/.gitignore b/.gitignore
deleted file mode 100644
index b9cb397d..00000000
--- a/.gitignore
+++ /dev/null
@@ -1,211 +0,0 @@
-# Project-specific 
-**/outputs/**
-.DS_Store
-
-# Byte-compiled / optimized / DLL files
-__pycache__/
-*.py[codz]
-*$py.class
-
-# C extensions
-*.so
-
-# Distribution / packaging
-.Python
-build/
-develop-eggs/
-dist/
-downloads/
-eggs/
-.eggs/
-lib/
-lib64/
-parts/
-sdist/
-var/
-wheels/
-share/python-wheels/
-*.egg-info/
-.installed.cfg
-*.egg
-MANIFEST
-
-# PyInstaller
-#  Usually these files are written by a python script from a template
-#  before PyInstaller builds the exe, so as to inject date/other infos into it.
-*.manifest
-*.spec
-
-# Installer logs
-pip-log.txt
-pip-delete-this-directory.txt
-
-# Unit test / coverage reports
-htmlcov/
-.tox/
-.nox/
-.coverage
-.coverage.*
-.cache
-nosetests.xml
-coverage.xml
-*.cover
-*.py.cover
-.hypothesis/
-.pytest_cache/
-cover/
-
-# Translations
-*.mo
-*.pot
-
-# Django stuff:
-*.log
-local_settings.py
-db.sqlite3
-db.sqlite3-journal
-
-# Flask stuff:
-instance/
-.webassets-cache
-
-# Scrapy stuff:
-.scrapy
-
-# Sphinx documentation
-docs/_build/
-
-# PyBuilder
-.pybuilder/
-target/
-
-# Jupyter Notebook
-.ipynb_checkpoints
-
-# IPython
-profile_default/
-ipython_config.py
-
-# pyenv
-#   For a library or package, you might want to ignore these files since the code is
-#   intended to run in multiple environments; otherwise, check them in:
-# .python-version
-
-# pipenv
-#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
-#   However, in case of collaboration, if having platform-specific dependencies or dependencies
-#   having no cross-platform support, pipenv may install dependencies that don't work, or not
-#   install all needed dependencies.
-#Pipfile.lock
-
-# UV
-#   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
-#   This is especially recommended for binary packages to ensure reproducibility, and is more
-#   commonly ignored for libraries.
-#uv.lock
-
-# poetry
-#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
-#   This is especially recommended for binary packages to ensure reproducibility, and is more
-#   commonly ignored for libraries.
-#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
-#poetry.lock
-#poetry.toml
-
-# pdm
-#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
-#   pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
-#   https://pdm-project.org/en/latest/usage/project/#working-with-version-control
-#pdm.lock
-#pdm.toml
-.pdm-python
-.pdm-build/
-
-# pixi
-#   Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
-#pixi.lock
-#   Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
-#   in the .venv directory. It is recommended not to include this directory in version control.
-.pixi
-
-# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
-__pypackages__/
-
-# Celery stuff
-celerybeat-schedule
-celerybeat.pid
-
-# SageMath parsed files
-*.sage.py
-
-# Environments
-.env
-.envrc
-.venv
-env/
-venv/
-ENV/
-env.bak/
-venv.bak/
-
-# Spyder project settings
-.spyderproject
-.spyproject
-
-# Rope project settings
-.ropeproject
-
-# mkdocs documentation
-/site
-
-# mypy
-.mypy_cache/
-.dmypy.json
-dmypy.json
-
-# Pyre type checker
-.pyre/
-
-# pytype static type analyzer
-.pytype/
-
-# Cython debug symbols
-cython_debug/
-
-# PyCharm
-#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
-#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
-#  and can be added to the global gitignore or merged into this file.  For a more nuclear
-#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
-
-# Abstra
-# Abstra is an AI-powered process automation framework.
-# Ignore directories containing user credentials, local state, and settings.
-# Learn more at https://abstra.io/docs
-.abstra/
-
-# Visual Studio Code
-#  Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore 
-#  that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
-#  and can be added to the global gitignore or merged into this file. However, if you prefer, 
-#  you could uncomment the following to ignore the entire vscode folder
-# .vscode/
-
-# Ruff stuff:
-.ruff_cache/
-
-# PyPI configuration file
-.pypirc
-
-# Cursor
-#  Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
-#  exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
-#  refer to https://docs.cursor.com/context/ignore-files
-.cursorignore
-.cursorindexingignore
-
-# Marimo
-marimo/_static/
-marimo/_lsp/
-__marimo__/
diff --git a/.python-version b/.python-version
deleted file mode 100644
index e4fba218..00000000
--- a/.python-version
+++ /dev/null
@@ -1 +0,0 @@
-3.12
diff --git a/environments/medbullets/README.md b/environments/medbullets/README.md
index 4cf2c609..f5335af9 100644
--- a/environments/medbullets/README.md
+++ b/environments/medbullets/README.md
@@ -37,8 +37,7 @@ Configure model and sampling:
 uv run vf-eval medbullets \
     -m gpt-4.1-mini   \
     -n -1 -r 3 -t 1024 -T 0.7  \
-    -a '{"num_train_examples": -1, "num_eval_examples": -1, "num_options": 4, "use_think": true}'
-
+    -a '{"use_think": false, "num_options": 4, "num_train_examples": -1, "num_eval_examples": -1, "shuffle": true}'
 ```
 
 Notes:
@@ -53,6 +52,8 @@ Document any supported environment arguments and their meaning. Example:
 | `num_eval_examples`  | int  | `-1`    | Limit the number of eval examples (`-1` for all)                                                                                                                                |
 | `num_options`        | int  | `4`     | Number of options: `4` → {A, B, C, D}; `5` → {A, B, C, D, E}                                                |
 | `use_think`          | bool | `False` | Whether to check for `<think>...</think>` formatting with `ThinkParser`|
+| `shuffle`            | bool | `False` | Whether to shuffle answer choices |
+
 
 ### Metrics
 Summarize key metrics your rubric emits and how they’re interpreted.
diff --git a/environments/medbullets/medbullets.py b/environments/medbullets/medbullets.py
index e1773000..6a7c6d3e 100644
--- a/environments/medbullets/medbullets.py
+++ b/environments/medbullets/medbullets.py
@@ -1,14 +1,10 @@
 import verifiers as vf
-from verifiers.utils.data_utils import extract_boxed_answer
-from datasets import load_dataset, Dataset, concatenate_datasets
+from verifiers.utils.data_utils import extract_boxed_answer, THINK_BOXED_SYSTEM_PROMPT, BOXED_SYSTEM_PROMPT
+from datasets import load_dataset, Dataset
 from datasets.utils.logging import disable_progress_bar
-disable_progress_bar() # suppress 'Generating...' messages from Dataset.from_generator
+import random
+disable_progress_bar() # suppress datasets mapping progress bar
 
-def _strip_E(split):
-    for ex in split:
-        ex = dict(ex)
-        ex["options"] = {k: v for k, v in ex["options"].items() if k != "E"}
-        yield ex
 
 def _build_question_str(question: str, options: dict) -> str:
     s = f"Question: {question}\n"
@@ -18,49 +14,80 @@ def _build_question_str(question: str, options: dict) -> str:
             s += f"\n{k}: {v}"
     return s
 
-def _to_vf_format(ds: Dataset, split: str) -> Dataset:
+
+def _to_vf_format(ds: Dataset, split: str, num_options: int, shuffle: bool) -> Dataset:
     """
     Shape each row for SingleTurnEnv's defaults:
       - 'question': string the env will turn into chat messages
       - 'answer':   top-level gold letter (A/B/C/D[/E])
       - 'info':     keep all original fields for bookkeeping
+    
+    Args:
+      - num_options: 4 or 5; if 4, strips out option "E"
+      - shuffle: whether to shuffle the answer choices
     """
     VALID = {"A","B","C","D","E"}
 
-    def gen():
-        for row in ds:
-            row = dict(row)
-            # build the user-visible question string (stem + options)
-            q = row.get("question", "") or ""
-            opts = row.get("options", {}) or {}
-            question_str = _build_question_str(q, opts)
-
-            # lift the answer to top-level, normalize to a single letter
-            ans = (row.get("answer") or "").strip().upper()
-            if ans not in VALID:
-                # if op4 split sometimes stores 'E' or empty, coerce safely
-                if ans == "" and "answer_letter" in row:
-                    ans = str(row["answer_letter"]).strip().upper()
-                if ans not in VALID:
-                    # final guard: drop anything unexpected
-                    ans = ""
-
-            # keep full original example under 'info'
-            info = dict(row)
-
-            yield {
-                "question": question_str,
-                "answer": ans,
-                "info": info,
-            }
-
-    return Dataset.from_generator(gen, split=split)
+    def _format_row(row: dict) -> dict:
+        question = row.get("question", "") or "" # question string
+        opts = row.get("options", {}) or {} # answer choices, map of letter to answer text
+        
+        # strip option E if num_options == 4
+        if num_options == 4:
+            opts = {k: v for k, v in opts.items() if k != "E"}
+        
+        # lift the answer to top-level, normalize to a single letter
+        answer_letter = (row.get("answer") or "").strip().upper()
+        if answer_letter not in VALID:
+            return None
+        
+        # shuffle answer choices if requested
+        if shuffle and answer_letter and answer_letter in opts:
+            # get the correct answer text before shuffling
+            correct_answer_text = opts[answer_letter]
+
+            # create list of (letter, text) pairs and shuffle them
+            option_pairs = list(opts.items())
+
+            # use a deterministic seed based on the question for consistency
+            rng = random.Random(hash(question) % (2**32))
+            rng.shuffle(option_pairs)
+            
+            # rebuild options dict with new letter assignments
+            letters = ["A", "B", "C", "D", "E"][:len(option_pairs)]
+            opts = {letters[i]: text for i, (_, text) in enumerate(option_pairs)}
+            
+            # find the new letter for the correct answer
+            for letter, text in opts.items():
+                if text == correct_answer_text:
+                    answer_letter = letter
+                    break
+        
+        question_str = _build_question_str(question, opts)
+
+        # question and answer have been moved to top-level, so remove them here
+        info = dict(row)
+
+        # update shuffled answer choices in the info dict
+        if shuffle:
+            info["answer"] = answer_letter
+            info["options"] = opts
+
+        return {
+            "question": question_str,
+            "answer": answer_letter,
+            "info": info,
+        }
+
+    return ds.map(_format_row, remove_columns=ds.column_names).filter(lambda row: row is not None)
+
 
 def load_environment(
         num_train_examples: int = -1, 
         num_eval_examples: int = -1,
         num_options: int = 4,
         use_think: bool = False,
+        shuffle: bool = False,
         **kwargs
     ) -> vf.Environment:
     """
@@ -85,9 +112,6 @@ def load_environment(
     if num_options == 4:
         # 4 options: {"A", "B", "C", "D"}
         train_raw, eval_raw = load_dataset("mkieffer/Medbullets", split=["op4_train", "op4_eval"])
-        # remove the "E" option from op4 splits
-        train_raw = Dataset.from_generator(lambda: _strip_E(train_raw), split="op4_train")
-        eval_raw  = Dataset.from_generator(lambda: _strip_E(eval_raw), split="op4_eval")
     elif num_options == 5:
         # 5 options: {"A", "B", "C", "D", "E"}
         train_raw, eval_raw = load_dataset("mkieffer/Medbullets", split=["op5_train", "op5_eval"])
@@ -100,19 +124,23 @@ def load_environment(
     if num_eval_examples != -1:
         eval_raw = eval_raw.select(range(min(num_eval_examples, len(eval_raw))))
 
-    # -------- reshape to {'prompt', 'info'} --------
+    # -------- convert rows to vf format and shuffle row order --------
     rng_seed = 12345
-    train_ds = _to_vf_format(train_raw, split="train").shuffle(seed=rng_seed)
-    eval_ds  = _to_vf_format(eval_raw, split="eval").shuffle(seed=rng_seed)
+    train_ds = _to_vf_format(train_raw, split="train", num_options=num_options, shuffle=shuffle).shuffle(seed=rng_seed)
+    eval_ds  = _to_vf_format(eval_raw, split="eval", num_options=num_options, shuffle=shuffle).shuffle(seed=rng_seed)
+    del train_raw, eval_raw  # free memory
+    
+    import json
+    print(json.dumps(train_ds[0], indent=4))
+    print(json.dumps(eval_ds[0], indent=4))
+    exit(0)
 
     # -------- construct prompts and questions --------
-    options = "(A, B, C, or D)" if num_options == 4 else "(A, B, C, D, or E)"
-
     if use_think:
-        system_prompt = f"""Think step-by-step inside <think>...</think> tags, then give only the letter of the correct answer inside \\boxed{{...}} {options}. Do not include option text in the box; only the letter."""
+        system_prompt = THINK_BOXED_SYSTEM_PROMPT
         parser = vf.ThinkParser(extract_fn=extract_boxed_answer)
     else:
-        system_prompt = f"""Give only the letter of the correct answer inside \\boxed{{...}} {options}. Do not include option text in the box; only the letter. /no_think"""
+        system_prompt = BOXED_SYSTEM_PROMPT
         parser = vf.Parser(extract_fn=extract_boxed_answer)
 
     # -------- rubric --------
diff --git a/environments/medbullets/pyproject.toml b/environments/medbullets/pyproject.toml
index d4d9d690..30e9ad9c 100644
--- a/environments/medbullets/pyproject.toml
+++ b/environments/medbullets/pyproject.toml
@@ -8,6 +8,11 @@ dependencies = [
     "verifiers>=0.1.3.post0",
 ]
 
+[tool.prime.environment]
+loader = "medbullets:load_environment"
+display_name = "Medbullets"
+visibility = "PUBLIC"
+
 [build-system]
 requires = ["hatchling"]
 build-backend = "hatchling.build"
diff --git a/pyproject.toml b/pyproject.toml
deleted file mode 100644
index deb917d4..00000000
--- a/pyproject.toml
+++ /dev/null
@@ -1,9 +0,0 @@
-[project]
-name = "med-lm-eval"
-version = "0.1.0"
-description = "Automated LLM evaluation suite for medical tasks"
-readme = "README.md"
-requires-python = ">=3.12"
-dependencies = [
-    "verifiers>=0.1.3.post0",
-]

From 81e23a98cbcd463201abf798199114c5c13338e6 Mon Sep 17 00:00:00 2001
From: mkieffer1107 <mkieffer1107@gmail.com>
Date: Wed, 1 Oct 2025 20:34:54 -0400
Subject: [PATCH 3/8] Update medbullets.py

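Removed the leftover debugging code from `load_environment`: the local `import json`, the two `print(json.dumps(...))` calls for the first train/eval examples, and the `exit(0)` that followed them.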
---
 environments/medbullets/medbullets.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/environments/medbullets/medbullets.py b/environments/medbullets/medbullets.py
index 6a7c6d3e..b00d64f4 100644
--- a/environments/medbullets/medbullets.py
+++ b/environments/medbullets/medbullets.py
@@ -129,11 +129,6 @@ def load_environment(
     train_ds = _to_vf_format(train_raw, split="train", num_options=num_options, shuffle=shuffle).shuffle(seed=rng_seed)
     eval_ds  = _to_vf_format(eval_raw, split="eval", num_options=num_options, shuffle=shuffle).shuffle(seed=rng_seed)
     del train_raw, eval_raw  # free memory
-    
-    import json
-    print(json.dumps(train_ds[0], indent=4))
-    print(json.dumps(eval_ds[0], indent=4))
-    exit(0)
 
     # -------- construct prompts and questions --------
     if use_think:

From b27d4ed534e9cc69ba63bcc53b80f1647aa10757 Mon Sep 17 00:00:00 2001
From: mkieffer1107 <mkieffer1107@gmail.com>
Date: Thu, 2 Oct 2025 04:09:43 -0400
Subject: [PATCH 4/8] Test split only

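Now only references a single test split per option count (`op4_test` / `op5_test`) instead of separate train/eval splits. The README documents `num_test_examples` in place of `num_train_examples` and `num_eval_examples`.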
---
 environments/medbullets/README.md     | 18 ++++------
 environments/medbullets/medbullets.py | 51 ++++++++++-----------------
 2 files changed, 26 insertions(+), 43 deletions(-)

diff --git a/environments/medbullets/README.md b/environments/medbullets/README.md
index f5335af9..e848205b 100644
--- a/environments/medbullets/README.md
+++ b/environments/medbullets/README.md
@@ -8,16 +8,15 @@
 ### Datasets
 - **Primary dataset(s)**: `Medbullets-4` and `Medbullets-5`
 - **Source links**: [Paper](https://arxiv.org/pdf/2402.18060), [Github](https://github.com/HanjieChen/ChallengeClinicalQA), [HF Dataset](https://huggingface.co/datasets/mkieffer/Medbullets)
-- **Split sizes**: 80/20 train/eval splits 
+- **Split sizes**:
 
     | Split       | Choices         | Count   |
     | ----------- | --------------- | ------- |
-    | `op4_train` | {A, B, C, D}    | **246** |
-    | `op4_eval`  | {A, B, C, D}    | **62**  |
-    | `op5_train` | {A, B, C, D, E} | **246** |
-    | `op5_eval`  | {A, B, C, D, E} | **62**  |
+    | `op4_test` | {A, B, C, D}    | **308** |
+    | `op5_test` | {A, B, C, D, E} | **308** |
+
+    `op5_test` contains the same content as `op4_test`, but with one additional answer choice to increase difficulty. Note that while the content is the same, the letter choice corresponding to the correct answer is sometimes different between these splits.
 
-    `op5` splits contain the same questions as `op4` splits, but with one additional answer choice to increase difficulty. Note that while the content and answer texts are identical, the correct answer letter may differ between `op4` and `op5`.
 
 ### Task
 - **Type**: single-turn
@@ -37,7 +36,7 @@ Configure model and sampling:
 uv run vf-eval medbullets \
     -m gpt-4.1-mini   \
     -n -1 -r 3 -t 1024 -T 0.7  \
-    -a '{"use_think": false, "num_options": 4, "num_train_examples": -1, "num_eval_examples": -1, "shuffle": true}'
+    -a '{"use_think": false, "num_options": 4, "num_test_examples": -1, "shuffle": true}'
 ```
 
 Notes:
@@ -48,8 +47,7 @@ Document any supported environment arguments and their meaning. Example:
 
 | Arg                  | Type | Default | Description                                                                                                                                                                          |
 | -------------------- | ---- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `num_train_examples` | int  | `-1`    | Limit the number of training examples (`-1` for all)                                                                                                                            |
-| `num_eval_examples`  | int  | `-1`    | Limit the number of eval examples (`-1` for all)                                                                                                                                |
+| `num_test_examples` | int  | `-1`    | Limit the number of test examples (`-1` for all)                                                                                                                            |
 | `num_options`        | int  | `4`     | Number of options: `4` → {A, B, C, D}; `5` → {A, B, C, D, E}                                                |
 | `use_think`          | bool | `False` | Whether to check for `<think>...</think>` formatting with `ThinkParser`|
 | `shuffle`            | bool | `False` | Whether to shuffle answer choices |
@@ -62,5 +60,3 @@ Summarize key metrics your rubric emits and how they’re interpreted.
 | ------ | ------- |
 | `correct_answer_reward_func` | (weight 1.0): 1.0 if parsed letter is correct, else 0.0|
 | `parser.get_format_reward_func()` | (weight 0.0): optional format adherence (not counted) |
-
-
diff --git a/environments/medbullets/medbullets.py b/environments/medbullets/medbullets.py
index b00d64f4..b4e6908f 100644
--- a/environments/medbullets/medbullets.py
+++ b/environments/medbullets/medbullets.py
@@ -2,20 +2,16 @@
 from verifiers.utils.data_utils import extract_boxed_answer, THINK_BOXED_SYSTEM_PROMPT, BOXED_SYSTEM_PROMPT
 from datasets import load_dataset, Dataset
 from datasets.utils.logging import disable_progress_bar
+from typing import Dict
 import random
 disable_progress_bar() # suppress datasets mapping progress bar
 
 
-def _build_question_str(question: str, options: dict) -> str:
-    s = f"Question: {question}\n"
-    for k, v in options.items():
-        # skip null values of v (for the combined dataset where E opt for 4op is null)
-        if v is not None and v != "":
-            s += f"\n{k}: {v}"
-    return s
+def _build_question_str(question: str, options: Dict[str, str]) -> str:
+    opts = "\n".join(f"{k}. {v}" for k, v in options.items())
+    return f"Question: {question}\n\n{opts}"
 
-
-def _to_vf_format(ds: Dataset, split: str, num_options: int, shuffle: bool) -> Dataset:
+def _to_vf_format(ds: Dataset, num_options: int, shuffle: bool) -> Dataset:
     """
     Shape each row for SingleTurnEnv's defaults:
       - 'question': string the env will turn into chat messages
@@ -26,7 +22,7 @@ def _to_vf_format(ds: Dataset, split: str, num_options: int, shuffle: bool) -> D
       - num_options: 4 or 5; if 4, strips out option "E"
       - shuffle: whether to shuffle the answer choices
     """
-    VALID = {"A","B","C","D","E"}
+    VALID = ("A","B","C","D","E")
 
     def _format_row(row: dict) -> dict:
         question = row.get("question", "") or "" # question string
@@ -54,7 +50,7 @@ def _format_row(row: dict) -> dict:
             rng.shuffle(option_pairs)
             
             # rebuild options dict with new letter assignments
-            letters = ["A", "B", "C", "D", "E"][:len(option_pairs)]
+            letters = VALID[:len(option_pairs)]
             opts = {letters[i]: text for i, (_, text) in enumerate(option_pairs)}
             
             # find the new letter for the correct answer
@@ -83,8 +79,7 @@ def _format_row(row: dict) -> dict:
 
 
 def load_environment(
-        num_train_examples: int = -1, 
-        num_eval_examples: int = -1,
+        num_test_examples: int = -1, 
         num_options: int = 4,
         use_think: bool = False,
         shuffle: bool = False,
@@ -100,8 +95,8 @@ def load_environment(
             "info":     { ...original example fields... }  # full source row for debugging
         }
 
-    - num_options=4 : loads splits `op4_train` / `op4_eval` and drops option "E"
-    - num_options=5 : loads splits `op5_train` / `op5_eval`
+    - num_options=4 : loads split `op4_test`
+    - num_options=5 : loads split `op5_test`
 
     - Parser extracts \\boxed{A|B|C|D|E} from completions
 
@@ -111,32 +106,25 @@ def load_environment(
     # -------- load dataset --------
     if num_options == 4:
         # 4 options: {"A", "B", "C", "D"}
-        train_raw, eval_raw = load_dataset("mkieffer/Medbullets", split=["op4_train", "op4_eval"])
+        test_raw = load_dataset("mkieffer/Medbullets", split="op4_test")
     elif num_options == 5:
         # 5 options: {"A", "B", "C", "D", "E"}
-        train_raw, eval_raw = load_dataset("mkieffer/Medbullets", split=["op5_train", "op5_eval"])
+        test_raw= load_dataset("mkieffer/Medbullets", split="op5_test")
     else: 
         raise ValueError("'num_options' must be 4 or 5")
 
     # -------- limit number of examples if specified --------
-    if num_train_examples != -1:
-        train_raw = train_raw.select(range(min(num_train_examples, len(train_raw))))
-    if num_eval_examples != -1:
-        eval_raw = eval_raw.select(range(min(num_eval_examples, len(eval_raw))))
+    if num_test_examples != -1:
+        test_raw = test_raw.select(range(min(num_test_examples, len(test_raw))))
 
     # -------- convert rows to vf format and shuffle row order --------
     rng_seed = 12345
-    train_ds = _to_vf_format(train_raw, split="train", num_options=num_options, shuffle=shuffle).shuffle(seed=rng_seed)
-    eval_ds  = _to_vf_format(eval_raw, split="eval", num_options=num_options, shuffle=shuffle).shuffle(seed=rng_seed)
-    del train_raw, eval_raw  # free memory
+    test_ds = _to_vf_format(test_raw, num_options=num_options, shuffle=shuffle).shuffle(seed=rng_seed)
+    del test_raw  # free memory
 
     # -------- construct prompts and questions --------
-    if use_think:
-        system_prompt = THINK_BOXED_SYSTEM_PROMPT
-        parser = vf.ThinkParser(extract_fn=extract_boxed_answer)
-    else:
-        system_prompt = BOXED_SYSTEM_PROMPT
-        parser = vf.Parser(extract_fn=extract_boxed_answer)
+    parser = vf.ThinkParser(extract_fn=extract_boxed_answer) if use_think else vf.Parser(extract_fn=extract_boxed_answer)
+    system_prompt = THINK_BOXED_SYSTEM_PROMPT if use_think else BOXED_SYSTEM_PROMPT
 
     # -------- rubric --------
     def correct_answer_reward_func(parser, completion, answer, **kwargs) -> float:
@@ -153,8 +141,7 @@ def correct_answer_reward_func(parser, completion, answer, **kwargs) -> float:
     )
 
     return vf.SingleTurnEnv(
-        dataset=train_ds,
-        eval_dataset=eval_ds,
+        eval_dataset=test_ds,
         system_prompt=system_prompt,
         parser=parser,
         rubric=rubric,

From 37279519e9090318b5ed6178352919751ae037d3 Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Thu, 2 Oct 2025 22:43:27 -0400
Subject: [PATCH 5/8] restore the gitignore

---
 .gitignore | 211 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 211 insertions(+)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 00000000..5ca29d20
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,211 @@
+outputs/
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[codz]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py.cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# UV
+#   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#uv.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+#poetry.toml
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#   pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
+#   https://pdm-project.org/en/latest/usage/project/#working-with-version-control
+#pdm.lock
+#pdm.toml
+.pdm-python
+.pdm-build/
+
+# pixi
+#   Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
+#pixi.lock
+#   Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
+#   in the .venv directory. It is recommended not to include this directory in version control.
+.pixi
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.envrc
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+# Abstra
+# Abstra is an AI-powered process automation framework.
+# Ignore directories containing user credentials, local state, and settings.
+# Learn more at https://abstra.io/docs
+.abstra/
+
+# Visual Studio Code
+#  Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore 
+#  that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
+#  and can be added to the global gitignore or merged into this file. However, if you prefer, 
+#  you could uncomment the following to ignore the entire vscode folder
+# .vscode/
+
+# Ruff stuff:
+.ruff_cache/
+
+# PyPI configuration file
+.pypirc
+
+# Cursor
+#  Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
+#  exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
+#  refer to https://docs.cursor.com/context/ignore-files
+.cursorignore
+.cursorindexingignore
+
+# Marimo
+marimo/_static/
+marimo/_lsp/
+__marimo__/
+
+uv.lock

From 1d03a7f94f277509e758ff545b8463448cde4b28 Mon Sep 17 00:00:00 2001
From: Benjamin Warner <me@benjaminwarner.dev>
Date: Fri, 3 Oct 2025 02:08:09 -0400
Subject: [PATCH 6/8] simplify implementation + ruff format

---
 environments/medbullets/medbullets.py | 79 ++++++++++-----------------
 1 file changed, 30 insertions(+), 49 deletions(-)

diff --git a/environments/medbullets/medbullets.py b/environments/medbullets/medbullets.py
index b4e6908f..a696271d 100644
--- a/environments/medbullets/medbullets.py
+++ b/environments/medbullets/medbullets.py
@@ -1,42 +1,44 @@
+import random
+
 import verifiers as vf
-from verifiers.utils.data_utils import extract_boxed_answer, THINK_BOXED_SYSTEM_PROMPT, BOXED_SYSTEM_PROMPT
-from datasets import load_dataset, Dataset
+from datasets import Dataset, load_dataset
 from datasets.utils.logging import disable_progress_bar
-from typing import Dict
-import random
-disable_progress_bar() # suppress datasets mapping progress bar
+from verifiers.utils.data_utils import BOXED_SYSTEM_PROMPT, THINK_BOXED_SYSTEM_PROMPT, extract_boxed_answer
+
+disable_progress_bar()  # suppress datasets mapping progress bar
 
 
-def _build_question_str(question: str, options: Dict[str, str]) -> str:
+def _build_question_str(question: str, options: dict[str, str]) -> str:
     opts = "\n".join(f"{k}. {v}" for k, v in options.items())
     return f"Question: {question}\n\n{opts}"
 
+
 def _to_vf_format(ds: Dataset, num_options: int, shuffle: bool) -> Dataset:
     """
     Shape each row for SingleTurnEnv's defaults:
       - 'question': string the env will turn into chat messages
       - 'answer':   top-level gold letter (A/B/C/D[/E])
       - 'info':     keep all original fields for bookkeeping
-    
+
     Args:
       - num_options: 4 or 5; if 4, strips out option "E"
       - shuffle: whether to shuffle the answer choices
     """
-    VALID = ("A","B","C","D","E")
+    VALID = ("A", "B", "C", "D", "E")
 
     def _format_row(row: dict) -> dict:
-        question = row.get("question", "") or "" # question string
-        opts = row.get("options", {}) or {} # answer choices, map of letter to answer text
-        
+        question = row.get("question", "") or ""  # question string
+        opts = row.get("options", {}) or {}  # answer choices, map of letter to answer text
+
         # strip option E if num_options == 4
         if num_options == 4:
             opts = {k: v for k, v in opts.items() if k != "E"}
-        
+
         # lift the answer to top-level, normalize to a single letter
         answer_letter = (row.get("answer") or "").strip().upper()
         if answer_letter not in VALID:
             return None
-        
+
         # shuffle answer choices if requested
         if shuffle and answer_letter and answer_letter in opts:
             # get the correct answer text before shuffling
@@ -48,17 +50,17 @@ def _format_row(row: dict) -> dict:
             # use a deterministic seed based on the question for consistency
             rng = random.Random(hash(question) % (2**32))
             rng.shuffle(option_pairs)
-            
+
             # rebuild options dict with new letter assignments
-            letters = VALID[:len(option_pairs)]
+            letters = VALID[: len(option_pairs)]
             opts = {letters[i]: text for i, (_, text) in enumerate(option_pairs)}
-            
+
             # find the new letter for the correct answer
             for letter, text in opts.items():
                 if text == correct_answer_text:
                     answer_letter = letter
                     break
-        
+
         question_str = _build_question_str(question, opts)
 
         # question and answer have been moved to top-level, so remove them here
@@ -78,16 +80,10 @@ def _format_row(row: dict) -> dict:
     return ds.map(_format_row, remove_columns=ds.column_names).filter(lambda row: row is not None)
 
 
-def load_environment(
-        num_test_examples: int = -1, 
-        num_options: int = 4,
-        use_think: bool = False,
-        shuffle: bool = False,
-        **kwargs
-    ) -> vf.Environment:
+def load_environment(num_options: int = 4, use_think: bool = False, shuffle: bool = False, **kwargs) -> vf.Environment:
     """
     Single-turn Medbullets environment using HuggingFace `mkieffer/Medbullets` dataset
-    
+
     Each example is normalized to the fields expected by `vf.SingleTurnEnv`:
         {
             "question": "<stem + formatted options>",      # string used as the user prompt
@@ -109,41 +105,26 @@ def load_environment(
         test_raw = load_dataset("mkieffer/Medbullets", split="op4_test")
     elif num_options == 5:
         # 5 options: {"A", "B", "C", "D", "E"}
-        test_raw= load_dataset("mkieffer/Medbullets", split="op5_test")
-    else: 
+        test_raw = load_dataset("mkieffer/Medbullets", split="op5_test")
+    else:
         raise ValueError("'num_options' must be 4 or 5")
 
-    # -------- limit number of examples if specified --------
-    if num_test_examples != -1:
-        test_raw = test_raw.select(range(min(num_test_examples, len(test_raw))))
-
-    # -------- convert rows to vf format and shuffle row order --------
-    rng_seed = 12345
-    test_ds = _to_vf_format(test_raw, num_options=num_options, shuffle=shuffle).shuffle(seed=rng_seed)
+    test_ds = _to_vf_format(test_raw, num_options=num_options, shuffle=shuffle)
     del test_raw  # free memory
 
-    # -------- construct prompts and questions --------
-    parser = vf.ThinkParser(extract_fn=extract_boxed_answer) if use_think else vf.Parser(extract_fn=extract_boxed_answer)
+    parser = (
+        vf.ThinkParser(extract_fn=extract_boxed_answer) if use_think else vf.Parser(extract_fn=extract_boxed_answer)
+    )
     system_prompt = THINK_BOXED_SYSTEM_PROMPT if use_think else BOXED_SYSTEM_PROMPT
 
-    # -------- rubric --------
     def correct_answer_reward_func(parser, completion, answer, **kwargs) -> float:
         response = parser.parse_answer(completion) or ""
         return 1.0 if response == answer else 0.0
 
     rubric = vf.Rubric(
-        funcs=[
-            correct_answer_reward_func,
-            parser.get_format_reward_func()
-        ],
-        weights=[1.0, 0.0],
+        funcs=[correct_answer_reward_func],
+        weights=[1.0],
         parser=parser,
     )
 
-    return vf.SingleTurnEnv(
-        eval_dataset=test_ds,
-        system_prompt=system_prompt,
-        parser=parser,
-        rubric=rubric,
-        **kwargs
-    )
\ No newline at end of file
+    return vf.SingleTurnEnv(eval_dataset=test_ds, system_prompt=system_prompt, parser=parser, rubric=rubric, **kwargs)

From 0375015d981b789dc41b3ba2bd4458687d5d6d22 Mon Sep 17 00:00:00 2001
From: mkieffer1107 <mkieffer1107@gmail.com>
Date: Fri, 3 Oct 2025 17:23:46 -0400
Subject: [PATCH 7/8] Fixed bugs and made more robust parser

---
 environments/medbullets/README.md     |  2 --
 environments/medbullets/medbullets.py | 47 +++++++++++++++++++++------
 2 files changed, 37 insertions(+), 12 deletions(-)

diff --git a/environments/medbullets/README.md b/environments/medbullets/README.md
index e848205b..65d8343c 100644
--- a/environments/medbullets/README.md
+++ b/environments/medbullets/README.md
@@ -43,7 +43,6 @@ Notes:
 - Use `-a` / `--env-args` to pass environment-specific configuration as a JSON object.
 
 ### Environment Arguments
-Document any supported environment arguments and their meaning. Example:
 
 | Arg                  | Type | Default | Description                                                                                                                                                                          |
 | -------------------- | ---- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
@@ -54,7 +53,6 @@ Document any supported environment arguments and their meaning. Example:
 
 
 ### Metrics
-Summarize key metrics your rubric emits and how they’re interpreted.
 
 | Metric | Meaning |
 | ------ | ------- |
diff --git a/environments/medbullets/medbullets.py b/environments/medbullets/medbullets.py
index a696271d..eccad2a9 100644
--- a/environments/medbullets/medbullets.py
+++ b/environments/medbullets/medbullets.py
@@ -1,14 +1,13 @@
 import random
-
+import re
 import verifiers as vf
 from datasets import Dataset, load_dataset
 from datasets.utils.logging import disable_progress_bar
 from verifiers.utils.data_utils import BOXED_SYSTEM_PROMPT, THINK_BOXED_SYSTEM_PROMPT, extract_boxed_answer
-
 disable_progress_bar()  # suppress datasets mapping progress bar
 
 
-def _build_question_str(question: str, options: dict[str, str]) -> str:
+def _build_question(question: str, options: dict[str, str]) -> str:
     opts = "\n".join(f"{k}. {v}" for k, v in options.items())
     return f"Question: {question}\n\n{opts}"
 
@@ -61,7 +60,11 @@ def _format_row(row: dict) -> dict:
                     answer_letter = letter
                     break
 
-        question_str = _build_question_str(question, opts)
+
+
+        instruction = "The following are multiple choice questions (with answers) about health. Think step by step and then output the single letter answer at the end like \\boxed{A}.\n\n"
+        question = _build_question(question, opts)
+        prompt = instruction + question
 
         # question and answer have been moved to top-level, so remove them here
         info = dict(row)
@@ -72,7 +75,7 @@ def _format_row(row: dict) -> dict:
             info["options"] = opts
 
         return {
-            "question": question_str,
+            "question": prompt,
             "answer": answer_letter,
             "info": info,
         }
@@ -80,7 +83,13 @@ def _format_row(row: dict) -> dict:
     return ds.map(_format_row, remove_columns=ds.column_names).filter(lambda row: row is not None)
 
 
-def load_environment(num_options: int = 4, use_think: bool = False, shuffle: bool = False, **kwargs) -> vf.Environment:
+def load_environment(
+        num_test_examples: int = -1, 
+        num_options: int = 4, 
+        use_think: bool = False, 
+        shuffle: bool = False,  
+        **kwargs
+    ) -> vf.Environment:
     """
     Single-turn Medbullets environment using HuggingFace `mkieffer/Medbullets` dataset
 
@@ -109,17 +118,35 @@ def load_environment(num_options: int = 4, use_think: bool = False, shuffle: boo
     else:
         raise ValueError("'num_options' must be 4 or 5")
 
+    # -------- limit number of examples if specified --------
+    if num_test_examples != -1:
+        test_raw = test_raw.select(range(min(num_test_examples, len(test_raw))))
+
+    # -------- convert rows to vf format and shuffle row order --------
     test_ds = _to_vf_format(test_raw, num_options=num_options, shuffle=shuffle)
     del test_raw  # free memory
 
-    parser = (
-        vf.ThinkParser(extract_fn=extract_boxed_answer) if use_think else vf.Parser(extract_fn=extract_boxed_answer)
-    )
+    # -------- construct prompts and questions --------
+    parser = vf.ThinkParser(extract_fn=extract_boxed_answer) if use_think else vf.Parser(extract_fn=extract_boxed_answer)
     system_prompt = THINK_BOXED_SYSTEM_PROMPT if use_think else BOXED_SYSTEM_PROMPT
 
+    # -------- rubric --------
     def correct_answer_reward_func(parser, completion, answer, **kwargs) -> float:
         response = parser.parse_answer(completion) or ""
-        return 1.0 if response == answer else 0.0
+        response = response.strip()
+
+        # remove \text{...} wrapper if present
+        text_match = re.match(r'\\text\{(.+)\}', response)
+        if text_match:
+            response = text_match.group(1).strip()
+
+        # try to extract a letter at the beginning
+        # matches: "H", "H.", "H:", "(H)", "(H).", "H. Some text", "(A) Some text", etc.
+        letter_match = re.match(r'^\(?([A-J])\)?(?:[.:\s]|$)', response)
+        if letter_match:
+            extracted_letter = letter_match.group(1)
+            return 1.0 if extracted_letter.upper() == answer.upper() else 0.0
+        return 0.0
 
     rubric = vf.Rubric(
         funcs=[correct_answer_reward_func],

From 15c21a399427cc52ab3fa905f2119986507f5629 Mon Sep 17 00:00:00 2001
From: mkieffer1107 <mkieffer1107@gmail.com>
Date: Fri, 3 Oct 2025 17:43:41 -0400
Subject: [PATCH 8/8] Updated README and added more robust parser

---
 environments/medbullets/README.md     |  4 +-
 environments/medbullets/medbullets.py | 53 +++++++++++++++------------
 2 files changed, 31 insertions(+), 26 deletions(-)

diff --git a/environments/medbullets/README.md b/environments/medbullets/README.md
index 65d8343c..428396d3 100644
--- a/environments/medbullets/README.md
+++ b/environments/medbullets/README.md
@@ -36,7 +36,7 @@ Configure model and sampling:
 uv run vf-eval medbullets \
     -m gpt-4.1-mini   \
     -n -1 -r 3 -t 1024 -T 0.7  \
-    -a '{"use_think": false, "num_options": 4, "num_test_examples": -1, "shuffle": true}'
+    -a '{"use_think": false, "num_options": 4, "shuffle": true}'
 ```
 
 Notes:
@@ -46,7 +46,6 @@ Notes:
 
 | Arg                  | Type | Default | Description                                                                                                                                                                          |
 | -------------------- | ---- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `num_test_examples` | int  | `-1`    | Limit the number of test examples (`-1` for all)                                                                                                                            |
 | `num_options`        | int  | `4`     | Number of options: `4` → {A, B, C, D}; `5` → {A, B, C, D, E}                                                |
 | `use_think`          | bool | `False` | Whether to check for `<think>...</think>` formatting with `ThinkParser`|
 | `shuffle`            | bool | `False` | Whether to shuffle answer choices |
@@ -57,4 +56,3 @@ Notes:
 | Metric | Meaning |
 | ------ | ------- |
 | `correct_answer_reward_func` | (weight 1.0): 1.0 if parsed letter is correct, else 0.0|
-| `parser.get_format_reward_func()` | (weight 0.0): optional format adherence (not counted) |
diff --git a/environments/medbullets/medbullets.py b/environments/medbullets/medbullets.py
index eccad2a9..36b84b8b 100644
--- a/environments/medbullets/medbullets.py
+++ b/environments/medbullets/medbullets.py
@@ -1,9 +1,15 @@
-import random
 import re
+import random
+
 import verifiers as vf
 from datasets import Dataset, load_dataset
 from datasets.utils.logging import disable_progress_bar
-from verifiers.utils.data_utils import BOXED_SYSTEM_PROMPT, THINK_BOXED_SYSTEM_PROMPT, extract_boxed_answer
+from verifiers.utils.data_utils import (
+    BOXED_SYSTEM_PROMPT,
+    THINK_BOXED_SYSTEM_PROMPT,
+    extract_boxed_answer,
+)
+
 disable_progress_bar()  # suppress datasets mapping progress bar
 
 
@@ -27,7 +33,9 @@ def _to_vf_format(ds: Dataset, num_options: int, shuffle: bool) -> Dataset:
 
     def _format_row(row: dict) -> dict:
         question = row.get("question", "") or ""  # question string
-        opts = row.get("options", {}) or {}  # answer choices, map of letter to answer text
+        opts = (
+            row.get("options", {}) or {}
+        )  # answer choices, map of letter to answer text
 
         # strip option E if num_options == 4
         if num_options == 4:
@@ -60,8 +68,6 @@ def _format_row(row: dict) -> dict:
                     answer_letter = letter
                     break
 
-
-
         instruction = "The following are multiple choice questions (with answers) about health. Think step by step and then output the single letter answer at the end like \\boxed{A}.\n\n"
         question = _build_question(question, opts)
         prompt = instruction + question
@@ -80,16 +86,14 @@ def _format_row(row: dict) -> dict:
             "info": info,
         }
 
-    return ds.map(_format_row, remove_columns=ds.column_names).filter(lambda row: row is not None)
+    return ds.map(_format_row, remove_columns=ds.column_names).filter(
+        lambda row: row is not None
+    )
 
 
 def load_environment(
-        num_test_examples: int = -1, 
-        num_options: int = 4, 
-        use_think: bool = False, 
-        shuffle: bool = False,  
-        **kwargs
-    ) -> vf.Environment:
+    num_options: int = 4, use_think: bool = False, shuffle: bool = False, **kwargs
+) -> vf.Environment:
     """
     Single-turn Medbullets environment using HuggingFace `mkieffer/Medbullets` dataset
 
@@ -118,31 +122,28 @@ def load_environment(
     else:
         raise ValueError("'num_options' must be 4 or 5")
 
-    # -------- limit number of examples if specified --------
-    if num_test_examples != -1:
-        test_raw = test_raw.select(range(min(num_test_examples, len(test_raw))))
-
-    # -------- convert rows to vf format and shuffle row order --------
     test_ds = _to_vf_format(test_raw, num_options=num_options, shuffle=shuffle)
     del test_raw  # free memory
 
-    # -------- construct prompts and questions --------
-    parser = vf.ThinkParser(extract_fn=extract_boxed_answer) if use_think else vf.Parser(extract_fn=extract_boxed_answer)
+    parser = (
+        vf.ThinkParser(extract_fn=extract_boxed_answer)
+        if use_think
+        else vf.Parser(extract_fn=extract_boxed_answer)
+    )
     system_prompt = THINK_BOXED_SYSTEM_PROMPT if use_think else BOXED_SYSTEM_PROMPT
 
-    # -------- rubric --------
     def correct_answer_reward_func(parser, completion, answer, **kwargs) -> float:
         response = parser.parse_answer(completion) or ""
         response = response.strip()
 
         # remove \text{...} wrapper if present
-        text_match = re.match(r'\\text\{(.+)\}', response)
+        text_match = re.match(r"\\text\{(.+)\}", response)
         if text_match:
             response = text_match.group(1).strip()
 
         # try to extract a letter at the beginning
         # matches: "H", "H.", "H:", "(H)", "(H).", "H. Some text", "(A) Some text", etc.
-        letter_match = re.match(r'^\(?([A-J])\)?(?:[.:\s]|$)', response)
+        letter_match = re.match(r"^\(?([A-J])\)?(?:[.:\s]|$)", response)
         if letter_match:
             extracted_letter = letter_match.group(1)
             return 1.0 if extracted_letter.upper() == answer.upper() else 0.0
@@ -154,4 +155,10 @@ def correct_answer_reward_func(parser, completion, answer, **kwargs) -> float:
         parser=parser,
     )
 
-    return vf.SingleTurnEnv(eval_dataset=test_ds, system_prompt=system_prompt, parser=parser, rubric=rubric, **kwargs)
+    return vf.SingleTurnEnv(
+        eval_dataset=test_ds,
+        system_prompt=system_prompt,
+        parser=parser,
+        rubric=rubric,
+        **kwargs,
+    )