From 2ab0b5c42a64b72d0c748c27cae0a7e1823cf5a1 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Sun, 21 Sep 2025 22:45:37 -0700
Subject: [PATCH 01/10] Update quickstart

---
 eval_protocol/pytest/evaluation_test.py       | 50 +++++++++++++++++--
 eval_protocol/pytest/utils.py                 |  2 +-
 .../quickstart/llm_judge_braintrust.py        | 34 +++++++------
 .../quickstart/llm_judge_langfuse.py          | 30 ++++++-----
 .../quickstart/llm_judge_langsmith.py         | 16 ++++--
 .../quickstart/llm_judge_openai_responses.py  |  1 -
 6 files changed, 93 insertions(+), 40 deletions(-)

diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py
index 2e1254d6..d22c78b5 100644
--- a/eval_protocol/pytest/evaluation_test.py
+++ b/eval_protocol/pytest/evaluation_test.py
@@ -379,7 +379,38 @@ async def _execute_groupwise_eval_with_semaphore(
                                 pointwise_tasks.append(
                                     asyncio.create_task(_execute_pointwise_eval_with_semaphore(row=row))
                                 )
-                            results = await asyncio.gather(*pointwise_tasks)
+
+                            # Add tqdm progress bar for evaluations with proper cleanup
+                            eval_position = run_idx + 2  # Position after rollout progress bar
+                            with tqdm(
+                                total=len(pointwise_tasks),
+                                desc=f"  Eval {run_idx + 1}",
+                                unit="eval",
+                                file=sys.__stderr__,
+                                leave=False,
+                                position=eval_position,
+                                dynamic_ncols=True,
+                                miniters=1,
+                                mininterval=0.1,
+                                bar_format="{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]",
+                            ) as eval_pbar:
+
+                                async def task_with_progress(task):
+                                    try:
+                                        result = await task
+                                        return result
+                                    finally:
+                                        eval_pbar.update(1)
+
+                                wrapped_tasks = [task_with_progress(task) for task in pointwise_tasks]
+                                try:
+                                    results = await asyncio.gather(*wrapped_tasks)
+                                except Exception:
+                                    # Propagate cancellation to the real tasks and await them to quiesce
+                                    for task in pointwise_tasks:
+                                        task.cancel()
+                                    await asyncio.gather(*pointwise_tasks, return_exceptions=True)
+                                    raise
 
                             all_results[run_idx] = results
                         elif mode == "groupwise":
@@ -510,14 +541,23 @@ async def _collect_result(config, lst):  # pyright: ignore[reportUnknownParamete
                         ) as run_pbar:
 
                             async def execute_run_with_progress(run_idx: int, config):
-                                result = await execute_run(run_idx, config)
-                                run_pbar.update(1)
-                                return result
+                                try:
+                                    result = await execute_run(run_idx, config)
+                                    return result
+                                finally:
+                                    run_pbar.update(1)
 
                             tasks = []
                             for run_idx in range(num_runs):
                                 tasks.append(asyncio.create_task(execute_run_with_progress(run_idx, config)))
-                            await asyncio.gather(*tasks)  # pyright: ignore[reportUnknownArgumentType]
+                            try:
+                                await asyncio.gather(*tasks)
+                            except Exception:
+                                # Propagate cancellation to tasks and await them to quiesce
+                                for task in tasks:
+                                    task.cancel()
+                                await asyncio.gather(*tasks, return_exceptions=True)
+                                raise
 
                     experiment_duration_seconds = time.perf_counter() - experiment_start_time
 
diff --git a/eval_protocol/pytest/utils.py b/eval_protocol/pytest/utils.py
index 0b0c6bc7..aa98ecf3 100644
--- a/eval_protocol/pytest/utils.py
+++ b/eval_protocol/pytest/utils.py
@@ -277,7 +277,7 @@ async def execute_row_with_backoff_and_log(task: asyncio.Task, row: EvaluationRo
         position = run_idx + 1  # Position 0 is reserved for main run bar, so shift up by 1
         with tqdm(
             total=len(retry_tasks),
-            desc=f"  Run {position}",
+            desc=f"  Run {run_idx + 1}",
             unit="rollout",
             file=sys.__stderr__,
             leave=False,
diff --git a/eval_protocol/quickstart/llm_judge_braintrust.py b/eval_protocol/quickstart/llm_judge_braintrust.py
index 65bde54c..56c99d5b 100644
--- a/eval_protocol/quickstart/llm_judge_braintrust.py
+++ b/eval_protocol/quickstart/llm_judge_braintrust.py
@@ -13,25 +13,26 @@
     EvaluationRow,
     SingleTurnRolloutProcessor,
     create_braintrust_adapter,
+    DefaultParameterIdGenerator,
 )
+
 # adapter = create_braintrust_adapter()
+input_rows = [
+    #         adapter.get_evaluation_rows(
+    #             btql_query=f"""
+    # select: *
+    # from: project_logs('{os.getenv("BRAINTRUST_PROJECT_ID")}') traces
+    # filter: is_root = true
+    # limit: 10
+    # """
+    #         )
+]
 
 
 @pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI")
-@pytest.mark.asyncio
-@evaluation_test(
-    input_rows=[
-        #         adapter.get_evaluation_rows(
-        #             btql_query=f"""
-        # select: *
-        # from: project_logs('{os.getenv("BRAINTRUST_PROJECT_ID")}') traces
-        # filter: is_root = true
-        # limit: 10
-        # """
-        #         )
-        []
-    ],
-    completion_params=[
+@pytest.mark.parametrize(
+    "completion_params",
+    [
         {"model": "gpt-4.1"},
         {
             "max_tokens": 131000,
@@ -44,10 +45,13 @@
             "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-20b",
         },
     ],
+    ids=DefaultParameterIdGenerator.generate_id_from_dict,
+)
+@evaluation_test(
+    input_rows=input_rows,
     rollout_processor=SingleTurnRolloutProcessor(),
     preprocess_fn=multi_turn_assistant_to_ground_truth,
     max_concurrent_rollouts=64,
-    aggregation_method="bootstrap",
 )
 async def test_llm_judge(row: EvaluationRow) -> EvaluationRow:
     return await aha_judge(row)
diff --git a/eval_protocol/quickstart/llm_judge_langfuse.py b/eval_protocol/quickstart/llm_judge_langfuse.py
index bfd19b0d..c8dbedac 100644
--- a/eval_protocol/quickstart/llm_judge_langfuse.py
+++ b/eval_protocol/quickstart/llm_judge_langfuse.py
@@ -14,25 +14,25 @@
     EvaluationRow,
     SingleTurnRolloutProcessor,
     create_langfuse_adapter,
+    DefaultParameterIdGenerator,
 )
 
 from eval_protocol.quickstart import aha_judge
 
 adapter = create_langfuse_adapter()
+input_rows = adapter.get_evaluation_rows(
+    to_timestamp=datetime(2025, 9, 12, 0, 11, 18),
+    limit=1,
+    sample_size=1,
+    sleep_between_gets=3.0,
+    max_retries=5,
+)
 
 
 @pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI")
-@evaluation_test(
-    input_rows=[
-        adapter.get_evaluation_rows(
-            to_timestamp=datetime(2025, 9, 12, 0, 11, 18),
-            limit=711,
-            sample_size=50,
-            sleep_between_gets=3.0,
-            max_retries=5,
-        )
-    ],
-    completion_params=[
+@pytest.mark.parametrize(
+    "completion_params",
+    [
         {"model": "gpt-4.1"},
         {
             "max_tokens": 131000,
@@ -45,10 +45,14 @@
             "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-20b",
         },
     ],
+    ids=DefaultParameterIdGenerator.generate_id_from_dict,
+)
+@evaluation_test(
+    input_rows=[input_rows],
     rollout_processor=SingleTurnRolloutProcessor(),
     preprocess_fn=multi_turn_assistant_to_ground_truth,
-    max_concurrent_rollouts=2,
-    aggregation_method="bootstrap",
+    max_concurrent_rollouts=64,
+    max_concurrent_evaluations=2,
 )
 async def test_llm_judge(row: EvaluationRow) -> EvaluationRow:
     return await aha_judge(row)
diff --git a/eval_protocol/quickstart/llm_judge_langsmith.py b/eval_protocol/quickstart/llm_judge_langsmith.py
index b8850ee0..cd19a738 100644
--- a/eval_protocol/quickstart/llm_judge_langsmith.py
+++ b/eval_protocol/quickstart/llm_judge_langsmith.py
@@ -31,6 +31,7 @@
     EvaluationRow,
     SingleTurnRolloutProcessor,
     LangSmithAdapter,
+    DefaultParameterIdGenerator,
 )
 
 
@@ -53,11 +54,13 @@ def fetch_langsmith_traces_as_evaluation_rows(
         return []
 
 
+input_rows = fetch_langsmith_traces_as_evaluation_rows()
+
+
 @pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI")
-@pytest.mark.asyncio
-@evaluation_test(
-    input_rows=[fetch_langsmith_traces_as_evaluation_rows()],
-    completion_params=[
+@pytest.mark.parametrize(
+    "completion_params",
+    [
         {
             "model": "fireworks_ai/accounts/fireworks/models/qwen3-235b-a22b-instruct-2507",
         },
@@ -67,9 +70,12 @@ def fetch_langsmith_traces_as_evaluation_rows(
             "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
         },
     ],
+    ids=DefaultParameterIdGenerator.generate_id_from_dict,
+)
+@evaluation_test(
+    input_rows=[input_rows],
     rollout_processor=SingleTurnRolloutProcessor(),
     preprocess_fn=multi_turn_assistant_to_ground_truth,
-    aggregation_method="bootstrap",
 )
 async def test_llm_judge_langsmith(row: EvaluationRow) -> EvaluationRow:
     """LLM Judge evaluation over LangSmith-sourced rows, persisted locally by Eval Protocol.
diff --git a/eval_protocol/quickstart/llm_judge_openai_responses.py b/eval_protocol/quickstart/llm_judge_openai_responses.py
index e096c759..8a81b9cc 100644
--- a/eval_protocol/quickstart/llm_judge_openai_responses.py
+++ b/eval_protocol/quickstart/llm_judge_openai_responses.py
@@ -57,7 +57,6 @@
     input_rows=[input_rows],
     rollout_processor=SingleTurnRolloutProcessor(),
     preprocess_fn=multi_turn_assistant_to_ground_truth,
-    aggregation_method="bootstrap",
 )
 async def test_llm_judge_openai_responses(row: EvaluationRow) -> EvaluationRow:
     return await aha_judge(row)

From c952929cdb681588f482fb5e7b473020fa0f0e28 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Mon, 22 Sep 2025 00:28:33 -0700
Subject: [PATCH 02/10] update

---
 eval_protocol/quickstart/llm_judge.py          | 13 ++++++++-----
 eval_protocol/quickstart/llm_judge_langfuse.py |  6 +++---
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/eval_protocol/quickstart/llm_judge.py b/eval_protocol/quickstart/llm_judge.py
index e9621658..a5225857 100644
--- a/eval_protocol/quickstart/llm_judge.py
+++ b/eval_protocol/quickstart/llm_judge.py
@@ -47,11 +47,14 @@ async def aha_judge(
     model_a_answer = str(row.ground_truth)
     model_b_answer = serialize_message(row.messages[-1])
 
-    client = AsyncOpenAI(api_key=judge_config.get("api_key"), base_url=judge_config.get("base_url"))
-
-    # Run two judgment rounds in sequence (A vs B, then B vs A)
-    result1 = await run_single_judgment(question_text, model_a_answer, model_b_answer, row.tools, judge_config, client)
-    result2 = await run_single_judgment(question_text, model_b_answer, model_a_answer, row.tools, judge_config, client)
+    async with AsyncOpenAI(api_key=judge_config.get("api_key"), base_url=judge_config.get("base_url")) as client:
+        # Run two judgment rounds in sequence (A vs B, then B vs A)
+        result1 = await run_single_judgment(
+            question_text, model_a_answer, model_b_answer, row.tools, judge_config, client
+        )
+        result2 = await run_single_judgment(
+            question_text, model_b_answer, model_a_answer, row.tools, judge_config, client
+        )
 
     if not result1 or not result2 or not result1.get("score") or not result2.get("score"):
         # If either judgment failed, mark as invalid (don't include in distribution)
diff --git a/eval_protocol/quickstart/llm_judge_langfuse.py b/eval_protocol/quickstart/llm_judge_langfuse.py
index c8dbedac..21fbc849 100644
--- a/eval_protocol/quickstart/llm_judge_langfuse.py
+++ b/eval_protocol/quickstart/llm_judge_langfuse.py
@@ -22,8 +22,8 @@
 adapter = create_langfuse_adapter()
 input_rows = adapter.get_evaluation_rows(
     to_timestamp=datetime(2025, 9, 12, 0, 11, 18),
-    limit=1,
-    sample_size=1,
+    limit=711,
+    sample_size=50,
     sleep_between_gets=3.0,
     max_retries=5,
 )
@@ -52,7 +52,7 @@
     rollout_processor=SingleTurnRolloutProcessor(),
     preprocess_fn=multi_turn_assistant_to_ground_truth,
     max_concurrent_rollouts=64,
-    max_concurrent_evaluations=2,
+    max_concurrent_evaluations=16,
 )
 async def test_llm_judge(row: EvaluationRow) -> EvaluationRow:
     return await aha_judge(row)

From 3adfc104e277395c7b3510b937d6f4cd74918f4e Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Mon, 22 Sep 2025 00:29:52 -0700
Subject: [PATCH 03/10] update

---
 eval_protocol/quickstart/llm_judge_braintrust.py       | 2 +-
 eval_protocol/quickstart/llm_judge_langfuse.py         | 3 +--
 eval_protocol/quickstart/llm_judge_langsmith.py        | 1 +
 eval_protocol/quickstart/llm_judge_openai_responses.py | 1 +
 4 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/eval_protocol/quickstart/llm_judge_braintrust.py b/eval_protocol/quickstart/llm_judge_braintrust.py
index 56c99d5b..4a4b665b 100644
--- a/eval_protocol/quickstart/llm_judge_braintrust.py
+++ b/eval_protocol/quickstart/llm_judge_braintrust.py
@@ -51,7 +51,7 @@
     input_rows=input_rows,
     rollout_processor=SingleTurnRolloutProcessor(),
     preprocess_fn=multi_turn_assistant_to_ground_truth,
-    max_concurrent_rollouts=64,
+    max_concurrent_evaluations=2,
 )
 async def test_llm_judge(row: EvaluationRow) -> EvaluationRow:
     return await aha_judge(row)
diff --git a/eval_protocol/quickstart/llm_judge_langfuse.py b/eval_protocol/quickstart/llm_judge_langfuse.py
index 21fbc849..d1dc725c 100644
--- a/eval_protocol/quickstart/llm_judge_langfuse.py
+++ b/eval_protocol/quickstart/llm_judge_langfuse.py
@@ -51,8 +51,7 @@
     input_rows=[input_rows],
     rollout_processor=SingleTurnRolloutProcessor(),
     preprocess_fn=multi_turn_assistant_to_ground_truth,
-    max_concurrent_rollouts=64,
-    max_concurrent_evaluations=16,
+    max_concurrent_evaluations=2,
 )
 async def test_llm_judge(row: EvaluationRow) -> EvaluationRow:
     return await aha_judge(row)
diff --git a/eval_protocol/quickstart/llm_judge_langsmith.py b/eval_protocol/quickstart/llm_judge_langsmith.py
index cd19a738..4531a3c4 100644
--- a/eval_protocol/quickstart/llm_judge_langsmith.py
+++ b/eval_protocol/quickstart/llm_judge_langsmith.py
@@ -76,6 +76,7 @@ def fetch_langsmith_traces_as_evaluation_rows(
     input_rows=[input_rows],
     rollout_processor=SingleTurnRolloutProcessor(),
     preprocess_fn=multi_turn_assistant_to_ground_truth,
+    max_concurrent_evaluations=2,
 )
 async def test_llm_judge_langsmith(row: EvaluationRow) -> EvaluationRow:
     """LLM Judge evaluation over LangSmith-sourced rows, persisted locally by Eval Protocol.
diff --git a/eval_protocol/quickstart/llm_judge_openai_responses.py b/eval_protocol/quickstart/llm_judge_openai_responses.py
index 8a81b9cc..9d5b1a7e 100644
--- a/eval_protocol/quickstart/llm_judge_openai_responses.py
+++ b/eval_protocol/quickstart/llm_judge_openai_responses.py
@@ -57,6 +57,7 @@
     input_rows=[input_rows],
     rollout_processor=SingleTurnRolloutProcessor(),
     preprocess_fn=multi_turn_assistant_to_ground_truth,
+    max_concurrent_evaluations=2,
 )
 async def test_llm_judge_openai_responses(row: EvaluationRow) -> EvaluationRow:
     return await aha_judge(row)

From 311d31b89f5258d2ba68cbf50258f947056d1447 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Mon, 22 Sep 2025 00:39:40 -0700
Subject: [PATCH 04/10] update braintrust

---
 .../quickstart/llm_judge_braintrust.py         | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/eval_protocol/quickstart/llm_judge_braintrust.py b/eval_protocol/quickstart/llm_judge_braintrust.py
index 4a4b665b..61ce6c7d 100644
--- a/eval_protocol/quickstart/llm_judge_braintrust.py
+++ b/eval_protocol/quickstart/llm_judge_braintrust.py
@@ -16,16 +16,16 @@
     DefaultParameterIdGenerator,
 )
 
-# adapter = create_braintrust_adapter()
+adapter = create_braintrust_adapter()
 input_rows = [
-    #         adapter.get_evaluation_rows(
-    #             btql_query=f"""
-    # select: *
-    # from: project_logs('{os.getenv("BRAINTRUST_PROJECT_ID")}') traces
-    # filter: is_root = true
-    # limit: 10
-    # """
-    #         )
+    adapter.get_evaluation_rows(
+        btql_query=f"""
+    select: *
+    from: project_logs('{os.getenv("BRAINTRUST_PROJECT_ID")}') traces
+    filter: is_root = true
+    limit: 10
+    """
+    )
 ]
 
 

From 94991cda82a89cd91336811528619f892cf20249 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Mon, 22 Sep 2025 00:51:38 -0700
Subject: [PATCH 05/10] test

---
 .github/workflows/ci.yml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 03e836f9..977ec775 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -70,6 +70,12 @@ jobs:
         with:
           fetch-depth: 0 # Fetch all history for all tags and branches
 
+      - name: Check CI environment variable
+        run: |
+          echo "CI=$CI"
+          echo "CI (from env)=$(env | grep '^CI=')"
+          python -c "import os; print('CI (python):', repr(os.getenv('CI')))"
+
       - name: Set up Python ${{ matrix.python-version }}
         uses: actions/setup-python@v5
         with:

From 9ea0ac81f293eb5743482ef19feb66ee91fa3eeb Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Mon, 22 Sep 2025 00:56:59 -0700
Subject: [PATCH 06/10] remove

---
 .github/workflows/ci.yml | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 977ec775..03e836f9 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -70,12 +70,6 @@ jobs:
         with:
           fetch-depth: 0 # Fetch all history for all tags and branches
 
-      - name: Check CI environment variable
-        run: |
-          echo "CI=$CI"
-          echo "CI (from env)=$(env | grep '^CI=')"
-          python -c "import os; print('CI (python):', repr(os.getenv('CI')))"
-
       - name: Set up Python ${{ matrix.python-version }}
         uses: actions/setup-python@v5
         with:

From fd12a2ab046b5212408bf3c0eb2ed9a01c120070 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Mon, 22 Sep 2025 01:03:22 -0700
Subject: [PATCH 07/10] add new braintrust key

---
 .github/workflows/ci.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 03e836f9..91723565 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -92,6 +92,8 @@ jobs:
           E2B_API_KEY: ${{ secrets.E2B_API_KEY }}
           FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
           FIREWORKS_ACCOUNT_ID: ${{ secrets.FIREWORKS_ACCOUNT_ID }}
+          BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
+          BRAINTRUST_PROJECT_ID: ${{ secrets.BRAINTRUST_PROJECT_ID }}
           SUPABASE_PASSWORD: ${{ secrets.SUPABASE_PASSWORD }}
           SUPABASE_HOST: ${{ secrets.SUPABASE_HOST }}
           SUPABASE_PORT: ${{ secrets.SUPABASE_PORT }}

From a4d4099f22e724f516e83b6b53673ad56caf2bfd Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Mon, 22 Sep 2025 01:13:45 -0700
Subject: [PATCH 08/10] remove braintrust keys

---
 .github/workflows/ci.yml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 91723565..03e836f9 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -92,8 +92,6 @@ jobs:
           E2B_API_KEY: ${{ secrets.E2B_API_KEY }}
           FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }}
           FIREWORKS_ACCOUNT_ID: ${{ secrets.FIREWORKS_ACCOUNT_ID }}
-          BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }}
-          BRAINTRUST_PROJECT_ID: ${{ secrets.BRAINTRUST_PROJECT_ID }}
           SUPABASE_PASSWORD: ${{ secrets.SUPABASE_PASSWORD }}
           SUPABASE_HOST: ${{ secrets.SUPABASE_HOST }}
           SUPABASE_PORT: ${{ secrets.SUPABASE_PORT }}

From 9200ecd167738273d0e769dd31dd1b0b5d8cec9a Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Mon, 22 Sep 2025 09:07:59 -0700
Subject: [PATCH 09/10] remove id stuff and comment out braintrust

---
 .../quickstart/llm_judge_braintrust.py        | 31 +++++++++++--------
 .../quickstart/llm_judge_langfuse.py          |  1 -
 .../quickstart/llm_judge_langsmith.py         |  1 -
 .../quickstart/llm_judge_openai_responses.py  |  1 -
 4 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/eval_protocol/quickstart/llm_judge_braintrust.py b/eval_protocol/quickstart/llm_judge_braintrust.py
index 61ce6c7d..91bce9cf 100644
--- a/eval_protocol/quickstart/llm_judge_braintrust.py
+++ b/eval_protocol/quickstart/llm_judge_braintrust.py
@@ -6,6 +6,10 @@
 
 import pytest
 
+# Skip entire module in CI to prevent import-time side effects
+if os.environ.get("CI") == "true":
+    pytest.skip("Skip quickstart in CI", allow_module_level=True)
+
 from eval_protocol import (
     evaluation_test,
     aha_judge,
@@ -16,17 +20,19 @@
     DefaultParameterIdGenerator,
 )
 
-adapter = create_braintrust_adapter()
-input_rows = [
-    adapter.get_evaluation_rows(
-        btql_query=f"""
-    select: *
-    from: project_logs('{os.getenv("BRAINTRUST_PROJECT_ID")}') traces
-    filter: is_root = true
-    limit: 10
-    """
-    )
-]
+# adapter = create_braintrust_adapter()
+# input_rows = [
+#     adapter.get_evaluation_rows(
+#         btql_query=f"""
+#     select: *
+#     from: project_logs('{os.getenv("BRAINTRUST_PROJECT_ID")}') traces
+#     filter: is_root = true
+#     limit: 10
+#     """
+#     )
+# ]
+input_rows = []
+# uncomment when dataloader is fixed
 
 
 @pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI")
@@ -45,10 +51,9 @@
             "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-20b",
         },
     ],
-    ids=DefaultParameterIdGenerator.generate_id_from_dict,
 )
 @evaluation_test(
-    input_rows=input_rows,
+    input_rows=[input_rows],
     rollout_processor=SingleTurnRolloutProcessor(),
     preprocess_fn=multi_turn_assistant_to_ground_truth,
     max_concurrent_evaluations=2,
diff --git a/eval_protocol/quickstart/llm_judge_langfuse.py b/eval_protocol/quickstart/llm_judge_langfuse.py
index d1dc725c..a8e92c05 100644
--- a/eval_protocol/quickstart/llm_judge_langfuse.py
+++ b/eval_protocol/quickstart/llm_judge_langfuse.py
@@ -45,7 +45,6 @@
             "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-20b",
         },
     ],
-    ids=DefaultParameterIdGenerator.generate_id_from_dict,
 )
 @evaluation_test(
     input_rows=[input_rows],
diff --git a/eval_protocol/quickstart/llm_judge_langsmith.py b/eval_protocol/quickstart/llm_judge_langsmith.py
index 4531a3c4..f62fdb28 100644
--- a/eval_protocol/quickstart/llm_judge_langsmith.py
+++ b/eval_protocol/quickstart/llm_judge_langsmith.py
@@ -70,7 +70,6 @@ def fetch_langsmith_traces_as_evaluation_rows(
             "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
         },
     ],
-    ids=DefaultParameterIdGenerator.generate_id_from_dict,
 )
 @evaluation_test(
     input_rows=[input_rows],
diff --git a/eval_protocol/quickstart/llm_judge_openai_responses.py b/eval_protocol/quickstart/llm_judge_openai_responses.py
index 9d5b1a7e..a30feee0 100644
--- a/eval_protocol/quickstart/llm_judge_openai_responses.py
+++ b/eval_protocol/quickstart/llm_judge_openai_responses.py
@@ -51,7 +51,6 @@
             "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
         },
     ],
-    ids=DefaultParameterIdGenerator.generate_id_from_dict,
 )
 @evaluation_test(
     input_rows=[input_rows],

From 824a998bcb9f5ad4e2491d22a7d1b42c4ee59ce4 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Mon, 22 Sep 2025 09:48:07 -0700
Subject: [PATCH 10/10] move tqdm to utils file

---
 eval_protocol/pytest/evaluation_test.py | 67 ++------------------
 eval_protocol/pytest/utils.py           | 83 +++++++++++++++++++++++++
 2 files changed, 88 insertions(+), 62 deletions(-)

diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py
index d22c78b5..3a1bac70 100644
--- a/eval_protocol/pytest/evaluation_test.py
+++ b/eval_protocol/pytest/evaluation_test.py
@@ -9,7 +9,6 @@
 from collections.abc import Sequence
 
 import pytest
-from tqdm import tqdm
 
 from eval_protocol.dataset_logger import default_logger
 from eval_protocol.dataset_logger.dataset_logger import DatasetLogger
@@ -58,6 +57,8 @@
     parse_ep_num_runs,
     parse_ep_passed_threshold,
     rollout_processor_with_retry,
+    run_tasks_with_eval_progress,
+    run_tasks_with_run_progress,
 )
 from eval_protocol.utils.show_results_url import show_results_url
 
@@ -380,37 +381,8 @@ async def _execute_groupwise_eval_with_semaphore(
                                     asyncio.create_task(_execute_pointwise_eval_with_semaphore(row=row))
                                 )
 
-                            # Add tqdm progress bar for evaluations with proper cleanup
-                            eval_position = run_idx + 2  # Position after rollout progress bar
-                            with tqdm(
-                                total=len(pointwise_tasks),
-                                desc=f"  Eval {run_idx + 1}",
-                                unit="eval",
-                                file=sys.__stderr__,
-                                leave=False,
-                                position=eval_position,
-                                dynamic_ncols=True,
-                                miniters=1,
-                                mininterval=0.1,
-                                bar_format="{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]",
-                            ) as eval_pbar:
-
-                                async def task_with_progress(task):
-                                    try:
-                                        result = await task
-                                        return result
-                                    finally:
-                                        eval_pbar.update(1)
-
-                                wrapped_tasks = [task_with_progress(task) for task in pointwise_tasks]
-                                try:
-                                    results = await asyncio.gather(*wrapped_tasks)
-                                except Exception:
-                                    # Propagate cancellation to the real tasks and await them to quiesce
-                                    for task in pointwise_tasks:
-                                        task.cancel()
-                                    await asyncio.gather(*pointwise_tasks, return_exceptions=True)
-                                    raise
+                            # Run evaluation tasks with progress bar
+                            results = await run_tasks_with_eval_progress(pointwise_tasks, run_idx)
 
                             all_results[run_idx] = results
                         elif mode == "groupwise":
@@ -528,36 +500,7 @@ async def _collect_result(config, lst):  # pyright: ignore[reportUnknownParamete
                     else:
                         # For other processors, create all tasks at once and run in parallel
                         # Concurrency is now controlled by the shared semaphore in each rollout processor
-                        with tqdm(
-                            total=num_runs,
-                            desc="Runs (Parallel)",
-                            unit="run",
-                            file=sys.__stderr__,
-                            position=0,
-                            leave=True,
-                            dynamic_ncols=True,
-                            miniters=1,
-                            bar_format="{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]",
-                        ) as run_pbar:
-
-                            async def execute_run_with_progress(run_idx: int, config):
-                                try:
-                                    result = await execute_run(run_idx, config)
-                                    return result
-                                finally:
-                                    run_pbar.update(1)
-
-                            tasks = []
-                            for run_idx in range(num_runs):
-                                tasks.append(asyncio.create_task(execute_run_with_progress(run_idx, config)))
-                            try:
-                                await asyncio.gather(*tasks)
-                            except Exception:
-                                # Propagate cancellation to tasks and await them to quiesce
-                                for task in tasks:
-                                    task.cancel()
-                                await asyncio.gather(*tasks, return_exceptions=True)
-                                raise
+                        await run_tasks_with_run_progress(execute_run, num_runs, config)
 
                     experiment_duration_seconds = time.perf_counter() - experiment_start_time
 
diff --git a/eval_protocol/pytest/utils.py b/eval_protocol/pytest/utils.py
index aa98ecf3..ce15cd19 100644
--- a/eval_protocol/pytest/utils.py
+++ b/eval_protocol/pytest/utils.py
@@ -33,6 +33,89 @@
 AggregationMethod = Literal["mean", "max", "min", "bootstrap"]
 
 
+async def run_tasks_with_eval_progress(pointwise_tasks: list, run_idx: int):
+    """
+    Run evaluation tasks with a progress bar and proper cancellation handling.
+
+    Args:
+        pointwise_tasks: List of asyncio tasks to execute
+        run_idx: Run index for progress bar positioning and naming
+
+    Returns:
+        Results from all tasks
+    """
+    eval_position = run_idx + 2  # Position after rollout progress bar
+    with tqdm(
+        total=len(pointwise_tasks),
+        desc=f"  Eval {run_idx + 1}",
+        unit="eval",
+        file=sys.__stderr__,
+        leave=False,
+        position=eval_position,
+        dynamic_ncols=True,
+        miniters=1,
+        mininterval=0.1,
+        bar_format="{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]",
+    ) as eval_pbar:
+
+        async def task_with_progress(task):
+            try:
+                result = await task
+                return result
+            finally:
+                eval_pbar.update(1)
+
+        wrapped_tasks = [task_with_progress(task) for task in pointwise_tasks]
+        try:
+            results = await asyncio.gather(*wrapped_tasks)
+            return results
+        except Exception:
+            # Propagate cancellation to the real tasks and await them to quiesce
+            for task in pointwise_tasks:
+                task.cancel()
+            await asyncio.gather(*pointwise_tasks, return_exceptions=True)
+            raise
+
+
+async def run_tasks_with_run_progress(execute_run_func, num_runs, config):
+    """
+    Run tasks with a parallel runs progress bar, preserving original logic.
+
+    Args:
+        execute_run_func: The execute_run function to call
+        num_runs: Number of runs to execute
+        config: Configuration to pass to execute_run_func
+    """
+    with tqdm(
+        total=num_runs,
+        desc="Runs (Parallel)",
+        unit="run",
+        file=sys.__stderr__,
+        position=0,
+        leave=True,
+        dynamic_ncols=True,
+        miniters=1,
+        bar_format="{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]",
+    ) as run_pbar:
+
+        async def execute_run_with_progress(run_idx: int, config):
+            result = await execute_run_func(run_idx, config)
+            run_pbar.update(1)
+            return result
+
+        tasks = []
+        for run_idx in range(num_runs):
+            tasks.append(asyncio.create_task(execute_run_with_progress(run_idx, config)))
+        try:
+            await asyncio.gather(*tasks)
+        except Exception:
+            # Propagate cancellation to tasks and await them to quiesce
+            for task in tasks:
+                task.cancel()
+            await asyncio.gather(*tasks, return_exceptions=True)
+            raise
+
+
 def calculate_bootstrap_scores(all_scores: list[float]) -> float:
     """
     Calculate bootstrap confidence intervals for individual scores.