From 2ab0b5c42a64b72d0c748c27cae0a7e1823cf5a1 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Sun, 21 Sep 2025 22:45:37 -0700 Subject: [PATCH 01/10] Update quickstart --- eval_protocol/pytest/evaluation_test.py | 50 +++++++++++++++++-- eval_protocol/pytest/utils.py | 2 +- .../quickstart/llm_judge_braintrust.py | 34 +++++++------ .../quickstart/llm_judge_langfuse.py | 30 ++++++----- .../quickstart/llm_judge_langsmith.py | 16 ++++-- .../quickstart/llm_judge_openai_responses.py | 1 - 6 files changed, 93 insertions(+), 40 deletions(-) diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index 2e1254d6..d22c78b5 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -379,7 +379,38 @@ async def _execute_groupwise_eval_with_semaphore( pointwise_tasks.append( asyncio.create_task(_execute_pointwise_eval_with_semaphore(row=row)) ) - results = await asyncio.gather(*pointwise_tasks) + + # Add tqdm progress bar for evaluations with proper cleanup + eval_position = run_idx + 2 # Position after rollout progress bar + with tqdm( + total=len(pointwise_tasks), + desc=f" Eval {run_idx + 1}", + unit="eval", + file=sys.__stderr__, + leave=False, + position=eval_position, + dynamic_ncols=True, + miniters=1, + mininterval=0.1, + bar_format="{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]", + ) as eval_pbar: + + async def task_with_progress(task): + try: + result = await task + return result + finally: + eval_pbar.update(1) + + wrapped_tasks = [task_with_progress(task) for task in pointwise_tasks] + try: + results = await asyncio.gather(*wrapped_tasks) + except Exception: + # Propagate cancellation to the real tasks and await them to quiesce + for task in pointwise_tasks: + task.cancel() + await asyncio.gather(*pointwise_tasks, return_exceptions=True) + raise all_results[run_idx] = results elif mode == "groupwise": @@ -510,14 +541,23 @@ async def _collect_result(config, lst): # pyright: ignore[reportUnknownParamete ) as run_pbar: async def execute_run_with_progress(run_idx: int, config): - result = await execute_run(run_idx, config) - run_pbar.update(1) - return result + try: + result = await execute_run(run_idx, config) + return result + finally: + run_pbar.update(1) tasks = [] for run_idx in range(num_runs): tasks.append(asyncio.create_task(execute_run_with_progress(run_idx, config))) - await asyncio.gather(*tasks) # pyright: ignore[reportUnknownArgumentType] + try: + await asyncio.gather(*tasks) + except Exception: + # Propagate cancellation to tasks and await them to quiesce + for task in tasks: + task.cancel() + await asyncio.gather(*tasks, return_exceptions=True) + raise experiment_duration_seconds = time.perf_counter() - experiment_start_time diff --git a/eval_protocol/pytest/utils.py b/eval_protocol/pytest/utils.py index 0b0c6bc7..aa98ecf3 100644 --- a/eval_protocol/pytest/utils.py +++ b/eval_protocol/pytest/utils.py @@ -277,7 +277,7 @@ async def execute_row_with_backoff_and_log(task: asyncio.Task, row: EvaluationRo position = run_idx + 1 # Position 0 is reserved for main run bar, so shift up by 1 with tqdm( total=len(retry_tasks), - desc=f" Run {position}", + desc=f" Run {run_idx + 1}", unit="rollout", file=sys.__stderr__, leave=False, diff --git a/eval_protocol/quickstart/llm_judge_braintrust.py b/eval_protocol/quickstart/llm_judge_braintrust.py index 65bde54c..56c99d5b 100644 --- a/eval_protocol/quickstart/llm_judge_braintrust.py +++ b/eval_protocol/quickstart/llm_judge_braintrust.py @@ -13,25 +13,26 @@ EvaluationRow, SingleTurnRolloutProcessor, create_braintrust_adapter, + DefaultParameterIdGenerator, ) + # adapter = create_braintrust_adapter() +input_rows = [ + # adapter.get_evaluation_rows( + # btql_query=f""" + # select: * + # from: project_logs('{os.getenv("BRAINTRUST_PROJECT_ID")}') traces + # filter: is_root = true + # limit: 10 + # """ + # ) +] @pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI") -@pytest.mark.asyncio -@evaluation_test( - input_rows=[ - # adapter.get_evaluation_rows( - # btql_query=f""" - # select: * - # from: project_logs('{os.getenv("BRAINTRUST_PROJECT_ID")}') traces - # filter: is_root = true - # limit: 10 - # """ - # ) - [] - ], - completion_params=[ +@pytest.mark.parametrize( + "completion_params", + [ {"model": "gpt-4.1"}, { "max_tokens": 131000, @@ -44,10 +45,13 @@ "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-20b", }, ], + ids=DefaultParameterIdGenerator.generate_id_from_dict, +) +@evaluation_test( + input_rows=input_rows, rollout_processor=SingleTurnRolloutProcessor(), preprocess_fn=multi_turn_assistant_to_ground_truth, max_concurrent_rollouts=64, - aggregation_method="bootstrap", ) async def test_llm_judge(row: EvaluationRow) -> EvaluationRow: return await aha_judge(row) diff --git a/eval_protocol/quickstart/llm_judge_langfuse.py b/eval_protocol/quickstart/llm_judge_langfuse.py index bfd19b0d..c8dbedac 100644 --- a/eval_protocol/quickstart/llm_judge_langfuse.py +++ b/eval_protocol/quickstart/llm_judge_langfuse.py @@ -14,25 +14,25 @@ EvaluationRow, SingleTurnRolloutProcessor, create_langfuse_adapter, + DefaultParameterIdGenerator, ) from eval_protocol.quickstart import aha_judge adapter = create_langfuse_adapter() +input_rows = adapter.get_evaluation_rows( + to_timestamp=datetime(2025, 9, 12, 0, 11, 18), + limit=1, + sample_size=1, + sleep_between_gets=3.0, + max_retries=5, +) @pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI") -@evaluation_test( - input_rows=[ - adapter.get_evaluation_rows( - to_timestamp=datetime(2025, 9, 12, 0, 11, 18), - limit=711, - sample_size=50, - sleep_between_gets=3.0, - max_retries=5, - ) - ], - completion_params=[ +@pytest.mark.parametrize( + "completion_params", + [ {"model": "gpt-4.1"}, { "max_tokens": 131000, @@ -45,10 +45,14 @@ "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-20b", }, ], + ids=DefaultParameterIdGenerator.generate_id_from_dict, +) +@evaluation_test( + input_rows=[input_rows], rollout_processor=SingleTurnRolloutProcessor(), preprocess_fn=multi_turn_assistant_to_ground_truth, - max_concurrent_rollouts=2, - aggregation_method="bootstrap", + max_concurrent_rollouts=64, + max_concurrent_evaluations=2, ) async def test_llm_judge(row: EvaluationRow) -> EvaluationRow: return await aha_judge(row) diff --git a/eval_protocol/quickstart/llm_judge_langsmith.py b/eval_protocol/quickstart/llm_judge_langsmith.py index b8850ee0..cd19a738 100644 --- a/eval_protocol/quickstart/llm_judge_langsmith.py +++ b/eval_protocol/quickstart/llm_judge_langsmith.py @@ -31,6 +31,7 @@ EvaluationRow, SingleTurnRolloutProcessor, LangSmithAdapter, + DefaultParameterIdGenerator, ) @@ -53,11 +54,13 @@ def fetch_langsmith_traces_as_evaluation_rows( return [] +input_rows = fetch_langsmith_traces_as_evaluation_rows() + + @pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI") -@pytest.mark.asyncio -@evaluation_test( - input_rows=[fetch_langsmith_traces_as_evaluation_rows()], - completion_params=[ +@pytest.mark.parametrize( + "completion_params", + [ { "model": "fireworks_ai/accounts/fireworks/models/qwen3-235b-a22b-instruct-2507", }, @@ -67,9 +70,12 @@ def fetch_langsmith_traces_as_evaluation_rows( "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b", }, ], + ids=DefaultParameterIdGenerator.generate_id_from_dict, +) +@evaluation_test( + input_rows=[input_rows], rollout_processor=SingleTurnRolloutProcessor(), preprocess_fn=multi_turn_assistant_to_ground_truth, - aggregation_method="bootstrap", ) async def test_llm_judge_langsmith(row: EvaluationRow) -> EvaluationRow: """LLM Judge evaluation over LangSmith-sourced rows, persisted locally by Eval Protocol. diff --git a/eval_protocol/quickstart/llm_judge_openai_responses.py b/eval_protocol/quickstart/llm_judge_openai_responses.py index e096c759..8a81b9cc 100644 --- a/eval_protocol/quickstart/llm_judge_openai_responses.py +++ b/eval_protocol/quickstart/llm_judge_openai_responses.py @@ -57,7 +57,6 @@ input_rows=[input_rows], rollout_processor=SingleTurnRolloutProcessor(), preprocess_fn=multi_turn_assistant_to_ground_truth, - aggregation_method="bootstrap", ) async def test_llm_judge_openai_responses(row: EvaluationRow) -> EvaluationRow: return await aha_judge(row) From c952929cdb681588f482fb5e7b473020fa0f0e28 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Mon, 22 Sep 2025 00:28:33 -0700 Subject: [PATCH 02/10] update --- eval_protocol/quickstart/llm_judge.py | 13 ++++++++----- eval_protocol/quickstart/llm_judge_langfuse.py | 6 +++--- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/eval_protocol/quickstart/llm_judge.py b/eval_protocol/quickstart/llm_judge.py index e9621658..a5225857 100644 --- a/eval_protocol/quickstart/llm_judge.py +++ b/eval_protocol/quickstart/llm_judge.py @@ -47,11 +47,14 @@ async def aha_judge( model_a_answer = str(row.ground_truth) model_b_answer = serialize_message(row.messages[-1]) - client = AsyncOpenAI(api_key=judge_config.get("api_key"), base_url=judge_config.get("base_url")) - - # Run two judgment rounds in sequence (A vs B, then B vs A) - result1 = await run_single_judgment(question_text, model_a_answer, model_b_answer, row.tools, judge_config, client) - result2 = await run_single_judgment(question_text, model_b_answer, model_a_answer, row.tools, judge_config, client) + async with AsyncOpenAI(api_key=judge_config.get("api_key"), base_url=judge_config.get("base_url")) as client: + # Run two judgment rounds in sequence (A vs B, then B vs A) + result1 = await run_single_judgment( + question_text, model_a_answer, model_b_answer, row.tools, judge_config, client + ) + result2 = await run_single_judgment( + question_text, model_b_answer, model_a_answer, row.tools, judge_config, client + ) if not result1 or not result2 or not result1.get("score") or not result2.get("score"): # If either judgment failed, mark as invalid (don't include in distribution) diff --git a/eval_protocol/quickstart/llm_judge_langfuse.py b/eval_protocol/quickstart/llm_judge_langfuse.py index c8dbedac..21fbc849 100644 --- a/eval_protocol/quickstart/llm_judge_langfuse.py +++ b/eval_protocol/quickstart/llm_judge_langfuse.py @@ -22,8 +22,8 @@ adapter = create_langfuse_adapter() input_rows = adapter.get_evaluation_rows( to_timestamp=datetime(2025, 9, 12, 0, 11, 18), - limit=1, - sample_size=1, + limit=711, + sample_size=50, sleep_between_gets=3.0, max_retries=5, ) @@ -52,7 +52,7 @@ rollout_processor=SingleTurnRolloutProcessor(), preprocess_fn=multi_turn_assistant_to_ground_truth, max_concurrent_rollouts=64, - max_concurrent_evaluations=2, + max_concurrent_evaluations=16, ) async def test_llm_judge(row: EvaluationRow) -> EvaluationRow: return await aha_judge(row) From 3adfc104e277395c7b3510b937d6f4cd74918f4e Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Mon, 22 Sep 2025 00:29:52 -0700 Subject: [PATCH 03/10] update --- eval_protocol/quickstart/llm_judge_braintrust.py | 2 +- eval_protocol/quickstart/llm_judge_langfuse.py | 3 +-- eval_protocol/quickstart/llm_judge_langsmith.py | 1 + eval_protocol/quickstart/llm_judge_openai_responses.py | 1 + 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/eval_protocol/quickstart/llm_judge_braintrust.py b/eval_protocol/quickstart/llm_judge_braintrust.py index 56c99d5b..4a4b665b 100644 --- a/eval_protocol/quickstart/llm_judge_braintrust.py +++ b/eval_protocol/quickstart/llm_judge_braintrust.py @@ -51,7 +51,7 @@ input_rows=input_rows, rollout_processor=SingleTurnRolloutProcessor(), preprocess_fn=multi_turn_assistant_to_ground_truth, - max_concurrent_rollouts=64, + max_concurrent_evaluations=2, ) async def test_llm_judge(row: EvaluationRow) -> EvaluationRow: return await aha_judge(row) diff --git a/eval_protocol/quickstart/llm_judge_langfuse.py b/eval_protocol/quickstart/llm_judge_langfuse.py index 21fbc849..d1dc725c 100644 --- a/eval_protocol/quickstart/llm_judge_langfuse.py +++ b/eval_protocol/quickstart/llm_judge_langfuse.py @@ -51,8 +51,7 @@ input_rows=[input_rows], rollout_processor=SingleTurnRolloutProcessor(), preprocess_fn=multi_turn_assistant_to_ground_truth, - max_concurrent_rollouts=64, - max_concurrent_evaluations=16, + max_concurrent_evaluations=2, ) async def test_llm_judge(row: EvaluationRow) -> EvaluationRow: return await aha_judge(row) diff --git a/eval_protocol/quickstart/llm_judge_langsmith.py b/eval_protocol/quickstart/llm_judge_langsmith.py index cd19a738..4531a3c4 100644 --- a/eval_protocol/quickstart/llm_judge_langsmith.py +++ b/eval_protocol/quickstart/llm_judge_langsmith.py @@ -76,6 +76,7 @@ def fetch_langsmith_traces_as_evaluation_rows( input_rows=[input_rows], rollout_processor=SingleTurnRolloutProcessor(), preprocess_fn=multi_turn_assistant_to_ground_truth, + max_concurrent_evaluations=2, ) async def test_llm_judge_langsmith(row: EvaluationRow) -> EvaluationRow: """LLM Judge evaluation over LangSmith-sourced rows, persisted locally by Eval Protocol. diff --git a/eval_protocol/quickstart/llm_judge_openai_responses.py b/eval_protocol/quickstart/llm_judge_openai_responses.py index 8a81b9cc..9d5b1a7e 100644 --- a/eval_protocol/quickstart/llm_judge_openai_responses.py +++ b/eval_protocol/quickstart/llm_judge_openai_responses.py @@ -57,6 +57,7 @@ input_rows=[input_rows], rollout_processor=SingleTurnRolloutProcessor(), preprocess_fn=multi_turn_assistant_to_ground_truth, + max_concurrent_evaluations=2, ) async def test_llm_judge_openai_responses(row: EvaluationRow) -> EvaluationRow: return await aha_judge(row) From 311d31b89f5258d2ba68cbf50258f947056d1447 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Mon, 22 Sep 2025 00:39:40 -0700 Subject: [PATCH 04/10] update braintrust --- .../quickstart/llm_judge_braintrust.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/eval_protocol/quickstart/llm_judge_braintrust.py b/eval_protocol/quickstart/llm_judge_braintrust.py index 4a4b665b..61ce6c7d 100644 --- a/eval_protocol/quickstart/llm_judge_braintrust.py +++ b/eval_protocol/quickstart/llm_judge_braintrust.py @@ -16,16 +16,16 @@ DefaultParameterIdGenerator, ) -# adapter = create_braintrust_adapter() +adapter = create_braintrust_adapter() input_rows = [ - # adapter.get_evaluation_rows( - # btql_query=f""" - # select: * - # from: project_logs('{os.getenv("BRAINTRUST_PROJECT_ID")}') traces - # filter: is_root = true - # limit: 10 - # """ - # ) + adapter.get_evaluation_rows( + btql_query=f""" + select: * + from: project_logs('{os.getenv("BRAINTRUST_PROJECT_ID")}') traces + filter: is_root = true + limit: 10 + """ + ) ] From 94991cda82a89cd91336811528619f892cf20249 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Mon, 22 Sep 2025 00:51:38 -0700 Subject: [PATCH 05/10] test --- .github/workflows/ci.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 03e836f9..977ec775 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -70,6 +70,12 @@ jobs: with: fetch-depth: 0 # Fetch all history for all tags and branches + - name: Check CI environment variable + run: | + echo "CI=$CI" + echo "CI (from env)=$(env | grep '^CI=')" + python -c "import os; print('CI (python):', repr(os.getenv('CI')))" + - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 with: From 9ea0ac81f293eb5743482ef19feb66ee91fa3eeb Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Mon, 22 Sep 2025 00:56:59 -0700 Subject: [PATCH 06/10] remove --- .github/workflows/ci.yml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 977ec775..03e836f9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -70,12 +70,6 @@ jobs: with: fetch-depth: 0 # Fetch all history for all tags and branches - - name: Check CI environment variable - run: | - echo "CI=$CI" - echo "CI (from env)=$(env | grep '^CI=')" - python -c "import os; print('CI (python):', repr(os.getenv('CI')))" - - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 with: From fd12a2ab046b5212408bf3c0eb2ed9a01c120070 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Mon, 22 Sep 2025 01:03:22 -0700 Subject: [PATCH 07/10] add new braintrust key --- .github/workflows/ci.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 03e836f9..91723565 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -92,6 +92,8 @@ jobs: E2B_API_KEY: ${{ secrets.E2B_API_KEY }} FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }} FIREWORKS_ACCOUNT_ID: ${{ secrets.FIREWORKS_ACCOUNT_ID }} + BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }} + BRAINTRUST_PROJECT_ID: ${{ secrets.BRAINTRUST_PROJECT_ID }} SUPABASE_PASSWORD: ${{ secrets.SUPABASE_PASSWORD }} SUPABASE_HOST: ${{ secrets.SUPABASE_HOST }} SUPABASE_PORT: ${{ secrets.SUPABASE_PORT }} From a4d4099f22e724f516e83b6b53673ad56caf2bfd Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Mon, 22 Sep 2025 01:13:45 -0700 Subject: [PATCH 08/10] remove braintrust keys --- .github/workflows/ci.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 91723565..03e836f9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -92,8 +92,6 @@ jobs: E2B_API_KEY: ${{ secrets.E2B_API_KEY }} FIREWORKS_API_KEY: ${{ secrets.FIREWORKS_API_KEY }} FIREWORKS_ACCOUNT_ID: ${{ secrets.FIREWORKS_ACCOUNT_ID }} - BRAINTRUST_API_KEY: ${{ secrets.BRAINTRUST_API_KEY }} - BRAINTRUST_PROJECT_ID: ${{ secrets.BRAINTRUST_PROJECT_ID }} SUPABASE_PASSWORD: ${{ secrets.SUPABASE_PASSWORD }} SUPABASE_HOST: ${{ secrets.SUPABASE_HOST }} SUPABASE_PORT: ${{ secrets.SUPABASE_PORT }} From 9200ecd167738273d0e769dd31dd1b0b5d8cec9a Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Mon, 22 Sep 2025 09:07:59 -0700 Subject: [PATCH 09/10] remove id stuff and comment out braintrust --- .../quickstart/llm_judge_braintrust.py | 31 +++++++++++-------- .../quickstart/llm_judge_langfuse.py | 1 - .../quickstart/llm_judge_langsmith.py | 1 - .../quickstart/llm_judge_openai_responses.py | 1 - 4 files changed, 18 insertions(+), 16 deletions(-) diff --git a/eval_protocol/quickstart/llm_judge_braintrust.py b/eval_protocol/quickstart/llm_judge_braintrust.py index 61ce6c7d..91bce9cf 100644 --- a/eval_protocol/quickstart/llm_judge_braintrust.py +++ b/eval_protocol/quickstart/llm_judge_braintrust.py @@ -6,6 +6,10 @@ import pytest +# Skip entire module in CI to prevent import-time side effects +if os.environ.get("CI") == "true": + pytest.skip("Skip quickstart in CI", allow_module_level=True) + from eval_protocol import ( evaluation_test, aha_judge, @@ -16,17 +20,19 @@ DefaultParameterIdGenerator, ) -adapter = create_braintrust_adapter() -input_rows = [ - adapter.get_evaluation_rows( - btql_query=f""" - select: * - from: project_logs('{os.getenv("BRAINTRUST_PROJECT_ID")}') traces - filter: is_root = true - limit: 10 - """ - ) -] +# adapter = create_braintrust_adapter() +# input_rows = [ +# adapter.get_evaluation_rows( +# btql_query=f""" +# select: * +# from: project_logs('{os.getenv("BRAINTRUST_PROJECT_ID")}') traces +# filter: is_root = true +# limit: 10 +# """ +# ) +# ] +input_rows = [] +# uncomment when dataloader is fixed @pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI") @@ -45,10 +51,9 @@ "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-20b", }, ], - ids=DefaultParameterIdGenerator.generate_id_from_dict, ) @evaluation_test( - input_rows=input_rows, + input_rows=[input_rows], rollout_processor=SingleTurnRolloutProcessor(), preprocess_fn=multi_turn_assistant_to_ground_truth, max_concurrent_evaluations=2, diff --git a/eval_protocol/quickstart/llm_judge_langfuse.py b/eval_protocol/quickstart/llm_judge_langfuse.py index d1dc725c..a8e92c05 100644 --- a/eval_protocol/quickstart/llm_judge_langfuse.py +++ b/eval_protocol/quickstart/llm_judge_langfuse.py @@ -45,7 +45,6 @@ "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-20b", }, ], - ids=DefaultParameterIdGenerator.generate_id_from_dict, ) @evaluation_test( input_rows=[input_rows], diff --git a/eval_protocol/quickstart/llm_judge_langsmith.py b/eval_protocol/quickstart/llm_judge_langsmith.py index 4531a3c4..f62fdb28 100644 --- a/eval_protocol/quickstart/llm_judge_langsmith.py +++ b/eval_protocol/quickstart/llm_judge_langsmith.py @@ -70,7 +70,6 @@ def fetch_langsmith_traces_as_evaluation_rows( "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b", }, ], - ids=DefaultParameterIdGenerator.generate_id_from_dict, ) @evaluation_test( input_rows=[input_rows], diff --git a/eval_protocol/quickstart/llm_judge_openai_responses.py b/eval_protocol/quickstart/llm_judge_openai_responses.py index 9d5b1a7e..a30feee0 100644 --- a/eval_protocol/quickstart/llm_judge_openai_responses.py +++ b/eval_protocol/quickstart/llm_judge_openai_responses.py @@ -51,7 +51,6 @@ "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905", }, ], - ids=DefaultParameterIdGenerator.generate_id_from_dict, ) @evaluation_test( input_rows=[input_rows], From 824a998bcb9f5ad4e2491d22a7d1b42c4ee59ce4 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Mon, 22 Sep 2025 09:48:07 -0700 Subject: [PATCH 10/10] move tqdm to utils file --- eval_protocol/pytest/evaluation_test.py | 67 ++------------------ eval_protocol/pytest/utils.py | 83 +++++++++++++++++++++++++ 2 files changed, 88 insertions(+), 62 deletions(-) diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index d22c78b5..3a1bac70 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -9,7 +9,6 @@ from collections.abc import Sequence import pytest -from tqdm import tqdm from eval_protocol.dataset_logger import default_logger from eval_protocol.dataset_logger.dataset_logger import DatasetLogger @@ -58,6 +57,8 @@ parse_ep_num_runs, parse_ep_passed_threshold, rollout_processor_with_retry, + run_tasks_with_eval_progress, + run_tasks_with_run_progress, ) from eval_protocol.utils.show_results_url import show_results_url @@ -380,37 +381,8 @@ async def _execute_groupwise_eval_with_semaphore( asyncio.create_task(_execute_pointwise_eval_with_semaphore(row=row)) ) - # Add tqdm progress bar for evaluations with proper cleanup - eval_position = run_idx + 2 # Position after rollout progress bar - with tqdm( - total=len(pointwise_tasks), - desc=f" Eval {run_idx + 1}", - unit="eval", - file=sys.__stderr__, - leave=False, - position=eval_position, - dynamic_ncols=True, - miniters=1, - mininterval=0.1, - bar_format="{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]", - ) as eval_pbar: - - async def task_with_progress(task): - try: - result = await task - return result - finally: - eval_pbar.update(1) - - wrapped_tasks = [task_with_progress(task) for task in pointwise_tasks] - try: - results = await asyncio.gather(*wrapped_tasks) - except Exception: - # Propagate cancellation to the real tasks and await them to quiesce - for task in pointwise_tasks: - task.cancel() - await asyncio.gather(*pointwise_tasks, return_exceptions=True) - raise + # Run evaluation tasks with progress bar + results = await run_tasks_with_eval_progress(pointwise_tasks, run_idx) all_results[run_idx] = results elif mode == "groupwise": @@ -528,36 +500,7 @@ async def _collect_result(config, lst): # pyright: ignore[reportUnknownParamete else: # For other processors, create all tasks at once and run in parallel # Concurrency is now controlled by the shared semaphore in each rollout processor - with tqdm( - total=num_runs, - desc="Runs (Parallel)", - unit="run", - file=sys.__stderr__, - position=0, - leave=True, - dynamic_ncols=True, - miniters=1, - bar_format="{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]", - ) as run_pbar: - - async def execute_run_with_progress(run_idx: int, config): - try: - result = await execute_run(run_idx, config) - return result - finally: - run_pbar.update(1) - - tasks = [] - for run_idx in range(num_runs): - tasks.append(asyncio.create_task(execute_run_with_progress(run_idx, config))) - try: - await asyncio.gather(*tasks) - except Exception: - # Propagate cancellation to tasks and await them to quiesce - for task in tasks: - task.cancel() - await asyncio.gather(*tasks, return_exceptions=True) - raise + await run_tasks_with_run_progress(execute_run, num_runs, config) experiment_duration_seconds = time.perf_counter() - experiment_start_time diff --git a/eval_protocol/pytest/utils.py b/eval_protocol/pytest/utils.py index aa98ecf3..ce15cd19 100644 --- a/eval_protocol/pytest/utils.py +++ b/eval_protocol/pytest/utils.py @@ -33,6 +33,89 @@ AggregationMethod = Literal["mean", "max", "min", "bootstrap"] +async def run_tasks_with_eval_progress(pointwise_tasks: list, run_idx: int): + """ + Run evaluation tasks with a progress bar and proper cancellation handling. + + Args: + pointwise_tasks: List of asyncio tasks to execute + run_idx: Run index for progress bar positioning and naming + + Returns: + Results from all tasks + """ + eval_position = run_idx + 2 # Position after rollout progress bar + with tqdm( + total=len(pointwise_tasks), + desc=f" Eval {run_idx + 1}", + unit="eval", + file=sys.__stderr__, + leave=False, + position=eval_position, + dynamic_ncols=True, + miniters=1, + mininterval=0.1, + bar_format="{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]", + ) as eval_pbar: + + async def task_with_progress(task): + try: + result = await task + return result + finally: + eval_pbar.update(1) + + wrapped_tasks = [task_with_progress(task) for task in pointwise_tasks] + try: + results = await asyncio.gather(*wrapped_tasks) + return results + except Exception: + # Propagate cancellation to the real tasks and await them to quiesce + for task in pointwise_tasks: + task.cancel() + await asyncio.gather(*pointwise_tasks, return_exceptions=True) + raise + + +async def run_tasks_with_run_progress(execute_run_func, num_runs, config): + """ + Run tasks with a parallel runs progress bar, preserving original logic. + + Args: + execute_run_func: The execute_run function to call + num_runs: Number of runs to execute + config: Configuration to pass to execute_run_func + """ + with tqdm( + total=num_runs, + desc="Runs (Parallel)", + unit="run", + file=sys.__stderr__, + position=0, + leave=True, + dynamic_ncols=True, + miniters=1, + bar_format="{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]", + ) as run_pbar: + + async def execute_run_with_progress(run_idx: int, config): + result = await execute_run_func(run_idx, config) + run_pbar.update(1) + return result + + tasks = [] + for run_idx in range(num_runs): + tasks.append(asyncio.create_task(execute_run_with_progress(run_idx, config))) + try: + await asyncio.gather(*tasks) + except Exception: + # Propagate cancellation to tasks and await them to quiesce + for task in tasks: + task.cancel() + await asyncio.gather(*tasks, return_exceptions=True) + raise + + def calculate_bootstrap_scores(all_scores: list[float]) -> float: """ Calculate bootstrap confidence intervals for individual scores.