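Fix tests: remove the dataset_adapter argument and its helper imports from the math-format-length and word-count pytest examples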

benjibc · benjibc · commit ca2793fab447 · 2025-10-28T12:19:31.000-07:00
diff --git a/tests/pytest/test_pytest_math_format_length.py b/tests/pytest/test_pytest_math_format_length.py
@@ -5,12 +5,10 @@
 from eval_protocol.rewards.length import count_tokens
 from eval_protocol.rewards.math import math_reward
 from examples.math_with_format_and_length.main import check_think_answer_format
-from tests.pytest.helper.gsm8k_to_evaluation_row import gsm8k_to_evaluation_row
 
 
 @evaluation_test(
     input_dataset=["development/gsm8k_sample.jsonl"],
-    dataset_adapter=gsm8k_to_evaluation_row,
     completion_params=[{"temperature": 0.0, "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}],
     max_dataset_rows=5,
     passed_threshold=0.0,
diff --git a/tests/pytest/test_pytest_word_count_example.py b/tests/pytest/test_pytest_word_count_example.py
@@ -2,12 +2,10 @@
 
 from eval_protocol.models import EvaluateResult, EvaluationRow, MetricResult
 from eval_protocol.pytest import SingleTurnRolloutProcessor, evaluation_test
-from tests.pytest.helper.word_count_to_evaluation_row import word_count_to_evaluation_row
 
 
 @evaluation_test(
     input_dataset=["development/gsm8k_sample.jsonl"],
-    dataset_adapter=word_count_to_evaluation_row,
     completion_params=[{"temperature": 0.0, "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}],
     max_dataset_rows=5,
     passed_threshold=0.3,  # Reasonable threshold for word count evaluation