eval-protocol · benjibc · Oct 30, 2025 · Oct 30, 2025
diff --git a/.gitignore b/.gitignore
@@ -242,3 +242,4 @@ package-lock.json
 package.json
 tau2-bench
 *.err
+eval-protocol
diff --git a/eval_protocol/cli.py b/eval_protocol/cli.py
@@ -402,6 +402,8 @@ def parse_args(args=None):
     rft_parser.add_argument("--evaluation-dataset", help="Optional separate eval dataset id")
     rft_parser.add_argument("--eval-auto-carveout", dest="eval_auto_carveout", action="store_true", default=True)
     rft_parser.add_argument("--no-eval-auto-carveout", dest="eval_auto_carveout", action="store_false")
+    # Rollout chunking: size of each data chunk used when batching rollouts
+    rft_parser.add_argument("--chunk-size", type=int, help="Data chunk size for rollout batching")
     # Inference params
     rft_parser.add_argument("--temperature", type=float)
     rft_parser.add_argument("--top-p", type=float)

diff --git a/eval_protocol/cli_commands/create_rft.py b/eval_protocol/cli_commands/create_rft.py
@@ -379,6 +379,7 @@ def create_rft_command(args) -> int:
         "trainingConfig": training_config,
         "inferenceParameters": inference_params or None,
         "wandbConfig": wandb_config,
+        "chunkSize": getattr(args, "chunk_size", None),
         "outputStats": None,
         "outputMetrics": None,
         "mcpServer": None,

diff --git a/tests/pytest/gsm8k/test_pytest_math_example.py b/tests/pytest/gsm8k/test_pytest_math_example.py
@@ -4,6 +4,9 @@
 import os
 from eval_protocol.data_loader.jsonl_data_loader import EvaluationRowJsonlDataLoader
 from typing import List, Dict, Any, Optional
+import logging
+
+logger = logging.getLogger(__name__)
 
 
 def extract_answer_digits(ground_truth: str) -> Optional[str]:
@@ -54,6 +57,7 @@ def test_math_dataset(row: EvaluationRow, **kwargs) -> EvaluationRow:
         EvaluationRow with the evaluation result
     """
     #### Get predicted answer value
+    logger.info(f"I am beginning to execute GSM8k rollout: {row.execution_metadata.rollout_id}")
     prediction = extract_answer_digits(str(row.messages[2].content))
     gt = extract_answer_digits(str(row.ground_truth))
 
@@ -77,5 +81,6 @@ def test_math_dataset(row: EvaluationRow, **kwargs) -> EvaluationRow:
         is_score_valid=True,  # Optional: Whether the score is valid, true by default
         reason=reason,  # Optional: The reason for the score
     )
+    logger.info(f"I am done executing GSM8k rollout: {row.execution_metadata.rollout_id}")
     row.evaluation_result = evaluation_result
     return row