Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 0 additions & 60 deletions .github/PULL_REQUEST_TEMPLATE.md

This file was deleted.

64 changes: 6 additions & 58 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -53,13 +53,15 @@ jobs:
uv run basedpyright || true

test-core:
name: Core Tests (Python ${{ matrix.python-version }})
name: Core Tests (Python ${{ matrix.python-version }}, Shard ${{ matrix.shard }}/${{ matrix.total-shards }})
runs-on: ubuntu-latest
needs: lint-and-type-check
strategy:
fail-fast: false
matrix:
python-version: ["3.10", "3.11", "3.12"]
shard: [1, 2, 3, 4]
total-shards: [4]

steps:
- uses: actions/checkout@v4
Expand All @@ -82,7 +84,7 @@ jobs:
- name: Install tau2 for testing
run: uv pip install git+https://github.com/sierra-research/tau2-bench.git@main

- name: Run Core Tests with pytest-xdist
- name: Run Core Tests with pytest-xdist (Shard ${{ matrix.shard }}/${{ matrix.total-shards }})
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
E2B_API_KEY: ${{ secrets.E2B_API_KEY }}
Expand All @@ -94,31 +96,7 @@ jobs:
SUPABASE_DATABASE: ${{ secrets.SUPABASE_DATABASE }}
SUPABASE_USER: ${{ secrets.SUPABASE_USER }}
PYTHONWARNINGS: "ignore::DeprecationWarning,ignore::RuntimeWarning"
run: |
# Run most tests in parallel, but explicitly ignore tests that manage their own servers or are slow
uv run pytest \
-n auto \
--ignore=tests/test_batch_evaluation.py \
--ignore=tests/pytest/test_frozen_lake.py \
--ignore=tests/pytest/test_lunar_lander.py \
--ignore=tests/pytest/test_tau_bench_airline.py \
--ignore=tests/pytest/test_apps_coding.py \
--ignore=tests/test_tau_bench_airline_smoke.py \
--ignore=tests/pytest/test_svgbench.py \
--ignore=tests/pytest/test_livesvgbench.py \
--ignore=tests/remote_server/test_remote_fireworks.py \
--ignore=tests/remote_server/test_remote_fireworks_propagate_status.py \
--ignore=tests/logging/test_elasticsearch_direct_http_handler.py \
--ignore=eval_protocol/benchmarks/ \
--ignore=eval_protocol/quickstart/ \
--cov=eval_protocol --cov-append --cov-report=xml --cov-report=term-missing -v --durations=10

- name: Store coverage file
uses: actions/upload-artifact@v4
with:
name: coverage-core-${{ matrix.python-version }}
path: coverage.xml
retention-days: 1
run: uv run ./scripts/run_sharded_tests.sh ${{ matrix.shard }} ${{ matrix.total-shards }}

test-batch-evaluation:
name: Batch Evaluation Tests
Expand Down Expand Up @@ -153,13 +131,7 @@ jobs:
PYTHONWARNINGS: "ignore::DeprecationWarning,ignore::RuntimeWarning"
run: |
# Run only this specific test file, WITHOUT xdist
uv run pytest tests/test_batch_evaluation.py --cov=eval_protocol --cov-append --cov-report=xml -v --durations=10
- name: Store coverage file
uses: actions/upload-artifact@v4
with:
name: coverage-batch-eval
path: coverage.xml
retention-days: 1
uv run pytest tests/test_batch_evaluation.py -v --durations=10

test-mcp-e2e:
name: MCP End-to-End Tests
Expand All @@ -183,27 +155,3 @@ jobs:

- name: Install tau2 for testing
run: uv pip install git+https://github.com/sierra-research/tau2-bench.git@main

- name: Store coverage file
uses: actions/upload-artifact@v4
with:
name: coverage-mcp-e2e
path: coverage.xml
retention-days: 1

upload-coverage:
name: Upload Coverage
runs-on: ubuntu-latest
needs: [test-core, test-batch-evaluation, test-mcp-e2e]
steps:
- name: Download all coverage artifacts
uses: actions/download-artifact@v4
with:
path: coverage-artifacts
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
with:
token: ${{ secrets.CODECOV_TOKEN }}
directory: ./coverage-artifacts/
fail_ci_if_error: false
verbose: true
80 changes: 80 additions & 0 deletions scripts/run_sharded_tests.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
#!/usr/bin/env bash
# Run one shard of the test suite, for parallel CI execution.
#
# Test files are discovered with `find`, sorted for a stable and
# reproducible order, then split into TOTAL_SHARDS contiguous chunks;
# the chunk for SHARD (1-indexed) is run under pytest with xdist.
#
# Usage:   ./scripts/run_sharded_tests.sh <shard> <total_shards> [--dry-run]
# Example: ./scripts/run_sharded_tests.sh 1 4
# Example: ./scripts/run_sharded_tests.sh 1 4 --dry-run

set -euo pipefail

SHARD=${1:-1}
TOTAL_SHARDS=${2:-4}
DRY_RUN=${3:-""}

# Reject out-of-range shards early. The integer comparison also aborts the
# script (under set -e) if a non-numeric shard argument is supplied.
if [ "$SHARD" -lt 1 ] || [ "$SHARD" -gt "$TOTAL_SHARDS" ]; then
    echo "Error: Shard must be between 1 and $TOTAL_SHARDS" >&2
    exit 1
fi

# Collect all test files, excluding tests that manage their own servers or
# are slow; those run in dedicated CI jobs instead of the sharded core job.
TEST_FILES=$(find tests -name "test_*.py" \
    ! -path "tests/test_batch_evaluation.py" \
    ! -path "tests/pytest/test_frozen_lake.py" \
    ! -path "tests/pytest/test_lunar_lander.py" \
    ! -path "tests/pytest/test_tau_bench_airline.py" \
    ! -path "tests/pytest/test_apps_coding.py" \
    ! -path "tests/test_tau_bench_airline_smoke.py" \
    ! -path "tests/pytest/test_svgbench.py" \
    ! -path "tests/pytest/test_livesvgbench.py" \
    ! -path "tests/remote_server/test_remote_fireworks.py" \
    ! -path "tests/remote_server/test_remote_fireworks_propagate_status.py" \
    ! -path "tests/logging/test_elasticsearch_direct_http_handler.py" \
    | sort)

# Count total files. (If find matched nothing, wc -l still reports 1 for
# the lone empty line; that case falls through to the empty-shard exit.)
TOTAL_FILES=$(echo "$TEST_FILES" | wc -l | tr -d ' ')

# Ceiling-divide the files across shards, then derive this shard's
# 1-indexed line range for sed.
FILES_PER_SHARD=$(( (TOTAL_FILES + TOTAL_SHARDS - 1) / TOTAL_SHARDS ))
START_LINE=$(( (SHARD - 1) * FILES_PER_SHARD + 1 ))
END_LINE=$(( START_LINE + FILES_PER_SHARD - 1 ))
if [ "$END_LINE" -gt "$TOTAL_FILES" ]; then
    END_LINE=$TOTAL_FILES
fi

# Slice this shard's files out of the sorted list.
SHARD_FILES=$(echo "$TEST_FILES" | sed -n "${START_LINE},${END_LINE}p")

# Count files in the shard. The previous `grep -c . || echo 0` was broken
# for an empty shard: grep prints its own "0" AND exits nonzero, so the
# fallback echo produced a two-line "0\n0" that is not a valid integer.
if [ -n "$SHARD_FILES" ]; then
    SHARD_COUNT=$(echo "$SHARD_FILES" | wc -l | tr -d ' ')
else
    SHARD_COUNT=0
fi

echo "========================================"
echo "Running shard $SHARD of $TOTAL_SHARDS"
echo "========================================"
echo "Total test files: $TOTAL_FILES"
echo "Files per shard: ~$FILES_PER_SHARD"
echo "Files in this shard: $SHARD_COUNT"
echo "Line range: $START_LINE to $END_LINE"
echo "----------------------------------------"
echo "Files:"
# Use an `if` (not `[ ... ] && ...`) so an empty list does not leave the
# pipeline with a nonzero status under set -e/pipefail.
echo "$SHARD_FILES" | while read -r f; do
    if [ -n "$f" ]; then
        echo "  $f"
    fi
done
echo "----------------------------------------"

if [ "$SHARD_COUNT" -eq 0 ]; then
    echo "No files in this shard, skipping tests"
    exit 0
fi

# Check if --dry-run flag is passed
if [ "$DRY_RUN" = "--dry-run" ]; then
    echo "Dry run mode - not executing tests"
    exit 0
fi

# Run tests for this shard. Word-splitting of $SHARD_FILES into separate
# pytest arguments is intentional (test filenames contain no whitespace).
# shellcheck disable=SC2086
exec pytest \
    -n auto \
    --ignore=eval_protocol/benchmarks/ \
    --ignore=eval_protocol/quickstart/ \
    -v --durations=10 \
    $SHARD_FILES
7 changes: 6 additions & 1 deletion tests/pytest/test_pytest_klavis_mcp.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,12 @@ async def test_pytest_klavis_mcp(row: EvaluationRow) -> EvaluationRow:
)
response_text = response.choices[0].message.content
logger.info("response_text: %s", response_text)
score = json.loads(response_text or "{}")["score"]
try:
parsed = json.loads(response_text or "{}")
score = parsed.get("score", 0.0)
except (json.JSONDecodeError, TypeError):
logger.warning("Failed to parse response as JSON: %s", response_text)
score = 0.0
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: Exception handler misses AttributeError from dict method

The try/except block catches json.JSONDecodeError and TypeError, but if the API returns valid JSON that isn't a dictionary (like an integer "123" or a list "[]"), calling .get() on the parsed value raises AttributeError, which escapes the handler. Adding AttributeError to the except clause would make the error handling complete.

Fix in Cursor Fix in Web


row.evaluation_result = EvaluateResult(
score=score,
Expand Down
7 changes: 6 additions & 1 deletion tests/test_cli_create_rft.py
Original file line number Diff line number Diff line change
Expand Up @@ -1031,7 +1031,12 @@ def _fake_create_dataset_from_jsonl(account_id, api_key, api_base, dataset_id, d
assert captured["jsonl_path"] == str(jsonl_path)


def test_cli_full_command_style_evaluator_and_dataset_flags(monkeypatch):
def test_cli_full_command_style_evaluator_and_dataset_flags(tmp_path, monkeypatch):
# Isolate CWD so _discover_tests doesn't run pytest in the real project
project = tmp_path / "proj"
project.mkdir()
monkeypatch.chdir(project)

# Env
monkeypatch.setenv("FIREWORKS_API_KEY", "fw_dummy")
monkeypatch.setenv("FIREWORKS_ACCOUNT_ID", "pyroworks-dev")
Expand Down
Loading