Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 0 additions & 60 deletions .github/PULL_REQUEST_TEMPLATE.md

This file was deleted.

64 changes: 6 additions & 58 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -53,13 +53,15 @@ jobs:
uv run basedpyright || true

test-core:
name: Core Tests (Python ${{ matrix.python-version }})
name: Core Tests (Python ${{ matrix.python-version }}, Shard ${{ matrix.shard }}/${{ matrix.total-shards }})
runs-on: ubuntu-latest
needs: lint-and-type-check
strategy:
fail-fast: false
matrix:
python-version: ["3.10", "3.11", "3.12"]
shard: [1, 2, 3, 4]
total-shards: [4]

steps:
- uses: actions/checkout@v4
Expand All @@ -82,7 +84,7 @@ jobs:
- name: Install tau2 for testing
run: uv pip install git+https://github.com/sierra-research/tau2-bench.git@main

- name: Run Core Tests with pytest-xdist
- name: Run Core Tests with pytest-xdist (Shard ${{ matrix.shard }}/${{ matrix.total-shards }})
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
E2B_API_KEY: ${{ secrets.E2B_API_KEY }}
Expand All @@ -94,31 +96,7 @@ jobs:
SUPABASE_DATABASE: ${{ secrets.SUPABASE_DATABASE }}
SUPABASE_USER: ${{ secrets.SUPABASE_USER }}
PYTHONWARNINGS: "ignore::DeprecationWarning,ignore::RuntimeWarning"
run: |
# Run most tests in parallel, but explicitly ignore tests that manage their own servers or are slow
uv run pytest \
-n auto \
--ignore=tests/test_batch_evaluation.py \
--ignore=tests/pytest/test_frozen_lake.py \
--ignore=tests/pytest/test_lunar_lander.py \
--ignore=tests/pytest/test_tau_bench_airline.py \
--ignore=tests/pytest/test_apps_coding.py \
--ignore=tests/test_tau_bench_airline_smoke.py \
--ignore=tests/pytest/test_svgbench.py \
--ignore=tests/pytest/test_livesvgbench.py \
--ignore=tests/remote_server/test_remote_fireworks.py \
--ignore=tests/remote_server/test_remote_fireworks_propagate_status.py \
--ignore=tests/logging/test_elasticsearch_direct_http_handler.py \
--ignore=eval_protocol/benchmarks/ \
--ignore=eval_protocol/quickstart/ \
--cov=eval_protocol --cov-append --cov-report=xml --cov-report=term-missing -v --durations=10

- name: Store coverage file
uses: actions/upload-artifact@v4
with:
name: coverage-core-${{ matrix.python-version }}
path: coverage.xml
retention-days: 1
run: uv run ./scripts/run_sharded_tests.sh ${{ matrix.shard }} ${{ matrix.total-shards }}

test-batch-evaluation:
name: Batch Evaluation Tests
Expand Down Expand Up @@ -153,13 +131,7 @@ jobs:
PYTHONWARNINGS: "ignore::DeprecationWarning,ignore::RuntimeWarning"
run: |
# Run only this specific test file, WITHOUT xdist
uv run pytest tests/test_batch_evaluation.py --cov=eval_protocol --cov-append --cov-report=xml -v --durations=10
- name: Store coverage file
uses: actions/upload-artifact@v4
with:
name: coverage-batch-eval
path: coverage.xml
retention-days: 1
uv run pytest tests/test_batch_evaluation.py -v --durations=10

test-mcp-e2e:
name: MCP End-to-End Tests
Expand All @@ -183,27 +155,3 @@ jobs:

- name: Install tau2 for testing
run: uv pip install git+https://github.com/sierra-research/tau2-bench.git@main

- name: Store coverage file
uses: actions/upload-artifact@v4
with:
name: coverage-mcp-e2e
path: coverage.xml
retention-days: 1

upload-coverage:
name: Upload Coverage
runs-on: ubuntu-latest
needs: [test-core, test-batch-evaluation, test-mcp-e2e]
steps:
- name: Download all coverage artifacts
uses: actions/download-artifact@v4
with:
path: coverage-artifacts
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
with:
token: ${{ secrets.CODECOV_TOKEN }}
directory: ./coverage-artifacts/
fail_ci_if_error: false
verbose: true
80 changes: 80 additions & 0 deletions scripts/run_sharded_tests.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
#!/usr/bin/env bash
# Run one shard of the test suite, for parallel CI execution.
#
# Test files are discovered with `find`, sorted for a stable and
# reproducible order, then split into TOTAL_SHARDS contiguous chunks;
# the chunk for SHARD (1-indexed) is run under pytest with xdist.
#
# Usage:   ./scripts/run_sharded_tests.sh <shard> <total_shards> [--dry-run]
# Example: ./scripts/run_sharded_tests.sh 1 4
# Example: ./scripts/run_sharded_tests.sh 1 4 --dry-run

set -euo pipefail

SHARD=${1:-1}
TOTAL_SHARDS=${2:-4}
DRY_RUN=${3:-""}

# Reject out-of-range shards early. The integer comparison also aborts the
# script (under set -e) if a non-numeric shard argument is supplied.
if [ "$SHARD" -lt 1 ] || [ "$SHARD" -gt "$TOTAL_SHARDS" ]; then
    echo "Error: Shard must be between 1 and $TOTAL_SHARDS" >&2
    exit 1
fi

# Collect all test files, excluding tests that manage their own servers or
# are slow; those run in dedicated CI jobs instead of the sharded core job.
TEST_FILES=$(find tests -name "test_*.py" \
    ! -path "tests/test_batch_evaluation.py" \
    ! -path "tests/pytest/test_frozen_lake.py" \
    ! -path "tests/pytest/test_lunar_lander.py" \
    ! -path "tests/pytest/test_tau_bench_airline.py" \
    ! -path "tests/pytest/test_apps_coding.py" \
    ! -path "tests/test_tau_bench_airline_smoke.py" \
    ! -path "tests/pytest/test_svgbench.py" \
    ! -path "tests/pytest/test_livesvgbench.py" \
    ! -path "tests/remote_server/test_remote_fireworks.py" \
    ! -path "tests/remote_server/test_remote_fireworks_propagate_status.py" \
    ! -path "tests/logging/test_elasticsearch_direct_http_handler.py" \
    | sort)

# Count total files. (If find matched nothing, wc -l still reports 1 for
# the lone empty line; that case falls through to the empty-shard exit.)
TOTAL_FILES=$(echo "$TEST_FILES" | wc -l | tr -d ' ')

# Ceiling-divide the files across shards, then derive this shard's
# 1-indexed line range for sed.
FILES_PER_SHARD=$(( (TOTAL_FILES + TOTAL_SHARDS - 1) / TOTAL_SHARDS ))
START_LINE=$(( (SHARD - 1) * FILES_PER_SHARD + 1 ))
END_LINE=$(( START_LINE + FILES_PER_SHARD - 1 ))
if [ "$END_LINE" -gt "$TOTAL_FILES" ]; then
    END_LINE=$TOTAL_FILES
fi

# Slice this shard's files out of the sorted list.
SHARD_FILES=$(echo "$TEST_FILES" | sed -n "${START_LINE},${END_LINE}p")

# Count files in the shard. The previous `grep -c . || echo 0` was broken
# for an empty shard: grep prints its own "0" AND exits nonzero, so the
# fallback echo produced a two-line "0\n0" that is not a valid integer.
if [ -n "$SHARD_FILES" ]; then
    SHARD_COUNT=$(echo "$SHARD_FILES" | wc -l | tr -d ' ')
else
    SHARD_COUNT=0
fi

echo "========================================"
echo "Running shard $SHARD of $TOTAL_SHARDS"
echo "========================================"
echo "Total test files: $TOTAL_FILES"
echo "Files per shard: ~$FILES_PER_SHARD"
echo "Files in this shard: $SHARD_COUNT"
echo "Line range: $START_LINE to $END_LINE"
echo "----------------------------------------"
echo "Files:"
# Use an `if` (not `[ ... ] && ...`) so an empty list does not leave the
# pipeline with a nonzero status under set -e/pipefail.
echo "$SHARD_FILES" | while read -r f; do
    if [ -n "$f" ]; then
        echo "  $f"
    fi
done
echo "----------------------------------------"

if [ "$SHARD_COUNT" -eq 0 ]; then
    echo "No files in this shard, skipping tests"
    exit 0
fi

# Check if --dry-run flag is passed
if [ "$DRY_RUN" = "--dry-run" ]; then
    echo "Dry run mode - not executing tests"
    exit 0
fi

# Run tests for this shard. Word-splitting of $SHARD_FILES into separate
# pytest arguments is intentional (test filenames contain no whitespace).
# shellcheck disable=SC2086
exec pytest \
    -n auto \
    --ignore=eval_protocol/benchmarks/ \
    --ignore=eval_protocol/quickstart/ \
    -v --durations=10 \
    $SHARD_FILES
7 changes: 6 additions & 1 deletion tests/pytest/test_pytest_klavis_mcp.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,12 @@ async def test_pytest_klavis_mcp(row: EvaluationRow) -> EvaluationRow:
)
response_text = response.choices[0].message.content
logger.info("response_text: %s", response_text)
score = json.loads(response_text or "{}")["score"]
try:
parsed = json.loads(response_text or "{}")
score = parsed.get("score", 0.0)
except (json.JSONDecodeError, TypeError):
logger.warning("Failed to parse response as JSON: %s", response_text)
score = 0.0
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: Exception handler misses AttributeError from dict method

The try/except block catches json.JSONDecodeError and TypeError, but if the API returns valid JSON that isn't a dictionary (like an integer "123" or a list "[]"), calling .get() on the parsed value raises AttributeError, which escapes the handler. Adding AttributeError to the except clause would make the error handling complete.

Fix in Cursor Fix in Web


row.evaluation_result = EvaluateResult(
score=score,
Expand Down
7 changes: 6 additions & 1 deletion tests/test_cli_create_rft.py
Original file line number Diff line number Diff line change
Expand Up @@ -1031,7 +1031,12 @@ def _fake_create_dataset_from_jsonl(account_id, api_key, api_base, dataset_id, d
assert captured["jsonl_path"] == str(jsonl_path)


def test_cli_full_command_style_evaluator_and_dataset_flags(monkeypatch):
def test_cli_full_command_style_evaluator_and_dataset_flags(tmp_path, monkeypatch):
# Isolate CWD so _discover_tests doesn't run pytest in the real project
project = tmp_path / "proj"
project.mkdir()
monkeypatch.chdir(project)

# Env
monkeypatch.setenv("FIREWORKS_API_KEY", "fw_dummy")
monkeypatch.setenv("FIREWORKS_ACCOUNT_ID", "pyroworks-dev")
Expand Down
Loading