diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md deleted file mode 100644 index 19b52f0b..00000000 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ /dev/null @@ -1,60 +0,0 @@ ---- -name: Pull Request -about: Propose changes to the codebase -title: "Brief description of changes" -labels: '' -assignees: '' - ---- - -## Description - -Please include a summary of the change and which issue is fixed or feature is implemented. Please also include relevant motivation and context. List any dependencies that are required for this change. - -Fixes # (issue) -Implements # (issue) - -## Type of change - -Please delete options that are not relevant. - -- [ ] Bug fix (non-breaking change which fixes an issue) -- [ ] New feature (non-breaking change which adds functionality) -- [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected) -- [ ] This change requires a documentation update -- [ ] Refactoring/Code cleanup -- [ ] Build/CI/CD related changes -- [ ] Other (please describe): - -## How Has This Been Tested? - -Please describe the tests that you ran to verify your changes. Provide instructions so we can reproduce. Please also list any relevant details for your test configuration. 
- -- [ ] Test A -- [ ] Test B - -**Test Configuration**: -* Firmware version: -* Hardware: -* Toolchain: -* SDK: - -## Checklist: - -- [ ] My code follows the style guidelines of this project (ran `black .`, `isort .`, `flake8 .`) -- [ ] I have performed a self-review of my own code -- [ ] I have commented my code, particularly in hard-to-understand areas -- [ ] I have made corresponding changes to the documentation -- [ ] My changes generate no new warnings -- [ ] I have added tests that prove my fix is effective or that my feature works -- [ ] New and existing unit tests pass locally with my changes -- [ ] Any dependent changes have been merged and published in downstream modules -- [ ] I have checked my code and corrected any misspellings - -## Screenshots (if applicable) - -If applicable, add screenshots to help showcase your changes. - -## Additional context - -Add any other context about the PR here. diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d29a04a3..8e6c6f78 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -53,13 +53,15 @@ jobs: uv run basedpyright || true test-core: - name: Core Tests (Python ${{ matrix.python-version }}) + name: Core Tests (Python ${{ matrix.python-version }}, Shard ${{ matrix.shard }}/${{ matrix.total-shards }}) runs-on: ubuntu-latest needs: lint-and-type-check strategy: fail-fast: false matrix: python-version: ["3.10", "3.11", "3.12"] + shard: [1, 2, 3, 4] + total-shards: [4] steps: - uses: actions/checkout@v4 @@ -82,7 +84,7 @@ jobs: - name: Install tau2 for testing run: uv pip install git+https://github.com/sierra-research/tau2-bench.git@main - - name: Run Core Tests with pytest-xdist + - name: Run Core Tests with pytest-xdist (Shard ${{ matrix.shard }}/${{ matrix.total-shards }}) env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} E2B_API_KEY: ${{ secrets.E2B_API_KEY }} @@ -94,31 +96,7 @@ jobs: SUPABASE_DATABASE: ${{ secrets.SUPABASE_DATABASE }} SUPABASE_USER: ${{ 
secrets.SUPABASE_USER }} PYTHONWARNINGS: "ignore::DeprecationWarning,ignore::RuntimeWarning" - run: | - # Run most tests in parallel, but explicitly ignore tests that manage their own servers or are slow - uv run pytest \ - -n auto \ - --ignore=tests/test_batch_evaluation.py \ - --ignore=tests/pytest/test_frozen_lake.py \ - --ignore=tests/pytest/test_lunar_lander.py \ - --ignore=tests/pytest/test_tau_bench_airline.py \ - --ignore=tests/pytest/test_apps_coding.py \ - --ignore=tests/test_tau_bench_airline_smoke.py \ - --ignore=tests/pytest/test_svgbench.py \ - --ignore=tests/pytest/test_livesvgbench.py \ - --ignore=tests/remote_server/test_remote_fireworks.py \ - --ignore=tests/remote_server/test_remote_fireworks_propagate_status.py \ - --ignore=tests/logging/test_elasticsearch_direct_http_handler.py \ - --ignore=eval_protocol/benchmarks/ \ - --ignore=eval_protocol/quickstart/ \ - --cov=eval_protocol --cov-append --cov-report=xml --cov-report=term-missing -v --durations=10 - - - name: Store coverage file - uses: actions/upload-artifact@v4 - with: - name: coverage-core-${{ matrix.python-version }} - path: coverage.xml - retention-days: 1 + run: uv run ./scripts/run_sharded_tests.sh ${{ matrix.shard }} ${{ matrix.total-shards }} test-batch-evaluation: name: Batch Evaluation Tests @@ -153,13 +131,7 @@ jobs: PYTHONWARNINGS: "ignore::DeprecationWarning,ignore::RuntimeWarning" run: | # Run only this specific test file, WITHOUT xdist - uv run pytest tests/test_batch_evaluation.py --cov=eval_protocol --cov-append --cov-report=xml -v --durations=10 - - name: Store coverage file - uses: actions/upload-artifact@v4 - with: - name: coverage-batch-eval - path: coverage.xml - retention-days: 1 + uv run pytest tests/test_batch_evaluation.py -v --durations=10 test-mcp-e2e: name: MCP End-to-End Tests @@ -183,27 +155,3 @@ jobs: - name: Install tau2 for testing run: uv pip install git+https://github.com/sierra-research/tau2-bench.git@main - - - name: Store coverage file - uses: 
actions/upload-artifact@v4 - with: - name: coverage-mcp-e2e - path: coverage.xml - retention-days: 1 - - upload-coverage: - name: Upload Coverage - runs-on: ubuntu-latest - needs: [test-core, test-batch-evaluation, test-mcp-e2e] - steps: - - name: Download all coverage artifacts - uses: actions/download-artifact@v4 - with: - path: coverage-artifacts - - name: Upload coverage to Codecov - uses: codecov/codecov-action@v3 - with: - token: ${{ secrets.CODECOV_TOKEN }} - directory: ./coverage-artifacts/ - fail_ci_if_error: false - verbose: true diff --git a/scripts/run_sharded_tests.sh b/scripts/run_sharded_tests.sh new file mode 100755 index 00000000..6c492100 --- /dev/null +++ b/scripts/run_sharded_tests.sh @@ -0,0 +1,80 @@ +#!/usr/bin/env bash +# Script to run a shard of tests for parallel CI execution +# Usage: ./scripts/run_sharded_tests.sh [--dry-run] +# Example: ./scripts/run_sharded_tests.sh 1 4 +# Example: ./scripts/run_sharded_tests.sh 1 4 --dry-run + +set -e + +SHARD=${1:-1} +TOTAL_SHARDS=${2:-4} +DRY_RUN=${3:-""} + +if [ "$SHARD" -lt 1 ] || [ "$SHARD" -gt "$TOTAL_SHARDS" ]; then + echo "Error: Shard must be between 1 and $TOTAL_SHARDS" + exit 1 +fi + +# Collect all test files, excluding ignored ones +TEST_FILES=$(find tests -name "test_*.py" \ + ! -path "tests/test_batch_evaluation.py" \ + ! -path "tests/pytest/test_frozen_lake.py" \ + ! -path "tests/pytest/test_lunar_lander.py" \ + ! -path "tests/pytest/test_tau_bench_airline.py" \ + ! -path "tests/pytest/test_apps_coding.py" \ + ! -path "tests/test_tau_bench_airline_smoke.py" \ + ! -path "tests/pytest/test_svgbench.py" \ + ! -path "tests/pytest/test_livesvgbench.py" \ + ! -path "tests/remote_server/test_remote_fireworks.py" \ + ! -path "tests/remote_server/test_remote_fireworks_propagate_status.py" \ + ! 
-path "tests/logging/test_elasticsearch_direct_http_handler.py" \ + | sort) + +# Count total files +TOTAL_FILES=$(echo "$TEST_FILES" | wc -l | tr -d ' ') + +# Calculate start and end line numbers for this shard (1-indexed for sed) +FILES_PER_SHARD=$(( (TOTAL_FILES + TOTAL_SHARDS - 1) / TOTAL_SHARDS )) +START_LINE=$(( (SHARD - 1) * FILES_PER_SHARD + 1 )) +END_LINE=$(( START_LINE + FILES_PER_SHARD - 1 )) +if [ $END_LINE -gt $TOTAL_FILES ]; then + END_LINE=$TOTAL_FILES +fi + +# Get files for this shard using sed +SHARD_FILES=$(echo "$TEST_FILES" | sed -n "${START_LINE},${END_LINE}p") +SHARD_COUNT=$(echo "$SHARD_FILES" | grep -c . || echo 0) + +echo "========================================" +echo "Running shard $SHARD of $TOTAL_SHARDS" +echo "========================================" +echo "Total test files: $TOTAL_FILES" +echo "Files per shard: ~$FILES_PER_SHARD" +echo "Files in this shard: $SHARD_COUNT" +echo "Line range: $START_LINE to $END_LINE" +echo "----------------------------------------" +echo "Files:" +echo "$SHARD_FILES" | while read -r f; do + echo " $f" +done +echo "----------------------------------------" + +if [ "$SHARD_COUNT" -eq 0 ] || [ -z "$SHARD_FILES" ]; then + echo "No files in this shard, skipping tests" + exit 0 +fi + +# Check if --dry-run flag is passed +if [ "$DRY_RUN" = "--dry-run" ]; then + echo "Dry run mode - not executing tests" + exit 0 +fi + +# Run tests for this shard +# shellcheck disable=SC2086 +exec pytest \ + -n auto \ + --ignore=eval_protocol/benchmarks/ \ + --ignore=eval_protocol/quickstart/ \ + -v --durations=10 \ + $SHARD_FILES diff --git a/tests/pytest/test_pytest_klavis_mcp.py b/tests/pytest/test_pytest_klavis_mcp.py index c7e92b06..9bcb3e97 100644 --- a/tests/pytest/test_pytest_klavis_mcp.py +++ b/tests/pytest/test_pytest_klavis_mcp.py @@ -52,7 +52,12 @@ async def test_pytest_klavis_mcp(row: EvaluationRow) -> EvaluationRow: ) response_text = response.choices[0].message.content logger.info("response_text: %s", 
response_text) - score = json.loads(response_text or "{}")["score"] + try: + parsed = json.loads(response_text or "{}") + score = parsed.get("score", 0.0) + except (json.JSONDecodeError, TypeError, AttributeError): + logger.warning("Failed to parse response as JSON: %s", response_text) + score = 0.0 row.evaluation_result = EvaluateResult( score=score, diff --git a/tests/test_cli_create_rft.py b/tests/test_cli_create_rft.py index 71f2a064..ec6732de 100644 --- a/tests/test_cli_create_rft.py +++ b/tests/test_cli_create_rft.py @@ -1031,7 +1031,12 @@ def _fake_create_dataset_from_jsonl(account_id, api_key, api_base, dataset_id, d assert captured["jsonl_path"] == str(jsonl_path) -def test_cli_full_command_style_evaluator_and_dataset_flags(monkeypatch): +def test_cli_full_command_style_evaluator_and_dataset_flags(tmp_path, monkeypatch): + # Isolate CWD so _discover_tests doesn't run pytest in the real project + project = tmp_path / "proj" + project.mkdir() + monkeypatch.chdir(project) + # Env monkeypatch.setenv("FIREWORKS_API_KEY", "fw_dummy") monkeypatch.setenv("FIREWORKS_ACCOUNT_ID", "pyroworks-dev")