@@ -53,13 +53,15 @@ jobs:
5353 uv run basedpyright || true
5454
5555 test-core :
56- name : Core Tests (Python ${{ matrix.python-version }})
56+ name : Core Tests (Python ${{ matrix.python-version }}, Shard ${{ matrix.shard }}/${{ matrix.total-shards }} )
5757 runs-on : ubuntu-latest
5858 needs : lint-and-type-check
5959 strategy :
6060 fail-fast : false
6161 matrix :
6262 python-version : ["3.10", "3.11", "3.12"]
63+ shard : [1, 2, 3, 4]
64+ total-shards : [4]
6365
6466 steps :
6567 - uses : actions/checkout@v4
8284 - name : Install tau2 for testing
8385 run : uv pip install git+https://github.com/sierra-research/tau2-bench.git@main
8486
85- - name : Run Core Tests with pytest-xdist
87+ - name : Run Core Tests with pytest-xdist (Shard ${{ matrix.shard }}/${{ matrix.total-shards }})
8688 env :
8789 OPENAI_API_KEY : ${{ secrets.OPENAI_API_KEY }}
8890 E2B_API_KEY : ${{ secrets.E2B_API_KEY }}
9496 SUPABASE_DATABASE : ${{ secrets.SUPABASE_DATABASE }}
9597 SUPABASE_USER : ${{ secrets.SUPABASE_USER }}
9698 PYTHONWARNINGS : " ignore::DeprecationWarning,ignore::RuntimeWarning"
97- run : |
98- # Run most tests in parallel, but explicitly ignore tests that manage their own servers or are slow
99- uv run pytest \
100- -n auto \
101- --ignore=tests/test_batch_evaluation.py \
102- --ignore=tests/pytest/test_frozen_lake.py \
103- --ignore=tests/pytest/test_lunar_lander.py \
104- --ignore=tests/pytest/test_tau_bench_airline.py \
105- --ignore=tests/pytest/test_apps_coding.py \
106- --ignore=tests/test_tau_bench_airline_smoke.py \
107- --ignore=tests/pytest/test_svgbench.py \
108- --ignore=tests/pytest/test_livesvgbench.py \
109- --ignore=tests/remote_server/test_remote_fireworks.py \
110- --ignore=tests/remote_server/test_remote_fireworks_propagate_status.py \
111- --ignore=tests/logging/test_elasticsearch_direct_http_handler.py \
112- --ignore=eval_protocol/benchmarks/ \
113- --ignore=eval_protocol/quickstart/ \
114- --cov=eval_protocol --cov-append --cov-report=xml --cov-report=term-missing -v --durations=10
115-
116- - name : Store coverage file
117- uses : actions/upload-artifact@v4
118- with :
119- name : coverage-core-${{ matrix.python-version }}
120- path : coverage.xml
121- retention-days : 1
99+ run : uv run ./scripts/run_sharded_tests.sh ${{ matrix.shard }} ${{ matrix.total-shards }}
122100
123101 test-batch-evaluation :
124102 name : Batch Evaluation Tests
@@ -153,13 +131,7 @@ jobs:
153131 PYTHONWARNINGS : " ignore::DeprecationWarning,ignore::RuntimeWarning"
154132 run : |
155133 # Run only this specific test file, WITHOUT xdist
156- uv run pytest tests/test_batch_evaluation.py --cov=eval_protocol --cov-append --cov-report=xml -v --durations=10
157- - name : Store coverage file
158- uses : actions/upload-artifact@v4
159- with :
160- name : coverage-batch-eval
161- path : coverage.xml
162- retention-days : 1
134+ uv run pytest tests/test_batch_evaluation.py -v --durations=10
163135
164136 test-mcp-e2e :
165137 name : MCP End-to-End Tests
@@ -183,27 +155,3 @@ jobs:
183155
184156 - name : Install tau2 for testing
185157 run : uv pip install git+https://github.com/sierra-research/tau2-bench.git@main
186-
187- - name : Store coverage file
188- uses : actions/upload-artifact@v4
189- with :
190- name : coverage-mcp-e2e
191- path : coverage.xml
192- retention-days : 1
193-
194- upload-coverage :
195- name : Upload Coverage
196- runs-on : ubuntu-latest
197- needs : [test-core, test-batch-evaluation, test-mcp-e2e]
198- steps :
199- - name : Download all coverage artifacts
200- uses : actions/download-artifact@v4
201- with :
202- path : coverage-artifacts
203- - name : Upload coverage to Codecov
204- uses : codecov/codecov-action@v3
205- with :
206- token : ${{ secrets.CODECOV_TOKEN }}
207- directory : ./coverage-artifacts/
208- fail_ci_if_error : false
209- verbose : true
0 commit comments