diff --git a/.github/unittest/linux/scripts/run_all.sh b/.github/unittest/linux/scripts/run_all.sh
index 1a6e3e6d0be..599c70bb66e 100755
--- a/.github/unittest/linux/scripts/run_all.sh
+++ b/.github/unittest/linux/scripts/run_all.sh
@@ -270,6 +270,28 @@ fi
 
 TORCHRL_TEST_SUITE="${TORCHRL_TEST_SUITE:-all}" # all|distributed|nondistributed
 
+# GPU test filtering: Run GPU-only tests on GPU machines, CPU-only tests on CPU machines.
+# This avoids running ~2000+ tests on expensive GPU machines when only ~30 require GPU.
+# Tests are marked with @pytest.mark.gpu if they require CUDA.
+#
+# Set TORCHRL_GPU_FILTER=0 to disable this optimization and run all tests.
+#
+# We use an array to handle the marker expression properly (avoids quoting issues).
+GPU_MARKER_FILTER=()
+if [ "${TORCHRL_GPU_FILTER:-1}" = "1" ]; then
+  if [ "${CU_VERSION:-}" = "cpu" ]; then
+    # CPU job: run only tests that do NOT require GPU
+    GPU_MARKER_FILTER=(-m 'not gpu')
+    echo "GPU filtering enabled: Running CPU-only tests (excluding @pytest.mark.gpu)"
+  else
+    # GPU job (CU_VERSION unset or anything other than "cpu"): run only tests that require GPU
+    GPU_MARKER_FILTER=(-m gpu)
+    echo "GPU filtering enabled: Running GPU-only tests (@pytest.mark.gpu)"
+  fi
+else
+  echo "GPU filtering disabled: Running all tests"
+fi
+
 export PYTORCH_TEST_WITH_SLOW='1'
 python -m torch.utils.collect_env
 
@@ -292,6 +314,7 @@ run_distributed_tests() {
   local json_report_args="--json-report --json-report-file=${json_report_dir}/test-results-distributed.json --json-report-indent=2"
 
   # Run both test_distributed.py and test_rb_distributed.py (both use torch.distributed)
+  # Note: distributed tests always run on GPU, no need for GPU_MARKER_FILTER here
   python .github/unittest/helpers/coverage_run_parallel.py -m pytest test/test_distributed.py test/test_rb_distributed.py \
     ${json_report_args} \
     --instafail --durations 200 -vv --capture no \
@@ -327,12 +350,14 @@ run_non_distributed_tests() {
     1)
       echo "Running shard 1: test_transforms.py only"
       python 
.github/unittest/helpers/coverage_run_parallel.py -m pytest test/test_transforms.py \ + "${GPU_MARKER_FILTER[@]}" \ ${json_report_args} \ ${common_args} ;; 2) echo "Running shard 2: test_envs.py and test_collectors.py" python .github/unittest/helpers/coverage_run_parallel.py -m pytest test/test_envs.py test/test_collectors.py \ + "${GPU_MARKER_FILTER[@]}" \ ${json_report_args} \ ${common_args} ;; @@ -344,6 +369,7 @@ run_non_distributed_tests() { --ignore test/test_envs.py \ --ignore test/test_collectors.py \ ${xdist_args} \ + "${GPU_MARKER_FILTER[@]}" \ ${json_report_args} \ ${common_args} ;; @@ -351,6 +377,7 @@ run_non_distributed_tests() { echo "Running all tests (no sharding)" python .github/unittest/helpers/coverage_run_parallel.py -m pytest test \ ${common_ignores} \ + "${GPU_MARKER_FILTER[@]}" \ ${json_report_args} \ ${common_args} ;; diff --git a/.github/workflows/validate-test-partitioning.yml b/.github/workflows/validate-test-partitioning.yml new file mode 100644 index 00000000000..b2974b78fb4 --- /dev/null +++ b/.github/workflows/validate-test-partitioning.yml @@ -0,0 +1,73 @@ +# Validates that GPU/CPU test partitioning covers all tests +# +# This workflow ensures that: +# 1. Tests marked with @pytest.mark.gpu + tests not marked = all tests +# 2. No tests are accidentally excluded from CI +# +# Runs on PRs to catch partitioning issues before merge. 
+name: Validate Test Partitioning
+
+on:
+  pull_request:
+  push:
+    branches: [main, nightly]
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  validate:
+    runs-on: ubuntu-latest
+    timeout-minutes: 10
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      - name: Install minimal dependencies
+        run: |
+          pip install torch --index-url https://download.pytorch.org/whl/cpu  # first, so the tensordict install below finds torch already satisfied by the CPU wheel
+          pip install pytest tensordict
+          # Skip editable install - just add to PYTHONPATH for test collection
+
+      - name: Validate test partitioning
+        env:
+          PYTHONPATH: ${{ github.workspace }}
+        run: |
+          set -e
+
+          echo "=================================================="
+          echo " TEST PARTITIONING VALIDATION"
+          echo "=================================================="
+
+          # Collect test counts
+          # Note: We ignore test_loggers.py due to torchvision operator issues on CPU-only
+          ALL=$(pytest --collect-only -q test/ --ignore test/test_loggers.py 2>/dev/null | tail -1 | grep -oE "^[0-9]+")
+          GPU=$(pytest --collect-only -q -m gpu test/ --ignore test/test_loggers.py 2>/dev/null | tail -1 | grep -oE "^[0-9]+")
+          CPU=$(pytest --collect-only -q -m "not gpu" test/ --ignore test/test_loggers.py 2>/dev/null | tail -1 | grep -oE "^[0-9]+")
+
+          echo ""
+          echo "Total tests: $ALL"
+          echo "GPU tests (@pytest.mark.gpu): $GPU"
+          echo "CPU tests (not gpu): $CPU"
+          echo "GPU + CPU: $((GPU + CPU))"
+          echo ""
+
+          # Validate: GPU + CPU should equal ALL
+          if [ "$((GPU + CPU))" -eq "$ALL" ]; then
+            echo "✅ PASS: Test partitioning is valid!"
+            echo " All tests are accounted for."
+          else
+            echo "❌ FAIL: Test partitioning mismatch!"
+ echo " GPU ($GPU) + CPU ($CPU) = $((GPU + CPU)), but total is $ALL" + echo "" + echo " This means some tests are either:" + echo " - Missing the @pytest.mark.gpu marker (if they require CUDA)" + echo " - Being excluded unintentionally" + exit 1 + fi diff --git a/pytest.ini b/pytest.ini index 39fe36617a1..d0aa36cc299 100644 --- a/pytest.ini +++ b/pytest.ini @@ -6,6 +6,8 @@ addopts = --tb=native markers = unity_editor + slow: mark test as slow to run + gpu: mark test as requiring a GPU (CUDA device) testpaths = test xfail_strict = True diff --git a/test/compile/test_compile_collectors.py b/test/compile/test_compile_collectors.py index 703e9dc7b91..d4417bb33ce 100644 --- a/test/compile/test_compile_collectors.py +++ b/test/compile/test_compile_collectors.py @@ -77,6 +77,7 @@ def test_compiled_policy(self, collector_cls, compile_policy, device): collector.shutdown() del collector + @pytest.mark.gpu @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available") @pytest.mark.parametrize( "collector_cls", diff --git a/test/conftest.py b/test/conftest.py index c4e56cd5f0a..7ddfb2e1e9d 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -148,6 +148,9 @@ def pytest_runtest_setup(item): def pytest_configure(config): config.addinivalue_line("markers", "slow: mark test as slow to run") + config.addinivalue_line( + "markers", "gpu: mark test as requiring a GPU (CUDA device)" + ) def pytest_collection_modifyitems(config, items): diff --git a/test/llm/test_llm_updaters.py b/test/llm/test_llm_updaters.py index ddc2a2a08a7..bf04d32d499 100644 --- a/test/llm/test_llm_updaters.py +++ b/test/llm/test_llm_updaters.py @@ -72,6 +72,7 @@ def get_open_port(): ) +@pytest.mark.gpu @pytest.mark.skipif(not _has_transformers, reason="missing transformers dependencies") @pytest.mark.skipif(not _has_vllm, reason="missing vllm dependencies") @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") @@ -415,6 +416,7 @@ def 
test_local_llm_specific_features(self, target_vllm_engine): "See LLM_TEST_ISSUES.md for details.", strict=False, ) +@pytest.mark.gpu @pytest.mark.skipif(not _has_ray, reason="missing ray dependencies") @pytest.mark.skipif(not _has_vllm, reason="missing vllm dependencies") @pytest.mark.skipif(not _has_transformers, reason="missing transformers dependencies") @@ -611,6 +613,7 @@ def test_weight_sync_vllm_collective_ray(self, request): ray.shutdown() +@pytest.mark.gpu @pytest.mark.xfail( reason="AsyncVLLM tests fail due to Ray placement group timeout. " "See LLM_TEST_ISSUES.md for details.", diff --git a/test/llm/test_vllm.py b/test/llm/test_vllm.py index 8378b60e640..6f2ffcb59ac 100644 --- a/test/llm/test_vllm.py +++ b/test/llm/test_vllm.py @@ -39,6 +39,7 @@ def sampling_params(): class TestAsyncVLLMIntegration: """Integration tests for AsyncVLLM with real models.""" + @pytest.mark.gpu @pytest.mark.skipif(not _has_vllm, reason="vllm not available") @pytest.mark.skipif(not _has_ray, reason="ray not available") @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") @@ -111,6 +112,7 @@ def test_vllm_api_compatibility(self, sampling_params): finally: service.shutdown() + @pytest.mark.gpu @pytest.mark.skipif(not _has_vllm, reason="vllm not available") @pytest.mark.skipif(not _has_ray, reason="ray not available") @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") diff --git a/test/llm/test_wrapper.py b/test/llm/test_wrapper.py index a0485b05851..4d322392d54 100644 --- a/test/llm/test_wrapper.py +++ b/test/llm/test_wrapper.py @@ -2104,6 +2104,7 @@ def test_log_probs_consistency( "See LLM_TEST_ISSUES.md for details.", strict=False, ) + @pytest.mark.gpu @pytest.mark.skipif(not _has_vllm, reason="vllm not available") @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") def test_sync_async_vllm_strict_equivalence( diff --git a/test/test_collectors.py b/test/test_collectors.py index 
8a283128101..dd60eb431a4 100644 --- a/test/test_collectors.py +++ b/test/test_collectors.py @@ -2333,6 +2333,7 @@ def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase: def _set_seed(self, seed: int | None) -> None: ... + @pytest.mark.gpu @pytest.mark.skipif(not torch.cuda.is_available(), reason="no cuda device") @pytest.mark.parametrize("env_device", ["cuda:0", "cpu"]) @pytest.mark.parametrize("storing_device", [None, "cuda:0", "cpu"]) @@ -2371,6 +2372,7 @@ def test_no_synchronize(self, env_device, storing_device, no_cuda_sync): assert u == i, i mock_synchronize.assert_not_called() + @pytest.mark.gpu @pytest.mark.parametrize("device", ["cuda", "cpu"]) @pytest.mark.parametrize("storing_device", ["cuda", "cpu"]) @pytest.mark.skipif(not torch.cuda.is_available(), reason="no cuda device found") @@ -3162,6 +3164,7 @@ def test_multi_collector_consistency( assert_allclose_td(c2.unsqueeze(0), d2) +@pytest.mark.gpu @pytest.mark.skipif( not torch.cuda.is_available() and (not has_mps()), reason="No casting if no cuda", @@ -3363,6 +3366,7 @@ def test_param_sync_mixed_device( col.shutdown() del col + @pytest.mark.gpu @pytest.mark.skipif( not torch.cuda.is_available() or torch.cuda.device_count() < 3, reason="requires at least 3 CUDA devices", diff --git a/test/test_envs.py b/test/test_envs.py index 2d3cf4c76ae..8895d79b15c 100644 --- a/test/test_envs.py +++ b/test/test_envs.py @@ -597,6 +597,7 @@ def test_auto_spec(self, env_type): env.auto_specs_(policy, tensordict=td.copy(), observation_key=obs_vals) env.check_env_specs(tensordict=td.copy()) + @pytest.mark.gpu @pytest.mark.skipif(not torch.cuda.device_count(), reason="No cuda device found.") @pytest.mark.parametrize("break_when_any_done", [True, False]) def test_auto_cast_to_device(self, break_when_any_done): @@ -1526,6 +1527,7 @@ def test_parallel_env_with_policy( # env_serial.close() env0.close(raise_if_closed=False) + @pytest.mark.gpu @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA 
required") @pytest.mark.parametrize("heterogeneous", [False, True]) def test_transform_env_transform_no_device( @@ -1638,6 +1640,7 @@ def test_parallel_env_custom_method(self, parallel, maybe_fork_ParallelEnv): finally: env.close(raise_if_closed=False) + @pytest.mark.gpu @pytest.mark.skipif(not torch.cuda.device_count(), reason="no cuda to test on") @pytest.mark.skipif(not _has_gym, reason="no gym") @pytest.mark.parametrize("frame_skip", [4]) @@ -1742,6 +1745,7 @@ def test_parallel_env_cast( env_serial.close(raise_if_closed=False) env0.close(raise_if_closed=False) + @pytest.mark.gpu @pytest.mark.skipif(not _has_gym, reason="no gym") @pytest.mark.skipif(not torch.cuda.device_count(), reason="no cuda device detected") @pytest.mark.parametrize("frame_skip", [4]) @@ -2726,6 +2730,7 @@ def test_marl_group_type(group_type): check_marl_grouping(group_type.get_group_map(agent_names), agent_names) +@pytest.mark.gpu @pytest.mark.skipif(not torch.cuda.device_count(), reason="No cuda device") class TestConcurrentEnvs: """Concurrent parallel envs on multiple procs can interfere.""" diff --git a/test/test_libs.py b/test/test_libs.py index 274b9ec406c..97277f444d8 100644 --- a/test/test_libs.py +++ b/test/test_libs.py @@ -2157,6 +2157,7 @@ def test_set_seed_and_reset_works(self): assert isinstance(td, TensorDict) + @pytest.mark.gpu @pytest.mark.skipif(not torch.cuda.is_available(), reason="requires cuda") def test_dmcontrol_kwargs_preserved_with_seed(self): """Test that kwargs like camera_id are preserved when seed is provided. 
@@ -2182,6 +2183,7 @@ def test_dmcontrol_kwargs_preserved_with_seed(self): finally: env.close() + @pytest.mark.gpu @pytest.mark.skipif(not torch.cuda.is_available(), reason="requires cuda") @pytest.mark.parametrize("env_name,task", [["cheetah", "run"]]) @pytest.mark.parametrize("frame_skip", [1, 3]) @@ -2849,6 +2851,7 @@ def test_multithread_env_shutdown(self): assert not env.is_closed env.close() + @pytest.mark.gpu @pytest.mark.skipif(not torch.cuda.device_count(), reason="no cuda to test on") @pytest.mark.skipif(not _has_gym, reason="no gym") @pytest.mark.parametrize("frame_skip", [4]) @@ -2889,6 +2892,7 @@ def test_multithreaded_env_cast( assert td_device.device == torch.device(device), env_multithread env_multithread.close() + @pytest.mark.gpu @pytest.mark.skipif(not _has_gym, reason="no gym") @pytest.mark.skipif(not torch.cuda.device_count(), reason="no cuda device detected") @pytest.mark.parametrize("frame_skip", [4]) @@ -3170,6 +3174,7 @@ def test_brax_automatic_cache_clearing_parameter(self, envname, device, freq): out_td, next_td = env.step_and_maybe_reset(next_td) assert env._step_count == i + 1 + @pytest.mark.gpu @pytest.mark.skipif(not torch.cuda.is_available(), reason="requires cuda") def test_brax_kwargs_preserved_with_seed(self, envname, device): """Test that kwargs like camera_id are preserved when seed is provided. 
diff --git a/test/test_rb.py b/test/test_rb.py index 86d22d4ba32..73441f9e91c 100644 --- a/test/test_rb.py +++ b/test/test_rb.py @@ -753,6 +753,7 @@ def test_state_dict(self, storage_type, data_type): storage2.get(range(10)) ) + @pytest.mark.gpu @pytest.mark.skipif( not torch.cuda.device_count(), reason="not cuda device found to test rb storage.", diff --git a/test/test_specs.py b/test/test_specs.py index fd2f68aaf77..1a09b17cc47 100644 --- a/test/test_specs.py +++ b/test/test_specs.py @@ -3001,6 +3001,7 @@ def test_stack_zero_shape(self, stack_dim): assert r["a"].shape == torch.Size([*shape, 1, 3, 2]) # access tensor assert (r["a"] == 0).all() + @pytest.mark.gpu @pytest.mark.skipif(not torch.cuda.device_count(), reason="no cuda") @pytest.mark.parametrize("stack_dim", [0, 1, 2, -3, -2, -1]) def test_to(self, stack_dim): @@ -3958,6 +3959,7 @@ def test_encode(self): assert r.get("nontensor").shape == (1,) +@pytest.mark.gpu @pytest.mark.skipif(not torch.cuda.is_available(), reason="not cuda device") def test_device_ordinal(): device = torch.device("cpu") diff --git a/test/test_transforms.py b/test/test_transforms.py index cd5c880df3a..65e6939afb7 100644 --- a/test/test_transforms.py +++ b/test/test_transforms.py @@ -1292,6 +1292,7 @@ def test_constant_padding(self, padding_value): assert (cat_td.get("cat_first_key") == padding_value).sum() == N - 4 +@pytest.mark.gpu @pytest.mark.skipif(not _has_tv, reason="torchvision not installed") @pytest.mark.skipif(not torch.cuda.device_count(), reason="Testing R3M on cuda only") @pytest.mark.parametrize("device", [torch.device("cuda:0")]) @@ -8788,6 +8789,7 @@ def test_transform_env(self): assert (env.reset()["_eps_gSDE"] != 0.0).all() +@pytest.mark.gpu @pytest.mark.skipif(not _has_tv, reason="torchvision not installed") @pytest.mark.skipif(not torch.cuda.device_count(), reason="Testing VIP on cuda only") @pytest.mark.parametrize("device", [torch.device("cuda:0")]) @@ -9259,6 +9261,7 @@ def test_vip_spec_against_real(self, 
model, tensor_pixels_key, device): assert set(expected_keys) == set(transformed_env.rollout(3).keys(True)) +@pytest.mark.gpu @pytest.mark.skipif(not _has_vc, reason="vc_models not installed") @pytest.mark.skipif(not torch.cuda.device_count(), reason="VC1 should run on cuda") @pytest.mark.parametrize("device", [torch.device("cuda:0")]) @@ -10952,6 +10955,7 @@ def test_finitetensordictcheck(self, device): with pytest.raises(ValueError, match="Encountered a non-finite tensor"): ftd(td) + @pytest.mark.gpu @pytest.mark.skipif(not torch.cuda.device_count(), reason="no cuda device found") @pytest.mark.parametrize("device", get_default_devices()) def test_pin_mem(self, device):