From d9a8af0ad487f077fd3985682565bb9194748cf1 Mon Sep 17 00:00:00 2001 From: vmoens Date: Wed, 28 Jan 2026 10:35:18 +0000 Subject: [PATCH 1/4] Update [ghstack-poisoned] --- .github/unittest/linux/scripts/run_all.sh | 23 ++++++++++++++++++++++- pytest.ini | 2 ++ test/compile/test_compile_collectors.py | 1 + test/conftest.py | 3 +++ test/llm/test_llm_updaters.py | 3 +++ test/llm/test_vllm.py | 2 ++ test/llm/test_wrapper.py | 1 + test/test_collectors.py | 4 ++++ test/test_envs.py | 5 +++++ test/test_libs.py | 5 +++++ test/test_rb.py | 1 + test/test_specs.py | 2 ++ test/test_transforms.py | 4 ++++ 13 files changed, 55 insertions(+), 1 deletion(-) diff --git a/.github/unittest/linux/scripts/run_all.sh b/.github/unittest/linux/scripts/run_all.sh index 9eda805c0e2..a645fb6796b 100755 --- a/.github/unittest/linux/scripts/run_all.sh +++ b/.github/unittest/linux/scripts/run_all.sh @@ -269,6 +269,26 @@ fi TORCHRL_TEST_SUITE="${TORCHRL_TEST_SUITE:-all}" # all|distributed|nondistributed +# GPU test filtering: Run GPU-only tests on GPU machines, CPU-only tests on CPU machines. +# This avoids running ~2000+ tests on expensive GPU machines when only ~30 require GPU. +# Tests are marked with @pytest.mark.gpu if they require CUDA. +# +# Set TORCHRL_GPU_FILTER=0 to disable this optimization and run all tests. +if [ "${TORCHRL_GPU_FILTER:-1}" = "1" ]; then + if [ "${CU_VERSION:-}" == cpu ]; then + # CPU job: run only tests that do NOT require GPU + GPU_MARKER_FILTER='-m "not gpu"' + echo "GPU filtering enabled: Running CPU-only tests (excluding @pytest.mark.gpu)" + else + # GPU job: run only tests that require GPU + GPU_MARKER_FILTER='-m gpu' + echo "GPU filtering enabled: Running GPU-only tests (@pytest.mark.gpu)" + fi +else + GPU_MARKER_FILTER="" + echo "GPU filtering disabled: Running all tests" +fi + export PYTORCH_TEST_WITH_SLOW='1' python -m torch.utils.collect_env @@ -287,6 +307,7 @@ run_distributed_tests() { return 1 fi # Run both test_distributed.py and test_rb_distributed.py (both use torch.distributed) + # Note: distributed tests always run on GPU, no need for GPU_MARKER_FILTER here python .github/unittest/helpers/coverage_run_parallel.py -m pytest test/test_distributed.py test/test_rb_distributed.py \ --instafail --durations 200 -vv --capture no \ --timeout=120 --mp_fork_if_no_cuda @@ -303,7 +324,7 @@ run_non_distributed_tests() { # - Shard 3: Everything else (can use pytest-xdist for parallelism) local shard="${TORCHRL_TEST_SHARD:-all}" local common_ignores="--ignore test/test_rlhf.py --ignore test/test_distributed.py --ignore test/test_rb_distributed.py --ignore test/llm --ignore test/test_setup.py" - local common_args="--instafail --durations 200 -vv --capture no --timeout=120 --mp_fork_if_no_cuda" + local common_args="--instafail --durations 200 -vv --capture no --timeout=120 --mp_fork_if_no_cuda ${GPU_MARKER_FILTER}" # pytest-xdist parallelism: use -n auto for shard 3 (fewer multiprocessing tests) # Set TORCHRL_XDIST=0 to disable parallel execution diff --git a/pytest.ini b/pytest.ini index 39fe36617a1..d0aa36cc299 100644 --- a/pytest.ini +++ b/pytest.ini @@ -6,6 +6,8 @@ addopts = --tb=native markers = unity_editor + slow: mark test as slow to run + gpu: mark test as requiring a GPU (CUDA device) testpaths = test xfail_strict = True diff --git a/test/compile/test_compile_collectors.py b/test/compile/test_compile_collectors.py index 703e9dc7b91..d4417bb33ce 100644 --- a/test/compile/test_compile_collectors.py +++ b/test/compile/test_compile_collectors.py @@ -77,6 +77,7 @@ def test_compiled_policy(self, collector_cls, compile_policy, device): collector.shutdown() del collector + @pytest.mark.gpu @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available") @pytest.mark.parametrize( "collector_cls", diff --git a/test/conftest.py b/test/conftest.py index db03c5ba0dd..7f7cbd9cea6 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -145,6 +145,9 @@ def pytest_runtest_setup(item): def pytest_configure(config): config.addinivalue_line("markers", "slow: mark test as slow to run") + config.addinivalue_line( + "markers", "gpu: mark test as requiring a GPU (CUDA device)" + ) def pytest_collection_modifyitems(config, items): diff --git a/test/llm/test_llm_updaters.py b/test/llm/test_llm_updaters.py index ddc2a2a08a7..bf04d32d499 100644 --- a/test/llm/test_llm_updaters.py +++ b/test/llm/test_llm_updaters.py @@ -72,6 +72,7 @@ def get_open_port(): ) +@pytest.mark.gpu @pytest.mark.skipif(not _has_transformers, reason="missing transformers dependencies") @pytest.mark.skipif(not _has_vllm, reason="missing vllm dependencies") @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") @@ -415,6 +416,7 @@ def test_local_llm_specific_features(self, target_vllm_engine): "See LLM_TEST_ISSUES.md for details.", strict=False, ) +@pytest.mark.gpu @pytest.mark.skipif(not _has_ray, reason="missing ray dependencies") @pytest.mark.skipif(not _has_vllm, reason="missing vllm dependencies") @pytest.mark.skipif(not _has_transformers, reason="missing transformers dependencies") @@ -611,6 +613,7 @@ def test_weight_sync_vllm_collective_ray(self, request): ray.shutdown() +@pytest.mark.gpu @pytest.mark.xfail( reason="AsyncVLLM tests fail due to Ray placement group timeout. " "See LLM_TEST_ISSUES.md for details.", diff --git a/test/llm/test_vllm.py b/test/llm/test_vllm.py index 8378b60e640..6f2ffcb59ac 100644 --- a/test/llm/test_vllm.py +++ b/test/llm/test_vllm.py @@ -39,6 +39,7 @@ def sampling_params(): class TestAsyncVLLMIntegration: """Integration tests for AsyncVLLM with real models.""" + @pytest.mark.gpu @pytest.mark.skipif(not _has_vllm, reason="vllm not available") @pytest.mark.skipif(not _has_ray, reason="ray not available") @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") @@ -111,6 +112,7 @@ def test_vllm_api_compatibility(self, sampling_params): finally: service.shutdown() + @pytest.mark.gpu @pytest.mark.skipif(not _has_vllm, reason="vllm not available") @pytest.mark.skipif(not _has_ray, reason="ray not available") @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") diff --git a/test/llm/test_wrapper.py b/test/llm/test_wrapper.py index a0485b05851..4d322392d54 100644 --- a/test/llm/test_wrapper.py +++ b/test/llm/test_wrapper.py @@ -2104,6 +2104,7 @@ def test_log_probs_consistency( "See LLM_TEST_ISSUES.md for details.", strict=False, ) + @pytest.mark.gpu @pytest.mark.skipif(not _has_vllm, reason="vllm not available") @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") def test_sync_async_vllm_strict_equivalence( diff --git a/test/test_collectors.py b/test/test_collectors.py index 8a283128101..dd60eb431a4 100644 --- a/test/test_collectors.py +++ b/test/test_collectors.py @@ -2333,6 +2333,7 @@ def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase: def _set_seed(self, seed: int | None) -> None: ... + @pytest.mark.gpu @pytest.mark.skipif(not torch.cuda.is_available(), reason="no cuda device") @pytest.mark.parametrize("env_device", ["cuda:0", "cpu"]) @pytest.mark.parametrize("storing_device", [None, "cuda:0", "cpu"]) @@ -2371,6 +2372,7 @@ def test_no_synchronize(self, env_device, storing_device, no_cuda_sync): assert u == i, i mock_synchronize.assert_not_called() + @pytest.mark.gpu @pytest.mark.parametrize("device", ["cuda", "cpu"]) @pytest.mark.parametrize("storing_device", ["cuda", "cpu"]) @pytest.mark.skipif(not torch.cuda.is_available(), reason="no cuda device found") @@ -3162,6 +3164,7 @@ def test_multi_collector_consistency( assert_allclose_td(c2.unsqueeze(0), d2) +@pytest.mark.gpu @pytest.mark.skipif( not torch.cuda.is_available() and (not has_mps()), reason="No casting if no cuda", @@ -3363,6 +3366,7 @@ def test_param_sync_mixed_device( col.shutdown() del col + @pytest.mark.gpu @pytest.mark.skipif( not torch.cuda.is_available() or torch.cuda.device_count() < 3, reason="requires at least 3 CUDA devices", diff --git a/test/test_envs.py b/test/test_envs.py index 2d3cf4c76ae..8895d79b15c 100644 --- a/test/test_envs.py +++ b/test/test_envs.py @@ -597,6 +597,7 @@ def test_auto_spec(self, env_type): env.auto_specs_(policy, tensordict=td.copy(), observation_key=obs_vals) env.check_env_specs(tensordict=td.copy()) + @pytest.mark.gpu @pytest.mark.skipif(not torch.cuda.device_count(), reason="No cuda device found.") @pytest.mark.parametrize("break_when_any_done", [True, False]) def test_auto_cast_to_device(self, break_when_any_done): @@ -1526,6 +1527,7 @@ def test_parallel_env_with_policy( # env_serial.close() env0.close(raise_if_closed=False) + @pytest.mark.gpu @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required") @pytest.mark.parametrize("heterogeneous", [False, True]) def test_transform_env_transform_no_device( @@ -1638,6 +1640,7 @@ def test_parallel_env_custom_method(self, parallel, maybe_fork_ParallelEnv): finally: env.close(raise_if_closed=False) + @pytest.mark.gpu @pytest.mark.skipif(not torch.cuda.device_count(), reason="no cuda to test on") @pytest.mark.skipif(not _has_gym, reason="no gym") @pytest.mark.parametrize("frame_skip", [4]) @@ -1742,6 +1745,7 @@ def test_parallel_env_cast( env_serial.close(raise_if_closed=False) env0.close(raise_if_closed=False) + @pytest.mark.gpu @pytest.mark.skipif(not _has_gym, reason="no gym") @pytest.mark.skipif(not torch.cuda.device_count(), reason="no cuda device detected") @pytest.mark.parametrize("frame_skip", [4]) @@ -2726,6 +2730,7 @@ def test_marl_group_type(group_type): check_marl_grouping(group_type.get_group_map(agent_names), agent_names) +@pytest.mark.gpu @pytest.mark.skipif(not torch.cuda.device_count(), reason="No cuda device") class TestConcurrentEnvs: """Concurrent parallel envs on multiple procs can interfere.""" diff --git a/test/test_libs.py b/test/test_libs.py index ebe679e9e06..ee0d39f2e25 100644 --- a/test/test_libs.py +++ b/test/test_libs.py @@ -2157,6 +2157,7 @@ def test_set_seed_and_reset_works(self): assert isinstance(td, TensorDict) + @pytest.mark.gpu @pytest.mark.skipif(not torch.cuda.is_available(), reason="requires cuda") def test_dmcontrol_kwargs_preserved_with_seed(self): """Test that kwargs like camera_id are preserved when seed is provided. @@ -2182,6 +2183,7 @@ def test_dmcontrol_kwargs_preserved_with_seed(self): finally: env.close() + @pytest.mark.gpu @pytest.mark.skipif(not torch.cuda.is_available(), reason="requires cuda") @pytest.mark.parametrize("env_name,task", [["cheetah", "run"]]) @pytest.mark.parametrize("frame_skip", [1, 3]) @@ -2776,6 +2778,7 @@ def test_multithread_env_shutdown(self): assert not env.is_closed env.close() + @pytest.mark.gpu @pytest.mark.skipif(not torch.cuda.device_count(), reason="no cuda to test on") @pytest.mark.skipif(not _has_gym, reason="no gym") @pytest.mark.parametrize("frame_skip", [4]) @@ -2816,6 +2819,7 @@ def test_multithreaded_env_cast( assert td_device.device == torch.device(device), env_multithread env_multithread.close() + @pytest.mark.gpu @pytest.mark.skipif(not _has_gym, reason="no gym") @pytest.mark.skipif(not torch.cuda.device_count(), reason="no cuda device detected") @pytest.mark.parametrize("frame_skip", [4]) @@ -3097,6 +3101,7 @@ def test_brax_automatic_cache_clearing_parameter(self, envname, device, freq): out_td, next_td = env.step_and_maybe_reset(next_td) assert env._step_count == i + 1 + @pytest.mark.gpu @pytest.mark.skipif(not torch.cuda.is_available(), reason="requires cuda") def test_brax_kwargs_preserved_with_seed(self, envname, device): """Test that kwargs like camera_id are preserved when seed is provided. diff --git a/test/test_rb.py b/test/test_rb.py index 86d22d4ba32..73441f9e91c 100644 --- a/test/test_rb.py +++ b/test/test_rb.py @@ -753,6 +753,7 @@ def test_state_dict(self, storage_type, data_type): storage2.get(range(10)) ) + @pytest.mark.gpu @pytest.mark.skipif( not torch.cuda.device_count(), reason="not cuda device found to test rb storage.", diff --git a/test/test_specs.py b/test/test_specs.py index ba68e0a3fb2..04dc922e969 100644 --- a/test/test_specs.py +++ b/test/test_specs.py @@ -3001,6 +3001,7 @@ def test_stack_zero_shape(self, stack_dim): assert r["a"].shape == torch.Size([*shape, 1, 3, 2]) # access tensor assert (r["a"] == 0).all() + @pytest.mark.gpu @pytest.mark.skipif(not torch.cuda.device_count(), reason="no cuda") @pytest.mark.parametrize("stack_dim", [0, 1, 2, -3, -2, -1]) def test_to(self, stack_dim): @@ -3958,6 +3959,7 @@ def test_encode(self): assert r.get("nontensor").shape == (1,) +@pytest.mark.gpu @pytest.mark.skipif(not torch.cuda.is_available(), reason="not cuda device") def test_device_ordinal(): device = torch.device("cpu") diff --git a/test/test_transforms.py b/test/test_transforms.py index b96aa7f9bde..ef912ba00d4 100644 --- a/test/test_transforms.py +++ b/test/test_transforms.py @@ -1292,6 +1292,7 @@ def test_constant_padding(self, padding_value): assert (cat_td.get("cat_first_key") == padding_value).sum() == N - 4 +@pytest.mark.gpu @pytest.mark.skipif(not _has_tv, reason="torchvision not installed") @pytest.mark.skipif(not torch.cuda.device_count(), reason="Testing R3M on cuda only") @pytest.mark.parametrize("device", [torch.device("cuda:0")]) @@ -8748,6 +8749,7 @@ def test_transform_env(self): assert (env.reset()["_eps_gSDE"] != 0.0).all() +@pytest.mark.gpu @pytest.mark.skipif(not _has_tv, reason="torchvision not installed") @pytest.mark.skipif(not torch.cuda.device_count(), reason="Testing VIP on cuda only") @pytest.mark.parametrize("device", [torch.device("cuda:0")]) @@ -9219,6 +9221,7 @@ def test_vip_spec_against_real(self, model, tensor_pixels_key, device): assert set(expected_keys) == set(transformed_env.rollout(3).keys(True)) +@pytest.mark.gpu @pytest.mark.skipif(not _has_vc, reason="vc_models not installed") @pytest.mark.skipif(not torch.cuda.device_count(), reason="VC1 should run on cuda") @pytest.mark.parametrize("device", [torch.device("cuda:0")]) @@ -10912,6 +10915,7 @@ def test_finitetensordictcheck(self, device): with pytest.raises(ValueError, match="Encountered a non-finite tensor"): ftd(td) + @pytest.mark.gpu @pytest.mark.skipif(not torch.cuda.device_count(), reason="no cuda device found") @pytest.mark.parametrize("device", get_default_devices()) def test_pin_mem(self, device): From f6705083f71324edd4ab6533ac290394a716cb1a Mon Sep 17 00:00:00 2001 From: vmoens Date: Wed, 28 Jan 2026 11:07:56 +0000 Subject: [PATCH 2/4] Update [ghstack-poisoned] --- .github/unittest/linux/scripts/run_all.sh | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/.github/unittest/linux/scripts/run_all.sh b/.github/unittest/linux/scripts/run_all.sh index a645fb6796b..43b25048d09 100755 --- a/.github/unittest/linux/scripts/run_all.sh +++ b/.github/unittest/linux/scripts/run_all.sh @@ -274,18 +274,20 @@ TORCHRL_TEST_SUITE="${TORCHRL_TEST_SUITE:-all}" # all|distributed|nondistributed # Tests are marked with @pytest.mark.gpu if they require CUDA. # # Set TORCHRL_GPU_FILTER=0 to disable this optimization and run all tests. +# +# We use an array to handle the marker expression properly (avoids quoting issues). +GPU_MARKER_FILTER=() if [ "${TORCHRL_GPU_FILTER:-1}" = "1" ]; then if [ "${CU_VERSION:-}" == cpu ]; then # CPU job: run only tests that do NOT require GPU - GPU_MARKER_FILTER='-m "not gpu"' + GPU_MARKER_FILTER=(-m 'not gpu') echo "GPU filtering enabled: Running CPU-only tests (excluding @pytest.mark.gpu)" else # GPU job: run only tests that require GPU - GPU_MARKER_FILTER='-m gpu' + GPU_MARKER_FILTER=(-m gpu) echo "GPU filtering enabled: Running GPU-only tests (@pytest.mark.gpu)" fi else - GPU_MARKER_FILTER="" echo "GPU filtering disabled: Running all tests" fi @@ -324,7 +326,7 @@ run_non_distributed_tests() { # - Shard 3: Everything else (can use pytest-xdist for parallelism) local shard="${TORCHRL_TEST_SHARD:-all}" local common_ignores="--ignore test/test_rlhf.py --ignore test/test_distributed.py --ignore test/test_rb_distributed.py --ignore test/llm --ignore test/test_setup.py" - local common_args="--instafail --durations 200 -vv --capture no --timeout=120 --mp_fork_if_no_cuda ${GPU_MARKER_FILTER}" + local common_args="--instafail --durations 200 -vv --capture no --timeout=120 --mp_fork_if_no_cuda" # pytest-xdist parallelism: use -n auto for shard 3 (fewer multiprocessing tests) # Set TORCHRL_XDIST=0 to disable parallel execution @@ -338,12 +340,12 @@ run_non_distributed_tests() { 1) echo "Running shard 1: test_transforms.py only" python .github/unittest/helpers/coverage_run_parallel.py -m pytest test/test_transforms.py \ - ${common_args} + "${GPU_MARKER_FILTER[@]}" ${common_args} ;; 2) echo "Running shard 2: test_envs.py and test_collectors.py" python .github/unittest/helpers/coverage_run_parallel.py -m pytest test/test_envs.py test/test_collectors.py \ - ${common_args} + "${GPU_MARKER_FILTER[@]}" ${common_args} ;; 3) echo "Running shard 3: All other tests" @@ -353,13 +355,13 @@ run_non_distributed_tests() { --ignore test/test_envs.py \ --ignore test/test_collectors.py \ ${xdist_args} \ - ${common_args} + "${GPU_MARKER_FILTER[@]}" ${common_args} ;; all|"") echo "Running all tests (no sharding)" python .github/unittest/helpers/coverage_run_parallel.py -m pytest test \ ${common_ignores} \ - ${common_args} + "${GPU_MARKER_FILTER[@]}" ${common_args} ;; *) echo "Unknown TORCHRL_TEST_SHARD='${shard}'. Expected: all|1|2|3." From 8910a1dae7da23c3feffe1c869982575b9e06a6b Mon Sep 17 00:00:00 2001 From: vmoens Date: Wed, 28 Jan 2026 18:15:27 +0000 Subject: [PATCH 3/4] Update [ghstack-poisoned] --- .../workflows/validate-test-partitioning.yml | 71 +++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 .github/workflows/validate-test-partitioning.yml diff --git a/.github/workflows/validate-test-partitioning.yml b/.github/workflows/validate-test-partitioning.yml new file mode 100644 index 00000000000..49fb67a66ad --- /dev/null +++ b/.github/workflows/validate-test-partitioning.yml @@ -0,0 +1,71 @@ +# Validates that GPU/CPU test partitioning covers all tests +# +# This workflow ensures that: +# 1. Tests marked with @pytest.mark.gpu + tests not marked = all tests +# 2. No tests are accidentally excluded from CI +# +# Runs on PRs to catch partitioning issues before merge. +name: Validate Test Partitioning + +on: + pull_request: + push: + branches: [main, nightly] + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + validate: + runs-on: ubuntu-latest + timeout-minutes: 10 + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install minimal dependencies + run: | + pip install pytest torch --index-url https://download.pytorch.org/whl/cpu + pip install tensordict + pip install -e . --no-build-isolation --no-deps + + - name: Validate test partitioning + run: | + set -e + + echo "==================================================" + echo " TEST PARTITIONING VALIDATION" + echo "==================================================" + + # Collect test counts + # Note: We ignore test_loggers.py due to torchvision operator issues on CPU-only + ALL=$(pytest --collect-only -q test/ --ignore test/test_loggers.py 2>/dev/null | tail -1 | grep -oE "^[0-9]+") + GPU=$(pytest --collect-only -q -m gpu test/ --ignore test/test_loggers.py 2>/dev/null | tail -1 | grep -oE "^[0-9]+") + CPU=$(pytest --collect-only -q -m "not gpu" test/ --ignore test/test_loggers.py 2>/dev/null | tail -1 | grep -oE "^[0-9]+") + + echo "" + echo "Total tests: $ALL" + echo "GPU tests (@pytest.mark.gpu): $GPU" + echo "CPU tests (not gpu): $CPU" + echo "GPU + CPU: $((GPU + CPU))" + echo "" + + # Validate: GPU + CPU should equal ALL + if [ "$((GPU + CPU))" -eq "$ALL" ]; then + echo "✅ PASS: Test partitioning is valid!" + echo " All tests are accounted for." + else + echo "❌ FAIL: Test partitioning mismatch!" + echo " GPU ($GPU) + CPU ($CPU) = $((GPU + CPU)), but total is $ALL" + echo "" + echo " This means some tests are either:" + echo " - Missing the @pytest.mark.gpu marker (if they require CUDA)" + echo " - Being excluded unintentionally" + exit 1 + fi From 3fc3208ffb8081a3a9383a545deb6f7db6e6a76e Mon Sep 17 00:00:00 2001 From: vmoens Date: Thu, 29 Jan 2026 09:43:29 +0000 Subject: [PATCH 4/4] Update [ghstack-poisoned] --- .github/workflows/validate-test-partitioning.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/validate-test-partitioning.yml b/.github/workflows/validate-test-partitioning.yml index a4d5c883f7e..b2974b78fb4 100644 --- a/.github/workflows/validate-test-partitioning.yml +++ b/.github/workflows/validate-test-partitioning.yml @@ -33,9 +33,11 @@ jobs: run: | pip install pytest tensordict pip install torch --index-url https://download.pytorch.org/whl/cpu - pip install -e . --no-build-isolation --no-deps + # Skip editable install - just add to PYTHONPATH for test collection - name: Validate test partitioning + env: + PYTHONPATH: ${{ github.workspace }} run: | set -e