From d9a8af0ad487f077fd3985682565bb9194748cf1 Mon Sep 17 00:00:00 2001
From: vmoens <vincentmoens@gmail.com>
Date: Wed, 28 Jan 2026 10:35:18 +0000
Subject: [PATCH 1/4] Update

[ghstack-poisoned]
---
 .github/unittest/linux/scripts/run_all.sh | 23 ++++++++++++++++++++++-
 pytest.ini                                |  2 ++
 test/compile/test_compile_collectors.py   |  1 +
 test/conftest.py                          |  3 +++
 test/llm/test_llm_updaters.py             |  3 +++
 test/llm/test_vllm.py                     |  2 ++
 test/llm/test_wrapper.py                  |  1 +
 test/test_collectors.py                   |  4 ++++
 test/test_envs.py                         |  5 +++++
 test/test_libs.py                         |  5 +++++
 test/test_rb.py                           |  1 +
 test/test_specs.py                        |  2 ++
 test/test_transforms.py                   |  4 ++++
 13 files changed, 55 insertions(+), 1 deletion(-)

diff --git a/.github/unittest/linux/scripts/run_all.sh b/.github/unittest/linux/scripts/run_all.sh
index 9eda805c0e2..a645fb6796b 100755
--- a/.github/unittest/linux/scripts/run_all.sh
+++ b/.github/unittest/linux/scripts/run_all.sh
@@ -269,6 +269,26 @@ fi
 
 TORCHRL_TEST_SUITE="${TORCHRL_TEST_SUITE:-all}" # all|distributed|nondistributed
 
+# GPU test filtering: Run GPU-only tests on GPU machines, CPU-only tests on CPU machines.
+# This avoids running ~2000+ tests on expensive GPU machines when only ~30 require GPU.
+# Tests are marked with @pytest.mark.gpu if they require CUDA.
+#
+# Set TORCHRL_GPU_FILTER=0 to disable this optimization and run all tests.
+if [ "${TORCHRL_GPU_FILTER:-1}" = "1" ]; then
+  if [ "${CU_VERSION:-}" == cpu ]; then
+    # CPU job: run only tests that do NOT require GPU
+    GPU_MARKER_FILTER='-m "not gpu"'
+    echo "GPU filtering enabled: Running CPU-only tests (excluding @pytest.mark.gpu)"
+  else
+    # GPU job: run only tests that require GPU
+    GPU_MARKER_FILTER='-m gpu'
+    echo "GPU filtering enabled: Running GPU-only tests (@pytest.mark.gpu)"
+  fi
+else
+  GPU_MARKER_FILTER=""
+  echo "GPU filtering disabled: Running all tests"
+fi
+
 export PYTORCH_TEST_WITH_SLOW='1'
 python -m torch.utils.collect_env
 
@@ -287,6 +307,7 @@ run_distributed_tests() {
     return 1
   fi
   # Run both test_distributed.py and test_rb_distributed.py (both use torch.distributed)
+  # Note: distributed tests always run on GPU, no need for GPU_MARKER_FILTER here
   python .github/unittest/helpers/coverage_run_parallel.py -m pytest test/test_distributed.py test/test_rb_distributed.py \
     --instafail --durations 200 -vv --capture no \
     --timeout=120 --mp_fork_if_no_cuda
@@ -303,7 +324,7 @@ run_non_distributed_tests() {
   # - Shard 3: Everything else (can use pytest-xdist for parallelism)
   local shard="${TORCHRL_TEST_SHARD:-all}"
   local common_ignores="--ignore test/test_rlhf.py --ignore test/test_distributed.py --ignore test/test_rb_distributed.py --ignore test/llm --ignore test/test_setup.py"
-  local common_args="--instafail --durations 200 -vv --capture no --timeout=120 --mp_fork_if_no_cuda"
+  local common_args="--instafail --durations 200 -vv --capture no --timeout=120 --mp_fork_if_no_cuda ${GPU_MARKER_FILTER}"
   
   # pytest-xdist parallelism: use -n auto for shard 3 (fewer multiprocessing tests)
   # Set TORCHRL_XDIST=0 to disable parallel execution
diff --git a/pytest.ini b/pytest.ini
index 39fe36617a1..d0aa36cc299 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -6,6 +6,8 @@ addopts =
     --tb=native
 markers =
     unity_editor
+    slow: mark test as slow to run
+    gpu: mark test as requiring a GPU (CUDA device)
 testpaths =
     test
 xfail_strict = True
diff --git a/test/compile/test_compile_collectors.py b/test/compile/test_compile_collectors.py
index 703e9dc7b91..d4417bb33ce 100644
--- a/test/compile/test_compile_collectors.py
+++ b/test/compile/test_compile_collectors.py
@@ -77,6 +77,7 @@ def test_compiled_policy(self, collector_cls, compile_policy, device):
             collector.shutdown()
             del collector
 
+    @pytest.mark.gpu
     @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available")
     @pytest.mark.parametrize(
         "collector_cls",
diff --git a/test/conftest.py b/test/conftest.py
index db03c5ba0dd..7f7cbd9cea6 100644
--- a/test/conftest.py
+++ b/test/conftest.py
@@ -145,6 +145,9 @@ def pytest_runtest_setup(item):
 
 def pytest_configure(config):
     config.addinivalue_line("markers", "slow: mark test as slow to run")
+    config.addinivalue_line(
+        "markers", "gpu: mark test as requiring a GPU (CUDA device)"
+    )
 
 
 def pytest_collection_modifyitems(config, items):
diff --git a/test/llm/test_llm_updaters.py b/test/llm/test_llm_updaters.py
index ddc2a2a08a7..bf04d32d499 100644
--- a/test/llm/test_llm_updaters.py
+++ b/test/llm/test_llm_updaters.py
@@ -72,6 +72,7 @@ def get_open_port():
     )
 
 
+@pytest.mark.gpu
 @pytest.mark.skipif(not _has_transformers, reason="missing transformers dependencies")
 @pytest.mark.skipif(not _has_vllm, reason="missing vllm dependencies")
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
@@ -415,6 +416,7 @@ def test_local_llm_specific_features(self, target_vllm_engine):
     "See LLM_TEST_ISSUES.md for details.",
     strict=False,
 )
+@pytest.mark.gpu
 @pytest.mark.skipif(not _has_ray, reason="missing ray dependencies")
 @pytest.mark.skipif(not _has_vllm, reason="missing vllm dependencies")
 @pytest.mark.skipif(not _has_transformers, reason="missing transformers dependencies")
@@ -611,6 +613,7 @@ def test_weight_sync_vllm_collective_ray(self, request):
                 ray.shutdown()
 
 
+@pytest.mark.gpu
 @pytest.mark.xfail(
     reason="AsyncVLLM tests fail due to Ray placement group timeout. "
     "See LLM_TEST_ISSUES.md for details.",
diff --git a/test/llm/test_vllm.py b/test/llm/test_vllm.py
index 8378b60e640..6f2ffcb59ac 100644
--- a/test/llm/test_vllm.py
+++ b/test/llm/test_vllm.py
@@ -39,6 +39,7 @@ def sampling_params():
 class TestAsyncVLLMIntegration:
     """Integration tests for AsyncVLLM with real models."""
 
+    @pytest.mark.gpu
     @pytest.mark.skipif(not _has_vllm, reason="vllm not available")
     @pytest.mark.skipif(not _has_ray, reason="ray not available")
     @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
@@ -111,6 +112,7 @@ def test_vllm_api_compatibility(self, sampling_params):
         finally:
             service.shutdown()
 
+    @pytest.mark.gpu
     @pytest.mark.skipif(not _has_vllm, reason="vllm not available")
     @pytest.mark.skipif(not _has_ray, reason="ray not available")
     @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
diff --git a/test/llm/test_wrapper.py b/test/llm/test_wrapper.py
index a0485b05851..4d322392d54 100644
--- a/test/llm/test_wrapper.py
+++ b/test/llm/test_wrapper.py
@@ -2104,6 +2104,7 @@ def test_log_probs_consistency(
         "See LLM_TEST_ISSUES.md for details.",
         strict=False,
     )
+    @pytest.mark.gpu
     @pytest.mark.skipif(not _has_vllm, reason="vllm not available")
     @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
     def test_sync_async_vllm_strict_equivalence(
diff --git a/test/test_collectors.py b/test/test_collectors.py
index 8a283128101..dd60eb431a4 100644
--- a/test/test_collectors.py
+++ b/test/test_collectors.py
@@ -2333,6 +2333,7 @@ def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase:
         def _set_seed(self, seed: int | None) -> None:
             ...
 
+    @pytest.mark.gpu
     @pytest.mark.skipif(not torch.cuda.is_available(), reason="no cuda device")
     @pytest.mark.parametrize("env_device", ["cuda:0", "cpu"])
     @pytest.mark.parametrize("storing_device", [None, "cuda:0", "cpu"])
@@ -2371,6 +2372,7 @@ def test_no_synchronize(self, env_device, storing_device, no_cuda_sync):
                     assert u == i, i
                 mock_synchronize.assert_not_called()
 
+    @pytest.mark.gpu
     @pytest.mark.parametrize("device", ["cuda", "cpu"])
     @pytest.mark.parametrize("storing_device", ["cuda", "cpu"])
     @pytest.mark.skipif(not torch.cuda.is_available(), reason="no cuda device found")
@@ -3162,6 +3164,7 @@ def test_multi_collector_consistency(
         assert_allclose_td(c2.unsqueeze(0), d2)
 
 
+@pytest.mark.gpu
 @pytest.mark.skipif(
     not torch.cuda.is_available() and (not has_mps()),
     reason="No casting if no cuda",
@@ -3363,6 +3366,7 @@ def test_param_sync_mixed_device(
             col.shutdown()
             del col
 
+    @pytest.mark.gpu
     @pytest.mark.skipif(
         not torch.cuda.is_available() or torch.cuda.device_count() < 3,
         reason="requires at least 3 CUDA devices",
diff --git a/test/test_envs.py b/test/test_envs.py
index 2d3cf4c76ae..8895d79b15c 100644
--- a/test/test_envs.py
+++ b/test/test_envs.py
@@ -597,6 +597,7 @@ def test_auto_spec(self, env_type):
         env.auto_specs_(policy, tensordict=td.copy(), observation_key=obs_vals)
         env.check_env_specs(tensordict=td.copy())
 
+    @pytest.mark.gpu
     @pytest.mark.skipif(not torch.cuda.device_count(), reason="No cuda device found.")
     @pytest.mark.parametrize("break_when_any_done", [True, False])
     def test_auto_cast_to_device(self, break_when_any_done):
@@ -1526,6 +1527,7 @@ def test_parallel_env_with_policy(
             # env_serial.close()
             env0.close(raise_if_closed=False)
 
+    @pytest.mark.gpu
     @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required")
     @pytest.mark.parametrize("heterogeneous", [False, True])
     def test_transform_env_transform_no_device(
@@ -1638,6 +1640,7 @@ def test_parallel_env_custom_method(self, parallel, maybe_fork_ParallelEnv):
         finally:
             env.close(raise_if_closed=False)
 
+    @pytest.mark.gpu
     @pytest.mark.skipif(not torch.cuda.device_count(), reason="no cuda to test on")
     @pytest.mark.skipif(not _has_gym, reason="no gym")
     @pytest.mark.parametrize("frame_skip", [4])
@@ -1742,6 +1745,7 @@ def test_parallel_env_cast(
             env_serial.close(raise_if_closed=False)
             env0.close(raise_if_closed=False)
 
+    @pytest.mark.gpu
     @pytest.mark.skipif(not _has_gym, reason="no gym")
     @pytest.mark.skipif(not torch.cuda.device_count(), reason="no cuda device detected")
     @pytest.mark.parametrize("frame_skip", [4])
@@ -2726,6 +2730,7 @@ def test_marl_group_type(group_type):
         check_marl_grouping(group_type.get_group_map(agent_names), agent_names)
 
 
+@pytest.mark.gpu
 @pytest.mark.skipif(not torch.cuda.device_count(), reason="No cuda device")
 class TestConcurrentEnvs:
     """Concurrent parallel envs on multiple procs can interfere."""
diff --git a/test/test_libs.py b/test/test_libs.py
index ebe679e9e06..ee0d39f2e25 100644
--- a/test/test_libs.py
+++ b/test/test_libs.py
@@ -2157,6 +2157,7 @@ def test_set_seed_and_reset_works(self):
 
         assert isinstance(td, TensorDict)
 
+    @pytest.mark.gpu
     @pytest.mark.skipif(not torch.cuda.is_available(), reason="requires cuda")
     def test_dmcontrol_kwargs_preserved_with_seed(self):
         """Test that kwargs like camera_id are preserved when seed is provided.
@@ -2182,6 +2183,7 @@ def test_dmcontrol_kwargs_preserved_with_seed(self):
         finally:
             env.close()
 
+    @pytest.mark.gpu
     @pytest.mark.skipif(not torch.cuda.is_available(), reason="requires cuda")
     @pytest.mark.parametrize("env_name,task", [["cheetah", "run"]])
     @pytest.mark.parametrize("frame_skip", [1, 3])
@@ -2776,6 +2778,7 @@ def test_multithread_env_shutdown(self):
         assert not env.is_closed
         env.close()
 
+    @pytest.mark.gpu
     @pytest.mark.skipif(not torch.cuda.device_count(), reason="no cuda to test on")
     @pytest.mark.skipif(not _has_gym, reason="no gym")
     @pytest.mark.parametrize("frame_skip", [4])
@@ -2816,6 +2819,7 @@ def test_multithreaded_env_cast(
         assert td_device.device == torch.device(device), env_multithread
         env_multithread.close()
 
+    @pytest.mark.gpu
     @pytest.mark.skipif(not _has_gym, reason="no gym")
     @pytest.mark.skipif(not torch.cuda.device_count(), reason="no cuda device detected")
     @pytest.mark.parametrize("frame_skip", [4])
@@ -3097,6 +3101,7 @@ def test_brax_automatic_cache_clearing_parameter(self, envname, device, freq):
             out_td, next_td = env.step_and_maybe_reset(next_td)
             assert env._step_count == i + 1
 
+    @pytest.mark.gpu
     @pytest.mark.skipif(not torch.cuda.is_available(), reason="requires cuda")
     def test_brax_kwargs_preserved_with_seed(self, envname, device):
         """Test that kwargs like camera_id are preserved when seed is provided.
diff --git a/test/test_rb.py b/test/test_rb.py
index 86d22d4ba32..73441f9e91c 100644
--- a/test/test_rb.py
+++ b/test/test_rb.py
@@ -753,6 +753,7 @@ def test_state_dict(self, storage_type, data_type):
             storage2.get(range(10))
         )
 
+    @pytest.mark.gpu
     @pytest.mark.skipif(
         not torch.cuda.device_count(),
         reason="not cuda device found to test rb storage.",
diff --git a/test/test_specs.py b/test/test_specs.py
index ba68e0a3fb2..04dc922e969 100644
--- a/test/test_specs.py
+++ b/test/test_specs.py
@@ -3001,6 +3001,7 @@ def test_stack_zero_shape(self, stack_dim):
             assert r["a"].shape == torch.Size([*shape, 1, 3, 2])  # access tensor
         assert (r["a"] == 0).all()
 
+    @pytest.mark.gpu
     @pytest.mark.skipif(not torch.cuda.device_count(), reason="no cuda")
     @pytest.mark.parametrize("stack_dim", [0, 1, 2, -3, -2, -1])
     def test_to(self, stack_dim):
@@ -3958,6 +3959,7 @@ def test_encode(self):
         assert r.get("nontensor").shape == (1,)
 
 
+@pytest.mark.gpu
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="not cuda device")
 def test_device_ordinal():
     device = torch.device("cpu")
diff --git a/test/test_transforms.py b/test/test_transforms.py
index b96aa7f9bde..ef912ba00d4 100644
--- a/test/test_transforms.py
+++ b/test/test_transforms.py
@@ -1292,6 +1292,7 @@ def test_constant_padding(self, padding_value):
         assert (cat_td.get("cat_first_key") == padding_value).sum() == N - 4
 
 
+@pytest.mark.gpu
 @pytest.mark.skipif(not _has_tv, reason="torchvision not installed")
 @pytest.mark.skipif(not torch.cuda.device_count(), reason="Testing R3M on cuda only")
 @pytest.mark.parametrize("device", [torch.device("cuda:0")])
@@ -8748,6 +8749,7 @@ def test_transform_env(self):
         assert (env.reset()["_eps_gSDE"] != 0.0).all()
 
 
+@pytest.mark.gpu
 @pytest.mark.skipif(not _has_tv, reason="torchvision not installed")
 @pytest.mark.skipif(not torch.cuda.device_count(), reason="Testing VIP on cuda only")
 @pytest.mark.parametrize("device", [torch.device("cuda:0")])
@@ -9219,6 +9221,7 @@ def test_vip_spec_against_real(self, model, tensor_pixels_key, device):
         assert set(expected_keys) == set(transformed_env.rollout(3).keys(True))
 
 
+@pytest.mark.gpu
 @pytest.mark.skipif(not _has_vc, reason="vc_models not installed")
 @pytest.mark.skipif(not torch.cuda.device_count(), reason="VC1 should run on cuda")
 @pytest.mark.parametrize("device", [torch.device("cuda:0")])
@@ -10912,6 +10915,7 @@ def test_finitetensordictcheck(self, device):
         with pytest.raises(ValueError, match="Encountered a non-finite tensor"):
             ftd(td)
 
+    @pytest.mark.gpu
     @pytest.mark.skipif(not torch.cuda.device_count(), reason="no cuda device found")
     @pytest.mark.parametrize("device", get_default_devices())
     def test_pin_mem(self, device):

From f6705083f71324edd4ab6533ac290394a716cb1a Mon Sep 17 00:00:00 2001
From: vmoens <vincentmoens@gmail.com>
Date: Wed, 28 Jan 2026 11:07:56 +0000
Subject: [PATCH 2/4] Update

[ghstack-poisoned]
---
 .github/unittest/linux/scripts/run_all.sh | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/.github/unittest/linux/scripts/run_all.sh b/.github/unittest/linux/scripts/run_all.sh
index a645fb6796b..43b25048d09 100755
--- a/.github/unittest/linux/scripts/run_all.sh
+++ b/.github/unittest/linux/scripts/run_all.sh
@@ -274,18 +274,20 @@ TORCHRL_TEST_SUITE="${TORCHRL_TEST_SUITE:-all}" # all|distributed|nondistributed
 # Tests are marked with @pytest.mark.gpu if they require CUDA.
 #
 # Set TORCHRL_GPU_FILTER=0 to disable this optimization and run all tests.
+#
+# We use an array to handle the marker expression properly (avoids quoting issues).
+GPU_MARKER_FILTER=()
 if [ "${TORCHRL_GPU_FILTER:-1}" = "1" ]; then
   if [ "${CU_VERSION:-}" == cpu ]; then
     # CPU job: run only tests that do NOT require GPU
-    GPU_MARKER_FILTER='-m "not gpu"'
+    GPU_MARKER_FILTER=(-m 'not gpu')
     echo "GPU filtering enabled: Running CPU-only tests (excluding @pytest.mark.gpu)"
   else
     # GPU job: run only tests that require GPU
-    GPU_MARKER_FILTER='-m gpu'
+    GPU_MARKER_FILTER=(-m gpu)
     echo "GPU filtering enabled: Running GPU-only tests (@pytest.mark.gpu)"
   fi
 else
-  GPU_MARKER_FILTER=""
   echo "GPU filtering disabled: Running all tests"
 fi
 
@@ -324,7 +326,7 @@ run_non_distributed_tests() {
   # - Shard 3: Everything else (can use pytest-xdist for parallelism)
   local shard="${TORCHRL_TEST_SHARD:-all}"
   local common_ignores="--ignore test/test_rlhf.py --ignore test/test_distributed.py --ignore test/test_rb_distributed.py --ignore test/llm --ignore test/test_setup.py"
-  local common_args="--instafail --durations 200 -vv --capture no --timeout=120 --mp_fork_if_no_cuda ${GPU_MARKER_FILTER}"
+  local common_args="--instafail --durations 200 -vv --capture no --timeout=120 --mp_fork_if_no_cuda"
   
   # pytest-xdist parallelism: use -n auto for shard 3 (fewer multiprocessing tests)
   # Set TORCHRL_XDIST=0 to disable parallel execution
@@ -338,12 +340,12 @@ run_non_distributed_tests() {
     1)
       echo "Running shard 1: test_transforms.py only"
       python .github/unittest/helpers/coverage_run_parallel.py -m pytest test/test_transforms.py \
-        ${common_args}
+        "${GPU_MARKER_FILTER[@]}" ${common_args}
       ;;
     2)
       echo "Running shard 2: test_envs.py and test_collectors.py"
       python .github/unittest/helpers/coverage_run_parallel.py -m pytest test/test_envs.py test/test_collectors.py \
-        ${common_args}
+        "${GPU_MARKER_FILTER[@]}" ${common_args}
       ;;
     3)
       echo "Running shard 3: All other tests"
@@ -353,13 +355,13 @@ run_non_distributed_tests() {
         --ignore test/test_envs.py \
         --ignore test/test_collectors.py \
         ${xdist_args} \
-        ${common_args}
+        "${GPU_MARKER_FILTER[@]}" ${common_args}
       ;;
     all|"")
       echo "Running all tests (no sharding)"
       python .github/unittest/helpers/coverage_run_parallel.py -m pytest test \
         ${common_ignores} \
-        ${common_args}
+        "${GPU_MARKER_FILTER[@]}" ${common_args}
       ;;
     *)
       echo "Unknown TORCHRL_TEST_SHARD='${shard}'. Expected: all|1|2|3."

From 8910a1dae7da23c3feffe1c869982575b9e06a6b Mon Sep 17 00:00:00 2001
From: vmoens <vincentmoens@gmail.com>
Date: Wed, 28 Jan 2026 18:15:27 +0000
Subject: [PATCH 3/4] Update

[ghstack-poisoned]
---
 .../workflows/validate-test-partitioning.yml  | 71 +++++++++++++++++++
 1 file changed, 71 insertions(+)
 create mode 100644 .github/workflows/validate-test-partitioning.yml

diff --git a/.github/workflows/validate-test-partitioning.yml b/.github/workflows/validate-test-partitioning.yml
new file mode 100644
index 00000000000..49fb67a66ad
--- /dev/null
+++ b/.github/workflows/validate-test-partitioning.yml
@@ -0,0 +1,71 @@
+# Validates that GPU/CPU test partitioning covers all tests
+#
+# This workflow ensures that:
+# 1. Tests marked with @pytest.mark.gpu + tests not marked = all tests
+# 2. No tests are accidentally excluded from CI
+#
+# Runs on pull requests (to catch partitioning issues before merge), on pushes to main/nightly, and on manual dispatch.
+name: Validate Test Partitioning
+
+on:
+  pull_request:
+  push:
+    branches: [main, nightly]
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  validate:
+    runs-on: ubuntu-latest
+    timeout-minutes: 10
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      - name: Install minimal dependencies
+        run: |
+          pip install pytest tensordict
+          pip install torch --index-url https://download.pytorch.org/whl/cpu
+          pip install -e . --no-build-isolation --no-deps
+
+      - name: Validate test partitioning
+        run: |
+          set -e
+
+          echo "=================================================="
+          echo "        TEST PARTITIONING VALIDATION"
+          echo "=================================================="
+
+          # Collect test counts (test_loggers.py is ignored: torchvision operator issues on CPU-only).
+          # stderr stays attached to the job log so a pytest crash is visible if `set -e` aborts on an empty grep match.
+          ALL=$(pytest --collect-only -q test/ --ignore test/test_loggers.py | tail -1 | grep -oE "^[0-9]+")
+          GPU=$(pytest --collect-only -q -m gpu test/ --ignore test/test_loggers.py | tail -1 | grep -oE "^[0-9]+")
+          CPU=$(pytest --collect-only -q -m "not gpu" test/ --ignore test/test_loggers.py | tail -1 | grep -oE "^[0-9]+")
+
+          echo ""
+          echo "Total tests:                  $ALL"
+          echo "GPU tests (@pytest.mark.gpu): $GPU"
+          echo "CPU tests (not gpu):          $CPU"
+          echo "GPU + CPU:                    $((GPU + CPU))"
+          echo ""
+
+          # Validate: GPU + CPU should equal ALL
+          if [ "$((GPU + CPU))" -eq "$ALL" ]; then
+            echo "✅ PASS: Test partitioning is valid!"
+            echo "   All tests are accounted for."
+          else
+            echo "❌ FAIL: Test partitioning mismatch!"
+            echo "   GPU ($GPU) + CPU ($CPU) = $((GPU + CPU)), but total is $ALL"
+            echo ""
+            echo "   This means some tests are either:"
+            echo "   - Missing the @pytest.mark.gpu marker (if they require CUDA)"
+            echo "   - Being excluded unintentionally"
+            exit 1
+          fi

From 3fc3208ffb8081a3a9383a545deb6f7db6e6a76e Mon Sep 17 00:00:00 2001
From: vmoens <vincentmoens@gmail.com>
Date: Thu, 29 Jan 2026 09:43:29 +0000
Subject: [PATCH 4/4] Update

[ghstack-poisoned]
---
 .github/workflows/validate-test-partitioning.yml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/validate-test-partitioning.yml b/.github/workflows/validate-test-partitioning.yml
index a4d5c883f7e..b2974b78fb4 100644
--- a/.github/workflows/validate-test-partitioning.yml
+++ b/.github/workflows/validate-test-partitioning.yml
@@ -33,9 +33,11 @@ jobs:
         run: |
           pip install pytest tensordict
           pip install torch --index-url https://download.pytorch.org/whl/cpu
-          pip install -e . --no-build-isolation --no-deps
+          # Skip editable install - just add to PYTHONPATH for test collection
 
       - name: Validate test partitioning
+        env:
+          PYTHONPATH: ${{ github.workspace }}
         run: |
           set -e