diff --git a/.github/unittest/linux/scripts/run_all.sh b/.github/unittest/linux/scripts/run_all.sh
index 1a6e3e6d0be..599c70bb66e 100755
--- a/.github/unittest/linux/scripts/run_all.sh
+++ b/.github/unittest/linux/scripts/run_all.sh
@@ -270,6 +270,28 @@ fi
 
 TORCHRL_TEST_SUITE="${TORCHRL_TEST_SUITE:-all}" # all|distributed|nondistributed
 
+# GPU test filtering: GPU jobs run only tests marked @pytest.mark.gpu, and CPU
+# jobs (CU_VERSION=cpu) run only unmarked tests. This avoids running 2000+ tests
+# on expensive GPU machines when only ~30 of them require a GPU.
+# Any other CU_VERSION value, including unset, is treated as a GPU job.
+#
+# Set TORCHRL_GPU_FILTER to anything other than 1 (e.g. 0) to run all tests.
+# The filter is an array so that 'not gpu' reaches pytest as a single argument.
+GPU_MARKER_FILTER=()
+if [ "${TORCHRL_GPU_FILTER:-1}" = "1" ]; then
+  if [ "${CU_VERSION:-}" = "cpu" ]; then
+    # CPU job: run only tests that do NOT require GPU
+    GPU_MARKER_FILTER=(-m 'not gpu')
+    echo "GPU filtering enabled: Running CPU-only tests (excluding @pytest.mark.gpu)"
+  else
+    # GPU job: run only tests that require GPU
+    GPU_MARKER_FILTER=(-m gpu)
+    echo "GPU filtering enabled: Running GPU-only tests (@pytest.mark.gpu)"
+  fi
+else
+  echo "GPU filtering disabled: Running all tests"
+fi
+
 export PYTORCH_TEST_WITH_SLOW='1'
 python -m torch.utils.collect_env
 
@@ -292,6 +314,7 @@ run_distributed_tests() {
   local json_report_args="--json-report --json-report-file=${json_report_dir}/test-results-distributed.json --json-report-indent=2"
   
   # Run both test_distributed.py and test_rb_distributed.py (both use torch.distributed)
+  # Note: distributed tests always run on GPU, no need for GPU_MARKER_FILTER here
   python .github/unittest/helpers/coverage_run_parallel.py -m pytest test/test_distributed.py test/test_rb_distributed.py \
     ${json_report_args} \
     --instafail --durations 200 -vv --capture no \
@@ -327,12 +350,14 @@ run_non_distributed_tests() {
     1)
       echo "Running shard 1: test_transforms.py only"
       python .github/unittest/helpers/coverage_run_parallel.py -m pytest test/test_transforms.py \
+        "${GPU_MARKER_FILTER[@]}" \
         ${json_report_args} \
         ${common_args}
       ;;
     2)
       echo "Running shard 2: test_envs.py and test_collectors.py"
       python .github/unittest/helpers/coverage_run_parallel.py -m pytest test/test_envs.py test/test_collectors.py \
+        "${GPU_MARKER_FILTER[@]}" \
         ${json_report_args} \
         ${common_args}
       ;;
@@ -344,6 +369,7 @@ run_non_distributed_tests() {
         --ignore test/test_envs.py \
         --ignore test/test_collectors.py \
         ${xdist_args} \
+        "${GPU_MARKER_FILTER[@]}" \
         ${json_report_args} \
         ${common_args}
       ;;
@@ -351,6 +377,7 @@ run_non_distributed_tests() {
       echo "Running all tests (no sharding)"
       python .github/unittest/helpers/coverage_run_parallel.py -m pytest test \
         ${common_ignores} \
+        "${GPU_MARKER_FILTER[@]}" \
         ${json_report_args} \
         ${common_args}
       ;;
diff --git a/.github/workflows/validate-test-partitioning.yml b/.github/workflows/validate-test-partitioning.yml
new file mode 100644
index 00000000000..b2974b78fb4
--- /dev/null
+++ b/.github/workflows/validate-test-partitioning.yml
@@ -0,0 +1,73 @@
+# Validates that GPU/CPU test partitioning covers all tests
+#
+# This workflow ensures that:
+# 1. Tests marked with @pytest.mark.gpu + tests not marked = all tests
+# 2. No tests are accidentally excluded from CI
+#
+# Runs on PRs, on pushes to main/nightly, and on manual dispatch.
+name: Validate Test Partitioning
+
+on:
+  pull_request:
+  push:
+    branches: [main, nightly]
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  validate:
+    runs-on: ubuntu-latest
+    timeout-minutes: 10
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.12"
+
+      - name: Install minimal dependencies
+        run: |
+          pip install pytest tensordict
+          pip install torch --index-url https://download.pytorch.org/whl/cpu
+          # Skip editable install - just add to PYTHONPATH for test collection
+
+      - name: Validate test partitioning
+        env:
+          PYTHONPATH: ${{ github.workspace }}
+        run: |
+          set -e
+
+          echo "=================================================="
+          echo "        TEST PARTITIONING VALIDATION"
+          echo "=================================================="
+
+          # Collect test counts (test_loggers.py is ignored: torchvision operator issues on CPU-only).
+          ALL=$(pytest --collect-only -q test/ --ignore test/test_loggers.py 2>/dev/null | tail -1 | grep -oE "^[0-9]+" || true)
+          GPU=$(pytest --collect-only -q -m gpu test/ --ignore test/test_loggers.py 2>/dev/null | tail -1 | grep -oE "^[0-9]+" || true)
+          CPU=$(pytest --collect-only -q -m "not gpu" test/ --ignore test/test_loggers.py 2>/dev/null | tail -1 | grep -oE "^[0-9]+" || true)
+          ALL=${ALL:-0} GPU=${GPU:-0} CPU=${CPU:-0}  # no leading number (e.g. "no tests collected") => 0, not a silent set -e abort
+
+          echo ""
+          echo "Total tests:                  $ALL"
+          echo "GPU tests (@pytest.mark.gpu): $GPU"
+          echo "CPU tests (not gpu):          $CPU"
+          echo "GPU + CPU:                    $((GPU + CPU))"
+          echo ""
+
+          # Validate: at least one test was collected, and GPU + CPU equals ALL
+          if [ "$ALL" -gt 0 ] && [ "$((GPU + CPU))" -eq "$ALL" ]; then
+            echo "✅ PASS: Test partitioning is valid!"
+            echo "   All tests are accounted for."
+          else
+            echo "❌ FAIL: Test partitioning mismatch!"
+            echo "   GPU ($GPU) + CPU ($CPU) = $((GPU + CPU)), but total is $ALL"
+            echo ""
+            echo "   This means some tests are either:"
+            echo "   - Missing the @pytest.mark.gpu marker (if they require CUDA)"
+            echo "   - Being excluded unintentionally, or not collected at all (total is 0)"
+            exit 1
+          fi
diff --git a/pytest.ini b/pytest.ini
index 39fe36617a1..d0aa36cc299 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -6,6 +6,8 @@ addopts =
     --tb=native
 markers =
     unity_editor
+    slow: mark test as slow to run
+    gpu: mark test as requiring a GPU (CUDA device)
 testpaths =
     test
 xfail_strict = True
diff --git a/test/compile/test_compile_collectors.py b/test/compile/test_compile_collectors.py
index 703e9dc7b91..d4417bb33ce 100644
--- a/test/compile/test_compile_collectors.py
+++ b/test/compile/test_compile_collectors.py
@@ -77,6 +77,7 @@ def test_compiled_policy(self, collector_cls, compile_policy, device):
             collector.shutdown()
             del collector
 
+    @pytest.mark.gpu
     @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available")
     @pytest.mark.parametrize(
         "collector_cls",
diff --git a/test/conftest.py b/test/conftest.py
index c4e56cd5f0a..7ddfb2e1e9d 100644
--- a/test/conftest.py
+++ b/test/conftest.py
@@ -148,6 +148,9 @@ def pytest_runtest_setup(item):
 
 def pytest_configure(config):
     config.addinivalue_line("markers", "slow: mark test as slow to run")
+    config.addinivalue_line(
+        "markers", "gpu: mark test as requiring a GPU (CUDA device)"
+    )
 
 
 def pytest_collection_modifyitems(config, items):
diff --git a/test/llm/test_llm_updaters.py b/test/llm/test_llm_updaters.py
index ddc2a2a08a7..bf04d32d499 100644
--- a/test/llm/test_llm_updaters.py
+++ b/test/llm/test_llm_updaters.py
@@ -72,6 +72,7 @@ def get_open_port():
     )
 
 
+@pytest.mark.gpu
 @pytest.mark.skipif(not _has_transformers, reason="missing transformers dependencies")
 @pytest.mark.skipif(not _has_vllm, reason="missing vllm dependencies")
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
@@ -415,6 +416,7 @@ def test_local_llm_specific_features(self, target_vllm_engine):
     "See LLM_TEST_ISSUES.md for details.",
     strict=False,
 )
+@pytest.mark.gpu
 @pytest.mark.skipif(not _has_ray, reason="missing ray dependencies")
 @pytest.mark.skipif(not _has_vllm, reason="missing vllm dependencies")
 @pytest.mark.skipif(not _has_transformers, reason="missing transformers dependencies")
@@ -611,6 +613,7 @@ def test_weight_sync_vllm_collective_ray(self, request):
                 ray.shutdown()
 
 
+@pytest.mark.gpu
 @pytest.mark.xfail(
     reason="AsyncVLLM tests fail due to Ray placement group timeout. "
     "See LLM_TEST_ISSUES.md for details.",
diff --git a/test/llm/test_vllm.py b/test/llm/test_vllm.py
index 8378b60e640..6f2ffcb59ac 100644
--- a/test/llm/test_vllm.py
+++ b/test/llm/test_vllm.py
@@ -39,6 +39,7 @@ def sampling_params():
 class TestAsyncVLLMIntegration:
     """Integration tests for AsyncVLLM with real models."""
 
+    @pytest.mark.gpu
     @pytest.mark.skipif(not _has_vllm, reason="vllm not available")
     @pytest.mark.skipif(not _has_ray, reason="ray not available")
     @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
@@ -111,6 +112,7 @@ def test_vllm_api_compatibility(self, sampling_params):
         finally:
             service.shutdown()
 
+    @pytest.mark.gpu
     @pytest.mark.skipif(not _has_vllm, reason="vllm not available")
     @pytest.mark.skipif(not _has_ray, reason="ray not available")
     @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
diff --git a/test/llm/test_wrapper.py b/test/llm/test_wrapper.py
index a0485b05851..4d322392d54 100644
--- a/test/llm/test_wrapper.py
+++ b/test/llm/test_wrapper.py
@@ -2104,6 +2104,7 @@ def test_log_probs_consistency(
         "See LLM_TEST_ISSUES.md for details.",
         strict=False,
     )
+    @pytest.mark.gpu
     @pytest.mark.skipif(not _has_vllm, reason="vllm not available")
     @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
     def test_sync_async_vllm_strict_equivalence(
diff --git a/test/test_collectors.py b/test/test_collectors.py
index 8a283128101..dd60eb431a4 100644
--- a/test/test_collectors.py
+++ b/test/test_collectors.py
@@ -2333,6 +2333,7 @@ def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase:
         def _set_seed(self, seed: int | None) -> None:
             ...
 
+    @pytest.mark.gpu
     @pytest.mark.skipif(not torch.cuda.is_available(), reason="no cuda device")
     @pytest.mark.parametrize("env_device", ["cuda:0", "cpu"])
     @pytest.mark.parametrize("storing_device", [None, "cuda:0", "cpu"])
@@ -2371,6 +2372,7 @@ def test_no_synchronize(self, env_device, storing_device, no_cuda_sync):
                     assert u == i, i
                 mock_synchronize.assert_not_called()
 
+    @pytest.mark.gpu
     @pytest.mark.parametrize("device", ["cuda", "cpu"])
     @pytest.mark.parametrize("storing_device", ["cuda", "cpu"])
     @pytest.mark.skipif(not torch.cuda.is_available(), reason="no cuda device found")
@@ -3162,6 +3164,7 @@ def test_multi_collector_consistency(
         assert_allclose_td(c2.unsqueeze(0), d2)
 
 
+@pytest.mark.gpu
 @pytest.mark.skipif(
     not torch.cuda.is_available() and (not has_mps()),
     reason="No casting if no cuda",
@@ -3363,6 +3366,7 @@ def test_param_sync_mixed_device(
             col.shutdown()
             del col
 
+    @pytest.mark.gpu
     @pytest.mark.skipif(
         not torch.cuda.is_available() or torch.cuda.device_count() < 3,
         reason="requires at least 3 CUDA devices",
diff --git a/test/test_envs.py b/test/test_envs.py
index 2d3cf4c76ae..8895d79b15c 100644
--- a/test/test_envs.py
+++ b/test/test_envs.py
@@ -597,6 +597,7 @@ def test_auto_spec(self, env_type):
         env.auto_specs_(policy, tensordict=td.copy(), observation_key=obs_vals)
         env.check_env_specs(tensordict=td.copy())
 
+    @pytest.mark.gpu
     @pytest.mark.skipif(not torch.cuda.device_count(), reason="No cuda device found.")
     @pytest.mark.parametrize("break_when_any_done", [True, False])
     def test_auto_cast_to_device(self, break_when_any_done):
@@ -1526,6 +1527,7 @@ def test_parallel_env_with_policy(
             # env_serial.close()
             env0.close(raise_if_closed=False)
 
+    @pytest.mark.gpu
     @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA required")
     @pytest.mark.parametrize("heterogeneous", [False, True])
     def test_transform_env_transform_no_device(
@@ -1638,6 +1640,7 @@ def test_parallel_env_custom_method(self, parallel, maybe_fork_ParallelEnv):
         finally:
             env.close(raise_if_closed=False)
 
+    @pytest.mark.gpu
     @pytest.mark.skipif(not torch.cuda.device_count(), reason="no cuda to test on")
     @pytest.mark.skipif(not _has_gym, reason="no gym")
     @pytest.mark.parametrize("frame_skip", [4])
@@ -1742,6 +1745,7 @@ def test_parallel_env_cast(
             env_serial.close(raise_if_closed=False)
             env0.close(raise_if_closed=False)
 
+    @pytest.mark.gpu
     @pytest.mark.skipif(not _has_gym, reason="no gym")
     @pytest.mark.skipif(not torch.cuda.device_count(), reason="no cuda device detected")
     @pytest.mark.parametrize("frame_skip", [4])
@@ -2726,6 +2730,7 @@ def test_marl_group_type(group_type):
         check_marl_grouping(group_type.get_group_map(agent_names), agent_names)
 
 
+@pytest.mark.gpu
 @pytest.mark.skipif(not torch.cuda.device_count(), reason="No cuda device")
 class TestConcurrentEnvs:
     """Concurrent parallel envs on multiple procs can interfere."""
diff --git a/test/test_libs.py b/test/test_libs.py
index 274b9ec406c..97277f444d8 100644
--- a/test/test_libs.py
+++ b/test/test_libs.py
@@ -2157,6 +2157,7 @@ def test_set_seed_and_reset_works(self):
 
         assert isinstance(td, TensorDict)
 
+    @pytest.mark.gpu
     @pytest.mark.skipif(not torch.cuda.is_available(), reason="requires cuda")
     def test_dmcontrol_kwargs_preserved_with_seed(self):
         """Test that kwargs like camera_id are preserved when seed is provided.
@@ -2182,6 +2183,7 @@ def test_dmcontrol_kwargs_preserved_with_seed(self):
         finally:
             env.close()
 
+    @pytest.mark.gpu
     @pytest.mark.skipif(not torch.cuda.is_available(), reason="requires cuda")
     @pytest.mark.parametrize("env_name,task", [["cheetah", "run"]])
     @pytest.mark.parametrize("frame_skip", [1, 3])
@@ -2849,6 +2851,7 @@ def test_multithread_env_shutdown(self):
         assert not env.is_closed
         env.close()
 
+    @pytest.mark.gpu
     @pytest.mark.skipif(not torch.cuda.device_count(), reason="no cuda to test on")
     @pytest.mark.skipif(not _has_gym, reason="no gym")
     @pytest.mark.parametrize("frame_skip", [4])
@@ -2889,6 +2892,7 @@ def test_multithreaded_env_cast(
         assert td_device.device == torch.device(device), env_multithread
         env_multithread.close()
 
+    @pytest.mark.gpu
     @pytest.mark.skipif(not _has_gym, reason="no gym")
     @pytest.mark.skipif(not torch.cuda.device_count(), reason="no cuda device detected")
     @pytest.mark.parametrize("frame_skip", [4])
@@ -3170,6 +3174,7 @@ def test_brax_automatic_cache_clearing_parameter(self, envname, device, freq):
             out_td, next_td = env.step_and_maybe_reset(next_td)
             assert env._step_count == i + 1
 
+    @pytest.mark.gpu
     @pytest.mark.skipif(not torch.cuda.is_available(), reason="requires cuda")
     def test_brax_kwargs_preserved_with_seed(self, envname, device):
         """Test that kwargs like camera_id are preserved when seed is provided.
diff --git a/test/test_rb.py b/test/test_rb.py
index 86d22d4ba32..73441f9e91c 100644
--- a/test/test_rb.py
+++ b/test/test_rb.py
@@ -753,6 +753,7 @@ def test_state_dict(self, storage_type, data_type):
             storage2.get(range(10))
         )
 
+    @pytest.mark.gpu
     @pytest.mark.skipif(
         not torch.cuda.device_count(),
         reason="not cuda device found to test rb storage.",
diff --git a/test/test_specs.py b/test/test_specs.py
index fd2f68aaf77..1a09b17cc47 100644
--- a/test/test_specs.py
+++ b/test/test_specs.py
@@ -3001,6 +3001,7 @@ def test_stack_zero_shape(self, stack_dim):
             assert r["a"].shape == torch.Size([*shape, 1, 3, 2])  # access tensor
         assert (r["a"] == 0).all()
 
+    @pytest.mark.gpu
     @pytest.mark.skipif(not torch.cuda.device_count(), reason="no cuda")
     @pytest.mark.parametrize("stack_dim", [0, 1, 2, -3, -2, -1])
     def test_to(self, stack_dim):
@@ -3958,6 +3959,7 @@ def test_encode(self):
         assert r.get("nontensor").shape == (1,)
 
 
+@pytest.mark.gpu
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="not cuda device")
 def test_device_ordinal():
     device = torch.device("cpu")
diff --git a/test/test_transforms.py b/test/test_transforms.py
index cd5c880df3a..65e6939afb7 100644
--- a/test/test_transforms.py
+++ b/test/test_transforms.py
@@ -1292,6 +1292,7 @@ def test_constant_padding(self, padding_value):
         assert (cat_td.get("cat_first_key") == padding_value).sum() == N - 4
 
 
+@pytest.mark.gpu
 @pytest.mark.skipif(not _has_tv, reason="torchvision not installed")
 @pytest.mark.skipif(not torch.cuda.device_count(), reason="Testing R3M on cuda only")
 @pytest.mark.parametrize("device", [torch.device("cuda:0")])
@@ -8788,6 +8789,7 @@ def test_transform_env(self):
         assert (env.reset()["_eps_gSDE"] != 0.0).all()
 
 
+@pytest.mark.gpu
 @pytest.mark.skipif(not _has_tv, reason="torchvision not installed")
 @pytest.mark.skipif(not torch.cuda.device_count(), reason="Testing VIP on cuda only")
 @pytest.mark.parametrize("device", [torch.device("cuda:0")])
@@ -9259,6 +9261,7 @@ def test_vip_spec_against_real(self, model, tensor_pixels_key, device):
         assert set(expected_keys) == set(transformed_env.rollout(3).keys(True))
 
 
+@pytest.mark.gpu
 @pytest.mark.skipif(not _has_vc, reason="vc_models not installed")
 @pytest.mark.skipif(not torch.cuda.device_count(), reason="VC1 should run on cuda")
 @pytest.mark.parametrize("device", [torch.device("cuda:0")])
@@ -10952,6 +10955,7 @@ def test_finitetensordictcheck(self, device):
         with pytest.raises(ValueError, match="Encountered a non-finite tensor"):
             ftd(td)
 
+    @pytest.mark.gpu
     @pytest.mark.skipif(not torch.cuda.device_count(), reason="no cuda device found")
     @pytest.mark.parametrize("device", get_default_devices())
     def test_pin_mem(self, device):