From 438a967546fd7138be4205f6215c7fecf96600fd Mon Sep 17 00:00:00 2001 From: Vincent Moens Date: Sat, 14 Mar 2026 04:46:10 -0700 Subject: [PATCH] [CI] Add torch_geometric integration tests Add CI workflow and tests to verify torchrl compatibility with torch_geometric (addresses #2679). Tests cover: - deepcopy of modules containing torch_geometric layers - The collector's meta-device deepcopy pattern - Collector integration with torch_geometric-based policies - TensorDictModule wrapping of torch_geometric modules Made-with: Cursor --- .github/labeler.yml | 14 +++ .../scripts_torch_geometric/environment.yml | 20 ++++ .../scripts_torch_geometric/install.sh | 57 ++++++++++ .../scripts_torch_geometric/post_process.sh | 6 + .../scripts_torch_geometric/run_all.sh | 9 ++ .../scripts_torch_geometric/run_test.sh | 23 ++++ .../scripts_torch_geometric/setup_env.sh | 44 ++++++++ .github/workflows/test-linux-libs.yml | 36 ++++++ test/test_libs.py | 105 ++++++++++++++++++ 9 files changed, 314 insertions(+) create mode 100644 .github/unittest/linux_libs/scripts_torch_geometric/environment.yml create mode 100755 .github/unittest/linux_libs/scripts_torch_geometric/install.sh create mode 100755 .github/unittest/linux_libs/scripts_torch_geometric/post_process.sh create mode 100755 .github/unittest/linux_libs/scripts_torch_geometric/run_all.sh create mode 100755 .github/unittest/linux_libs/scripts_torch_geometric/run_test.sh create mode 100755 .github/unittest/linux_libs/scripts_torch_geometric/setup_env.sh diff --git a/.github/labeler.yml b/.github/labeler.yml index 5bebcb9924c..d46bdb1ba84 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -105,6 +105,20 @@ - any-glob-to-any-file: - 'torchrl/data/datasets/**' +# ============================================================================= +# Integrations Sub-Labels (trigger specific tests in test-linux-libs.yml) +# ============================================================================= 
+"Integrations/torch_geometric": + - changed-files: + - any-glob-to-any-file: ['torchrl/collectors/**', 'torchrl/modules/**'] + +# Parent Integrations label (any integration-related change) +"Integrations": + - changed-files: + - any-glob-to-any-file: + - 'torchrl/collectors/**' + - 'torchrl/modules/**' + # ============================================================================= # LLM (triggers test-linux-llm.yml) # ============================================================================= diff --git a/.github/unittest/linux_libs/scripts_torch_geometric/environment.yml b/.github/unittest/linux_libs/scripts_torch_geometric/environment.yml new file mode 100644 index 00000000000..b4bc944fbbc --- /dev/null +++ b/.github/unittest/linux_libs/scripts_torch_geometric/environment.yml @@ -0,0 +1,20 @@ +channels: + - pytorch + - defaults +dependencies: + - pip + - pip: + - hypothesis + - future + - cloudpickle + - pytest + - pytest-cov + - pytest-mock + - pytest-instafail + - pytest-rerunfailures + - pytest-error-for-skips + - expecttest + - pybind11[global] + - pyyaml + - scipy + - torch_geometric diff --git a/.github/unittest/linux_libs/scripts_torch_geometric/install.sh b/.github/unittest/linux_libs/scripts_torch_geometric/install.sh new file mode 100755 index 00000000000..204cf25e3cb --- /dev/null +++ b/.github/unittest/linux_libs/scripts_torch_geometric/install.sh @@ -0,0 +1,57 @@ +#!/usr/bin/env bash + +unset PYTORCH_VERSION + +set -euxo pipefail + +eval "$(./conda/bin/conda shell.bash hook)" +conda activate ./env + +if [ "${CU_VERSION:-}" == cpu ] ; then + version="cpu" +else + if [[ ${#CU_VERSION} -eq 4 ]]; then + CUDA_VERSION="${CU_VERSION:2:1}.${CU_VERSION:3:1}" + elif [[ ${#CU_VERSION} -eq 5 ]]; then + CUDA_VERSION="${CU_VERSION:2:2}.${CU_VERSION:4:1}" + fi + echo "Using CUDA $CUDA_VERSION as determined by CU_VERSION ($CU_VERSION)" + version="$(python -c "print('.'.join(\"${CUDA_VERSION}\".split('.')[:2]))")" +fi + +# submodules +git submodule sync && git 
submodule update --init --recursive + +printf "* Installing PyTorch (%s)\n" "$TORCH_VERSION" +if [[ "$TORCH_VERSION" == "nightly" ]]; then + if [ "${CU_VERSION:-}" == cpu ] ; then + pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu -U + else + pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu128 -U + fi +elif [[ "$TORCH_VERSION" == "stable" ]]; then + if [ "${CU_VERSION:-}" == cpu ] ; then + pip3 install torch --index-url https://download.pytorch.org/whl/cpu + else + pip3 install torch --index-url https://download.pytorch.org/whl/cu128 + fi +else + printf "Failed to install pytorch: unsupported TORCH_VERSION=%s\n" "$TORCH_VERSION" >&2 + exit 1 +fi + +# install tensordict (git main when RELEASE is 0, PyPI release otherwise) +if [[ "$RELEASE" == 0 ]]; then + pip3 install git+https://github.com/pytorch/tensordict.git +else + pip3 install tensordict +fi + +# smoke test: functorch and tensordict must import before torchrl is built +python -c "import functorch;import tensordict" + +printf "* Installing torchrl\n" +python -m pip install -e . --no-build-isolation + +# smoke test: the editable torchrl install must import +python -c "import torchrl" diff --git a/.github/unittest/linux_libs/scripts_torch_geometric/post_process.sh b/.github/unittest/linux_libs/scripts_torch_geometric/post_process.sh new file mode 100755 index 00000000000..e97bf2a7b1b --- /dev/null +++ b/.github/unittest/linux_libs/scripts_torch_geometric/post_process.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +set -e + +eval "$(./conda/bin/conda shell.bash hook)" +conda activate ./env diff --git a/.github/unittest/linux_libs/scripts_torch_geometric/run_all.sh b/.github/unittest/linux_libs/scripts_torch_geometric/run_all.sh new file mode 100755 index 00000000000..8fb42cec533 --- /dev/null +++ b/.github/unittest/linux_libs/scripts_torch_geometric/run_all.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +set -euxo pipefail + +this_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +bash "${this_dir}/setup_env.sh" +bash "${this_dir}/install.sh" +bash "${this_dir}/run_test.sh" +bash "${this_dir}/post_process.sh" diff --git
a/.github/unittest/linux_libs/scripts_torch_geometric/run_test.sh b/.github/unittest/linux_libs/scripts_torch_geometric/run_test.sh new file mode 100755 index 00000000000..df9a49b2b53 --- /dev/null +++ b/.github/unittest/linux_libs/scripts_torch_geometric/run_test.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash + +set -euxo pipefail + +eval "$(./conda/bin/conda shell.bash hook)" +conda activate ./env + +export PYTORCH_TEST_WITH_SLOW='1' +export LAZY_LEGACY_OP=False +python -m torch.utils.collect_env +git config --global --add safe.directory '*' + +root_dir="$(git rev-parse --show-toplevel)" +env_dir="${root_dir}/env" +lib_dir="${env_dir}/lib" + +conda deactivate && conda activate ./env + +python -c "import torch_geometric" + +python .github/unittest/helpers/coverage_run_parallel.py -m pytest test/test_libs.py --instafail -v --durations 200 --capture no -k TestTorchGeometric --error-for-skips +coverage combine -q +coverage xml -i diff --git a/.github/unittest/linux_libs/scripts_torch_geometric/setup_env.sh b/.github/unittest/linux_libs/scripts_torch_geometric/setup_env.sh new file mode 100755 index 00000000000..d7dbd1bb7e6 --- /dev/null +++ b/.github/unittest/linux_libs/scripts_torch_geometric/setup_env.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash + +set -euxo pipefail + +apt-get update && apt-get upgrade -y && apt-get install -y git cmake +git config --global --add safe.directory '*' +apt-get install -y wget gcc g++ + +this_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +root_dir="$(git rev-parse --show-toplevel)" +conda_dir="${root_dir}/conda" +env_dir="${root_dir}/env" + +cd "${root_dir}" + +case "$(uname -s)" in + Darwin*) os=MacOSX;; + *) os=Linux +esac + +# 1. Install conda at ./conda +if [ ! 
-d "${conda_dir}" ]; then + printf "* Installing conda\n" + wget -O miniconda.sh "https://repo.anaconda.com/miniconda/Miniconda3-latest-${os}-x86_64.sh" + bash ./miniconda.sh -b -f -p "${conda_dir}" +fi +eval "$("${conda_dir}/bin/conda" shell.bash hook)" + +# 2. Create test environment at ./env +printf "python: %s\n" "${PYTHON_VERSION}" +if [ ! -d "${env_dir}" ]; then + printf "* Creating a test environment\n" + conda create --prefix "${env_dir}" -y python="$PYTHON_VERSION" +fi +conda activate "${env_dir}" + +# 3. Install Conda dependencies +printf "* Installing dependencies (except PyTorch)\n" +echo " - python=${PYTHON_VERSION}" >> "${this_dir}/environment.yml" +cat "${this_dir}/environment.yml" + +pip install pip --upgrade + +conda env update --file "${this_dir}/environment.yml" --prune diff --git a/.github/workflows/test-linux-libs.yml b/.github/workflows/test-linux-libs.yml index b1840a11fef..58003c528a4 100644 --- a/.github/workflows/test-linux-libs.yml +++ b/.github/workflows/test-linux-libs.yml @@ -794,3 +794,39 @@ jobs: bash .github/unittest/linux_libs/scripts_vmas/install.sh bash .github/unittest/linux_libs/scripts_vmas/run_test.sh bash .github/unittest/linux_libs/scripts_vmas/post_process.sh + + unittests-torch_geometric: + strategy: + matrix: + python_version: ["3.10"] + cuda_arch_version: ["12.8"] + if: ${{ github.event_name == 'push' || github.event_name == 'workflow_call' || github.event_name == 'workflow_dispatch' || contains(github.event.pull_request.labels.*.name, 'Integrations') || contains(github.event.pull_request.labels.*.name, 'Integrations/torch_geometric') }} + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + with: + repository: pytorch/rl + runner: "linux.g5.4xlarge.nvidia.gpu" + gpu-arch-type: cuda + gpu-arch-version: "12.8" + docker-image: "nvidia/cuda:12.4.0-devel-ubuntu22.04" + timeout: 120 + script: | + if [[ "${{ github.ref }}" =~ release/* ]]; then + export RELEASE=1 + export TORCH_VERSION=stable + else + export RELEASE=0 +
export TORCH_VERSION=nightly + fi + + set -euo pipefail + export PYTHON_VERSION="3.10" + export CU_VERSION="12.8" + export TAR_OPTIONS="--no-same-owner" + export UPLOAD_CHANNEL="nightly" + export TF_CPP_MIN_LOG_LEVEL=0 + export BATCHED_PIPE_TIMEOUT=60 + export TD_GET_DEFAULTS_TO_NONE=1 + + nvidia-smi + + bash .github/unittest/linux_libs/scripts_torch_geometric/run_all.sh diff --git a/test/test_libs.py b/test/test_libs.py index ba4386b4def..8857da7f7af 100644 --- a/test/test_libs.py +++ b/test/test_libs.py @@ -151,6 +151,7 @@ importlib.util.find_spec("mujoco") is not None or importlib.util.find_spec("mujoco_py") is not None ) +_has_torch_geometric = importlib.util.find_spec("torch_geometric") is not None def _has_atari_for_gym(): @@ -6038,6 +6039,110 @@ def test_procgen_start_level_num_levels(self): env.close() +@pytest.mark.skipif(not _has_torch_geometric, reason="torch_geometric not installed") +class TestTorchGeometric: + """Tests for torch_geometric compatibility with torchrl (issue #2679). + + The primary concern is that torch_geometric modules override __deepcopy__ + in a way that conflicts with torchrl's collector parameter-mapping logic. 
+ """ + + def _make_pyg_module(self, in_features=10, hidden=32, out_features=4): + from torch_geometric.nn import Linear as PyGLinear + + class PyGModule(nn.Module): + def __init__(self): + super().__init__() + self.pyg_linear = PyGLinear(in_features, hidden) + self.head = nn.Linear(hidden, out_features) + + def forward(self, x): + return self.head(torch.relu(self.pyg_linear(x))) + + return PyGModule() + + def test_deepcopy(self): + module = self._make_pyg_module() + module_copy = copy.deepcopy(module) + x = torch.randn(5, 10) + out_orig = module(x) + out_copy = module_copy(x) + assert out_orig.shape == (5, 4) and torch.allclose(out_orig, out_copy) + + def test_deepcopy_meta_device(self): + """Reproduce the collector's internal deepcopy pattern that triggers #2679.""" + module = self._make_pyg_module() + param_and_buf = TensorDict.from_module(module, as_module=True) + + with param_and_buf.data.to("meta").to_module(module): + module_copy = copy.deepcopy(module) + + param_and_buf.to_module(module_copy) + + x = torch.randn(5, 10) + out = module_copy(x) + assert out.shape == (5, 4) + + @pytest.mark.skipif( + not (torch.cuda.is_available() and torch.cuda.device_count()), + reason="CUDA required for collector device-mapping test", + ) + def test_collector_with_pyg_policy(self): + from torchrl.testing.mocking_classes import ContinuousActionVecMockEnv + + in_features = 7 + act_features = 7 + module = self._make_pyg_module( + in_features=in_features, hidden=32, out_features=act_features + ) + policy = TensorDictModule(module, in_keys=["observation"], out_keys=["action"]) + + collector = Collector( + create_env_fn=ContinuousActionVecMockEnv, + policy=policy, + total_frames=20, + frames_per_batch=10, + device="cpu", + policy_device="cuda:0", + ) + for data in collector: + assert "action" in data + break + collector.shutdown() + + def test_collector_with_pyg_policy_same_device(self): + from torchrl.testing.mocking_classes import ContinuousActionVecMockEnv + + in_features = 7 + act_features = 7 +
module = self._make_pyg_module( + in_features=in_features, hidden=32, out_features=act_features + ) + policy = TensorDictModule(module, in_keys=["observation"], out_keys=["action"]) + + collector = Collector( + create_env_fn=ContinuousActionVecMockEnv, + policy=policy, + total_frames=20, + frames_per_batch=10, + device="cpu", + ) + for data in collector: + assert "action" in data + break + collector.shutdown() + + def test_tensordict_module_wrap(self): + module = self._make_pyg_module() + td_module = TensorDictModule( + module, in_keys=["observation"], out_keys=["action"] + ) + td = TensorDict({"observation": torch.randn(3, 10)}) + out = td_module(td) + assert "action" in out + assert out["action"].shape == (3, 4) + + if __name__ == "__main__": args, unknown = argparse.ArgumentParser().parse_known_args() pytest.main([__file__, "--capture", "no", "--exitfirst"] + unknown)