From 1fecb54ae045300681d85f623a2a429fab8d7087 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Thu, 18 Dec 2025 22:47:31 +0800 Subject: [PATCH 1/4] CI: bump CUDA image to 12.9.1 Signed-off-by: Jinzhe Zeng --- .github/workflows/test_cuda.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_cuda.yml b/.github/workflows/test_cuda.yml index a934401a20..71bdc6503f 100644 --- a/.github/workflows/test_cuda.yml +++ b/.github/workflows/test_cuda.yml @@ -19,7 +19,7 @@ jobs: runs-on: nvidia # https://github.com/deepmodeling/deepmd-kit/pull/2884#issuecomment-1744216845 container: - image: nvidia/cuda:12.6.2-cudnn-devel-ubuntu22.04 + image: nvidia/cuda:12.9.1-cudnn-devel-ubuntu22.04 options: --gpus all if: github.repository_owner == 'deepmodeling' && (github.event_name == 'pull_request' && github.event.label && github.event.label.name == 'Test CUDA' || github.event_name == 'workflow_dispatch' || github.event_name == 'merge_group') steps: From 8244877af0a7b4db7f4112178d37bf7d6a37e08c Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Fri, 19 Dec 2025 15:49:46 +0800 Subject: [PATCH 2/4] debug GPU Signed-off-by: Jinzhe Zeng --- .github/workflows/test_cuda.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/test_cuda.yml b/.github/workflows/test_cuda.yml index 71bdc6503f..db09662e87 100644 --- a/.github/workflows/test_cuda.yml +++ b/.github/workflows/test_cuda.yml @@ -62,6 +62,8 @@ jobs: XLA_PYTHON_CLIENT_PREALLOCATE: false XLA_PYTHON_CLIENT_ALLOCATOR: platform FLAGS_use_stride_compute_kernel: 0 + TF_CPP_MAX_VLOG_LEVEL: 3 + if: false - name: Convert models run: source/tests/infer/convert-models.sh - run: | @@ -74,6 +76,7 @@ jobs: CMAKE_GENERATOR: Ninja DP_VARIANT: cuda DP_USE_MPICH2: 1 + TF_CPP_MAX_VLOG_LEVEL: 3 - run: | export LD_LIBRARY_PATH=$CUDA_PATH/lib64:/usr/lib/x86_64-linux-gnu/:$GITHUB_WORKSPACE/dp_test/lib:$LD_LIBRARY_PATH export PATH=$GITHUB_WORKSPACE/dp_test/bin:$PATH @@ -88,6 +91,7 @@ jobs: TF_INTER_OP_PARALLELISM_THREADS: 1 LAMMPS_PLUGIN_PATH: ${{ github.workspace }}/dp_test/lib/deepmd_lmp CUDA_VISIBLE_DEVICES: 0 + TF_CPP_MAX_VLOG_LEVEL: 3 pass: name: Pass testing on CUDA needs: [test_cuda] From 55a1d37a2d22b64c5a58dab32217fb15ced4964c Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Fri, 19 Dec 2025 17:06:43 +0800 Subject: [PATCH 3/4] bump cublas version Signed-off-by: Jinzhe Zeng --- .github/workflows/test_cuda.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test_cuda.yml b/.github/workflows/test_cuda.yml index db09662e87..cdc08c803a 100644 --- a/.github/workflows/test_cuda.yml +++ b/.github/workflows/test_cuda.yml @@ -49,6 +49,8 @@ jobs: export TENSORFLOW_ROOT=$(python -c 'import importlib.util,pathlib;print(pathlib.Path(importlib.util.find_spec("tensorflow").origin).parent)') pip install --find-links "https://www.paddlepaddle.org.cn/packages/nightly/cu126/paddlepaddle-gpu/" --index-url https://pypi.org/simple "paddlepaddle-gpu==3.3.0.dev20251204" source/install/uv_with_retry.sh pip install --system -v -e .[gpu,test,lmp,cu12,torch,jax] mpi4py --reinstall-package deepmd-kit + # See https://github.com/jax-ml/jax/issues/29042 + source/install/uv_with_retry.sh pip install -U nvidia-cublas-cu12>=12.9.0.13 env: DP_VARIANT: cuda DP_ENABLE_NATIVE_OPTIMIZATION: 1 @@ -62,7 +64,6 @@ jobs: XLA_PYTHON_CLIENT_PREALLOCATE: false XLA_PYTHON_CLIENT_ALLOCATOR: platform FLAGS_use_stride_compute_kernel: 0 - TF_CPP_MAX_VLOG_LEVEL: 3 if: false - name: Convert models run: source/tests/infer/convert-models.sh @@ -76,7 +77,6 @@ jobs: CMAKE_GENERATOR: Ninja DP_VARIANT: cuda DP_USE_MPICH2: 1 - TF_CPP_MAX_VLOG_LEVEL: 3 - run: | export LD_LIBRARY_PATH=$CUDA_PATH/lib64:/usr/lib/x86_64-linux-gnu/:$GITHUB_WORKSPACE/dp_test/lib:$LD_LIBRARY_PATH export PATH=$GITHUB_WORKSPACE/dp_test/bin:$PATH @@ -91,7 +91,6 @@ jobs: TF_INTER_OP_PARALLELISM_THREADS: 1 LAMMPS_PLUGIN_PATH: ${{ github.workspace }}/dp_test/lib/deepmd_lmp CUDA_VISIBLE_DEVICES: 0 - TF_CPP_MAX_VLOG_LEVEL: 3 pass: name: Pass testing on CUDA needs: [test_cuda] From 67f15f3795a4054aaa6bfc72a099f528a7ce5486 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Fri, 19 Dec 2025 17:46:59 +0800 Subject: [PATCH 4/4] system Signed-off-by: Jinzhe Zeng --- .github/workflows/test_cuda.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_cuda.yml b/.github/workflows/test_cuda.yml index cdc08c803a..355b5cff4f 100644 --- a/.github/workflows/test_cuda.yml +++ b/.github/workflows/test_cuda.yml @@ -50,7 +50,7 @@ jobs: pip install --find-links "https://www.paddlepaddle.org.cn/packages/nightly/cu126/paddlepaddle-gpu/" --index-url https://pypi.org/simple "paddlepaddle-gpu==3.3.0.dev20251204" source/install/uv_with_retry.sh pip install --system -v -e .[gpu,test,lmp,cu12,torch,jax] mpi4py --reinstall-package deepmd-kit # See https://github.com/jax-ml/jax/issues/29042 - source/install/uv_with_retry.sh pip install -U nvidia-cublas-cu12>=12.9.0.13 + source/install/uv_with_retry.sh pip install --system -U 'nvidia-cublas-cu12>=12.9.0.13' env: DP_VARIANT: cuda DP_ENABLE_NATIVE_OPTIMIZATION: 1