From a099482d6b2d4a6f6f09217a24c76e2e15ac0753 Mon Sep 17 00:00:00 2001 From: Alvaro Moran Date: Tue, 3 Feb 2026 10:04:58 +0000 Subject: [PATCH 1/6] ci(dependencies): neuronx packages are cached across jobs This avoids downloading the files several times, which speeds up each job. Now one job will always download the files to a cached location and the other jobs will reuse them. Note that for setups using the action when the mount point is not shared, the only difference is that it will not benefit from the shared download, but it will work anyway. --- .../install_neuronx_runtime/action.yml | 19 ++++++---- .github/workflows/prepare_cache.yml | 37 +++++++++++++++++++ 2 files changed, 49 insertions(+), 7 deletions(-) create mode 100644 .github/workflows/prepare_cache.yml diff --git a/.github/actions/install_neuronx_runtime/action.yml b/.github/actions/install_neuronx_runtime/action.yml index b09df2f45..050e58a02 100644 --- a/.github/actions/install_neuronx_runtime/action.yml +++ b/.github/actions/install_neuronx_runtime/action.yml @@ -3,16 +3,21 @@ description: install system and python packages for an AWS Neuronx SDK version runs: using: "composite" steps: + - name: Set APT_CACHE_DIR env variable + shell: bash + run: | + echo "APT_CACHE_DIR=/mnt/hf_cache/apt-${GITHUB_SHA}" >> $GITHUB_ENV + - name: Wait for cache to be ready + uses: lewagon/wait-on-check-action@v1.5.0 + with: + ref: ${{ github.ref }} + check-name: "Setup Neuronx system cache" + repo-token: ${{ github.token }} + wait-interval: 10 - name: Install Neuronx system packages shell: bash run: | - . 
/etc/os-release - sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null < /dev/null < Date: Tue, 3 Feb 2026 14:38:14 +0000 Subject: [PATCH 2/6] chore: add claude code settings to gitignore --- .../install_neuronx_runtime/action.yml | 91 +++++++++++++++++-- .github/workflows/prepare_cache.yml | 37 -------- .gitignore | 3 + 3 files changed, 84 insertions(+), 47 deletions(-) delete mode 100644 .github/workflows/prepare_cache.yml diff --git a/.github/actions/install_neuronx_runtime/action.yml b/.github/actions/install_neuronx_runtime/action.yml index 050e58a02..7d5b29c0a 100644 --- a/.github/actions/install_neuronx_runtime/action.yml +++ b/.github/actions/install_neuronx_runtime/action.yml @@ -3,21 +3,92 @@ description: install system and python packages for an AWS Neuronx SDK version runs: using: "composite" steps: - - name: Set APT_CACHE_DIR env variable + - name: Set Neuronx deb variables shell: bash run: | - echo "APT_CACHE_DIR=/mnt/hf_cache/apt-${GITHUB_SHA}" >> $GITHUB_ENV - - name: Wait for cache to be ready - uses: lewagon/wait-on-check-action@v1.5.0 - with: - ref: ${{ github.ref }} - check-name: "Setup Neuronx system cache" - repo-token: ${{ github.token }} - wait-interval: 10 + NEURON_DEBS_LIST="aws-neuronx-tools=2.26.14.0 aws-neuronx-runtime-lib=2.28.23.0-dd5879008 aws-neuronx-collectives=2.28.27.0-bc30ece58" + echo "APT_CACHE_DIR=/mnt/hf_cache/apt-cache" >> $GITHUB_ENV + echo "NEURON_DEBS=${NEURON_DEBS_LIST}" >> $GITHUB_ENV + echo "NEURON_CACHE_KEY=$(echo -n \"${NEURON_DEBS_LIST}\" | sha256sum | awk '{print $1}')" >> $GITHUB_ENV + - name: Prepare cache directory and download Neuronx system dependencies + shell: bash + run: | + CACHE_DIR_KEY="${APT_CACHE_DIR}/${NEURON_CACHE_KEY}" + LOCKDIR="${CACHE_DIR_KEY}.lock" + READY_MARKER="${CACHE_DIR_KEY}/.ready" + LOCK_ACQUIRED=false + MAX_WAIT=300 # 5 minutes timeout + + # Create cache directory structure if it doesn't exist + sudo mkdir -p "${APT_CACHE_DIR}" 2>/dev/null || true + sudo chmod 777 
"${APT_CACHE_DIR}" 2>/dev/null || true + + # Try to acquire lock with timeout (mkdir is atomic even on NFS) + for i in $(seq 1 $MAX_WAIT); do + if mkdir "${LOCKDIR}" 2>/dev/null; then + LOCK_ACQUIRED=true + echo "🔒 Lock acquired (attempt $i)" + + # Set up cleanup trap to remove lock directory on exit + trap "rmdir '${LOCKDIR}' 2>/dev/null || true" EXIT + + # Check if cache needs to be built + if [ ! -f "${READY_MARKER}" ]; then + echo "📦 Downloading Neuronx system dependencies..." + sudo mkdir -p "${CACHE_DIR_KEY}" + sudo chmod 777 "${CACHE_DIR_KEY}" + . /etc/os-release + sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null </dev/null || true + trap - EXIT + break + fi + + # Lock not acquired, wait and retry + if [ $i -eq 1 ]; then + echo "⏳ Waiting for another runner to complete download..." + fi + sleep 1 + done + + if [ "$LOCK_ACQUIRED" = "false" ]; then + echo "⚠️ Lock timeout after ${MAX_WAIT}s - will install directly from apt" + fi - name: Install Neuronx system packages shell: bash run: | - sudo dpkg -i ${APT_CACHE_DIR}/*.deb + CACHE_DIR_KEY="${APT_CACHE_DIR}/${NEURON_CACHE_KEY}" + READY_MARKER="${CACHE_DIR_KEY}/.ready" + + if [ -f "${READY_MARKER}" ]; then + echo "📦 Installing from cache: ${CACHE_DIR_KEY}" + sudo dpkg -i "${CACHE_DIR_KEY}"/*.deb + else + echo "📦 Cache not available, installing directly from apt..." + . 
/etc/os-release + sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null < /dev/null < Date: Wed, 4 Feb 2026 08:40:34 +0000 Subject: [PATCH 3/6] feat(ci): add cache cleanup pipeline --- .github/workflows/cleanup_inf2_cache.yml | 34 ++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 .github/workflows/cleanup_inf2_cache.yml diff --git a/.github/workflows/cleanup_inf2_cache.yml b/.github/workflows/cleanup_inf2_cache.yml new file mode 100644 index 000000000..50250a302 --- /dev/null +++ b/.github/workflows/cleanup_inf2_cache.yml @@ -0,0 +1,34 @@ +# The workflow file for cleaning up Inf2 cache +# It can be triggered by dispatch event or scheduler, to remove unused cache files on /mnt/hf_cache apt entries +name: Cleanup Inf2 Cache +on: + workflow_dispatch: + schedule: + # Schedule the workflow to run every second day at midnight UTC + - cron: '0 0 */2 * *' + +jobs: + do-the-job: + name: Apt cache cleanup + runs-on: + group: aws-inf2-8xlarge + steps: + - name: Apt cache cleanup + run: | + # list all /mnt/hf_cache/apt-cache subdirectories that do not contain a .lock directory + for dir in /mnt/hf_cache/apt-cache/*; do + if [[ -d "$dir" && ! 
-d "$dir/.lock" ]]; then + # get the size of the directory + size=$(sudo du -sh "$dir" | awk '{print $1}') + echo "Removing $dir ($size)" + sudo rm -rf "$dir" + fi + done + # list all /mnt/hf_cache/apt-cache subdirectories that contain a .lock directory + for dir in /mnt/hf_cache/apt-cache/*; do + if [[ -d "$dir" && -d "$dir/.lock" ]]; then + # get the size of the directory + size=$(sudo du -sh "$dir" | awk '{print $1}') + echo "Keeping $dir ($size) that is locked" + fi + done From 457ed28d526c0374a24d7eadc7acc5846ba0d00f Mon Sep 17 00:00:00 2001 From: Alvaro Moran Date: Tue, 3 Feb 2026 10:30:07 +0000 Subject: [PATCH 4/6] WIP: remove test --- .github/actions/install_neuronx_runtime/action.yml | 9 ++++++++- tests/decoder/test_decoder_export.py | 1 + tests/vllm/service/test_vllm_agentic.py | 1 + 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/.github/actions/install_neuronx_runtime/action.yml b/.github/actions/install_neuronx_runtime/action.yml index 7d5b29c0a..b93003d58 100644 --- a/.github/actions/install_neuronx_runtime/action.yml +++ b/.github/actions/install_neuronx_runtime/action.yml @@ -48,7 +48,7 @@ runs: ${NEURON_DEBS} touch "${READY_MARKER}" echo "✅ Download complete: ${CACHE_DIR_KEY}" - ls -lh "${CACHE_DIR_KEY}"/*.deb || true + else echo "✅ Cache already ready (another runner downloaded it)" fi @@ -77,6 +77,13 @@ runs: if [ -f "${READY_MARKER}" ]; then echo "📦 Installing from cache: ${CACHE_DIR_KEY}" + + # Show packages being installed in a collapsible group + echo "::group::📋 Packages to install" + ls -lh "${CACHE_DIR_KEY}"/*.deb || echo "❌ No .deb files found" + echo "📊 Total: $(ls -1 "${CACHE_DIR_KEY}"/*.deb 2>/dev/null | wc -l) packages" + echo "::endgroup::" + sudo dpkg -i "${CACHE_DIR_KEY}"/*.deb else echo "📦 Cache not available, installing directly from apt..." 
diff --git a/tests/decoder/test_decoder_export.py b/tests/decoder/test_decoder_export.py index 46871d578..7e3b3b9a4 100644 --- a/tests/decoder/test_decoder_export.py +++ b/tests/decoder/test_decoder_export.py @@ -106,6 +106,7 @@ def test_decoder_export_hub_models( _test_decoder_export_save_reload(model_id=any_decoder_model, is_local=False, load_weights=False) +# TODO: remove this test, it is just a test for CI @pytest.mark.parametrize("is_local", [True, False], ids=["local", "from_hub"]) @pytest.mark.parametrize("load_weights", [True, False], ids=["with-weights", "without-weights"]) def test_decoder_export_save_reload( diff --git a/tests/vllm/service/test_vllm_agentic.py b/tests/vllm/service/test_vllm_agentic.py index dde0db934..53d0a89f6 100644 --- a/tests/vllm/service/test_vllm_agentic.py +++ b/tests/vllm/service/test_vllm_agentic.py @@ -44,6 +44,7 @@ async def greedy_with_tools( return generated_tokens, generated_text, tool_calls +# TODO: remove this comment, it is just a test for CI # Note: we use Qwen3-0.6B as a test model because it is a small model that is easy to test and it supports tool calling. @pytest.mark.asyncio @pytest.mark.parametrize("neuron_llm_config", ["qwen3-1x8192"], indirect=True) From 8d00a03588e6144a13d5c21ef8a3e88c9e5008cf Mon Sep 17 00:00:00 2001 From: Alvaro Moran Date: Wed, 4 Feb 2026 10:28:08 +0000 Subject: [PATCH 5/6] fix(ci): correctly report neuron driver version when available apt show will report the version of the available package, installed or not. 
--- .github/actions/install_neuronx_runtime/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/install_neuronx_runtime/action.yml b/.github/actions/install_neuronx_runtime/action.yml index b93003d58..c6a30f51b 100644 --- a/.github/actions/install_neuronx_runtime/action.yml +++ b/.github/actions/install_neuronx_runtime/action.yml @@ -101,4 +101,4 @@ runs: - name: Display driver version shell: bash run: | - apt show aws-neuronx-dkms + cat /sys/module/neuron/version || echo "Neuron driver version not available" From 7e16c05e6137e08ec92b95caf65c0b1f33fd3cbf Mon Sep 17 00:00:00 2001 From: Alvaro Moran Date: Wed, 4 Feb 2026 10:30:44 +0000 Subject: [PATCH 6/6] WIP --- .github/actions/install_neuronx_runtime/action.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/actions/install_neuronx_runtime/action.yml b/.github/actions/install_neuronx_runtime/action.yml index c6a30f51b..9444ed7e0 100644 --- a/.github/actions/install_neuronx_runtime/action.yml +++ b/.github/actions/install_neuronx_runtime/action.yml @@ -80,8 +80,9 @@ runs: # Show packages being installed in a collapsible group echo "::group::📋 Packages to install" - ls -lh "${CACHE_DIR_KEY}"/*.deb || echo "❌ No .deb files found" - echo "📊 Total: $(ls -1 "${CACHE_DIR_KEY}"/*.deb 2>/dev/null | wc -l) packages" + cd ${CACHE_DIR_KEY} + ls -lh *.deb || echo "❌ No .deb files found" + echo "📊 Total: $(ls -1 *.deb 2>/dev/null | wc -l) packages" echo "::endgroup::" sudo dpkg -i "${CACHE_DIR_KEY}"/*.deb @@ -97,6 +98,7 @@ runs: fi export PATH=/opt/aws/neuron/bin:$PATH + echo "PATH=${PATH}" >> $GITHUB_ENV dpkg -l | grep neuron - name: Display driver version shell: bash