Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 91 additions & 6 deletions .github/actions/install_neuronx_runtime/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,104 @@ description: install system and python packages for an AWS Neuronx SDK version
runs:
using: "composite"
steps:
- name: Set Neuronx deb variables
  shell: bash
  run: |
    # Pinned Neuronx system packages, in apt "name=version" form.
    NEURON_DEBS_LIST="aws-neuronx-tools=2.26.14.0 aws-neuronx-runtime-lib=2.28.23.0-dd5879008 aws-neuronx-collectives=2.28.27.0-bc30ece58"

    # Cache key: SHA-256 of the exact package list, so any version bump selects a
    # different cache directory. The list is hashed verbatim with printf and a quoted
    # expansion (no literal quote characters, no word splitting). This yields a
    # different key than the previous echo-based form, so existing caches are rebuilt once.
    # Computing it in its own assignment lets a sha256sum failure abort the step
    # instead of being hidden inside an echo argument.
    NEURON_CACHE_KEY="$(printf '%s' "${NEURON_DEBS_LIST}" | sha256sum | awk '{print $1}')"

    # Publish the values to the following steps through the GITHUB_ENV file.
    {
      echo "APT_CACHE_DIR=/mnt/hf_cache/apt-cache"
      echo "NEURON_DEBS=${NEURON_DEBS_LIST}"
      echo "NEURON_CACHE_KEY=${NEURON_CACHE_KEY}"
    } >> "$GITHUB_ENV"
# Populate a shared cache directory with the pinned Neuronx .deb files.
# A lock directory serialises runners: the first one to create it downloads the
# packages and writes a ".ready" marker; the others wait, then find the marker.
- name: Prepare cache directory and download Neuronx system dependencies
shell: bash
run: |
# Per-package-list cache directory, its sibling lock directory, and the marker
# file that signals a completed download.
CACHE_DIR_KEY="${APT_CACHE_DIR}/${NEURON_CACHE_KEY}"
LOCKDIR="${CACHE_DIR_KEY}.lock"
READY_MARKER="${CACHE_DIR_KEY}/.ready"
LOCK_ACQUIRED=false
MAX_WAIT=300 # 5 minutes timeout

# Create cache directory structure if it doesn't exist
# (best effort: failures are silenced and ignored)
sudo mkdir -p "${APT_CACHE_DIR}" 2>/dev/null || true
sudo chmod 777 "${APT_CACHE_DIR}" 2>/dev/null || true

# Try to acquire lock with timeout (mkdir is atomic even on NFS)
# One attempt per second, MAX_WAIT attempts in total.
for i in $(seq 1 $MAX_WAIT); do
if mkdir "${LOCKDIR}" 2>/dev/null; then
LOCK_ACQUIRED=true
echo "🔒 Lock acquired (attempt $i)"

# Set up cleanup trap to remove lock directory on exit
# (double quotes: LOCKDIR is expanded now, when the trap is defined)
trap "rmdir '${LOCKDIR}' 2>/dev/null || true" EXIT

# Check if cache needs to be built
if [ ! -f "${READY_MARKER}" ]; then
echo "📦 Downloading Neuronx system dependencies..."
sudo mkdir -p "${CACHE_DIR_KEY}"
sudo chmod 777 "${CACHE_DIR_KEY}"
# Source os-release to obtain VERSION_CODENAME for the apt source line below.
. /etc/os-release
sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null <<EOF
deb https://apt.repos.neuron.amazonaws.com ${VERSION_CODENAME} main
EOF
# Register the Neuron repository public key, then refresh the package index.
wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add -
sudo apt-get update -qq
# Download only (no install), with apt's archive directory pointed at the cache.
# NEURON_DEBS is left unquoted so each "name=version" becomes its own argument.
sudo apt-get install -y --download-only \
-o Dir::Cache::archives="${CACHE_DIR_KEY}" \
${NEURON_DEBS}
# The marker is written last, so it only exists after the download commands ran.
touch "${READY_MARKER}"
echo "✅ Download complete: ${CACHE_DIR_KEY}"

else
echo "✅ Cache already ready (another runner downloaded it)"
fi

# Release lock
# and clear the EXIT trap, since the lock directory no longer needs cleanup.
rmdir "${LOCKDIR}" 2>/dev/null || true
trap - EXIT
break
fi

# Lock not acquired, wait and retry
# (the waiting message is printed on the first failed attempt only)
if [ $i -eq 1 ]; then
echo "⏳ Waiting for another runner to complete download..."
fi
sleep 1
done

# A timeout is reported as a warning only; this step does not fail because of it.
if [ "$LOCK_ACQUIRED" = "false" ]; then
echo "⚠️ Lock timeout after ${MAX_WAIT}s - will install directly from apt"
fi
- name: Install Neuronx system packages
  shell: bash
  run: |
    # Install the pinned Neuronx packages: from the shared .deb cache when its
    # ".ready" marker exists, otherwise directly from the Neuron apt repository.
    # Reads APT_CACHE_DIR, NEURON_CACHE_KEY and NEURON_DEBS from the environment.
    CACHE_DIR_KEY="${APT_CACHE_DIR}/${NEURON_CACHE_KEY}"
    READY_MARKER="${CACHE_DIR_KEY}/.ready"

    if [ -f "${READY_MARKER}" ]; then
      echo "📦 Installing from cache: ${CACHE_DIR_KEY}"

      # Show packages being installed in a collapsible group
      echo "::group::📋 Packages to install"
      cd "${CACHE_DIR_KEY}"
      ls -lh *.deb || echo "❌ No .deb files found"
      echo "📊 Total: $(ls -1 *.deb 2>/dev/null | wc -l) packages"
      echo "::endgroup::"

      sudo dpkg -i "${CACHE_DIR_KEY}"/*.deb
    else
      echo "📦 Cache not available, installing directly from apt..."
      # Source os-release to obtain VERSION_CODENAME for the apt source line.
      . /etc/os-release
      # Written with echo | tee rather than a heredoc: a heredoc terminator inside
      # this indented branch is easy to misplace and would swallow the script.
      echo "deb https://apt.repos.neuron.amazonaws.com ${VERSION_CODENAME} main" \
        | sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null
      wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add -
      sudo apt-get update -qq
      # NEURON_DEBS is left unquoted so each "name=version" becomes its own argument.
      sudo apt-get install -y ${NEURON_DEBS}
    fi

    # Make the Neuron tools visible to this step and to the following ones.
    export PATH=/opt/aws/neuron/bin:$PATH
    echo "PATH=${PATH}" >> "$GITHUB_ENV"
    # List the installed Neuron packages; grep exits non-zero when there are none.
    dpkg -l | grep neuron
- name: Display driver version
  shell: bash
  run: |
    # Print the apt metadata of the aws-neuronx-dkms package, then the contents of
    # /sys/module/neuron/version, with a fallback message when that file cannot be read.
    apt show aws-neuronx-dkms
    if ! cat /sys/module/neuron/version; then
      echo "Neuron driver version not available"
    fi
34 changes: 34 additions & 0 deletions .github/workflows/cleanup_inf2_cache.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# The workflow file for cleaning up Inf2 cache
# It can be triggered by a dispatch event or on a schedule, and removes unused apt cache entries under /mnt/hf_cache/apt-cache
name: Cleanup Inf2 Cache
on:
workflow_dispatch:
schedule:
# Run at midnight UTC on every odd day of the month (1, 3, ..., 31); this is not strictly every second day across month boundaries
- cron: '0 0 */2 * *'

jobs:
do-the-job:
name: Apt cache cleanup
runs-on:
group: aws-inf2-8xlarge
steps:
- name: Apt cache cleanup
  run: |
    # Remove every cache directory under /mnt/hf_cache/apt-cache that is not
    # currently locked by a runner.
    #
    # The install action locks a cache directory <key> by creating the SIBLING
    # directory <key>.lock (not <key>/.lock). This cleanup takes that same lock
    # with an atomic mkdir before deleting, so it can neither delete a cache that
    # is being downloaded nor remove a lock that another runner holds.
    status=0
    for dir in /mnt/hf_cache/apt-cache/*; do
      # Skip non-directories (this also covers the unexpanded glob when the cache is empty).
      if [[ ! -d "$dir" ]]; then
        continue
      fi
      # Lock directories are not cache entries; they are handled with their cache.
      if [[ "$dir" == *.lock ]]; then
        continue
      fi

      lock="${dir}.lock"
      # get the size of the directory
      size=$(sudo du -sh "$dir" | awk '{print $1}')
      if mkdir "$lock" 2>/dev/null; then
        echo "Removing $dir ($size)"
        # ${dir:?} guards against an empty path; a failure is recorded instead of
        # aborting, so the lock taken above is always released.
        if ! sudo rm -rf -- "${dir:?}"; then
          echo "Failed to remove $dir" >&2
          status=1
        fi
        rmdir "$lock"
      else
        echo "Keeping $dir ($size) that is locked"
      fi
    done
    exit "$status"
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -134,3 +134,6 @@ dmypy.json

.vscode
neuronxcc*/

# Ignore claude settings
.claude/
1 change: 1 addition & 0 deletions tests/decoder/test_decoder_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ def test_decoder_export_hub_models(
_test_decoder_export_save_reload(model_id=any_decoder_model, is_local=False, load_weights=False)


# TODO: remove this test, it is just a test for CI
@pytest.mark.parametrize("is_local", [True, False], ids=["local", "from_hub"])
@pytest.mark.parametrize("load_weights", [True, False], ids=["with-weights", "without-weights"])
def test_decoder_export_save_reload(
Expand Down
1 change: 1 addition & 0 deletions tests/vllm/service/test_vllm_agentic.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ async def greedy_with_tools(
return generated_tokens, generated_text, tool_calls


# TODO: remove this comment, it is just a test for CI
# Note: we use Qwen3-0.6B as a test model because it is a small model that is easy to test and it supports tool calling.
@pytest.mark.asyncio
@pytest.mark.parametrize("neuron_llm_config", ["qwen3-1x8192"], indirect=True)
Expand Down