diff --git a/.github/actions/install_neuronx_runtime/action.yml b/.github/actions/install_neuronx_runtime/action.yml index b09df2f45..9444ed7e0 100644 --- a/.github/actions/install_neuronx_runtime/action.yml +++ b/.github/actions/install_neuronx_runtime/action.yml @@ -3,19 +3,104 @@ description: install system and python packages for an AWS Neuronx SDK version runs: using: "composite" steps: + - name: Set Neuronx deb variables + shell: bash + run: | + NEURON_DEBS_LIST="aws-neuronx-tools=2.26.14.0 aws-neuronx-runtime-lib=2.28.23.0-dd5879008 aws-neuronx-collectives=2.28.27.0-bc30ece58" + echo "APT_CACHE_DIR=/mnt/hf_cache/apt-cache" >> $GITHUB_ENV + echo "NEURON_DEBS=${NEURON_DEBS_LIST}" >> $GITHUB_ENV + echo "NEURON_CACHE_KEY=$(echo -n \"${NEURON_DEBS_LIST}\" | sha256sum | awk '{print $1}')" >> $GITHUB_ENV + - name: Prepare cache directory and download Neuronx system dependencies + shell: bash + run: | + CACHE_DIR_KEY="${APT_CACHE_DIR}/${NEURON_CACHE_KEY}" + LOCKDIR="${CACHE_DIR_KEY}.lock" + READY_MARKER="${CACHE_DIR_KEY}/.ready" + LOCK_ACQUIRED=false + MAX_WAIT=300 # 5 minutes timeout + + # Create cache directory structure if it doesn't exist + sudo mkdir -p "${APT_CACHE_DIR}" 2>/dev/null || true + sudo chmod 777 "${APT_CACHE_DIR}" 2>/dev/null || true + + # Try to acquire lock with timeout (mkdir is atomic even on NFS) + for i in $(seq 1 $MAX_WAIT); do + if mkdir "${LOCKDIR}" 2>/dev/null; then + LOCK_ACQUIRED=true + echo "🔒 Lock acquired (attempt $i)" + + # Set up cleanup trap to remove lock directory on exit + trap "rmdir '${LOCKDIR}' 2>/dev/null || true" EXIT + + # Check if cache needs to be built + if [ ! -f "${READY_MARKER}" ]; then + echo "📦 Downloading Neuronx system dependencies..." + sudo mkdir -p "${CACHE_DIR_KEY}" + sudo chmod 777 "${CACHE_DIR_KEY}" + . 
/etc/os-release + sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null </dev/null || true + trap - EXIT + break + fi + + # Lock not acquired, wait and retry + if [ $i -eq 1 ]; then + echo "⏳ Waiting for another runner to complete download..." + fi + sleep 1 + done + + if [ "$LOCK_ACQUIRED" = "false" ]; then + echo "⚠️ Lock timeout after ${MAX_WAIT}s - will install directly from apt" + fi - name: Install Neuronx system packages shell: bash run: | - . /etc/os-release - sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null </dev/null | wc -l) packages" + echo "::endgroup::" + + sudo dpkg -i "${CACHE_DIR_KEY}"/*.deb + else + echo "📦 Cache not available, installing directly from apt..." + . /etc/os-release + sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null <> $GITHUB_ENV dpkg -l | grep neuron - name: Display driver version shell: bash run: | - apt show aws-neuronx-dkms + cat /sys/module/neuron/version || echo "Neuron driver version not available" diff --git a/.github/workflows/cleanup_inf2_cache.yml b/.github/workflows/cleanup_inf2_cache.yml new file mode 100644 index 000000000..50250a302 --- /dev/null +++ b/.github/workflows/cleanup_inf2_cache.yml @@ -0,0 +1,34 @@ +# The workflow file for cleaning up Inf2 cache +# It can be triggered by dispatch event or scheduler, to remove unused cache files on /mnt/hf_cache apt entries +name: Cleanup Inf2 Cache +on: + workflow_dispatch: + schedule: + # Schedule the workflow to run every second day at midnight UTC + - cron: '0 0 */2 * *' + +jobs: + do-the-job: + name: Apt cache cleanup + runs-on: + group: aws-inf2-8xlarge + steps: + - name: Apt cache cleanup + run: | + # remove each /mnt/hf_cache/apt-cache/KEY directory unless a sibling KEY.lock directory exists (the install action creates its lock at "${CACHE_DIR_KEY}.lock", next to the cache directory, not inside it); the KEY.lock directories themselves are skipped so a held lock is never deleted + for dir in /mnt/hf_cache/apt-cache/*; do + if [[ -d "$dir" && "$dir" != *.lock && ! 
-d "${dir}.lock" ]]; then + # get the size of the directory + size=$(sudo du -sh "$dir" | awk '{print $1}') + echo "Removing $dir ($size)" + sudo rm -rf "$dir" + fi + done + # report the /mnt/hf_cache/apt-cache subdirectories that were kept because a sibling .lock directory exists + for dir in /mnt/hf_cache/apt-cache/*; do + if [[ -d "$dir" && -d "${dir}.lock" ]]; then + # get the size of the directory + size=$(sudo du -sh "$dir" | awk '{print $1}') + echo "Keeping $dir ($size) that is locked" + fi + done diff --git a/.gitignore b/.gitignore index dc61514d0..b7cc22ab8 100644 --- a/.gitignore +++ b/.gitignore @@ -134,3 +134,6 @@ dmypy.json .vscode neuronxcc*/ + +# Ignore claude settings +.claude/ diff --git a/tests/decoder/test_decoder_export.py b/tests/decoder/test_decoder_export.py index 46871d578..7e3b3b9a4 100644 --- a/tests/decoder/test_decoder_export.py +++ b/tests/decoder/test_decoder_export.py @@ -106,6 +106,7 @@ def test_decoder_export_hub_models( _test_decoder_export_save_reload(model_id=any_decoder_model, is_local=False, load_weights=False) +# TODO: remove this test, it is just a test for CI @pytest.mark.parametrize("is_local", [True, False], ids=["local", "from_hub"]) @pytest.mark.parametrize("load_weights", [True, False], ids=["with-weights", "without-weights"]) def test_decoder_export_save_reload( diff --git a/tests/vllm/service/test_vllm_agentic.py b/tests/vllm/service/test_vllm_agentic.py index dde0db934..53d0a89f6 100644 --- a/tests/vllm/service/test_vllm_agentic.py +++ b/tests/vllm/service/test_vllm_agentic.py @@ -44,6 +44,7 @@ async def greedy_with_tools( return generated_tokens, generated_text, tool_calls +# TODO: remove this comment, it is just a test for CI # Note: we use Qwen3-0.6B as a test model because it is a small model that is easy to test and it supports tool calling. @pytest.mark.asyncio @pytest.mark.parametrize("neuron_llm_config", ["qwen3-1x8192"], indirect=True)