Skip to content

Commit 1248fe2

Browse files
author
ssjia
committed
[ET-VK][CI] Add test-vulkan-genai job for Parakeet on NVIDIA GPU runner
Pull Request resolved: #18335 Add a new GitHub CI job that exports and runs the Parakeet TDT model with the Vulkan backend on an NVIDIA GPU runner. The Vulkan export and runner code already exists but had no CI coverage. - Add `--gpu` flag to `setup-vulkan-linux-deps.sh` to skip SwiftShader installation when running on machines with a real GPU driver - Add `vulkan` as a supported device in `export_model_artifact.sh` and `test_model_e2e.sh` - Add `test-vulkan-genai` job to `pull.yml` on `linux.g5.4xlarge.nvidia.gpu` ghstack-source-id: 354902046 @exported-using-ghexport Differential Revision: [D97344728](https://our.internmc.facebook.com/intern/diff/D97344728/)
1 parent 45b3e68 commit 1248fe2

4 files changed

Lines changed: 60 additions & 9 deletions

File tree

.ci/scripts/export_model_artifact.sh

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ Usage: export_model_artifact.sh <device> <hf_model> [quant_name] [output_dir] [m
1414
Export a HuggingFace model to CUDA/Metal/Vulkan/XNNPACK format with optional quantization.
1515
1616
Arguments:
17-
device cuda, metal, or xnnpack (required)
17+
device cuda, metal, vulkan, or xnnpack (required)
1818
1919
hf_model HuggingFace model ID (required)
2020
Supported models:
@@ -49,6 +49,7 @@ Examples:
4949
export_model_artifact.sh cuda "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed"
5050
export_model_artifact.sh cuda-windows "nvidia/diar_streaming_sortformer_4spk-v2" "non-quantized" "./output"
5151
export_model_artifact.sh cuda "google/gemma-3-4b-it" "non-quantized" "./output"
52+
export_model_artifact.sh vulkan "nvidia/parakeet-tdt" "non-quantized" "./output"
5253
export_model_artifact.sh cuda "nvidia/parakeet-tdt" "non-quantized" "./output"
5354
export_model_artifact.sh xnnpack "nvidia/parakeet-tdt" "quantized-8da4w" "./output"
5455
export_model_artifact.sh xnnpack "mistralai/Voxtral-Mini-4B-Realtime-2602" "quantized-8da4w" "./output"
@@ -103,9 +104,11 @@ case "$DEVICE" in
103104
;;
104105
xnnpack)
105106
;;
107+
vulkan)
108+
;;
106109
*)
107110
echo "Error: Unsupported device '$DEVICE'"
108-
echo "Supported devices: cuda, cuda-windows, metal, xnnpack"
111+
echo "Supported devices: cuda, cuda-windows, metal, vulkan, xnnpack"
109112
exit 1
110113
;;
111114
esac
@@ -218,8 +221,8 @@ case "$QUANT_NAME" in
218221
EXTRA_ARGS="--qlinear fpa4w --qlinear_encoder fpa4w"
219222
;;
220223
quantized-8da4w)
221-
if [ "$DEVICE" != "xnnpack" ]; then
222-
echo "Error: quantized-8da4w is only supported with xnnpack device"
224+
if [ "$DEVICE" != "xnnpack" ] && [ "$DEVICE" != "vulkan" ]; then
225+
echo "Error: quantized-8da4w is only supported with xnnpack or vulkan device"
223226
exit 1
224227
fi
225228
EXTRA_ARGS="--qlinear 8da4w --qlinear_group_size 32 --qlinear_encoder 8da4w --qlinear_encoder_group_size 32"
@@ -242,9 +245,11 @@ pip list
242245
if [ "$MODEL_NAME" = "parakeet" ]; then
243246
pip install -r examples/models/parakeet/install_requirements.txt
244247

245-
# Set dtype based on backend (XNNPACK uses fp32, CUDA/Metal use bf16)
248+
# Set dtype based on backend (XNNPACK uses fp32, Vulkan uses fp16, CUDA/Metal use bf16)
246249
if [ "$DEVICE" = "xnnpack" ]; then
247250
DTYPE_ARG=""
251+
elif [ "$DEVICE" = "vulkan" ]; then
252+
DTYPE_ARG="--vulkan_force_fp16"
248253
else
249254
DTYPE_ARG="--dtype bf16"
250255
fi

.ci/scripts/setup-vulkan-linux-deps.sh

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,5 +45,15 @@ install_vulkan_sdk() {
4545

4646
VULKAN_SDK_VERSION="1.4.321.1"
4747

48-
install_swiftshader
48+
# Parse arguments: --gpu skips SwiftShader (use NVIDIA driver's Vulkan ICD instead)
49+
USE_GPU=false
50+
for arg in "$@"; do
51+
case $arg in
52+
--gpu) USE_GPU=true ;;
53+
esac
54+
done
55+
56+
if [ "$USE_GPU" = false ]; then
57+
install_swiftshader
58+
fi
4959
install_vulkan_sdk "${VULKAN_SDK_VERSION}"

.ci/scripts/test_model_e2e.sh

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ Usage: test_model_e2e.sh <device> <hf_model> <quant_name> [model_dir] [mode]
1414
Build and run end-to-end tests for CUDA/Metal/Vulkan/XNNPACK models.
1515
1616
Arguments:
17-
device cuda, metal, or xnnpack (required)
17+
device cuda, metal, vulkan, or xnnpack (required)
1818
1919
hf_model HuggingFace model ID (required)
2020
Supported models:
@@ -47,6 +47,7 @@ Examples:
4747
test_model_e2e.sh metal "openai/whisper-small" "non-quantized"
4848
test_model_e2e.sh cuda "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed" "./model_output"
4949
test_model_e2e.sh cuda "nvidia/diar_streaming_sortformer_4spk-v2" "non-quantized" "./model_output"
50+
test_model_e2e.sh vulkan "nvidia/parakeet-tdt" "non-quantized" "./model_output"
5051
test_model_e2e.sh cuda "nvidia/parakeet-tdt" "non-quantized" "./model_output"
5152
test_model_e2e.sh xnnpack "nvidia/parakeet-tdt" "quantized-8da4w" "./model_output"
5253
test_model_e2e.sh metal "mistralai/Voxtral-Mini-4B-Realtime-2602" "non-quantized" "." "vr-streaming"
@@ -262,8 +263,8 @@ echo "::endgroup::"
262263

263264
echo "::group::Build $MODEL_NAME Runner"
264265

265-
if [ "$DEVICE" != "cuda" ] && [ "$DEVICE" != "metal" ] && [ "$DEVICE" != "xnnpack" ]; then
266-
echo "Error: Unsupported device '$DEVICE'. Must be 'cuda', 'metal', or 'xnnpack'."
266+
if [ "$DEVICE" != "cuda" ] && [ "$DEVICE" != "metal" ] && [ "$DEVICE" != "vulkan" ] && [ "$DEVICE" != "xnnpack" ]; then
267+
echo "Error: Unsupported device '$DEVICE'. Must be 'cuda', 'metal', 'vulkan', or 'xnnpack'."
267268
exit 1
268269
fi
269270

.github/workflows/pull.yml

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1310,6 +1310,41 @@ jobs:
13101310
python -m unittest backends/vulkan/test/test_vulkan_delegate.py -k "*pt2e*"
13111311
python -m unittest backends/vulkan/test/test_vulkan_delegate.py -k "*torchao*"
13121312
1313+
test-vulkan-genai:
1314+
name: test-vulkan-genai
1315+
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
1316+
permissions:
1317+
id-token: write
1318+
contents: read
1319+
with:
1320+
runner: linux.g5.4xlarge.nvidia.gpu
1321+
docker-image: ci-image:executorch-ubuntu-22.04-clang12
1322+
gpu-arch-type: cuda
1323+
gpu-arch-version: "12.6"
1324+
submodules: 'recursive'
1325+
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
1326+
timeout: 90
1327+
script: |
1328+
set -eux
1329+
1330+
# The generic Linux job chooses to use base env, not the one setup by the image
1331+
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
1332+
conda activate "${CONDA_ENV}"
1333+
1334+
# Setup Vulkan SDK (no SwiftShader — use NVIDIA driver's Vulkan ICD)
1335+
source .ci/scripts/setup-vulkan-linux-deps.sh --gpu
1336+
1337+
# Setup ExecuTorch
1338+
PYTHON_EXECUTABLE=python \
1339+
CMAKE_ARGS="-DEXECUTORCH_BUILD_VULKAN=ON" \
1340+
.ci/scripts/setup-linux.sh --build-tool "cmake"
1341+
1342+
# Export parakeet with Vulkan backend
1343+
bash .ci/scripts/export_model_artifact.sh vulkan "nvidia/parakeet-tdt" "quantized-8da4w" "${RUNNER_ARTIFACT_DIR}"
1344+
1345+
# Build runner and test e2e
1346+
bash .ci/scripts/test_model_e2e.sh vulkan "nvidia/parakeet-tdt" "quantized-8da4w" "${RUNNER_ARTIFACT_DIR}"
1347+
13131348
test-coreml-bc-macos:
13141349
name: test-coreml-bc-macos (${{ matrix.runner }})
13151350
uses: pytorch/test-infra/.github/workflows/macos_job.yml@main

0 commit comments

Comments
 (0)