@@ -19,12 +19,9 @@ Arguments:
   hf_model      HuggingFace model ID (required)
                 Supported models:
                   - mistralai/Voxtral-Mini-3B-2507
-                  - nvidia/diar_streaming_sortformer_4spk-v2
                   - openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo})
                   - google/gemma-3-4b-it
-                  - Qwen/Qwen3-0.6B
                   - nvidia/parakeet-tdt
-                  - facebook/dinov2-small-imagenet1k-1-layer
                   - mistralai/Voxtral-Mini-4B-Realtime-2602

   quant_name    Quantization type (required)
@@ -46,7 +43,6 @@ Arguments:
 Examples:
   test_model_e2e.sh metal "openai/whisper-small" "non-quantized"
   test_model_e2e.sh cuda "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed" "./model_output"
-  test_model_e2e.sh cuda "nvidia/diar_streaming_sortformer_4spk-v2" "non-quantized" "./model_output"
   test_model_e2e.sh cuda "nvidia/parakeet-tdt" "non-quantized" "./model_output"
   test_model_e2e.sh xnnpack "nvidia/parakeet-tdt" "quantized-8da4w" "./model_output"
   test_model_e2e.sh metal "mistralai/Voxtral-Mini-4B-Realtime-2602" "non-quantized" "." "vr-streaming"
@@ -155,18 +151,6 @@ case "$HF_MODEL" in
         AUDIO_FILE=""
         IMAGE_PATH="docs/source/_static/img/et-logo.png"
         ;;
-    Qwen/Qwen3-0.6B)
-        MODEL_NAME="qwen3"
-        RUNNER_TARGET="llama_main"
-        RUNNER_PATH="llama"
-        EXPECTED_OUTPUT="Paris"
-        PREPROCESSOR=""
-        TOKENIZER_URL="https://huggingface.co/Qwen/Qwen3-0.6B/resolve/main" # @lint-ignore
-        TOKENIZER_FILE=""
-        AUDIO_URL=""
-        AUDIO_FILE=""
-        IMAGE_PATH=""
-        ;;
     nvidia/parakeet-tdt)
         MODEL_NAME="parakeet"
         RUNNER_TARGET="parakeet_runner"
@@ -179,31 +163,6 @@ case "$HF_MODEL" in
         AUDIO_FILE="test_audio.wav"
         IMAGE_PATH=""
         ;;
-    nvidia/diar_streaming_sortformer_4spk-v2)
-        MODEL_NAME="sortformer"
-        RUNNER_TARGET="sortformer_runner"
-        RUNNER_PATH="sortformer"
-        EXPECTED_OUTPUT="Speaker 1"
-        PREPROCESSOR=""
-        TOKENIZER_URL=""
-        TOKENIZER_FILE=""
-        AUDIO_URL="https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/testaudio/16000/test01_20s.wav"
-        AUDIO_FILE="poem.wav"
-        IMAGE_PATH=""
-        ;;
-    facebook/dinov2-small-imagenet1k-1-layer)
-        MODEL_NAME="dinov2"
-        RUNNER_TARGET="dinov2_runner"
-        RUNNER_PATH="dinov2"
-        EXPECTED_OUTPUT="Samoyed"
-        PREPROCESSOR=""
-        TOKENIZER_URL=""
-        TOKENIZER_FILE=""
-        AUDIO_URL=""
-        AUDIO_FILE=""
-        IMAGE_URL="https://github.com/pytorch/hub/raw/master/images/dog.jpg"
-        IMAGE_PATH=""
-        ;;
     mistralai/Voxtral-Mini-4B-Realtime-2602)
         MODEL_NAME="voxtral_realtime"
         RUNNER_TARGET="voxtral_realtime_runner"
@@ -218,7 +177,7 @@ case "$HF_MODEL" in
         ;;
     *)
         echo "Error: Unsupported model '$HF_MODEL'"
-        echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, nvidia/diar_streaming_sortformer_4spk-v2, openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}), google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer"
+        echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}), google/gemma-3-4b-it, nvidia/parakeet-tdt"
         exit 1
         ;;
 esac
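
Note: every arm of this case block fills the same set of variables, which the artifact-download and runner-argument stages further down consume. For reference, a new model would add one more arm of the same shape; the id and values below are placeholders, not a real entry, and the comments reflect how the script appears to use each variable:

    example-org/example-model)              # hypothetical, illustration only
        MODEL_NAME="example"                # keys the per-model branches below
        RUNNER_TARGET="example_runner"      # runner build target
        RUNNER_PATH="example"               # used to locate the runner binary
        EXPECTED_OUTPUT="expected text"     # presumably the substring checked in runner output
        PREPROCESSOR=""
        TOKENIZER_URL=""
        TOKENIZER_FILE=""
        AUDIO_URL=""
        AUDIO_FILE=""
        IMAGE_PATH=""
        ;;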
@@ -231,8 +190,8 @@ echo "::endgroup::"
 echo "::group::Prepare $MODEL_NAME Artifacts"


-# Download tokenizer files (skip for models that bundle tokenizer in export or do not use one)
-if [ "$MODEL_NAME" != "parakeet" ] && [ "$MODEL_NAME" != "voxtral_realtime" ] && [ "$MODEL_NAME" != "sortformer" ] && [ "$MODEL_NAME" != "dinov2" ]; then
+# Download tokenizer files (skip for parakeet and voxtral_realtime which bundle tokenizer in export)
+if [ "$MODEL_NAME" != "parakeet" ] && [ "$MODEL_NAME" != "voxtral_realtime" ]; then
     if [ "$TOKENIZER_FILE" != "" ]; then
         curl -L $TOKENIZER_URL/$TOKENIZER_FILE -o $MODEL_DIR/$TOKENIZER_FILE
     else
@@ -248,15 +207,10 @@ if [ "$AUDIO_URL" != "" ]; then
 elif [[ "$MODEL_NAME" == *whisper* ]] || [ "$MODEL_NAME" = "voxtral_realtime" ]; then
     conda install -y -c conda-forge "ffmpeg<8"
     pip install datasets soundfile
-    pip install torchcodec==0.11.0.dev20260310 --extra-index-url https://download.pytorch.org/whl/nightly/cpu
+    pip install torchcodec==0.11.0.dev20260312 --extra-index-url https://download.pytorch.org/whl/nightly/cpu
     python -c "from datasets import load_dataset;import soundfile as sf;sample = load_dataset('distil-whisper/librispeech_long', 'clean', split='validation')[0]['audio'];sf.write('${MODEL_DIR}/$AUDIO_FILE', sample['array'][:sample['sampling_rate']*30], sample['sampling_rate'])"
 fi

-# Download test image for vision models
-if [ -n "${IMAGE_URL:-}" ]; then
-    curl -L "$IMAGE_URL" -o "${MODEL_DIR}/test_image.jpg"
-fi
-
 ls -al
 echo "::endgroup::"

@@ -292,14 +246,9 @@ if [ "$(uname -s)" = "Darwin" ] && [ -f "$RUNNER_BIN" ]; then
         install_name_tool -change /opt/llvm-openmp/lib/libomp.dylib @rpath/libomp.dylib "$RUNNER_BIN"
     fi
 fi
-# For CUDA, add named data argument (Metal embeds data in .pte).
-# Llama runner uses --data_paths, other runners use --data_path.
+# For CUDA, add data_path argument (Metal embeds data in .pte)
 if [ "$DEVICE" = "cuda" ]; then
-    if [ "$RUNNER_PATH" = "llama" ]; then
-        RUNNER_ARGS="$RUNNER_ARGS --data_paths ${MODEL_DIR}/aoti_cuda_blob.ptd"
-    else
-        RUNNER_ARGS="$RUNNER_ARGS --data_path ${MODEL_DIR}/aoti_cuda_blob.ptd"
-    fi
+    RUNNER_ARGS="$RUNNER_ARGS --data_path ${MODEL_DIR}/aoti_cuda_blob.ptd"
 fi

 # Add model-specific arguments
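
The deleted branch served only the llama runner, which takes the plural `--data_paths` flag; its sole user here was the removed Qwen3 entry, so the block collapses to the singular `--data_path` for all remaining runners. A minimal sketch of what the block contributes per device (directory name illustrative):

    MODEL_DIR=./model_output                 # illustrative
    RUNNER_ARGS="--model_path ${MODEL_DIR}/model.pte"
    DEVICE=cuda                              # Metal would skip the append below
    if [ "$DEVICE" = "cuda" ]; then
        # same append as above; Metal needs nothing extra since its weights live inside model.pte
        RUNNER_ARGS="$RUNNER_ARGS --data_path ${MODEL_DIR}/aoti_cuda_blob.ptd"
    fi
    echo "$RUNNER_ARGS"
    # -> --model_path ./model_output/model.pte --data_path ./model_output/aoti_cuda_blob.ptd

Note that the parakeet arm in the next hunk rebuilds RUNNER_ARGS from scratch, which is why it re-appends the flag inside its own arm.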
@@ -313,44 +262,26 @@ case "$MODEL_NAME" in
     gemma3)
         RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/ --image_path $IMAGE_PATH"
         ;;
-    qwen3)
-        PROMPT_FILE="${MODEL_DIR}/qwen3_prompt.txt"
-        cat > "${PROMPT_FILE}" << 'EOF'
-<|im_start|>user
-What is the capital of France?<|im_end|>
-<|im_start|>assistant
-EOF
-        RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/ --prompt_file ${PROMPT_FILE}"
-        ;;
     parakeet)
         RUNNER_ARGS="--model_path ${MODEL_DIR}/model.pte --audio_path ${MODEL_DIR}/$AUDIO_FILE --tokenizer_path ${MODEL_DIR}/$TOKENIZER_FILE"
         # For CUDA, add data_path argument (Metal embeds data in .pte)
         if [ "$DEVICE" = "cuda" ]; then
             RUNNER_ARGS="$RUNNER_ARGS --data_path ${MODEL_DIR}/aoti_cuda_blob.ptd"
         fi
         ;;
-    sortformer)
-        RUNNER_ARGS="--model_path ${MODEL_DIR}/model.pte --audio_path ${MODEL_DIR}/$AUDIO_FILE"
-        if [ "$DEVICE" = "cuda" ]; then
-            RUNNER_ARGS="$RUNNER_ARGS --data_path ${MODEL_DIR}/aoti_cuda_blob.ptd"
-        fi
-        ;;
-    dinov2)
-        RUNNER_ARGS="--model_path ${MODEL_DIR}/model.pte --image_path ${MODEL_DIR}/test_image.jpg"
-        if [ "$DEVICE" = "cuda" ]; then
-            RUNNER_ARGS="$RUNNER_ARGS --data_path ${MODEL_DIR}/aoti_cuda_blob.ptd"
-        fi
-        ;;
     voxtral_realtime)
         RUNNER_ARGS="--model_path ${MODEL_DIR}/model.pte --tokenizer_path ${MODEL_DIR}/$TOKENIZER_FILE --preprocessor_path ${MODEL_DIR}/$PREPROCESSOR --audio_path ${MODEL_DIR}/$AUDIO_FILE --temperature 0"
-        # Add CUDA data path if present
-        if [ "$DEVICE" = "cuda" ] && [ -f "${MODEL_DIR}/aoti_cuda_blob.ptd" ]; then
-            RUNNER_ARGS="$RUNNER_ARGS --data_path ${MODEL_DIR}/aoti_cuda_blob.ptd"
-        fi
         # Determine streaming mode based on MODE parameter
-        USE_STREAMING="true"
-        if [ "$MODE" = "vr-offline" ]; then
+        USE_STREAMING="false"
+        if [ "$MODE" = "vr-streaming" ]; then
+            USE_STREAMING="true"
+        elif [ "$MODE" = "vr-offline" ]; then
             USE_STREAMING="false"
+        elif [ -z "$MODE" ]; then
+            # Auto-detect: XNNPACK uses streaming, others use offline
+            if [ "$DEVICE" = "xnnpack" ]; then
+                USE_STREAMING="true"
+            fi
         fi
         # Add streaming flag if needed
         if [ "$USE_STREAMING" = "true" ]; then
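
Restating the new mode resolution as a standalone helper makes the defaults easy to eyeball; the function and invocations below are ours for illustration only, the script keeps the inline form above:

    resolve_streaming() {
        local mode="$1" device="$2" use_streaming="false"
        if [ "$mode" = "vr-streaming" ]; then
            use_streaming="true"
        elif [ "$mode" = "vr-offline" ]; then
            use_streaming="false"
        elif [ -z "$mode" ] && [ "$device" = "xnnpack" ]; then
            # auto-detect: only XNNPACK defaults to streaming
            use_streaming="true"
        fi
        echo "$use_streaming"
    }
    resolve_streaming "vr-streaming" "cuda"     # -> true   (explicit)
    resolve_streaming "vr-offline"   "xnnpack"  # -> false  (explicit)
    resolve_streaming ""             "xnnpack"  # -> true   (auto-detected)
    resolve_streaming ""             "metal"    # -> false  (auto-detected)

An explicit MODE always wins; auto-detection fires only when MODE is empty.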