diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-decode-amd-mi300x-dp8-moe-ep8.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-decode-amd-mi300x-dp8-moe-ep8.helm.yaml
index 9f674a3..3d08c6b 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-decode-amd-mi300x-dp8-moe-ep8.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-decode-amd-mi300x-dp8-moe-ep8.helm.yaml
@@ -79,7 +79,7 @@ spec:
           --max-num-batched-token 16384
           --gpu-memory-utilization 0.92
           --block-size 16
-          --max-model-len 8192
+          --max-model-len -1
           --max-num-seqs 2048
       resources:
         requests:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-amd-mi250-tp2.helm.yaml
index 759f894..840a063 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-amd-mi250-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-amd-mi250-tp2.helm.yaml
@@ -27,7 +27,7 @@ spec:
         value: >-
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
       resources:
         requests:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-amd-mi300x-tp2.helm.yaml
index afcee93..7db8ce0 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-amd-mi300x-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-amd-mi300x-tp2.helm.yaml
@@ -27,7 +27,7 @@ spec:
         value: >-
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
       resources:
         requests:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-decode-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-decode-amd-mi250-tp2.helm.yaml
index 65783b2..669a43b 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-decode-amd-mi250-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-decode-amd-mi250-tp2.helm.yaml
@@ -27,7 +27,7 @@ spec:
         value: >-
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
           --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}'
       resources:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-decode-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-decode-amd-mi300x-tp2.helm.yaml
index 3b6ae37..20e31ed 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-decode-amd-mi300x-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-decode-amd-mi300x-tp2.helm.yaml
@@ -27,7 +27,7 @@ spec:
         value: >-
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
           --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}'
       resources:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-prefill-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-prefill-amd-mi250-tp2.helm.yaml
index 055701a..55caa50 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-prefill-amd-mi250-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-prefill-amd-mi250-tp2.helm.yaml
@@ -27,7 +27,7 @@ spec:
         value: >-
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
           --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}'
       resources:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-prefill-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-prefill-amd-mi300x-tp2.helm.yaml
index 786782b..2557408 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-prefill-amd-mi300x-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-distill-llama-8b-prefill-amd-mi300x-tp2.helm.yaml
@@ -27,7 +27,7 @@ spec:
         value: >-
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
           --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}'
       resources:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-prefill-amd-mi300x-dp8-moe-ep8.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-prefill-amd-mi300x-dp8-moe-ep8.helm.yaml
index 19f2074..a3a693e 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-prefill-amd-mi300x-dp8-moe-ep8.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-deepseek-ai-deepseek-r1-prefill-amd-mi300x-dp8-moe-ep8.helm.yaml
@@ -79,7 +79,7 @@ spec:
           --max-num-batched-token 64000
           --gpu-memory-utilization 0.92
           --block-size 16
-          --max-model-len 8192
+          --max-model-len -1
           --max-num-seqs 2048
       resources:
         requests:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-amd-mi250-tp2.helm.yaml
index 6ccc34f..89bc5dd 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-amd-mi250-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-amd-mi250-tp2.helm.yaml
@@ -27,7 +27,7 @@ spec:
         value: >-
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
       resources:
         requests:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-amd-mi300x-tp2.helm.yaml
index 7d0651d..7bc7680 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-amd-mi300x-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-amd-mi300x-tp2.helm.yaml
@@ -27,7 +27,7 @@ spec:
         value: >-
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
       resources:
         requests:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-decode-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-decode-amd-mi250-tp2.helm.yaml
index f49462b..b414f7f 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-decode-amd-mi250-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-decode-amd-mi250-tp2.helm.yaml
@@ -27,7 +27,7 @@ spec:
         value: >-
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
           --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}'
       resources:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-decode-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-decode-amd-mi300x-tp2.helm.yaml
index 2a2f2de..8376ed3 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-decode-amd-mi300x-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-decode-amd-mi300x-tp2.helm.yaml
@@ -27,7 +27,7 @@ spec:
         value: >-
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
           --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}'
       resources:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-prefill-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-prefill-amd-mi250-tp2.helm.yaml
index ce89c3b..9f16c0d 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-prefill-amd-mi250-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-prefill-amd-mi250-tp2.helm.yaml
@@ -27,7 +27,7 @@ spec:
         value: >-
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
           --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}'
       resources:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-prefill-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-prefill-amd-mi300x-tp2.helm.yaml
index e5041a4..de09c11 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-prefill-amd-mi300x-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-ibm-granite-granite-3.3-8b-instruct-prefill-amd-mi300x-tp2.helm.yaml
@@ -27,7 +27,7 @@ spec:
         value: >-
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
           --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}'
       resources:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-meta-llama-llama-3.2-1b-instruct-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-meta-llama-llama-3.2-1b-instruct-amd-mi250-tp2.helm.yaml
index d155aa7..96a9c5f 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-meta-llama-llama-3.2-1b-instruct-amd-mi250-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-meta-llama-llama-3.2-1b-instruct-amd-mi250-tp2.helm.yaml
@@ -27,7 +27,7 @@ spec:
         value: >-
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
       resources:
         requests:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-meta-llama-llama-3.2-1b-instruct-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-meta-llama-llama-3.2-1b-instruct-amd-mi300x-tp2.helm.yaml
index c643873..2cf58c0 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-meta-llama-llama-3.2-1b-instruct-amd-mi300x-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-meta-llama-llama-3.2-1b-instruct-amd-mi300x-tp2.helm.yaml
@@ -27,7 +27,7 @@ spec:
         value: >-
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
       resources:
         requests:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-meta-llama-llama-3.2-1b-instruct-decode-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-meta-llama-llama-3.2-1b-instruct-decode-amd-mi250-tp2.helm.yaml
index 7262909..2377be2 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-meta-llama-llama-3.2-1b-instruct-decode-amd-mi250-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-meta-llama-llama-3.2-1b-instruct-decode-amd-mi250-tp2.helm.yaml
@@ -27,7 +27,7 @@ spec:
         value: >-
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
           --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}'
       resources:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-meta-llama-llama-3.2-1b-instruct-decode-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-meta-llama-llama-3.2-1b-instruct-decode-amd-mi300x-tp2.helm.yaml
index 86d2743..59c6887 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-meta-llama-llama-3.2-1b-instruct-decode-amd-mi300x-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-meta-llama-llama-3.2-1b-instruct-decode-amd-mi300x-tp2.helm.yaml
@@ -27,7 +27,7 @@ spec:
         value: >-
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
           --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}'
       resources:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-meta-llama-llama-3.2-1b-instruct-prefill-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-meta-llama-llama-3.2-1b-instruct-prefill-amd-mi250-tp2.helm.yaml
index 221a931..40aa1ce 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-meta-llama-llama-3.2-1b-instruct-prefill-amd-mi250-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-meta-llama-llama-3.2-1b-instruct-prefill-amd-mi250-tp2.helm.yaml
@@ -27,7 +27,7 @@ spec:
         value: >-
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
           --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}'
       resources:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-meta-llama-llama-3.2-1b-instruct-prefill-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-meta-llama-llama-3.2-1b-instruct-prefill-amd-mi300x-tp2.helm.yaml
index c133099..ad3bb8f 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-meta-llama-llama-3.2-1b-instruct-prefill-amd-mi300x-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-meta-llama-llama-3.2-1b-instruct-prefill-amd-mi300x-tp2.helm.yaml
@@ -27,7 +27,7 @@ spec:
         value: >-
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
           --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}'
       resources:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-microsoft-phi-mini-moe-instruct-amd-mi250-dp2-moe-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-microsoft-phi-mini-moe-instruct-amd-mi250-dp2-moe-tp2.helm.yaml
index 122f2c9..ef4aa14 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-microsoft-phi-mini-moe-instruct-amd-mi250-dp2-moe-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-microsoft-phi-mini-moe-instruct-amd-mi250-dp2-moe-tp2.helm.yaml
@@ -28,7 +28,7 @@ spec:
           --trust-remote-code
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 4096
+          --max-model-len -1
           --max-num-batched-tokens 4096
       resources:
         requests:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-microsoft-phi-mini-moe-instruct-decode-amd-mi250-dp2-moe-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-microsoft-phi-mini-moe-instruct-decode-amd-mi250-dp2-moe-tp2.helm.yaml
index ee6005f..51c70fb 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-microsoft-phi-mini-moe-instruct-decode-amd-mi250-dp2-moe-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-microsoft-phi-mini-moe-instruct-decode-amd-mi250-dp2-moe-tp2.helm.yaml
@@ -28,7 +28,7 @@ spec:
           --trust-remote-code
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 4096
+          --max-model-len -1
           --max-num-batched-tokens 4096
           --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}'
       resources:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-microsoft-phi-mini-moe-instruct-prefill-amd-mi250-dp2-moe-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-microsoft-phi-mini-moe-instruct-prefill-amd-mi250-dp2-moe-tp2.helm.yaml
index 0d468de..ffd9c33 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-microsoft-phi-mini-moe-instruct-prefill-amd-mi250-dp2-moe-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-microsoft-phi-mini-moe-instruct-prefill-amd-mi250-dp2-moe-tp2.helm.yaml
@@ -28,7 +28,7 @@ spec:
           --trust-remote-code
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 4096
+          --max-model-len -1
           --max-num-batched-tokens 4096
           --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}'
       resources:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-amd-mi250-tp2.helm.yaml
index 279007e..1182e13 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-amd-mi250-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-amd-mi250-tp2.helm.yaml
@@ -27,7 +27,7 @@ spec:
         value: >-
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
       resources:
         requests:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-amd-mi300x-tp2.helm.yaml
index 462edc8..5bf02f1 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-amd-mi300x-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-amd-mi300x-tp2.helm.yaml
@@ -27,7 +27,7 @@ spec:
         value: >-
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
       resources:
         requests:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-decode-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-decode-amd-mi250-tp2.helm.yaml
index 0084756..0925a5c 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-decode-amd-mi250-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-decode-amd-mi250-tp2.helm.yaml
@@ -27,7 +27,7 @@ spec:
         value: >-
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
           --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}'
       resources:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-decode-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-decode-amd-mi300x-tp2.helm.yaml
index f775027..c7c7d4e 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-decode-amd-mi300x-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-decode-amd-mi300x-tp2.helm.yaml
@@ -27,7 +27,7 @@ spec:
         value: >-
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
           --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}'
       resources:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-prefill-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-prefill-amd-mi250-tp2.helm.yaml
index b361c33..d0f9f87 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-prefill-amd-mi250-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-prefill-amd-mi250-tp2.helm.yaml
@@ -27,7 +27,7 @@ spec:
         value: >-
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
           --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}'
       resources:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-prefill-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-prefill-amd-mi300x-tp2.helm.yaml
index e4374d7..2cb947d 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-prefill-amd-mi300x-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-mistralai-mistral-7b-instruct-v0.3-prefill-amd-mi300x-tp2.helm.yaml
@@ -27,7 +27,7 @@ spec:
         value: >-
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
           --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}'
       resources:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-openai-gpt-oss-20b-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-openai-gpt-oss-20b-amd-mi250-tp2.helm.yaml
index 95834bc..e4cfebd 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-openai-gpt-oss-20b-amd-mi250-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-openai-gpt-oss-20b-amd-mi250-tp2.helm.yaml
@@ -28,7 +28,7 @@ spec:
           --trust-remote-code
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
       resources:
         requests:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-openai-gpt-oss-20b-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-openai-gpt-oss-20b-amd-mi300x-tp2.helm.yaml
index bc588ee..fde2dc6 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-openai-gpt-oss-20b-amd-mi300x-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-openai-gpt-oss-20b-amd-mi300x-tp2.helm.yaml
@@ -28,7 +28,7 @@ spec:
           --trust-remote-code
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
       resources:
         requests:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-openai-gpt-oss-20b-decode-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-openai-gpt-oss-20b-decode-amd-mi250-tp2.helm.yaml
index e945d3e..068e04c 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-openai-gpt-oss-20b-decode-amd-mi250-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-openai-gpt-oss-20b-decode-amd-mi250-tp2.helm.yaml
@@ -28,7 +28,7 @@ spec:
           --trust-remote-code
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
           --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}'
       resources:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-openai-gpt-oss-20b-decode-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-openai-gpt-oss-20b-decode-amd-mi300x-tp2.helm.yaml
index c2ff0ab..41a5f46 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-openai-gpt-oss-20b-decode-amd-mi300x-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-openai-gpt-oss-20b-decode-amd-mi300x-tp2.helm.yaml
@@ -28,7 +28,7 @@ spec:
           --trust-remote-code
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
           --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}'
       resources:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-openai-gpt-oss-20b-prefill-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-openai-gpt-oss-20b-prefill-amd-mi250-tp2.helm.yaml
index f251c91..f84a2ab 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-openai-gpt-oss-20b-prefill-amd-mi250-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-openai-gpt-oss-20b-prefill-amd-mi250-tp2.helm.yaml
@@ -28,7 +28,7 @@ spec:
           --trust-remote-code
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
           --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}'
       resources:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-openai-gpt-oss-20b-prefill-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-openai-gpt-oss-20b-prefill-amd-mi300x-tp2.helm.yaml
index 53d52a5..c68ffb0 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-openai-gpt-oss-20b-prefill-amd-mi300x-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-openai-gpt-oss-20b-prefill-amd-mi300x-tp2.helm.yaml
@@ -28,7 +28,7 @@ spec:
           --trust-remote-code
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
           --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}'
       resources:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2-0.5b-instruct-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2-0.5b-instruct-amd-mi250-tp2.helm.yaml
index 21cce7a..11585d4 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2-0.5b-instruct-amd-mi250-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2-0.5b-instruct-amd-mi250-tp2.helm.yaml
@@ -27,7 +27,7 @@ spec:
         value: >-
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
       resources:
         requests:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2-0.5b-instruct-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2-0.5b-instruct-amd-mi300x-tp2.helm.yaml
index a43515c..af6a316 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2-0.5b-instruct-amd-mi300x-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2-0.5b-instruct-amd-mi300x-tp2.helm.yaml
@@ -27,7 +27,7 @@ spec:
         value: >-
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
       resources:
         requests:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2-0.5b-instruct-decode-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2-0.5b-instruct-decode-amd-mi250-tp2.helm.yaml
index 5d7ce03..8a363d5 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2-0.5b-instruct-decode-amd-mi250-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2-0.5b-instruct-decode-amd-mi250-tp2.helm.yaml
@@ -27,7 +27,7 @@ spec:
         value: >-
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
           --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}'
       resources:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2-0.5b-instruct-decode-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2-0.5b-instruct-decode-amd-mi300x-tp2.helm.yaml
index 7d7e07d..e66405d 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2-0.5b-instruct-decode-amd-mi300x-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2-0.5b-instruct-decode-amd-mi300x-tp2.helm.yaml
@@ -27,7 +27,7 @@ spec:
         value: >-
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
           --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}'
       resources:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2-0.5b-instruct-prefill-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2-0.5b-instruct-prefill-amd-mi250-tp2.helm.yaml
index eaf8adf..bdabd2c 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2-0.5b-instruct-prefill-amd-mi250-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2-0.5b-instruct-prefill-amd-mi250-tp2.helm.yaml
@@ -27,7 +27,7 @@ spec:
         value: >-
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
           --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}'
       resources:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2-0.5b-instruct-prefill-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2-0.5b-instruct-prefill-amd-mi300x-tp2.helm.yaml
index 7e9cb24..5e5bc7b 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2-0.5b-instruct-prefill-amd-mi300x-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2-0.5b-instruct-prefill-amd-mi300x-tp2.helm.yaml
@@ -27,7 +27,7 @@ spec:
         value: >-
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
           --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}'
       resources:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2.5-1.5b-instruct-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2.5-1.5b-instruct-amd-mi250-tp2.helm.yaml
index 5698d3a..10deb2e 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2.5-1.5b-instruct-amd-mi250-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2.5-1.5b-instruct-amd-mi250-tp2.helm.yaml
@@ -27,7 +27,7 @@ spec:
         value: >-
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
       resources:
         requests:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2.5-1.5b-instruct-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2.5-1.5b-instruct-amd-mi300x-tp2.helm.yaml
index 23e325e..25eda78 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2.5-1.5b-instruct-amd-mi300x-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2.5-1.5b-instruct-amd-mi300x-tp2.helm.yaml
@@ -27,7 +27,7 @@ spec:
         value: >-
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
       resources:
         requests:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2.5-1.5b-instruct-decode-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2.5-1.5b-instruct-decode-amd-mi250-tp2.helm.yaml
index 47ab826..62bfc4e 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2.5-1.5b-instruct-decode-amd-mi250-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2.5-1.5b-instruct-decode-amd-mi250-tp2.helm.yaml
@@ -27,7 +27,7 @@ spec:
         value: >-
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
           --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}'
       resources:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2.5-1.5b-instruct-decode-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2.5-1.5b-instruct-decode-amd-mi300x-tp2.helm.yaml
index 4667dec..2edad4d 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2.5-1.5b-instruct-decode-amd-mi300x-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2.5-1.5b-instruct-decode-amd-mi300x-tp2.helm.yaml
@@ -27,7 +27,7 @@ spec:
         value: >-
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
           --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}'
       resources:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2.5-1.5b-instruct-prefill-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2.5-1.5b-instruct-prefill-amd-mi250-tp2.helm.yaml
index a3d2bff..1e3d64d 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2.5-1.5b-instruct-prefill-amd-mi250-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2.5-1.5b-instruct-prefill-amd-mi250-tp2.helm.yaml
@@ -27,7 +27,7 @@ spec:
         value: >-
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
           --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}'
       resources:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2.5-1.5b-instruct-prefill-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2.5-1.5b-instruct-prefill-amd-mi300x-tp2.helm.yaml
index 8e7bd5b..419620a 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2.5-1.5b-instruct-prefill-amd-mi300x-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen2.5-1.5b-instruct-prefill-amd-mi300x-tp2.helm.yaml
@@ -27,7 +27,7 @@ spec:
         value: >-
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
           --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}'
       resources:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-1.7b-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-1.7b-amd-mi250-tp2.helm.yaml
index bbd2ac7..13fcfde 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-1.7b-amd-mi250-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-1.7b-amd-mi250-tp2.helm.yaml
@@ -27,7 +27,7 @@ spec:
         value: >-
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
       resources:
         requests:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-1.7b-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-1.7b-amd-mi300x-tp2.helm.yaml
index 70cd8a1..d965a2a 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-1.7b-amd-mi300x-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-1.7b-amd-mi300x-tp2.helm.yaml
@@ -27,7 +27,7 @@ spec:
         value: >-
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
       resources:
         requests:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-1.7b-decode-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-1.7b-decode-amd-mi250-tp2.helm.yaml
index c99cf3e..bc2ddfc 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-1.7b-decode-amd-mi250-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-1.7b-decode-amd-mi250-tp2.helm.yaml
@@ -27,7 +27,7 @@ spec:
         value: >-
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
           --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}'
       resources:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-1.7b-decode-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-1.7b-decode-amd-mi300x-tp2.helm.yaml
index 2bdd10e..519dc19 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-1.7b-decode-amd-mi300x-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-1.7b-decode-amd-mi300x-tp2.helm.yaml
@@ -27,7 +27,7 @@ spec:
         value: >-
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
           --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}'
       resources:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-1.7b-prefill-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-1.7b-prefill-amd-mi250-tp2.helm.yaml
index cfc69da..498e1f4 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-1.7b-prefill-amd-mi250-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-1.7b-prefill-amd-mi250-tp2.helm.yaml
@@ -27,7 +27,7 @@ spec:
         value: >-
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
           --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}'
       resources:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-1.7b-prefill-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-1.7b-prefill-amd-mi300x-tp2.helm.yaml
index 9629ec9..60ad176 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-1.7b-prefill-amd-mi300x-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-1.7b-prefill-amd-mi300x-tp2.helm.yaml
@@ -27,7 +27,7 @@ spec:
         value: >-
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
           --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}'
       resources:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-vl-8b-instruct-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-vl-8b-instruct-amd-mi250-tp2.helm.yaml
index 84991e5..cb90e3f 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-vl-8b-instruct-amd-mi250-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-vl-8b-instruct-amd-mi250-tp2.helm.yaml
@@ -27,7 +27,7 @@ spec:
         value: >-
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
       resources:
         requests:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-vl-8b-instruct-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-vl-8b-instruct-amd-mi300x-tp2.helm.yaml
index 33d39e1..29a7e78 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-vl-8b-instruct-amd-mi300x-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-vl-8b-instruct-amd-mi300x-tp2.helm.yaml
@@ -27,7 +27,7 @@ spec:
         value: >-
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
       resources:
         requests:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-vl-8b-instruct-decode-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-vl-8b-instruct-decode-amd-mi250-tp2.helm.yaml
index c399638..180088e 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-vl-8b-instruct-decode-amd-mi250-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-vl-8b-instruct-decode-amd-mi250-tp2.helm.yaml
@@ -27,7 +27,7 @@ spec:
         value: >-
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
           --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}'
       resources:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-vl-8b-instruct-decode-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-vl-8b-instruct-decode-amd-mi300x-tp2.helm.yaml
index 5284217..ce44068 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-vl-8b-instruct-decode-amd-mi300x-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-vl-8b-instruct-decode-amd-mi300x-tp2.helm.yaml
@@ -27,7 +27,7 @@ spec:
         value: >-
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
           --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_consumer"}'
       resources:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-vl-8b-instruct-prefill-amd-mi250-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-vl-8b-instruct-prefill-amd-mi250-tp2.helm.yaml
index 4d0c2c0..e579049 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-vl-8b-instruct-prefill-amd-mi250-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-vl-8b-instruct-prefill-amd-mi250-tp2.helm.yaml
@@ -27,7 +27,7 @@ spec:
         value: >-
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
           --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}'
       resources:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-vl-8b-instruct-prefill-amd-mi300x-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-vl-8b-instruct-prefill-amd-mi300x-tp2.helm.yaml
index cbdd48a..9698e32 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-vl-8b-instruct-prefill-amd-mi300x-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/quickstart/quickstart-vllm-qwen-qwen3-vl-8b-instruct-prefill-amd-mi300x-tp2.helm.yaml
@@ -27,7 +27,7 @@ spec:
         value: >-
           --disable-uvicorn-access-log
           --no-enable-log-requests
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-batched-tokens 8192
           --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_producer"}'
       resources:
diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/glm5/zai-org-glm-4.7-flash-nvidia-h100-sxm-1.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/glm5/zai-org-glm-4.7-flash-nvidia-h100-sxm-1.helm.yaml
index 5d52b4f..7f87744 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/vllm/glm5/zai-org-glm-4.7-flash-nvidia-h100-sxm-1.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/glm5/zai-org-glm-4.7-flash-nvidia-h100-sxm-1.helm.yaml
@@ -26,7 +26,7 @@ spec:
           --trust-remote-code
           --tool-call-parser glm47
           --reasoning-parser glm45
-          --max-model-len 32768
+          --max-model-len -1
           --max-num-seqs 32
           --gpu-memory-utilization 0.90
           --kv-cache-dtype auto
diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/glm5/zai-org-glm-4.7-flash-nvidia-h100-sxm-tp2-moe-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/glm5/zai-org-glm-4.7-flash-nvidia-h100-sxm-tp2-moe-tp2.helm.yaml
index 72b2f3c..8dac040 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/vllm/glm5/zai-org-glm-4.7-flash-nvidia-h100-sxm-tp2-moe-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/glm5/zai-org-glm-4.7-flash-nvidia-h100-sxm-tp2-moe-tp2.helm.yaml
@@ -28,7 +28,7 @@ spec:
           --trust-remote-code
           --tool-call-parser glm47
           --reasoning-parser glm45
-          --max-model-len 131072
+          --max-model-len -1
           --max-num-seqs 64
           --gpu-memory-utilization 0.90
           --kv-cache-dtype auto
diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/glm5/zai-org-glm-4.7-flash-nvidia-h100-sxm-tp4-moe-tp4.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/glm5/zai-org-glm-4.7-flash-nvidia-h100-sxm-tp4-moe-tp4.helm.yaml
index 657c84b..2779cbb 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/vllm/glm5/zai-org-glm-4.7-flash-nvidia-h100-sxm-tp4-moe-tp4.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/glm5/zai-org-glm-4.7-flash-nvidia-h100-sxm-tp4-moe-tp4.helm.yaml
@@ -28,7 +28,7 @@ spec:
           --trust-remote-code
           --tool-call-parser glm47
           --reasoning-parser glm45
-          --max-model-len 200000
+          --max-model-len -1
           --max-num-seqs 64
           --gpu-memory-utilization 0.90
           --kv-cache-dtype auto
diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/glm5/zai-org-glm-4.7-flash-nvidia-h200-sxm-1.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/glm5/zai-org-glm-4.7-flash-nvidia-h200-sxm-1.helm.yaml
index 0823255..b1982e0 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/vllm/glm5/zai-org-glm-4.7-flash-nvidia-h200-sxm-1.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/glm5/zai-org-glm-4.7-flash-nvidia-h200-sxm-1.helm.yaml
@@ -26,7 +26,7 @@ spec:
           --trust-remote-code
           --tool-call-parser glm47
           --reasoning-parser glm45
-          --max-model-len 131072
+          --max-model-len -1
           --max-num-seqs 64
           --gpu-memory-utilization 0.90
           --kv-cache-dtype auto
diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/glm5/zai-org-glm-4.7-flash-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/glm5/zai-org-glm-4.7-flash-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml
index 6980682..6e1feea 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/vllm/glm5/zai-org-glm-4.7-flash-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/glm5/zai-org-glm-4.7-flash-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml
@@ -28,7 +28,7 @@ spec:
           --trust-remote-code
           --tool-call-parser glm47
           --reasoning-parser glm45
-          --max-model-len 200000
+          --max-model-len -1
           --max-num-seqs 64
           --gpu-memory-utilization 0.90
           --kv-cache-dtype auto
diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/deepseek-ai-deepseek-r1-nvidia-h100-sxm-dp16-moe-ep16.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/deepseek-ai-deepseek-r1-nvidia-h100-sxm-dp16-moe-ep16.helm.yaml
index 9dd6a33..6db2261 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/deepseek-ai-deepseek-r1-nvidia-h100-sxm-dp16-moe-ep16.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/deepseek-ai-deepseek-r1-nvidia-h100-sxm-dp16-moe-ep16.helm.yaml
@@ -32,7 +32,7 @@ spec:
       - name: ISVC_EXTRA_ARGS
         value: >-
           --trust-remote-code
-          --max-model-len 32768
+          --max-model-len -1
           --max-num-seqs 128
           --max-num-batched-tokens 57344
           --gpu-memory-utilization 0.9
diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/deepseek-ai-deepseek-r1-nvidia-h200-sxm-dp16-moe-ep16.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/deepseek-ai-deepseek-r1-nvidia-h200-sxm-dp16-moe-ep16.helm.yaml
index 1d5c045..8a56665 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/deepseek-ai-deepseek-r1-nvidia-h200-sxm-dp16-moe-ep16.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/deepseek-ai-deepseek-r1-nvidia-h200-sxm-dp16-moe-ep16.helm.yaml
@@ -32,7 +32,7 @@ spec:
      - name: ISVC_EXTRA_ARGS
         value: >-
           --trust-remote-code
-          --max-model-len 65536
+          --max-model-len -1
           --max-num-seqs 256
           --max-num-batched-tokens 57344
           --gpu-memory-utilization 0.9
diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/deepseek-ai-deepseek-r1-nvidia-h200-sxm-dp8-moe-ep8.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/deepseek-ai-deepseek-r1-nvidia-h200-sxm-dp8-moe-ep8.helm.yaml
index cda2078..644aa23 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/deepseek-ai-deepseek-r1-nvidia-h200-sxm-dp8-moe-ep8.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/deepseek-ai-deepseek-r1-nvidia-h200-sxm-dp8-moe-ep8.helm.yaml
@@ -29,7 +29,7 @@ spec:
       - name: ISVC_EXTRA_ARGS
         value: >-
           --trust-remote-code
-          --max-model-len 16384
+          --max-model-len -1
           --max-num-seqs 112
           --max-num-batched-tokens 57344
           --gpu-memory-utilization 0.9
diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/deepseek-ai-deepseek-r1-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/deepseek-ai-deepseek-r1-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml
index c36d1fe..c377381 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/deepseek-ai-deepseek-r1-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/deepseek-ai-deepseek-r1-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml
@@ -27,7 +27,7 @@ spec:
       - name: ISVC_EXTRA_ARGS
         value: >-
           --trust-remote-code
-          --max-model-len 32768
+          --max-model-len -1
           --max-num-seqs 128
           --gpu-memory-utilization 0.9
           --kv-cache-dtype auto
diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/deepseek-ai-deepseek-r1-nvidia-h200-sxm-tp8-moe-tp8.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/deepseek-ai-deepseek-r1-nvidia-h200-sxm-tp8-moe-tp8.helm.yaml
index dc4d4a2..7e54c32 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/deepseek-ai-deepseek-r1-nvidia-h200-sxm-tp8-moe-tp8.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/deepseek-ai-deepseek-r1-nvidia-h200-sxm-tp8-moe-tp8.helm.yaml
@@ -26,7 +26,7 @@ spec:
       - name: ISVC_EXTRA_ARGS
         value: >-
           --trust-remote-code
-          --max-model-len 32768
+          --max-model-len -1
           --max-num-seqs 128
           --gpu-memory-utilization 0.9
           --kv-cache-dtype auto
diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/moonshotai-kimi-k2.5-nvidia-h100-sxm-tp8-moe-ep8.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/moonshotai-kimi-k2.5-nvidia-h100-sxm-tp8-moe-ep8.helm.yaml
index 46bc1dc..6bbfb3e 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/moonshotai-kimi-k2.5-nvidia-h100-sxm-tp8-moe-ep8.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/moonshotai-kimi-k2.5-nvidia-h100-sxm-tp8-moe-ep8.helm.yaml
@@ -32,7 +32,7 @@ spec:
           --tool-call-parser kimi_k2
           --reasoning-parser kimi_k2
           --enable-auto-tool-choice
-          --max-model-len 4096
+          --max-model-len -1
           --max-num-seqs 16
           --gpu-memory-utilization 0.95
           --kv-cache-dtype auto
diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/moonshotai-kimi-k2.5-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/moonshotai-kimi-k2.5-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml
index ee14209..7be19bb 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/moonshotai-kimi-k2.5-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/moonshotai-kimi-k2.5-nvidia-h200-sxm-tp8-moe-ep8.helm.yaml
@@ -32,7 +32,7 @@ spec:
           --tool-call-parser kimi_k2
           --reasoning-parser kimi_k2
           --enable-auto-tool-choice
-          --max-model-len 65536
+          --max-model-len -1
           --max-num-seqs 64
           --gpu-memory-utilization 0.90
           --kv-cache-dtype fp8
diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/moonshotai-kimi-k2.5-nvidia-h200-sxm-tp8-moe-tp8.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/moonshotai-kimi-k2.5-nvidia-h200-sxm-tp8-moe-tp8.helm.yaml
index d79d3d2..5c85655 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/moonshotai-kimi-k2.5-nvidia-h200-sxm-tp8-moe-tp8.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/moonshotai-kimi-k2.5-nvidia-h200-sxm-tp8-moe-tp8.helm.yaml
@@ -31,7 +31,7 @@ spec:
           --tool-call-parser kimi_k2
           --reasoning-parser kimi_k2
           --enable-auto-tool-choice
-          --max-model-len 65536
+          --max-model-len -1
           --max-num-seqs 64
           --gpu-memory-utilization 0.90
           --kv-cache-dtype fp8
diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/openai-gpt-oss-120b-nvidia-h100-sxm-tp2-moe-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/openai-gpt-oss-120b-nvidia-h100-sxm-tp2-moe-tp2.helm.yaml
index ae84db1..b6cd54d 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/openai-gpt-oss-120b-nvidia-h100-sxm-tp2-moe-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/openai-gpt-oss-120b-nvidia-h100-sxm-tp2-moe-tp2.helm.yaml
@@ -26,7 +26,7 @@ spec:
       - name: ISVC_EXTRA_ARGS
         value: >-
           --trust-remote-code
-          --max-model-len 131072
+          --max-model-len -1
           --max-num-seqs 64
           --gpu-memory-utilization 0.90
           --kv-cache-dtype fp8
diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/openai-gpt-oss-120b-nvidia-h100-sxm-tp4-moe-tp4.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/openai-gpt-oss-120b-nvidia-h100-sxm-tp4-moe-tp4.helm.yaml
index 4f932e9..a389475 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/openai-gpt-oss-120b-nvidia-h100-sxm-tp4-moe-tp4.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/openai-gpt-oss-120b-nvidia-h100-sxm-tp4-moe-tp4.helm.yaml
@@ -26,7 +26,7 @@ spec:
       - name: ISVC_EXTRA_ARGS
         value: >-
           --trust-remote-code
-          --max-model-len 131072
+          --max-model-len -1
           --max-num-seqs 128
           --gpu-memory-utilization 0.90
           --kv-cache-dtype fp8
diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/openai-gpt-oss-120b-nvidia-h100-sxm-tp8-moe-tp8.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/openai-gpt-oss-120b-nvidia-h100-sxm-tp8-moe-tp8.helm.yaml
index 4afd8b4..e2e2508 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/openai-gpt-oss-120b-nvidia-h100-sxm-tp8-moe-tp8.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/openai-gpt-oss-120b-nvidia-h100-sxm-tp8-moe-tp8.helm.yaml
@@ -26,7 +26,7 @@ spec:
      - name: ISVC_EXTRA_ARGS
         value: >-
           --trust-remote-code
-          --max-model-len 131072
+          --max-model-len -1
           --max-num-seqs 128
           --gpu-memory-utilization 0.90
           --kv-cache-dtype fp8
diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/openai-gpt-oss-120b-nvidia-h200-sxm-1.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/openai-gpt-oss-120b-nvidia-h200-sxm-1.helm.yaml
index 520172c..4662afd 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/openai-gpt-oss-120b-nvidia-h200-sxm-1.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/openai-gpt-oss-120b-nvidia-h200-sxm-1.helm.yaml
@@ -24,7 +24,7 @@ spec:
       - name: ISVC_EXTRA_ARGS
         value: >-
           --trust-remote-code
-          --max-model-len 131072
+          --max-model-len -1
           --max-num-seqs 64
           --gpu-memory-utilization 0.90
           --kv-cache-dtype fp8
diff --git a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/openai-gpt-oss-120b-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/openai-gpt-oss-120b-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml
index 7b5bc58..818e5bc 100644
--- a/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/openai-gpt-oss-120b-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml
+++ b/deploy/helm/moai-inference-preset/templates/presets/vllm/v0.15.1/openai-gpt-oss-120b-nvidia-h200-sxm-tp2-moe-tp2.helm.yaml
@@ -26,7 +26,7 @@ spec:
       - name: ISVC_EXTRA_ARGS
         value: >-
           --trust-remote-code
-          --max-model-len 131072
+          --max-model-len -1
           --max-num-seqs 128
           --gpu-memory-utilization 0.90
           --kv-cache-dtype fp8
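Note: every hunk above makes the same change, replacing a preset-specific --max-model-len value with -1. Assuming the vLLM launcher treats -1 like leaving the flag unset (i.e., derive the context window from the model's own config rather than pinning it in the preset), the rendered ISVC_EXTRA_ARGS block of one quickstart preset would look roughly like the sketch below; the env: wrapper and indentation are assumptions, and only the argument list comes from the diff.

    # Hypothetical rendered excerpt of one quickstart preset after this change.
    # Assumption: -1 defers the context length to the model config instead of hard-coding it here.
    env:
      - name: ISVC_EXTRA_ARGS
        value: >-
          --disable-uvicorn-access-log
          --no-enable-log-requests
          --max-model-len -1
          --max-num-batched-tokens 8192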