diff --git a/js/llama_webgpu_bridge.js b/js/llama_webgpu_bridge.js
index 55cf007..6dcd88c 100644
--- a/js/llama_webgpu_bridge.js
+++ b/js/llama_webgpu_bridge.js
@@ -549,6 +549,26 @@ function toUint8Array(value) {
return null;
}
+function trimUnstableUtf8Tail(text) { // Drop a possibly-incomplete decoded tail from streamed model text.
+ if (typeof text !== 'string' || text.length === 0) { // Non-strings and empty input normalize to ''.
+ return '';
+ }
+
+ let end = text.length; // Scan backwards from the end of the string.
+ while (end > 0 && text.charCodeAt(end - 1) === 0xFFFD) { // Strip trailing U+FFFD replacement chars (partial UTF-8 sequence at the stream boundary).
+ end -= 1;
+ }
+
+ if (end > 0) {
+ const tail = text.charCodeAt(end - 1);
+ if (tail >= 0xD800 && tail <= 0xDBFF) { // Lone high surrogate: its low half has not arrived yet, so hold it back.
+ end -= 1;
+ }
+ }
+
+ return end === text.length ? text : text.slice(0, end); // Return the original string unchanged when nothing was trimmed.
+}
+
function toFloat32Array(value) {
if (!value) {
return null;
@@ -3831,7 +3851,8 @@ class LlamaWebGpuBridgeRuntime {
const shouldYieldForResponsiveness =
!(typeof WorkerGlobalScope !== 'undefined' && globalThis instanceof WorkerGlobalScope);
const yieldInterval = shouldYieldForResponsiveness ? 4 : 0;
- let streamed = shouldEmitCurrentText ? '' : null;
+ let streamed = '';
+ let emittedStableText = '';
while (generated < nPredict) {
if (this._abortRequested || options.signal?.aborted) {
@@ -3888,19 +3909,25 @@ class LlamaWebGpuBridgeRuntime {
}
generated += 1;
- const piece = this._core.ccall('llamadart_webgpu_last_piece', 'string', [], []) || '';
- if (piece.length === 0) {
+ const fullText = this._core.ccall('llamadart_webgpu_last_output', 'string', [], []) || '';
+ streamed = fullText;
+ const stableText = trimUnstableUtf8Tail(fullText);
+
+ if (!stableText.startsWith(emittedStableText)) {
+ emittedStableText = '';
+ }
+
+ const deltaText = stableText.slice(emittedStableText.length);
+ if (deltaText.length === 0) {
continue;
}
+ emittedStableText = stableText;
if (typeof options.onToken === 'function') {
- const piecePayload = emitTokenText ? piece : textEncoder.encode(piece);
- if (shouldEmitCurrentText) {
- streamed += piece;
- options.onToken(piecePayload, streamed);
- } else {
- options.onToken(piecePayload, null);
- }
+ const piecePayload = emitTokenText
+ ? deltaText
+ : textEncoder.encode(deltaText);
+ options.onToken(piecePayload, shouldEmitCurrentText ? fullText : null);
}
if (yieldInterval > 0 && (generated % yieldInterval) === 0) {
@@ -3909,6 +3936,17 @@ class LlamaWebGpuBridgeRuntime {
}
const text = this._core.ccall('llamadart_webgpu_last_output', 'string', [], []) || streamed || '';
+ if (typeof options.onToken === 'function') {
+ const tailText = text.startsWith(emittedStableText)
+ ? text.slice(emittedStableText.length)
+ : '';
+ if (tailText.length > 0) {
+ const piecePayload = emitTokenText
+ ? tailText
+ : textEncoder.encode(tailText);
+ options.onToken(piecePayload, shouldEmitCurrentText ? text : null);
+ }
+ }
return text;
} finally {
if (generationStarted) {
@@ -4203,6 +4241,40 @@ export class LlamaWebGpuBridge {
return sanitized;
}
+ _createCpuSafeMultimodalLoadOptions(options = {}) { // Sanitize load options, then force them into a CPU-safe shape for multimodal loads.
+ const sanitized = this._sanitizeModelLoadOptions(options);
+ sanitized.nGpuLayers = 0; // CPU-only: keep every layer off the GPU.
+
+ if (Number.isFinite(Number(sanitized.nCtx)) && Number(sanitized.nCtx) > 4096) { // Cap context length at 4096.
+ sanitized.nCtx = 4096;
+ }
+
+ if (!Number.isFinite(Number(sanitized.nThreads)) || Number(sanitized.nThreads) <= 0) {
+ sanitized.nThreads = 4; // Default thread count when unset or invalid.
+ } else {
+ sanitized.nThreads = Math.min(4, Math.max(1, Math.trunc(Number(sanitized.nThreads)))); // Clamp to the integer range 1..4.
+ }
+
+ sanitized.nThreadsBatch = sanitized.nThreads; // Batch threads mirror the generation thread count.
+
+ if (!Number.isFinite(Number(sanitized.nBatch)) || Number(sanitized.nBatch) <= 0) {
+ sanitized.nBatch = 128; // Default batch size when unset or invalid.
+ } else {
+ sanitized.nBatch = Math.min(128, Math.max(32, Math.trunc(Number(sanitized.nBatch)))); // Clamp to the integer range 32..128.
+ }
+
+ if (!Number.isFinite(Number(sanitized.nUbatch)) || Number(sanitized.nUbatch) <= 0) {
+ sanitized.nUbatch = Math.min(64, sanitized.nBatch); // Default micro-batch: at most 64 and never above nBatch.
+ } else {
+ sanitized.nUbatch = Math.min(
+ sanitized.nBatch,
+ Math.min(64, Math.max(1, Math.trunc(Number(sanitized.nUbatch)))), // Clamp to 1..64, then cap at nBatch.
+ );
+ }
+
+ return sanitized;
+ }
+
_rememberLoadedModel(url, options = {}) {
const normalizedUrl = String(url || '').trim();
if (normalizedUrl.length === 0) {
@@ -4277,7 +4349,9 @@ export class LlamaWebGpuBridge {
return false;
}
- const selectedOptions = this._sanitizeModelLoadOptions(this._loadedModelOptions || {});
+ const selectedOptions = this._createCpuSafeMultimodalLoadOptions(
+ this._loadedModelOptions || {},
+ );
const applyWorkerSafeMode = async () => {
await this._callWorker('loadModelFromUrl', [this._loadedModelUrl, selectedOptions]);
@@ -4385,8 +4459,11 @@ export class LlamaWebGpuBridge {
this._hasMediaParts(options)
&& typeof this._loadedMmProjUrl === 'string'
&& this._loadedMmProjUrl.length > 0;
+ const forceCpuMultimodalFallback =
+ this._hasMediaParts(options)
+ && Number(this._loadedModelOptions?.nGpuLayers) !== 0;
- if (Number(this._runtime?._modelBytes) > 0 && !forceReloadRequested) {
+ if (Number(this._runtime?._modelBytes) > 0 && !forceReloadRequested && !forceCpuMultimodalFallback) {
if (shouldEnsureMultimodalInRuntime) {
const runtimeSupportsMedia =
(typeof this._runtime.supportsVision === 'function' && this._runtime.supportsVision())
@@ -4407,16 +4484,18 @@ export class LlamaWebGpuBridge {
return;
}
- const loadOptions = this._sanitizeModelLoadOptions(this._loadedModelOptions || {});
+ const loadOptions = forceCpuMultimodalFallback
+ ? this._createCpuSafeMultimodalLoadOptions(this._loadedModelOptions || {})
+ : this._sanitizeModelLoadOptions(this._loadedModelOptions || {});
const workerTimedOut = this._isWorkerTimeoutError(fallbackError);
const forcedCpuFallback = this._isForcedCpuMultimodalFallbackError(fallbackError);
- const forceCpuMultimodalFallback =
- this._hasMediaParts(options)
+ const shouldWarnCpuMultimodalFallback =
+ forceCpuMultimodalFallback
&& (this._isDispatchWorkgroupLimitError(fallbackError)
- || forcedCpuFallback)
- && Number(loadOptions.nGpuLayers) !== 0;
+ || forcedCpuFallback
+ || Number(this._loadedModelOptions?.nGpuLayers) !== 0);
- if (forceCpuMultimodalFallback) {
+ if (shouldWarnCpuMultimodalFallback) {
loadOptions.nGpuLayers = 0;
if (Number.isFinite(loadOptions.nCtx) && Number(loadOptions.nCtx) > 4096) {
loadOptions.nCtx = 4096;
diff --git a/src/llama_webgpu_core.cpp b/src/llama_webgpu_core.cpp
index da40304..ed2a6ce 100644
--- a/src/llama_webgpu_core.cpp
+++ b/src/llama_webgpu_core.cpp
@@ -563,6 +563,14 @@ std::string normalize_media_markers(const std::string & prompt, const size_t med
replace_all_inplace(normalized, "<|image|>", marker);
replace_all_inplace(normalized, "
", marker);
replace_all_inplace(normalized, "<|img|>", marker);
+ replace_all_inplace(
+ normalized,
+ "<|vision_start|><|image_pad|><|vision_end|>",
+ marker);
+ replace_all_inplace(
+ normalized,
+ "<|vision_start|><|video_pad|><|vision_end|>",
+ marker);
replace_all_inplace(normalized, "