/**
 * Drop the unstable tail of an incrementally-decoded UTF-8 string so that only
 * a "stable" prefix is emitted to streaming consumers.
 *
 * Two kinds of tail are unstable while tokens are still arriving:
 *  - trailing U+FFFD replacement characters (a multi-byte UTF-8 sequence was
 *    cut mid-token and may complete on the next decode), and
 *  - a single dangling UTF-16 high surrogate (the first half of an astral
 *    pair whose low half has not been decoded yet).
 *
 * @param {string} text - Decoder output so far; non-strings yield ''.
 * @returns {string} The longest stable prefix of `text`.
 */
function trimUnstableUtf8Tail(text) {
  if (typeof text !== 'string') {
    return '';
  }

  // Walk backwards past every trailing U+FFFD replacement character.
  let keep = text.length;
  for (; keep > 0; keep -= 1) {
    if (text.charCodeAt(keep - 1) !== 0xFFFD) {
      break;
    }
  }

  // A lone high surrogate (0xD800-0xDBFF) at the end is half of a split
  // pair; hold it back until its low half arrives.
  if (keep > 0) {
    const lastUnit = text.charCodeAt(keep - 1);
    if (lastUnit >= 0xD800 && lastUnit <= 0xDBFF) {
      keep -= 1;
    }
  }

  return keep < text.length ? text.slice(0, keep) : text;
}
piece : textEncoder.encode(piece); - if (shouldEmitCurrentText) { - streamed += piece; - options.onToken(piecePayload, streamed); - } else { - options.onToken(piecePayload, null); - } + const piecePayload = emitTokenText + ? deltaText + : textEncoder.encode(deltaText); + options.onToken(piecePayload, shouldEmitCurrentText ? fullText : null); } if (yieldInterval > 0 && (generated % yieldInterval) === 0) { @@ -3909,6 +3936,17 @@ class LlamaWebGpuBridgeRuntime { } const text = this._core.ccall('llamadart_webgpu_last_output', 'string', [], []) || streamed || ''; + if (typeof options.onToken === 'function') { + const tailText = text.startsWith(emittedStableText) + ? text.slice(emittedStableText.length) + : ''; + if (tailText.length > 0) { + const piecePayload = emitTokenText + ? tailText + : textEncoder.encode(tailText); + options.onToken(piecePayload, shouldEmitCurrentText ? text : null); + } + } return text; } finally { if (generationStarted) { @@ -4203,6 +4241,40 @@ export class LlamaWebGpuBridge { return sanitized; } + _createCpuSafeMultimodalLoadOptions(options = {}) { + const sanitized = this._sanitizeModelLoadOptions(options); + sanitized.nGpuLayers = 0; + + if (Number.isFinite(Number(sanitized.nCtx)) && Number(sanitized.nCtx) > 4096) { + sanitized.nCtx = 4096; + } + + if (!Number.isFinite(Number(sanitized.nThreads)) || Number(sanitized.nThreads) <= 0) { + sanitized.nThreads = 4; + } else { + sanitized.nThreads = Math.min(4, Math.max(1, Math.trunc(Number(sanitized.nThreads)))); + } + + sanitized.nThreadsBatch = sanitized.nThreads; + + if (!Number.isFinite(Number(sanitized.nBatch)) || Number(sanitized.nBatch) <= 0) { + sanitized.nBatch = 128; + } else { + sanitized.nBatch = Math.min(128, Math.max(32, Math.trunc(Number(sanitized.nBatch)))); + } + + if (!Number.isFinite(Number(sanitized.nUbatch)) || Number(sanitized.nUbatch) <= 0) { + sanitized.nUbatch = Math.min(64, sanitized.nBatch); + } else { + sanitized.nUbatch = Math.min( + sanitized.nBatch, + Math.min(64, 
Math.max(1, Math.trunc(Number(sanitized.nUbatch)))), + ); + } + + return sanitized; + } + _rememberLoadedModel(url, options = {}) { const normalizedUrl = String(url || '').trim(); if (normalizedUrl.length === 0) { @@ -4277,7 +4349,9 @@ export class LlamaWebGpuBridge { return false; } - const selectedOptions = this._sanitizeModelLoadOptions(this._loadedModelOptions || {}); + const selectedOptions = this._createCpuSafeMultimodalLoadOptions( + this._loadedModelOptions || {}, + ); const applyWorkerSafeMode = async () => { await this._callWorker('loadModelFromUrl', [this._loadedModelUrl, selectedOptions]); @@ -4385,8 +4459,11 @@ export class LlamaWebGpuBridge { this._hasMediaParts(options) && typeof this._loadedMmProjUrl === 'string' && this._loadedMmProjUrl.length > 0; + const forceCpuMultimodalFallback = + this._hasMediaParts(options) + && Number(this._loadedModelOptions?.nGpuLayers) !== 0; - if (Number(this._runtime?._modelBytes) > 0 && !forceReloadRequested) { + if (Number(this._runtime?._modelBytes) > 0 && !forceReloadRequested && !forceCpuMultimodalFallback) { if (shouldEnsureMultimodalInRuntime) { const runtimeSupportsMedia = (typeof this._runtime.supportsVision === 'function' && this._runtime.supportsVision()) @@ -4407,16 +4484,18 @@ export class LlamaWebGpuBridge { return; } - const loadOptions = this._sanitizeModelLoadOptions(this._loadedModelOptions || {}); + const loadOptions = forceCpuMultimodalFallback + ? 
this._createCpuSafeMultimodalLoadOptions(this._loadedModelOptions || {}) + : this._sanitizeModelLoadOptions(this._loadedModelOptions || {}); const workerTimedOut = this._isWorkerTimeoutError(fallbackError); const forcedCpuFallback = this._isForcedCpuMultimodalFallbackError(fallbackError); - const forceCpuMultimodalFallback = - this._hasMediaParts(options) + const shouldWarnCpuMultimodalFallback = + forceCpuMultimodalFallback && (this._isDispatchWorkgroupLimitError(fallbackError) - || forcedCpuFallback) - && Number(loadOptions.nGpuLayers) !== 0; + || forcedCpuFallback + || Number(this._loadedModelOptions?.nGpuLayers) !== 0); - if (forceCpuMultimodalFallback) { + if (shouldWarnCpuMultimodalFallback) { loadOptions.nGpuLayers = 0; if (Number.isFinite(loadOptions.nCtx) && Number(loadOptions.nCtx) > 4096) { loadOptions.nCtx = 4096; diff --git a/src/llama_webgpu_core.cpp b/src/llama_webgpu_core.cpp index da40304..ed2a6ce 100644 --- a/src/llama_webgpu_core.cpp +++ b/src/llama_webgpu_core.cpp @@ -563,6 +563,14 @@ std::string normalize_media_markers(const std::string & prompt, const size_t med replace_all_inplace(normalized, "<|image|>", marker); replace_all_inplace(normalized, "", marker); replace_all_inplace(normalized, "<|img|>", marker); + replace_all_inplace( + normalized, + "<|vision_start|><|image_pad|><|vision_end|>", + marker); + replace_all_inplace( + normalized, + "<|vision_start|><|video_pad|><|vision_end|>", + marker); replace_all_inplace(normalized, "