Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 16 additions & 7 deletions wasm/build.sh
Original file line number Diff line number Diff line change
@@ -1,17 +1,20 @@
#!/bin/bash
# Build quant.cpp WASM demo
# Build quant.cpp WASM demo (multi-threaded + SIMD)
# Requires: Emscripten SDK (emcc)
#
# Usage: cd wasm && bash build.sh
# Then: python3 -m http.server 8080
# Open: http://localhost:8080
#
# Multi-threading requires Cross-Origin-Isolation headers.
# coi-serviceworker.js injects them on GitHub Pages / static hosts.

set -e

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"

echo "=== Building quant.cpp WASM ==="
echo "=== Building quant.cpp WASM (pthreads + SIMD) ==="

# Check emcc
if ! command -v emcc &>/dev/null; then
Expand All @@ -23,13 +26,14 @@ fi

echo "emcc version: $(emcc --version | head -1)"

# Build
# Build with pthreads + SIMD128 + ASYNCIFY
emcc "$SCRIPT_DIR/quant_wasm.c" \
-I"$PROJECT_DIR" \
-o "$SCRIPT_DIR/quant.js" \
-O3 \
-msimd128 \
-flto \
-pthread \
-s WASM=1 \
-s ALLOW_MEMORY_GROWTH=1 \
-s MAXIMUM_MEMORY=4GB \
Expand All @@ -38,13 +42,15 @@ emcc "$SCRIPT_DIR/quant_wasm.c" \
-s EXPORTED_RUNTIME_METHODS='["UTF8ToString","allocateUTF8","FS"]' \
-s FORCE_FILESYSTEM=1 \
-s MODULARIZE=0 \
-s ENVIRONMENT=web \
-s ENVIRONMENT='web,worker' \
-s NO_EXIT_RUNTIME=1 \
-s ASSERTIONS=0 \
-s STACK_SIZE=1MB \
-s ASYNCIFY \
-s 'ASYNCIFY_IMPORTS=["emscripten_sleep"]' \
-s ASYNCIFY_STACK_SIZE=65536 \
-s PTHREAD_POOL_SIZE=4 \
-s PTHREAD_POOL_SIZE_STRICT=0 \
-lm \
-DNDEBUG \
-D__EMSCRIPTEN__ \
Expand All @@ -53,11 +59,14 @@ emcc "$SCRIPT_DIR/quant_wasm.c" \

echo ""
echo "=== Build complete ==="
echo "Files: quant.js ($(du -h "$SCRIPT_DIR/quant.js" | cut -f1)), quant.wasm ($(du -h "$SCRIPT_DIR/quant.wasm" | cut -f1))"
echo "Files:"
for f in quant.js quant.wasm quant.worker.js; do
[ -f "$SCRIPT_DIR/$f" ] && echo " $f ($(du -h "$SCRIPT_DIR/$f" | cut -f1))"
done
echo ""
echo "To serve locally:"
echo " cd $SCRIPT_DIR && python3 -m http.server 8080"
echo " Open http://localhost:8080"
echo ""
echo "For HTTPS (required for SharedArrayBuffer):"
echo " npx serve -s $SCRIPT_DIR --ssl-cert cert.pem --ssl-key key.pem"
echo "Note: Multi-threading requires Cross-Origin-Isolation."
echo "coi-serviceworker.js handles this automatically on GitHub Pages."
63 changes: 63 additions & 0 deletions wasm/coi-serviceworker.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
/*! coi-serviceworker v0.1.7 - Guido Zuidhof, licensed under MIT */
/*
* Service Worker that injects Cross-Origin-Opener-Policy and
* Cross-Origin-Embedder-Policy headers into all responses.
* This enables SharedArrayBuffer on hosts that don't support
* custom HTTP headers (e.g., GitHub Pages).
*
* Required for WASM pthreads (multi-threaded inference).
*/
/*
 * This one file runs in two different contexts:
 *   - as the service worker itself (no `window`), where it re-issues every
 *     fetch and adds the COOP/COEP headers to the response;
 *   - as a classic <script> in the page, where it registers itself as the
 *     service worker and reloads once so the page is served through it.
 */
if (typeof window === 'undefined') {
  // Service Worker scope.
  // Activate immediately and take control of already-open pages so the
  // header injection applies without waiting for old workers to go away.
  self.addEventListener("install", () => self.skipWaiting());
  self.addEventListener("activate", (e) => e.waitUntil(self.clients.claim()));

  self.addEventListener("fetch", (e) => {
    const request = e.request;

    // Leave "only-if-cached" requests that are not same-origin to the
    // browser: the Fetch standard only allows that cache mode together with
    // the "same-origin" request mode, so re-issuing such a request from here
    // could only fail.
    if (request.cache === "only-if-cached" && request.mode !== "same-origin") {
      return;
    }

    e.respondWith(
      fetch(request)
        .then((response) => {
          // Opaque responses (status 0) expose neither headers nor body to
          // script, so they are passed through untouched.
          if (response.status === 0) return response;

          // Copy the response and add the two headers that make the page
          // cross-origin isolated.
          const newHeaders = new Headers(response.headers);
          newHeaders.set("Cross-Origin-Embedder-Policy", "credentialless");
          newHeaders.set("Cross-Origin-Opener-Policy", "same-origin");

          return new Response(response.body, {
            status: response.status,
            statusText: response.statusText,
            headers: newHeaders,
          });
        })
        .catch((err) => {
          console.error("coi-serviceworker fetch error:", err);
          return new Response("Service Worker fetch error", { status: 500 });
        })
    );
  });
} else {
  // Window scope — register the service worker.
  (async () => {
    // Already isolated (real headers, or this worker is already active):
    // nothing to do.
    if (window.crossOriginIsolated) return;

    // Service workers are unavailable in insecure contexts and in some
    // browsers/modes; bail out instead of throwing on `undefined`.
    if (!("serviceWorker" in navigator)) {
      console.warn("coi-serviceworker: service workers are not available; " +
        "SharedArrayBuffer / multi-threading will stay disabled.");
      return;
    }

    // currentScript is only set while a classic script is executing
    // synchronously, so it must be read before the first `await`.
    const script = window.document.currentScript;
    if (!script || !script.src) {
      console.warn("coi-serviceworker: cannot determine own script URL; " +
        "not registering.");
      return;
    }

    let reg;
    try {
      reg = await navigator.serviceWorker.register(script.src);
    } catch (err) {
      console.error("coi-serviceworker registration failed:", err);
      return;
    }

    if (reg.active) {
      // Service worker installed but not controlling — reload to activate.
      // If it is already controlling and the page is still not isolated,
      // reloading again would not help, so do nothing.
      if (!navigator.serviceWorker.controller) {
        window.location.reload();
      }
      return;
    }

    // Wait for the service worker to activate, then reload.
    const sw = reg.installing || reg.waiting;
    if (!sw) return;
    sw.addEventListener("statechange", () => {
      if (sw.state === "activated") window.location.reload();
    });
  })();
}
2 changes: 2 additions & 0 deletions wasm/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
<meta http-equiv="Cross-Origin-Opener-Policy" content="same-origin">
<meta http-equiv="Cross-Origin-Embedder-Policy" content="require-corp">
<title>quant.cpp — LLM in Your Browser</title>
<!-- Service Worker for COOP/COEP headers — enables SharedArrayBuffer + pthreads on GitHub Pages -->
<script src="coi-serviceworker.js"></script>
<style>
* { margin: 0; padding: 0; box-sizing: border-box; }
body {
Expand Down
2 changes: 1 addition & 1 deletion wasm/quant.js

Large diffs are not rendered by default.

Binary file modified wasm/quant.wasm
Binary file not shown.
19 changes: 15 additions & 4 deletions wasm/quant_wasm.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,12 @@ static quant_ctx* g_ctx = NULL;
/* Fixed-size text buffer and its write offset.
 * NOTE(review): presumably accumulates generated output; the code that
 * fills it is outside this view — confirm. */
static char g_output[65536];
static int g_output_pos = 0;
/* NOTE(review): looks like a "generation in progress" flag — confirm
 * against the wasm_generate* functions. */
static int g_generating = 0;
/* Thread count passed as n_threads in the quant_config initializers below.
 * Starts at 1 and is overwritten in wasm_load_model() with the value
 * returned by js_get_hw_concurrency(). */
static int g_wasm_threads = 1;

/*
 * Ask the JS host how many worker threads to use.
 *
 * Reads navigator.hardwareConcurrency, substitutes 1 when the value is
 * missing or zero, and caps the result at 4.
 *
 * Returns: thread count in the range 1..4.
 */
EM_JS(int, js_get_hw_concurrency, (void), {
    var reported = navigator.hardwareConcurrency;
    if (!reported) {
        reported = 1;
    }
    return Math.min(reported, 4);
});

/* JS callback: called for each generated token */
EM_JS(void, js_on_token, (const char* text), {
Expand Down Expand Up @@ -86,11 +92,13 @@ int wasm_load_model(const char* path) {
return -1;
}

g_wasm_threads = js_get_hw_concurrency();

quant_config cfg = {
.temperature = 0.7f,
.top_p = 0.9f,
.max_tokens = 512,
.n_threads = 1, /* WASM: single thread for compatibility */
.n_threads = g_wasm_threads,
.kv_compress = 1, /* 4-bit KV compression */
};
g_ctx = quant_new(g_model, &cfg);
Expand All @@ -99,7 +107,10 @@ int wasm_load_model(const char* path) {
return -1;
}

js_on_status("Model loaded! Ready to chat.");
char status_msg[128];
snprintf(status_msg, sizeof(status_msg),
"Model loaded! Ready to chat. (%d threads)", g_wasm_threads);
js_on_status(status_msg);
return 0;
}

Expand All @@ -124,7 +135,7 @@ int wasm_generate_async(const char* prompt, float temperature, int max_tokens) {
.temperature = temperature,
.top_p = 0.9f,
.max_tokens = max_tokens > 0 ? max_tokens : 256,
.n_threads = 1,
.n_threads = g_wasm_threads,
.kv_compress = 1,
};

Expand Down Expand Up @@ -170,7 +181,7 @@ int wasm_generate(const char* prompt, float temperature, int max_tokens) {
.temperature = temperature,
.top_p = 0.9f,
.max_tokens = max_tokens > 0 ? max_tokens : 256,
.n_threads = 1,
.n_threads = g_wasm_threads,
.kv_compress = 1,
};

Expand Down
Loading