The high-level entry point for loading and running BitNet models.
Load a BitNet model from a URL.
```ts
static async load(source: string | URL, options?: LoadOptions): Promise<BitNet>
```

| Parameter | Type | Description |
|---|---|---|
| `source` | `string \| URL` | URL to a GGUF or Safetensors file |
| `options` | `LoadOptions` | Loading options (see below) |

Returns: `Promise<BitNet>`
Generate text from a prompt. Yields tokens as they are generated.
```ts
async *generate(
  prompt: string | ChatMessage[],
  options?: GenerateOptions
): AsyncGenerator<string>
```

| Parameter | Type | Description |
|---|---|---|
| `prompt` | `string \| ChatMessage[]` | Plain text or chat messages |
| `options` | `GenerateOptions` | Generation options (see below) |

Returns: `AsyncGenerator<string>`
Run GPU diagnostics: a forward pass with per-stage tensor readback.
```ts
async diagnose(prompt?: string): Promise<DiagnosticResult[]>
```

| Parameter | Type | Default | Description |
|---|---|---|---|
| `prompt` | `string` | `"Hello"` | Input text to run through the model |

Returns: `Promise<DiagnosticResult[]>`
Release all GPU resources held by this instance. Must be called when the model is no longer needed.
```ts
dispose(): void
```

Initialize a WebGPU adapter and device with maximum limits for large model support.

```ts
async function initGPU(existingDevice?: GPUDevice): Promise<GPUContext>
```

| Parameter | Type | Description |
|---|---|---|
| `existingDevice` | `GPUDevice` | Optional existing device to reuse |

Returns: `Promise<GPUContext>`
Throws: GPUDeviceError if WebGPU is unavailable or adapter/device creation fails.
List all model URLs cached in IndexedDB.
```ts
async function listCachedModels(): Promise<string[]>
```

Returns: `Promise<string[]>` — Array of cached model URLs.
Delete a cached model from IndexedDB.
```ts
async function deleteCachedModel(url: string): Promise<void>
```

| Parameter | Type | Description |
|---|---|---|
| `url` | `string` | The model URL to remove from cache |
```ts
interface LoadOptions {
  device?: GPUDevice;
  format?: WeightFormat; // "gguf" | "safetensors"
  onProgress?: (progress: LoadProgress) => void;
  signal?: AbortSignal;
}
```

| Field | Type | Default | Description |
|---|---|---|---|
| `device` | `GPUDevice` | Auto-created | Existing GPU device to reuse (required for Node.js — see Getting Started) |
| `format` | `WeightFormat` | Auto-detected | Force weight format |
| `onProgress` | `(progress: LoadProgress) => void` | — | Progress callback |
| `signal` | `AbortSignal` | — | Abort signal to cancel loading |
```ts
interface GenerateOptions {
  maxTokens?: number;
  temperature?: number;
  topK?: number;
  repeatPenalty?: number;
  repeatLastN?: number;
  onToken?: (token: string) => void;
  signal?: AbortSignal;
}
```

| Field | Type | Default | Description |
|---|---|---|---|
| `maxTokens` | `number` | `256` | Maximum tokens to generate |
| `temperature` | `number` | `1.0` | Sampling temperature |
| `topK` | `number` | `50` | Top-K sampling (0 = disabled) |
| `repeatPenalty` | `number` | `1.0` | Repetition penalty (1.0 = disabled) |
| `repeatLastN` | `number` | `64` | Window size for repetition penalty |
| `onToken` | `(token: string) => void` | — | Callback fired for each token |
| `signal` | `AbortSignal` | — | Abort signal to cancel generation |
```ts
interface LoadProgress {
  phase: "download" | "parse" | "upload";
  loaded: number;
  total: number;
  fraction: number; // 0.0 – 1.0
}
```

```ts
interface ChatMessage {
  role: "system" | "user" | "assistant";
  content: string;
}
```

```ts
interface DiagnosticResult {
  name: string;
  length: number;
  min: number;
  max: number;
  mean: number;
  rms: number;
  nanCount: number;
  infCount: number;
  zeroCount: number;
  first8: number[];
}
```

```ts
interface ModelConfig {
  modelType: "bitnet";
  vocabSize: number;
  hiddenSize: number;
  intermediateSize: number;
  numHiddenLayers: number;
  numAttentionHeads: number;
  numKeyValueHeads: number;
  maxPositionEmbeddings: number;
  rmsNormEps: number;
  ropeTheta: number;
  tieWordEmbeddings: boolean;
  activation: "relu2" | "silu" | "swiglu";
}
```

```ts
interface GPUContext {
  device: GPUDevice;
  adapter: GPUAdapter | null;
  limits: GPUSupportedLimits;
}
```

Import these configs to inspect model parameters or pass to lower-level APIs.
Config for microsoft/bitnet-b1.58-2B-4T.
```ts
{
  modelType: "bitnet",
  vocabSize: 128256,
  hiddenSize: 2560,
  intermediateSize: 6912,
  numHiddenLayers: 30,
  numAttentionHeads: 20,
  numKeyValueHeads: 5,
  maxPositionEmbeddings: 4096,
  rmsNormEps: 1e-5,
  ropeTheta: 500000.0,
  tieWordEmbeddings: true,
  activation: "relu2",
}
```

Config for 1bitLLM/bitnet_b1_58-large (0.7B).
```ts
{
  modelType: "bitnet",
  vocabSize: 32002,
  hiddenSize: 2048,
  intermediateSize: 5632,
  numHiddenLayers: 24,
  numAttentionHeads: 32,
  numKeyValueHeads: 32,
  maxPositionEmbeddings: 2048,
  rmsNormEps: 1e-6,
  ropeTheta: 10000.0,
  tieWordEmbeddings: false,
  activation: "silu",
}
```

Thrown by initGPU() when WebGPU is unavailable or adapter/device creation fails.
```ts
class GPUDeviceError extends Error {
  name: "GPUDeviceError";
}
```

The following internal types are also exported for advanced usage:
| Export | Kind | Description |
|---|---|---|
| `WeightStore` | class | GPU buffer storage for model weights |
| `loadModel()` | function | Low-level model loader (GGUF/Safetensors) |
| `LoadResult` | type | Return type of `loadModel()` |
| `BitNetModel` | class | Low-level transformer model |
| `Tokenizer` | class | BPE tokenizer with chat template support |
| `WeightFormat` | type | `"gguf" \| "safetensors"` |
| `TokenizerConfig` | type | Tokenizer configuration |
| `WorkerRequest` | type | Web Worker message types |
| `WorkerResponse` | type | Web Worker response types |