OpenBitSys
diff --git a/‎evaluation/ablation/script/test_bitblas.sh‎
Lines changed: 1 addition & 0 deletions b/‎evaluation/ablation/script/test_bitblas.sh‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎evaluation/ablation/test_bitblas.py‎
Lines changed: 66 additions & 0 deletions b/‎evaluation/ablation/test_bitblas.py‎
Lines changed: 66 additions & 0 deletions
diff --git a/‎evaluation/ablation/test_marlin.py‎
Lines changed: 183 additions & 0 deletions b/‎evaluation/ablation/test_marlin.py‎
Lines changed: 183 additions & 0 deletions
diff --git a/‎evaluation/bench_throughput.py‎
Lines changed: 140 additions & 0 deletions b/‎evaluation/bench_throughput.py‎
Lines changed: 140 additions & 0 deletions
@@ -0,0 +1 @@
+CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=0 python test_bitblas.py
@@ -0,0 +1,66 @@
+import bitblas
+import torch
+import time
+import numpy as np
+
+# uncomment to enable debug output
+# bitblas.set_log_level("Debug")
+
+# Prefill
+n_heads = 1
+seq_len = 128
+dim = 128
+matmul_config = bitblas.MatmulConfig(
+    M=1,  # M dimension
+    N=n_heads*seq_len,  # N dimension
+    K=dim,  # K dimension
+    A_dtype="float16",  # activation A dtype
+    W_dtype="int4",  # weight W dtype
+    accum_dtype="float16",  # accumulation dtype
+    out_dtype="float16",  # output dtype
+    layout="nt",  # matrix layout, "nt" indicates the layout of A is non-transpose and the layout of W is transpose
+    with_bias=False,  # bias
+    # configs for weight only quantization
+    group_size=None,  # setting for grouped quantization
+    with_scaling=False,  # setting for scaling factor
+    with_zeros=False,  # setting for zeros
+    zeros_mode=None,  # setting for how to calculating zeros
+)
+
+matmul = bitblas.Matmul(config=matmul_config)
+
+# Create input matrices
+# input_tensor = torch.rand((1, dim), dtype=torch.float16).cuda()
+weight_tensor = torch.randint(0, 7, (n_heads*seq_len, dim), dtype=torch.int8).cuda()
+
+# Warmup runs
+print("\nWarming up...")
+for _ in range(5):
+    _ = matmul.transform_weight(weight_tensor)
+    torch.cuda.synchronize()
+
+# Timing runs
+num_runs = 10
+times = []
+
+print(f"\nRunning {num_runs} timing iterations...")
+
+for i in range(num_runs):
+    torch.cuda.synchronize()
+    start_time = time.perf_counter()
+    
+    weight_tensor_int4 = matmul.transform_weight(weight_tensor)
+    
+    torch.cuda.synchronize()
+    end_time = time.perf_counter()
+    
+    elapsed_time = (end_time - start_time) * 1000  # Convert to milliseconds
+    times.append(elapsed_time)
+    
+    if (i + 1) % 20 == 0:
+        print(f"  Completed {i + 1}/{num_runs} runs")
+
+times = np.array(times)
+mean_time = np.mean(times)
+
+print(f"Mean time: {mean_time} ms")
@@ -0,0 +1,183 @@
+import torch
+import torch.nn as nn
+import numpy as np
+import time
+
+# Define the missing constants and functions for Marlin Layer
+# These would normally come from marlin-specific modules
+# NOTE: the permutations below are random stand-ins drawn at import time, i.e.
+# before the `torch.manual_seed(42)` call in the `__main__` guard, so they
+# differ from run to run. They are only used by `Layer.pack` to reorder data.
+_perm = torch.randperm(128)  # Placeholder permutation (applied to the tiled weights in `pack`)
+_scale_perm = torch.randperm(4)  # Placeholder scale permutation (grouped case in `pack`)
+_scale_perm_single = torch.randperm(2)  # Placeholder single scale permutation (non-grouped case in `pack`)
+
+def mul(A, B, C, s, workspace):
+    """Placeholder implementation of marlin mul function"""
+    # This is a simplified version - actual implementation would use CUDA kernels
+    A_flat = A.view(-1, A.shape[-1])
+    C_flat = C.view(-1, C.shape[-1])
+    
+    # Simulated quantized matrix multiplication
+    # In real implementation, this would dequantize B using s and perform actual GEMM
+    result = torch.matmul(A_flat.half(), torch.randn(A.shape[-1], C.shape[-1], device=A.device, dtype=torch.half))
+    C_flat.copy_(result)
+
+class Layer(nn.Module):
+    """PyTorch compatible Marlin layer; 4-bit (symmetric grouped) linear layer without bias.
+
+    Holds the packed weights `B`, the scales `s` and a scratch `workspace`
+    as buffers; `forward` hands them to the module-level `mul` function.
+    """
+
+    def __init__(self, infeatures, outfeatures, groupsize=-1):
+        """Create an empty Marlin layer.
+        @infeatures: number of input features (must be divisible by 128)
+        @outfeatures: number of output features (must be divisible by 256)
+        @groupsize: quantization groupsize (must be -1 or 128)
+
+        Raises `ValueError` when any of the constraints above is violated.
+        """
+        super().__init__()
+        if groupsize not in [-1, 128]:
+            raise ValueError('Only groupsize -1 and 128 are supported.')
+        if infeatures % 128 != 0 or outfeatures % 256 != 0:
+            raise ValueError('`infeatures` must be divisible by 128 and `outfeatures` by 256.')
+        # A groupsize of -1 means a single group spanning all input features.
+        if groupsize == -1:
+            groupsize = infeatures
+        if infeatures % groupsize != 0:
+            raise ValueError('`infeatures` must be divisible by `groupsize`.')
+        self.k = infeatures
+        self.n = outfeatures
+        self.groupsize = groupsize
+        # Packed weights: eight 4-bit values per 32-bit entry, laid out in 16x16 tiles (see `pack`).
+        self.register_buffer('B', torch.empty((self.k // 16, self.n * 16 // 8), dtype=torch.int))
+        # One row of scales per quantization group.
+        self.register_buffer('s', torch.empty((self.k // groupsize, self.n), dtype=torch.half))
+        # 128 is currently the minimum `tile_n`, hence it gives the maximum workspace size; 16 is the default `max_par`
+        self.register_buffer('workspace', torch.zeros(self.n // 128 * 16, dtype=torch.int), persistent=False)
+
+    def forward(self, A):
+        """Return a new tensor of shape `A.shape[:-1] + (outfeatures,)` filled by `mul`."""
+        C = torch.empty(A.shape[:-1] + (self.s.shape[1],), dtype=A.dtype, device=A.device)
+        # `mul` writes its result into the 2-D view of `C`, which aliases `C` itself.
+        mul(A.view((-1, A.shape[-1])), self.B, C.view((-1, C.shape[-1])), self.s, self.workspace)
+        return C
+
+    def pack(self, linear, scales):
+        """Pack a fake-quantized linear layer into this actual Marlin representation.
+        @linear: fake-quantized `torch.nn.Linear` layer to convert (must be of type `torch.half`);
+                 its weight is transposed below and must then match `(infeatures, outfeatures)`
+        @scales: corresponding quantization scales of shape `(outfeatures, groups)`; they are
+                 transposed below and finally reshaped to rows of length `outfeatures`
+
+        Raises `ValueError` if the weights are not `torch.half`. The results are
+        written into the `B` and `s` buffers; nothing is returned.
+        """
+        if linear.weight.dtype != torch.half:
+            raise ValueError('Only `torch.half` weights are supported.')
+        tile = 16
+        maxq = 2 ** 4 - 1  # largest unsigned 4-bit value
+        s = scales.t()
+        w = linear.weight.data.t()
+        if self.groupsize != self.k:
+            # Grouped case: bring the in-group index to the front so that each
+            # column of `w` lines up with exactly one entry of the flattened scales.
+            w = w.reshape((-1, self.groupsize, self.n))
+            w = w.permute(1, 0, 2)
+            w = w.reshape((self.groupsize, -1))
+            s = s.reshape((1, -1))
+        # Quantize: divide by the scale, round, shift by half the range and clamp to [0, maxq].
+        w = torch.round(w / s).int()
+        w += (maxq + 1) // 2
+        w = torch.clamp(w, 0, maxq)
+        if self.groupsize != self.k:
+            # Undo the grouping reshape so `w` is `(k, n)` again.
+            w = w.reshape((self.groupsize, -1, self.n))
+            w = w.permute(1, 0, 2)
+            w = w.reshape((self.k, self.n)).contiguous()
+            s = s.reshape((-1, len(_scale_perm)))[:, _scale_perm]
+        else:
+            s = s.reshape((-1, len(_scale_perm_single)))[:, _scale_perm_single]
+        s = s.reshape((-1, self.n)).contiguous()
+        # Regroup the weights into `tile` x `tile` blocks, one row of blocks per output row.
+        w = w.reshape((self.k // tile, tile, self.n // tile, tile))
+        w = w.permute((0, 2, 1, 3))
+        w = w.reshape((self.k // tile, self.n * tile))
+        res = w
+        # Apply the module-level permutation to every chunk of `_perm.numel()` values.
+        res = res.reshape((-1, _perm.numel()))[:, _perm].reshape(res.shape)
+        q = np.zeros((res.shape[0], res.shape[1] // 8), dtype=np.uint32)
+        res = res.cpu().numpy().astype(np.uint32)
+        # Pack eight 4-bit values into each 32-bit word; value i lands in bits [4*i, 4*i + 4).
+        for i in range(8):
+            q |= res[:, i::8] << 4 * i
+        q = torch.from_numpy(q.astype(np.int32)).to(w.device)
+        self.B[:, :] = q.to(self.B.device)
+        self.s[:, :] = s.to(self.s.device)
+
+
+def test_marlin_pack_latency():
+    """Test the Marlin layer pack function latency"""
+    print("Testing Marlin Layer pack function with weight dimensions (1024, 128) and group_size=128")
+    
+    # Based on user requirements: weight (1024, 128) means out_features=1024, in_features=128
+    # After transpose in pack method: (128, 1024) -> infeatures=128, outfeatures=1024
+    infeatures = 128
+    outfeatures = 1024
+    groupsize = 128
+    
+    # Validate constraints
+    print(f"infeatures: {infeatures}, outfeatures: {outfeatures}, groupsize: {groupsize}")
+    print(f"infeatures % 128 = {infeatures % 128}")
+    print(f"outfeatures % 256 = {outfeatures % 256}")
+    print(f"infeatures % groupsize = {infeatures % groupsize}")
+    
+    # Create Marlin layer
+    marlin_layer = Layer(infeatures=infeatures, outfeatures=outfeatures, groupsize=groupsize)
+    
+    # Create a fake-quantized linear layer to pack
+    linear = nn.Linear(in_features=outfeatures, out_features=infeatures, bias=False)
+    linear.weight.data = torch.randn(infeatures, outfeatures, dtype=torch.half)
+    
+    # Create random scales with proper shape
+    # scales shape should be (infeatures, groups) = (128, 1) since groupsize=128=infeatures
+    num_groups = infeatures // groupsize
+    scales = torch.randn(infeatures, num_groups, dtype=torch.half) * 0.1 + 1.0  # scales around 1.0
+    
+    print(f"Linear layer weight shape: {linear.weight.shape}")
+    print(f"Scales shape: {scales.shape}")
+    
+    # Move to GPU if available
+    if torch.cuda.is_available():
+        marlin_layer = marlin_layer.cuda()
+        linear = linear.cuda()
+        scales = scales.cuda()
+        print("Using GPU for testing")
+    else:
+        print("Using CPU for testing")
+    
+    # Test pack function latency
+    print("\nTesting pack function latency...")
+    
+    # Warm up
+    print("Warming up...")
+    for _ in range(5):
+        marlin_layer.pack(linear, scales)
+    
+    # Measure latency
+    num_runs = 100
+    print(f"Running {num_runs} iterations...")
+    
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+    
+    start_time = time.time()
+    
+    for _ in range(num_runs):
+        marlin_layer.pack(linear, scales)
+    
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+    
+    end_time = time.time()
+    
+    avg_latency = (end_time - start_time) / num_runs * 1000  # Convert to milliseconds
+    total_time = (end_time - start_time) * 1000  # Convert to milliseconds
+    
+    print(f"\nResults:")
+    print(f"Average pack function latency: {avg_latency:.4f} ms")
+    print(f"Total time for {num_runs} runs: {total_time:.2f} ms")
+    print(f"Throughput: {num_runs / (total_time / 1000):.2f} packs/sec")
+
+
+if __name__ == "__main__":
+    # Set device
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    print(f"Using device: {device}")
+    
+    # Set random seed for reproducibility
+    torch.manual_seed(42)
+    np.random.seed(42)
+    
+    try:
+        test_marlin_pack_latency()
+    except Exception as e:
+        print(f"Error during testing: {e}")
+        import traceback
+        traceback.print_exc()
@@ -0,0 +1,140 @@
+import argparse
+import dataclasses
+import time
+import numpy as np
+import torch
+from tqdm.auto import tqdm
+from llama import LlamaForCausalLM
+from transformers import LlamaConfig, AutoTokenizer
+
+@dataclasses.dataclass
+class ModelConfig:
+  model_path: str
+  dtype: str = dataclasses.field(default="float16")
+#   device: str = dataclasses.field(default="cuda:0")
+
+
+def load_model(args):
+    # device = torch.device(args.device)
+    dtype = getattr(torch, args.dtype)
+    torch.set_default_dtype(dtype)
+
+    config = LlamaConfig.from_pretrained(args.model_path)
+    config.attn_backend = args.attn_backend
+    config.num_bits = args.num_bits
+    config.quant_mode = args.quant_mode
+    config.group_size = args.group_size
+    config.residual_block_size = 128 if args.num_bits == 4 else 256
+
+    model = LlamaForCausalLM.from_pretrained(
+        args.model_path,
+        config=config,
+        device_map="auto",
+        torch_dtype=dtype
+    )
+    return model
+
+@torch.inference_mode()
+def benchmark_throughput():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model_path", default="llama3-8b-instruct")
+    parser.add_argument("--batch_size", type=int, default=1)
+    parser.add_argument("--context_len", type=int, default=2*1024)
+    parser.add_argument("--decode_len", type=int, default=256)
+    parser.add_argument("--iteration", type=int, default=10)
+    parser.add_argument("--device", type=str, default="cuda:0")
+    parser.add_argument("--dtype", type=str, default="float16")
+    parser.add_argument("--attn_backend", type=str, default="flash_attention_2")
+    parser.add_argument("--num_bits", type=int, default=4)
+    parser.add_argument("--quant_mode", type=str, default="k-channel")
+    parser.add_argument("--group_size", type=int, default=128)
+    
+    args = parser.parse_args()
+
+    model = load_model(args)
+
+    context_len = args.context_len
+    decode_len = args.decode_len
+    batch_size = args.batch_size
+    
+    dtype = getattr(torch, args.dtype)
+    device = torch.device(args.device)
+    hidden_size = model.config.hidden_size
+
+    prefill_latency = []
+    decode_latency = []
+
+    for iter_idx in tqdm(range(args.iteration)):
+        # clear cuda cache
+        torch.cuda.empty_cache()
+        torch.cuda.reset_peak_memory_stats(device)
+
+        # Prefill Stage
+        ts = time.perf_counter()
+        hidden_states = torch.randn(batch_size, context_len, hidden_size, dtype=dtype, device=device)
+        out = model(
+            inputs_embeds=hidden_states,
+            use_cache=True
+        )
+        torch.cuda.synchronize()
+        te = time.perf_counter()
+        prefill_latency.append(te - ts)
+        
+        # Memory stats after prefill
+        if iter_idx == 0:
+            print(f"GPU Memory Allocated: {torch.cuda.memory_allocated(device) / 1e6:.2f} MB")
+            print(f"Peak GPU Memory: {torch.cuda.max_memory_allocated(device) / 1e6:.2f} MB")
+
+        # Warm up for decode
+        for _ in range(5):
+            hidden_states = torch.randn(batch_size, 1, hidden_size, dtype=dtype, device=device)
+            model(
+                inputs_embeds=hidden_states,
+                past_key_values=out.past_key_values,
+                use_cache=True,
+            )
+
+        # Decode Stage - measure total time for all tokens
+        ts_decode_total = time.perf_counter()
+        for _ in range(decode_len):
+            hidden_states = torch.randn(batch_size, 1, hidden_size, dtype=dtype, device=device)
+            out = model(
+                inputs_embeds=hidden_states,
+                past_key_values=out.past_key_values,
+                use_cache=True,
+            )
+        torch.cuda.synchronize()
+        te_decode_total = time.perf_counter()
+        decode_latency.append(te_decode_total - ts_decode_total)
+    
+    # Calculate metrics
+    avg_prefill_latency = np.mean(prefill_latency)
+    avg_decode_latency = np.mean(decode_latency)
+    # avg_decode_latency -= 0.0019366741180 * 32
+    
+    # Calculate throughput
+    prefill_throughput = (batch_size * context_len) / avg_prefill_latency
+    decode_throughput = (batch_size * decode_len) / avg_decode_latency
+    
+    # Print results in a table format
+    print("\n===== BENCHMARK RESULTS =====")
+    print(f"Model: {args.model_path}")
+    print(f"Batch Size: {batch_size}")
+    print(f"Context Length: {context_len}")
+    print(f"Decode Length: {decode_len}")
+    print(f"Quantization: {args.num_bits}-bit {args.quant_mode}")
+    print("\n--- Latency ---")
+    print(f"Avg Prefill Latency: {avg_prefill_latency:.4f} s")
+    print(f"Avg Decode Latency (total): {avg_decode_latency:.4f} s")
+    print(f"Avg Decode Latency (per token): {avg_decode_latency/decode_len:.4f} s")
+    print("\n--- Throughput ---")
+    print(f"Prefill Throughput: {prefill_throughput:.2f} tokens/s")
+    print(f"Decode Throughput: {decode_throughput:.2f} tokens/s")
+    
+    # CSV format for easy parsing
+    print("\n--- CSV Format ---")
+    print("batch_size,context_len,decode_len,prefill_latency,decode_latency,prefill_throughput,decode_throughput")
+    print(f"{batch_size},{context_len},{decode_len},{avg_prefill_latency:.4f},{avg_decode_latency:.4f},{prefill_throughput:.2f},{decode_throughput:.2f}")
+
+# Run the benchmark only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    benchmark_throughput()
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=0 python test_bitblas.py`