SJTU-DENG-Lab
diff --git a/‎diffulex/extensions/quantization/README.md‎
Lines changed: 69 additions & 0 deletions b/‎diffulex/extensions/quantization/README.md‎
Lines changed: 69 additions & 0 deletions
diff --git a/‎diffulex/extensions/quantization/__init__.py‎
Lines changed: 62 additions & 8 deletions b/‎diffulex/extensions/quantization/__init__.py‎
Lines changed: 62 additions & 8 deletions
diff --git a/‎diffulex/extensions/quantization/bootstrap.py‎
Lines changed: 139 additions & 0 deletions b/‎diffulex/extensions/quantization/bootstrap.py‎
Lines changed: 139 additions & 0 deletions
diff --git a/‎diffulex/extensions/quantization/context.py‎
Lines changed: 7 additions & 1 deletion b/‎diffulex/extensions/quantization/context.py‎
Lines changed: 7 additions & 1 deletion
@@ -27,6 +27,7 @@ Zero-coupling quantization support for Diffulex. This extension adds support for
 **KV Cache Quantization:**
 - FP8 E4M3: 8-bit KV cache with E4M3 format
 - FP8 E5M2: 8-bit KV cache with E5M2 format
+- **Custom FP8 Triton Kernel**: On-the-fly dequantization during attention
 - BF16: No quantization (default)
 
 ## Installation
@@ -203,6 +204,46 @@ class CustomLinearStrategy(LinearQuantizationStrategy):
 
 ## Architecture
 
+### Directory Structure
+
+```
+diffulex/extensions/quantization/
+├── __init__.py                 # Main API exports
+├── bootstrap.py                # Extension initialization
+├── config.py                   # Configuration classes
+├── context.py                  # Strategy context management
+├── registry.py                 # Strategy registry
+├── strategy.py                 # Base strategy classes
+├── layer_patch.py              # Layer monkey patching
+├── layer_mixin.py              # Quantized layer mixin
+├── kv_cache_patch.py           # KV cache quantization
+├── linear_plans.py             # Forward plan definitions
+├── linear_plan_builder.py      # Plan builder
+├── loader_patch.py             # Weight loader patching
+├── test_basic.py               # Basic tests
+├── README.md                   # This documentation
+├── kernels/                    # Kernel implementations
+│   ├── __init__.py
+│   ├── kernel_registry.py      # Kernel registry & base classes
+│   ├── kernel_availability.py  # Availability checking & warnings
+│   ├── vllm_kernels.py         # vLLM kernel wrappers
+│   └── triton_kernels/         # Custom Triton kernels
+│       ├── __init__.py
+│       └── fp8_kv_attention.py # FP8 KV attention kernel
+└── strategies/                 # Quantization strategies
+    ├── __init__.py
+    ├── kv_cache_bf16.py
+    ├── kv_cache_fp8_running_max.py
+    ├── linear_bf16.py
+    ├── linear_fp8_w8a8.py
+    ├── linear_fp8_w8a16.py
+    ├── linear_int8_w8a8.py
+    ├── linear_int8_w8a16.py
+    ├── linear_gptq_w*.py
+    ├── linear_awq_*.py
+    └── linear_w4a8_cutlass.py
+```
+
 ### Zero-Coupling Design
 
 This extension uses a **zero-coupling architecture** that ensures:
@@ -236,6 +277,34 @@ Run the test suite:
 python -m diffulex.extensions.quantization.test_basic
 ```
 
+## Advanced Features
+
+### Custom FP8 KV Cache Triton Kernel
+
+The extension includes a custom Triton kernel for FP8 KV cache attention that performs on-the-fly dequantization, avoiding explicit dequantize-copy operations:
+
+```python
+from diffulex.extensions import quantization
+
+# Check if Triton kernel is available
+if quantization._HAS_FP8_TRITON_KERNEL:
+    print("FP8 Triton kernel available")
+
+# Enable FP8 KV cache
+quantization.enable(kv_cache_dtype="fp8_e4m3")
+
+# The kernel will be automatically used for attention computation
+```
+
+**Benefits:**
+- On-the-fly dequantization in Triton kernel
+- Reduces memory bandwidth by ~50% for KV cache
+- Faster than explicit dequantize + attention
+
+**Requirements:**
+- Triton >= 2.0
+- CUDA-capable GPU
+
 ## Troubleshooting
 
 ### "Cannot import name 'enable'"
 
@@ -19,6 +19,7 @@
 - AWQ W4A16: 4-bit AWQ quantized weights
 - GPTQ/AWQ + Marlin: Optimized kernels for above
 - FP8 KV Cache: FP8 quantized KV cache
+- Custom Triton Kernels: On-the-fly dequantization
 
 Example usage:
     # FP8 W8A8 with FP8 KV Cache
@@ -53,13 +54,33 @@
     auto_enable_from_config,
 )
 
-# Kernel availability checking
-from .kernel_availability import (
+# Kernels package (unified interface)
+from .kernels import (
+    # Registry
+    KernelRegistry,
+    register_kernel,
+    get_kernel,
+    list_available_kernels,
+    # Availability
     check_vllm_op_available,
+    check_kernel_available,
     get_kernel_status,
     print_kernel_status,
     set_strict_mode,
     is_strict_mode,
+    warn_kernel_unavailable,
+    # vLLM wrappers
+    VllmGPTQGemm,
+    VllmAWQGemm,
+    VllmMarlinGemm,
+    VllmCutlassScaledMM,
+    VllmAllSparkW8A16,
+    VllmCutlassW4A8,
+    VllmFp8LinearOp,
+    # Triton kernels
+    Fp8KVAttentionKernel,
+    fp8_kv_attention_forward,
+    _HAS_TRITON_KERNELS,
 )
 
 # Configuration
@@ -96,14 +117,16 @@
 # Concrete strategies (for advanced usage)
 from .strategies.kv_cache_bf16 import BF16KVCacheStrategy
 from .strategies.linear_bf16 import BF16LinearStrategy
-from .strategies.linear_fp8_w8a8 import FP8W8A8LinearStrategy
-from .strategies.linear_fp8_w8a16 import FP8W8A16LinearStrategy
+from .strategies.linear_fp8_w8a8 import FP8E4M3W8A8LinearStrategy, FP8E5M2W8A8LinearStrategy
+from .strategies.linear_fp8_w8a16 import FP8E4M3W8A16LinearStrategy, FP8E5M2W8A16LinearStrategy
 from .strategies.linear_int8_w8a8 import INT8W8A8LinearStrategy
 from .strategies.linear_int8_w8a16 import INT8W8A16LinearStrategy
-from .strategies.linear_gptq_w2a16 import GPTQW2A16LinearStrategy
-from .strategies.linear_gptq_w3a16 import GPTQW3A16LinearStrategy
-from .strategies.linear_gptq_w4a16 import GPTQW4A16LinearStrategy
-from .strategies.linear_gptq_w8a16 import GPTQW8A16LinearStrategy
+from .strategies.linear_gptq_wxa16 import (
+    GPTQW2A16LinearStrategy,
+    GPTQW3A16LinearStrategy,
+    GPTQW4A16LinearStrategy,
+    GPTQW8A16LinearStrategy,
+)
 from .strategies.linear_gptq_marlin_w4a16 import GPTQMarlinW4A16LinearStrategy
 from .strategies.linear_gptq_marlin_w8a16 import GPTQMarlinW8A16LinearStrategy
 from .strategies.linear_awq_w4a16 import AWQW4A16LinearStrategy
@@ -131,6 +154,7 @@
     ForwardPlanBase,
     ForwardPlanSig,
     BF16Plan,
+    QuantizedLinearPlan,
     QuantInt8W8A16Plan,
     QuantInt8W8A8Plan,
     QuantFP8W8A8Plan,
@@ -143,6 +167,9 @@
 )
 from .linear_plan_builder import build_forward_plan, rebuild_plan_if_needed
 
+# Offline quantization
+from .quantize_model import quantize_model
+
 __all__ = [
     # Bootstrap
     "enable",
@@ -152,6 +179,29 @@
     "configure_from_args",
     "auto_enable_from_config",
 
+    # Kernels
+    "KernelRegistry",
+    "register_kernel",
+    "get_kernel",
+    "list_available_kernels",
+    "check_vllm_op_available",
+    "check_kernel_available",
+    "get_kernel_status",
+    "print_kernel_status",
+    "set_strict_mode",
+    "is_strict_mode",
+    "warn_kernel_unavailable",
+    "VllmGPTQGemm",
+    "VllmAWQGemm",
+    "VllmMarlinGemm",
+    "VllmCutlassScaledMM",
+    "VllmAllSparkW8A16",
+    "VllmCutlassW4A8",
+    "VllmFp8LinearOp",
+    "Fp8KVAttentionKernel",
+    "fp8_kv_attention_forward",
+    "_HAS_TRITON_KERNELS",
+    
     # Configuration
     "QuantizationConfig",
     "KVCacheQuantConfig",
@@ -210,6 +260,7 @@
     "ForwardPlanBase",
     "ForwardPlanSig",
     "BF16Plan",
+    "QuantizedLinearPlan",
     "QuantInt8W8A16Plan",
     "QuantInt8W8A8Plan",
     "QuantFP8W8A8Plan",
@@ -221,4 +272,7 @@
     "DirectMarlinGemmPlan",
     "build_forward_plan",
     "rebuild_plan_if_needed",
+    
+    # Offline quantization
+    "quantize_model",
 ]
@@ -15,6 +15,8 @@
 import sys
 from typing import Optional, Dict, Any
 
+import torch
+
 # Global state
 _is_enabled = False
 _quant_config: Optional[Dict[str, Any]] = None
@@ -236,6 +238,14 @@ def patched_init(self, *args, **kwargs):
             patch_loader()
         except Exception:
             pass
+    
+    # Patch model to quantize weights after loading
+    # Only patch when the actual engine module is imported (not during recursion)
+    if module_name == 'diffulex.diffulex' or module_name == 'diffulex.engine.tp_worker':
+        try:
+            _patch_model_for_weight_quantization(module)
+        except Exception:
+            pass
 
 
 # Convenience function for configuring quantization from CLI args
@@ -333,3 +343,132 @@ def auto_enable_from_config(config):
             }
 
     return enable(config=quant_config)
+
+
+def _patch_model_for_weight_quantization(module):
+    """
+    Patch model initialization to quantize weights after loading.
+    
+    This ensures online quantization (INT8/FP8) is applied to weights
+    immediately after model creation, not during each forward pass.
+    """
+    from .context import get_linear_strategy
+    from .layer_mixin import LinearQuantizationMixin
+    
+    # Find the Diffulex class
+    DiffulexClass = None
+    for attr_name in ['DiffulexTPWorker', 'Diffulex', 'DiffulexDPWorker']:
+        if hasattr(module, attr_name):
+            DiffulexClass = getattr(module, attr_name)
+            break
+    
+    if DiffulexClass is None:
+        return
+    
+    original_init = DiffulexClass.__init__
+    
+    def patched_init(self, *args, **kwargs):
+        # Call original init
+        original_init(self, *args, **kwargs)
+        
+        # After initialization, quantize weights if needed
+        _quantize_model_weights(self)
+    
+    DiffulexClass.__init__ = patched_init
+
+
+def _quantize_model_weights(model_wrapper):
+    """
+    Quantize all linear layer weights in the model.
+    
+    This is called once after model loading to pre-quantize weights.
+    """
+    from .context import get_linear_strategy
+    from .layer_mixin import LinearQuantizationMixin
+    
+    # Check if already quantized (avoid duplicate quantization in multi-worker setup)
+    if getattr(model_wrapper, '_weights_quantized', False):
+        return
+    
+    # Get model runner
+    model_runner = getattr(model_wrapper, 'model_runner', None)
+    if model_runner is None:
+        return
+    
+    model = getattr(model_runner, 'model', None)
+    if model is None:
+        return
+    
+    # Get current quantization config
+    weight_method = _quant_config.get('weights', {}).get('method', 'bf16')
+    
+    # Skip if not online quantization
+    if weight_method in ['bf16', 'none']:
+        return
+    
+    # Skip if offline quantization (GPTQ/AWQ) - those are already quantized
+    if any(fmt in weight_method.lower() for fmt in ['gptq', 'awq', 'marlin']):
+        return
+    
+    # Mark as quantized to avoid duplicate work
+    model_wrapper._weights_quantized = True
+    
+    print(f"[Quantization] Pre-quantizing model weights to {weight_method}...")
+    
+    # Get strategy
+    strategy = get_linear_strategy('attn')  # Use attn strategy for all
+    if strategy is None:
+        return
+    
+    quantized_count = 0
+    total_saved_bytes = 0
+    
+    # Iterate through all modules
+    for name, module in model.named_modules():
+        # Check if this is a quantized linear layer
+        if isinstance(module, LinearQuantizationMixin):
+            # Skip if already quantized
+            if module.has_quantized_weight() or module.has_offline_quantized_weight():
+                continue
+            
+            # Quantize weight
+            try:
+                weight = module.weight
+                if weight is None or weight.dtype != torch.bfloat16:
+                    continue
+                
+                original_size = weight.numel() * weight.element_size()
+                
+                # Use strategy to quantize weight
+                q_weight, w_meta = strategy.quantize_weight_for_kernel(weight)
+                w_scale = w_meta.get('scale')
+                w_zero = w_meta.get('zero_point')
+                
+                # Store quantized weight
+                module.set_quantized_weight(q_weight, w_scale, w_zero)
+                
+                # Delete original weight to save memory
+                if hasattr(module, 'weight'):
+                    delattr(module, 'weight')
+                    if 'weight' in module._parameters:
+                        del module._parameters['weight']
+                
+                quantized_size = q_weight.numel() * q_weight.element_size()
+                total_saved_bytes += (original_size - quantized_size)
+                quantized_count += 1
+                
+            except Exception as e:
+                # Log but continue
+                print(f"[Quantization] Warning: Failed to quantize {name}: {e}")
+                continue
+    
+    if quantized_count > 0:
+        saved_mb = total_saved_bytes / (1024 ** 2)
+        print(f"[Quantization] Pre-quantized {quantized_count} layers to {weight_method}")
+        print(f"[Quantization] Estimated memory saved: {saved_mb:.1f} MB")
+        
+        # Force CUDA synchronization to get accurate memory stats
+        if torch.cuda.is_available():
+            torch.cuda.synchronize()
+            mem_allocated = torch.cuda.memory_allocated() / 1024**3
+            print(f"[Quantization] Current GPU memory: {mem_allocated:.2f} GB")
@@ -87,13 +87,19 @@ def _act_quant_cache_key(self, x: torch.Tensor) -> tuple:
         Uses data pointer, shape, stride, dtype, device, and version
         to ensure cache correctness.
         """
+        # Handle inference tensors (no version tracking in no_grad mode)
+        try:
+            version = int(x._version)
+        except (RuntimeError, AttributeError):
+            version = -1
+        
         return (
             int(x.data_ptr()),
             tuple(x.shape),
             tuple(x.stride()),
             str(x.dtype),
             str(x.device),
-            int(getattr(x, "_version", -1)),
+            version,
         )
 
     def get_cached_act_quant(self, x: torch.Tensor) -> Optional[Tuple[torch.Tensor, torch.Tensor]]: