inclusionAI · zheyang0825 · Apr 9, 2026 · Apr 21, 2026 · Apr 21, 2026 · May 12, 2026
diff --git a/benchmarks/bench_kda_decode.py b/benchmarks/bench_kda_decode.py
@@ -165,7 +165,7 @@ def write_markdown_report(args, gpu_name: str, sections: list[tuple[int, int, li
     def summary(vals):
         if not vals:
             return "n/a"
-        return f"avg={sum(vals)/len(vals):.2f}x, min={min(vals):.2f}x, max={max(vals):.2f}x"
+        return f"avg={sum(vals) / len(vals):.2f}x, min={min(vals):.2f}x, max={max(vals):.2f}x"
 
     lines = []
     lines.append("# Benchmark Results - KDA Decode")
@@ -340,9 +340,15 @@ def setup_fla_v_last():
         state_bench_fla_v_last.copy_(state_init_v_last)
 
     with torch.no_grad():
-        t_cula_v_last = benchmark_fn(lambda: call_cula_v_last(state_bench_cula_v_last), setup_fn=setup_cula_v_last, warmup=w, rep=r)
-        t_cula_k_last = benchmark_fn(lambda: call_cula_k_last(state_bench_cula_k_last), setup_fn=setup_cula_k_last, warmup=w, rep=r)
-        t_fla_v_last = benchmark_fn(lambda: call_fla_v_last(state_bench_fla_v_last), setup_fn=setup_fla_v_last, warmup=w, rep=r)
+        t_cula_v_last = benchmark_fn(
+            lambda: call_cula_v_last(state_bench_cula_v_last), setup_fn=setup_cula_v_last, warmup=w, rep=r
+        )
+        t_cula_k_last = benchmark_fn(
+            lambda: call_cula_k_last(state_bench_cula_k_last), setup_fn=setup_cula_k_last, warmup=w, rep=r
+        )
+        t_fla_v_last = benchmark_fn(
+            lambda: call_fla_v_last(state_bench_fla_v_last), setup_fn=setup_fla_v_last, warmup=w, rep=r
+        )
 
     return {
         "N": N,
@@ -421,10 +427,7 @@ def print_section(h_dim: int, v_dim: int):
             )
 
         print()
-        hdr_out = (
-            f"{'N':>5} | {'cuLA v out RMSE':>16} | {'rel':>10} | "
-            f"{'cuLA k out RMSE':>16} | {'rel':>10}"
-        )
+        hdr_out = f"{'N':>5} | {'cuLA v out RMSE':>16} | {'rel':>10} | {'cuLA k out RMSE':>16} | {'rel':>10}"
         print(hdr_out)
         print("-" * len(hdr_out))
         for res in results:
@@ -434,10 +437,7 @@ def print_section(h_dim: int, v_dim: int):
             )
 
         print()
-        hdr_state = (
-            f"{'N':>5} | {'cuLA v state RMSE':>18} | {'rel':>10} | "
-            f"{'cuLA k state RMSE':>18} | {'rel':>10}"
-        )
+        hdr_state = f"{'N':>5} | {'cuLA v state RMSE':>18} | {'rel':>10} | {'cuLA k state RMSE':>18} | {'rel':>10}"
         print(hdr_state)
         print("-" * len(hdr_state))
         for res in results:

diff --git a/cula/kda/__init__.py b/cula/kda/__init__.py
@@ -12,13 +12,58 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from cula.kda.chunk import chunk_kda
-from cula.kda.hopper_fused_fwd import cula_kda_prefill as kda_prefill_hopper
-from cula.ops.kda_decode import fused_sigmoid_gating_delta_rule_update, kda_decode
-
-__all__ = [
-    "chunk_kda",
-    "kda_decode",
-    "fused_sigmoid_gating_delta_rule_update",
-    "kda_prefill_hopper",
-]
+import importlib
+
+# Public name -> (module path, attribute name inside that module).
+# Entries are imported lazily by __getattr__, so `import cula.kda` itself
+# does not import any of the modules listed here.
+_API_MAP = {
+    "chunk_kda": ("cula.kda.chunk", "chunk_kda"),
+    "kda_prefill_hopper": ("cula.kda.hopper_fused_fwd", "cula_kda_prefill"),
+    "kda_decode": ("cula.ops.kda_decode", "kda_decode"),
+    "fused_sigmoid_gating_delta_rule_update": ("cula.ops.kda_decode", "fused_sigmoid_gating_delta_rule_update"),
+}
+
+
+def __getattr__(name: str):
+    """Resolve a public ``cula.kda`` name on first access (PEP 562).
+
+    Args:
+        name: Attribute requested on the ``cula.kda`` module.
+
+    Returns:
+        The object registered for ``name`` in ``_API_MAP``.
+
+    Raises:
+        AttributeError: ``name`` is not in ``_API_MAP``, or the target module
+            does not define the mapped attribute.
+        ImportError: The target module could not be imported.
+    """
+    if name not in _API_MAP:
+        raise AttributeError(f"module 'cula.kda' has no attribute {name!r}")
+
+    mod_path, attr_name = _API_MAP[name]
+
+    try:
+        mod = importlib.import_module(mod_path)
+    except ImportError as e:
+        # Only a missing `fla` package gets the install hint. Any other import
+        # failure propagates unchanged so its real cause is not hidden.
+        if (e.name or "").split(".")[0] != "fla":
+            raise
+        raise ImportError(
+            "cula.kda requires flash-linear-attention. Install with: pip install cuda-linear-attention[fla]"
+        ) from e
+
+    try:
+        value = getattr(mod, attr_name)
+    except AttributeError as e:
+        raise AttributeError(f"Module {mod_path} does not define {attr_name}") from e
+
+    # Cache in the module namespace: later lookups find the name directly and
+    # no longer go through __getattr__.
+    globals()[name] = value
+    return value
+
+
+__all__ = list(_API_MAP)
diff --git a/cula/ops/kda_fully_fused_wip.py b/cula/kda/kda_fully_fused_wip.py
rename from cula/ops/kda_fully_fused_wip.py
rename to cula/kda/kda_fully_fused_wip.py
diff --git a/cula/ops/__init__.py b/cula/ops/__init__.py
@@ -20,4 +20,3 @@
     "fused_sigmoid_gating_delta_rule_update",
     "linear_attention_decode",
 ]
-
diff --git a/cula/ops/chunk_delta_h.py b/cula/ops/chunk_delta_h.py
@@ -32,10 +32,8 @@
 from cutlass.cute.runtime import make_fake_compact_tensor, make_fake_stream
 from cutlass.cute.typing import Float32, Int32, Int64
 from cutlass.cutlass_dsl import T as _T
-from fla.ops.utils import prepare_chunk_indices, prepare_lens
-from fla.utils import tensor_cache
 
-from cula.utils import USE_FAST_MATH, assert_blackwell
+from cula.utils import USE_FAST_MATH, assert_blackwell, prepare_chunk_indices, prepare_lens, tensor_cache
 
 
 # in FLA, cumsum returns int64 tensor by default

diff --git a/cula/ops/fwd_o.py b/cula/ops/fwd_o.py
@@ -77,9 +77,8 @@
 from cutlass.cute.nvgpu import cpasync, tcgen05
 from cutlass.cute.runtime import make_fake_compact_tensor, make_fake_stream
 from cutlass.cute.typing import Float32, Int32, Int64
-from fla.ops.utils import prepare_chunk_indices
 
-from cula.utils import USE_FAST_MATH, assert_blackwell
+from cula.utils import USE_FAST_MATH, assert_blackwell, prepare_chunk_indices
 
 PRINT_DEBUG = False
 PRINT_SMEM_DEBUG = False  # Print SMEM contents after TMA loads for non-aligned varlen debug

diff --git a/cula/ops/kda_decode.py b/cula/ops/kda_decode.py
@@ -114,6 +114,7 @@ def _get_cached_dispatch_bundle(
         _get_cached_cute_tensor(o, leading_dim=o.ndim - 1),
     )
 
+
 def _select_small_blocks_per_state(N: int, H: int, HV: int, V: int) -> int:
     del HV
     num_v_tiles_small = V // TILE_V_SMALL
@@ -245,7 +246,7 @@ def _try_fast_dense_decode(
         N4_DENSE_SMALL_HV_PARALLEL_HEAD_THRESHOLD if N <= 4 else DENSE_SMALL_HV_PARALLEL_HEAD_THRESHOLD
     )
     dense_small_hv_parallel = (
-        use_small_batch and H <= dense_small_hv_parallel_head_threshold and N <= DENSE_SMALL_HV_PARALLEL_MAX_N
+        use_small_batch and dense_small_hv_parallel_head_threshold >= H and N <= DENSE_SMALL_HV_PARALLEL_MAX_N
     )
     num_blocks_per_state_small = _select_small_blocks_per_state(N, H, HV, V)
 
@@ -1938,7 +1939,7 @@ def kda_decode(
     dense_small_hv_parallel = (
         use_small_batch
         and (not is_varlen_decode)
-        and H <= dense_small_hv_parallel_head_threshold
+        and dense_small_hv_parallel_head_threshold >= H
         and N <= DENSE_SMALL_HV_PARALLEL_MAX_N
     )
 
@@ -1962,12 +1963,16 @@ def kda_decode(
         pool_size = h0_source.shape[0]
         if state_layout == "kv":
             if h0_source.shape[2:] != (K, V):
-                raise ValueError(f"State layout mismatch for state_layout='kv': got {h0_source.shape}, expected (..., {HV}, {K}, {V})")
+                raise ValueError(
+                    f"State layout mismatch for state_layout='kv': got {h0_source.shape}, expected (..., {HV}, {K}, {V})"
+                )
             state_layout_is_kv = True
             fast_path = True
         else:
             if h0_source.shape[2:] != (V, K):
-                raise ValueError(f"State layout mismatch for state_layout='vk': got {h0_source.shape}, expected (..., {HV}, {V}, {K})")
+                raise ValueError(
+                    f"State layout mismatch for state_layout='vk': got {h0_source.shape}, expected (..., {HV}, {V}, {K})"
+                )
             fast_path = True
 
     if fast_path:

diff --git a/cula/ops/kda_decode_fla.py b/cula/ops/kda_decode_fla.py
@@ -1,9 +1,8 @@
-from typing import Optional
-
 import torch
 import triton
 import triton.language as tl
 
+
 @triton.jit(do_not_specialize=["T"])
 def fused_sigmoid_gating_delta_rule_update_kernel(
     A_log,
@@ -78,13 +77,7 @@ def fused_sigmoid_gating_delta_rule_update_kernel(
     if USE_INITIAL_STATE:
         idx = tl.load(h0_indices + i_n)
         if idx >= 0:
-            p_h0 = (
-                h0_source
-                + idx * HV * K * V
-                + i_hv * K * V
-                + o_k[:, None] * V
-                + o_v[None, :]
-            )
+            p_h0 = h0_source + idx * HV * K * V + i_hv * K * V + o_k[:, None] * V + o_v[None, :]
             b_h += tl.load(p_h0, mask=mask_h, other=0).to(tl.float32)
 
     for _ in range(0, T):
@@ -155,13 +148,7 @@ def fused_sigmoid_gating_delta_rule_update_kernel(
     if USE_INITIAL_STATE:
         idx = tl.load(h0_indices + i_n)
         if idx >= 0:
-            p_h0 = (
-                h0_source
-                + idx * HV * K * V
-                + i_hv * K * V
-                + o_k[:, None] * V
-                + o_v[None, :]
-            )
+            p_h0 = h0_source + idx * HV * K * V + i_hv * K * V + o_k[:, None] * V + o_v[None, :]
             tl.store(p_h0, b_h.to(p_h0.dtype.element_ty), mask=mask_h)
 
 
@@ -177,9 +164,9 @@ def fused_sigmoid_gating_delta_rule_update(
     b: torch.Tensor,
     initial_state_source: torch.Tensor,
     initial_state_indices: torch.Tensor,
-    scale: Optional[float] = None,
+    scale: float | None = None,
     use_qk_l2norm_in_kernel: bool = False,
-    cu_seqlens: Optional[torch.Tensor] = None,
+    cu_seqlens: torch.Tensor | None = None,
     is_kda: bool = False,
 ):
     """

diff --git a/cula/utils.py b/cula/utils.py
@@ -233,3 +233,87 @@ def _get_cache_buf(name: str, nbytes: int, device: torch.device) -> torch.Tensor
         buf = torch.empty(nbytes, dtype=torch.uint8, device=device)
         _cache_buf[key] = buf
     return buf
+
+
+# ---------------------------------------------------------------------------
+# Tensor cache
+# Adapted from: https://github.com/fla-org/flash-linear-attention/blob/main/fla/utils.py
+# Original copyright: 2024 The FLA Authors (same Apache 2.0 license)
+# ---------------------------------------------------------------------------
+# Read once at import time; later changes to the environment are not seen.
+_CULA_DISABLE_TENSOR_CACHE: bool = os.getenv("CULA_DISABLE_TENSOR_CACHE", "0") == "1"
+
+
+def tensor_cache(fn):
+    """Wrap ``fn`` with a single-entry cache keyed on argument identity.
+
+    Only the most recent call is remembered. A later call returns the stored
+    result when it passes the same number of positional and keyword arguments
+    and each one ``is`` the stored argument. Values are never compared, so a
+    tensor modified in place still produces a hit. The last arguments and
+    result stay referenced by the wrapper until the next miss replaces them.
+    With ``CULA_DISABLE_TENSOR_CACHE=1`` the wrapper always calls ``fn``.
+    """
+    last_args = None
+    last_kwargs = None
+    last_result = None
+
+    @functools.wraps(fn)
+    def wrapper(*args, **kwargs):
+        nonlocal last_args, last_kwargs, last_result
+        if _CULA_DISABLE_TENSOR_CACHE:
+            return fn(*args, **kwargs)
+        if last_args is not None and last_kwargs is not None:
+            if len(args) == len(last_args) and len(kwargs) == len(last_kwargs):
+                if all(a is b for a, b in zip(args, last_args, strict=False)) and all(
+                    k in last_kwargs and v is last_kwargs[k] for k, v in kwargs.items()
+                ):
+                    return last_result
+        result = fn(*args, **kwargs)
+        last_args, last_kwargs, last_result = args, kwargs, result
+        return result
+
+    return wrapper
+
+
+# ---------------------------------------------------------------------------
+# Sequence-length helpers
+# Adapted from: https://github.com/fla-org/flash-linear-attention/blob/main/fla/ops/utils/index.py
+# Original copyright: 2024 The FLA Authors (same Apache 2.0 license)
+# ---------------------------------------------------------------------------
+@tensor_cache
+def prepare_lens(cu_seqlens: torch.LongTensor) -> torch.LongTensor:
+    """Return the differences between consecutive entries of ``cu_seqlens``."""
+    return torch.diff(cu_seqlens)
+
+
+@tensor_cache
+def prepare_chunk_indices(
+    cu_seqlens: torch.LongTensor,
+    chunk_size: int,
+    cu_seqlens_cpu: torch.LongTensor | None = None,
+) -> torch.LongTensor:
+    """List every chunk of every sequence as a row of two indices.
+
+    Args:
+        cu_seqlens: Cumulative sequence boundaries. The result is converted to
+            this tensor's dtype and device.
+        chunk_size: Number of positions per chunk.
+        cu_seqlens_cpu: If given, the chunk counts are computed from this
+            tensor instead of ``cu_seqlens`` (presumably a host copy of the
+            same boundaries -- confirm against callers).
+
+    Returns:
+        A two-column tensor with one row per chunk. Column 1 is the chunk's
+        position within its sequence; column 0 is the zero-based count of
+        sequence starts seen so far.
+    """
+    if cu_seqlens_cpu is not None:
+        lens, arange_device = prepare_lens(cu_seqlens_cpu), cu_seqlens.device
+    else:
+        # device=None: ranges use the default device; the final `.to` moves them.
+        lens, arange_device = prepare_lens(cu_seqlens), None
+    # Ceiling division in tensor arithmetic, so triton is not needed here.
+    num_chunks = ((lens + chunk_size - 1) // chunk_size).tolist()
+    indices = torch.cat([torch.arange(n, device=arange_device) for n in num_chunks])
+    return torch.stack([indices.eq(0).cumsum(0) - 1, indices], 1).to(cu_seqlens)
diff --git a/pyproject.toml b/pyproject.toml
@@ -14,9 +14,12 @@ dependencies = [
     "nvidia-cutlass-dsl==4.4.2",
     "apache-tvm-ffi==0.1.9",
 ]
-license = "Apache-2.0"
+license = {text = "Apache-2.0"}
 
 [project.optional-dependencies]
+fla = [
+    "flash-linear-attention>=0.5",
+]
 dev = [
     "pytest",
     "matplotlib",
@@ -80,8 +83,10 @@ force-sort-within-sections = false
 "__init__.py" = ["F401"]
 "cula/_version.py" = ["UP007"]
 # TODO: fix undefined names (exp_g, chunk_kda_bwd_dqkwg) — WIP code
-"cula/ops/kda_fully_fused_wip.py" = ["F821"]
+"cula/kda/kda_fully_fused_wip.py" = ["F821"]
 "cula/kda/blackwell_fused_fwd.py" = ["F821"]
+# Ignore unused variable warnings in tests (often intentional for clarity)
+"tests/*" = ["F841"]
 
 [tool.setuptools_scm]
 # write generated version into package for runtime access

diff --git a/tests/test_kda_decode.py b/tests/test_kda_decode.py
@@ -33,7 +33,7 @@
 
 sys.path.insert(0, str(pathlib.Path(__file__).resolve().parent.parent))
 
-from cula.kda import kda_decode, fused_sigmoid_gating_delta_rule_update
+from cula.kda import fused_sigmoid_gating_delta_rule_update, kda_decode
 
 
 # ---------------------------------------------------------------------------
Original file line number	Diff line number	Diff line change
Expand Up		@@ -20,4 +20,3 @@
		"fused_sigmoid_gating_delta_rule_update",
		"linear_attention_decode",
		]