quantized matmul kernel

chapman20j · copybara-github · commit f85ca8829acf · 2026-03-30T10:29:17.000-07:00
PiperOrigin-RevId: 891781786
diff --git a/qwix/_src/core/dot_general.py b/qwix/_src/core/dot_general.py
@@ -15,12 +15,15 @@
 # pylint: disable=line-too-long
 
 from collections.abc import Collection, Sequence
+import dataclasses
 import itertools
 from typing import Any
+
 import jax
 from jax import numpy as jnp
 from qwix._src.core import numerics
 from qwix._src.core import qarray
+from qwix.contrib.kernels import quantized_matmul
 
 
 def get_how_to_quantize(
@@ -97,6 +100,7 @@ def _apply_tiling(
   Returns:
     A tuple of (new_ca, new_ba, sum_axes).
   """
+  a = 0
   new_ca = [a + sum(t <= a for t in tiled_axes) for a in contracting_axes]
   new_ba = [a + sum(t < a for t in tiled_axes) for a in batch_axes]
   # We choose to insert the tile_count axes to the end of the batch axes.
@@ -399,6 +403,9 @@ def dot_general(
     dimension_numbers: jax.lax.DotDimensionNumbers,
     precision: jax.lax.PrecisionLike = None,
     preferred_element_type: jax.typing.DTypeLike | None = None,
+    *,
+    use_kernel: bool = False,
+    kernel_config: quantized_matmul.QuantizedMatmulConfig | None = None,
     **kwargs,
 ) -> jax.Array:
   """Computes a general dot product with support for ``QArray`` inputs.
@@ -413,6 +420,8 @@ def dot_general(
     dimension_numbers: The dimension numbers passed to dot_general.
     precision: The precision for jax.lax.dot_general.
     preferred_element_type: The preferred element type for jax.lax.dot_general.
+    use_kernel: Whether to use the Pallas kernel implementation.
+    kernel_config: Keyword arguments to pass to the Pallas kernel.
     **kwargs: Additional keyword arguments to dot_general.
 
   Returns:
@@ -453,6 +462,19 @@ def dot_general(
           use_fast_dot_general = False
           break
 
+  if (
+      use_kernel
+      and isinstance(lhs, qarray.QArray)
+      and isinstance(rhs, qarray.QArray)
+      and quantized_matmul.can_use_qmm_in_dot_general(
+          lhs, rhs, dimension_numbers
+      )
+  ):
+    kernel_kwargs = dataclasses.asdict(kernel_config)
+    return quantized_matmul.q_matmul(
+        lhs.qvalue, lhs.scale, rhs.qvalue, rhs.scale, **kernel_kwargs
+    )
+
   if use_fast_dot_general:
     return _fast_dot_general(
         lhs,
diff --git a/qwix/contrib/kernels/quantized_matmul.py b/qwix/contrib/kernels/quantized_matmul.py
@@ -0,0 +1,166 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Implements a quantized matmul kernel."""
+
+import dataclasses
+from typing import Any
+
+import jax
+import jax.experimental.pallas as pl
+import jax.numpy as jnp
+from qwix._src.core import qarray
+
+# Forwarded as the `interpret=` argument of `pl.pallas_call` in `q_matmul`.
+# NOTE(review): hard-coded to True; presumably a debugging/portability default
+# -- confirm whether compiled execution is intended outside of tests.
+INTERPRET: bool = True
+
+
+@dataclasses.dataclass
+class QuantizedMatmulConfig:
+  """Tuning options for the quantized matmul kernel.
+
+  The field names and defaults mirror the keyword arguments of `q_matmul`.
+  """
+
+  # Block size for the m dimension.
+  bm: int = 128
+  # Block size for the k dimension.
+  bk: int = 128
+  # Block size for the n dimension.
+  bn: int = 128
+  # Data type of the kernel output.
+  dtype: jnp.dtype = jnp.float32
+
+
+def can_use_qmm(x, sx, y, sy, *, bm, bk, bn):
+  """Returns whether `q_matmul` can handle the operands and block sizes.
+
+  The checks mirror the assertions made by `q_matmul`, so a True result means
+  those assertions hold for the same arguments. Only the `shape` attribute of
+  each operand is read.
+
+  Args:
+    x: The left-hand side matrix, shaped (m, k).
+    sx: The left-hand side scales, shaped (1 or m // bm, number of k tiles).
+    y: The right-hand side matrix, shaped (k, n).
+    sy: The right-hand side scales, shaped (number of k tiles, 1 or n // bn).
+    bm: The block size for the m dimension.
+    bk: The block size for the k dimension.
+    bn: The block size for the n dimension.
+
+  Returns:
+    True if every precondition holds, False otherwise.
+  """
+  mdim, kdim = x.shape
+  y_kdim, ndim = y.shape
+  k_tiles = sx.shape[1]
+
+  if kdim != y_kdim:
+    # The contracting dimensions of the two operands must agree.
+    return False
+
+  if mdim % bm != 0 or ndim % bn != 0 or kdim % bk != 0:
+    # Block size must divide matrix size.
+    return False
+  m_blocks = mdim // bm
+  n_blocks = ndim // bn
+
+  if k_tiles == 0 or kdim // k_tiles != bk:
+    # Block size must match the tile size for the reduction axis. An empty
+    # scale is rejected here instead of dividing by zero.
+    return False
+  if k_tiles != sy.shape[0]:
+    # Number of tiles must match for the scales.
+    return False
+
+  if sx.shape[0] not in (1, m_blocks):
+    # The lhs scale must be shared across m or have one row per m block.
+    return False
+
+  if sy.shape[1] not in (1, n_blocks):
+    # The rhs scale must be shared across n or have one column per n block.
+    return False
+
+  return True
+
+
+def can_use_qmm_in_dot_general(
+    lhs: qarray.QArray, rhs: qarray.QArray, dimension_numbers: Any
+) -> bool:
+  """Returns whether the quantized matmul can be used in dot_general.
+
+  Only symmetric quantization (no zero points) and the plain matmul layout are
+  accepted: contract lhs axis 1 with rhs axis 0, with no batch axes. Operand
+  shapes and block sizes are not inspected here.
+
+  Args:
+    lhs: The quantized left-hand side operand.
+    rhs: The quantized right-hand side operand.
+    dimension_numbers: The dot_general dimension numbers, structured as
+      ((lhs_contracting, rhs_contracting), (lhs_batch, rhs_batch)).
+
+  Returns:
+    True if the operands and dimension numbers are supported, else False.
+  """
+  # Check the qarrays.
+  if lhs.zero_point is not None or rhs.zero_point is not None:
+    return False
+
+  # Check the dimension numbers. Each structural check runs before the
+  # unpacking that relies on it.
+  if len(dimension_numbers) != 2:
+    return False
+  contracting_axes, batch_axes = dimension_numbers
+  if len(contracting_axes) != 2 or len(batch_axes) != 2:
+    return False
+
+  lhs_ca, rhs_ca = contracting_axes
+  lhs_ba, rhs_ba = batch_axes
+  return (
+      tuple(lhs_ca) == (1,)
+      and tuple(rhs_ca) == (0,)
+      and len(lhs_ba) == 0
+      and len(rhs_ba) == 0
+  )
+
+
+def quantized_matmul_kernel(x_ref, sx_ref, y_ref, sy_ref, o_ref):
+  """Kernel body: accumulates one scaled block product into `o_ref`.
+
+  The output block is zeroed when the index along grid axis 2 is 0; every
+  step then adds `matmul(x, y) * sx * sy` for the current blocks.
+
+  Args:
+    x_ref: Block of the left-hand side quantized values.
+    sx_ref: Block of the left-hand side scales.
+    y_ref: Block of the right-hand side quantized values.
+    sy_ref: Block of the right-hand side scales.
+    o_ref: Output block that accumulates the result.
+  """
+  @pl.when(pl.program_id(2) == 0)
+  def _():
+    o_ref[...] = jnp.zeros_like(o_ref)
+
+  # A matmul of narrow integer operands returns the same narrow type, so the
+  # products would wrap around (e.g. 127 * 127 does not fit in int8). Request
+  # an accumulator of at least int32 for integer operands; other operand types
+  # keep the default result type.
+  operand_dtype = jnp.promote_types(x_ref.dtype, y_ref.dtype)
+  acc_dtype = None
+  if jnp.issubdtype(operand_dtype, jnp.integer):
+    acc_dtype = jnp.promote_types(operand_dtype, jnp.int32)
+
+  block_product = jnp.matmul(
+      x_ref[...], y_ref[...], preferred_element_type=acc_dtype
+  )
+  o_ref[...] += block_product.astype(sx_ref.dtype) * sx_ref[...] * sy_ref[...]
+
+
+def q_matmul(x, sx, y, sy, *, bm=128, bk=128, bn=128, dtype=jnp.float32):
+  """Runs the quantized matmul kernel, including subchannel-quantized inputs.
+
+  Only a restricted set of layouts is handled: every block size has to divide
+  its matrix dimension, `bk` has to equal the quantization tile size along k,
+  and each scale has to either be shared (size 1) or have one entry per block
+  along m (for `sx`) or n (for `sy`).
+
+  Args:
+    x: The left-hand side matrix.
+    sx: The left-hand side scales.
+    y: The right-hand side matrix.
+    sy: The right-hand side scales.
+    bm: The block size for the m dimension.
+    bk: The block size for the k dimension.
+    bn: The block size for the n dimension.
+    dtype: The data type of the output.
+
+  Returns:
+    The quantized matmul.
+  """
+  mdim, kdim = x.shape
+  _, ndim = y.shape
+  k_tiles = sx.shape[1]
+
+  # Every block size has to evenly divide its matrix dimension.
+  assert mdim % bm == 0, f'Block size must divide matrix size,  {mdim=} {bm=}'
+  assert ndim % bn == 0, f'Block size must divide matrix size,  {ndim=} {bn=}'
+  assert kdim % bk == 0, f'Block size must divide matrix size,  {kdim=} {bk=}'
+
+  # One reduction step per quantization tile along k.
+  k_tile_size = kdim // k_tiles
+  assert k_tile_size == bk, (
+      'Block size must match the tile size for the reduction axis'
+      f' {k_tile_size=} {bk=}'
+  )
+  assert sx.shape[1] == sy.shape[0], 'Number of tiles must match for the scales'
+
+  grid = (mdim // bm, ndim // bn, kdim // bk)
+
+  # Index map for the lhs scale: shared across m, or one row per m block.
+  if sx.shape[0] == 1:
+    def sx_index(i, j, k):
+      return (0, k)
+  else:
+    assert (
+        sx.shape[0] == grid[0]
+    ), f'Scale size must match grid size,  {sx.shape[0]=} {grid[0]=}'
+
+    def sx_index(i, j, k):
+      return (i, k)
+
+  # Index map for the rhs scale: shared across n, or one column per n block.
+  if sy.shape[1] == 1:
+    def sy_index(i, j, k):
+      return (k, 0)
+  else:
+    assert (
+        sy.shape[1] == grid[1]
+    ), f'Scale size must match grid size,  {sy.shape[1]=} {grid[1]=}'
+
+    def sy_index(i, j, k):
+      return (k, j)
+
+  in_specs = (
+      pl.BlockSpec((bm, bk), lambda i, j, k: (i, k)),
+      pl.BlockSpec((1, 1), sx_index),
+      pl.BlockSpec((bk, bn), lambda i, j, k: (k, j)),
+      pl.BlockSpec((1, 1), sy_index),
+  )
+  kernel_call = pl.pallas_call(
+      quantized_matmul_kernel,
+      out_shape=jax.ShapeDtypeStruct((mdim, ndim), dtype),
+      grid=grid,
+      in_specs=in_specs,
+      out_specs=pl.BlockSpec((bm, bn), lambda i, j, k: (i, j)),
+      interpret=INTERPRET,
+  )
+  result = kernel_call(x, sx, y, sy)
+  return result.astype(dtype)
diff --git a/tests/_src/core/dot_general_test.py b/tests/_src/core/dot_general_test.py
@@ -19,6 +19,7 @@
 from qwix._src.core import dot_general
 from qwix._src.core import einsum
 from qwix._src.core import qarray
+from qwix.contrib.kernels import quantized_matmul
 
 
 class DotGeneralTest(parameterized.TestCase):
@@ -146,6 +147,34 @@ def test_fast_dot_general_channelwise_contracting(self):
     self.assertEqual(res.shape, (4, 4))
     self.assertTrue(jnp.allclose(res, jnp.full((4, 4), 8.0), atol=0.1))
 
+  def test_kernel_dot_general(self):
+    # Exercises dot_general with use_kernel=True on a plain 2D matmul layout.
+    lhs = jnp.ones((4, 8), jnp.float32)
+    rhs = jnp.ones((8, 16), jnp.float32)
+
+    # Tiled on axis 1 (contracting) with tile size 1.
+    lhs_how = qarray.HowToQuantize(
+        qtype=jnp.int8,
+        tiled_axes={1: 1},
+    )
+    # Tiled on axis 0 (contracting) with tile size 1.
+    rhs_how = qarray.HowToQuantize(
+        qtype=jnp.int8,
+        tiled_axes={0: 1},
+    )
+
+    q_lhs = qarray.quantize(lhs, lhs_how)
+    q_rhs = qarray.quantize(rhs, rhs_how)
+
+    # bk matches the tile size of 1 used for the contracting axes above.
+    kernel_config = quantized_matmul.QuantizedMatmulConfig(bm=4, bn=16, bk=1)
+
+    res = dot_general.dot_general(
+        q_lhs,
+        q_rhs,
+        (([1], [0]), ([], [])),
+        use_kernel=True,
+        kernel_config=kernel_config,
+    )
+    # The result was previously discarded; at least pin the output shape.
+    self.assertEqual(res.shape, (4, 16))
+
 
 if __name__ == '__main__':
   absltest.main()
diff --git a/tests/contrib/kernels/quantized_matmul_test.py b/tests/contrib/kernels/quantized_matmul_test.py
@@ -0,0 +1,34 @@
+import jax.numpy as jnp
+from qwix._src.core import qarray
+from qwix.contrib.kernels import quantized_matmul
+
+from google3.testing.pybase import googletest
+
+
+class QuantizedMatmulTest(googletest.TestCase):
+  """Tests for calling the quantized matmul kernel directly."""
+
+  def test_kernel_dot_general(self):
+    lhs = jnp.ones((4, 8), jnp.float32)
+    rhs = jnp.ones((8, 16), jnp.float32)
+
+    # Tiled on axis 1 (contracting) with tile size 1.
+    lhs_how = qarray.HowToQuantize(
+        qtype=jnp.int8,
+        tiled_axes={1: 1},
+    )
+    # Tiled on axis 0 (contracting) with tile size 1.
+    rhs_how = qarray.HowToQuantize(
+        qtype=jnp.int8,
+        tiled_axes={0: 1},
+    )
+
+    q_lhs = qarray.quantize(lhs, lhs_how)
+    q_rhs = qarray.quantize(rhs, rhs_how)
+
+    # bk matches the tile size of 1 used for the contracting axes above.
+    res = quantized_matmul.q_matmul(
+        q_lhs.qvalue, q_lhs.scale, q_rhs.qvalue, q_rhs.scale, bm=4, bn=16, bk=1
+    )
+    # The result was previously discarded; pin the output shape and the
+    # default output dtype of q_matmul.
+    self.assertEqual(res.shape, (4, 16))
+    self.assertEqual(res.dtype, jnp.float32)
+
+
+if __name__ == "__main__":
+  googletest.main()