microsoft · LeiWang1999 · May 5, 2025 · Apr 21, 2025 · Apr 21, 2025 · Apr 21, 2025
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -57,10 +57,16 @@ jobs:
         source bitblas_ci/bin/activate
         python -m pip install --upgrade pip
         if [ -f requirements-test.txt ]; then python -m pip install -r requirements-test.txt; fi
+        if [ -f bitblas_ci/bin/cmake ]; then
+          rm bitblas_ci/bin/cmake
+          hash -r
+        fi
 
     - name: Install project in wheel mode
       run: |
         source bitblas_ci/bin/activate
+        export PATH="/usr/bin:$PATH"
+        bash install.sh
         python -m pip install .
 
     - name: Run tests

diff --git a/.gitmodules b/.gitmodules
@@ -5,8 +5,7 @@
 [submodule "3rdparty/tilelang"]
 	path = 3rdparty/tilelang
 	url = https://github.com/tile-ai/tilelang
-	branch = bitblas
+	branch = main
 [submodule "3rdparty/cutlass"]
 	path = 3rdparty/cutlass
-	url = https://github.com/tile-ai/cutlass
-	branch = tldev
+	url = https://github.com/NVIDIA/cutlass.git
diff --git a/3rdparty/cutlass b/3rdparty/cutlass
diff --git a/3rdparty/tilelang b/3rdparty/tilelang
diff --git a/3rdparty/tvm b/3rdparty/tvm
diff --git a/README.md b/README.md
@@ -98,7 +98,7 @@ We are continuously expanding the support matrix. If you have any specific requi
 - **Python Version**: >= 3.8
 - **CUDA Version**: >= 11.0
 
-The easiest way to install BitBLAS is direcly from the PyPi using pip. To install the latest version, run the following command in your terminal.
+The easiest way to install BitBLAS is directly from PyPI using pip. To install the latest version, run the following command in your terminal.
 
 ```bash
 pip install bitblas

diff --git a/bitblas/base/roller/bestfit.py b/bitblas/base/roller/bestfit.py
@@ -1,8 +1,10 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
+"""Benefit For BitBLAS Schedule"""
+
 
-"""Benifit For BitBLAS Schedule"""
 class Block:
+
     def __init__(self, start, end, is_free):
         self.start = start
         self.end = end
@@ -21,6 +23,7 @@ def __repr__(self) -> str:
 
 
 class BestFit:
+
     def __init__(self, align=32):
         self.limit = 0
         self.list = []
@@ -30,17 +33,16 @@ def malloc(self, size) -> Block:
         size = (size + self.align - 1) // self.align * self.align
         found = None
         for block in self.list:
-            if block.is_free and block.size() >= size:
-                if not found or found.size() > block.size():
-                    found = block
+            if (block.is_free and block.size() >= size and
+                (not found or block.size() < found.size())):
+                found = block
         if found:
             found.is_free = False
             remain = found.size() - size
             if remain != 0:
                 found.end -= remain
                 self.list.insert(
-                    self.list.index(found) + 1, Block(found.end, found.end + remain, True)
-                )
+                    self.list.index(found) + 1, Block(found.end, found.end + remain, True))
             return found
         elif len(self.list) > 0 and self.list[-1].is_free:
             add = size - self.list[-1].size()

diff --git a/bitblas/base/roller/node.py b/bitblas/base/roller/node.py
@@ -232,8 +232,9 @@ def propagate(self, tile, rstep: Optional[Dict] = None, targets=None):
         if rstep is None:
             rstep = {}
         shape = {
-            self.block_analyzer.get_output_buffers(block)[0].name:
-            [tvm.arith.ConstIntBound(0, val - 1) for val in tile] for block in self.schedule_stages
+            self.block_analyzer.get_output_buffers(block)[0].name: [
+                tvm.arith.ConstIntBound(0, val - 1) for val in tile
+            ] for block in self.schedule_stages
         }
         return self.ana.infer(shape, rstep, targets)
 

diff --git a/bitblas/base/roller/policy/__init__.py b/bitblas/base/roller/policy/__init__.py
@@ -1,5 +1,5 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
 
-from .default import DefaultPolicy
-from .tensorcore import TensorCorePolicy
+from .default import DefaultPolicy  # noqa: F401
+from .tensorcore import TensorCorePolicy  # noqa: F401
diff --git a/bitblas/base/roller/shape_inference/__init__.py b/bitblas/base/roller/shape_inference/__init__.py
@@ -1,4 +1,4 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
 
-from .tir import get_analyzer_by_tir # pylint: disable=unused-import
+from .tir import get_analyzer_by_tir  # pylint: disable=unused-import # noqa: F401
diff --git a/bitblas/base/roller/shape_inference/common.py b/bitblas/base/roller/shape_inference/common.py
@@ -8,16 +8,21 @@
 
 
 class Statement():
-    def __init__(self, output: str, dependent_region: dict, var_map: OrderedDict, range_map: OrderedDict):
+
+    def __init__(self, output: str, dependent_region: dict, var_map: OrderedDict,
+                 range_map: OrderedDict):
         self.output = output
         self.dependent_region = dependent_region
         self.var_map = var_map
         self.range_map = range_map
 
+
 def _merge_two_bounds(x: arith.ConstIntBound, y: arith.ConstIntBound):
     return arith.ConstIntBound(min(x.min_value, y.min_value), max(x.max_value, y.max_value))
 
+
 class InputShapeInference():
+
     def __init__(self, deps: List[Statement]):
         self.deps = deps
 
@@ -34,17 +39,19 @@ def _infer(self, shape: Dict[str, List[arith.ConstIntBound]], rstep: Dict[str, i
             for name, regions in dep.dependent_region.items():
                 for region in regions:
                     bounds = [ana.const_int_bound(index) for index in region]
-                    if name in shape: # simply merge two bounds
+                    if name in shape:  # simply merge two bounds
                         bounds = [_merge_two_bounds(x, y) for x, y in zip(shape[name], bounds)]
                     shape[name] = bounds
 
         for name, bounds in shape.items():
             shape[name] = [c.max_value - c.min_value + 1 for c in bounds]
         return shape
 
-    def infer(self, shape, rstep: Dict[str, int] = {}):
+    def infer(self, shape, rstep: Dict[str, int] = None):
+        if rstep is None:
+            rstep = {}
         if isinstance(shape, (list, tuple)):
-            shape = {"output0" : [arith.ConstIntBound(0, val - 1) for val in shape]}
+            shape = {"output0": [arith.ConstIntBound(0, val - 1) for val in shape]}
         shape = self._infer(shape, rstep)
         return shape
 
@@ -63,4 +70,3 @@ def get_input_exprs(self, output_exprs):
                 input_expr = [ana.simplify(index) for index in region]
                 result[name] = input_expr
         return result
-
diff --git a/bitblas/base/roller/shape_inference/tir.py b/bitblas/base/roller/shape_inference/tir.py
@@ -8,6 +8,7 @@
 
 
 class Statement:
+
     def __init__(self, block_analyzer, block: BlockRV):
         self.block_analyzer = block_analyzer
         self.block = block
@@ -79,6 +80,7 @@ def __repr__(self):
 
 
 class DependencyAnalysis(object):
+
     def __init__(self, deps):
         self.deps = deps
         # issue: duplicate name when we have two same ops.
@@ -90,8 +92,8 @@ def _construct_unique_name2dep(self, deps):
         This is a workaround for the issue that we have two same ops' fuse case.
         See https://github.com/apache/tvm/issues/16433
         """
-        _names:Set = set()
-        name2dep:Mapping = {}
+        _names: Set = set()
+        name2dep: Mapping = {}
         for dep in deps:
             output_buffer = dep.block_analyzer.get_output_buffers(dep.block)[0]
             base_name = output_buffer.name
@@ -105,7 +107,7 @@ def _construct_unique_name2dep(self, deps):
                 _names.add(base_name)
             name2dep[base_name] = dep
         return name2dep
-        
+
     def get_or_create_node(self, name):
         if name not in self.mapping:
             self.mapping[name] = TensorDepNode(name)
@@ -114,8 +116,7 @@ def get_or_create_node(self, name):
     def traverse_dependencies(self, compute):
         if isinstance(compute, Statement):
             node = self.get_or_create_node(
-                compute.block_analyzer.get_output_buffers(compute.block)[0].name
-            )
+                compute.block_analyzer.get_output_buffers(compute.block)[0].name)
             # Loop through input tensors
             for input_buffer in compute.block_analyzer.get_input_buffers(compute.block):
                 # Get the input node
@@ -169,6 +170,7 @@ def _find_path_recursive(self, current_node, target_name, visited, path):
 
 
 class InputShapeInference:
+
     def __init__(self, deps: List[Statement]):
         self.deps = deps
         self.target_mapping = {}
@@ -242,9 +244,12 @@ def construct_dependency_target(self, targets: Tuple[str]):
         self.target_mapping[targets] = input_vars, mapping
         return input_vars, mapping
 
-    def infer(
-        self, shape: Dict[str, List[arith.ConstIntBound]], rstep: Dict[str, int] = {}, targets=None
-    ):
+    def infer(self,
+              shape: Dict[str, List[arith.ConstIntBound]],
+              rstep: Dict[str, int] = None,
+              targets=None):
+        if rstep is None:
+            rstep = {}
         compute_targets = tuple(shape.keys())
         input_vars, mapping = self.construct_dependency_target(compute_targets)
         ana = arith.Analyzer()
@@ -257,8 +262,7 @@ def infer(
             # assume the dom.min is always 0, maybe we can extend the IterInfo to include the min value.
             if ax.var.name in rstep:
                 bound = arith.ConstIntBound(
-                    int(ax.dom.min), int(ax.dom.min + min(ax.dom.extent, rstep[ax.var.name]) - 1)
-                )
+                    int(ax.dom.min), int(ax.dom.min + min(ax.dom.extent, rstep[ax.var.name]) - 1))
             else:
                 bound = arith.ConstIntBound(int(ax.dom.min), int(ax.dom.min + ax.dom.extent - 1))
             ana.update(ax.var, bound, True)
@@ -318,16 +322,14 @@ def get_input_exprs(self, output_exprs):
 
 
 def region_exist_in_list(a, list) -> bool:
+
     def expr_is_same(a, b) -> bool:
         if isinstance(a, tir.IntImm) and isinstance(b, tir.IntImm):
             return a.value == b.value
         return structural_equal(a, b)
 
     def region_is_same(a, b) -> bool:
-        for indice_a, indice_b in zip(a, b):
-            if not expr_is_same(indice_a, indice_b):
-                return False
-        return True
+        return all(expr_is_same(indice_a, indice_b) for indice_a, indice_b in zip(a, b))
 
     return any([region_is_same(a, x) for x in list])
 
@@ -340,9 +342,7 @@ def walk_indice(expr):
             return expr
         else:
             return None
-    elif isinstance(expr, tir.expr.ConstExpr):
-        return expr
-    elif isinstance(expr, tir.Var):
+    elif isinstance(expr, (tir.expr.ConstExpr, tir.Var)):
         return expr
     elif isinstance(expr, tir.ProducerLoad):
         return None
@@ -381,7 +381,7 @@ def fvisit(x):
                         with T.init():
                             T_dense_reindex[T.int64(0), v0, v1] = T.float16(0)
                         T_dense_reindex[T.int64(0), v0, v1] = T_dense_reindex[T.int64(0), v0, v1] + A_reindex[T.int64(0), v0, v2] * B_reindex[T.int64(0), v1, v2]
-                For exmaple, the T_dense_reindex has three dims, however there're only two spatial loops.
+                For example, the T_dense_reindex has three dims, however there're only two spatial loops.
                 """
                 continue
             index.append(expr)

diff --git a/bitblas/gpu/base.py b/bitblas/gpu/base.py
@@ -15,7 +15,7 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-# 
+#
 # /* Modifications Copyright (c) Microsoft. */
 # The code below is mostly copied from apache/tvm base.py in dlight.
 """Base schedule rule for GPU operators."""

diff --git a/bitblas/gpu/fallback.py b/bitblas/gpu/fallback.py
@@ -15,7 +15,7 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-# 
+#
 # Modifications Copyright (c) Microsoft.
 # The code below is mostly copied from apache/tvm fallback.py in dlight.
 # pylint: disable=missing-docstring
@@ -61,15 +61,9 @@ def apply(  # pylint: disable=too-many-locals,missing-docstring
             dom_kind = block.dom_kind()
             block = block.block_rv
 
-            if (
-                any(
-                    [
-                        sch.get(loop_rv).thread_binding is not None
-                        for loop_rv in sch.get_loops(block)
-                    ]
-                )
-                or len(sch.get_loops(block)) == 0
-            ):
+            if (any([
+                    sch.get(loop_rv).thread_binding is not None for loop_rv in sch.get_loops(block)
+            ]) or len(sch.get_loops(block)) == 0):
                 continue
 
             for loop, iter_type in zip(sch.get_loops(block), dom_kind):
@@ -92,4 +86,3 @@ def apply(  # pylint: disable=too-many-locals,missing-docstring
             sch.decompose_reduction(block, r_loop)
 
         return sch
-
diff --git a/bitblas/gpu/gemv.py b/bitblas/gpu/gemv.py
@@ -500,7 +500,7 @@ def apply(
         if not isinstance(len_S, int):
             TS, TR = 1, 64
 
-        while TS * TR > target.max_num_threads:
+        while target.max_num_threads < TS * TR:
             if TS > 1:
                 TS //= 2
             else:
+0 −6		apps/microtvm/cmsisnn/requirements.txt
+0 −6		apps/microtvm/ethosu/requirements.txt
+0 −2		apps/microtvm/pyproject.toml
+1 −1		apps/sgx/README.md
+0 −1		conda/recipe/meta.yaml
+1 −1		docker/Dockerfile.demo_opencl
+0 −2		docker/install/ubuntu2004_install_python_package.sh
+0 −2		docker/install/ubuntu_install_python_package.sh
+7 −0		include/tvm/arith/analyzer.h
+18 −13		include/tvm/runtime/data_type.h
+0 −4		python/gen_requirements.py
+0 −16		python/tvm/_ffi/base.py
+1 −1		python/tvm/_ffi/libinfo.py
+5 −4		python/tvm/contrib/pickle_memoize.py
+2 −2		python/tvm/relay/transform/memory_plan.py
+5 −2		python/tvm/script/parser/core/parser.py
+11 −1		python/tvm/script/parser/tir/parser.py
+5 −11		python/tvm/target/generic_func.py
+2 −2		python/tvm/te/hybrid/__init__.py
+5 −4		python/tvm/te/tag.py
+3 −3		src/arith/canonical_simplify.cc
+6 −0		src/arith/const_int_bound.cc
+1 −1		src/ir/expr.cc
+1 −1		src/relay/transforms/to_mixed_precision.cc
+2 −2		src/runtime/contrib/cublas/cublas.cc
+3 −0		src/runtime/cuda/cuda_device_api.cc
+2 −1		src/script/ir_builder/tir/ir.cc
+11 −0		src/script/printer/ir/misc.cc
+1 −1		src/target/llvm/codegen_llvm.cc
+1 −1		src/target/source/codegen_c.cc
+5 −5		src/target/source/codegen_cuda.cc
+0 −2		src/target/source/literal/cuda_half_t.h
+11 −8		src/tir/ir/expr.cc
+4 −4		src/tir/op/op.cc
+1 −1		src/tir/transforms/dtype_conversion.h
+2 −1		src/tir/transforms/merge_shared_memory_allocations.cc
+3 −2		src/tir/transforms/storage_rewrite.cc