Skip to content

Commit 0dbebb9

Browse files
author
root
committed
IFU-master-2025-02-07
2 parents e4fe8b5 + a0a95fd commit 0dbebb9

24 files changed

Lines changed: 1307 additions & 179 deletions

.github/workflows/python-package.yml

Lines changed: 50 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ jobs:
5858
# This job matrix builds the CUDA versions of the libraries for platforms that support CUDA (Linux x64/aarch64 + Windows x64)
5959
##
6060
build-shared-libs-cuda:
61+
if: github.ref_name != 'multi-backend-refactor'
6162
strategy:
6263
matrix:
6364
os: [ubuntu-latest, windows-latest]
@@ -148,7 +149,7 @@ jobs:
148149
build-wheels:
149150
needs:
150151
- build-shared-libs
151-
- build-shared-libs-cuda
152+
# - build-shared-libs-cuda reduce the pkg size + build times for the preview release
152153
- build-shared-libs-rocm
153154
strategy:
154155
matrix:
@@ -166,6 +167,13 @@ jobs:
166167
runs-on: ${{ matrix.os }}
167168
steps:
168169
- uses: actions/checkout@v4
170+
with:
171+
fetch-depth: 1 # shallow clone
172+
- name: Fetch tags for dynamic versioning in setup.py
173+
run: |
174+
git fetch --depth=1 origin --tags
175+
echo "Available Git tags:"
176+
git tag -n
169177
- name: Download build artifact
170178
uses: actions/download-artifact@v4
171179
with:
@@ -183,7 +191,8 @@ jobs:
183191
python-version: ${{ matrix.python-version }}
184192
cache: pip
185193
- run: pip install build wheel
186-
- run: python -m build .
194+
# for now need to do the below instead of prior `python -m build .`, which didn't allow us to access git tags
195+
- run: python -m build --sdist && python -m build --wheel
187196
- name: Determine and Set Platform Tag, then Tag Wheel
188197
shell: bash
189198
run: |
@@ -197,6 +206,45 @@ jobs:
197206
path: dist/bitsandbytes-*.whl
198207
retention-days: 7
199208

209+
upload-pre-release-wheels:
210+
name: Create release and upload artifacts
211+
runs-on: ubuntu-latest
212+
if: github.ref_name == 'multi-backend-refactor'
213+
permissions:
214+
contents: write
215+
needs:
216+
- build-wheels
217+
steps:
218+
- name: Download and rename artifacts
219+
uses: actions/download-artifact@v4
220+
with:
221+
path: tmp/
222+
pattern: "bdist_wheel_*"
223+
merge-multiple: true
224+
- name: Inspect tmp directory after downloading artifacts
225+
run: ls -alFR tmp/
226+
- name: Move and rename wheel files with pattern replacement
227+
run: |
228+
mkdir -p wheels/
229+
find tmp/ -type f -name '*.whl' -print0 | while IFS= read -r -d '' wheel; do
230+
wheel_filename=$(basename "$wheel")
231+
# Remove the git hash, e.g. `+1234567`, for a stable download link on the multi-backend pre-release
232+
cleaned_filename=$(echo "$wheel_filename" | sed -E 's/\+[0-9a-f]{7}-/-/g')
233+
mv "$wheel" "wheels/$cleaned_filename"
234+
done
235+
- name: Inspect wheels directory after renaming files
236+
run: ls -alFR wheels/
237+
- name: Create release and upload artifacts
238+
uses: softprops/action-gh-release@v2.0.8
239+
with:
240+
files: wheels/*.whl
241+
prerelease: true
242+
name: Multi-Backend Preview
243+
tag_name: continuous-release_multi-backend-refactor
244+
make_latest: false
245+
draft: false
246+
target_commitish: ${{ github.sha }}
247+
200248
audit-wheels:
201249
needs: build-wheels
202250
runs-on: ubuntu-latest

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,8 @@ dmypy.json
151151
# vim
152152
*.swp
153153

154+
# BNB-specific stuff
154155
dependencies
155156
cuda_build
156157
output/
158+
bitsandbytes/_version.py

CMakeLists.txt

Lines changed: 50 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# For GCC: `cmake -B build . && cmake --build build`
44
# For MSVC: `cmake -B build . && cmake --build build --config Release`
55
# You can also use the following options and variables
6-
# - COMPUTE_BACKEND: Set to `cpu`, `cuda`, `hip` or `mps` to select the backend
6+
# - COMPUTE_BACKEND: Set to `cpu`, `cuda`, `hip`, `mps` or `npu` to select the backend
77
# - NO_CUBLASLT: Default OFF, will skip building/linking CUBLASLT support
88
# - CUDA_VERSION: The expected CUDA version, for sanity checking. The actual version
99
# is whatever CMake finds on your path.
@@ -29,11 +29,12 @@ set(CUDA_FILES csrc/ops.cu csrc/kernels.cu)
2929
set(HIP_FILES csrc/ops.hip csrc/kernels.hip)
3030
set(MPS_FILES csrc/mps_ops.mm)
3131
set(METAL_FILES csrc/mps_kernels.metal)
32+
set(NPU_FILES csrc/npu_ops.cpp)
3233
# C++ sources are always included
3334
list(APPEND SRC_FILES ${CPP_FILES})
3435

35-
set(COMPUTE_BACKEND "cpu" CACHE STRING "The compute backend to use (cpu, cuda, hip, mps)")
36-
set_property(CACHE COMPUTE_BACKEND PROPERTY STRINGS cpu cuda hip mps)
36+
set(COMPUTE_BACKEND "cpu" CACHE STRING "The compute backend to use (cpu, cuda, hip, mps, npu)")
37+
set_property(CACHE COMPUTE_BACKEND PROPERTY STRINGS cpu cuda hip mps npu)
3738
option(PTXAS_VERBOSE "Pass through -v flag to PTX Assembler" OFF)
3839

3940
if(APPLE)
@@ -69,6 +70,11 @@ elseif(${COMPUTE_BACKEND} STREQUAL "mps")
6970
set(BUILD_CUDA OFF)
7071
set(BUILD_HIP OFF)
7172
set(BUILD_MPS ON)
73+
elseif(${COMPUTE_BACKEND} STREQUAL "npu")
74+
set(BUILD_CUDA OFF)
75+
set(BUILD_HIP OFF)
76+
set(BUILD_MPS OFF)
77+
set(BUILD_NPU ON)
7278
else()
7379
set(BUILD_CUDA OFF)
7480
set(BUILD_HIP OFF)
@@ -82,6 +88,11 @@ if(BUILD_CUDA)
8288
# This needs to be added *before* we try to enable the CUDA language so CMake's compiler check passes.
8389
if(MSVC AND MSVC_VERSION VERSION_GREATER_EQUAL 1940)
8490
string(APPEND CMAKE_CUDA_FLAGS " --allow-unsupported-compiler")
91+
92+
# This is needed to build with VS2022 17.11+ and CUDA < 12.4.
93+
if (MSVC_VERSION VERSION_GREATER_EQUAL 1941)
94+
string(APPEND CMAKE_CUDA_FLAGS " -D_ALLOW_COMPILER_AND_STL_VERSION_MISMATCH")
95+
endif()
8596
endif()
8697

8798
enable_language(CUDA) # This will fail if CUDA is not found
@@ -227,6 +238,33 @@ elseif(BUILD_MPS)
227238
COMMENT "Compiling Metal kernels"
228239
VERBATIM)
229240
add_custom_target(metallib DEPENDS "bitsandbytes/bitsandbytes.metallib")
241+
elseif(BUILD_NPU)
242+
list(APPEND SRC_FILES ${NPU_FILES})
243+
244+
set(SOC_VERSION "Ascend910B4" CACHE STRING "system on chip type")
245+
set(ASCEND_CANN_PACKAGE_PATH $ENV{ASCEND_HOME_PATH} CACHE
246+
STRING "ASCEND CANN package installation directory"
247+
)
248+
249+
# ${KERNEL_FILES} are used to compile library, push files written by ascendc in ${KERNEL_FILES}.
250+
# ref to cmake/npu.cmake ascendc_library, cmake/cpu.cmake add_library
251+
# file(GLOB KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/csrc/npu_kernels.cpp)
252+
file(GLOB KERNEL_FILES csrc/npu_kernels.cpp)
253+
254+
if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
255+
set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
256+
elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake)
257+
set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake)
258+
else()
259+
message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the CANN package is installed")
260+
endif()
261+
include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)
262+
263+
# ascendc_library use to add kernel file to generate ascendc library
264+
ascendc_library(ascendc_kernels_npu STATIC ${KERNEL_FILES})
265+
266+
string(APPEND BNB_OUTPUT_NAME "_npu")
267+
add_compile_definitions(BUILD_NPU)
230268
else()
231269
string(APPEND BNB_OUTPUT_NAME "_cpu")
232270
set(GPU_SOURCES)
@@ -244,7 +282,11 @@ endif()
244282

245283
set_source_files_properties(${CPP_FILES} PROPERTIES LANGUAGE CXX)
246284
add_library(bitsandbytes SHARED ${SRC_FILES})
247-
target_compile_features(bitsandbytes PUBLIC cxx_std_14)
285+
if(BUILD_NPU)
286+
target_compile_features(bitsandbytes PUBLIC cxx_std_17)
287+
else()
288+
target_compile_features(bitsandbytes PUBLIC cxx_std_14)
289+
endif()
248290
target_include_directories(bitsandbytes PUBLIC csrc include)
249291

250292

@@ -301,6 +343,10 @@ if(BUILD_MPS)
301343
add_dependencies(bitsandbytes metallib)
302344
target_link_libraries(bitsandbytes objc "-framework Foundation" "-framework Metal" "-framework MetalPerformanceShaders" "-framework MetalPerformanceShadersGraph")
303345
endif()
346+
if(BUILD_NPU)
347+
target_compile_options(bitsandbytes PRIVATE -O2 -std=c++17)
348+
target_link_libraries(bitsandbytes PRIVATE $<BUILD_INTERFACE:host_intf_pub> ascendc_kernels_npu)
349+
endif()
304350

305351
if(WIN32)
306352
set_target_properties(bitsandbytes PROPERTIES PREFIX "lib")

_typos.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,15 @@
33
[default]
44
extend-ignore-re = [
55
"@Ther-nul", # valid Github user
6+
"CANN", # CANN (Compute Architecture for Neural Networks) is a heterogeneous computing architecture for Ascend NPU
67
]
78

89
[default.extend-identifiers]
910

1011
[type.py.extend-words]
1112
"BA" = "BA" # used as a commented-out variable in tests
13+
"cann" = "cann" # cann (Compute Architecture for Neural Networks) is a heterogeneous computing architecture for Ascend NPU
14+
1215

1316
[type.cuda.extend-words]
1417
"subtile" = "subtile"

bitsandbytes/__init__.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@
33
# This source code is licensed under the MIT license found in the
44
# LICENSE file in the root directory of this source tree.
55

6+
# Import the dynamically generated version from _version.py (see setup.py)
7+
from ._version import __version__ # isort: skip # type: ignore
8+
69
import torch
710

811
from . import research, utils
@@ -14,15 +17,15 @@
1417
matmul_cublas,
1518
mm_cublas,
1619
)
17-
from .backends import register_backend
20+
from .backends import backends, register_backend
1821
from .backends.cpu import CPUBackend
1922
from .backends.npu import NPUBackend
2023
from .cextension import lib
21-
from .nn import modules
2224

2325
features = {"multi_backend"}
2426
supported_torch_devices = {
2527
"cuda", # includes ROCm
28+
"npu", # Ascend NPU
2629
"xpu", # Intel GPU
2730
"cpu",
2831
}
@@ -61,6 +64,11 @@
6164
if hasattr(torch, "npu") and torch.npu.is_available():
6265
register_backend("npu", NPUBackend())
6366

67+
68+
# import modules only after the backends have been decided
69+
if backends:
70+
from .nn import modules
71+
6472
# TODO: Other potential backends:
6573
# XLA - Google TPU / PJRT runtime
6674
# HPU - Habana / Intel Gaudi
@@ -73,5 +81,3 @@
7381
"optim.optimizer.Optimizer8bit": False,
7482
"optim.optimizer.MockArgs": False,
7583
}
76-
77-
__version__ = "0.43.3.dev"

bitsandbytes/autograd/_functions.py

Lines changed: 49 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -221,7 +221,7 @@ def backward(ctx, grad_output):
221221

222222
def supports_igemmlt(device: torch.device) -> bool:
223223
"""check if this device supports the optimized int8 kernel"""
224-
if device == torch.device("cpu"):
224+
if device == torch.device("cpu") or torch.device("xpu"):
225225
return True
226226
if torch.version.hip:
227227
return False if BNB_HIP_VERSION < 601 else True
@@ -463,7 +463,9 @@ def backward(ctx, grad_output):
463463
if len(grad_output.shape) == 3:
464464
grad_output = grad_output.reshape(-1, grad_output.shape[-1]).contiguous()
465465

466-
Cgrad, Cgradt, SCgrad, SCgradt, coo_tensor = F.double_quant(grad_output.to(torch.float16))
466+
Cgrad, Cgradt, SCgrad, SCgradt, coo_tensor = None, None, None, None, None
467+
if req_gradB or (req_gradA and state.CBt):
468+
Cgrad, Cgradt, SCgrad, SCgradt, coo_tensor = F.double_quant(grad_output.to(torch.float16))
467469
if req_gradB:
468470
CxAt, SAt = F.transform(CAt, formatB, transpose=True)
469471
C32grad, Sgrad = F.transform(Cgradt, "col32", transpose=True)
@@ -517,7 +519,12 @@ def forward(ctx, A, B, out=None, bias=None, quant_state: Optional[F.QuantState]
517519

518520
# 1. Dequantize
519521
# 2. MatmulnN
520-
output = torch.nn.functional.linear(A, F.dequantize_4bit(B, quant_state).to(A.dtype).t(), bias)
522+
if A.device.type == "npu":
523+
output = torch.matmul(A, F.dequantize_4bit(B, quant_state).to(A.dtype).t())
524+
if bias is not None:
525+
output += bias
526+
else:
527+
output = torch.nn.functional.linear(A, F.dequantize_4bit(B, quant_state).to(A.dtype).t(), bias)
521528

522529
# 3. Save state
523530
ctx.state = quant_state
@@ -548,11 +555,37 @@ def backward(ctx, grad_output):
548555
# not supported by PyTorch. TODO: create work-around
549556
# if req_gradB: grad_B = torch.matmul(grad_output.t(), A)
550557
if req_gradA:
551-
grad_A = torch.matmul(grad_output, F.dequantize_4bit(B, ctx.state).to(grad_output.dtype).t())
558+
if grad_output.device.type == "npu":
559+
grad_A = torch.matmul(grad_output, F.dequantize_4bit(B, ctx.state).to(grad_output.dtype))
560+
else:
561+
grad_A = torch.matmul(grad_output, F.dequantize_4bit(B, ctx.state).to(grad_output.dtype).t())
552562

553563
return grad_A, grad_B, None, grad_bias, None
554564

555565

566+
class MatMul8bitFp(torch.autograd.Function):
567+
# For Intel CPU and XPU, the double quant has many unsafe operations which will break finetuning.
568+
# We currently use dequant + matmul to run finetuning instead.
569+
570+
@staticmethod
571+
def forward(ctx, A, B, out=None, bias=None, state=MatmulLtState):
572+
CB = B.data.to(A.dtype).mul_(state.SCB.unsqueeze(1).mul(1.0 / 127.0)).t()
573+
output = torch.matmul(A, CB).to(A.dtype)
574+
ctx.state = state
575+
ctx.dtype_A = A.dtype
576+
ctx.grad_shape = A.shape
577+
return output
578+
579+
@staticmethod
580+
def backward(ctx, grad_output):
581+
state = ctx.state
582+
B = state.CxB if state.CxB is not None else state.CB
583+
CB = B.to(ctx.dtype_A).mul_(state.SCB.unsqueeze(1).mul(1.0 / 127.0))
584+
grad_A = torch.matmul(grad_output, CB).view(ctx.grad_shape).to(ctx.dtype_A)
585+
586+
return grad_A, None, None, None, None
587+
588+
556589
def matmul(
557590
A: torch.Tensor,
558591
B: torch.Tensor,
@@ -564,6 +597,8 @@ def matmul(
564597
state = state or MatmulLtState()
565598
if threshold > 0.0:
566599
state.threshold = threshold
600+
if A.device.type in ("cpu", "xpu") and state.is_training:
601+
return MatMul8bitFp.apply(A, B, out, bias, state)
567602
return MatMul8bitLt.apply(A, B, out, bias, state)
568603

569604

@@ -575,8 +610,16 @@ def matmul_4bit(
575610
bias=None,
576611
):
577612
assert quant_state is not None
578-
if (A.numel() == A.shape[-1] or A.device.type == "cpu") and A.requires_grad == False:
579-
# CPU backend does not require A to be a vector
613+
if A.device.type in ("cpu", "xpu") and A.requires_grad == False:
614+
if getattr(quant_state, "ipex", False):
615+
B = B.t() if len(B.shape) == 2 else B
616+
out = F.gemv_4bit(A, B, out, state=quant_state)
617+
if bias is not None:
618+
out += bias
619+
return out
620+
else:
621+
return MatMul4Bit.apply(A, B, out, bias, quant_state)
622+
elif A.numel() == A.shape[-1] and A.requires_grad == False and A.device.type != "npu":
580623
if A.shape[-1] % quant_state.blocksize != 0:
581624
warn(
582625
f"Some matrices hidden dimension is not a multiple of {quant_state.blocksize} and efficient inference kernels are not supported for these (slow). Matrix input size found: {A.shape}",

0 commit comments

Comments
 (0)