Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/backend/SYCL.md
Original file line number Diff line number Diff line change
Expand Up @@ -689,6 +689,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
| GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. (1.) |
| GGML_SYCL_GRAPH | OFF *(default)* \|ON *(Optional)* | Enable build with [SYCL Graph extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc). |
| GGML_SYCL_DNN | ON *(default)* \|OFF *(Optional)* | Enable build with oneDNN. |
| GGML_SYCL_HOST_MEM_FALLBACK | ON *(default)* \|OFF *(Optional)* | Allow host memory fallback when device memory is full during quantized weight reorder. Enables inference to continue at reduced speed (reading over PCIe) instead of failing. Requires Linux kernel 6.8+. |
| CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. |
| CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)* | Set `icpx/icx` compiler for SYCL code path. |

Expand Down
1 change: 1 addition & 0 deletions ggml/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,7 @@ option(GGML_RPC "ggml: use RPC"
option(GGML_SYCL "ggml: use SYCL" OFF)
option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
option(GGML_SYCL_GRAPH "ggml: enable graphs in the SYCL backend" ON)
option(GGML_SYCL_HOST_MEM_FALLBACK "ggml: allow host memory fallback in SYCL reorder (requires kernel 6.8+)" ON)
option(GGML_SYCL_DNN "ggml: enable oneDNN in the SYCL backend" ON)
set (GGML_SYCL_TARGET "INTEL" CACHE STRING
"ggml: sycl target device")
Expand Down
5 changes: 5 additions & 0 deletions ggml/src/ggml-sycl/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,11 @@ if (GGML_SYCL_GRAPH)
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_GRAPH)
endif()

if (GGML_SYCL_HOST_MEM_FALLBACK)
message(STATUS "find GGML_SYCL_HOST_MEM_FALLBACK")
target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_HOST_MEM_FALLBACK)
endif()

if (GGML_SYCL_DEVICE_ARCH)
target_compile_options(ggml-sycl PRIVATE -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH})
target_link_options(ggml-sycl PRIVATE -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH})
Expand Down
33 changes: 31 additions & 2 deletions ggml/src/ggml-sycl/convert.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,25 @@ static void dequantize_row_q4_0_sycl_reorder(const void *vx, dst_t *y, const int

}

template <typename dst_t>
static void dequantize_row_q8_0_sycl_reorder(const void *vx, dst_t *y, const int64_t k,
dpct::queue_ptr stream) {

dpct::has_capability_or_fail(stream->get_device(),
{sycl::aspect::fp16});

int constexpr WARP_K = WARP_SIZE * QK8_0;
const int n_warp = (k + WARP_K - 1) / WARP_K;
GGML_ASSERT(k % QK8_0 == 0);
stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, n_warp) *
sycl::range<3>(1, 1, WARP_SIZE),
sycl::range<3>(1, 1, WARP_SIZE)),
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]]{
dequantize_block_q8_0_reorder(vx, y, k, item_ct1);
});

}

template <typename dst_t>
static void dequantize_row_q4_1_sycl(const void *vx, dst_t *y, const int64_t k,
dpct::queue_ptr stream) {
Expand Down Expand Up @@ -614,7 +633,12 @@ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) {
case GGML_TYPE_Q5_1:
return dequantize_block_sycl<QK5_1, QR5_1, dequantize_q5_1>;
case GGML_TYPE_Q8_0:
return dequantize_block_sycl<QK8_0, QR8_0, dequantize_q8_0>;
if (dst->src[0]->extra &&
((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) {
return dequantize_row_q8_0_sycl_reorder;
} else {
return dequantize_block_sycl<QK8_0, QR8_0, dequantize_q8_0>;
}
case GGML_TYPE_Q2_K:
return dequantize_row_q2_K_sycl;
case GGML_TYPE_Q3_K:
Expand Down Expand Up @@ -683,7 +707,12 @@ to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) {
case GGML_TYPE_Q5_1:
return dequantize_block_sycl<QK5_1, QR5_1, dequantize_q5_1>;
case GGML_TYPE_Q8_0:
return dequantize_block_sycl<QK8_0, QR8_0, dequantize_q8_0>;
if (dst->src[0]->extra &&
((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) {
return dequantize_row_q8_0_sycl_reorder;
} else {
return dequantize_block_sycl<QK8_0, QR8_0, dequantize_q8_0>;
}
case GGML_TYPE_Q2_K:
return dequantize_row_q2_K_sycl;
case GGML_TYPE_Q3_K:
Expand Down
28 changes: 28 additions & 0 deletions ggml/src/ggml-sycl/dequantize.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,34 @@ static void dequantize_block_q4_0_reorder(const void * __restrict__ vx, dst_t *

}

// Dequantize Q8_0 from reorder layout: [all qs (k bytes)][all d values]
// Each thread handles one block of QK8_0 elements.
template<typename dst_t>
static void dequantize_block_q8_0_reorder(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t k,
const sycl::nd_item<3> &item_ct1) {

const int64_t i = item_ct1.get_group(2);
const int64_t tid = item_ct1.get_local_id(2);
const int lane_ib = i * WARP_SIZE + tid;

if (lane_ib >= k / QK8_0) {
return;
}

dst_t * y_ptr = yy + lane_ib * QK8_0;

auto qs = (const int8_t*)vx + lane_ib * QK8_0;
auto s_ptr = (const sycl::half*)((const uint8_t*)vx + k) + lane_ib;

const float d = float(*s_ptr);

#pragma unroll
for (int l = 0; l < QK8_0; ++l) {
y_ptr[l] = d * qs[l];
}

}

template<typename dst_t>
static void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int64_t nb32,
const sycl::nd_item<3> &item_ct1) {
Expand Down
Loading