Commit cb2443e

laneeeee and zuolve.lai authored
feat: support iluvatar backend qwen3 0.6b run through (#481)
Co-authored-by: zuolve.lai <zuolve.lai@iluvatar.com>
1 parent cbad9d4 commit cb2443e

37 files changed: +1200 −35 lines

CMakeLists.txt

Lines changed: 32 additions & 1 deletion

@@ -3,6 +3,7 @@ set_property(GLOBAL PROPERTY USE_FOLDERS ON)
 
 option(USE_NPU "Enable NPU support" OFF)
 option(USE_MLU "Enable MLU support" OFF)
+option(USE_ILU "Enable ILU support" OFF)
 option(USE_CUDA "Enable CUDA support" OFF)
 add_compile_definitions(YLT_ENABLE_IBV)
 add_definitions(-DYLT_ENABLE_IBV)
@@ -105,7 +106,7 @@ set(CMAKE_CXX_STANDARD 20)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS ON)
 
-if(USE_NPU)
+if(USE_NPU OR USE_ILU)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
   add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0)
 elseif(USE_MLU OR USE_CUDA)
@@ -208,6 +209,19 @@ if(USE_CUDA)
   message(STATUS "TORCH_CUDA_ARCH_LIST: ${TORCH_CUDA_ARCH_LIST}")
 endif()
 
+if(USE_ILU)
+  set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake/Modules;${CMAKE_MODULE_PATH}")
+  set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+  set(CMAKE_CUDA_ARCHITECTURES "ivcore11")
+  set(WARNINGS_AS_ERRORS OFF)
+  if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+    add_definitions(
+      -Wno-c++11-narrowing
+      -Wno-thread-safety-analysis
+    )
+  endif()
+endif()
+
 # configure vcpkg
 # have to set CMAKE_TOOLCHAIN_FILE before first project call.
 # if (DEFINED ENV{VCPKG_ROOT} AND NOT DEFINED CMAKE_TOOLCHAIN_FILE)
@@ -425,6 +439,23 @@ if(USE_CUDA)
   )
 endif()
 
+if(USE_ILU)
+  add_definitions(-DUSE_ILU)
+  set(CMAKE_VERBOSE_MAKEFILE ON)
+  include_directories(
+    $ENV{PYTHON_INCLUDE_PATH}
+    $ENV{PYTORCH_INSTALL_PATH}/include
+    $ENV{PYTORCH_INSTALL_PATH}/include/torch/csrc/api/include
+    $ENV{IXFORMER_INSTALL_PATH}/csrc/include/ixformer
+  )
+
+  link_directories(
+    $ENV{PYTHON_LIB_PATH}
+    $ENV{PYTORCH_INSTALL_PATH}/lib
+    $ENV{IXFORMER_INSTALL_PATH}
+  )
+endif()
+
 # check if USE_CXX11_ABI is set correctly
 # if (DEFINED USE_CXX11_ABI)
 #   parse_make_options(${TORCH_CXX_FLAGS} "TORCH_CXX_FLAGS")
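Note: the new USE_ILU blocks take their include and link paths from environment variables rather than find_package. A minimal sketch of configuring an ILU build by hand, mirroring the variables that setup.py exports below; the install locations are illustrative placeholders, not project defaults:

import os
import subprocess

env = os.environ.copy()
# These feed the include_directories()/link_directories() calls in the
# USE_ILU block above; the paths below are hypothetical.
env["PYTHON_INCLUDE_PATH"] = "/usr/include/python3.10"
env["PYTHON_LIB_PATH"] = "/usr/lib/python3.10"
env["PYTORCH_INSTALL_PATH"] = "/usr/local/lib/python3.10/dist-packages/torch"
env["IXFORMER_INSTALL_PATH"] = "/usr/local/lib/python3.10/dist-packages/ixformer"

# CMAKE_CUDA_ARCHITECTURES is pinned to "ivcore11" inside CMakeLists.txt,
# so only the feature flag is needed at configure time.
subprocess.run(["cmake", "-S", ".", "-B", "build", "-DUSE_ILU=ON"],
               env=env, check=True)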

setup.py

Lines changed: 29 additions & 4 deletions

@@ -38,6 +38,12 @@ def get_device_type():
     if torch.cuda.is_available():
         return "cuda"
 
+    try:
+        import ixformer
+        return "ilu"
+    except ImportError:
+        pass
+
     try:
         import torch_mlu
         if torch.mlu.is_available():
@@ -143,6 +149,14 @@ def get_torch_mlu_root_path():
     except ImportError:
         return None
 
+def get_ixformer_root_path():
+    try:
+        import ixformer
+        import os
+        return os.path.dirname(os.path.abspath(ixformer.__file__))
+    except ImportError:
+        return None
+
 def get_nccl_root_path():
     try:
         from nvidia import nccl
@@ -253,7 +267,14 @@ def set_cuda_envs():
     os.environ["LIBTORCH_ROOT"] = get_torch_root_path()
     os.environ["PYTORCH_INSTALL_PATH"] = get_torch_root_path()
     os.environ["CUDA_TOOLKIT_ROOT_DIR"] = "/usr/local/cuda"
-
+
+def set_ilu_envs():
+    os.environ["PYTHON_INCLUDE_PATH"] = get_python_include_path()
+    os.environ["PYTHON_LIB_PATH"] = get_torch_root_path()
+    os.environ["LIBTORCH_ROOT"] = get_torch_root_path()
+    os.environ["PYTORCH_INSTALL_PATH"] = get_torch_root_path()
+    os.environ["IXFORMER_INSTALL_PATH"] = get_ixformer_root_path()
+
 class CMakeExtension(Extension):
     def __init__(self, name: str, path: str, sourcedir: str = "") -> None:
         super().__init__(name, sources=[])
@@ -337,7 +358,7 @@ def build_extension(self, ext: CMakeExtension):
             f"-DDEVICE_ARCH={self.arch.upper()}",
             f"-DINSTALL_XLLM_KERNELS={'ON' if self.install_xllm_kernels else 'OFF'}",
         ]
-
+
         if self.device == "a2" or self.device == "a3":
             cmake_args += ["-DUSE_NPU=ON"]
             # set npu environment variables
@@ -352,6 +373,9 @@ def build_extension(self, ext: CMakeExtension):
                            f"-DCMAKE_CUDA_ARCHITECTURES={cuda_architectures}"]
             # set cuda environment variables
             set_cuda_envs()
+        elif self.device == "ilu":
+            cmake_args += ["-DUSE_ILU=ON"]
+            set_ilu_envs()
         else:
             raise ValueError("Please set --device to a2 or a3 or mlu or cuda.")
 
@@ -375,6 +399,7 @@ def build_extension(self, ext: CMakeExtension):
 
         build_args = ["--config", build_type]
         max_jobs = os.getenv("MAX_JOBS", str(os.cpu_count()))
+        # max_jobs="2"
         build_args += ["-j" + max_jobs]
 
         env = os.environ.copy()
@@ -604,9 +629,9 @@ def parse_arguments():
     parser.add_argument(
         '--device',
         type=str.lower,
-        choices=['auto', 'a2', 'a3', 'mlu', 'cuda'],
+        choices=['auto', 'a2', 'a3', 'mlu', 'cuda', 'ilu'],
         default='auto',
-        help='Device type: a2, a3, mlu, or cuda (case-insensitive)'
+        help='Device type: a2, a3, mlu, ilu or cuda (case-insensitive)'
     )
 
     parser.add_argument(
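After this change the auto-detection order is cuda, then ilu (keyed purely on whether ixformer imports), then mlu. A condensed sketch of that fallthrough — not the verbatim function; the remaining NPU probing is elided:

import torch

def detect_device() -> str:
    if torch.cuda.is_available():
        return "cuda"
    try:
        import ixformer  # an importable ixformer implies an Iluvatar stack
        return "ilu"
    except ImportError:
        pass
    try:
        import torch_mlu  # noqa: F401
        if torch.mlu.is_available():
            return "mlu"
    except ImportError:
        pass
    # placeholder; the real get_device_type also probes NPU devices
    raise RuntimeError("no supported accelerator found")

Because cuda is probed before ilu, passing --device ilu explicitly remains the unambiguous way to select the ILU build path.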

third_party/CMakeLists.txt

Lines changed: 28 additions & 0 deletions

@@ -19,4 +19,32 @@ target_include_directories(mooncake_store PUBLIC
   ${CMAKE_CURRENT_SOURCE_DIR}/Mooncake/mooncake-transfer-engine/include
 )
 
+if(USE_ILU)
+  if(TARGET cpprest)
+    set_target_properties(cpprest PROPERTIES
+      CXX_STANDARD 20
+      CXX_STANDARD_REQUIRED ON
+      CXX_EXTENSIONS OFF
+    )
+  endif()
+  if(TARGET transfer_engine)
+    target_compile_options(transfer_engine PRIVATE -std=c++20)
+    set_target_properties(transfer_engine PROPERTIES
+      CXX_STANDARD 20
+      CXX_STANDARD_REQUIRED ON
+    )
+    message(STATUS "Set C++20 for transfer_engine target")
+  endif()
+  if(TARGET SMHasherSupport)
+    set_target_properties(SMHasherSupport PROPERTIES
+      CXX_STANDARD 11
+      CXX_STANDARD_REQUIRED ON
+      CXX_EXTENSIONS OFF
+    )
+    message(STATUS "SMHasherSupport target found and configured")
+  else()
+    message(WARNING "SMHasherSupport target not found after adding smhasher")
+  endif()
+endif()
+
 target_link_libraries(mooncake_store PUBLIC transfer_engine cachelib_memory_allocator)

xllm/core/framework/batch/batch_input_builder.cpp

File mode changed: 100755 → 100644
Lines changed: 3 additions & 3 deletions

@@ -207,7 +207,7 @@ void BatchInputBuilder::process_sequences_multithreaded() {
     state_.q_seq_lens.insert(state_.q_seq_lens.end(),
                              state.q_seq_lens.begin(),
                              state.q_seq_lens.end());
-#elif defined(USE_MLU) || defined(USE_CUDA)
+#elif defined(USE_MLU) || defined(USE_CUDA) || defined(USE_ILU)
     int32_t seq_len_offset = state_.seq_lens.back();
     // skip the first element which is 0
     for (size_t i = 1; i < state.seq_lens.size(); ++i) {
@@ -293,7 +293,7 @@ void BatchInputBuilder::process_single_sequence(
 #if defined(USE_NPU)
   state.seq_lens.push_back(seq_len + offset);
   state.q_seq_lens.push_back(q_seq_len);
-#elif defined(USE_MLU) || defined(USE_CUDA)
+#elif defined(USE_MLU) || defined(USE_CUDA) || defined(USE_ILU)
   state.seq_lens.push_back(state.seq_lens.back() + seq_len + offset);
   state.q_seq_lens.push_back(state.q_seq_lens.back() + q_seq_len);
 #endif
@@ -527,7 +527,7 @@ void BatchInputBuilder::padding_decode_batch_size(
 #if defined(USE_NPU)
     state_.seq_lens.push_back(num_decoding_tokens);
     state_.q_seq_lens.push_back(num_decoding_tokens);
-#elif defined(USE_MLU) || defined(USE_CUDA)
+#elif defined(USE_MLU) || defined(USE_CUDA) || defined(USE_ILU)
     state_.seq_lens.push_back(state_.seq_lens.back() + num_decoding_tokens);
     state_.q_seq_lens.push_back(state_.q_seq_lens.back() +
                                 num_decoding_tokens);

xllm/core/framework/batch/batch_input_builder.h

Lines changed: 1 addition & 1 deletion

@@ -85,7 +85,7 @@ class BatchInputBuilder {
 #if defined(USE_NPU)
   std::vector<int32_t> seq_lens;
   std::vector<int32_t> q_seq_lens;
-#elif defined(USE_MLU) || defined(USE_CUDA)
+#elif defined(USE_MLU) || defined(USE_CUDA) || defined(USE_ILU)
   std::vector<int32_t> seq_lens = {0};    // cu_seq_lens
   std::vector<int32_t> q_seq_lens = {0};  // q_cu_seq_len
 #endif
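These guarded declarations explain the three batch_input_builder.cpp hunks above: NPU keeps per-sequence lengths, while MLU/CUDA/ILU keep cumulative prefix sums ("cu_seq_lens") seeded with 0, which the push_back(back() + ...) calls maintain. A toy illustration of the two layouts:

# Sketch only: three sequences of lengths 5, 7 and 8 in one batch.
lengths = [5, 7, 8]

npu_seq_lens = list(lengths)   # per-sequence lengths: [5, 7, 8]

cu_seq_lens = [0]              # cumulative form, seeded like the {0} above
for n in lengths:
    cu_seq_lens.append(cu_seq_lens[-1] + n)
# sequence i spans [cu_seq_lens[i], cu_seq_lens[i+1]) in the flattened batch
assert cu_seq_lens == [0, 5, 12, 20]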

xllm/core/framework/parallel_state/CMakeLists.txt

Lines changed: 1 addition & 0 deletions

@@ -12,6 +12,7 @@ cc_library(
     $<$<BOOL:${USE_NPU}>:npu_process_group.h>
     $<$<BOOL:${USE_MLU}>:mlu_process_group.h>
     $<$<BOOL:${USE_CUDA}>:cuda_process_group.h>
+    $<$<BOOL:${USE_ILU}>:ilu_process_group.h>
     collective_communicator.h
   SRCS
     mapping_npu.cpp

xllm/core/framework/parallel_state/collective_communicator.cpp

Lines changed: 2 additions & 0 deletions

@@ -25,6 +25,8 @@ limitations under the License.
 #include "mlu_process_group.h"
 #elif defined(USE_CUDA)
 #include "cuda_process_group.h"
+#elif defined(USE_ILU)
+#include "ilu_process_group.h"
 #endif
 #include "common/global_flags.h"
 #include "parallel_args.h"
xllm/core/framework/parallel_state/ilu_process_group.h

Lines changed: 55 additions & 0 deletions

@@ -0,0 +1,55 @@
+/* Copyright 2025 The xLLM Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://github.com/jd-opensource/xllm/blob/main/LICENSE
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#pragma once
+
+#include <torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp>
+
+#include "process_group.h"
+
+namespace xllm {
+
+class ProcessGroupImpl : public ProcessGroup {
+ public:
+  ProcessGroupImpl(int32_t global_rank,
+                   int32_t world_size,
+                   int32_t rank_size,
+                   int32_t port,
+                   bool trans,
+                   const std::string& host,
+                   const std::string& group_name,
+                   const torch::Device& device)
+      : ProcessGroup(device) {
+    c10::intrusive_ptr<c10d::ProcessGroupNCCL::Options> pg_options =
+        c10d::ProcessGroupNCCL::Options::create();
+#if TORCH_VERSION_MAJOR > 2 || \
+    (TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR >= 7)
+    pg_options->group_name = group_name;
+#endif
+    int32_t rank = global_rank;
+    if (world_size != rank_size) {
+      auto [local_rank, group_ranks] =
+          get_group_rank(world_size, global_rank, rank_size, trans);
+      pg_options->global_ranks_in_group = group_ranks;
+      rank = local_rank;
+    }
+
+    auto store = create_tcp_store(host, port, rank);
+    pg_ = std::make_unique<c10d::ProcessGroupNCCL>(
+        store, rank, rank_size, pg_options);
+  }
+};
+
+}  // namespace xllm
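The new header delegates everything to the stock c10d::ProcessGroupNCCL, so ILU collectives follow the standard NCCL code path end to end. A minimal single-process sketch of what that looks like from the Python side, assuming Iluvatar's PyTorch build maps the "nccl" backend (and the cuda device namespace) onto its own collective stack:

import torch
import torch.distributed as dist

# One-rank group purely for illustration; real deployments use the
# host/port that ProcessGroupImpl receives via create_tcp_store.
dist.init_process_group(backend="nccl",
                        init_method="tcp://127.0.0.1:29500",
                        rank=0, world_size=1)
t = torch.ones(4, device="cuda:0")  # assumption: ILU devices enumerate here
dist.all_reduce(t)                  # routed through the NCCL-compatible library
dist.destroy_process_group()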

xllm/core/framework/parallel_state/process_group.cpp

Lines changed: 2 additions & 0 deletions

@@ -21,6 +21,8 @@ limitations under the License.
 #include "mlu_process_group.h"
 #elif defined(USE_CUDA)
 #include "cuda_process_group.h"
+#elif defined(USE_ILU)
+#include "ilu_process_group.h"
 #endif
 
 namespace {

xllm/core/kernels/CMakeLists.txt

Lines changed: 5 additions & 0 deletions

@@ -12,6 +12,10 @@ if(USE_CUDA)
   add_subdirectory(cuda)
 endif()
 
+if(USE_ILU)
+  add_subdirectory(ilu)
+endif()
+
 cc_library(
   NAME
     kernels
@@ -25,4 +29,5 @@ cc_library(
     $<$<BOOL:${USE_NPU}>:npu_kernels>
     $<$<BOOL:${USE_MLU}>:mlu_kernels>
     $<$<BOOL:${USE_CUDA}>:cuda_kernels>
+    $<$<BOOL:${USE_ILU}>:ilu_kernels>
 )
