Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 8 additions & 13 deletions xllm/api_service/rec_completion_service_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,7 @@ limitations under the License.
#include "completion.pb.h"
#include "core/distributed_runtime/llm_master.h"
#include "core/distributed_runtime/rec_master.h"
#include "core/framework/request/mm_data.h"
#include "core/framework/request/request_output.h"
#include "core/util/utils.h"

#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)
Expand Down Expand Up @@ -167,18 +165,15 @@ void RecCompletionServiceImpl::process_async_impl(
}

const auto& rpc_request_ref = call->request();
std::optional<MMData> mm_data = std::nullopt;
std::optional<std::vector<proto::InferInputTensor>> input_tensors =
std::nullopt;
if (rpc_request_ref.input_tensors_size()) {
// HISTOGRAM_OBSERVE(rec_input_first_dim,
// rpc_request_ref.input_tensors(0).shape(0));

MMDict mm_dict;
std::vector<proto::InferInputTensor> tensors;
tensors.reserve(rpc_request_ref.input_tensors_size());
for (int i = 0; i < rpc_request_ref.input_tensors_size(); ++i) {
const auto& tensor = rpc_request_ref.input_tensors(i);
mm_dict[tensor.name()] =
xllm::util::convert_rec_tensor_to_torch(tensor).to(torch::kBFloat16);
tensors.push_back(rpc_request_ref.input_tensors(i));
}
mm_data = std::move(MMData(MMType::EMBEDDING, mm_dict));
input_tensors = std::move(tensors);
}

// schedule the request
Expand All @@ -187,7 +182,7 @@ void RecCompletionServiceImpl::process_async_impl(
master_->handle_request(
std::move(rpc_request_ref.prompt()),
std::move(prompt_tokens),
std::move(mm_data),
std::move(input_tensors),
std::move(request_params),
[call,
model,
Expand Down Expand Up @@ -219,4 +214,4 @@ void RecCompletionServiceImpl::process_async_impl(
});
}

} // namespace xllm
} // namespace xllm
2 changes: 1 addition & 1 deletion xllm/core/common/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ cc_library(
$<$<BOOL:${USE_NPU}>:mspti_helper.h>
options.h
rate_limiter.h
rec_model_utils.h
types.h
device_monitor.h
version_singleton.h
Expand Down Expand Up @@ -66,4 +67,3 @@ cc_test(
target_link_libraries(common PRIVATE OpenSSL::SSL OpenSSL::Crypto protobuf::libprotobuf)
add_dependencies(common brpc-static)


48 changes: 48 additions & 0 deletions xllm/core/common/rec_model_utils.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
/* Copyright 2025 The xLLM Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

https://github.com/jd-opensource/xllm/blob/main/LICENSE

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#pragma once

#include <cstdint>
#include <string_view>

namespace xllm {

enum class RecModelKind : int8_t {
kNone = 0,
kOneRec = 1,
kLlmRec = 2,
};

inline constexpr bool is_onerec_model_type(std::string_view model_type) {
return model_type == "onerec";
}

inline constexpr bool is_llmrec_model_type(std::string_view model_type) {
return model_type == "qwen2" || model_type == "qwen3";
}

inline constexpr RecModelKind get_rec_model_kind(std::string_view model_type) {
if (is_onerec_model_type(model_type)) {
return RecModelKind::kOneRec;
}
if (is_llmrec_model_type(model_type)) {
return RecModelKind::kLlmRec;
}
return RecModelKind::kNone;
}

} // namespace xllm

5 changes: 5 additions & 0 deletions xllm/core/common/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -292,4 +292,9 @@ struct EplbInfo {
inline constexpr int REC_TOKEN_SIZE = 3;

using RecTokenTriple = std::array<int32_t, REC_TOKEN_SIZE>;

inline constexpr const char* LLM_REC_INPUT_TOKENS = "llm_rec_input_tokens";
inline constexpr const char* LLM_REC_INPUT_INDICES = "llm_rec_input_indices";
inline constexpr const char* LLM_REC_INPUT_EMBEDDING =
"llm_rec_input_embedding";
} // namespace xllm
7 changes: 7 additions & 0 deletions xllm/core/distributed_runtime/master.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ limitations under the License.
#include <boost/algorithm/string.hpp>
#include <csignal>
#include <memory>
#include <optional>
#include <thread>
#include <utility>

Expand All @@ -30,6 +31,7 @@ limitations under the License.
#include "common/types.h"
#include "dit_master.h"
#include "framework/model/model_args.h"
#include "framework/model_loader.h"
#include "framework/request/request.h"
#include "llm_engine.h"
#include "llm_master.h"
Expand Down Expand Up @@ -237,8 +239,13 @@ Master::Master(const Options& options, EngineType type) : options_(options) {
options_.enable_schedule_overlap(false);
LOG(WARNING) << "Force to disable schedule overlap for REC model, not "
"supported yet.";

auto model_loader = ModelLoader::create(options_.model_path());
const std::string rec_model_type = model_loader->model_args().model_type();

runtime::Options eng_options;
eng_options.model_path(options_.model_path())
.rec_model_type(std::make_optional(rec_model_type))
.devices(devices)
.backend(options_.backend())
.block_size(options_.block_size())
Expand Down
Loading