Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
212 changes: 212 additions & 0 deletions extension/llm/runner/llm_runner_helper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include <executorch/extension/llm/runner/multimodal_runner.h>
#include <executorch/extension/llm/runner/stats.h>
#include <executorch/extension/llm/runner/text_llm_runner.h>
#include <executorch/extension/llm/runner/text_llm_session.h>
#include <executorch/extension/llm/runner/text_prefiller.h>
#include <executorch/extension/llm/runner/text_token_generator.h>
#include <executorch/extension/memory_allocator/cpu_caching_malloc_allocator.h>
Expand All @@ -29,8 +30,18 @@
namespace executorch::extension::llm {

using ::executorch::extension::Module;
using ::executorch::extension::Program;
using ::executorch::runtime::Error;

// Assembles the per-Module components (decoder/prefiller/token generator/io
// manager/stats) into a TextLLMRunner. Shared by the path-based and the
// shared-Program (TextLLMEngine session) construction paths.
static std::unique_ptr<TextLLMRunner> assemble_text_llm_runner(
std::unique_ptr<Module> module,
std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
float temperature,
const std::string& method_name);

std::unique_ptr<tokenizers::Tokenizer> load_tokenizer(
const std::string& tokenizer_path,
std::unique_ptr<std::vector<std::string>> special_tokens,
Expand Down Expand Up @@ -251,6 +262,15 @@ std::unique_ptr<TextLLMRunner> create_text_llm_runner(
max_cached_memory_size_bytes_));
}

return assemble_text_llm_runner(
std::move(module), std::move(tokenizer), temperature, method_name);
}

static std::unique_ptr<TextLLMRunner> assemble_text_llm_runner(
std::unique_ptr<Module> module,
std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
float temperature,
const std::string& method_name) {
// Get metadata from Module
ET_LOG(Info, "Reading metadata from model");
auto metadata_result = llm::get_llm_metadata(tokenizer.get(), module.get());
Expand Down Expand Up @@ -305,6 +325,198 @@ std::unique_ptr<TextLLMRunner> create_text_llm_runner(
temperature);
}

// Builds a TextLLMRunner over an already-loaded Program: the runner's Module
// reuses `program` while owning its own method state and KV cache. File-local —
// the per-session construction path for TextLLMEngine (which keeps the backing
// DataLoader alive for the runners' lifetime). External callers go through
// LLMEngine -> LLMSession, not a raw shared-Program runner.
static std::unique_ptr<TextLLMRunner> create_text_llm_runner_from_program(
std::shared_ptr<Program> program,
std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
float temperature,
const std::string& method_name) {
if (!tokenizer || !tokenizer->is_loaded()) {
ET_LOG(Error, "Tokenizer is null or not loaded");
return nullptr;
}
if (!program) {
ET_LOG(Error, "Program is null");
return nullptr;
}
// A Module over the already-loaded Program: it reuses that Program rather
// than re-loading it, and its loaded method allocates its own planned (KV)
// memory. Whether packed weights are physically shared vs. re-materialized
// per method instance is backend-dependent (serving_capacity() is the
// authority).
constexpr uint32_t kMaxCachedMemoryBytes = 1024 * 1024 * 10; // 10MB
auto module = std::make_unique<Module>(
std::move(program),
nullptr, // memory allocator
std::make_unique<executorch::extension::CPUCachingAllocator>(
kMaxCachedMemoryBytes));
return assemble_text_llm_runner(
std::move(module), std::move(tokenizer), temperature, method_name);
}

namespace detail {
// The TextLLM adapter: implements the model-agnostic LLMSession over a
// TextLLMRunner. TextLLMRunner's token-step methods are private; this adapter
// is their only (friended) caller, so the engine and server depend solely on
// LLMSession.
TextLLMSession::TextLLMSession(std::unique_ptr<TextLLMRunner> runner)
: runner_(std::move(runner)) {}

Error TextLLMSession::prefill_tokens(
std::vector<uint64_t> tokens,
const SamplingConfig* initial_sampling) {
// The model samples the FIRST generated token during prefill, so apply the
// request's sampling here (not a stale default). Only temperature is
// plumbed; reject non-default top_p/top_k/seed for parity with decode_one().
float temperature = -1.0f;
if (initial_sampling != nullptr) {
if (initial_sampling->top_p != 1.0f || initial_sampling->top_k != 0 ||
initial_sampling->seed != 0) {
ET_LOG(
Error,
"TextLLMSession: only temperature is supported; top_p/top_k/seed "
"are not yet implemented");
return ::executorch::runtime::Error::NotSupported;
}
temperature = initial_sampling->temperature;
}
return runner_->prefill_tokens(std::move(tokens), temperature).error();
}

::executorch::runtime::Result<DecodeResult> TextLLMSession::decode_one(
const SamplingConfig& sampling) {
// Only temperature is plumbed today; top_p/top_k/seed need a per-session
// sampler (a follow-up). Reject non-default values rather than silently
// ignoring them, so callers can't assume constraints are applied.
if (sampling.top_p != 1.0f || sampling.top_k != 0 || sampling.seed != 0) {
ET_LOG(
Error,
"TextLLMSession: only temperature is supported; top_p/top_k/seed are "
"not yet implemented");
return ::executorch::runtime::Error::NotSupported;
}
return runner_->decode_one(sampling.temperature);
}

Error TextLLMSession::seek(int64_t pos) {
return runner_->seek(pos);
}

int64_t TextLLMSession::position() const {
return runner_->position();
}

Error TextLLMSession::reset() {
runner_->reset();
return Error::Ok;
}

void TextLLMSession::stop() {
runner_->stop();
}

std::unique_ptr<LLMSession> make_text_llm_session(
std::unique_ptr<TextLLMRunner> runner) {
return std::make_unique<TextLLMSession>(std::move(runner));
}
} // namespace detail

TextLLMEngine::TextLLMEngine(
std::unique_ptr<Module> loader_module,
std::shared_ptr<Program> program,
std::string tokenizer_path,
float temperature,
std::string method_name,
std::unordered_map<std::string, int64_t> metadata)
: loader_module_(std::move(loader_module)),
program_(std::move(program)),
tokenizer_path_(std::move(tokenizer_path)),
temperature_(temperature),
method_name_(std::move(method_name)),
metadata_(std::move(metadata)) {}

std::unique_ptr<TextLLMEngine> TextLLMEngine::create(
const std::string& model_path,
const std::string& tokenizer_path,
std::optional<const std::string> data_path,
float temperature,
const std::string& method_name,
Module::LoadMode load_mode) {
// External .ptd weights are not yet supported for shared sessions: each
// session Module built from the shared Program would also need the
// data_map_loader threaded into its load_method() to resolve external
// weights (see Module::load_method merged_data_map_). Fail loudly rather than
// silently produce sessions that error on first generate.
if (data_path.has_value()) {
ET_LOG(
Error,
"TextLLMEngine: external data_path (.ptd) is not yet supported for "
"shared sessions; use a self-contained .pte for now.");
return nullptr;
}
// Load the program ONCE; sessions reuse it (loaded a single time, per-session
// KV). Physical weight sharing across sessions is backend-dependent — see
// serving_capacity().
auto loader_module = std::make_unique<Module>(model_path, load_mode);
if (loader_module->load() != Error::Ok) {
ET_LOG(
Error,
"TextLLMEngine: failed to load program from %s",
model_path.c_str());
return nullptr;
}
auto program = loader_module->program();
if (!program) {
ET_LOG(Error, "TextLLMEngine: program is null after load");
return nullptr;
}
// Read model-level metadata once (shared by all sessions).
auto meta_tokenizer = load_tokenizer(tokenizer_path);
if (!meta_tokenizer) {
ET_LOG(
Error,
"TextLLMEngine: failed to load tokenizer from %s",
tokenizer_path.c_str());
return nullptr;
}
auto metadata_result =
get_llm_metadata(meta_tokenizer.get(), loader_module.get());
if (metadata_result.error() != Error::Ok) {
ET_LOG(Error, "TextLLMEngine: failed to read metadata");
return nullptr;
}
return std::unique_ptr<TextLLMEngine>(new TextLLMEngine(
std::move(loader_module),
std::move(program),
tokenizer_path,
temperature,
method_name,
metadata_result.get()));
}

::executorch::runtime::Result<std::unique_ptr<LLMSession>>
TextLLMEngine::create_session() {
auto tokenizer = load_tokenizer(tokenizer_path_);
if (!tokenizer) {
ET_LOG(
Error,
"TextLLMEngine: failed to load tokenizer from %s",
tokenizer_path_.c_str());
return Error::InvalidState;
}
auto runner = create_text_llm_runner_from_program(
program_, std::move(tokenizer), temperature_, method_name_);
if (!runner) {
ET_LOG(Error, "TextLLMEngine: failed to build session runner");
return Error::InvalidState;
}
return detail::make_text_llm_session(std::move(runner));
}

std::unique_ptr<MultimodalRunner> create_multimodal_runner(
const std::string& model_path,
std::unique_ptr<::tokenizers::Tokenizer> tokenizer,
Expand Down
71 changes: 71 additions & 0 deletions extension/llm/runner/llm_runner_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include <vector>

#include <executorch/extension/llm/runner/constants.h>
#include <executorch/extension/llm/runner/llm_session.h>
#include <executorch/extension/module/module.h>
#include <executorch/runtime/core/result.h>
#include <executorch/runtime/platform/compiler.h>
Expand Down Expand Up @@ -141,6 +142,76 @@ ET_EXPERIMENTAL std::unique_ptr<TextLLMRunner> create_text_llm_runner(
const std::string& method_name = "forward",
Module::LoadMode load_mode = Module::LoadMode::MmapUseMlockIgnoreErrors);

/**
* @brief Engine for multi-session text generation over one loaded Program.
*
* Loads the model's Program (weights/constants) once; create_session() builds a
* TextLLMRunner that reuses that Program but owns its own method/KV state. This
* is the correctness-first foundation for serving multiple conversations.
* Backend execution should be serialized by the caller until per-backend thread
* safety is proven (Module::execute is not assumed thread-safe). Whether extra
* sessions avoid duplicating packed weights is backend-dependent and reported
* by serving_capacity() (conservatively one).
*/
class ET_EXPERIMENTAL TextLLMEngine : public LLMEngine {
public:
static std::unique_ptr<TextLLMEngine> create(
const std::string& model_path,
const std::string& tokenizer_path,
std::optional<const std::string> data_path = std::nullopt,
float temperature = -1.0f,
const std::string& method_name = "forward",
Module::LoadMode load_mode = Module::LoadMode::MmapUseMlockIgnoreErrors);

// Returns a TextLLMSession (LLMSession) that reuses this engine's loaded
// Program (physical weight sharing is backend-dependent; see
// serving_capacity).
::executorch::runtime::Result<std::unique_ptr<LLMSession>> create_session()
override;
// Conservative: a single physical session (no proven cross-session weight
// sharing). Raise on a backend proven to share packed weights.
LLMServingCapacity serving_capacity() const override {
return LLMServingCapacity{};
}
const std::unordered_map<std::string, int64_t>& metadata() const override {
return metadata_;
}

TextLLMEngine(const TextLLMEngine&) = delete;
TextLLMEngine& operator=(const TextLLMEngine&) = delete;

private:
TextLLMEngine(
std::unique_ptr<Module> loader_module,
std::shared_ptr<Program> program,
std::string tokenizer_path,
float temperature,
std::string method_name,
std::unordered_map<std::string, int64_t> metadata);

// Keeps the shared Program's DataLoader alive for the lifetime of sessions.
std::unique_ptr<Module> loader_module_;
std::shared_ptr<Program> program_;
std::string tokenizer_path_;
float temperature_;
std::string method_name_;
std::unordered_map<std::string, int64_t> metadata_;
};

namespace detail {
// Implementation detail (not a public API): wraps a TextLLMRunner in an
// LLMSession (the runner -> session seam). The supported entry point is
// LLMEngine::create_session(); this exists only so TextLLMEngine can build its
// sessions and so unit tests can drive the runner's token-step primitives
// through the public LLMSession surface (the concrete adapter type is private).
// Do not depend on wrapping arbitrary runners.
//
// @param runner A loaded TextLLMRunner; ownership transfers to the session.
// @return std::unique_ptr<LLMSession> wrapping `runner`.
std::unique_ptr<LLMSession> make_text_llm_session(
std::unique_ptr<TextLLMRunner> runner);
} // namespace detail

/**
* @brief Creates a MultimodalRunner instance with dependency injection
*
Expand Down
Loading
Loading