Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 38 additions & 13 deletions ggml/src/ggml-openvino/ggml-decoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,7 @@ int extract_layer_from_name(const std::string & name) {
return layer;
}

std::pair<ModelParams, ComputeParams> GgmlOvDecoder::compute_llm_params(ggml_cgraph * cgraph, bool is_static) {
std::pair<ModelParams, ComputeParams> GgmlOvDecoder::compute_llm_params(ggml_cgraph * cgraph, bool is_static, std::optional<ComputeParams> old_c_params) {
ModelParams model_params;
ComputeParams compute_params;
for (int i = 0; i < cgraph->n_nodes; i++) {
Expand All @@ -268,6 +268,9 @@ std::pair<ModelParams, ComputeParams> GgmlOvDecoder::compute_llm_params(ggml_cgr
model_params.head_size = node->src[0]->ne[0];
compute_params.input_len = node->src[0]->ne[1];

const auto old_c_prefill_chunk_max = old_c_params.has_value() ? old_c_params.value().prefill_chunk_max : 1;
compute_params.prefill_chunk_max = std::max(static_cast<int>(old_c_prefill_chunk_max), static_cast<int>(compute_params.input_len));

auto * cache_k_perm = node->src[1];
if (cache_k_perm->op == GGML_OP_CPY) {
cache_k_perm = cache_k_perm->src[0];
Expand Down Expand Up @@ -332,51 +335,73 @@ void GgmlOvDecoder::validate_cgraph() const {

ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input) const {
if (m_naive) {
return input!= nullptr ? ov::PartialShape{get_shape(input)} : ov::PartialShape{get_shape(op)};
return input != nullptr ? ov::PartialShape{get_shape(input)} : ov::PartialShape{get_shape(op)};
}
auto name = std::string(input->name);
ov::PartialShape input_shape;

// ggml_tensor gives us exact measurements in all cases, so none of those should be -1 (as that's an
// OpenVINO native convention). All of those are passed through directly.
//
// Cases where a tensor dimension size varies are handled case-by-case below. We provide a PartialShape to
// communicate the worst-case scenario: a PartialShape has a lower and upper bound on the dimension,
// used to inform OpenVINO optimizations. An issue was observed with OpenCL remote buffers not allocating
// unless such a range was provided (considerations with remote memory). Although that's not the responsibility
// of llama.cpp to solve, providing dimension bounds is useful nonetheless.

const auto prefill_upper = m_is_prefill ? m_prefill_chunk_size : 1;
const auto dim_span_ctx = ov::Dimension(1, m_model_params.ctx);

if (is_inp_tok(input, op) || is_inp_pos(input, op)) {
// tokens or positions
int len = m_is_static ? (m_is_prefill ? m_prefill_chunk_size : 1) : -1;
input_shape = ov::PartialShape{1, 1, 1, len};
if (m_is_static) {
input_shape = ov::PartialShape{1, 1, 1, prefill_upper};
} else {
input_shape = ov::PartialShape{1, 1, 1, ov::Dimension(1, m_compute_params.prefill_chunk_max)};
}

} else if (is_output_idx(input, op)) {
// output index
input_shape = ov::PartialShape{1, 1, 1, m_is_static ? m_compute_params.output_len : -1};

if (m_is_static) {
input_shape = ov::PartialShape{1, 1, 1, m_compute_params.output_len};
} else {
input_shape = ov::PartialShape{1, 1, 1, ov::Dimension(1, m_compute_params.output_len)};
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The changes look good to me but why do we need to remove one liners for this?

For example, can we still use something like this?

input_shape = ov::PartialShape{1, 1, 1, m_is_static ?
                        m_compute_params.output_len : ov::Dimension(1, m_compute_params.output_len)};

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yea i can change it back to how it was

}
} else if (is_inp_mask(input, op)) {
// mask
if (m_is_static) {
input_shape = ov::PartialShape{1, 1, m_is_prefill ? m_prefill_chunk_size : 1, m_model_params.ctx};
input_shape = ov::PartialShape{1, 1, prefill_upper, m_model_params.ctx};
} else if (m_is_stateful) {
input_shape = ov::PartialShape{1, 1, -1, -1};
input_shape = ov::PartialShape{1, 1, dim_span_ctx, dim_span_ctx};
} else {
input_shape = ov::PartialShape{-1, 1, -1, -1};
input_shape = ov::PartialShape{dim_span_ctx, 1, dim_span_ctx, dim_span_ctx};
}

} else if (is_kvcache(input, op)) {
// kvcache
input_shape = ov::PartialShape{get_shape(input)};
if (!m_is_static) {
// do not fix ctx size to make llama-bench work across test params
input_shape[2] = -1;
input_shape[2] = dim_span_ctx;
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

llama-bench failed on my side for larger context sizes and for all stateful executions.
tested with llama3.2 1B q4_0

stateless: -d 512,1024 fails
stateful: all ctx sizes fail

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

investigating this now

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

cache wasn't being properly invalidated and there's no easy way (AFAICT) to get the max possible batch size

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

actually it looks like your problem also exists without these changes(?)

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

set_n_threads: n_threads = 6, n_threads_batch = 6
GGML OpenVINO backend std::exception: might be outdated RESHAPE case
graph_compute: ggml_backend_sched_graph_compute_async failed with error -1
process_ubatch: failed to compute graph, compute status: -1
decode: removing memory module entries for seq_id = 0, pos = [0, +inf)
llama_decode: failed to decode, ret = -3
test_prompt: failed to decode prompt batch, res = -3
main: error: failed to run prompt warmup

is what I see when I run ./bin/llama-bench --batch-size 128 --ubatch-size 128 -m [the llama 3.2 1b instruct q4_0 GGUF] --verbose

I'll try and fix the CPU native cases on my end and file some PRs for that if I can. For now I'm assuming these failures are unrelated and we can sync on Monday to understand the details here.

}
if (is_stateful()) {
// Convert stateless KV cache layout [1, 1, seq, n_heads_kv * head_size]
// to stateful layout [1, seq, n_heads_kv, head_size].
assert(input_shape.size() == 4 && input_shape[0] == 1 && input_shape[1] == 1 &&
input_shape[2].is_dynamic() &&
input_shape[3] == (m_model_params.n_heads_kv * m_model_params.head_size));
input_shape = {input_shape[0], ov::Dimension::dynamic(), m_model_params.n_heads_kv,
input_shape = {input_shape[0], dim_span_ctx, m_model_params.n_heads_kv,
m_model_params.head_size};
}

} else if (is_kv_idx(input, op)) {
// kv update index
int len = m_is_static ? (m_is_prefill ? m_prefill_chunk_size : 1) : -1;
input_shape = ov::PartialShape{1, 1, 1, len};
if (m_is_static) {
int len = m_is_prefill ? m_prefill_chunk_size : 1;
input_shape = ov::PartialShape{1, 1, 1, len};
} else {
input_shape = ov::PartialShape{1, 1, 1, dim_span_ctx};
}

} else {
input_shape = ov::PartialShape{get_shape(input)};
Expand Down
30 changes: 26 additions & 4 deletions ggml/src/ggml-openvino/ggml-decoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,28 @@ struct ComputeParams {
int token_len_per_seq = -1;
int past_kv_len = -1;
int output_len = 1;

// Batch size is provided as a PartialShape since it can vary from call
// to call. There *is* a maximum value, but it isn't easy to pull that
// at this point, so we track the current maximum we've seen so far.
// This is the upper bound on the PartialShape (lower bound is 1), so
// we re-build in cases where the maximum has been exceeded (and the value
// is saved for the next iteration, the cycle repeats)
//
// There is an implicit re-build done within OpenVINO for cases where the
// current batch size is within the range from minimum to maximum, so we only
// care about growing the bounds.
int prefill_chunk_max = 1;

// Dynamic-shape reuse check: the cached model can serve `other` only if
// its compiled prefill-chunk upper bound already covers other's demand.
bool can_reuse_dynamically(const ComputeParams & other) const {
    GGML_ASSERT(other.prefill_chunk_max >= 1);
    return other.prefill_chunk_max <= prefill_chunk_max;
}

// Static-shape reuse check. Currently identical to the dynamic policy —
// only the prefill chunk bound matters — but kept as a separate entry
// point so the two policies can diverge without touching callers.
bool can_reuse_statically(const ComputeParams & other) const {
    GGML_ASSERT(other.prefill_chunk_max >= 1);
    return other.prefill_chunk_max <= prefill_chunk_max;
}
};

class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
Expand All @@ -68,9 +90,9 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
ComputeParams & compute_params,
std::map<std::string, std::shared_ptr<ov::Node>> & model_weights,
bool is_static,
bool is_stateful = false,
bool is_prefill = false,
int prefill_chunk_size = 256);
bool is_stateful,
bool is_prefill,
int prefill_chunk_size);

// Naive graph decoder
GgmlOvDecoder(ggml_cgraph * cgraph, std::map<std::string, std::shared_ptr<ov::Node>> & model_weights);
Expand Down Expand Up @@ -190,7 +212,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {

// Release every cached weight node held by this decoder.
void clear_model_weights() {
    m_model_weights.clear();
}

static std::pair<ModelParams, ComputeParams> compute_llm_params(ggml_cgraph * cgraph, bool is_static);
static std::pair<ModelParams, ComputeParams> compute_llm_params(ggml_cgraph * cgraph, bool is_static, std::optional<ComputeParams> old_c_params);

ModelParams get_model_params() const { return m_model_params; }

Expand Down
16 changes: 11 additions & 5 deletions ggml/src/ggml-openvino/utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,8 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
const auto & config = ggml_openvino_get_compile_config();
auto device = r_ctx->device;
bool stateful = r_ctx->stateful;
bool prefill = false;
// Must be int, not bool: declaring this as `bool` silently collapses 256
// to true (1) before it is forwarded to the GgmlOvDecoder constructor's
// int prefill_chunk_size parameter.
int prefill_chunk_size = 256;  // TODO: make configurable instead of hard-coding
static auto is_static = false;

if (is_naive(cgraph)) {
Expand All @@ -95,7 +97,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
std::shared_ptr<ov::InferRequest> infer_request;
ModelParams m_params;
ComputeParams c_params;
std::tie(m_params, c_params) = GgmlOvDecoder::compute_llm_params(cgraph, is_static);
std::tie(m_params, c_params) = GgmlOvDecoder::compute_llm_params(cgraph, is_static, std::optional<ComputeParams>());

graph_key key(cgraph);
bool cache_hit;
Expand All @@ -112,10 +114,12 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<

cache_hit = it != r_ctx->decoder_cache.end();
ModelParams old_m_params;
ComputeParams old_c_params;
if (cache_hit) {
ggml_decoder = it->second;
old_m_params = ggml_decoder->get_model_params();
cache_hit = old_m_params.can_reuse_dynamically(m_params);
old_c_params = ggml_decoder->get_compute_params();
cache_hit = old_m_params.can_reuse_dynamically(m_params) && old_c_params.can_reuse_dynamically(c_params);
}

if (cache_hit) {
Expand Down Expand Up @@ -175,7 +179,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
std::shared_ptr<ov::Model> model;
auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);

ggml_decoder = std::make_shared<GgmlOvDecoder>(cgraph, m_params, c_params, model_weights, is_static, stateful);
ggml_decoder = std::make_shared<GgmlOvDecoder>(cgraph, m_params, c_params, model_weights, is_static, stateful, prefill, prefill_chunk_size);
decoder_end_time = ggml_time_us();

auto input_model = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder);
Expand Down Expand Up @@ -294,7 +298,7 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
std::shared_ptr<ov::InferRequest> infer_request;
ModelParams m_params;
ComputeParams c_params;
std::tie(m_params, c_params) = GgmlOvDecoder::compute_llm_params(cgraph, is_static);
std::tie(m_params, c_params) = GgmlOvDecoder::compute_llm_params(cgraph, is_static, std::optional<ComputeParams>());

const auto * inp_pos = get_inp_pos_tensor(cgraph);
const auto is_prefill = get_is_prefill(inp_pos);
Expand All @@ -310,10 +314,12 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o

cache_hit = it != r_ctx->decoder_cache.end();
ModelParams old_m_params;
ComputeParams old_c_params;
if (cache_hit) {
ggml_decoder = it->second;
old_m_params = ggml_decoder->get_model_params();
cache_hit = old_m_params.can_reuse_statically(m_params);
old_c_params = ggml_decoder->get_compute_params();
cache_hit = old_m_params.can_reuse_statically(m_params) && old_c_params.can_reuse_statically(c_params);
}

if (cache_hit) {
Expand Down