-
Notifications
You must be signed in to change notification settings - Fork 17k
fix(openvino): define PartialShape bounds for tensors #21637
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -256,7 +256,7 @@ int extract_layer_from_name(const std::string & name) { | |
| return layer; | ||
| } | ||
|
|
||
| std::pair<ModelParams, ComputeParams> GgmlOvDecoder::compute_llm_params(ggml_cgraph * cgraph, bool is_static) { | ||
| std::pair<ModelParams, ComputeParams> GgmlOvDecoder::compute_llm_params(ggml_cgraph * cgraph, bool is_static, std::optional<ComputeParams> old_c_params) { | ||
| ModelParams model_params; | ||
| ComputeParams compute_params; | ||
| for (int i = 0; i < cgraph->n_nodes; i++) { | ||
|
|
@@ -268,6 +268,9 @@ std::pair<ModelParams, ComputeParams> GgmlOvDecoder::compute_llm_params(ggml_cgr | |
| model_params.head_size = node->src[0]->ne[0]; | ||
| compute_params.input_len = node->src[0]->ne[1]; | ||
|
|
||
| const auto old_c_prefill_chunk_max = old_c_params.has_value() ? old_c_params.value().prefill_chunk_max : 1; | ||
| compute_params.prefill_chunk_max = std::max(static_cast<int>(old_c_prefill_chunk_max), static_cast<int>(compute_params.input_len)); | ||
|
|
||
| auto * cache_k_perm = node->src[1]; | ||
| if (cache_k_perm->op == GGML_OP_CPY) { | ||
| cache_k_perm = cache_k_perm->src[0]; | ||
|
|
@@ -332,51 +335,73 @@ void GgmlOvDecoder::validate_cgraph() const { | |
|
|
||
| ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input) const { | ||
| if (m_naive) { | ||
| return input!= nullptr ? ov::PartialShape{get_shape(input)} : ov::PartialShape{get_shape(op)}; | ||
| return input != nullptr ? ov::PartialShape{get_shape(input)} : ov::PartialShape{get_shape(op)}; | ||
| } | ||
| auto name = std::string(input->name); | ||
| ov::PartialShape input_shape; | ||
|
|
||
| // ggml_tensor gives us exact measurements in all cases, so none of those should be -1 (as that's an | ||
| // OpenVINO native convention). All of those are passed through directly. | ||
| // | ||
| // Cases where a tensor dimension size varies are handled case-by-case below. We provide a PartialShape to | ||
| // communicate the worst-case scenario: a PartialShape has a lower and upper bound on the dimension, | ||
| // used to inform OpenVINO optimizations. An issue was observed with OpenCL remote buffers not allocating | ||
| // unless such a range was provided (considerations with remote memory). Although that's not the responsibility | ||
| // of llama.cpp to solve, providing dimension bounds is useful nonetheless. | ||
|
|
||
| const auto prefill_upper = m_is_prefill ? m_prefill_chunk_size : 1; | ||
| const auto dim_span_ctx = ov::Dimension(1, m_model_params.ctx); | ||
|
|
||
| if (is_inp_tok(input, op) || is_inp_pos(input, op)) { | ||
| // tokens or positions | ||
| int len = m_is_static ? (m_is_prefill ? m_prefill_chunk_size : 1) : -1; | ||
| input_shape = ov::PartialShape{1, 1, 1, len}; | ||
| if (m_is_static) { | ||
| input_shape = ov::PartialShape{1, 1, 1, prefill_upper}; | ||
| } else { | ||
| input_shape = ov::PartialShape{1, 1, 1, ov::Dimension(1, m_compute_params.prefill_chunk_max)}; | ||
| } | ||
|
|
||
| } else if (is_output_idx(input, op)) { | ||
| // output index | ||
| input_shape = ov::PartialShape{1, 1, 1, m_is_static ? m_compute_params.output_len : -1}; | ||
|
|
||
| if (m_is_static) { | ||
| input_shape = ov::PartialShape{1, 1, 1, m_compute_params.output_len}; | ||
| } else { | ||
| input_shape = ov::PartialShape{1, 1, 1, ov::Dimension(1, m_compute_params.output_len)}; | ||
| } | ||
| } else if (is_inp_mask(input, op)) { | ||
| // mask | ||
| if (m_is_static) { | ||
| input_shape = ov::PartialShape{1, 1, m_is_prefill ? m_prefill_chunk_size : 1, m_model_params.ctx}; | ||
| input_shape = ov::PartialShape{1, 1, prefill_upper, m_model_params.ctx}; | ||
| } else if (m_is_stateful) { | ||
| input_shape = ov::PartialShape{1, 1, -1, -1}; | ||
| input_shape = ov::PartialShape{1, 1, dim_span_ctx, dim_span_ctx}; | ||
| } else { | ||
| input_shape = ov::PartialShape{-1, 1, -1, -1}; | ||
| input_shape = ov::PartialShape{dim_span_ctx, 1, dim_span_ctx, dim_span_ctx}; | ||
| } | ||
|
|
||
| } else if (is_kvcache(input, op)) { | ||
| // kvcache | ||
| input_shape = ov::PartialShape{get_shape(input)}; | ||
| if (!m_is_static) { | ||
| // do not fix ctx size to make llama-bench work across test params | ||
| input_shape[2] = -1; | ||
| input_shape[2] = dim_span_ctx; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. llama-bench failed on my side for larger context sizes and for all stateful executions. Stateless: -d 512,1024 fails
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Investigating this now.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The cache wasn't being properly invalidated, and there's no easy way (AFAICT) to get the max possible batch size.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Actually, it looks like your problem also exists without these changes(?)
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is what I see when I run it. I'll try to fix the CPU native cases on my end and file some PRs for that if I can. For now I'm assuming these failures are unrelated, and we can sync on Monday to understand the details here. |
||
| } | ||
| if (is_stateful()) { | ||
| // Convert stateless KV cache layout [1, 1, seq, n_heads_kv * head_size] | ||
| // to stateful layout [1, seq, n_heads_kv, head_size]. | ||
| assert(input_shape.size() == 4 && input_shape[0] == 1 && input_shape[1] == 1 && | ||
| input_shape[2].is_dynamic() && | ||
| input_shape[3] == (m_model_params.n_heads_kv * m_model_params.head_size)); | ||
| input_shape = {input_shape[0], ov::Dimension::dynamic(), m_model_params.n_heads_kv, | ||
| input_shape = {input_shape[0], dim_span_ctx, m_model_params.n_heads_kv, | ||
| m_model_params.head_size}; | ||
| } | ||
|
|
||
| } else if (is_kv_idx(input, op)) { | ||
| // kv update index | ||
| int len = m_is_static ? (m_is_prefill ? m_prefill_chunk_size : 1) : -1; | ||
| input_shape = ov::PartialShape{1, 1, 1, len}; | ||
| if (m_is_static) { | ||
| int len = m_is_prefill ? m_prefill_chunk_size : 1; | ||
| input_shape = ov::PartialShape{1, 1, 1, len}; | ||
| } else { | ||
| input_shape = ov::PartialShape{1, 1, 1, dim_span_ctx}; | ||
| } | ||
|
|
||
| } else { | ||
| input_shape = ov::PartialShape{get_shape(input)}; | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The changes look good to me, but why do we need to remove the one-liners for this?
For example, can we still use something like this?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, I can change it back to how it was.