Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 38 additions & 13 deletions ggml/src/ggml-openvino/ggml-decoder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,7 @@ int extract_layer_from_name(const std::string & name) {
return layer;
}

std::pair<ModelParams, ComputeParams> GgmlOvDecoder::compute_llm_params(ggml_cgraph * cgraph, bool is_static) {
std::pair<ModelParams, ComputeParams> GgmlOvDecoder::compute_llm_params(ggml_cgraph * cgraph, bool is_static, std::optional<ComputeParams> old_c_params) {
ModelParams model_params;
ComputeParams compute_params;
for (int i = 0; i < cgraph->n_nodes; i++) {
Expand All @@ -268,6 +268,9 @@ std::pair<ModelParams, ComputeParams> GgmlOvDecoder::compute_llm_params(ggml_cgr
model_params.head_size = node->src[0]->ne[0];
compute_params.input_len = node->src[0]->ne[1];

const auto old_c_prefill_chunk_max = old_c_params.has_value() ? old_c_params.value().prefill_chunk_max : 1;
compute_params.prefill_chunk_max = std::max(static_cast<int>(old_c_prefill_chunk_max), static_cast<int>(compute_params.input_len));

auto * cache_k_perm = node->src[1];
if (cache_k_perm->op == GGML_OP_CPY) {
cache_k_perm = cache_k_perm->src[0];
Expand Down Expand Up @@ -332,51 +335,73 @@ void GgmlOvDecoder::validate_cgraph() const {

ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input) const {
if (m_naive) {
return input!= nullptr ? ov::PartialShape{get_shape(input)} : ov::PartialShape{get_shape(op)};
return input != nullptr ? ov::PartialShape{get_shape(input)} : ov::PartialShape{get_shape(op)};
}
auto name = std::string(input->name);
ov::PartialShape input_shape;

// ggml_tensor gives us exact measurements in all cases, so none of those should be -1 (as that's an
// OpenVINO native convention). All of those are passed through directly.
//
// Cases where a tensor dimension size varies are handled case-by-case below. We provide a PartialShape to
// communicate the worst-case scenario: a PartialShape has a lower and upper bound on the dimension,
// used to inform OpenVINO optimizations. An issue was observed with OpenCL remote buffers not allocating
// unless such a range was provided (considerations with remote memory). Although that's not the responsibility
// of llama.cpp to solve, providing dimension bounds is useful nonetheless.

const auto prefill_upper = m_is_prefill ? m_prefill_chunk_size : 1;
const auto dim_span_ctx = ov::Dimension(1, m_model_params.ctx);

if (is_inp_tok(input, op) || is_inp_pos(input, op)) {
// tokens or positions
int len = m_is_static ? (m_is_prefill ? m_prefill_chunk_size : 1) : -1;
input_shape = ov::PartialShape{1, 1, 1, len};
if (m_is_static) {
input_shape = ov::PartialShape{1, 1, 1, prefill_upper};
} else {
input_shape = ov::PartialShape{1, 1, 1, ov::Dimension(1, m_compute_params.prefill_chunk_max)};
}

} else if (is_output_idx(input, op)) {
// output index
input_shape = ov::PartialShape{1, 1, 1, m_is_static ? m_compute_params.output_len : -1};

if (m_is_static) {
input_shape = ov::PartialShape{1, 1, 1, m_compute_params.output_len};
} else {
input_shape = ov::PartialShape{1, 1, 1, ov::Dimension(1, m_compute_params.output_len)};
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The changes look good to me but why do we need to remove one liners for this?

For example, can we still use something like this?

input_shape = ov::PartialShape{1, 1, 1, m_is_static ?
                        m_compute_params.output_len : ov::Dimension(1, m_compute_params.output_len)};

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yea i can change it back to how it was

}
} else if (is_inp_mask(input, op)) {
// mask
if (m_is_static) {
input_shape = ov::PartialShape{1, 1, m_is_prefill ? m_prefill_chunk_size : 1, m_model_params.ctx};
input_shape = ov::PartialShape{1, 1, prefill_upper, m_model_params.ctx};
} else if (m_is_stateful) {
input_shape = ov::PartialShape{1, 1, -1, -1};
input_shape = ov::PartialShape{1, 1, dim_span_ctx, dim_span_ctx};
} else {
input_shape = ov::PartialShape{-1, 1, -1, -1};
input_shape = ov::PartialShape{dim_span_ctx, 1, dim_span_ctx, dim_span_ctx};
}

} else if (is_kvcache(input, op)) {
// kvcache
input_shape = ov::PartialShape{get_shape(input)};
if (!m_is_static) {
// do not fix ctx size to make llama-bench work across test params
input_shape[2] = -1;
input_shape[2] = dim_span_ctx;
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

llama-bench failed on my side for larger context sizes and for all stateful executions.
tested with llama3.2 1B q4_0

stateless: -d 512,1024 fails
stateful: all ctx sizes fail

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

investigating this now

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

cache wasn't being properly invalidated and there's no easy way (AFAICT) to get the max possible batch size

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

actually it looks like your problem also exists without these changes(?)

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

set_n_threads: n_threads = 6, n_threads_batch = 6
GGML OpenVINO backend std::exception: might be outdated RESHAPE case
graph_compute: ggml_backend_sched_graph_compute_async failed with error -1
process_ubatch: failed to compute graph, compute status: -1
decode: removing memory module entries for seq_id = 0, pos = [0, +inf)
llama_decode: failed to decode, ret = -3
test_prompt: failed to decode prompt batch, res = -3
main: error: failed to run prompt warmup

is what I see when I run ./bin/llama-bench --batch-size 128 --ubatch-size 128 -m [the llama 3.2 1b instruct q4_0 GGUF] --verbose

I'll try and fix the CPU native cases on my end and file some PRs for that if I can. For now I'm assuming these failures are unrelated and we can sync on Monday to understand the details here.

}
if (is_stateful()) {
// Convert stateless KV cache layout [1, 1, seq, n_heads_kv * head_size]
// to stateful layout [1, seq, n_heads_kv, head_size].
assert(input_shape.size() == 4 && input_shape[0] == 1 && input_shape[1] == 1 &&
input_shape[2].is_dynamic() &&
input_shape[3] == (m_model_params.n_heads_kv * m_model_params.head_size));
input_shape = {input_shape[0], ov::Dimension::dynamic(), m_model_params.n_heads_kv,
input_shape = {input_shape[0], dim_span_ctx, m_model_params.n_heads_kv,
m_model_params.head_size};
}

} else if (is_kv_idx(input, op)) {
// kv update index
int len = m_is_static ? (m_is_prefill ? m_prefill_chunk_size : 1) : -1;
input_shape = ov::PartialShape{1, 1, 1, len};
if (m_is_static) {
int len = m_is_prefill ? m_prefill_chunk_size : 1;
input_shape = ov::PartialShape{1, 1, 1, len};
} else {
input_shape = ov::PartialShape{1, 1, 1, dim_span_ctx};
}

} else {
input_shape = ov::PartialShape{get_shape(input)};
Expand Down
30 changes: 26 additions & 4 deletions ggml/src/ggml-openvino/ggml-decoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,28 @@ struct ComputeParams {
int token_len_per_seq = -1;
int past_kv_len = -1;
int output_len = 1;

// Batch size is provided as a PartialShape since it can vary from call
// to call. There *is* a maximum value, but it isn't easy to pull that
// at this point, so we track the current maximum we've seen so far.
// This is the upper bound on the PartialShape (lower bound is 1), so
// we re-build in cases where the maximum has been exceeded (and the value
// is saved for the next iteration, the cycle repeats)
//
// There is an implicit re-build done within OpenVINO for cases where the
// current batch size is within the range from minimum to maximum, so we only
// care about growing the bounds.
int prefill_chunk_max = 1;

// Dynamic-shape reuse check: the cached model can serve `other` only if
// its compiled prefill-chunk upper bound already covers other's demand.
bool can_reuse_dynamically(const ComputeParams & other) const {
    GGML_ASSERT(other.prefill_chunk_max >= 1);
    return other.prefill_chunk_max <= prefill_chunk_max;
}

// Static-shape reuse check. Currently identical to the dynamic policy —
// only the prefill chunk bound matters — but kept as a separate entry
// point so the two policies can diverge without touching callers.
bool can_reuse_statically(const ComputeParams & other) const {
    GGML_ASSERT(other.prefill_chunk_max >= 1);
    return other.prefill_chunk_max <= prefill_chunk_max;
}
};

class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
Expand All @@ -68,9 +90,9 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {
ComputeParams & compute_params,
std::map<std::string, std::shared_ptr<ov::Node>> & model_weights,
bool is_static,
bool is_stateful = false,
bool is_prefill = false,
int prefill_chunk_size = 256);
bool is_stateful,
bool is_prefill,
int prefill_chunk_size);

// Naive graph decoder
GgmlOvDecoder(ggml_cgraph * cgraph, std::map<std::string, std::shared_ptr<ov::Node>> & model_weights);
Expand Down Expand Up @@ -190,7 +212,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder {

// Release every cached weight node held by this decoder.
void clear_model_weights() {
    m_model_weights.clear();
}

static std::pair<ModelParams, ComputeParams> compute_llm_params(ggml_cgraph * cgraph, bool is_static);
static std::pair<ModelParams, ComputeParams> compute_llm_params(ggml_cgraph * cgraph, bool is_static, std::optional<ComputeParams> old_c_params);

ModelParams get_model_params() const { return m_model_params; }

Expand Down
16 changes: 11 additions & 5 deletions ggml/src/ggml-openvino/utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,8 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
const auto & config = ggml_openvino_get_compile_config();
auto device = r_ctx->device;
bool stateful = r_ctx->stateful;
bool prefill = false;
// Must be int, not bool: declaring this as `bool` silently collapses 256
// to true (1) before it is forwarded to the GgmlOvDecoder constructor's
// int prefill_chunk_size parameter.
int prefill_chunk_size = 256;  // TODO: make configurable instead of hard-coding
static auto is_static = false;

if (is_naive(cgraph)) {
Expand All @@ -95,7 +97,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
std::shared_ptr<ov::InferRequest> infer_request;
ModelParams m_params;
ComputeParams c_params;
std::tie(m_params, c_params) = GgmlOvDecoder::compute_llm_params(cgraph, is_static);
std::tie(m_params, c_params) = GgmlOvDecoder::compute_llm_params(cgraph, is_static, std::optional<ComputeParams>());

graph_key key(cgraph);
bool cache_hit;
Expand All @@ -112,10 +114,12 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<

cache_hit = it != r_ctx->decoder_cache.end();
ModelParams old_m_params;
ComputeParams old_c_params;
if (cache_hit) {
ggml_decoder = it->second;
old_m_params = ggml_decoder->get_model_params();
cache_hit = old_m_params.can_reuse_dynamically(m_params);
old_c_params = ggml_decoder->get_compute_params();
cache_hit = old_m_params.can_reuse_dynamically(m_params) && old_c_params.can_reuse_dynamically(c_params);
}

if (cache_hit) {
Expand Down Expand Up @@ -175,7 +179,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr<
std::shared_ptr<ov::Model> model;
auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph);

ggml_decoder = std::make_shared<GgmlOvDecoder>(cgraph, m_params, c_params, model_weights, is_static, stateful);
ggml_decoder = std::make_shared<GgmlOvDecoder>(cgraph, m_params, c_params, model_weights, is_static, stateful, prefill, prefill_chunk_size);
decoder_end_time = ggml_time_us();

auto input_model = std::make_shared<ov::frontend::ggml::InputModel>(ggml_decoder);
Expand Down Expand Up @@ -294,7 +298,7 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o
std::shared_ptr<ov::InferRequest> infer_request;
ModelParams m_params;
ComputeParams c_params;
std::tie(m_params, c_params) = GgmlOvDecoder::compute_llm_params(cgraph, is_static);
std::tie(m_params, c_params) = GgmlOvDecoder::compute_llm_params(cgraph, is_static, std::optional<ComputeParams>());

const auto * inp_pos = get_inp_pos_tensor(cgraph);
const auto is_prefill = get_is_prefill(inp_pos);
Expand All @@ -310,10 +314,12 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr<o

cache_hit = it != r_ctx->decoder_cache.end();
ModelParams old_m_params;
ComputeParams old_c_params;
if (cache_hit) {
ggml_decoder = it->second;
old_m_params = ggml_decoder->get_model_params();
cache_hit = old_m_params.can_reuse_statically(m_params);
old_c_params = ggml_decoder->get_compute_params();
cache_hit = old_m_params.can_reuse_statically(m_params) && old_c_params.can_reuse_statically(c_params);
}

if (cache_hit) {
Expand Down