From e4d7c6cb2ed37e97153e8834f65ba91bdbf9fe93 Mon Sep 17 00:00:00 2001 From: Martin Evans Date: Sat, 14 Mar 2026 14:30:13 +0000 Subject: [PATCH] - Updated binaries to 73c9eb8ceda397b651dbb6661b2935f0283a2b1d (Qwen3.5 support!) - Removed deprecated native func `llama_adapter_lora_free` and related managed method `LoraAdapter.Unload` --- LLama/Batched/BatchedExecutor.cs | 17 ++++--- LLama/LLamaEmbedder.cs | 5 +- LLama/LLamaReranker.cs | 6 +-- LLama/LLamaSharp.csproj | 2 +- LLama/Native/LLamaFtype.cs | 7 ++- LLama/Native/LLamaModelQuantizeParams.cs | 10 ++++ LLama/Native/LoraAdapter.cs | 20 -------- LLama/Native/NativeApi.Mtmd.cs | 2 +- LLama/Native/NativeApi.cs | 36 ++++++-------- LLama/Native/SafeLLamaContextHandle.cs | 61 +++++++++++++++++++++--- LLama/Native/SafeLlamaModelHandle.cs | 6 +-- llama.cpp | 2 +- 12 files changed, 101 insertions(+), 73 deletions(-) diff --git a/LLama/Batched/BatchedExecutor.cs b/LLama/Batched/BatchedExecutor.cs index 1a6698b1a..db9be6a7a 100644 --- a/LLama/Batched/BatchedExecutor.cs +++ b/LLama/Batched/BatchedExecutor.cs @@ -43,7 +43,12 @@ public sealed class BatchedExecutor /// The this executor is using /// public LLamaWeights Model { get; } - + + /// + /// The optional this executor is using + /// + public MtmdWeights? ClipModel { get; } + /// /// Get the number of tokens in the batch, waiting for to be called /// @@ -79,12 +84,8 @@ public int BatchedTokenCount /// /// The model to use /// Parameters to create a new context - public BatchedExecutor(LLamaWeights model, IContextParams contextParams) - : this(model, contextParams, null) - { - } - - public BatchedExecutor(LLamaWeights model, IContextParams contextParams, MtmdWeights? clipModel) + /// Clip model to use for multimodal capabilities + public BatchedExecutor(LLamaWeights model, IContextParams contextParams, MtmdWeights? clipModel = null) { Model = model; Context = model.CreateContext(contextParams); @@ -92,8 +93,6 @@ public BatchedExecutor(LLamaWeights model, IContextParams contextParams, MtmdWei Epoch = 1; } - public MtmdWeights? ClipModel { get; } - /// /// Start a new /// diff --git a/LLama/LLamaEmbedder.cs b/LLama/LLamaEmbedder.cs index eee9a01e9..5f6a2878d 100644 --- a/LLama/LLamaEmbedder.cs +++ b/LLama/LLamaEmbedder.cs @@ -1,14 +1,11 @@ using System; using System.Collections.Generic; -using System.Linq; using System.Threading; using System.Threading.Tasks; using LLama.Abstractions; using LLama.Exceptions; using LLama.Native; -using Microsoft.Extensions.AI; using Microsoft.Extensions.Logging; -using static System.Net.Mime.MediaTypeNames; namespace LLama; @@ -79,7 +76,7 @@ public async Task> GetEmbeddings(string input, Cancellati Context.Dispose(); Context = _weights.CreateContext(_params, _logger); - NativeApi.llama_set_embeddings(Context.NativeHandle, true); + Context.NativeHandle.SetEmbeddings(true); // Add all of the tokens to the batch var tokens = Context.Tokenize(input, special: true); diff --git a/LLama/LLamaReranker.cs b/LLama/LLamaReranker.cs index 16a206c40..58919fd0a 100644 --- a/LLama/LLamaReranker.cs +++ b/LLama/LLamaReranker.cs @@ -1,11 +1,7 @@ using System; using System.Collections.Generic; -using System.IO; -using System.Linq; -using System.Text; using System.Threading; using System.Threading.Tasks; -using System.Xml.Linq; using LLama.Abstractions; using LLama.Exceptions; using LLama.Native; @@ -44,7 +40,7 @@ public LLamaReranker(LLamaWeights weights, IContextParams @params, ILogger? logg if (@params.PoolingType != LLamaPoolingType.Rank) throw new NotSupportedException("Computing rank score, PoolingType must be equal to LLamaPoolingType.Rank"); Context = weights.CreateContext(@params, logger); - NativeApi.llama_set_embeddings(Context.NativeHandle, true); + Context.NativeHandle.SetEmbeddings(true); } /// diff --git a/LLama/LLamaSharp.csproj b/LLama/LLamaSharp.csproj index 4bdf9289a..9eff34c4e 100644 --- a/LLama/LLamaSharp.csproj +++ b/LLama/LLamaSharp.csproj @@ -57,7 +57,7 @@ - ff4affb4c1aa7eb4_v3 + 73c9eb8ceda397b diff --git a/LLama/Native/LLamaFtype.cs b/LLama/Native/LLamaFtype.cs index 813bad1ae..d48be1855 100644 --- a/LLama/Native/LLamaFtype.cs +++ b/LLama/Native/LLamaFtype.cs @@ -206,7 +206,12 @@ public enum LLamaFtype /// except 1d tensors /// LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, - + + /// + /// Except 1d tensors + /// + LLAMA_FTYPE_MOSTLY_NVFP4 = 39, + /// /// File type was not specified /// diff --git a/LLama/Native/LLamaModelQuantizeParams.cs b/LLama/Native/LLamaModelQuantizeParams.cs index 857f0cfb9..d47c78f11 100644 --- a/LLama/Native/LLamaModelQuantizeParams.cs +++ b/LLama/Native/LLamaModelQuantizeParams.cs @@ -79,6 +79,16 @@ public bool keep_split } private sbyte _keep_split; + /// + /// calculate and show the final quantization size without performing quantization + /// + public bool dry_run + { + get => Convert.ToBoolean(_dry_run); + set => _dry_run = Convert.ToSByte(value); + } + private sbyte _dry_run; + /// /// pointer to importance matrix data /// diff --git a/LLama/Native/LoraAdapter.cs b/LLama/Native/LoraAdapter.cs index 8fdd649a3..9bc24ce17 100644 --- a/LLama/Native/LoraAdapter.cs +++ b/LLama/Native/LoraAdapter.cs @@ -22,30 +22,10 @@ public class LoraAdapter /// internal IntPtr Pointer { get; } - /// - /// Indicates if this adapter has been unloaded - /// - internal bool Loaded { get; private set; } - internal LoraAdapter(SafeLlamaModelHandle model, string path, IntPtr nativePtr) { Model = model; Path = path; Pointer = nativePtr; - Loaded = true; - } - - /// - /// Unload this adapter - /// - public void Unload() - { - Loaded = false; - llama_adapter_lora_free(Pointer); - - // Manually free a LoRA adapter. loaded adapters will be free when the associated model is deleted - [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] - [Obsolete("adapters are now freed together with the associated model")] - static extern void llama_adapter_lora_free(IntPtr adapter); } } \ No newline at end of file diff --git a/LLama/Native/NativeApi.Mtmd.cs b/LLama/Native/NativeApi.Mtmd.cs index 827c0e1b0..5eb75028f 100644 --- a/LLama/Native/NativeApi.Mtmd.cs +++ b/LLama/Native/NativeApi.Mtmd.cs @@ -168,7 +168,7 @@ internal static unsafe void mtmd_bitmap_set_id(SafeMtmdEmbed bitmap, string? id) // tokenize ---------------------------------------------------------- /// - /// Native text structure consumed by . + /// Native text structure consumed by . /// internal unsafe struct mtmd_input_text_native { diff --git a/LLama/Native/NativeApi.cs b/LLama/Native/NativeApi.cs index ce8c36197..381754103 100644 --- a/LLama/Native/NativeApi.cs +++ b/LLama/Native/NativeApi.cs @@ -132,19 +132,7 @@ public static void llama_empty_call() [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] public static extern unsafe nuint llama_state_seq_load_file(SafeLLamaContextHandle ctx, string filepath, LLamaSeqId dest_seq_id, LLamaToken* tokens_out, nuint n_token_capacity, out nuint n_token_count_out); - /// - /// Set whether to use causal attention or not. If set to true, the model will only attend to the past tokens - /// - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - public static extern void llama_set_causal_attn(SafeLLamaContextHandle ctx, [MarshalAs(UnmanagedType.U1)] bool causalAttn); - - /// - /// Set whether the context outputs embeddings or not - /// - /// - /// If true, embeddings will be returned but logits will not - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - public static extern void llama_set_embeddings(SafeLLamaContextHandle ctx, [MarshalAs(UnmanagedType.U1)] bool embeddings); + /// /// Set abort callback @@ -152,14 +140,6 @@ public static void llama_empty_call() [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] public static extern void llama_set_abort_callback(SafeLLamaContextHandle ctx, IntPtr /* ggml_abort_callback */ abortCallback, IntPtr abortCallbackData); - /// - /// Get the n_seq_max for this context - /// - /// - /// - [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] - public static extern uint llama_n_seq_max(SafeLLamaContextHandle ctx); - /// /// Get all output token embeddings. /// When pooling_type == LLAMA_POOLING_TYPE_NONE or when using a generative model, the embeddings for which @@ -515,6 +495,18 @@ public static extern unsafe LLamaParamsFitStatus llama_params_fit( [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)] public static extern long llama_time_us(); - + /* Directly exposes `ggml_tensor` and `gguf_context` which LLamaSharp does not currently support! + + typedef void (* llama_model_set_tensor_data_t) (struct ggml_tensor * tensor, void* userdata); + + // Create a new model from GGUF metadata as well as a function to set the tensor data + // - tensors are created as GGML_TYPE_F32 by default, + // override by adding a tensor with the same name but a different name to the context + LLAMA_API struct llama_model * llama_model_init_from_user( + struct gguf_context * metadata, + llama_model_set_tensor_data_t set_tensor_data, // function to initialize tensor data with + void* set_tensor_data_ud, // userdata for function + struct llama_model_params params); + */ } } diff --git a/LLama/Native/SafeLLamaContextHandle.cs b/LLama/Native/SafeLLamaContextHandle.cs index 5ec78f053..71261eefb 100644 --- a/LLama/Native/SafeLLamaContextHandle.cs +++ b/LLama/Native/SafeLLamaContextHandle.cs @@ -37,7 +37,7 @@ public sealed class SafeLLamaContextHandle /// /// Get the number of maximum sequences allowed /// - public uint MaxSeq => NativeApi.llama_n_seq_max(this); + public uint MaxSeq => llama_n_seq_max(this); /// /// Get or set the number of threads used for generation of a single token. @@ -355,6 +355,7 @@ static SafeLLamaContextHandle() /// /// The length of the value string (on success) -1 otherwise [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] + // ReSharper disable once InconsistentNaming private static extern int llama_adapter_meta_val_str(IntPtr adapter, string key, StringBuilder buf, UIntPtr buf_size); /// @@ -374,6 +375,7 @@ static SafeLLamaContextHandle() /// /// The length of string i.e meta key (on success) -1 otherwise [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] + // ReSharper disable once InconsistentNaming private static extern int llama_adapter_meta_key_by_index(IntPtr adapter, int i, StringBuilder buf, UIntPtr buf_size); /// @@ -385,6 +387,7 @@ static SafeLLamaContextHandle() /// /// The length of value string (on success) -1 otherwise [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] + // ReSharper disable once InconsistentNaming private static extern int llama_adapter_meta_val_by_index(IntPtr adapter, int i, StringBuilder buf, UIntPtr buf_size); /// @@ -424,6 +427,56 @@ static SafeLLamaContextHandle() /// [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] private static extern void llama_set_warmup(SafeLLamaContextHandle ctx, [MarshalAs(UnmanagedType.U1)] bool warmup); + + /// + /// Set whether to use causal attention or not. If set to true, the model will only attend to the past tokens + /// + [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] + private static extern void llama_set_causal_attn(SafeLLamaContextHandle ctx, [MarshalAs(UnmanagedType.U1)] bool causalAttn); + + /// + /// Set whether the context outputs embeddings or not + /// + /// + /// If true, embeddings will be returned but logits will not + [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] + private static extern void llama_set_embeddings(SafeLLamaContextHandle ctx, [MarshalAs(UnmanagedType.U1)] bool embeddings); + + /// + /// Get the n_seq_max for this context + /// + /// + /// + [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] + private static extern uint llama_n_seq_max(SafeLLamaContextHandle ctx); + #endregion + + #region Setters + /// + /// Set whether the model is in warmup mode or not + /// If true, all model tensors are activated during to load and cache their weights. + /// + public void SetWarmup(bool value) + { + llama_set_warmup(this, value); + } + + /// + /// Set whether to use causal attention or not. If set to true, the model will only attend to the past tokens + /// + public void SetCausalAttention(bool value) + { + llama_set_causal_attn(this, value); + } + + /// + /// Set whether the context outputs embeddings or not + /// + /// If true, embeddings will be returned but logits will not + public void SetEmbeddings(bool value) + { + llama_set_embeddings(this, value); + } #endregion #region LoRA @@ -434,14 +487,10 @@ static SafeLLamaContextHandle() /// public void SetLoraAdapters(params Span<(LoraAdapter Adapter, float Scale)> adapters) { - // Check adapters are all valid + // Check adapters are all valid and attached to this model foreach (var adapter in adapters) - { if (adapter.Adapter.Model != ModelHandle) throw new ArgumentException("Cannot add LoRA adapter which was loaded for a different model"); - if (!adapter.Adapter.Loaded) - throw new ArgumentException("Cannot add LoRA adapter which has been unloaded"); - } // Copy data into buffers Span adapterPtrs = stackalloc IntPtr[adapters.Length]; diff --git a/LLama/Native/SafeLlamaModelHandle.cs b/LLama/Native/SafeLlamaModelHandle.cs index 668074090..2a6855741 100644 --- a/LLama/Native/SafeLlamaModelHandle.cs +++ b/LLama/Native/SafeLlamaModelHandle.cs @@ -175,7 +175,7 @@ static SafeLlamaModelHandle() private static extern unsafe byte* llama_model_chat_template(SafeLlamaModelHandle model, string? name); /// - /// Load the model from a file + /// Load a model from a file /// If the file is split into multiple parts, the file name must follow this pattern: {name}-%05d-of-%05d.gguf /// If the split file name does not follow this pattern, use llama_model_load_from_splits /// @@ -186,7 +186,7 @@ static SafeLlamaModelHandle() private static extern SafeLlamaModelHandle llama_model_load_from_file(string path, LLamaModelParams @params); /// - /// Load the model from multiple splits (support custom naming scheme) + /// Load a model from multiple splits (support custom naming scheme) /// The paths must be in the correct order /// /// @@ -460,7 +460,7 @@ private static int llama_model_meta_val_str(SafeLlamaModelHandle model, string k /// /// [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)] - private static extern string? llama_model_cls_label(SafeLlamaModelHandle model, uint i); + private static extern IntPtr /* char* */ llama_model_cls_label(SafeLlamaModelHandle model, uint i); #endregion #region LoRA diff --git a/llama.cpp b/llama.cpp index ff4affb4c..73c9eb8ce 160000 --- a/llama.cpp +++ b/llama.cpp @@ -1 +1 @@ -Subproject commit ff4affb4c1aa7eb4f28a0d9de1b205bd719802f2 +Subproject commit 73c9eb8ceda397b651dbb6661b2935f0283a2b1d