diff --git a/LLama/Batched/BatchedExecutor.cs b/LLama/Batched/BatchedExecutor.cs
index 1a6698b1a..db9be6a7a 100644
--- a/LLama/Batched/BatchedExecutor.cs
+++ b/LLama/Batched/BatchedExecutor.cs
@@ -43,7 +43,12 @@ public sealed class BatchedExecutor
/// The <see cref="LLamaWeights"/> this executor is using
///
public LLamaWeights Model { get; }
-
+
+ /// <summary>
+ /// The optional <see cref="MtmdWeights"/> this executor is using
+ /// </summary>
+ public MtmdWeights? ClipModel { get; }
+
/// <summary>
/// Get the number of tokens in the batch, waiting for <see cref="Infer"/> to be called
/// </summary>
@@ -79,12 +84,8 @@ public int BatchedTokenCount
///
/// <param name="model">The model to use</param>
/// <param name="contextParams">Parameters to create a new context</param>
- public BatchedExecutor(LLamaWeights model, IContextParams contextParams)
- : this(model, contextParams, null)
- {
- }
-
- public BatchedExecutor(LLamaWeights model, IContextParams contextParams, MtmdWeights? clipModel)
+ /// <param name="clipModel">Clip model to use for multimodal capabilities</param>
+ public BatchedExecutor(LLamaWeights model, IContextParams contextParams, MtmdWeights? clipModel = null)
{
Model = model;
Context = model.CreateContext(contextParams);
@@ -92,8 +93,6 @@ public BatchedExecutor(LLamaWeights model, IContextParams contextParams, MtmdWei
Epoch = 1;
}
- public MtmdWeights? ClipModel { get; }
-
/// <summary>
/// Start a new <see cref="Conversation"/>
/// </summary>
diff --git a/LLama/LLamaEmbedder.cs b/LLama/LLamaEmbedder.cs
index eee9a01e9..5f6a2878d 100644
--- a/LLama/LLamaEmbedder.cs
+++ b/LLama/LLamaEmbedder.cs
@@ -1,14 +1,11 @@
using System;
using System.Collections.Generic;
-using System.Linq;
using System.Threading;
using System.Threading.Tasks;
using LLama.Abstractions;
using LLama.Exceptions;
using LLama.Native;
-using Microsoft.Extensions.AI;
using Microsoft.Extensions.Logging;
-using static System.Net.Mime.MediaTypeNames;
namespace LLama;
@@ -79,7 +76,7 @@ public async Task> GetEmbeddings(string input, Cancellati
Context.Dispose();
Context = _weights.CreateContext(_params, _logger);
- NativeApi.llama_set_embeddings(Context.NativeHandle, true);
+ Context.NativeHandle.SetEmbeddings(true);
// Add all of the tokens to the batch
var tokens = Context.Tokenize(input, special: true);
diff --git a/LLama/LLamaReranker.cs b/LLama/LLamaReranker.cs
index 16a206c40..58919fd0a 100644
--- a/LLama/LLamaReranker.cs
+++ b/LLama/LLamaReranker.cs
@@ -1,11 +1,7 @@
using System;
using System.Collections.Generic;
-using System.IO;
-using System.Linq;
-using System.Text;
using System.Threading;
using System.Threading.Tasks;
-using System.Xml.Linq;
using LLama.Abstractions;
using LLama.Exceptions;
using LLama.Native;
@@ -44,7 +40,7 @@ public LLamaReranker(LLamaWeights weights, IContextParams @params, ILogger? logg
if (@params.PoolingType != LLamaPoolingType.Rank)
throw new NotSupportedException("Computing rank score, PoolingType must be equal to LLamaPoolingType.Rank");
Context = weights.CreateContext(@params, logger);
- NativeApi.llama_set_embeddings(Context.NativeHandle, true);
+ Context.NativeHandle.SetEmbeddings(true);
}
///
diff --git a/LLama/LLamaSharp.csproj b/LLama/LLamaSharp.csproj
index 4bdf9289a..9eff34c4e 100644
--- a/LLama/LLamaSharp.csproj
+++ b/LLama/LLamaSharp.csproj
@@ -57,7 +57,7 @@
- ff4affb4c1aa7eb4_v3
+ 73c9eb8ceda397b
diff --git a/LLama/Native/LLamaFtype.cs b/LLama/Native/LLamaFtype.cs
index 813bad1ae..d48be1855 100644
--- a/LLama/Native/LLamaFtype.cs
+++ b/LLama/Native/LLamaFtype.cs
@@ -206,7 +206,12 @@ public enum LLamaFtype
/// except 1d tensors
///
LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38,
-
+
+ /// <summary>
+ /// Except 1d tensors
+ /// </summary>
+ LLAMA_FTYPE_MOSTLY_NVFP4 = 39,
+
///
/// File type was not specified
///
diff --git a/LLama/Native/LLamaModelQuantizeParams.cs b/LLama/Native/LLamaModelQuantizeParams.cs
index 857f0cfb9..d47c78f11 100644
--- a/LLama/Native/LLamaModelQuantizeParams.cs
+++ b/LLama/Native/LLamaModelQuantizeParams.cs
@@ -79,6 +79,16 @@ public bool keep_split
}
private sbyte _keep_split;
+ /// <summary>
+ /// calculate and show the final quantization size without performing quantization
+ /// </summary>
+ public bool dry_run
+ {
+ get => Convert.ToBoolean(_dry_run);
+ set => _dry_run = Convert.ToSByte(value);
+ }
+ private sbyte _dry_run;
+
///
/// pointer to importance matrix data
///
diff --git a/LLama/Native/LoraAdapter.cs b/LLama/Native/LoraAdapter.cs
index 8fdd649a3..9bc24ce17 100644
--- a/LLama/Native/LoraAdapter.cs
+++ b/LLama/Native/LoraAdapter.cs
@@ -22,30 +22,10 @@ public class LoraAdapter
///
internal IntPtr Pointer { get; }
- ///
- /// Indicates if this adapter has been unloaded
- ///
- internal bool Loaded { get; private set; }
-
internal LoraAdapter(SafeLlamaModelHandle model, string path, IntPtr nativePtr)
{
Model = model;
Path = path;
Pointer = nativePtr;
- Loaded = true;
- }
-
- ///
- /// Unload this adapter
- ///
- public void Unload()
- {
- Loaded = false;
- llama_adapter_lora_free(Pointer);
-
- // Manually free a LoRA adapter. loaded adapters will be free when the associated model is deleted
- [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
- [Obsolete("adapters are now freed together with the associated model")]
- static extern void llama_adapter_lora_free(IntPtr adapter);
}
}
\ No newline at end of file
diff --git a/LLama/Native/NativeApi.Mtmd.cs b/LLama/Native/NativeApi.Mtmd.cs
index 827c0e1b0..5eb75028f 100644
--- a/LLama/Native/NativeApi.Mtmd.cs
+++ b/LLama/Native/NativeApi.Mtmd.cs
@@ -168,7 +168,7 @@ internal static unsafe void mtmd_bitmap_set_id(SafeMtmdEmbed bitmap, string? id)
// tokenize ----------------------------------------------------------
///
- /// Native text structure consumed by .
+ /// Native text structure consumed by .
///
internal unsafe struct mtmd_input_text_native
{
diff --git a/LLama/Native/NativeApi.cs b/LLama/Native/NativeApi.cs
index ce8c36197..381754103 100644
--- a/LLama/Native/NativeApi.cs
+++ b/LLama/Native/NativeApi.cs
@@ -132,19 +132,7 @@ public static void llama_empty_call()
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
public static extern unsafe nuint llama_state_seq_load_file(SafeLLamaContextHandle ctx, string filepath, LLamaSeqId dest_seq_id, LLamaToken* tokens_out, nuint n_token_capacity, out nuint n_token_count_out);
- ///
- /// Set whether to use causal attention or not. If set to true, the model will only attend to the past tokens
- ///
- [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
- public static extern void llama_set_causal_attn(SafeLLamaContextHandle ctx, [MarshalAs(UnmanagedType.U1)] bool causalAttn);
-
- ///
- /// Set whether the context outputs embeddings or not
- ///
- ///
- /// If true, embeddings will be returned but logits will not
- [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
- public static extern void llama_set_embeddings(SafeLLamaContextHandle ctx, [MarshalAs(UnmanagedType.U1)] bool embeddings);
+
///
/// Set abort callback
@@ -152,14 +140,6 @@ public static void llama_empty_call()
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
public static extern void llama_set_abort_callback(SafeLLamaContextHandle ctx, IntPtr /* ggml_abort_callback */ abortCallback, IntPtr abortCallbackData);
- ///
- /// Get the n_seq_max for this context
- ///
- ///
- ///
- [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
- public static extern uint llama_n_seq_max(SafeLLamaContextHandle ctx);
-
///
/// Get all output token embeddings.
/// When pooling_type == LLAMA_POOLING_TYPE_NONE or when using a generative model, the embeddings for which
@@ -515,6 +495,18 @@ public static extern unsafe LLamaParamsFitStatus llama_params_fit(
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
public static extern long llama_time_us();
-
+ /* Directly exposes `ggml_tensor` and `gguf_context` which LLamaSharp does not currently support!
+
+ typedef void (* llama_model_set_tensor_data_t) (struct ggml_tensor * tensor, void* userdata);
+
+ // Create a new model from GGUF metadata as well as a function to set the tensor data
+ // - tensors are created as GGML_TYPE_F32 by default,
+ // override by adding a tensor with the same name but a different name to the context
+ LLAMA_API struct llama_model * llama_model_init_from_user(
+ struct gguf_context * metadata,
+ llama_model_set_tensor_data_t set_tensor_data, // function to initialize tensor data with
+ void* set_tensor_data_ud, // userdata for function
+ struct llama_model_params params);
+ */
}
}
diff --git a/LLama/Native/SafeLLamaContextHandle.cs b/LLama/Native/SafeLLamaContextHandle.cs
index 5ec78f053..71261eefb 100644
--- a/LLama/Native/SafeLLamaContextHandle.cs
+++ b/LLama/Native/SafeLLamaContextHandle.cs
@@ -37,7 +37,7 @@ public sealed class SafeLLamaContextHandle
///
/// Get the number of maximum sequences allowed
///
- public uint MaxSeq => NativeApi.llama_n_seq_max(this);
+ public uint MaxSeq => llama_n_seq_max(this);
///
/// Get or set the number of threads used for generation of a single token.
@@ -355,6 +355,7 @@ static SafeLLamaContextHandle()
///
/// The length of the value string (on success) -1 otherwise
[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
+ // ReSharper disable once InconsistentNaming
private static extern int llama_adapter_meta_val_str(IntPtr adapter, string key, StringBuilder buf, UIntPtr buf_size);
///
@@ -374,6 +375,7 @@ static SafeLLamaContextHandle()
///
/// The length of string i.e meta key (on success) -1 otherwise
[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
+ // ReSharper disable once InconsistentNaming
private static extern int llama_adapter_meta_key_by_index(IntPtr adapter, int i, StringBuilder buf, UIntPtr buf_size);
///
@@ -385,6 +387,7 @@ static SafeLLamaContextHandle()
///
/// The length of value string (on success) -1 otherwise
[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
+ // ReSharper disable once InconsistentNaming
private static extern int llama_adapter_meta_val_by_index(IntPtr adapter, int i, StringBuilder buf, UIntPtr buf_size);
///
@@ -424,6 +427,56 @@ static SafeLLamaContextHandle()
///
[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
private static extern void llama_set_warmup(SafeLLamaContextHandle ctx, [MarshalAs(UnmanagedType.U1)] bool warmup);
+
+ /// <summary>
+ /// Set whether to use causal attention or not. If set to true, the model will only attend to the past tokens
+ /// </summary>
+ [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
+ private static extern void llama_set_causal_attn(SafeLLamaContextHandle ctx, [MarshalAs(UnmanagedType.U1)] bool causalAttn);
+
+ /// <summary>
+ /// Set whether the context outputs embeddings or not
+ /// </summary>
+ /// <param name="ctx"></param>
+ /// <param name="embeddings">If true, embeddings will be returned but logits will not</param>
+ [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
+ private static extern void llama_set_embeddings(SafeLLamaContextHandle ctx, [MarshalAs(UnmanagedType.U1)] bool embeddings);
+
+ /// <summary>
+ /// Get the n_seq_max for this context
+ /// </summary>
+ /// <param name="ctx"></param>
+ /// <returns></returns>
+ [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
+ private static extern uint llama_n_seq_max(SafeLLamaContextHandle ctx);
+ #endregion
+
+ #region Setters
+ /// <summary>
+ /// Set whether the model is in warmup mode or not
+ /// If true, all model tensors are activated during <see cref="Decode"/> to load and cache their weights.
+ /// </summary>
+ public void SetWarmup(bool value)
+ {
+ llama_set_warmup(this, value);
+ }
+
+ /// <summary>
+ /// Set whether to use causal attention or not. If set to true, the model will only attend to the past tokens
+ /// </summary>
+ public void SetCausalAttention(bool value)
+ {
+ llama_set_causal_attn(this, value);
+ }
+
+ /// <summary>
+ /// Set whether the context outputs embeddings or not
+ /// </summary>
+ /// <param name="value">If true, embeddings will be returned but logits will not</param>
+ public void SetEmbeddings(bool value)
+ {
+ llama_set_embeddings(this, value);
+ }
#endregion
#region LoRA
@@ -434,14 +487,10 @@ static SafeLLamaContextHandle()
///
public void SetLoraAdapters(params Span<(LoraAdapter Adapter, float Scale)> adapters)
{
- // Check adapters are all valid
+ // Check adapters are all valid and attached to this model
foreach (var adapter in adapters)
- {
if (adapter.Adapter.Model != ModelHandle)
throw new ArgumentException("Cannot add LoRA adapter which was loaded for a different model");
- if (!adapter.Adapter.Loaded)
- throw new ArgumentException("Cannot add LoRA adapter which has been unloaded");
- }
// Copy data into buffers
Span<IntPtr> adapterPtrs = stackalloc IntPtr[adapters.Length];
diff --git a/LLama/Native/SafeLlamaModelHandle.cs b/LLama/Native/SafeLlamaModelHandle.cs
index 668074090..2a6855741 100644
--- a/LLama/Native/SafeLlamaModelHandle.cs
+++ b/LLama/Native/SafeLlamaModelHandle.cs
@@ -175,7 +175,7 @@ static SafeLlamaModelHandle()
private static extern unsafe byte* llama_model_chat_template(SafeLlamaModelHandle model, string? name);
///
- /// Load the model from a file
+ /// Load a model from a file
/// If the file is split into multiple parts, the file name must follow this pattern: {name}-%05d-of-%05d.gguf
/// If the split file name does not follow this pattern, use llama_model_load_from_splits
///
@@ -186,7 +186,7 @@ static SafeLlamaModelHandle()
private static extern SafeLlamaModelHandle llama_model_load_from_file(string path, LLamaModelParams @params);
///
- /// Load the model from multiple splits (support custom naming scheme)
+ /// Load a model from multiple splits (support custom naming scheme)
/// The paths must be in the correct order
///
///
@@ -460,7 +460,7 @@ private static int llama_model_meta_val_str(SafeLlamaModelHandle model, string k
///
///
[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
- private static extern string? llama_model_cls_label(SafeLlamaModelHandle model, uint i);
+ private static extern IntPtr /* char* */ llama_model_cls_label(SafeLlamaModelHandle model, uint i);
#endregion
#region LoRA
diff --git a/llama.cpp b/llama.cpp
index ff4affb4c..73c9eb8ce 160000
--- a/llama.cpp
+++ b/llama.cpp
@@ -1 +1 @@
-Subproject commit ff4affb4c1aa7eb4f28a0d9de1b205bd719802f2
+Subproject commit 73c9eb8ceda397b651dbb6661b2935f0283a2b1d