diff --git a/src/Build5Nines.SharpVector.OpenAI/Embeddings/OpenAIEmbeddingsGenerator.cs b/src/Build5Nines.SharpVector.OpenAI/Embeddings/OpenAIEmbeddingsGenerator.cs index c7a5e6b..2d2eb7f 100644 --- a/src/Build5Nines.SharpVector.OpenAI/Embeddings/OpenAIEmbeddingsGenerator.cs +++ b/src/Build5Nines.SharpVector.OpenAI/Embeddings/OpenAIEmbeddingsGenerator.cs @@ -1,9 +1,11 @@ using Build5Nines.SharpVector.Embeddings; using OpenAI.Embeddings; +using System.Collections.Generic; +using System.Linq; namespace Build5Nines.SharpVector.OpenAI.Embeddings; -public class OpenAIEmbeddingsGenerator : IEmbeddingsGenerator +public class OpenAIEmbeddingsGenerator : IEmbeddingsGenerator //IBatchEmbeddingsGenerator { protected EmbeddingClient EmbeddingClient { get; private set; } @@ -18,4 +20,29 @@ public async Task GenerateEmbeddingsAsync(string text) var vector = embedding.ToFloats(); return vector.ToArray(); } + + /// + /// Generates embeddings for a batch of input texts using the OpenAI embeddings client. + /// This leverages the API's multi-input batching for improved throughput and reduced overhead. + /// + /// Collection of non-empty texts to embed. + /// A list of float vectors aligned to the input order. + public async Task> GenerateEmbeddingsAsync(IEnumerable texts) + { + if (texts is null) throw new ArgumentNullException(nameof(texts)); + + var inputs = texts.ToList(); + if (inputs.Count == 0) + { + return Array.Empty(); + } + + // Call the batch embeddings API once for all inputs. + var batchResult = await EmbeddingClient.GenerateEmbeddingsAsync(inputs); + + // Map the embeddings to float arrays while preserving order. + var vectors = batchResult.Value.Select(e => e.ToFloats().ToArray()).ToList(); + + return vectors; + } } \ No newline at end of file diff --git a/src/Build5Nines.SharpVector/Build5Nines.SharpVector.csproj b/src/Build5Nines.SharpVector/Build5Nines.SharpVector.csproj index 5d8ec2d..8183a8e 100644 --- a/src/Build5Nines.SharpVector/Build5Nines.SharpVector.csproj +++ b/src/Build5Nines.SharpVector/Build5Nines.SharpVector.csproj @@ -9,7 +9,7 @@ Build5Nines.SharpVector https://sharpvector.build5nines.com https://github.com/Build5Nines/SharpVector - 2.1.2 + 2.1.3 Lightweight In-memory Vector Database to embed in any .NET Applications Copyright (c) 2025 Build5Nines LLC README.md diff --git a/src/Build5Nines.SharpVector/Embeddings/IBatchEmbeddingsGenerator.cs b/src/Build5Nines.SharpVector/Embeddings/IBatchEmbeddingsGenerator.cs new file mode 100644 index 0000000..5a38eaa --- /dev/null +++ b/src/Build5Nines.SharpVector/Embeddings/IBatchEmbeddingsGenerator.cs @@ -0,0 +1,15 @@ +namespace Build5Nines.SharpVector.Embeddings; + +/// +/// Optional capability for embeddings generators to support batch embedding of multiple texts. +/// Implementations can leverage provider APIs that accept multi-input requests for better performance. +/// +public interface IBatchEmbeddingsGenerator : IEmbeddingsGenerator +{ + /// + /// Generates embeddings for multiple input texts in a single call when supported. + /// + /// Collection of texts to embed. Order should be preserved in output. + /// A read-only list of embeddings vectors corresponding to the input order. + Task> GenerateEmbeddingsAsync(IEnumerable texts); +} diff --git a/src/Build5Nines.SharpVector/IVectorDatabase.cs b/src/Build5Nines.SharpVector/IVectorDatabase.cs index 47c163e..68ef7ea 100644 --- a/src/Build5Nines.SharpVector/IVectorDatabase.cs +++ b/src/Build5Nines.SharpVector/IVectorDatabase.cs @@ -28,6 +28,13 @@ public interface IVectorDatabase /// Task AddTextAsync(TDocument text, TMetadata? metadata = default(TMetadata)); + /// + /// Adds multiple texts with Metadata to the database and returns their IDs + /// + /// The texts and metadata to add in batch. + /// The IDs of the added texts. + Task> AddTextsAsync(IEnumerable<(TDocument text, TMetadata? metadata)> items); + /// /// Get all the Ids for each text the database. /// diff --git a/src/Build5Nines.SharpVector/MemoryVectorDatabaseBase.cs b/src/Build5Nines.SharpVector/MemoryVectorDatabaseBase.cs index d701881..c1bf17f 100644 --- a/src/Build5Nines.SharpVector/MemoryVectorDatabaseBase.cs +++ b/src/Build5Nines.SharpVector/MemoryVectorDatabaseBase.cs @@ -107,6 +107,22 @@ public IEnumerable GetIds() return id; } + public async Task> AddTextsAsync(IEnumerable<(TVocabularyKey text, TMetadata? metadata)> items) + { + if (items is null) throw new ArgumentNullException(nameof(items)); + + var ids = new List(); + + foreach(var item in items) + { + TId id = await AddTextAsync(item.text, item.metadata); + ids.Add(id); + } + + return ids; + } + + /// /// Retrieves a text and metadata by its ID /// @@ -469,6 +485,48 @@ public IEnumerable GetIds() return id; } + /// + /// Adds multiple texts with optional metadata to the database efficiently. + /// If the embeddings generator supports batching, this will generate vectors in a single multi-input call. + /// + /// Collection of (text, metadata) tuples to add. + /// List of generated IDs in the same order as inputs. + public async Task> AddTextsAsync(IEnumerable<(string text, TMetadata? metadata)> items) + { + if (items is null) throw new ArgumentNullException(nameof(items)); + + var list = items.ToList(); + if (list.Count == 0) return Array.Empty(); + + // Try batch embeddings if supported + float[][] vectors; + if (EmbeddingsGenerator is IBatchEmbeddingsGenerator batchGen) + { + var batch = await batchGen.GenerateEmbeddingsAsync(list.Select(i => i.text)); + vectors = batch.Select(v => v.ToArray()).ToArray(); + } + else + { + // Fallback to per-item embedding + vectors = new float[list.Count][]; + for (int i = 0; i < list.Count; i++) + { + vectors[i] = await EmbeddingsGenerator.GenerateEmbeddingsAsync(list[i].text); + } + } + + // Store items and produce IDs + var ids = new List(list.Count); + for (int i = 0; i < list.Count; i++) + { + TId id = _idGenerator.NewId(); + ids.Add(id); + await VectorStore.SetAsync(id, new VectorTextItem(list[i].text, list[i].metadata, vectors[i])); + } + + return ids; + } + /// /// Retrieves a text and metadata by its ID /// diff --git a/src/OpenAIConsoleTest/OpenAIConsoleTest.csproj b/src/OpenAIConsoleTest/OpenAIConsoleTest.csproj index 34baf81..c50589d 100644 --- a/src/OpenAIConsoleTest/OpenAIConsoleTest.csproj +++ b/src/OpenAIConsoleTest/OpenAIConsoleTest.csproj @@ -14,6 +14,7 @@ + diff --git a/src/SharpVectorTest/BatchAddTests.cs b/src/SharpVectorTest/BatchAddTests.cs new file mode 100644 index 0000000..857c99d --- /dev/null +++ b/src/SharpVectorTest/BatchAddTests.cs @@ -0,0 +1,74 @@ +namespace SharpVectorTest; + +using System.Linq; +using System.Threading.Tasks; +using Build5Nines.SharpVector; +using Build5Nines.SharpVector.Embeddings; +using Build5Nines.SharpVector.Id; +using Build5Nines.SharpVector.VectorCompare; +using Build5Nines.SharpVector.VectorStore; + +[TestClass] +public class BatchAddTests +{ + [TestMethod] + public async Task AddTextsAsync_UsesBatchEmbeddings_WhenAvailable() + { + var db = new BatchMockMemoryVectorDatabase(); + + var inputs = new (string text, string? metadata)[] + { + ("one", "m1"), + ("two", "m2"), + ("three", "m3") + }; + + var ids = await db.AddTextsAsync(inputs); + + Assert.AreEqual(3, ids.Count); + + var results = db.Search("one"); + Assert.AreEqual(3, results.Texts.Count()); + + // Ensure vectors were assigned from batch generator (length = 5 per mock) + foreach (var item in db) + { + Assert.AreEqual(5, item.Vector.Length); + } + } +} + +public class BatchMockMemoryVectorDatabase + : MemoryVectorDatabaseBase< + int, + string, + MemoryDictionaryVectorStore, + IntIdGenerator, + CosineSimilarityVectorComparer + > +{ + public BatchMockMemoryVectorDatabase() + : base( + new MockBatchEmbeddingsGenerator(), + new MemoryDictionaryVectorStore() + ) + { } +} + +public class MockBatchEmbeddingsGenerator : IEmbeddingsGenerator, IBatchEmbeddingsGenerator +{ +#pragma warning disable CS1998 // Async method lacks 'await' operators and will run synchronously + public async Task GenerateEmbeddingsAsync(string text) +#pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously + { + return new float[] { 0.1f, 0.2f, 0.3f, 0.4f, 0.5f }; + } + +#pragma warning disable CS1998 // Async method lacks 'await' operators and will run synchronously + public async Task> GenerateEmbeddingsAsync(IEnumerable texts) +#pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously + { + // Return a different first value to ensure we can recognize batched path if needed + return texts.Select((t, idx) => new float[] { 0.9f, 0.2f, 0.3f, 0.4f, 0.5f }).ToList(); + } +} diff --git a/src/SharpVectorTest/VectorDatabaseTests.cs b/src/SharpVectorTest/VectorDatabaseTests.cs index 2891afa..eee522f 100644 --- a/src/SharpVectorTest/VectorDatabaseTests.cs +++ b/src/SharpVectorTest/VectorDatabaseTests.cs @@ -98,6 +98,30 @@ public void BasicMemoryVectorDatabase_05() Assert.AreEqual("metadata2", results.Texts.First().Metadata); } + [TestMethod] + public void BasicMemoryVectorDatabase_05_Batch() + { + var vdb = new BasicMemoryVectorDatabase(); + + // // Load Vector Database with some sample text + var inputs = new (string text, string? metadata)[] + { + ("The 👑 King", "metadata1"), + ("It's 🔥 Fire.", "metadata2"), + ("No emoji", "metadata3") + }; + vdb.AddTextsAsync(inputs).Wait(); + + var results = vdb.Search("🔥", pageCount: 1); + + Assert.AreEqual(1, results.Texts.Count()); + Assert.AreEqual(0.5773503184318542, results.Texts.First().Similarity); + Assert.AreEqual("It's 🔥 Fire.", results.Texts.First().Text); + Assert.AreEqual(2, results.Texts.First().Id); + Assert.AreEqual("metadata2", results.Texts.First().Metadata); + } + + [TestMethod] public void BasicMemoryVectorDatabase_06() {