From 180b93a03dd2d5b3ef4f5fcaf528cb899e732630 Mon Sep 17 00:00:00 2001
From: Au Nguyen
Date: Fri, 28 Mar 2025 12:02:22 +0700
Subject: [PATCH 1/2] feat: Enrich text partitions with page numbers by using
 structured extracted content

---
 service/Abstractions/DataFormats/Chunk.cs     |  2 +-
 service/Abstractions/Pipeline/DataPipeline.cs |  8 +++
 .../Core/DataFormats/Office/MsExcelDecoder.cs |  4 +-
 .../DataFormats/Office/MsPowerPointDecoder.cs |  2 +-
 .../Core/DataFormats/Office/MsWordDecoder.cs  |  4 +-
 service/Core/DataFormats/Pdf/PdfDecoder.cs    |  2 +-
 .../Core/DataFormats/Text/MarkDownDecoder.cs  |  4 +-
 service/Core/DataFormats/Text/TextDecoder.cs  |  7 ++-
 .../Core/Handlers/TextExtractionHandler.cs    |  1 +
 .../Core/Handlers/TextPartitioningHandler.cs  | 62 +++++++++++++------
 10 files changed, 64 insertions(+), 32 deletions(-)

diff --git a/service/Abstractions/DataFormats/Chunk.cs b/service/Abstractions/DataFormats/Chunk.cs
index 135add8bb..c3f881f15 100644
--- a/service/Abstractions/DataFormats/Chunk.cs
+++ b/service/Abstractions/DataFormats/Chunk.cs
@@ -55,7 +55,7 @@ public bool SentencesAreComplete
     {
         get
         {
-            return this.Metadata.TryGetValue(MetaPageNumber, out var value) && JsonSerializer.Deserialize<bool>(value);
+            return this.Metadata.TryGetValue(MetaSentencesAreComplete, out var value) && JsonSerializer.Deserialize<bool>(value);
         }
     }
 
diff --git a/service/Abstractions/Pipeline/DataPipeline.cs b/service/Abstractions/Pipeline/DataPipeline.cs
index 7b6f1a6a1..becbcaeeb 100644
--- a/service/Abstractions/Pipeline/DataPipeline.cs
+++ b/service/Abstractions/Pipeline/DataPipeline.cs
@@ -6,6 +6,7 @@
 using System.Linq;
 using System.Text.Json.Serialization;
 using Microsoft.KernelMemory.Context;
+using Microsoft.KernelMemory.DataFormats;
 
 namespace Microsoft.KernelMemory.Pipeline;
 
@@ -182,6 +183,13 @@ public class GeneratedFileDetails : FileDetailsBase
     [JsonPropertyOrder(16)]
     [JsonPropertyName("content_sha256")]
     public string ContentSHA256 { get; set; } = string.Empty;
+
+    /// <summary>
+    /// Cached content object for structured data, kept to avoid repeated deserialization
+    /// </summary>
+    [JsonPropertyOrder(17)]
+    [JsonPropertyName("file_content_object")]
+    public FileContent? FileContentObject { get; set; } = null;
 }
 
 public class FileDetails : FileDetailsBase
diff --git a/service/Core/DataFormats/Office/MsExcelDecoder.cs b/service/Core/DataFormats/Office/MsExcelDecoder.cs
index bea867811..debad9596 100644
--- a/service/Core/DataFormats/Office/MsExcelDecoder.cs
+++ b/service/Core/DataFormats/Office/MsExcelDecoder.cs
@@ -30,7 +30,7 @@ public MsExcelDecoder(MsExcelDecoderConfig? config = null, ILoggerFactory? logge
     /// <inheritdoc />
     public bool SupportsMimeType(string mimeType)
     {
-        return mimeType != null && mimeType.StartsWith(MimeTypes.MsExcelX, StringComparison.OrdinalIgnoreCase);
+        return mimeType != null && (mimeType.StartsWith(MimeTypes.MsExcelX, StringComparison.OrdinalIgnoreCase));
     }
 
     /// <inheritdoc />
@@ -153,7 +153,7 @@ public Task<FileContent> DecodeAsync(Stream data, CancellationToken cancellation
             string worksheetContent = sb.ToString().NormalizeNewlines(true);
             sb.Clear();
 
-            result.Sections.Add(new Chunk(worksheetContent, worksheetNumber, Chunk.Meta(sentencesAreComplete: true)));
+            result.Sections.Add(new Chunk(worksheetContent, worksheetNumber, Chunk.Meta(sentencesAreComplete: true, worksheetNumber)));
         }
 
         return Task.FromResult(result);
diff --git a/service/Core/DataFormats/Office/MsPowerPointDecoder.cs b/service/Core/DataFormats/Office/MsPowerPointDecoder.cs
index 2be7b5c67..a311a2d53 100644
--- a/service/Core/DataFormats/Office/MsPowerPointDecoder.cs
+++ b/service/Core/DataFormats/Office/MsPowerPointDecoder.cs
@@ -115,7 +115,7 @@ public Task<FileContent> DecodeAsync(Stream data, CancellationToken cancellation
                     string slideContent = sb.ToString().NormalizeNewlines(true);
                     sb.Clear();
 
-                    result.Sections.Add(new Chunk(slideContent, slideNumber, Chunk.Meta(sentencesAreComplete: true)));
+                    result.Sections.Add(new Chunk(slideContent, slideNumber, Chunk.Meta(sentencesAreComplete: true, slideNumber)));
                 }
             }
 
diff --git a/service/Core/DataFormats/Office/MsWordDecoder.cs b/service/Core/DataFormats/Office/MsWordDecoder.cs
index 7b6c1a1b1..8d53bf3c7 100644
--- a/service/Core/DataFormats/Office/MsWordDecoder.cs
+++ b/service/Core/DataFormats/Office/MsWordDecoder.cs
@@ -83,7 +83,7 @@ public Task<FileContent> DecodeAsync(Stream data, CancellationToken cancellation
                 // Note: no trimming, use original spacing when working with pages
                 string pageContent = sb.ToString().NormalizeNewlines(false);
                 sb.Clear();
-                result.Sections.Add(new Chunk(pageContent, pageNumber, Chunk.Meta(sentencesAreComplete: true)));
+                result.Sections.Add(new Chunk(pageContent, pageNumber, Chunk.Meta(sentencesAreComplete: true, pageNumber)));
 
                 pageNumber++;
             }
@@ -93,7 +93,7 @@ public Task<FileContent> DecodeAsync(Stream data, CancellationToken cancellation
 
         // Note: no trimming, use original spacing when working with pages
         string lastPageContent = sb.ToString().NormalizeNewlines(false);
-        result.Sections.Add(new Chunk(lastPageContent, pageNumber, Chunk.Meta(sentencesAreComplete: true)));
+        result.Sections.Add(new Chunk(lastPageContent, pageNumber, Chunk.Meta(sentencesAreComplete: true, pageNumber)));
 
         return Task.FromResult(result);
     }
diff --git a/service/Core/DataFormats/Pdf/PdfDecoder.cs b/service/Core/DataFormats/Pdf/PdfDecoder.cs
index 736d9017e..37baf0679 100644
--- a/service/Core/DataFormats/Pdf/PdfDecoder.cs
+++ b/service/Core/DataFormats/Pdf/PdfDecoder.cs
@@ -60,7 +60,7 @@ public Task<FileContent> DecodeAsync(Stream data, CancellationToken cancellation
             // Note: no trimming, use original spacing when working with pages
             string pageContent = ContentOrderTextExtractor.GetText(page).NormalizeNewlines(false) ?? string.Empty;
 
-            result.Sections.Add(new Chunk(pageContent, page.Number, Chunk.Meta(sentencesAreComplete: false)));
+            result.Sections.Add(new Chunk(pageContent, page.Number, Chunk.Meta(false, page.Number)));
         }
 
         return Task.FromResult(result);
diff --git a/service/Core/DataFormats/Text/MarkDownDecoder.cs b/service/Core/DataFormats/Text/MarkDownDecoder.cs
index 6a45915fb..d6c1b83c6 100644
--- a/service/Core/DataFormats/Text/MarkDownDecoder.cs
+++ b/service/Core/DataFormats/Text/MarkDownDecoder.cs
@@ -40,7 +40,7 @@ public Task<FileContent> DecodeAsync(BinaryData data, CancellationToken cancella
         this._log.LogDebug("Extracting text from markdown file");
 
         var result = new FileContent(MimeTypes.MarkDown);
-        result.Sections.Add(new(data.ToString().Trim(), 1, Chunk.Meta(sentencesAreComplete: true)));
+        result.Sections.Add(new(data.ToString().Trim(), 1, Chunk.Meta(sentencesAreComplete: true, 1)));
 
         return Task.FromResult(result)!;
     }
@@ -54,7 +54,7 @@ public async Task<FileContent> DecodeAsync(Stream data, CancellationToken cancel
         using var reader = new StreamReader(data);
         var content = await reader.ReadToEndAsync(cancellationToken).ConfigureAwait(false);
 
-        result.Sections.Add(new(content.Trim(), 1, Chunk.Meta(sentencesAreComplete: true)));
+        result.Sections.Add(new(content.Trim(), 1, Chunk.Meta(sentencesAreComplete: true, 1)));
 
         return result;
     }
 }
diff --git a/service/Core/DataFormats/Text/TextDecoder.cs b/service/Core/DataFormats/Text/TextDecoder.cs
index cf95e89ae..7183fedc4 100644
--- a/service/Core/DataFormats/Text/TextDecoder.cs
+++ b/service/Core/DataFormats/Text/TextDecoder.cs
@@ -26,7 +26,8 @@ public bool SupportsMimeType(string mimeType)
     {
         return mimeType != null && (
             mimeType.StartsWith(MimeTypes.PlainText, StringComparison.OrdinalIgnoreCase) ||
-            mimeType.StartsWith(MimeTypes.Json, StringComparison.OrdinalIgnoreCase)
+            mimeType.StartsWith(MimeTypes.Json, StringComparison.OrdinalIgnoreCase) ||
+            mimeType.StartsWith(MimeTypes.CSVData, StringComparison.OrdinalIgnoreCase)
         );
     }
 
@@ -43,7 +44,7 @@ public Task<FileContent> DecodeAsync(BinaryData data, CancellationToken cancella
         this._log.LogDebug("Extracting text from file");
 
         var result = new FileContent(MimeTypes.PlainText);
-        result.Sections.Add(new(data.ToString().Trim(), 1, Chunk.Meta(sentencesAreComplete: true)));
+        result.Sections.Add(new(data.ToString().Trim(), 1, Chunk.Meta(sentencesAreComplete: true, 1)));
 
         return Task.FromResult(result)!;
     }
@@ -57,7 +58,7 @@ public async Task<FileContent> DecodeAsync(Stream data, CancellationToken cancel
         using var reader = new StreamReader(data);
         var content = await reader.ReadToEndAsync(cancellationToken).ConfigureAwait(false);
 
-        result.Sections.Add(new(content.Trim(), 1, Chunk.Meta(sentencesAreComplete: true)));
+        result.Sections.Add(new(content.Trim(), 1, Chunk.Meta(sentencesAreComplete: true, 1)));
 
         return result;
     }
 }
diff --git a/service/Core/Handlers/TextExtractionHandler.cs b/service/Core/Handlers/TextExtractionHandler.cs
index a9ffb9bb6..0b65f6e2e 100644
--- a/service/Core/Handlers/TextExtractionHandler.cs
+++ b/service/Core/Handlers/TextExtractionHandler.cs
@@ -128,6 +128,7 @@ public TextExtractionHandler(
                 MimeType = content.MimeType,
                 ArtifactType = DataPipeline.ArtifactTypes.ExtractedContent,
                 Tags = pipeline.Tags,
+                FileContentObject = content,
             };
             destFile2Details.MarkProcessedBy(this);
             uploadedFile.GeneratedFiles.Add(destFile2, destFile2Details);
diff --git a/service/Core/Handlers/TextPartitioningHandler.cs b/service/Core/Handlers/TextPartitioningHandler.cs
index 429d59041..826d79424 100644
--- a/service/Core/Handlers/TextPartitioningHandler.cs
+++ b/service/Core/Handlers/TextPartitioningHandler.cs
@@ -2,6 +2,8 @@
 
 using System;
 using System.Collections.Generic;
+using System.Linq;
+using System.Text.Json;
 using System.Threading;
 using System.Threading.Tasks;
 using Microsoft.Extensions.Logging;
@@ -9,6 +11,7 @@
 using Microsoft.KernelMemory.Chunkers;
 using Microsoft.KernelMemory.Configuration;
 using Microsoft.KernelMemory.Context;
+using Microsoft.KernelMemory.DataFormats;
 using Microsoft.KernelMemory.Diagnostics;
 using Microsoft.KernelMemory.Extensions;
 using Microsoft.KernelMemory.Pipeline;
@@ -107,37 +110,50 @@ public TextPartitioningHandler(
                 continue;
             }
 
-            // Partition only the original text
-            if (file.ArtifactType != DataPipeline.ArtifactTypes.ExtractedText)
+            // Partition only the structured content
+            if (file.ArtifactType != DataPipeline.ArtifactTypes.ExtractedContent)
             {
-                this._log.LogTrace("Skipping file {0} (not original text)", file.Name);
+                this._log.LogTrace("Skipping file {0} (not structured content)", file.Name);
                 continue;
            }
 
-            // Use a different partitioning strategy depending on the file type
-            List<string> chunks;
-            BinaryData fileContent = await this._orchestrator.ReadFileAsync(pipeline, file.Name, cancellationToken).ConfigureAwait(false);
-            string chunksMimeType = MimeTypes.PlainText;
-
-            // Skip empty partitions. Also: partitionContent.ToString() throws an exception if there are no bytes.
-            if (fileContent.IsEmpty) { continue; }
+            List<StructuredChunk> chunks = [];
 
+            this._log.LogDebug("Partitioning text file {0}", file.Name);
+
+            var structuredChunks = file.FileContentObject?.Sections;
+            if (structuredChunks is null)
+            {
+                BinaryData binary = await this._orchestrator.ReadFileAsync(pipeline, file.Name, cancellationToken).ConfigureAwait(false);
+                if (binary.IsEmpty)
+                {
+                    structuredChunks = [];
+                }
+                else
+                {
+                    var fileContent = JsonSerializer.Deserialize<FileContent>(binary.ToString());
+                    structuredChunks = fileContent?.Sections ?? [];
+                }
+            }
 
             switch (file.MimeType)
             {
                 case MimeTypes.PlainText:
                 {
-                    this._log.LogDebug("Partitioning text file {0}", file.Name);
-                    string content = fileContent.ToString();
-                    chunks = this._plainTextChunker.Split(content, new PlainTextChunkerOptions { MaxTokensPerChunk = maxTokensPerChunk, Overlap = overlappingTokens, ChunkHeader = chunkHeader });
+                    foreach (var structuredChunk in structuredChunks)
+                    {
+                        var chunksInAPage = this._plainTextChunker.Split(structuredChunk.Content, new PlainTextChunkerOptions { MaxTokensPerChunk = maxTokensPerChunk, Overlap = overlappingTokens, ChunkHeader = chunkHeader });
+                        chunks.AddRange(chunksInAPage.Select(c => new StructuredChunk { Content = c, PageNumber = structuredChunk.PageNumber }));
+                    }
                     break;
                 }
 
                 case MimeTypes.MarkDown:
                 {
-                    this._log.LogDebug("Partitioning MarkDown file {0}", file.Name);
-                    string content = fileContent.ToString();
-                    chunksMimeType = MimeTypes.MarkDown;
-                    chunks = this._markDownChunker.Split(content, new MarkDownChunkerOptions { MaxTokensPerChunk = maxTokensPerChunk, Overlap = overlappingTokens, ChunkHeader = chunkHeader });
+                    foreach (var structuredChunk in structuredChunks)
+                    {
+                        var chunksInAPage = this._markDownChunker.Split(structuredChunk.Content, new MarkDownChunkerOptions { MaxTokensPerChunk = maxTokensPerChunk, Overlap = overlappingTokens, ChunkHeader = chunkHeader });
+                        chunks.AddRange(chunksInAPage.Select(c => new StructuredChunk { Content = c, PageNumber = structuredChunk.PageNumber }));
+                    }
                     break;
                 }
 
@@ -156,8 +172,8 @@ public TextPartitioningHandler(
             for (int partitionNumber = 0; partitionNumber < chunks.Count; partitionNumber++)
             {
                 // TODO: turn partitions in objects with more details, e.g. page number
-                string text = chunks[partitionNumber];
-                int sectionNumber = 0; // TODO: use this to store the page number (if any)
+                string text = chunks[partitionNumber].Content;
+                int sectionNumber = chunks[partitionNumber].PageNumber;
                 BinaryData textData = new(text);
 
                 var destFile = uploadedFile.GetPartitionFileName(partitionNumber);
@@ -169,7 +185,7 @@ public TextPartitioningHandler(
                     ParentId = uploadedFile.Id,
                     Name = destFile,
                     Size = text.Length,
-                    MimeType = chunksMimeType,
+                    MimeType = file.MimeType,
                     ArtifactType = DataPipeline.ArtifactTypes.TextPartition,
                     PartitionNumber = partitionNumber,
                     SectionNumber = sectionNumber,
@@ -203,4 +219,10 @@ private static ConfigurationException ChunkTooBigForEmbeddingsException(int valu
         return new ConfigurationException(errMsg);
     }
 #pragma warning restore CA2254
+
+    internal class StructuredChunk
+    {
+        public string Content { get; set; }
+        public int PageNumber { get; set; }
+    }
 }
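Reviewer note on PATCH 1/2: the heart of the change is that partitioning now runs per extracted section instead of over one concatenated string, so every generated partition inherits the page number of the section it was cut from and stores it in SectionNumber. A minimal self-contained sketch of that flow follows; StructuredChunk mirrors the class added above, while PartitionDetails and the split delegate are illustrative stand-ins, not types from this repository:

using System;
using System.Collections.Generic;

internal sealed class StructuredChunk // mirrors the class added by this patch
{
    public required string Content { get; set; }
    public int PageNumber { get; set; }
}

internal sealed class PartitionDetails // illustrative stand-in for GeneratedFileDetails
{
    public int PartitionNumber { get; set; }
    public int SectionNumber { get; set; } // now carries the source page number
    public required string Content { get; set; }
}

internal static class PagePreservingPartitioner
{
    // Split each section independently, so every resulting partition keeps the
    // page number of the section it came from; the partition counter keeps
    // running across the whole file, matching the handler's for-loop.
    public static List<PartitionDetails> Partition(
        IEnumerable<StructuredChunk> sections,
        Func<string, IEnumerable<string>> split)
    {
        var partitions = new List<PartitionDetails>();
        int partitionNumber = 0;
        foreach (StructuredChunk section in sections)
        {
            foreach (string text in split(section.Content))
            {
                partitions.Add(new PartitionDetails
                {
                    Content = text,
                    PartitionNumber = partitionNumber++,
                    SectionNumber = section.PageNumber,
                });
            }
        }

        return partitions;
    }
}

Before this patch the handler split the flattened ExtractedText artifact as a single string, so sectionNumber was always 0 and the page information collected by the decoders was discarded.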
From d6a1c9ffc84517f2ed66b667a67bff575375ba71 Mon Sep 17 00:00:00 2001
From: Au Nguyen
Date: Fri, 28 Mar 2025 14:58:31 +0700
Subject: [PATCH 2/2] chore: format code and check nullable props

---
 service/Core/Handlers/TextPartitioningHandler.cs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/service/Core/Handlers/TextPartitioningHandler.cs b/service/Core/Handlers/TextPartitioningHandler.cs
index 826d79424..c88f439fa 100644
--- a/service/Core/Handlers/TextPartitioningHandler.cs
+++ b/service/Core/Handlers/TextPartitioningHandler.cs
@@ -119,7 +119,7 @@ public TextPartitioningHandler(
             List<StructuredChunk> chunks = [];
 
             this._log.LogDebug("Partitioning text file {0}", file.Name);
-
+
             var structuredChunks = file.FileContentObject?.Sections;
             if (structuredChunks is null)
             {
@@ -222,7 +222,7 @@ private static ConfigurationException ChunkTooBigForEmbeddingsException(int valu
 
     internal class StructuredChunk
     {
-        public string Content { get; set; }
+        public required string Content { get; set; }
         public int PageNumber { get; set; }
     }
 }
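Reviewer note on the caching path: GeneratedFileDetails.FileContentObject lets TextPartitioningHandler reuse the FileContent parsed by TextExtractionHandler in the same run, and the handler falls back to deserializing the persisted extracted-content JSON whenever the cached object is missing. A compact sketch of that fallback under the same assumptions; Section, ExtractedContent and the readPersistedJson delegate are illustrative stand-ins for Chunk, FileContent and IPipelineOrchestrator.ReadFileAsync:

using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Threading;
using System.Threading.Tasks;

public sealed class Section // stand-in for Chunk
{
    public string Content { get; set; } = string.Empty;
    public int PageNumber { get; set; }
}

public sealed class ExtractedContent // stand-in for FileContent
{
    public List<Section> Sections { get; set; } = new();
}

public static class ContentCacheSketch
{
    public static async Task<List<Section>> GetSectionsAsync(
        ExtractedContent? cachedContent,                          // plays the role of file.FileContentObject
        Func<CancellationToken, Task<string>> readPersistedJson,  // plays the role of ReadFileAsync
        CancellationToken ct)
    {
        // Fast path: extraction ran in this process and left the parsed
        // object on the pipeline, so no deserialization is needed.
        if (cachedContent is not null) { return cachedContent.Sections; }

        // Slow path: read the JSON artifact written by the extraction step
        // and deserialize it on demand.
        string json = await readPersistedJson(ct).ConfigureAwait(false);
        if (string.IsNullOrWhiteSpace(json)) { return new List<Section>(); }

        return JsonSerializer.Deserialize<ExtractedContent>(json)?.Sections ?? new List<Section>();
    }
}

Since FileContentObject is annotated with [JsonPropertyName("file_content_object")], it also round-trips with serialized pipeline state; the null fallback presumably covers pipelines created before this change or states saved without the field.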