Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions service/Abstractions/Pipeline/DataPipeline.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
using System.Linq;
using System.Text.Json.Serialization;
using Microsoft.KernelMemory.Context;
using Microsoft.KernelMemory.DataFormats;

namespace Microsoft.KernelMemory.Pipeline;

Expand Down Expand Up @@ -182,6 +183,13 @@ public class GeneratedFileDetails : FileDetailsBase
[JsonPropertyOrder(16)]
[JsonPropertyName("content_sha256")]
public string ContentSHA256 { get; set; } = string.Empty;

/// <summary>
/// Cached content object for structured data, used to reduce repeated deserialization operations
/// </summary>
[JsonPropertyOrder(17)]
[JsonPropertyName("file_content_object")]
public FileContent? FileContentObject { get; set; } = null;
}

public class FileDetails : FileDetailsBase
Expand Down
4 changes: 2 additions & 2 deletions service/Core/DataFormats/Office/MsExcelDecoder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ public MsExcelDecoder(MsExcelDecoderConfig? config = null, ILoggerFactory? logge
/// <inheritdoc />
public bool SupportsMimeType(string mimeType)
{
return mimeType != null && mimeType.StartsWith(MimeTypes.MsExcelX, StringComparison.OrdinalIgnoreCase);
return mimeType != null && (mimeType.StartsWith(MimeTypes.MsExcelX, StringComparison.OrdinalIgnoreCase));
}

/// <inheritdoc />
Expand Down Expand Up @@ -153,7 +153,7 @@ public Task<FileContent> DecodeAsync(Stream data, CancellationToken cancellation

string worksheetContent = sb.ToString().NormalizeNewlines(true);
sb.Clear();
result.Sections.Add(new Chunk(worksheetContent, worksheetNumber, Chunk.Meta(sentencesAreComplete: true)));
result.Sections.Add(new Chunk(worksheetContent, worksheetNumber, Chunk.Meta(sentencesAreComplete: true, worksheetNumber)));
}

return Task.FromResult(result);
Expand Down
2 changes: 1 addition & 1 deletion service/Core/DataFormats/Office/MsPowerPointDecoder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ public Task<FileContent> DecodeAsync(Stream data, CancellationToken cancellation

string slideContent = sb.ToString().NormalizeNewlines(true);
sb.Clear();
result.Sections.Add(new Chunk(slideContent, slideNumber, Chunk.Meta(sentencesAreComplete: true)));
result.Sections.Add(new Chunk(slideContent, slideNumber, Chunk.Meta(sentencesAreComplete: true, slideNumber)));
}
}

Expand Down
4 changes: 2 additions & 2 deletions service/Core/DataFormats/Office/MsWordDecoder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ public Task<FileContent> DecodeAsync(Stream data, CancellationToken cancellation
// Note: no trimming, use original spacing when working with pages
string pageContent = sb.ToString().NormalizeNewlines(false);
sb.Clear();
result.Sections.Add(new Chunk(pageContent, pageNumber, Chunk.Meta(sentencesAreComplete: true)));
result.Sections.Add(new Chunk(pageContent, pageNumber, Chunk.Meta(sentencesAreComplete: true, pageNumber)));
pageNumber++;
}

Expand All @@ -93,7 +93,7 @@ public Task<FileContent> DecodeAsync(Stream data, CancellationToken cancellation

// Note: no trimming, use original spacing when working with pages
string lastPageContent = sb.ToString().NormalizeNewlines(false);
result.Sections.Add(new Chunk(lastPageContent, pageNumber, Chunk.Meta(sentencesAreComplete: true)));
result.Sections.Add(new Chunk(lastPageContent, pageNumber, Chunk.Meta(sentencesAreComplete: true, pageNumber)));

return Task.FromResult(result);
}
Expand Down
2 changes: 1 addition & 1 deletion service/Core/DataFormats/Pdf/PdfDecoder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ public Task<FileContent> DecodeAsync(Stream data, CancellationToken cancellation
// Note: no trimming, use original spacing when working with pages
string pageContent = ContentOrderTextExtractor.GetText(page).NormalizeNewlines(false) ?? string.Empty;

result.Sections.Add(new Chunk(pageContent, page.Number, Chunk.Meta(sentencesAreComplete: false)));
result.Sections.Add(new Chunk(pageContent, page.Number, Chunk.Meta(false, page.Number)));
}

return Task.FromResult(result);
Expand Down
4 changes: 2 additions & 2 deletions service/Core/DataFormats/Text/MarkDownDecoder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ public Task<FileContent> DecodeAsync(BinaryData data, CancellationToken cancella
this._log.LogDebug("Extracting text from markdown file");

var result = new FileContent(MimeTypes.MarkDown);
result.Sections.Add(new(data.ToString().Trim(), 1, Chunk.Meta(sentencesAreComplete: true)));
result.Sections.Add(new(data.ToString().Trim(), 1, Chunk.Meta(sentencesAreComplete: true, 1)));

return Task.FromResult(result)!;
}
Expand All @@ -54,7 +54,7 @@ public async Task<FileContent> DecodeAsync(Stream data, CancellationToken cancel
using var reader = new StreamReader(data);
var content = await reader.ReadToEndAsync(cancellationToken).ConfigureAwait(false);

result.Sections.Add(new(content.Trim(), 1, Chunk.Meta(sentencesAreComplete: true)));
result.Sections.Add(new(content.Trim(), 1, Chunk.Meta(sentencesAreComplete: true, 1)));
return result;
}
}
7 changes: 4 additions & 3 deletions service/Core/DataFormats/Text/TextDecoder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ public bool SupportsMimeType(string mimeType)
{
return mimeType != null && (
mimeType.StartsWith(MimeTypes.PlainText, StringComparison.OrdinalIgnoreCase) ||
mimeType.StartsWith(MimeTypes.Json, StringComparison.OrdinalIgnoreCase)
mimeType.StartsWith(MimeTypes.Json, StringComparison.OrdinalIgnoreCase) ||
mimeType.StartsWith(MimeTypes.CSVData, StringComparison.OrdinalIgnoreCase)
);
}

Expand All @@ -43,7 +44,7 @@ public Task<FileContent> DecodeAsync(BinaryData data, CancellationToken cancella
this._log.LogDebug("Extracting text from file");

var result = new FileContent(MimeTypes.PlainText);
result.Sections.Add(new(data.ToString().Trim(), 1, Chunk.Meta(sentencesAreComplete: true)));
result.Sections.Add(new(data.ToString().Trim(), 1, Chunk.Meta(sentencesAreComplete: true, 1)));

return Task.FromResult(result)!;
}
Expand All @@ -57,7 +58,7 @@ public async Task<FileContent> DecodeAsync(Stream data, CancellationToken cancel
using var reader = new StreamReader(data);
var content = await reader.ReadToEndAsync(cancellationToken).ConfigureAwait(false);

result.Sections.Add(new(content.Trim(), 1, Chunk.Meta(sentencesAreComplete: true)));
result.Sections.Add(new(content.Trim(), 1, Chunk.Meta(sentencesAreComplete: true, 1)));
return result;
}
}
1 change: 1 addition & 0 deletions service/Core/Handlers/TextExtractionHandler.cs
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ public TextExtractionHandler(
MimeType = content.MimeType,
ArtifactType = DataPipeline.ArtifactTypes.ExtractedContent,
Tags = pipeline.Tags,
FileContentObject = content,
};
destFile2Details.MarkProcessedBy(this);
uploadedFile.GeneratedFiles.Add(destFile2, destFile2Details);
Expand Down
60 changes: 41 additions & 19 deletions service/Core/Handlers/TextPartitioningHandler.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,16 @@

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text.Json;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Extensions.Logging;
using Microsoft.KernelMemory.AI;
using Microsoft.KernelMemory.Chunkers;
using Microsoft.KernelMemory.Configuration;
using Microsoft.KernelMemory.Context;
using Microsoft.KernelMemory.DataFormats;
using Microsoft.KernelMemory.Diagnostics;
using Microsoft.KernelMemory.Extensions;
using Microsoft.KernelMemory.Pipeline;
Expand Down Expand Up @@ -107,37 +110,50 @@ public TextPartitioningHandler(
continue;
}

// Partition only the original text
if (file.ArtifactType != DataPipeline.ArtifactTypes.ExtractedText)
// Partition only the structured content
if (file.ArtifactType != DataPipeline.ArtifactTypes.ExtractedContent)
{
this._log.LogTrace("Skipping file {0} (not original text)", file.Name);
this._log.LogTrace("Skipping file {0} (not structured content)", file.Name);
continue;
}

// Use a different partitioning strategy depending on the file type
List<string> chunks;
BinaryData fileContent = await this._orchestrator.ReadFileAsync(pipeline, file.Name, cancellationToken).ConfigureAwait(false);
string chunksMimeType = MimeTypes.PlainText;
List<StructuredChunk> chunks = [];
this._log.LogDebug("Partitioning text file {0}", file.Name);

// Skip empty partitions. Also: partitionContent.ToString() throws an exception if there are no bytes.
if (fileContent.IsEmpty) { continue; }
var structuredChunks = file.FileContentObject?.Sections;
if (structuredChunks is null)
{
BinaryData binary = await this._orchestrator.ReadFileAsync(pipeline, file.Name, cancellationToken).ConfigureAwait(false);
if (binary.IsEmpty)
{
structuredChunks = [];
}
else
{
var fileContent = JsonSerializer.Deserialize<FileContent>(binary.ToString());
structuredChunks = fileContent?.Sections ?? [];
}
}

switch (file.MimeType)
{
case MimeTypes.PlainText:
{
this._log.LogDebug("Partitioning text file {0}", file.Name);
string content = fileContent.ToString();
chunks = this._plainTextChunker.Split(content, new PlainTextChunkerOptions { MaxTokensPerChunk = maxTokensPerChunk, Overlap = overlappingTokens, ChunkHeader = chunkHeader });
foreach (var structuredChunk in structuredChunks)
{
var chunksInAPage = this._plainTextChunker.Split(structuredChunk.Content, new PlainTextChunkerOptions { MaxTokensPerChunk = maxTokensPerChunk, Overlap = overlappingTokens, ChunkHeader = chunkHeader });
chunks.AddRange(chunksInAPage.Select(c => new StructuredChunk { Content = c, PageNumber = structuredChunk.PageNumber }));
}
break;
}

case MimeTypes.MarkDown:
{
this._log.LogDebug("Partitioning MarkDown file {0}", file.Name);
string content = fileContent.ToString();
chunksMimeType = MimeTypes.MarkDown;
chunks = this._markDownChunker.Split(content, new MarkDownChunkerOptions { MaxTokensPerChunk = maxTokensPerChunk, Overlap = overlappingTokens, ChunkHeader = chunkHeader });
foreach (var structuredChunk in structuredChunks)
{
var chunksInAPage = this._markDownChunker.Split(structuredChunk.Content, new MarkDownChunkerOptions { MaxTokensPerChunk = maxTokensPerChunk, Overlap = overlappingTokens, ChunkHeader = chunkHeader });
chunks.AddRange(chunksInAPage.Select(c => new StructuredChunk { Content = c, PageNumber = structuredChunk.PageNumber }));
}
break;
}

Expand All @@ -156,8 +172,8 @@ public TextPartitioningHandler(
for (int partitionNumber = 0; partitionNumber < chunks.Count; partitionNumber++)
{
// TODO: turn partitions into objects with more details, e.g. page number
string text = chunks[partitionNumber];
int sectionNumber = 0; // TODO: use this to store the page number (if any)
string text = chunks[partitionNumber].Content;
int sectionNumber = chunks[partitionNumber].PageNumber;
BinaryData textData = new(text);

var destFile = uploadedFile.GetPartitionFileName(partitionNumber);
Expand All @@ -169,7 +185,7 @@ public TextPartitioningHandler(
ParentId = uploadedFile.Id,
Name = destFile,
Size = text.Length,
MimeType = chunksMimeType,
MimeType = file.MimeType,
ArtifactType = DataPipeline.ArtifactTypes.TextPartition,
PartitionNumber = partitionNumber,
SectionNumber = sectionNumber,
Expand Down Expand Up @@ -203,4 +219,10 @@ private static ConfigurationException ChunkTooBigForEmbeddingsException(int valu
return new ConfigurationException(errMsg);
}
#pragma warning restore CA2254

internal class StructuredChunk
{
public required string Content { get; set; }
public int PageNumber { get; set; }
}
}