diff --git a/service/Abstractions/Pipeline/DataPipeline.cs b/service/Abstractions/Pipeline/DataPipeline.cs
index 7b6f1a6a1..becbcaeeb 100644
--- a/service/Abstractions/Pipeline/DataPipeline.cs
+++ b/service/Abstractions/Pipeline/DataPipeline.cs
@@ -6,6 +6,7 @@
using System.Linq;
using System.Text.Json.Serialization;
using Microsoft.KernelMemory.Context;
+using Microsoft.KernelMemory.DataFormats;
namespace Microsoft.KernelMemory.Pipeline;
@@ -182,6 +183,13 @@ public class GeneratedFileDetails : FileDetailsBase
[JsonPropertyOrder(16)]
[JsonPropertyName("content_sha256")]
public string ContentSHA256 { get; set; } = string.Empty;
+
+ /// <summary>
+ /// In-memory cache of the structured content, set by the handler that generates this file so that
+ /// later handlers can skip reading and deserializing it again. Not serialized with the pipeline status.
+ /// </summary>
+ [JsonIgnore]
+ public FileContent? FileContentObject { get; set; } = null;
}
public class FileDetails : FileDetailsBase
diff --git a/service/Core/DataFormats/Office/MsExcelDecoder.cs b/service/Core/DataFormats/Office/MsExcelDecoder.cs
index bea867811..debad9596 100644
--- a/service/Core/DataFormats/Office/MsExcelDecoder.cs
+++ b/service/Core/DataFormats/Office/MsExcelDecoder.cs
@@ -30,7 +30,7 @@ public MsExcelDecoder(MsExcelDecoderConfig? config = null, ILoggerFactory? logge
///
public bool SupportsMimeType(string mimeType)
{
- return mimeType != null && mimeType.StartsWith(MimeTypes.MsExcelX, StringComparison.OrdinalIgnoreCase);
+ return mimeType != null && (mimeType.StartsWith(MimeTypes.MsExcelX, StringComparison.OrdinalIgnoreCase));
}
///
@@ -153,7 +153,7 @@ public Task DecodeAsync(Stream data, CancellationToken cancellation
string worksheetContent = sb.ToString().NormalizeNewlines(true);
sb.Clear();
- result.Sections.Add(new Chunk(worksheetContent, worksheetNumber, Chunk.Meta(sentencesAreComplete: true)));
+ result.Sections.Add(new Chunk(worksheetContent, worksheetNumber, Chunk.Meta(sentencesAreComplete: true, worksheetNumber)));
}
return Task.FromResult(result);
diff --git a/service/Core/DataFormats/Office/MsPowerPointDecoder.cs b/service/Core/DataFormats/Office/MsPowerPointDecoder.cs
index 2be7b5c67..a311a2d53 100644
--- a/service/Core/DataFormats/Office/MsPowerPointDecoder.cs
+++ b/service/Core/DataFormats/Office/MsPowerPointDecoder.cs
@@ -115,7 +115,7 @@ public Task DecodeAsync(Stream data, CancellationToken cancellation
string slideContent = sb.ToString().NormalizeNewlines(true);
sb.Clear();
- result.Sections.Add(new Chunk(slideContent, slideNumber, Chunk.Meta(sentencesAreComplete: true)));
+ result.Sections.Add(new Chunk(slideContent, slideNumber, Chunk.Meta(sentencesAreComplete: true, slideNumber)));
}
}
diff --git a/service/Core/DataFormats/Office/MsWordDecoder.cs b/service/Core/DataFormats/Office/MsWordDecoder.cs
index 7b6c1a1b1..8d53bf3c7 100644
--- a/service/Core/DataFormats/Office/MsWordDecoder.cs
+++ b/service/Core/DataFormats/Office/MsWordDecoder.cs
@@ -83,7 +83,7 @@ public Task DecodeAsync(Stream data, CancellationToken cancellation
// Note: no trimming, use original spacing when working with pages
string pageContent = sb.ToString().NormalizeNewlines(false);
sb.Clear();
- result.Sections.Add(new Chunk(pageContent, pageNumber, Chunk.Meta(sentencesAreComplete: true)));
+ result.Sections.Add(new Chunk(pageContent, pageNumber, Chunk.Meta(sentencesAreComplete: true, pageNumber)));
pageNumber++;
}
@@ -93,7 +93,7 @@ public Task DecodeAsync(Stream data, CancellationToken cancellation
// Note: no trimming, use original spacing when working with pages
string lastPageContent = sb.ToString().NormalizeNewlines(false);
- result.Sections.Add(new Chunk(lastPageContent, pageNumber, Chunk.Meta(sentencesAreComplete: true)));
+ result.Sections.Add(new Chunk(lastPageContent, pageNumber, Chunk.Meta(sentencesAreComplete: true, pageNumber)));
return Task.FromResult(result);
}
diff --git a/service/Core/DataFormats/Pdf/PdfDecoder.cs b/service/Core/DataFormats/Pdf/PdfDecoder.cs
index 736d9017e..37baf0679 100644
--- a/service/Core/DataFormats/Pdf/PdfDecoder.cs
+++ b/service/Core/DataFormats/Pdf/PdfDecoder.cs
@@ -60,7 +60,7 @@ public Task DecodeAsync(Stream data, CancellationToken cancellation
// Note: no trimming, use original spacing when working with pages
string pageContent = ContentOrderTextExtractor.GetText(page).NormalizeNewlines(false) ?? string.Empty;
- result.Sections.Add(new Chunk(pageContent, page.Number, Chunk.Meta(sentencesAreComplete: false)));
+ result.Sections.Add(new Chunk(pageContent, page.Number, Chunk.Meta(sentencesAreComplete: false, page.Number)));
}
return Task.FromResult(result);
diff --git a/service/Core/DataFormats/Text/MarkDownDecoder.cs b/service/Core/DataFormats/Text/MarkDownDecoder.cs
index 6a45915fb..d6c1b83c6 100644
--- a/service/Core/DataFormats/Text/MarkDownDecoder.cs
+++ b/service/Core/DataFormats/Text/MarkDownDecoder.cs
@@ -40,7 +40,7 @@ public Task DecodeAsync(BinaryData data, CancellationToken cancella
this._log.LogDebug("Extracting text from markdown file");
var result = new FileContent(MimeTypes.MarkDown);
- result.Sections.Add(new(data.ToString().Trim(), 1, Chunk.Meta(sentencesAreComplete: true)));
+ result.Sections.Add(new(data.ToString().Trim(), 1, Chunk.Meta(sentencesAreComplete: true, 1)));
return Task.FromResult(result)!;
}
@@ -54,7 +54,7 @@ public async Task DecodeAsync(Stream data, CancellationToken cancel
using var reader = new StreamReader(data);
var content = await reader.ReadToEndAsync(cancellationToken).ConfigureAwait(false);
- result.Sections.Add(new(content.Trim(), 1, Chunk.Meta(sentencesAreComplete: true)));
+ result.Sections.Add(new(content.Trim(), 1, Chunk.Meta(sentencesAreComplete: true, 1)));
return result;
}
}
diff --git a/service/Core/DataFormats/Text/TextDecoder.cs b/service/Core/DataFormats/Text/TextDecoder.cs
index cf95e89ae..7183fedc4 100644
--- a/service/Core/DataFormats/Text/TextDecoder.cs
+++ b/service/Core/DataFormats/Text/TextDecoder.cs
@@ -26,7 +26,8 @@ public bool SupportsMimeType(string mimeType)
{
return mimeType != null && (
mimeType.StartsWith(MimeTypes.PlainText, StringComparison.OrdinalIgnoreCase) ||
- mimeType.StartsWith(MimeTypes.Json, StringComparison.OrdinalIgnoreCase)
+ mimeType.StartsWith(MimeTypes.Json, StringComparison.OrdinalIgnoreCase) ||
+ mimeType.StartsWith(MimeTypes.CSVData, StringComparison.OrdinalIgnoreCase)
);
}
@@ -43,7 +44,7 @@ public Task DecodeAsync(BinaryData data, CancellationToken cancella
this._log.LogDebug("Extracting text from file");
var result = new FileContent(MimeTypes.PlainText);
- result.Sections.Add(new(data.ToString().Trim(), 1, Chunk.Meta(sentencesAreComplete: true)));
+ result.Sections.Add(new(data.ToString().Trim(), 1, Chunk.Meta(sentencesAreComplete: true, 1)));
return Task.FromResult(result)!;
}
@@ -57,7 +58,7 @@ public async Task DecodeAsync(Stream data, CancellationToken cancel
using var reader = new StreamReader(data);
var content = await reader.ReadToEndAsync(cancellationToken).ConfigureAwait(false);
- result.Sections.Add(new(content.Trim(), 1, Chunk.Meta(sentencesAreComplete: true)));
+ result.Sections.Add(new(content.Trim(), 1, Chunk.Meta(sentencesAreComplete: true, 1)));
return result;
}
}
diff --git a/service/Core/Handlers/TextExtractionHandler.cs b/service/Core/Handlers/TextExtractionHandler.cs
index a9ffb9bb6..0b65f6e2e 100644
--- a/service/Core/Handlers/TextExtractionHandler.cs
+++ b/service/Core/Handlers/TextExtractionHandler.cs
@@ -128,6 +128,7 @@ public TextExtractionHandler(
MimeType = content.MimeType,
ArtifactType = DataPipeline.ArtifactTypes.ExtractedContent,
Tags = pipeline.Tags,
+ FileContentObject = content,
};
destFile2Details.MarkProcessedBy(this);
uploadedFile.GeneratedFiles.Add(destFile2, destFile2Details);
diff --git a/service/Core/Handlers/TextPartitioningHandler.cs b/service/Core/Handlers/TextPartitioningHandler.cs
index 429d59041..c88f439fa 100644
--- a/service/Core/Handlers/TextPartitioningHandler.cs
+++ b/service/Core/Handlers/TextPartitioningHandler.cs
@@ -2,6 +2,8 @@
using System;
using System.Collections.Generic;
+using System.Linq;
+using System.Text.Json;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Extensions.Logging;
@@ -9,6 +11,7 @@
using Microsoft.KernelMemory.Chunkers;
using Microsoft.KernelMemory.Configuration;
using Microsoft.KernelMemory.Context;
+using Microsoft.KernelMemory.DataFormats;
using Microsoft.KernelMemory.Diagnostics;
using Microsoft.KernelMemory.Extensions;
using Microsoft.KernelMemory.Pipeline;
@@ -107,37 +110,50 @@ public TextPartitioningHandler(
continue;
}
- // Partition only the original text
- if (file.ArtifactType != DataPipeline.ArtifactTypes.ExtractedText)
+ // Partition only the structured content
+ if (file.ArtifactType != DataPipeline.ArtifactTypes.ExtractedContent)
{
- this._log.LogTrace("Skipping file {0} (not original text)", file.Name);
+ this._log.LogTrace("Skipping file {0} (not structured content)", file.Name);
continue;
}
- // Use a different partitioning strategy depending on the file type
- List chunks;
- BinaryData fileContent = await this._orchestrator.ReadFileAsync(pipeline, file.Name, cancellationToken).ConfigureAwait(false);
- string chunksMimeType = MimeTypes.PlainText;
+ List<StructuredChunk> chunks = [];
+ this._log.LogDebug("Partitioning text file {0}", file.Name);
- // Skip empty partitions. Also: partitionContent.ToString() throws an exception if there are no bytes.
- if (fileContent.IsEmpty) { continue; }
+ var structuredChunks = file.FileContentObject?.Sections;
+ if (structuredChunks is null)
+ {
+ BinaryData binary = await this._orchestrator.ReadFileAsync(pipeline, file.Name, cancellationToken).ConfigureAwait(false);
+ if (binary.IsEmpty)
+ {
+ structuredChunks = [];
+ }
+ else
+ {
+ var fileContent = JsonSerializer.Deserialize<FileContent>(binary.ToString());
+ structuredChunks = fileContent?.Sections ?? [];
+ }
+ }
switch (file.MimeType)
{
case MimeTypes.PlainText:
{
- this._log.LogDebug("Partitioning text file {0}", file.Name);
- string content = fileContent.ToString();
- chunks = this._plainTextChunker.Split(content, new PlainTextChunkerOptions { MaxTokensPerChunk = maxTokensPerChunk, Overlap = overlappingTokens, ChunkHeader = chunkHeader });
+ foreach (var structuredChunk in structuredChunks)
+ {
+ var chunksInAPage = this._plainTextChunker.Split(structuredChunk.Content, new PlainTextChunkerOptions { MaxTokensPerChunk = maxTokensPerChunk, Overlap = overlappingTokens, ChunkHeader = chunkHeader });
+ chunks.AddRange(chunksInAPage.Select(c => new StructuredChunk { Content = c, PageNumber = structuredChunk.PageNumber }));
+ }
break;
}
case MimeTypes.MarkDown:
{
- this._log.LogDebug("Partitioning MarkDown file {0}", file.Name);
- string content = fileContent.ToString();
- chunksMimeType = MimeTypes.MarkDown;
- chunks = this._markDownChunker.Split(content, new MarkDownChunkerOptions { MaxTokensPerChunk = maxTokensPerChunk, Overlap = overlappingTokens, ChunkHeader = chunkHeader });
+ foreach (var structuredChunk in structuredChunks)
+ {
+ var chunksInAPage = this._markDownChunker.Split(structuredChunk.Content, new MarkDownChunkerOptions { MaxTokensPerChunk = maxTokensPerChunk, Overlap = overlappingTokens, ChunkHeader = chunkHeader });
+ chunks.AddRange(chunksInAPage.Select(c => new StructuredChunk { Content = c, PageNumber = structuredChunk.PageNumber }));
+ }
break;
}
@@ -156,8 +172,8 @@ public TextPartitioningHandler(
for (int partitionNumber = 0; partitionNumber < chunks.Count; partitionNumber++)
{
// TODO: turn partitions in objects with more details, e.g. page number
- string text = chunks[partitionNumber];
- int sectionNumber = 0; // TODO: use this to store the page number (if any)
+ string text = chunks[partitionNumber].Content;
+ int sectionNumber = chunks[partitionNumber].PageNumber;
BinaryData textData = new(text);
var destFile = uploadedFile.GetPartitionFileName(partitionNumber);
@@ -169,7 +185,7 @@ public TextPartitioningHandler(
ParentId = uploadedFile.Id,
Name = destFile,
Size = text.Length,
- MimeType = chunksMimeType,
+ MimeType = file.MimeType,
ArtifactType = DataPipeline.ArtifactTypes.TextPartition,
PartitionNumber = partitionNumber,
SectionNumber = sectionNumber,
@@ -203,4 +219,17 @@ private static ConfigurationException ChunkTooBigForEmbeddingsException(int valu
return new ConfigurationException(errMsg);
}
#pragma warning restore CA2254
+
+ /// <summary>
+ /// A partition produced by the chunkers, paired with the page number of the
+ /// extracted section it was split from.
+ /// </summary>
+ internal sealed class StructuredChunk
+ {
+ /// <summary>Text of the partition.</summary>
+ public required string Content { get; init; }
+
+ /// <summary>Page number copied from the section this partition was split from.</summary>
+ public int PageNumber { get; init; }
+ }
}