Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions service/Abstractions/Pipeline/DataPipeline.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
using System.Linq;
using System.Text.Json.Serialization;
using Microsoft.KernelMemory.Context;
using Microsoft.KernelMemory.DataFormats;

namespace Microsoft.KernelMemory.Pipeline;

Expand Down Expand Up @@ -182,6 +183,13 @@ public class GeneratedFileDetails : FileDetailsBase
[JsonPropertyOrder(16)]
[JsonPropertyName("content_sha256")]
public string ContentSHA256 { get; set; } = string.Empty;

/// <summary>
/// Cached content object for structured data, used to reduce repeated deserialization operations
/// </summary>
[JsonPropertyOrder(17)]
[JsonPropertyName("file_content_object")]
public FileContent? FileContentObject { get; set; } = null;
}

public class FileDetails : FileDetailsBase
Expand Down
4 changes: 2 additions & 2 deletions service/Core/DataFormats/Office/MsExcelDecoder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ public MsExcelDecoder(MsExcelDecoderConfig? config = null, ILoggerFactory? logge
/// <inheritdoc />
public bool SupportsMimeType(string mimeType)
{
return mimeType != null && mimeType.StartsWith(MimeTypes.MsExcelX, StringComparison.OrdinalIgnoreCase);
return mimeType != null && (mimeType.StartsWith(MimeTypes.MsExcelX, StringComparison.OrdinalIgnoreCase));
}

/// <inheritdoc />
Expand Down Expand Up @@ -153,7 +153,7 @@ public Task<FileContent> DecodeAsync(Stream data, CancellationToken cancellation

string worksheetContent = sb.ToString().NormalizeNewlines(true);
sb.Clear();
result.Sections.Add(new Chunk(worksheetContent, worksheetNumber, Chunk.Meta(sentencesAreComplete: true)));
result.Sections.Add(new Chunk(worksheetContent, worksheetNumber, Chunk.Meta(sentencesAreComplete: true, worksheetNumber)));
}

return Task.FromResult(result);
Expand Down
2 changes: 1 addition & 1 deletion service/Core/DataFormats/Office/MsPowerPointDecoder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ public Task<FileContent> DecodeAsync(Stream data, CancellationToken cancellation

string slideContent = sb.ToString().NormalizeNewlines(true);
sb.Clear();
result.Sections.Add(new Chunk(slideContent, slideNumber, Chunk.Meta(sentencesAreComplete: true)));
result.Sections.Add(new Chunk(slideContent, slideNumber, Chunk.Meta(sentencesAreComplete: true, slideNumber)));
}
}

Expand Down
4 changes: 2 additions & 2 deletions service/Core/DataFormats/Office/MsWordDecoder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ public Task<FileContent> DecodeAsync(Stream data, CancellationToken cancellation
// Note: no trimming, use original spacing when working with pages
string pageContent = sb.ToString().NormalizeNewlines(false);
sb.Clear();
result.Sections.Add(new Chunk(pageContent, pageNumber, Chunk.Meta(sentencesAreComplete: true)));
result.Sections.Add(new Chunk(pageContent, pageNumber, Chunk.Meta(sentencesAreComplete: true, pageNumber)));
pageNumber++;
}

Expand All @@ -93,7 +93,7 @@ public Task<FileContent> DecodeAsync(Stream data, CancellationToken cancellation

// Note: no trimming, use original spacing when working with pages
string lastPageContent = sb.ToString().NormalizeNewlines(false);
result.Sections.Add(new Chunk(lastPageContent, pageNumber, Chunk.Meta(sentencesAreComplete: true)));
result.Sections.Add(new Chunk(lastPageContent, pageNumber, Chunk.Meta(sentencesAreComplete: true, pageNumber)));

return Task.FromResult(result);
}
Expand Down
2 changes: 1 addition & 1 deletion service/Core/DataFormats/Pdf/PdfDecoder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ public Task<FileContent> DecodeAsync(Stream data, CancellationToken cancellation
// Note: no trimming, use original spacing when working with pages
string pageContent = ContentOrderTextExtractor.GetText(page).NormalizeNewlines(false) ?? string.Empty;

result.Sections.Add(new Chunk(pageContent, page.Number, Chunk.Meta(sentencesAreComplete: false)));
result.Sections.Add(new Chunk(pageContent, page.Number, Chunk.Meta(false, page.Number)));
}

return Task.FromResult(result);
Expand Down
4 changes: 2 additions & 2 deletions service/Core/DataFormats/Text/MarkDownDecoder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ public Task<FileContent> DecodeAsync(BinaryData data, CancellationToken cancella
this._log.LogDebug("Extracting text from markdown file");

var result = new FileContent(MimeTypes.MarkDown);
result.Sections.Add(new(data.ToString().Trim(), 1, Chunk.Meta(sentencesAreComplete: true)));
result.Sections.Add(new(data.ToString().Trim(), 1, Chunk.Meta(sentencesAreComplete: true, 1)));

return Task.FromResult(result)!;
}
Expand All @@ -54,7 +54,7 @@ public async Task<FileContent> DecodeAsync(Stream data, CancellationToken cancel
using var reader = new StreamReader(data);
var content = await reader.ReadToEndAsync(cancellationToken).ConfigureAwait(false);

result.Sections.Add(new(content.Trim(), 1, Chunk.Meta(sentencesAreComplete: true)));
result.Sections.Add(new(content.Trim(), 1, Chunk.Meta(sentencesAreComplete: true, 1)));
return result;
}
}
7 changes: 4 additions & 3 deletions service/Core/DataFormats/Text/TextDecoder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ public bool SupportsMimeType(string mimeType)
{
return mimeType != null && (
mimeType.StartsWith(MimeTypes.PlainText, StringComparison.OrdinalIgnoreCase) ||
mimeType.StartsWith(MimeTypes.Json, StringComparison.OrdinalIgnoreCase)
mimeType.StartsWith(MimeTypes.Json, StringComparison.OrdinalIgnoreCase) ||
mimeType.StartsWith(MimeTypes.CSVData, StringComparison.OrdinalIgnoreCase)
);
}

Expand All @@ -43,7 +44,7 @@ public Task<FileContent> DecodeAsync(BinaryData data, CancellationToken cancella
this._log.LogDebug("Extracting text from file");

var result = new FileContent(MimeTypes.PlainText);
result.Sections.Add(new(data.ToString().Trim(), 1, Chunk.Meta(sentencesAreComplete: true)));
result.Sections.Add(new(data.ToString().Trim(), 1, Chunk.Meta(sentencesAreComplete: true, 1)));

return Task.FromResult(result)!;
}
Expand All @@ -57,7 +58,7 @@ public async Task<FileContent> DecodeAsync(Stream data, CancellationToken cancel
using var reader = new StreamReader(data);
var content = await reader.ReadToEndAsync(cancellationToken).ConfigureAwait(false);

result.Sections.Add(new(content.Trim(), 1, Chunk.Meta(sentencesAreComplete: true)));
result.Sections.Add(new(content.Trim(), 1, Chunk.Meta(sentencesAreComplete: true, 1)));
return result;
}
}
1 change: 1 addition & 0 deletions service/Core/Handlers/TextExtractionHandler.cs
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ public TextExtractionHandler(
MimeType = content.MimeType,
ArtifactType = DataPipeline.ArtifactTypes.ExtractedContent,
Tags = pipeline.Tags,
FileContentObject = content,
};
destFile2Details.MarkProcessedBy(this);
uploadedFile.GeneratedFiles.Add(destFile2, destFile2Details);
Expand Down
60 changes: 41 additions & 19 deletions service/Core/Handlers/TextPartitioningHandler.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,16 @@

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text.Json;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Extensions.Logging;
using Microsoft.KernelMemory.AI;
using Microsoft.KernelMemory.Chunkers;
using Microsoft.KernelMemory.Configuration;
using Microsoft.KernelMemory.Context;
using Microsoft.KernelMemory.DataFormats;
using Microsoft.KernelMemory.Diagnostics;
using Microsoft.KernelMemory.Extensions;
using Microsoft.KernelMemory.Pipeline;
Expand Down Expand Up @@ -107,37 +110,50 @@ public TextPartitioningHandler(
continue;
}

// Partition only the original text
if (file.ArtifactType != DataPipeline.ArtifactTypes.ExtractedText)
// Partition only the structured content
if (file.ArtifactType != DataPipeline.ArtifactTypes.ExtractedContent)
{
this._log.LogTrace("Skipping file {0} (not original text)", file.Name);
this._log.LogTrace("Skipping file {0} (not structured content)", file.Name);
continue;
}

// Use a different partitioning strategy depending on the file type
List<string> chunks;
BinaryData fileContent = await this._orchestrator.ReadFileAsync(pipeline, file.Name, cancellationToken).ConfigureAwait(false);
string chunksMimeType = MimeTypes.PlainText;
List<StructuredChunk> chunks = [];
this._log.LogDebug("Partitioning text file {0}", file.Name);

// Skip empty partitions. Also: partitionContent.ToString() throws an exception if there are no bytes.
if (fileContent.IsEmpty) { continue; }
var structuredChunks = file.FileContentObject?.Sections;
if (structuredChunks is null)
{
BinaryData binary = await this._orchestrator.ReadFileAsync(pipeline, file.Name, cancellationToken).ConfigureAwait(false);
if (binary.IsEmpty)
{
structuredChunks = [];
}
else
{
var fileContent = JsonSerializer.Deserialize<FileContent>(binary.ToString());
structuredChunks = fileContent?.Sections ?? [];
}
}

switch (file.MimeType)
{
case MimeTypes.PlainText:
{
this._log.LogDebug("Partitioning text file {0}", file.Name);
string content = fileContent.ToString();
chunks = this._plainTextChunker.Split(content, new PlainTextChunkerOptions { MaxTokensPerChunk = maxTokensPerChunk, Overlap = overlappingTokens, ChunkHeader = chunkHeader });
foreach (var structuredChunk in structuredChunks)
{
var chunksInAPage = this._plainTextChunker.Split(structuredChunk.Content, new PlainTextChunkerOptions { MaxTokensPerChunk = maxTokensPerChunk, Overlap = overlappingTokens, ChunkHeader = chunkHeader });
chunks.AddRange(chunksInAPage.Select(c => new StructuredChunk { Content = c, PageNumber = structuredChunk.PageNumber }));
}
break;
}

case MimeTypes.MarkDown:
{
this._log.LogDebug("Partitioning MarkDown file {0}", file.Name);
string content = fileContent.ToString();
chunksMimeType = MimeTypes.MarkDown;
chunks = this._markDownChunker.Split(content, new MarkDownChunkerOptions { MaxTokensPerChunk = maxTokensPerChunk, Overlap = overlappingTokens, ChunkHeader = chunkHeader });
foreach (var structuredChunk in structuredChunks)
{
var chunksInAPage = this._markDownChunker.Split(structuredChunk.Content, new MarkDownChunkerOptions { MaxTokensPerChunk = maxTokensPerChunk, Overlap = overlappingTokens, ChunkHeader = chunkHeader });
chunks.AddRange(chunksInAPage.Select(c => new StructuredChunk { Content = c, PageNumber = structuredChunk.PageNumber }));
}
break;
}

Expand All @@ -156,8 +172,8 @@ public TextPartitioningHandler(
for (int partitionNumber = 0; partitionNumber < chunks.Count; partitionNumber++)
{
// TODO: turn partitions into objects with more details, e.g. page number
string text = chunks[partitionNumber];
int sectionNumber = 0; // TODO: use this to store the page number (if any)
string text = chunks[partitionNumber].Content;
int sectionNumber = chunks[partitionNumber].PageNumber;
BinaryData textData = new(text);

var destFile = uploadedFile.GetPartitionFileName(partitionNumber);
Expand All @@ -169,7 +185,7 @@ public TextPartitioningHandler(
ParentId = uploadedFile.Id,
Name = destFile,
Size = text.Length,
MimeType = chunksMimeType,
MimeType = file.MimeType,
ArtifactType = DataPipeline.ArtifactTypes.TextPartition,
PartitionNumber = partitionNumber,
SectionNumber = sectionNumber,
Expand Down Expand Up @@ -203,4 +219,10 @@ private static ConfigurationException ChunkTooBigForEmbeddingsException(int valu
return new ConfigurationException(errMsg);
}
#pragma warning restore CA2254

internal class StructuredChunk
{
public required string Content { get; set; }
public int PageNumber { get; set; }
}
}