From bf2c540ee3fb93c64555e467708e3be461477a03 Mon Sep 17 00:00:00 2001 From: ksemenenko Date: Sat, 11 Oct 2025 17:50:14 +0200 Subject: [PATCH 1/6] fix review feedback --- README.md | 55 ++- eng/install-dotnet.sh | 40 ++ .../Conversion/ConversionArtifacts.cs | 173 ++++++++ .../Conversion/ConversionPipeline.cs | 51 +++ .../Conversion/ConversionPipelineContext.cs | 46 ++ .../Conversion/IConversionMiddleware.cs | 24 ++ .../Middleware/AiImageEnrichmentMiddleware.cs | 324 ++++++++++++++ src/MarkItDown/Converters/DocxConverter.cs | 349 ++++++++++----- src/MarkItDown/Converters/PdfConverter.cs | 387 +++++++++++++---- src/MarkItDown/Converters/PptxConverter.cs | 397 +++++++++++++----- src/MarkItDown/DocumentConverterResult.cs | 9 +- src/MarkItDown/MarkItDown.cs | 37 +- src/MarkItDown/MarkItDownOptions.cs | 12 + src/MarkItDown/MetadataKeys.cs | 1 + tests/MarkItDown.Tests/DocxConverterTests.cs | 48 +++ .../Fixtures/DocxInlineImageFactory.cs | 106 +++++ .../PdfConverterIntelligenceTests.cs | 59 +++ tests/MarkItDown.Tests/NewConvertersTests.cs | 49 +++ tests/MarkItDown.Tests/PdfConverterTests.cs | 30 ++ tests/MarkItDown.Tests/PptxConverterTests.cs | 47 +++ tests/MarkItDown.Tests/RecordingPipeline.cs | 64 +++ 21 files changed, 1995 insertions(+), 313 deletions(-) create mode 100755 eng/install-dotnet.sh create mode 100644 src/MarkItDown/Conversion/ConversionArtifacts.cs create mode 100644 src/MarkItDown/Conversion/ConversionPipeline.cs create mode 100644 src/MarkItDown/Conversion/ConversionPipelineContext.cs create mode 100644 src/MarkItDown/Conversion/IConversionMiddleware.cs create mode 100644 src/MarkItDown/Conversion/Middleware/AiImageEnrichmentMiddleware.cs create mode 100644 tests/MarkItDown.Tests/DocxConverterTests.cs create mode 100644 tests/MarkItDown.Tests/Fixtures/DocxInlineImageFactory.cs create mode 100644 tests/MarkItDown.Tests/PptxConverterTests.cs create mode 100644 tests/MarkItDown.Tests/RecordingPipeline.cs diff --git a/README.md b/README.md index dcecfee20..6a79bdd2c 100644 --- a/README.md +++ b/README.md @@ -56,10 +56,12 @@ This is a high-fidelity C# port of Microsoft's original [MarkItDown Python libra ✨ **Modern .NET** - Targets .NET 9.0 with up-to-date language features 📦 **NuGet Package** - Drop-in dependency for libraries and automation pipelines -🔄 **Async/Await** - Fully asynchronous pipeline for responsive apps -🧠 **LLM-Optimized** - Markdown tailored for AI ingestion and summarisation -🔧 **Extensible** - Register custom converters or plug additional caption/transcription services -🧭 **Smart Detection** - Automatic MIME, charset, and file-type guessing (including data/file URIs) +🔄 **Async/Await** - Fully asynchronous pipeline for responsive apps +🧠 **LLM-Optimized** - Markdown tailored for AI ingestion and summarisation +🔧 **Extensible** - Register custom converters or plug additional caption/transcription services +🧩 **Conversion middleware** - Compose post-processing steps with `IConversionMiddleware` (AI enrichment ready) +📂 **Raw artifacts API** - Inspect text blocks, tables, and images via `DocumentConverterResult.Artifacts` +🧭 **Smart Detection** - Automatic MIME, charset, and file-type guessing (including data/file URIs) ⚡ **High Performance** - Stream-friendly, minimal allocations, zero temp files ## 📋 Format Support @@ -102,11 +104,12 @@ This is a high-fidelity C# port of Microsoft's original [MarkItDown Python libra - Header detection based on formatting - List item recognition - Title extraction from document content +- Page snapshot artifacts ensure every page can be sent through AI enrichment (OCR, diagram-to-Mermaid, chart narration) even when the PDF exposes selectable text ### Office Documents (DOCX/XLSX/PPTX) -- **Word (.docx)**: Headers, paragraphs, tables, bold/italic formatting +- **Word (.docx)**: Headers, paragraphs, tables, bold/italic formatting, and embedded images captured for AI enrichment (OCR, Mermaid-ready diagrams) - **Excel (.xlsx)**: Spreadsheet data as Markdown tables with sheet organization -- **PowerPoint (.pptx)**: Slide-by-slide content with title recognition +- **PowerPoint (.pptx)**: Slide-by-slide content with title recognition plus image artifacts primed for detailed AI captions and diagrams ### CSV Conversion Features - Automatic table formatting with headers @@ -1056,6 +1059,17 @@ var result = await markItDown.ConvertAsync("document.pdf"); Console.WriteLine(result.Markdown); ``` +### .NET SDK Setup + +MarkItDown targets .NET 9.0. If your environment does not have the required SDK, run the helper script once: + +```bash +./eng/install-dotnet.sh +``` + +The script installs the SDK into `~/.dotnet` using the official `dotnet-install` bootstrapper and prints the environment +variables to add to your shell profile so the `dotnet` CLI is available on subsequent sessions. + ### Building from Source ```bash @@ -1084,6 +1098,10 @@ The command emits standard test results plus a Cobertura coverage report at [ReportGenerator](https://github.com/danielpalme/ReportGenerator) can turn this into HTML or Markdown dashboards. +> ✅ The regression suite now exercises DOCX and PPTX conversions with embedded imagery, ensuring conversion middleware runs and enriched descriptions remain attached to the composed Markdown. +> +> ✅ Additional image-placement regressions verify that AI-generated captions are injected immediately after each source placeholder for DOCX, PPTX, and PDF outputs. + ### Project Structure ``` @@ -1218,6 +1236,31 @@ var options = new MarkItDownOptions var markItDown = new MarkItDown(options); ``` +### Conversion Middleware & Raw Artifacts + +Every conversion now exposes the raw extraction artifacts that feed the Markdown composer. Use `DocumentConverterResult.Artifacts` to inspect page text, tables, or embedded images before they are flattened into Markdown. You can plug additional processing by registering `IConversionMiddleware` instances through `MarkItDownOptions.ConversionMiddleware`. Middleware executes after extraction and can mutate segments, enrich metadata, or call external AI services. When an `IChatClient` is supplied and `EnableAiImageEnrichment` remains `true` (default), MarkItDown automatically adds the built-in `AiImageEnrichmentMiddleware` to describe charts, diagrams, and other visuals. The middleware keeps enriched prose anchored to the exact Markdown placeholder emitted during extraction, ensuring captions, Mermaid diagrams, and OCR text land beside the original image instead of drifting to the end of the section. + +```csharp +var options = new MarkItDownOptions +{ + AiModels = new StaticAiModelProvider(chatClient: myChatClient, speechToTextClient: null), + ConversionMiddleware = new IConversionMiddleware[] + { + new MyDomainSpecificMiddleware() + } +}; + +var markItDown = new MarkItDown(options); +var result = await markItDown.ConvertAsync("docs/diagram.docx"); + +foreach (var image in result.Artifacts.Images) +{ + Console.WriteLine($"Image {image.Label}: {image.DetailedDescription}"); +} +``` + +Set `EnableAiImageEnrichment` to `false` when you need a completely custom pipeline with no default AI step. + ### Production Configuration with Error Handling ```csharp diff --git a/eng/install-dotnet.sh b/eng/install-dotnet.sh new file mode 100755 index 000000000..b4db8d6a4 --- /dev/null +++ b/eng/install-dotnet.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env bash +set -euo pipefail + +CHANNEL="9.0" +INSTALL_DIR="${DOTNET_ROOT:-$HOME/.dotnet}" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +RESOLVED_INSTALL_DIR="${INSTALL_DIR}" +TEMP_SCRIPT="${SCRIPT_DIR}/dotnet-install.sh" + +cleanup() { + rm -f "${TEMP_SCRIPT}" +} +trap cleanup EXIT + +if ! command -v wget >/dev/null 2>&1 && ! command -v curl >/dev/null 2>&1; then + echo "Either wget or curl is required to download dotnet-install.sh" >&2 + exit 1 +fi + +DOWNLOAD_TOOL="wget" +DOWNLOAD_ARGS=("-q" "-O") +URL="https://dot.net/v1/dotnet-install.sh" + +if command -v curl >/dev/null 2>&1; then + DOWNLOAD_TOOL="curl" + DOWNLOAD_ARGS=("-sSL" "-o") +fi + +${DOWNLOAD_TOOL} "${DOWNLOAD_ARGS[@]}" "${TEMP_SCRIPT}" "${URL}" +chmod +x "${TEMP_SCRIPT}" + +"${TEMP_SCRIPT}" --channel "${CHANNEL}" --install-dir "${INSTALL_DIR}" --no-path + +cat < +/// Represents the raw artifacts extracted during conversion prior to Markdown composition. +/// +public sealed class ConversionArtifacts +{ + /// + /// Initializes a new instance of the class. + /// + public ConversionArtifacts() + { + TextBlocks = new List(); + Tables = new List(); + Images = new List(); + Metadata = new Dictionary(); + } + + private ConversionArtifacts(bool _) + { + TextBlocks = EmptyTextBlocks; + Tables = EmptyTables; + Images = EmptyImages; + Metadata = EmptyMetadata; + } + + /// + /// Gets a reusable empty instance. + /// + public static ConversionArtifacts Empty { get; } = new(true); + + private static readonly IList EmptyTextBlocks = new ReadOnlyCollection(Array.Empty()); + private static readonly IList EmptyTables = new ReadOnlyCollection(Array.Empty()); + private static readonly IList EmptyImages = new ReadOnlyCollection(Array.Empty()); + private static readonly IDictionary EmptyMetadata = new ReadOnlyDictionary(new Dictionary()); + + /// + /// Gets the raw text artifacts captured from the source. + /// + public IList TextBlocks { get; } + + /// + /// Gets the tabular artifacts captured from the source. + /// + public IList Tables { get; } + + /// + /// Gets the image artifacts captured from the source. + /// + public IList Images { get; } + + /// + /// Gets conversion-level metadata surfaced by the converter. + /// + public IDictionary Metadata { get; } +} + +/// +/// Represents a block of text extracted from the source document. +/// +public sealed class TextArtifact +{ + public TextArtifact(string text, int? pageNumber = null, string? source = null, string? label = null) + { + Text = text ?? string.Empty; + PageNumber = pageNumber; + Source = source; + Label = label; + } + + public string Text { get; set; } + + public int? PageNumber { get; set; } + + public string? Source { get; set; } + + public string? Label { get; set; } +} + +/// +/// Represents tabular content extracted from the source document. +/// +public sealed class TableArtifact +{ + public TableArtifact(IList> rows, int? pageNumber = null, string? source = null, string? label = null) + { + Rows = rows ?? throw new ArgumentNullException(nameof(rows)); + PageNumber = pageNumber; + Source = source; + Label = label; + } + + public IList> Rows { get; } + + public int? PageNumber { get; set; } + + public string? Source { get; set; } + + public string? Label { get; set; } +} + +/// +/// Represents an image extracted from the source document. +/// +public sealed class ImageArtifact +{ + public ImageArtifact(byte[] data, string? contentType = null, int? pageNumber = null, string? source = null, string? label = null) + { + Data = data ?? throw new ArgumentNullException(nameof(data)); + ContentType = contentType; + PageNumber = pageNumber; + Source = source; + Label = label; + Metadata = new Dictionary(); + } + + /// + /// Gets the raw binary data for the image. + /// + public byte[] Data { get; } + + /// + /// Gets the content type associated with the image. + /// + public string? ContentType { get; set; } + + /// + /// Gets or sets the page number that owns the image, when applicable. + /// + public int? PageNumber { get; set; } + + /// + /// Gets or sets the logical source identifier for the image. + /// + public string? Source { get; set; } + + /// + /// Gets or sets the friendly label for the image. + /// + public string? Label { get; set; } + + /// + /// Gets or sets the enriched description generated for the image. + /// + public string? DetailedDescription { get; set; } + + /// + /// Gets or sets a Mermaid diagram representation when the image depicts structured data. + /// + public string? MermaidDiagram { get; set; } + + /// + /// Gets or sets additional textual extraction (such as OCR output). + /// + public string? RawText { get; set; } + + /// + /// Gets metadata describing the image artifact. + /// + public IDictionary Metadata { get; } + + /// + /// Gets or sets the segment index that references this artifact within the composed output. + /// + public int? SegmentIndex { get; set; } + + /// + /// Gets or sets the Markdown placeholder that was emitted during extraction for this image. + /// + public string? PlaceholderMarkdown { get; set; } +} diff --git a/src/MarkItDown/Conversion/ConversionPipeline.cs b/src/MarkItDown/Conversion/ConversionPipeline.cs new file mode 100644 index 000000000..221aff03f --- /dev/null +++ b/src/MarkItDown/Conversion/ConversionPipeline.cs @@ -0,0 +1,51 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Threading; +using System.Threading.Tasks; +using MarkItDown.Intelligence; +using Microsoft.Extensions.Logging; + +namespace MarkItDown; + +/// +/// Sequential middleware pipeline that executes configured components. +/// +public sealed class ConversionPipeline : IConversionPipeline +{ + private readonly IReadOnlyList middlewares; + private readonly IAiModelProvider aiModels; + private readonly ILogger? logger; + + public static IConversionPipeline Empty { get; } = new ConversionPipeline(Array.Empty(), NullAiModelProvider.Instance, logger: null); + + public ConversionPipeline(IEnumerable middlewares, IAiModelProvider aiModels, ILogger? logger) + { + this.middlewares = (middlewares ?? throw new ArgumentNullException(nameof(middlewares))).ToArray(); + this.aiModels = aiModels ?? NullAiModelProvider.Instance; + this.logger = logger; + } + + public async Task ExecuteAsync(StreamInfo streamInfo, ConversionArtifacts artifacts, IList segments, CancellationToken cancellationToken) + { + if (middlewares.Count == 0) + { + return; + } + + var context = new ConversionPipelineContext(streamInfo, artifacts, segments, aiModels, logger); + foreach (var middleware in middlewares) + { + cancellationToken.ThrowIfCancellationRequested(); + + try + { + await middleware.InvokeAsync(context, cancellationToken).ConfigureAwait(false); + } + catch (Exception ex) + { + logger?.LogWarning(ex, "Conversion middleware {Middleware} failed", middleware.GetType().Name); + } + } + } +} diff --git a/src/MarkItDown/Conversion/ConversionPipelineContext.cs b/src/MarkItDown/Conversion/ConversionPipelineContext.cs new file mode 100644 index 000000000..859f3de5b --- /dev/null +++ b/src/MarkItDown/Conversion/ConversionPipelineContext.cs @@ -0,0 +1,46 @@ +using System; +using System.Collections.Generic; +using MarkItDown.Intelligence; +using Microsoft.Extensions.Logging; + +namespace MarkItDown; + +/// +/// Provides context data for conversion middleware. +/// +public sealed class ConversionPipelineContext +{ + internal ConversionPipelineContext(StreamInfo streamInfo, ConversionArtifacts artifacts, IList segments, IAiModelProvider aiModels, ILogger? logger) + { + StreamInfo = streamInfo ?? throw new ArgumentNullException(nameof(streamInfo)); + Artifacts = artifacts ?? throw new ArgumentNullException(nameof(artifacts)); + Segments = segments ?? throw new ArgumentNullException(nameof(segments)); + AiModels = aiModels ?? NullAiModelProvider.Instance; + Logger = logger; + } + + /// + /// Gets metadata that describes the converted stream. + /// + public StreamInfo StreamInfo { get; } + + /// + /// Gets the raw artifacts extracted by the converter. + /// + public ConversionArtifacts Artifacts { get; } + + /// + /// Gets the mutable list of segments generated by the converter. + /// + public IList Segments { get; } + + /// + /// Gets access to configured AI model providers. + /// + public IAiModelProvider AiModels { get; } + + /// + /// Gets the logger supplied by the conversion host. + /// + public ILogger? Logger { get; } +} diff --git a/src/MarkItDown/Conversion/IConversionMiddleware.cs b/src/MarkItDown/Conversion/IConversionMiddleware.cs new file mode 100644 index 000000000..fe6051b27 --- /dev/null +++ b/src/MarkItDown/Conversion/IConversionMiddleware.cs @@ -0,0 +1,24 @@ +using System.Collections.Generic; +using System.Threading; +using System.Threading.Tasks; + +namespace MarkItDown; + +/// +/// Represents a middleware component that can inspect or modify conversion artifacts before Markdown composition. +/// +public interface IConversionMiddleware +{ + /// + /// Invoked for each conversion with the extracted artifacts and mutable segment list. + /// + Task InvokeAsync(ConversionPipelineContext context, CancellationToken cancellationToken); +} + +/// +/// Abstraction for executing a middleware pipeline. +/// +public interface IConversionPipeline +{ + Task ExecuteAsync(StreamInfo streamInfo, ConversionArtifacts artifacts, IList segments, CancellationToken cancellationToken); +} diff --git a/src/MarkItDown/Conversion/Middleware/AiImageEnrichmentMiddleware.cs b/src/MarkItDown/Conversion/Middleware/AiImageEnrichmentMiddleware.cs new file mode 100644 index 000000000..4afbb6995 --- /dev/null +++ b/src/MarkItDown/Conversion/Middleware/AiImageEnrichmentMiddleware.cs @@ -0,0 +1,324 @@ +using System; +using System.Collections.Generic; +using System.Globalization; +using System.Linq; +using System.Text; +using System.Text.Json; +using System.Threading; +using System.Threading.Tasks; +using MarkItDown; +using Microsoft.Extensions.AI; +using Microsoft.Extensions.Logging; + +namespace MarkItDown.Conversion.Middleware; + +/// +/// Middleware that enriches extracted images using an . +/// +public sealed class AiImageEnrichmentMiddleware : IConversionMiddleware +{ + public async Task InvokeAsync(ConversionPipelineContext context, CancellationToken cancellationToken) + { + if (context is null) + { + throw new ArgumentNullException(nameof(context)); + } + + if (context.Artifacts.Images.Count == 0) + { + return; + } + + var chatClient = context.AiModels.ChatClient; + if (chatClient is null) + { + return; + } + + foreach (var image in context.Artifacts.Images) + { + cancellationToken.ThrowIfCancellationRequested(); + + if (image.DetailedDescription is not null) + { + continue; + } + + var prompt = BuildPrompt(context.StreamInfo, image); + ChatResponse? response = null; + + try + { + response = await chatClient.GetResponseAsync( + prompt, + new ChatOptions + { + Temperature = 0.1f, + }, + useJsonSchemaResponseFormat: true, + cancellationToken: cancellationToken).ConfigureAwait(false); + } + catch (Exception ex) + { + context.Logger?.LogWarning(ex, "Image enrichment failed for {Label}", image.Label ?? image.PageNumber?.ToString(CultureInfo.InvariantCulture)); + continue; + } + + if (response is null) + { + continue; + } + + ImageInsight? insight = null; + if (!response.TryGetResult(out insight) && !string.IsNullOrWhiteSpace(response.Text)) + { + try + { + insight = JsonSerializer.Deserialize(response.Text); + } + catch (JsonException) + { + insight = new ImageInsight { Summary = response.Text }; // fall back to raw text + } + } + + var markdown = insight?.ToMarkdown(); + if (string.IsNullOrWhiteSpace(markdown)) + { + markdown = response.Text; + } + + if (string.IsNullOrWhiteSpace(markdown)) + { + continue; + } + + image.DetailedDescription = markdown.Trim(); + image.MermaidDiagram = insight?.MermaidDiagram; + image.RawText = insight?.ExtractedText; + + UpdateImageMetadata(image, insight); + UpdateSegments(context, image, markdown!, insight?.MermaidDiagram); + + image.Metadata["detailedDescription"] = image.DetailedDescription; + } + } + + private static string BuildPrompt(StreamInfo streamInfo, ImageArtifact image) + { + var builder = new StringBuilder(); + builder.AppendLine("You are an assistant that analyses embedded document images to produce exhaustive textual descriptions."); + builder.AppendLine("Describe every meaningful component, explain relationships, and capture all numeric values."); + builder.AppendLine("If the image contains a flow chart, architecture, timeline, UML, or graph, produce a Mermaid diagram representing it."); + builder.AppendLine("If the content is a chart or table, provide a Markdown table capturing the data points."); + builder.AppendLine("Always respond with JSON using the following schema:"); + builder.AppendLine("{\"summary\": string, \"keyFindings\": string[], \"mermaidDiagram\": string | null, \"dataTableMarkdown\": string | null, \"extractedText\": string | null}"); + builder.AppendLine("The summary should be a dense paragraph covering every notable element."); + builder.AppendLine("Key findings should include facts, metrics, and contextual insights."); + builder.AppendLine("When a Mermaid diagram is not applicable return null for mermaidDiagram."); + builder.AppendLine("When there is no table or chart return null for dataTableMarkdown."); + builder.AppendLine("When no OCR text is relevant return null for extractedText."); + + builder.AppendLine(); + builder.AppendLine("Image metadata:"); + var source = streamInfo.FileName ?? streamInfo.LocalPath ?? streamInfo.Url ?? "unknown"; + builder.AppendLine($"- Source: {source}"); + builder.AppendLine($"- Page: {image.PageNumber?.ToString(CultureInfo.InvariantCulture) ?? "unknown"}"); + builder.AppendLine($"- MimeType: {image.ContentType ?? "unknown"}"); + builder.AppendLine(); + + var base64 = Convert.ToBase64String(image.Data); + builder.Append("ImagePayload: data:"); + builder.Append(image.ContentType ?? "application/octet-stream"); + builder.Append(";base64,"); + builder.Append(base64); + + return builder.ToString(); + } + + private static void UpdateImageMetadata(ImageArtifact image, ImageInsight? insight) + { + if (!string.IsNullOrWhiteSpace(insight?.Summary)) + { + image.Metadata[MetadataKeys.Caption] = insight!.Summary!; + } + + if (!string.IsNullOrWhiteSpace(insight?.ExtractedText)) + { + image.Metadata["ocrText"] = insight!.ExtractedText!; + } + + if (!string.IsNullOrWhiteSpace(insight?.DataTableMarkdown)) + { + image.Metadata["dataTableMarkdown"] = insight!.DataTableMarkdown!; + } + } + + private static void UpdateSegments(ConversionPipelineContext context, ImageArtifact image, string markdown, string? mermaid) + { + if (image.SegmentIndex is int index && index >= 0 && index < context.Segments.Count) + { + var existing = context.Segments[index]; + var metadata = new Dictionary(existing.AdditionalMetadata) + { + ["imageEnriched"] = "true" + }; + + if (image.PageNumber is int page && !metadata.ContainsKey(MetadataKeys.Page)) + { + metadata[MetadataKeys.Page] = page.ToString(CultureInfo.InvariantCulture); + } + + var enrichmentBlock = BuildEnrichmentBlock(markdown, mermaid); + var updatedMarkdown = existing.Markdown; + + if (!string.IsNullOrWhiteSpace(image.PlaceholderMarkdown) && + TryInjectAfterPlaceholder(existing.Markdown, image.PlaceholderMarkdown!, enrichmentBlock, out var injected)) + { + updatedMarkdown = injected; + } + else + { + var builder = new StringBuilder(existing.Markdown); + builder.Append(enrichmentBlock); + updatedMarkdown = builder.ToString(); + } + + context.Segments[index] = new DocumentSegment( + updatedMarkdown.TrimEnd(), + existing.Type, + existing.Number, + existing.Label, + existing.StartTime, + existing.EndTime, + existing.Source, + metadata); + } + else + { + var metadata = new Dictionary + { + ["imageEnriched"] = "true" + }; + + if (image.PageNumber is int page) + { + metadata[MetadataKeys.Page] = page.ToString(CultureInfo.InvariantCulture); + } + + if (!string.IsNullOrWhiteSpace(mermaid)) + { + metadata["mermaid"] = "true"; + } + + var composed = BuildStandaloneEnrichment(markdown, mermaid); + + context.Segments.Add(new DocumentSegment( + composed, + SegmentType.Image, + image.PageNumber, + image.Label, + source: image.Source, + additionalMetadata: metadata)); + image.SegmentIndex = context.Segments.Count - 1; + image.PlaceholderMarkdown ??= markdown.Trim(); + } + } + + private static bool ContainsMermaid(string markdown) + => markdown.Contains("```mermaid", StringComparison.OrdinalIgnoreCase); + + private static string BuildEnrichmentBlock(string markdown, string? mermaid) + { + var builder = new StringBuilder(); + builder.AppendLine(); + builder.AppendLine(markdown.Trim()); + + if (!string.IsNullOrWhiteSpace(mermaid) && !ContainsMermaid(markdown)) + { + builder.AppendLine(); + builder.AppendLine("```mermaid"); + builder.AppendLine(mermaid!.Trim()); + builder.AppendLine("```"); + } + + return builder.ToString(); + } + + private static string BuildStandaloneEnrichment(string markdown, string? mermaid) + { + var builder = new StringBuilder(markdown.Trim()); + + if (!string.IsNullOrWhiteSpace(mermaid) && !ContainsMermaid(markdown)) + { + builder.AppendLine().AppendLine("```mermaid"); + builder.AppendLine(mermaid!.Trim()); + builder.AppendLine("```"); + } + + return builder.ToString().TrimEnd(); + } + + private static bool TryInjectAfterPlaceholder(string existingMarkdown, string placeholder, string enrichmentBlock, out string updated) + { + var index = existingMarkdown.IndexOf(placeholder, StringComparison.Ordinal); + if (index < 0) + { + updated = existingMarkdown; + return false; + } + + var insertPosition = index + placeholder.Length; + var builder = new StringBuilder(existingMarkdown.Length + enrichmentBlock.Length); + builder.Append(existingMarkdown, 0, insertPosition); + builder.Append(enrichmentBlock); + builder.Append(existingMarkdown.AsSpan(insertPosition)); + updated = builder.ToString(); + return true; + } + + private sealed class ImageInsight + { + public string? Summary { get; set; } + + public IList KeyFindings { get; set; } = new List(); + + public string? MermaidDiagram { get; set; } + + public string? DataTableMarkdown { get; set; } + + public string? ExtractedText { get; set; } + + public string ToMarkdown() + { + var builder = new StringBuilder(); + + if (!string.IsNullOrWhiteSpace(Summary)) + { + builder.AppendLine(Summary.Trim()); + } + + if (KeyFindings.Count > 0) + { + builder.AppendLine().AppendLine("Key findings:"); + foreach (var finding in KeyFindings.Where(static f => !string.IsNullOrWhiteSpace(f))) + { + builder.Append("- ").AppendLine(finding.Trim()); + } + } + + if (!string.IsNullOrWhiteSpace(DataTableMarkdown)) + { + builder.AppendLine().AppendLine(DataTableMarkdown.Trim()); + } + + if (!string.IsNullOrWhiteSpace(ExtractedText)) + { + builder.AppendLine().AppendLine("Extracted text:"); + builder.AppendLine(ExtractedText.Trim()); + } + + return builder.ToString().Trim(); + } + } +} diff --git a/src/MarkItDown/Converters/DocxConverter.cs b/src/MarkItDown/Converters/DocxConverter.cs index 2c884cc01..e78ec50b1 100644 --- a/src/MarkItDown/Converters/DocxConverter.cs +++ b/src/MarkItDown/Converters/DocxConverter.cs @@ -1,9 +1,14 @@ +using System; using System.Collections.Generic; using System.Globalization; +using System.IO; using System.Linq; using System.Text; +using System.Threading; +using System.Threading.Tasks; using DocumentFormat.OpenXml.Packaging; using DocumentFormat.OpenXml.Wordprocessing; +using MarkItDown.Intelligence; namespace MarkItDown.Converters; @@ -23,10 +28,14 @@ public sealed class DocxConverter : IDocumentConverter }; private readonly SegmentOptions segmentOptions; + private readonly IConversionPipeline conversionPipeline; + private readonly IImageUnderstandingProvider? imageUnderstandingProvider; - public DocxConverter(SegmentOptions? segmentOptions = null) + public DocxConverter(SegmentOptions? segmentOptions = null, IConversionPipeline? pipeline = null, IImageUnderstandingProvider? imageProvider = null) { this.segmentOptions = segmentOptions ?? SegmentOptions.Default; + conversionPipeline = pipeline ?? ConversionPipeline.Empty; + imageUnderstandingProvider = imageProvider; } public int Priority => 210; // Between PDF and plain text @@ -76,18 +85,24 @@ public bool Accepts(Stream stream, StreamInfo streamInfo, CancellationToken canc return true; } - public Task ConvertAsync(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken = default) + public async Task ConvertAsync(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken = default) { try { if (stream.CanSeek) stream.Position = 0; - var segments = ExtractDocumentSegments(stream, streamInfo.FileName, cancellationToken); - var markdown = SegmentMarkdownComposer.Compose(segments, segmentOptions); - var title = ExtractTitle(markdown); + var extraction = await ExtractDocumentAsync(stream, streamInfo, cancellationToken).ConfigureAwait(false); + await conversionPipeline.ExecuteAsync(streamInfo, extraction.Artifacts, extraction.Segments, cancellationToken).ConfigureAwait(false); - return Task.FromResult(new DocumentConverterResult(markdown, title, segments)); + var markdown = SegmentMarkdownComposer.Compose(extraction.Segments, segmentOptions); + var title = ExtractTitle(extraction.RawText); + if (string.IsNullOrWhiteSpace(title)) + { + title = ExtractTitle(markdown); + } + + return new DocumentConverterResult(markdown, title, extraction.Segments, extraction.Artifacts); } catch (Exception ex) when (ex is not MarkItDownException) { @@ -95,20 +110,42 @@ public Task ConvertAsync(Stream stream, StreamInfo stre } } - private IReadOnlyList ExtractDocumentSegments(Stream stream, string? fileName, CancellationToken cancellationToken) + private sealed class DocxExtractionResult + { + public DocxExtractionResult(List segments, ConversionArtifacts artifacts, string rawText) + { + Segments = segments; + Artifacts = artifacts; + RawText = rawText; + } + + public List Segments { get; } + + public ConversionArtifacts Artifacts { get; } + + public string RawText { get; } + } + + private sealed record PageContent(int PageNumber, string Markdown); + + private async Task ExtractDocumentAsync(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken) { using var wordDocument = WordprocessingDocument.Open(stream, false); var body = wordDocument.MainDocumentPart?.Document?.Body; if (body is null) { - return Array.Empty(); + return new DocxExtractionResult(new List(), new ConversionArtifacts(), string.Empty); } - var segments = new List(); + var artifacts = new ConversionArtifacts(); + var pages = new List(); var pageBuilder = new StringBuilder(); + var rawTextBuilder = new StringBuilder(); + var imagesByPage = new Dictionary>(); var pageNumber = 1; - var source = fileName; + var tableCount = 0; + var imageCount = 0; foreach (var element in body.Elements()) { @@ -120,7 +157,7 @@ private IReadOnlyList ExtractDocumentSegments(Stream stream, st { if (ContainsPageBreak(paragraph)) { - FinalizePageSegment(segments, pageBuilder, ref pageNumber, source); + FinalizePage(pages, pageBuilder, ref pageNumber, rawTextBuilder); } var paragraphMarkdown = ConvertParagraph(paragraph); @@ -130,16 +167,35 @@ private IReadOnlyList ExtractDocumentSegments(Stream stream, st pageBuilder.AppendLine(); } + if (wordDocument.MainDocumentPart is not null) + { + foreach (var drawing in paragraph.Descendants()) + { + var artifact = await ExtractImageAsync(drawing, wordDocument.MainDocumentPart, pageNumber, streamInfo, cancellationToken).ConfigureAwait(false); + if (artifact is null) + { + continue; + } + + imageCount++; + artifact.Label ??= $"Image {imageCount}"; + AppendImagePlaceholder(pageBuilder, artifact); + AddImageToPage(imagesByPage, pageNumber, artifact); + } + } + break; } case Table table: { - var tableMarkdown = ConvertTableToMarkdown(table); + var (tableMarkdown, rawTable) = ConvertTable(table); if (!string.IsNullOrWhiteSpace(tableMarkdown)) { pageBuilder.AppendLine(tableMarkdown.TrimEnd()); pageBuilder.AppendLine(); + tableCount++; + artifacts.Tables.Add(new TableArtifact(rawTable, pageNumber, streamInfo.FileName, $"Table {tableCount}")); } break; @@ -159,133 +215,133 @@ private IReadOnlyList ExtractDocumentSegments(Stream stream, st } } - if (pageBuilder.Length > 0) + FinalizePage(pages, pageBuilder, ref pageNumber, rawTextBuilder); + + var segments = new List(); + + foreach (var page in pages) { - var markdown = pageBuilder.ToString().TrimEnd(); - if (!string.IsNullOrEmpty(markdown)) + var segment = CreatePageSegment(page.Markdown, page.PageNumber, streamInfo.FileName); + var index = segments.Count; + segments.Add(segment); + artifacts.TextBlocks.Add(new TextArtifact(page.Markdown, page.PageNumber, streamInfo.FileName, segment.Label)); + + if (imagesByPage.TryGetValue(page.PageNumber, out var pageImages)) { - segments.Add(CreatePageSegment(markdown, pageNumber, source)); + foreach (var artifact in pageImages) + { + artifact.SegmentIndex = index; + artifacts.Images.Add(artifact); + } } } - return segments; + return new DocxExtractionResult(segments, artifacts, rawTextBuilder.ToString().Trim()); } - private static void FinalizePageSegment(List segments, StringBuilder builder, ref int pageNumber, string? source) + private static void FinalizePage(ICollection pages, StringBuilder builder, ref int pageNumber, StringBuilder rawTextBuilder) { + if (builder.Length == 0) + { + pageNumber++; + return; + } + var markdown = builder.ToString().TrimEnd(); if (!string.IsNullOrEmpty(markdown)) { - segments.Add(CreatePageSegment(markdown, pageNumber, source)); + pages.Add(new PageContent(pageNumber, markdown)); + + if (rawTextBuilder.Length > 0) + { + rawTextBuilder.AppendLine(); + } + + rawTextBuilder.AppendLine(markdown); } pageNumber++; builder.Clear(); } - private static DocumentSegment CreatePageSegment(string markdown, int pageNumber, string? source) + private static void AppendImagePlaceholder(StringBuilder builder, ImageArtifact artifact) { - var metadata = new Dictionary + var mimeType = string.IsNullOrWhiteSpace(artifact.ContentType) ? "image/png" : artifact.ContentType; + var base64 = Convert.ToBase64String(artifact.Data); + var label = artifact.Label ?? "Document image"; + var placeholder = $"![{label}](data:{mimeType};base64,{base64})"; + + artifact.PlaceholderMarkdown = placeholder; + builder.AppendLine(placeholder); + builder.AppendLine(); + } + + private static void AddImageToPage(IDictionary> imagesByPage, int pageNumber, ImageArtifact artifact) + { + if (!imagesByPage.TryGetValue(pageNumber, out var list)) { - ["page"] = pageNumber.ToString(CultureInfo.InvariantCulture) - }; + list = new List(); + imagesByPage[pageNumber] = list; + } - return new DocumentSegment( - markdown: markdown, - type: SegmentType.Page, - number: pageNumber, - label: $"Page {pageNumber}", - source: source, - additionalMetadata: metadata); + list.Add(artifact); } - private static bool ContainsPageBreak(Paragraph paragraph) - => paragraph.Descendants().Any() || - paragraph.Descendants().Any(b => b.Type?.Value == BreakValues.Page); - - private static string ConvertParagraph(Paragraph paragraph) + private async Task ExtractImageAsync(DocumentFormat.OpenXml.Wordprocessing.Drawing drawing, MainDocumentPart mainDocumentPart, int pageNumber, StreamInfo streamInfo, CancellationToken cancellationToken) { - var paragraphText = new StringBuilder(); - var isHeading = false; - var headingLevel = 0; - - var paragraphProperties = paragraph.ParagraphProperties; - if (paragraphProperties?.ParagraphStyleId?.Val?.Value is string styleId) + var blip = drawing.Descendants().FirstOrDefault(); + if (blip?.Embed?.Value is not string relationshipId) { - styleId = styleId.ToLowerInvariant(); - if (styleId.StartsWith("heading", StringComparison.Ordinal)) - { - isHeading = true; - if (int.TryParse(styleId.Replace("heading", string.Empty, StringComparison.Ordinal), out var level)) - { - headingLevel = Math.Clamp(level, 1, 6); - } - } + return null; } - foreach (var run in paragraph.Elements()) + if (mainDocumentPart.GetPartById(relationshipId) is not ImagePart imagePart) { - var runProperties = run.RunProperties; - var currentBold = runProperties?.Bold != null; - var currentItalic = runProperties?.Italic != null; + return null; + } - foreach (var textElement in run.Elements()) - { - switch (textElement) - { - case Text text: - { - var textContent = text.Text; - if (string.IsNullOrEmpty(textContent)) - { - continue; - } + using var partStream = imagePart.GetStream(); + using var memory = new MemoryStream(); + await partStream.CopyToAsync(memory, cancellationToken).ConfigureAwait(false); + var data = memory.ToArray(); - if (currentBold && !isHeading) - { - textContent = $"**{textContent}**"; - } + var artifact = new ImageArtifact(data, imagePart.ContentType, pageNumber, streamInfo.FileName); + artifact.Metadata[MetadataKeys.Page] = pageNumber.ToString(CultureInfo.InvariantCulture); - if (currentItalic && !isHeading) - { - textContent = $"*{textContent}*"; - } + if (imageUnderstandingProvider is not null) + { + try + { + using var analysisStream = new MemoryStream(data, writable: false); + var result = await imageUnderstandingProvider.AnalyzeAsync(analysisStream, streamInfo, cancellationToken).ConfigureAwait(false); + if (!string.IsNullOrWhiteSpace(result?.Caption)) + { + artifact.Metadata[MetadataKeys.Caption] = result!.Caption!; + } - paragraphText.Append(textContent); - break; - } - case TabChar: - paragraphText.Append('\t'); - break; - case Break br when br.Type?.Value == BreakValues.TextWrapping: - paragraphText.AppendLine(); - break; + if (!string.IsNullOrWhiteSpace(result?.Text)) + { + artifact.Metadata["ocrText"] = result!.Text!; + artifact.RawText = result.Text; } } + catch + { + // Ignore provider failures to keep extraction resilient. + } } - var finalText = paragraphText.ToString().Trim(); - if (string.IsNullOrWhiteSpace(finalText)) - { - return string.Empty; - } - - if (isHeading && headingLevel > 0) - { - return $"{new string('#', headingLevel)} {finalText}"; - } - - return finalText; + return artifact; } - private static string ConvertTableToMarkdown(Table table) + private static (string Markdown, IList> RawTable) ConvertTable(Table table) { - var tableData = new List>(); + var tableData = new List>(); var rows = table.Elements().ToList(); if (rows.Count == 0) { - return string.Empty; + return (string.Empty, tableData); } var gridColCount = table.GetFirstChild()?.Elements()?.Count() ?? 0; @@ -337,7 +393,7 @@ private static string ConvertTableToMarkdown(Table table) if (!tableData.Any()) { - return string.Empty; + return (string.Empty, tableData); } var markdown = new StringBuilder(); @@ -368,7 +424,102 @@ private static string ConvertTableToMarkdown(Table table) markdown.AppendLine(); } - return markdown.ToString(); + return (markdown.ToString(), tableData); + } + + private static DocumentSegment CreatePageSegment(string markdown, int pageNumber, string? source) + { + var metadata = new Dictionary + { + [MetadataKeys.Page] = pageNumber.ToString(CultureInfo.InvariantCulture) + }; + + return new DocumentSegment( + markdown: markdown, + type: SegmentType.Page, + number: pageNumber, + label: $"Page {pageNumber}", + source: source, + additionalMetadata: metadata); + } + + private static bool ContainsPageBreak(Paragraph paragraph) + => paragraph.Descendants().Any() || + paragraph.Descendants().Any(b => b.Type?.Value == BreakValues.Page); + + private static string ConvertParagraph(Paragraph paragraph) + { + var paragraphText = new StringBuilder(); + var isHeading = false; + var headingLevel = 0; + + var paragraphProperties = paragraph.ParagraphProperties; + if (paragraphProperties?.ParagraphStyleId?.Val?.Value is string styleId) + { + styleId = styleId.ToLowerInvariant(); + if (styleId.StartsWith("heading", StringComparison.Ordinal)) + { + isHeading = true; + if (int.TryParse(styleId.Replace("heading", string.Empty, StringComparison.Ordinal), out var level)) + { + headingLevel = Math.Clamp(level, 1, 6); + } + } + } + + foreach (var run in paragraph.Elements()) + { + var runProperties = run.RunProperties; + var currentBold = runProperties?.Bold != null; + var currentItalic = runProperties?.Italic != null; + + foreach (var textElement in run.Elements()) + { + switch (textElement) + { + case Text text: + { + var textContent = text.Text; + if (string.IsNullOrEmpty(textContent)) + { + continue; + } + + if (currentBold && !isHeading) + { + textContent = $"**{textContent}**"; + } + + if (currentItalic && !isHeading) + { + textContent = $"*{textContent}*"; + } + + paragraphText.Append(textContent); + break; + } + case TabChar: + paragraphText.Append('\t'); + break; + case Break br when br.Type?.Value == BreakValues.TextWrapping: + paragraphText.AppendLine(); + break; + } + } + } + + var finalText = paragraphText.ToString().Trim(); + if (string.IsNullOrWhiteSpace(finalText)) + { + return string.Empty; + } + + if (isHeading && headingLevel > 0) + { + return $"{new string('#', headingLevel)} {finalText}"; + } + + return finalText; } private static int GetGridSpan(TableCell cell) diff --git a/src/MarkItDown/Converters/PdfConverter.cs b/src/MarkItDown/Converters/PdfConverter.cs index b8da42434..191ccac82 100644 --- a/src/MarkItDown/Converters/PdfConverter.cs +++ b/src/MarkItDown/Converters/PdfConverter.cs @@ -12,6 +12,7 @@ using MarkItDown.Intelligence; using MarkItDown.Intelligence.Models; using PDFtoImage; +using static PDFtoImage.Conversion; using SkiaSharp; using UglyToad.PdfPig; @@ -37,12 +38,14 @@ public sealed class PdfConverter : IDocumentConverter private readonly SegmentOptions segmentOptions; private readonly IDocumentIntelligenceProvider? documentIntelligenceProvider; private readonly IImageUnderstandingProvider? imageUnderstandingProvider; + private readonly IConversionPipeline conversionPipeline; public PdfConverter( SegmentOptions? segmentOptions = null, IDocumentIntelligenceProvider? documentProvider = null, - IImageUnderstandingProvider? imageProvider = null) - : this(new PdfPigTextExtractor(), new PdfToImageRenderer(), segmentOptions, documentProvider, imageProvider) + IImageUnderstandingProvider? imageProvider = null, + IConversionPipeline? pipeline = null) + : this(new PdfPigTextExtractor(), new PdfToImageRenderer(), segmentOptions, documentProvider, imageProvider, pipeline) { } @@ -51,13 +54,15 @@ internal PdfConverter( IPdfImageRenderer imageRenderer, SegmentOptions? segmentOptions = null, IDocumentIntelligenceProvider? documentProvider = null, - IImageUnderstandingProvider? imageProvider = null) + IImageUnderstandingProvider? imageProvider = null, + IConversionPipeline? pipeline = null) { this.textExtractor = textExtractor ?? throw new ArgumentNullException(nameof(textExtractor)); this.imageRenderer = imageRenderer ?? throw new ArgumentNullException(nameof(imageRenderer)); this.segmentOptions = segmentOptions ?? SegmentOptions.Default; documentIntelligenceProvider = documentProvider; imageUnderstandingProvider = imageProvider; + conversionPipeline = pipeline ?? ConversionPipeline.Empty; } public int Priority => 200; // Between HTML and plain text @@ -109,43 +114,41 @@ public bool Accepts(Stream stream, StreamInfo streamInfo, CancellationToken canc return true; } + private sealed class PdfExtractionResult + { + public PdfExtractionResult(List segments, ConversionArtifacts artifacts, string rawText) + { + Segments = segments; + Artifacts = artifacts; + RawText = rawText; + } + + public List Segments { get; } + + public ConversionArtifacts Artifacts { get; } + + public string RawText { get; } + } + public async Task ConvertAsync(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken = default) { try { var pdfBytes = await ReadAllBytesAsync(stream, cancellationToken).ConfigureAwait(false); - var analysisSegments = await TryBuildSegmentsFromDocumentIntelligenceAsync(pdfBytes, streamInfo, cancellationToken).ConfigureAwait(false); - if (analysisSegments is not null && analysisSegments.Count > 0) - { - var markdownFromAnalysis = SegmentMarkdownComposer.Compose(analysisSegments, segmentOptions); - var titleFromAnalysis = ExtractTitle(string.Join(Environment.NewLine, analysisSegments.Select(s => s.Markdown))); - return new DocumentConverterResult(markdownFromAnalysis, titleFromAnalysis, analysisSegments); - } - - var pages = await textExtractor.ExtractTextAsync(pdfBytes, cancellationToken).ConfigureAwait(false); - var pageImages = await imageRenderer.RenderImagesAsync(pdfBytes, cancellationToken).ConfigureAwait(false); + var extraction = await TryBuildExtractionFromDocumentIntelligenceAsync(pdfBytes, streamInfo, cancellationToken).ConfigureAwait(false) + ?? await BuildExtractionFromPdfPigAsync(pdfBytes, streamInfo, cancellationToken).ConfigureAwait(false); - var segments = BuildSegmentsFromExtractedText(pages, pageImages, streamInfo.FileName); - var markdown = SegmentMarkdownComposer.Compose(segments, segmentOptions); + await conversionPipeline.ExecuteAsync(streamInfo, extraction.Artifacts, extraction.Segments, cancellationToken).ConfigureAwait(false); - var rawTextBuilder = new StringBuilder(); - foreach (var page in pages) + var markdown = SegmentMarkdownComposer.Compose(extraction.Segments, segmentOptions); + var title = ExtractTitle(extraction.RawText); + if (string.IsNullOrWhiteSpace(title)) { - if (!string.IsNullOrWhiteSpace(page.Text)) - { - if (rawTextBuilder.Length > 0) - { - rawTextBuilder.AppendLine(); - } - - rawTextBuilder.AppendLine(page.Text.Trim()); - } + title = ExtractTitle(markdown); } - var title = ExtractTitle(rawTextBuilder.ToString()); - - return new DocumentConverterResult(markdown, title, segments); + return new DocumentConverterResult(markdown, title, extraction.Segments, extraction.Artifacts); } catch (Exception ex) when (ex is not MarkItDownException) { @@ -171,7 +174,7 @@ private static async Task ReadAllBytesAsync(Stream stream, CancellationT return memory.ToArray(); } - private async Task?> TryBuildSegmentsFromDocumentIntelligenceAsync(byte[] pdfBytes, StreamInfo streamInfo, CancellationToken cancellationToken) + private async Task TryBuildExtractionFromDocumentIntelligenceAsync(byte[] pdfBytes, StreamInfo streamInfo, CancellationToken cancellationToken) { if (documentIntelligenceProvider is null) { @@ -187,7 +190,7 @@ private static async Task ReadAllBytesAsync(Stream stream, CancellationT return null; } - return await BuildSegmentsFromDocumentIntelligenceAsync(analysis, streamInfo, cancellationToken).ConfigureAwait(false); + return await BuildExtractionFromDocumentIntelligenceAsync(analysis, pdfBytes, streamInfo, cancellationToken).ConfigureAwait(false); } catch { @@ -195,71 +198,95 @@ private static async Task ReadAllBytesAsync(Stream stream, CancellationT } } - private async Task> BuildSegmentsFromDocumentIntelligenceAsync(DocumentIntelligenceResult analysis, StreamInfo streamInfo, CancellationToken cancellationToken) + private async Task BuildExtractionFromDocumentIntelligenceAsync(DocumentIntelligenceResult analysis, byte[] pdfBytes, StreamInfo streamInfo, CancellationToken cancellationToken) { var segments = new List(); - var tableMarkdown = new string[analysis.Tables.Count]; - - for (var i = 0; i < analysis.Tables.Count; i++) - { - tableMarkdown[i] = ConvertTableToMarkdown(analysis.Tables[i]); - } - - foreach (var page in analysis.Pages.OrderBy(p => p.PageNumber)) + var artifacts = new ConversionArtifacts(); + var rawTextBuilder = new StringBuilder(); + var processedTables = new HashSet(); + var pagesWithInlineImages = new HashSet(); + + var orderedPages = analysis.Pages + .OrderBy(p => p.PageNumber) + .ToList(); + var distinctPageNumbers = orderedPages + .Select(p => p.PageNumber) + .Distinct() + .ToList(); + + foreach (var page in orderedPages) { if (!string.IsNullOrWhiteSpace(page.Text)) { + var text = page.Text.Trim(); var metadata = new Dictionary(page.Metadata) { [MetadataKeys.Page] = page.PageNumber.ToString(CultureInfo.InvariantCulture) }; - segments.Add(new DocumentSegment( - markdown: page.Text.Trim(), + var segment = new DocumentSegment( + markdown: text, type: SegmentType.Page, number: page.PageNumber, label: $"Page {page.PageNumber}", source: streamInfo.FileName, - additionalMetadata: metadata)); + additionalMetadata: metadata); + + segments.Add(segment); + artifacts.TextBlocks.Add(new TextArtifact(text, page.PageNumber, streamInfo.FileName, segment.Label)); + + if (rawTextBuilder.Length > 0) + { + rawTextBuilder.AppendLine(); + } + + rawTextBuilder.AppendLine(text); } foreach (var tableIndex in page.TableIndices.Distinct()) { - if (tableIndex >= 0 && tableIndex < tableMarkdown.Length && !string.IsNullOrWhiteSpace(tableMarkdown[tableIndex])) + if (tableIndex < 0 || tableIndex >= analysis.Tables.Count) { - var metadata = new Dictionary(analysis.Tables[tableIndex].Metadata) - { - [MetadataKeys.TableIndex] = (tableIndex + 1).ToString(CultureInfo.InvariantCulture), - [MetadataKeys.Page] = page.PageNumber.ToString(CultureInfo.InvariantCulture) - }; - - segments.Add(new DocumentSegment( - markdown: tableMarkdown[tableIndex], - type: SegmentType.Table, - number: tableIndex + 1, - label: $"Table {tableIndex + 1} (Page {page.PageNumber})", - source: streamInfo.FileName, - additionalMetadata: metadata)); + continue; } - } - } - if (analysis.Images.Count > 0) - { - foreach (var image in analysis.Images) - { - cancellationToken.ThrowIfCancellationRequested(); + var table = analysis.Tables[tableIndex]; + var tableMarkdown = ConvertTableToMarkdown(table); + if (string.IsNullOrWhiteSpace(tableMarkdown)) + { + continue; + } - var caption = image.Caption; - if (caption is null && imageUnderstandingProvider is not null) + var tableMetadata = new Dictionary(table.Metadata) { - using var imageStream = new MemoryStream(image.Content, writable: false); - var result = await imageUnderstandingProvider.AnalyzeAsync(imageStream, streamInfo, cancellationToken).ConfigureAwait(false); - caption = result?.Caption ?? result?.Text; + [MetadataKeys.TableIndex] = (tableIndex + 1).ToString(CultureInfo.InvariantCulture), + [MetadataKeys.Page] = page.PageNumber.ToString(CultureInfo.InvariantCulture) + }; + + segments.Add(new DocumentSegment( + markdown: tableMarkdown, + type: SegmentType.Table, + number: tableIndex + 1, + label: $"Table {tableIndex + 1} (Page {page.PageNumber})", + source: streamInfo.FileName, + additionalMetadata: tableMetadata)); + + if (processedTables.Add(tableIndex)) + { + var rows = table.Rows.Select(static row => (IList)row.ToList()).ToList(); + artifacts.Tables.Add(new TableArtifact(rows, page.PageNumber, streamInfo.FileName, $"Table {tableIndex + 1}")); } + } + foreach (var image in analysis.Images.Where(img => img.PageNumber == page.PageNumber)) + { + cancellationToken.ThrowIfCancellationRequested(); + + var artifact = await CreateImageArtifactAsync(image, streamInfo, cancellationToken).ConfigureAwait(false); var base64 = Convert.ToBase64String(image.Content); - var md = $"![{caption ?? "Document image"}](data:{image.ContentType};base64,{base64})"; + var caption = artifact.Metadata.TryGetValue(MetadataKeys.Caption, out var storedCaption) ? storedCaption : artifact.Label; + var markdown = $"![{caption ?? "Document image"}](data:{image.ContentType};base64,{base64})"; + artifact.PlaceholderMarkdown = markdown; var metadata = new Dictionary(image.Metadata) { @@ -268,28 +295,41 @@ private async Task> BuildSegmentsFromDocumentInte if (!string.IsNullOrWhiteSpace(caption)) { - metadata[MetadataKeys.Caption] = caption; + metadata[MetadataKeys.Caption] = caption!; } - segments.Add(new DocumentSegment( - markdown: md, + var imageSegment = new DocumentSegment( + markdown: markdown, type: SegmentType.Image, number: image.PageNumber, label: caption ?? $"Image on page {image.PageNumber}", source: streamInfo.FileName, - additionalMetadata: metadata)); + additionalMetadata: metadata); + + artifact.SegmentIndex = segments.Count; + segments.Add(imageSegment); + artifacts.Images.Add(artifact); + pagesWithInlineImages.Add(page.PageNumber); } } - return segments; + await AppendMissingPageSnapshotsAsync( + distinctPageNumbers, + pagesWithInlineImages, + pdfBytes, + streamInfo, + segments, + artifacts, + cancellationToken).ConfigureAwait(false); + + return new PdfExtractionResult(segments, artifacts, rawTextBuilder.ToString().Trim()); } - private static IReadOnlyList BuildSegmentsFromExtractedText( - IReadOnlyList pages, - IReadOnlyList pageImages, - string? source) + private PdfExtractionResult BuildExtractionFromExtractedText(IReadOnlyList pages, IReadOnlyList pageImages, StreamInfo streamInfo) { var segments = new List(); + var artifacts = new ConversionArtifacts(); + var rawTextBuilder = new StringBuilder(); foreach (var page in pages) { @@ -301,16 +341,26 @@ private static IReadOnlyList BuildSegmentsFromExtractedText( var metadata = new Dictionary { - ["page"] = page.PageNumber.ToString(CultureInfo.InvariantCulture) + [MetadataKeys.Page] = page.PageNumber.ToString(CultureInfo.InvariantCulture) }; - segments.Add(new DocumentSegment( + var segment = new DocumentSegment( markdown: markdown, type: SegmentType.Page, number: page.PageNumber, label: $"Page {page.PageNumber}", - source: source, - additionalMetadata: metadata)); + source: streamInfo.FileName, + additionalMetadata: metadata); + + segments.Add(segment); + artifacts.TextBlocks.Add(new TextArtifact(markdown, page.PageNumber, streamInfo.FileName, segment.Label)); + + if (rawTextBuilder.Length > 0) + { + rawTextBuilder.AppendLine(); + } + + rawTextBuilder.AppendLine(markdown); } if (pageImages.Count > 0) @@ -319,27 +369,188 @@ private static IReadOnlyList BuildSegmentsFromExtractedText( markdown: "## Page Images", type: SegmentType.Section, label: "Page Images", - source: source)); + source: streamInfo.FileName)); for (var i = 0; i < pageImages.Count; i++) { var markdown = $"![PDF page {i + 1}](data:image/png;base64,{pageImages[i]})"; var metadata = new Dictionary { - ["page"] = (i + 1).ToString(CultureInfo.InvariantCulture) + [MetadataKeys.Page] = (i + 1).ToString(CultureInfo.InvariantCulture) }; + var segmentIndex = segments.Count; segments.Add(new DocumentSegment( markdown: markdown, type: SegmentType.Image, number: i + 1, label: $"Page {i + 1} Image", - source: source, + source: streamInfo.FileName, additionalMetadata: metadata)); + + var imageBytes = Convert.FromBase64String(pageImages[i]); + var artifact = new ImageArtifact(imageBytes, "image/png", i + 1, streamInfo.FileName, $"Page {i + 1} Image") + { + SegmentIndex = segmentIndex, + PlaceholderMarkdown = markdown + }; + artifact.Metadata[MetadataKeys.Page] = (i + 1).ToString(CultureInfo.InvariantCulture); + artifacts.Images.Add(artifact); + } + } + + return new PdfExtractionResult(segments, artifacts, rawTextBuilder.ToString().Trim()); + } + + private async Task BuildExtractionFromPdfPigAsync(byte[] pdfBytes, StreamInfo streamInfo, CancellationToken cancellationToken) + { + var pages = await textExtractor.ExtractTextAsync(pdfBytes, cancellationToken).ConfigureAwait(false); + var pageImages = await imageRenderer.RenderImagesAsync(pdfBytes, cancellationToken).ConfigureAwait(false); + return BuildExtractionFromExtractedText(pages, pageImages, streamInfo); + } + + private async Task AppendMissingPageSnapshotsAsync( + IReadOnlyList pageNumbers, + HashSet pagesWithInlineImages, + byte[] pdfBytes, + StreamInfo streamInfo, + List segments, + ConversionArtifacts artifacts, + CancellationToken cancellationToken) + { + if (pageNumbers.Count == 0) + { + return; + } + + var missingPages = pageNumbers + .Where(page => !pagesWithInlineImages.Contains(page)) + .Distinct() + .OrderBy(page => page) + .ToList(); + + if (missingPages.Count == 0) + { + return; + } + + var renderedPages = await imageRenderer.RenderImagesAsync(pdfBytes, cancellationToken).ConfigureAwait(false); + if (renderedPages.Count == 0) + { + return; + } + + var sectionAdded = false; + + foreach (var page in missingPages) + { + if (page <= 0 || page > renderedPages.Count) + { + continue; + } + + var base64 = renderedPages[page - 1]; + if (string.IsNullOrWhiteSpace(base64)) + { + continue; + } + + byte[] imageBytes; + try + { + imageBytes = Convert.FromBase64String(base64); + } + catch (FormatException) + { + continue; + } + + if (!sectionAdded) + { + segments.Add(new DocumentSegment( + markdown: "## Page Snapshots", + type: SegmentType.Section, + label: "Page Snapshots", + source: streamInfo.FileName)); + sectionAdded = true; + } + + var metadata = new Dictionary + { + [MetadataKeys.Page] = page.ToString(CultureInfo.InvariantCulture), + ["snapshot"] = "true", + }; + + var segmentIndex = segments.Count; + var placeholder = $"![PDF page {page}](data:image/png;base64,{base64})"; + segments.Add(new DocumentSegment( + markdown: placeholder, + type: SegmentType.Image, + number: page, + label: $"Page {page} Snapshot", + source: streamInfo.FileName, + additionalMetadata: metadata)); + + var artifact = new ImageArtifact(imageBytes, "image/png", page, streamInfo.FileName, $"Page {page} Snapshot") + { + SegmentIndex = segmentIndex, + PlaceholderMarkdown = placeholder + }; + artifact.Metadata[MetadataKeys.Page] = page.ToString(CultureInfo.InvariantCulture); + artifact.Metadata["snapshot"] = "true"; + artifacts.Images.Add(artifact); + } + } + + private async Task CreateImageArtifactAsync(DocumentImageResult image, StreamInfo streamInfo, CancellationToken cancellationToken) + { + var artifact = new ImageArtifact(image.Content, image.ContentType, image.PageNumber, streamInfo.FileName) + { + Label = image.Caption ?? $"Image on page {image.PageNumber}" + }; + + artifact.Metadata[MetadataKeys.Page] = image.PageNumber.ToString(CultureInfo.InvariantCulture); + + if (!string.IsNullOrWhiteSpace(image.Caption)) + { + artifact.Metadata[MetadataKeys.Caption] = image.Caption!; + } + + foreach (var pair in image.Metadata) + { + artifact.Metadata[pair.Key] = pair.Value; + } + + if (image.Metadata.TryGetValue("ocrText", out var ocr) && !string.IsNullOrWhiteSpace(ocr)) + { + artifact.RawText = ocr; + } + + if (imageUnderstandingProvider is not null) + { + try + { + using var analysisStream = new MemoryStream(image.Content, writable: false); + var result = await imageUnderstandingProvider.AnalyzeAsync(analysisStream, streamInfo, cancellationToken).ConfigureAwait(false); + if (!string.IsNullOrWhiteSpace(result?.Caption)) + { + artifact.Metadata[MetadataKeys.Caption] = result!.Caption!; + artifact.Label = result.Caption; + } + + if (!string.IsNullOrWhiteSpace(result?.Text)) + { + artifact.Metadata["ocrText"] = result!.Text!; + artifact.RawText = result.Text; + } + } + catch + { + // Ignore analysis failures; downstream middleware may handle enrichment. } } - return segments; + return artifact; } private static string ConvertTableToMarkdown(DocumentTableResult table) @@ -551,7 +762,7 @@ private static Task> RenderOnSupportedPlatformsAsync(byte[ }; #pragma warning disable CA1416 - foreach (var bitmap in Conversion.ToImages(pdfBytes, password: null, options)) + foreach (var bitmap in ToImages(pdfBytes, password: null, options)) { cancellationToken.ThrowIfCancellationRequested(); using var bmp = bitmap; diff --git a/src/MarkItDown/Converters/PptxConverter.cs b/src/MarkItDown/Converters/PptxConverter.cs index 2f11df2de..b2252d9cd 100644 --- a/src/MarkItDown/Converters/PptxConverter.cs +++ b/src/MarkItDown/Converters/PptxConverter.cs @@ -1,10 +1,15 @@ +using System; using System.Collections.Generic; using System.Globalization; +using System.IO; using System.Linq; using System.Text; -using DocumentFormat.OpenXml.Drawing; +using System.Threading; +using System.Threading.Tasks; using DocumentFormat.OpenXml.Packaging; using DocumentFormat.OpenXml.Presentation; +using ManagedCode.MimeTypes; +using MarkItDown.Intelligence; using A = DocumentFormat.OpenXml.Drawing; namespace MarkItDown.Converters; @@ -21,32 +26,38 @@ public sealed class PptxConverter : IDocumentConverter private static readonly HashSet AcceptedMimeTypes = new(StringComparer.OrdinalIgnoreCase) { - "application/vnd.openxmlformats-officedocument.presentationml.presentation" + MimeHelper.GetMimeType(".pptx"), }; private readonly SegmentOptions segmentOptions; + private readonly IConversionPipeline conversionPipeline; + private readonly IImageUnderstandingProvider? imageUnderstandingProvider; - public PptxConverter(SegmentOptions? segmentOptions = null) + public PptxConverter(SegmentOptions? segmentOptions = null, IConversionPipeline? pipeline = null, IImageUnderstandingProvider? imageProvider = null) { this.segmentOptions = segmentOptions ?? SegmentOptions.Default; + conversionPipeline = pipeline ?? ConversionPipeline.Empty; + imageUnderstandingProvider = imageProvider; } public int Priority => 230; // Between XLSX and plain text public bool AcceptsInput(StreamInfo streamInfo) { - var mimeType = streamInfo.MimeType?.ToLowerInvariant() ?? string.Empty; + var normalizedMime = MimeTypeUtilities.NormalizeMime(streamInfo); var extension = streamInfo.Extension?.ToLowerInvariant(); - // Check the extension if (extension is not null && AcceptedExtensions.Contains(extension)) + { return true; + } - // Check the mimetype - if (AcceptedMimeTypes.Contains(mimeType)) + if (normalizedMime is not null && AcceptedMimeTypes.Contains(normalizedMime)) + { return true; + } - return false; + return streamInfo.MimeType is not null && AcceptedMimeTypes.Contains(streamInfo.MimeType); } public bool Accepts(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken = default) @@ -61,15 +72,15 @@ public bool Accepts(Stream stream, StreamInfo streamInfo, CancellationToken canc try { stream.Position = 0; - var buffer = new byte[4]; - var bytesRead = stream.Read(buffer, 0, 4); + Span buffer = stackalloc byte[4]; + var bytesRead = stream.Read(buffer); stream.Position = originalPosition; if (bytesRead == 4) { // Check for ZIP file signature (PPTX files are ZIP archives) - return buffer[0] == 0x50 && buffer[1] == 0x4B && - (buffer[2] == 0x03 || buffer[2] == 0x05 || buffer[2] == 0x07) && + return buffer[0] == 0x50 && buffer[1] == 0x4B && + (buffer[2] == 0x03 || buffer[2] == 0x05 || buffer[2] == 0x07) && (buffer[3] == 0x04 || buffer[3] == 0x06 || buffer[3] == 0x08); } } @@ -82,185 +93,345 @@ public bool Accepts(Stream stream, StreamInfo streamInfo, CancellationToken canc return true; } - public Task ConvertAsync(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken = default) + public async Task ConvertAsync(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken = default) { try { - // Reset stream position if (stream.CanSeek) + { stream.Position = 0; + } - var segments = ExtractSegmentsFromPptx(stream, streamInfo.FileName, cancellationToken); - var markdown = SegmentMarkdownComposer.Compose(segments, segmentOptions); - var title = ExtractTitle(markdown, streamInfo.FileName ?? "PowerPoint Presentation"); + var extraction = await ExtractSlidesAsync(stream, streamInfo, cancellationToken).ConfigureAwait(false); + await conversionPipeline.ExecuteAsync(streamInfo, extraction.Artifacts, extraction.Segments, cancellationToken).ConfigureAwait(false); - return Task.FromResult(new DocumentConverterResult(markdown, title, segments)); + var markdown = SegmentMarkdownComposer.Compose(extraction.Segments, segmentOptions); + var title = ExtractTitle(extraction.RawText, streamInfo.FileName, markdown); + + return new DocumentConverterResult(markdown, title, extraction.Segments, extraction.Artifacts); } - catch (Exception ex) when (!(ex is MarkItDownException)) + catch (Exception ex) when (ex is not MarkItDownException) { throw new FileConversionException($"Failed to convert PPTX file: {ex.Message}", ex); } } - private static IReadOnlyList ExtractSegmentsFromPptx(Stream stream, string? fileName, CancellationToken cancellationToken) + private sealed class PptxExtractionResult { - var segments = new List(); + public PptxExtractionResult(List segments, ConversionArtifacts artifacts, string rawText) + { + Segments = segments; + Artifacts = artifacts; + RawText = rawText; + } + + public List Segments { get; } + + public ConversionArtifacts Artifacts { get; } + + public string RawText { get; } + } + + private sealed record SlideExtractionResult(string Markdown, string Text, IReadOnlyList Images); + private async Task ExtractSlidesAsync(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken) + { using var presentationDocument = PresentationDocument.Open(stream, false); var presentationPart = presentationDocument.PresentationPart; + if (presentationPart?.Presentation?.SlideIdList is null) + { + return new PptxExtractionResult(new List(), new ConversionArtifacts(), string.Empty); + } - if (presentationPart?.Presentation?.SlideIdList != null) + var segments = new List(); + var artifacts = new ConversionArtifacts(); + var rawTextBuilder = new StringBuilder(); + var slideIndex = 0; + + foreach (var slideId in presentationPart.Presentation.SlideIdList.Elements()) { - var slideIndex = 0; + cancellationToken.ThrowIfCancellationRequested(); + slideIndex++; - foreach (var slideId in presentationPart.Presentation.SlideIdList.Elements()) + if (presentationPart.GetPartById(slideId.RelationshipId!) is not SlidePart slidePart) { - cancellationToken.ThrowIfCancellationRequested(); + continue; + } + + var slideResult = await ConvertSlideToMarkdownAsync(slidePart, slideIndex, streamInfo, cancellationToken).ConfigureAwait(false); + if (string.IsNullOrWhiteSpace(slideResult.Markdown) && slideResult.Images.Count == 0) + { + continue; + } + + var metadata = new Dictionary + { + [MetadataKeys.Slide] = slideIndex.ToString(CultureInfo.InvariantCulture) + }; + + var segment = new DocumentSegment( + markdown: slideResult.Markdown, + type: SegmentType.Slide, + number: slideIndex, + label: $"Slide {slideIndex}", + source: streamInfo.FileName, + additionalMetadata: metadata); + + var segmentIndex = segments.Count; + segments.Add(segment); - slideIndex++; - var slidePart = (SlidePart)presentationPart.GetPartById(slideId.RelationshipId!); - var markdown = ConvertSlideToMarkdown(slidePart, slideIndex); + var textContent = string.IsNullOrWhiteSpace(slideResult.Text) ? slideResult.Markdown : slideResult.Text; + artifacts.TextBlocks.Add(new TextArtifact(textContent, slideIndex, streamInfo.FileName, segment.Label)); - if (string.IsNullOrWhiteSpace(markdown)) + if (!string.IsNullOrWhiteSpace(textContent)) + { + if (rawTextBuilder.Length > 0) { - continue; + rawTextBuilder.AppendLine(); } - var metadata = new Dictionary + rawTextBuilder.AppendLine(textContent); + } + + foreach (var image in slideResult.Images) + { + image.SegmentIndex = segmentIndex; + artifacts.Images.Add(image); + } + } + + return new PptxExtractionResult(segments, artifacts, rawTextBuilder.ToString().Trim()); + } + + private async Task ConvertSlideToMarkdownAsync(SlidePart slidePart, int slideNumber, StreamInfo streamInfo, CancellationToken cancellationToken) + { + var markdown = new StringBuilder(); + var text = new StringBuilder(); + var images = new List(); + var slide = slidePart.Slide; + var slideImageIndex = 0; + + markdown.AppendLine($"## Slide {slideNumber}"); + markdown.AppendLine(); + text.AppendLine($"Slide {slideNumber}"); + + if (slide.CommonSlideData?.ShapeTree is not null) + { + foreach (var element in slide.CommonSlideData.ShapeTree.Elements()) + { + cancellationToken.ThrowIfCancellationRequested(); + + switch (element) { - ["slide"] = slideIndex.ToString(CultureInfo.InvariantCulture) - }; - - segments.Add(new DocumentSegment( - markdown: markdown.TrimEnd(), - type: SegmentType.Slide, - number: slideIndex, - label: $"Slide {slideIndex}", - source: fileName, - additionalMetadata: metadata)); + case DocumentFormat.OpenXml.Presentation.Shape textShape: + AppendTextShape(textShape, markdown, text); + break; + case DocumentFormat.OpenXml.Presentation.Picture picture: + { + var artifact = await ExtractImageAsync(picture, slidePart, slideNumber, streamInfo, cancellationToken).ConfigureAwait(false); + if (artifact is null) + { + break; + } + + slideImageIndex++; + var label = artifact.Label ?? $"Slide {slideNumber} Image {slideImageIndex}"; + artifact.Label = label; + + var mimeType = string.IsNullOrWhiteSpace(artifact.ContentType) + ? MimeHelper.GetMimeType(".png") + : artifact.ContentType; + + var base64 = Convert.ToBase64String(artifact.Data); + var placeholder = $"![{label}](data:{mimeType};base64,{base64})"; + artifact.PlaceholderMarkdown = placeholder; + markdown.AppendLine(placeholder); + markdown.AppendLine(); + + text.AppendLine(label); + images.Add(artifact); + break; + } + } } } - return segments; + var markdownResult = markdown.ToString().TrimEnd(); + var textResult = text.ToString().Trim(); + return new SlideExtractionResult(markdownResult, textResult, images); } - private static string ConvertSlideToMarkdown(SlidePart slidePart, int slideNumber) + private async Task ExtractImageAsync(DocumentFormat.OpenXml.Presentation.Picture picture, SlidePart slidePart, int slideNumber, StreamInfo streamInfo, CancellationToken cancellationToken) { - var result = new StringBuilder(); - result.AppendLine($"## Slide {slideNumber}"); - result.AppendLine(); + var blip = picture.BlipFill?.Blip; + var relationshipId = blip?.Embed?.Value ?? blip?.Link?.Value; + if (string.IsNullOrWhiteSpace(relationshipId)) + { + return null; + } - var slide = slidePart.Slide; - if (slide.CommonSlideData?.ShapeTree != null) + if (slidePart.GetPartById(relationshipId) is not ImagePart imagePart) { - foreach (var shape in slide.CommonSlideData.ShapeTree.Elements()) + return null; + } + + await using var partStream = imagePart.GetStream(); + using var memory = new MemoryStream(); + await partStream.CopyToAsync(memory, cancellationToken).ConfigureAwait(false); + var data = memory.ToArray(); + + var label = picture.NonVisualPictureProperties?.NonVisualDrawingProperties?.Name; + var artifact = new ImageArtifact(data, imagePart.ContentType, slideNumber, streamInfo.FileName, label); + artifact.Metadata[MetadataKeys.Slide] = slideNumber.ToString(CultureInfo.InvariantCulture); + + if (imageUnderstandingProvider is not null) + { + try { - if (shape is DocumentFormat.OpenXml.Presentation.Shape textShape) + using var analysisStream = new MemoryStream(data, writable: false); + var result = await imageUnderstandingProvider.AnalyzeAsync(analysisStream, streamInfo, cancellationToken).ConfigureAwait(false); + if (!string.IsNullOrWhiteSpace(result?.Caption)) { - AppendTextShape(textShape, result); + artifact.Metadata[MetadataKeys.Caption] = result!.Caption!; + artifact.Label ??= result.Caption; + } + + if (!string.IsNullOrWhiteSpace(result?.Text)) + { + artifact.Metadata["ocrText"] = result!.Text!; + artifact.RawText = result.Text; } } + catch + { + // Ignore analysis failures; downstream middleware may handle enrichment. + } } - return result.ToString().TrimEnd(); + return artifact; } - private static void AppendTextShape(DocumentFormat.OpenXml.Presentation.Shape shape, StringBuilder result) + private static void AppendTextShape(DocumentFormat.OpenXml.Presentation.Shape shape, StringBuilder markdown, StringBuilder text) { var textBody = shape.TextBody; - if (textBody == null) + if (textBody is null) + { return; + } + + var placeholderShape = shape.NonVisualShapeProperties?.ApplicationNonVisualDrawingProperties?.PlaceholderShape; + var placeholderType = placeholderShape?.Type?.Value; + var isTitle = placeholderType == PlaceholderValues.Title || + placeholderType == PlaceholderValues.CenteredTitle || + placeholderType == PlaceholderValues.SubTitle; foreach (var paragraph in textBody.Elements()) { var paragraphText = new StringBuilder(); - var isTitle = false; - - // Check if this is a title based on placeholder type - var placeholderShape = shape.NonVisualShapeProperties?.ApplicationNonVisualDrawingProperties?.PlaceholderShape; - if (placeholderShape?.Type?.Value == PlaceholderValues.Title || - placeholderShape?.Type?.Value == PlaceholderValues.CenteredTitle || - placeholderShape?.Type?.Value == PlaceholderValues.SubTitle) - { - isTitle = true; - } - // Process runs in the paragraph foreach (var run in paragraph.Elements()) { var runProperties = run.RunProperties; - var text = run.Text?.Text ?? ""; - - if (!string.IsNullOrEmpty(text)) + var runText = run.Text?.Text ?? string.Empty; + if (string.IsNullOrEmpty(runText)) + { + continue; + } + + if (runProperties?.Bold?.Value == true) + { + runText = $"**{runText}**"; + } + + if (runProperties?.Italic?.Value == true) { - // Apply formatting based on run properties - if (runProperties?.Bold?.Value == true) - text = $"**{text}**"; - if (runProperties?.Italic?.Value == true) - text = $"*{text}*"; - - paragraphText.Append(text); + runText = $"*{runText}*"; } + + paragraphText.Append(runText); } - // Process text without runs (direct text) - foreach (var text in paragraph.Elements()) + foreach (var textElement in paragraph.Elements()) { - paragraphText.Append(text.Text); + if (!string.IsNullOrWhiteSpace(textElement.Text)) + { + paragraphText.Append(textElement.Text); + } } var finalText = paragraphText.ToString().Trim(); - if (!string.IsNullOrWhiteSpace(finalText)) + if (string.IsNullOrWhiteSpace(finalText)) { - if (isTitle) - { - result.AppendLine($"### {finalText}"); - } - else - { - // For now, just output as regular text - // Bullet point detection in PowerPoint is complex and varies by version - result.AppendLine(finalText); - } - result.AppendLine(); + continue; + } + + if (isTitle) + { + markdown.AppendLine($"### {finalText}"); } + else + { + markdown.AppendLine(finalText); + } + + markdown.AppendLine(); + text.AppendLine(finalText); } } - private static string ExtractTitle(string markdown, string fileName) + private static string ExtractTitle(string? rawText, string? fileName, string markdown) { - if (!string.IsNullOrWhiteSpace(markdown)) + var fromRaw = ExtractTitleCore(rawText); + if (!string.IsNullOrWhiteSpace(fromRaw)) + { + return fromRaw!; + } + + var fromMarkdown = ExtractTitleCore(markdown); + if (!string.IsNullOrWhiteSpace(fromMarkdown)) + { + return fromMarkdown!; + } + + if (!string.IsNullOrWhiteSpace(fileName)) { - var lines = markdown.Split('\n', StringSplitOptions.RemoveEmptyEntries); - - // Look for the first heading (### from slide title) - foreach (var line in lines.Take(10)) + var nameWithoutExtension = Path.GetFileNameWithoutExtension(fileName); + if (!string.IsNullOrWhiteSpace(nameWithoutExtension)) { - var trimmedLine = line.Trim(); - if (trimmedLine.StartsWith("###")) - { - return trimmedLine.TrimStart('#').Trim(); - } + return nameWithoutExtension; } + } + + return "PowerPoint Presentation"; + } + + private static string? ExtractTitleCore(string? text) + { + if (string.IsNullOrWhiteSpace(text)) + { + return null; + } + + var lines = text.Split('\n', StringSplitOptions.RemoveEmptyEntries); - // If no heading found, use the first substantial line - foreach (var line in lines.Take(5)) + foreach (var line in lines.Take(10)) + { + var trimmedLine = line.Trim(); + if (trimmedLine.StartsWith("###", StringComparison.Ordinal)) { - var trimmedLine = line.Trim(); - if (!trimmedLine.StartsWith("##") && trimmedLine.Length > 5 && trimmedLine.Length < 100) - { - return trimmedLine; - } + return trimmedLine.TrimStart('#').Trim(); } } - // Fallback to filename - if (!string.IsNullOrWhiteSpace(fileName)) + foreach (var line in lines.Take(5)) { - var nameWithoutExtension = System.IO.Path.GetFileNameWithoutExtension(fileName); - return string.IsNullOrWhiteSpace(nameWithoutExtension) ? "PowerPoint Presentation" : nameWithoutExtension; + var trimmedLine = line.Trim(); + if (!trimmedLine.StartsWith("##", StringComparison.Ordinal) && trimmedLine.Length > 5 && trimmedLine.Length < 100) + { + return trimmedLine; + } } - return "PowerPoint Presentation"; + return null; } } diff --git a/src/MarkItDown/DocumentConverterResult.cs b/src/MarkItDown/DocumentConverterResult.cs index 64d00624b..cf18872ce 100644 --- a/src/MarkItDown/DocumentConverterResult.cs +++ b/src/MarkItDown/DocumentConverterResult.cs @@ -19,13 +19,15 @@ public sealed class DocumentConverterResult /// The converted Markdown text. /// Optional title of the document. /// Optional collection of segments that represent structured slices of the output. - public DocumentConverterResult(string markdown, string? title = null, IReadOnlyList? segments = null) + /// Optional raw extraction artifacts available prior to Markdown composition. + public DocumentConverterResult(string markdown, string? title = null, IReadOnlyList? segments = null, ConversionArtifacts? artifacts = null) { Markdown = markdown ?? throw new ArgumentNullException(nameof(markdown)); Title = title; Segments = segments is null ? EmptySegments : new ReadOnlyCollection(segments.ToArray()); + Artifacts = artifacts ?? ConversionArtifacts.Empty; } /// @@ -43,6 +45,11 @@ public DocumentConverterResult(string markdown, string? title = null, IReadOnlyL /// public IReadOnlyList Segments { get; } + /// + /// Raw extraction artifacts captured during conversion. + /// + public ConversionArtifacts Artifacts { get; } + /// /// Soft-deprecated alias for Markdown. New code should migrate to using Markdown property. /// diff --git a/src/MarkItDown/MarkItDown.cs b/src/MarkItDown/MarkItDown.cs index 65e52a3dc..c7e5029c0 100644 --- a/src/MarkItDown/MarkItDown.cs +++ b/src/MarkItDown/MarkItDown.cs @@ -4,6 +4,7 @@ using Azure.Core; using Azure.Identity; using Microsoft.Extensions.Logging; +using MarkItDown.Conversion.Middleware; using MarkItDown.Converters; using MarkItDown.Intelligence; using MarkItDown.Intelligence.Providers.Aws; @@ -24,6 +25,7 @@ public sealed class MarkItDown private readonly HttpClient? _httpClient; private readonly MarkItDownOptions _options; private readonly IntelligenceProviderHub _intelligenceProviders; + private readonly IConversionPipeline _conversionPipeline; /// /// Initialize a new instance of MarkItDown. @@ -48,6 +50,7 @@ public MarkItDown(MarkItDownOptions? options, ILogger? logger = null, HttpClient _httpClient = httpClient; _converters = []; _intelligenceProviders = InitializeIntelligenceProviders(); + _conversionPipeline = BuildConversionPipeline(); if (_options.EnableBuiltins) { @@ -249,6 +252,28 @@ private void RegisterBuiltInConverters() } } + private IConversionPipeline BuildConversionPipeline() + { + var middleware = new List(); + + if (_options.EnableAiImageEnrichment) + { + middleware.Add(new AiImageEnrichmentMiddleware()); + } + + if (_options.ConversionMiddleware is { Count: > 0 }) + { + middleware.AddRange(_options.ConversionMiddleware); + } + + if (middleware.Count == 0) + { + return ConversionPipeline.Empty; + } + + return new ConversionPipeline(middleware, _intelligenceProviders.AiModels ?? NullAiModelProvider.Instance, _logger); + } + private IntelligenceProviderHub InitializeIntelligenceProviders() { IDocumentIntelligenceProvider? documentProvider = _options.DocumentIntelligenceProvider; @@ -375,10 +400,10 @@ private IEnumerable CreateBuiltInConverters() new EmlConverter(), new XmlConverter(), new ZipConverter(CreateZipInnerConverters(CreateImageConverter, CreateAudioConverter)), - new PdfConverter(_options.Segments, _intelligenceProviders.Document, _intelligenceProviders.Image), - new DocxConverter(_options.Segments), + new PdfConverter(_options.Segments, _intelligenceProviders.Document, _intelligenceProviders.Image, _conversionPipeline), + new DocxConverter(_options.Segments, _conversionPipeline, _intelligenceProviders.Image), new XlsxConverter(_options.Segments), - new PptxConverter(_options.Segments), + new PptxConverter(_options.Segments, _conversionPipeline, _intelligenceProviders.Image), CreateAudioConverter(), CreateImageConverter(), new PlainTextConverter(), @@ -401,10 +426,10 @@ private IEnumerable CreateZipInnerConverters(Func public SegmentOptions Segments { get; init; } = SegmentOptions.Default; + + /// + /// Custom middleware invoked after extraction but before Markdown composition. + /// + public IReadOnlyList ConversionMiddleware { get; init; } = Array.Empty(); + + /// + /// Gets or sets a value indicating whether AI-based image enrichment should be enabled when a chat client is present. + /// + public bool EnableAiImageEnrichment { get; init; } = true; } diff --git a/src/MarkItDown/MetadataKeys.cs b/src/MarkItDown/MetadataKeys.cs index e4af864ff..5299d7b63 100644 --- a/src/MarkItDown/MetadataKeys.cs +++ b/src/MarkItDown/MetadataKeys.cs @@ -15,6 +15,7 @@ public static class MetadataKeys public const string TableIndex = "tableIndex"; public const string Provider = "provider"; public const string ModelVersion = "modelVersion"; + public const string Slide = "slide"; public const string VideoId = "videoId"; public const string Language = "language"; public const string Segment = "segment"; diff --git a/tests/MarkItDown.Tests/DocxConverterTests.cs b/tests/MarkItDown.Tests/DocxConverterTests.cs new file mode 100644 index 000000000..646dfa66b --- /dev/null +++ b/tests/MarkItDown.Tests/DocxConverterTests.cs @@ -0,0 +1,48 @@ +using System; +using System.Threading.Tasks; +using ManagedCode.MimeTypes; +using MarkItDown; +using MarkItDown.Converters; +using MarkItDown.Tests.Fixtures; +using Shouldly; + +namespace MarkItDown.Tests; + +public class DocxConverterTests +{ + [Fact] + public async Task ConvertAsync_DocxWithImages_ExecutesPipelineAndCapturesArtifacts() + { + // Arrange + var pipeline = new RecordingPipeline("DOCX ENRICHED"); + var converter = new DocxConverter(pipeline: pipeline); + + await using var stream = DocxInlineImageFactory.Create(); + var streamInfo = new StreamInfo( + mimeType: MimeHelper.GetMimeType(".docx"), + extension: ".docx", + fileName: "doc-inline-image.docx"); + + // Act + var result = await converter.ConvertAsync(stream, streamInfo); + + // Assert + pipeline.Executed.ShouldBeTrue(); + result.Artifacts.Images.ShouldNotBeEmpty(); + result.Artifacts.TextBlocks.ShouldNotBeEmpty(); + + var image = result.Artifacts.Images[0]; + image.SegmentIndex.ShouldNotBeNull(); + image.DetailedDescription.ShouldBe("DOCX ENRICHED"); + image.Metadata.ShouldContainKey(MetadataKeys.Page); + image.PlaceholderMarkdown.ShouldNotBeNull(); + + var segment = result.Segments[image.SegmentIndex!.Value]; + segment.Markdown.ShouldContain("DOCX ENRICHED"); + segment.Type.ShouldBe(SegmentType.Page); + var placeholderIndex = segment.Markdown.IndexOf(image.PlaceholderMarkdown!, StringComparison.Ordinal); + placeholderIndex.ShouldBeGreaterThanOrEqualTo(0); + var trailing = segment.Markdown[(placeholderIndex + image.PlaceholderMarkdown!.Length)..]; + trailing.TrimStart('\r', '\n').ShouldStartWith("DOCX ENRICHED"); + } +} diff --git a/tests/MarkItDown.Tests/Fixtures/DocxInlineImageFactory.cs b/tests/MarkItDown.Tests/Fixtures/DocxInlineImageFactory.cs new file mode 100644 index 000000000..b5f236815 --- /dev/null +++ b/tests/MarkItDown.Tests/Fixtures/DocxInlineImageFactory.cs @@ -0,0 +1,106 @@ +using System; +using System.IO; +using DocumentFormat.OpenXml; +using DocumentFormat.OpenXml.Packaging; +using DocumentFormat.OpenXml.Wordprocessing; +using A = DocumentFormat.OpenXml.Drawing; +using DW = DocumentFormat.OpenXml.Drawing.Wordprocessing; +using PIC = DocumentFormat.OpenXml.Drawing.Pictures; +using WpDrawing = DocumentFormat.OpenXml.Wordprocessing.Drawing; + +namespace MarkItDown.Tests.Fixtures; + +internal static class DocxInlineImageFactory +{ + private const string PixelBase64 = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR4nGJ8/P8/AwAI/AL+Kc3sNwAAAABJRU5ErkJggg=="; + + public static MemoryStream Create(string beforeText = "Paragraph before image.", string afterText = "Paragraph after image.") + { + var workingStream = new MemoryStream(); + + using (var document = WordprocessingDocument.Create(workingStream, WordprocessingDocumentType.Document, true)) + { + var mainPart = document.AddMainDocumentPart(); + mainPart.Document = new Document(new Body()); + var body = mainPart.Document.Body ?? throw new InvalidOperationException("DOCX body was not created."); + + body.AppendChild(new Paragraph(new Run(new Text(beforeText)))); + + AppendInlineImage(mainPart, body); + + body.AppendChild(new Paragraph(new Run(new Text(afterText)))); + + mainPart.Document.Save(); + } + + var buffer = workingStream.ToArray(); + return new MemoryStream(buffer, writable: false); + } + + private static void AppendInlineImage(MainDocumentPart mainPart, Body body) + { + var imagePart = mainPart.AddImagePart(ImagePartType.Png); + using (var imageStream = new MemoryStream(Convert.FromBase64String(PixelBase64))) + { + imagePart.FeedData(imageStream); + } + + var relationshipId = mainPart.GetIdOfPart(imagePart); + const long size = 990000L; + + var inline = new DW.Inline( + new DW.Extent { Cx = size, Cy = size }, + new DW.EffectExtent + { + LeftEdge = 0L, + TopEdge = 0L, + RightEdge = 0L, + BottomEdge = 0L, + }, + new DW.DocProperties + { + Id = 1U, + Name = "Test image", + Description = "Generated pixel", + }, + new DW.NonVisualGraphicFrameDrawingProperties( + new A.GraphicFrameLocks { NoChangeAspect = true }), + new A.Graphic( + new A.GraphicData( + new PIC.Picture( + new PIC.NonVisualPictureProperties( + new PIC.NonVisualDrawingProperties + { + Id = 0U, + Name = "pixel.png", + Description = "Generated pixel", + }, + new PIC.NonVisualPictureDrawingProperties()), + new PIC.BlipFill( + new A.Blip + { + Embed = relationshipId, + CompressionState = A.BlipCompressionValues.Print, + }, + new A.Stretch(new A.FillRectangle())), + new PIC.ShapeProperties( + new A.Transform2D( + new A.Offset { X = 0L, Y = 0L }, + new A.Extents { Cx = size, Cy = size }), + new A.PresetGeometry { Preset = A.ShapeTypeValues.Rectangle })) + ) + { + Uri = "http://schemas.openxmlformats.org/drawingml/2006/picture", + }) + ) + { + DistanceFromTop = 0U, + DistanceFromBottom = 0U, + DistanceFromLeft = 0U, + DistanceFromRight = 0U, + }; + + var run = new Run(new WpDrawing(inline)); + body.AppendChild(new Paragraph(run)); + } +} diff --git a/tests/MarkItDown.Tests/Intelligence/PdfConverterIntelligenceTests.cs b/tests/MarkItDown.Tests/Intelligence/PdfConverterIntelligenceTests.cs index 2df2350fc..9ae6fdaee 100644 --- a/tests/MarkItDown.Tests/Intelligence/PdfConverterIntelligenceTests.cs +++ b/tests/MarkItDown.Tests/Intelligence/PdfConverterIntelligenceTests.cs @@ -23,6 +23,25 @@ public StubDocumentIntelligenceProvider(DocumentIntelligenceResult? result) } } + private sealed class StubPdfTextExtractor : PdfConverter.IPdfTextExtractor + { + public Task> ExtractTextAsync(byte[] pdfBytes, CancellationToken cancellationToken) + => Task.FromResult>(Array.Empty()); + } + + private sealed class StubPdfImageRenderer : PdfConverter.IPdfImageRenderer + { + private readonly IReadOnlyList images; + + public StubPdfImageRenderer(IReadOnlyList images) + { + this.images = images; + } + + public Task> RenderImagesAsync(byte[] pdfBytes, CancellationToken cancellationToken) + => Task.FromResult(images); + } + [Fact] public async Task ConvertAsync_WithDocumentIntelligenceProvider_UsesProviderSegments() { @@ -65,4 +84,44 @@ public async Task ConvertAsync_WithDocumentIntelligenceProvider_UsesProviderSegm result.Segments.ShouldContain(s => s.Type == SegmentType.Image && s.Markdown.Contains("Chart overview")); result.Markdown.ShouldContain("Summary"); } + + [Fact] + public async Task ConvertAsync_DocumentIntelligenceWithoutImages_AddsPageSnapshotsForPipeline() + { + var page = new DocumentPageResult( + pageNumber: 1, + text: "Page one body", + tableIndices: Array.Empty()); + + var providerResult = new DocumentIntelligenceResult( + pages: new[] { page }, + tables: Array.Empty(), + images: Array.Empty()); + + var provider = new StubDocumentIntelligenceProvider(providerResult); + var pipeline = new RecordingPipeline("SNAPSHOT ENRICHED"); + var textExtractor = new StubPdfTextExtractor(); + var pageImage = Convert.ToBase64String(new byte[] { 5, 4, 3, 2 }); + var imageRenderer = new StubPdfImageRenderer(new[] { pageImage }); + var converter = new PdfConverter(textExtractor, imageRenderer, pipeline: pipeline, documentProvider: provider); + + await using var stream = new MemoryStream(new byte[] { 0x25, 0x50, 0x44, 0x46 }); + var streamInfo = new StreamInfo(mimeType: "application/pdf", extension: ".pdf", fileName: "snapshot.pdf"); + + var result = await converter.ConvertAsync(stream, streamInfo); + + pipeline.Executed.ShouldBeTrue(); + result.Artifacts.Images.ShouldNotBeEmpty(); + + var snapshot = result.Artifacts.Images[0]; + snapshot.SegmentIndex.ShouldNotBeNull(); + snapshot.DetailedDescription.ShouldBe("SNAPSHOT ENRICHED"); + snapshot.Metadata.ShouldContainKey("snapshot"); + + var segment = result.Segments[snapshot.SegmentIndex!.Value]; + segment.Type.ShouldBe(SegmentType.Image); + segment.AdditionalMetadata.ShouldContainKey("snapshot"); + segment.Markdown.ShouldContain("SNAPSHOT ENRICHED"); + result.Segments.ShouldContain(s => s.Type == SegmentType.Section && s.Label == "Page Snapshots"); + } } diff --git a/tests/MarkItDown.Tests/NewConvertersTests.cs b/tests/MarkItDown.Tests/NewConvertersTests.cs index 8aa7f9b20..a545e0a4f 100644 --- a/tests/MarkItDown.Tests/NewConvertersTests.cs +++ b/tests/MarkItDown.Tests/NewConvertersTests.cs @@ -1,5 +1,9 @@ +using System.IO; +using System.Threading.Tasks; +using ManagedCode.MimeTypes; using MarkItDown; using MarkItDown.Converters; +using MarkItDown.Tests.Fixtures; using Shouldly; namespace MarkItDown.Tests; @@ -221,4 +225,49 @@ public void EmlConverter_Priority_IsBetweenPptxAndEpub() emlConverter.Priority.ShouldBeGreaterThan(pptxConverter.Priority); emlConverter.Priority.ShouldBeLessThan(epubConverter.Priority); } + + [Fact] + public async Task DocxConverter_PipelineReceivesArtifacts() + { + var pipeline = new RecordingPipeline("ENRICHED"); + var converter = new DocxConverter(pipeline: pipeline); + + await using var stream = DocxInlineImageFactory.Create(); + var streamInfo = new StreamInfo( + mimeType: "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + extension: ".docx", + fileName: "doc-inline-image.docx"); + + var result = await converter.ConvertAsync(stream, streamInfo); + + pipeline.Executed.ShouldBeTrue(); + result.Artifacts.Images.Count.ShouldBeGreaterThan(0); + var image = result.Artifacts.Images[0]; + image.SegmentIndex.ShouldNotBeNull(); + image.DetailedDescription.ShouldBe("ENRICHED"); + result.Segments[image.SegmentIndex!.Value].Markdown.ShouldContain("ENRICHED"); + } + + [Fact] + public async Task PptxConverter_PipelineReceivesArtifacts() + { + var pipeline = new RecordingPipeline("SLIDE"); + var converter = new PptxConverter(pipeline: pipeline); + + await using var stream = TestAssetLoader.OpenAsset("test.pptx"); + var streamInfo = new StreamInfo( + mimeType: MimeHelper.GetMimeType(".pptx"), + extension: ".pptx", + fileName: "test.pptx", + localPath: TestAssetLoader.GetAssetPath("test.pptx")); + + var result = await converter.ConvertAsync(stream, streamInfo); + + pipeline.Executed.ShouldBeTrue(); + result.Artifacts.Images.Count.ShouldBeGreaterThan(0); + var image = result.Artifacts.Images[0]; + image.SegmentIndex.ShouldNotBeNull(); + image.DetailedDescription.ShouldBe("SLIDE"); + result.Segments[image.SegmentIndex!.Value].Markdown.ShouldContain("SLIDE"); + } } diff --git a/tests/MarkItDown.Tests/PdfConverterTests.cs b/tests/MarkItDown.Tests/PdfConverterTests.cs index 060c3dd50..0937068d6 100644 --- a/tests/MarkItDown.Tests/PdfConverterTests.cs +++ b/tests/MarkItDown.Tests/PdfConverterTests.cs @@ -1,3 +1,4 @@ +using System; using System.Collections.Generic; using System.IO; using System.Linq; @@ -38,6 +39,35 @@ public async Task ConvertAsync_CombinesTextAndImages() Assert.Equal(2, result.Segments[1].Number); Assert.Equal(SegmentType.Image, result.Segments[^1].Type); Assert.Equal(1, result.Segments[^1].Number); + Assert.Single(result.Artifacts.Images); + Assert.NotNull(result.Artifacts.Images[0].SegmentIndex); + } + + [Fact] + public async Task PdfConverter_PipelineEnrichesImages() + { + var pipeline = new RecordingPipeline("PIPELINE"); + var textExtractor = new StubPdfTextExtractor("Page body"); + var imageRenderer = new StubPdfImageRenderer(new[] { Convert.ToBase64String(new byte[] { 1, 2, 3 }) }); + var converter = new PdfConverter(textExtractor, imageRenderer, pipeline: pipeline); + + using var stream = new MemoryStream(new byte[] { 1, 2, 3, 4 }); + var streamInfo = new StreamInfo(mimeType: "application/pdf", extension: ".pdf", fileName: "pipeline.pdf"); + + var result = await converter.ConvertAsync(stream, streamInfo); + + Assert.True(pipeline.Executed); + Assert.Single(result.Artifacts.Images); + var image = result.Artifacts.Images[0]; + Assert.NotNull(image.SegmentIndex); + Assert.Equal("PIPELINE", image.DetailedDescription); + Assert.NotNull(image.PlaceholderMarkdown); + Assert.Contains("PIPELINE", result.Segments[image.SegmentIndex!.Value].Markdown); + var imageSegment = result.Segments[image.SegmentIndex!.Value]; + var placeholderIndex = imageSegment.Markdown.IndexOf(image.PlaceholderMarkdown!, StringComparison.Ordinal); + Assert.True(placeholderIndex >= 0); + var trailing = imageSegment.Markdown[(placeholderIndex + image.PlaceholderMarkdown!.Length)..]; + Assert.StartsWith("PIPELINE", trailing.TrimStart('\r', '\n')); } [Fact] diff --git a/tests/MarkItDown.Tests/PptxConverterTests.cs b/tests/MarkItDown.Tests/PptxConverterTests.cs new file mode 100644 index 000000000..fe070191d --- /dev/null +++ b/tests/MarkItDown.Tests/PptxConverterTests.cs @@ -0,0 +1,47 @@ +using System; +using System.Threading.Tasks; +using ManagedCode.MimeTypes; +using MarkItDown; +using MarkItDown.Converters; +using Shouldly; + +namespace MarkItDown.Tests; + +public class PptxConverterTests +{ + [Fact] + public async Task ConvertAsync_PptxWithImages_ExecutesPipelineAndCapturesArtifacts() + { + // Arrange + var pipeline = new RecordingPipeline("PPTX ENRICHED"); + var converter = new PptxConverter(pipeline: pipeline); + + await using var stream = TestAssetLoader.OpenAsset("test.pptx"); + var streamInfo = new StreamInfo( + mimeType: MimeHelper.GetMimeType(".pptx"), + extension: ".pptx", + fileName: "test.pptx"); + + // Act + var result = await converter.ConvertAsync(stream, streamInfo); + + // Assert + pipeline.Executed.ShouldBeTrue(); + result.Artifacts.Images.ShouldNotBeEmpty(); + result.Artifacts.TextBlocks.ShouldNotBeEmpty(); + + var image = result.Artifacts.Images[0]; + image.SegmentIndex.ShouldNotBeNull(); + image.DetailedDescription.ShouldBe("PPTX ENRICHED"); + image.Metadata.ShouldContainKey(MetadataKeys.Slide); + image.PlaceholderMarkdown.ShouldNotBeNull(); + + var segment = result.Segments[image.SegmentIndex!.Value]; + segment.Markdown.ShouldContain("PPTX ENRICHED"); + segment.Type.ShouldBe(SegmentType.Slide); + var placeholderIndex = segment.Markdown.IndexOf(image.PlaceholderMarkdown!, StringComparison.Ordinal); + placeholderIndex.ShouldBeGreaterThanOrEqualTo(0); + var trailing = segment.Markdown[(placeholderIndex + image.PlaceholderMarkdown!.Length)..]; + trailing.TrimStart('\r', '\n').ShouldStartWith("PPTX ENRICHED"); + } +} diff --git a/tests/MarkItDown.Tests/RecordingPipeline.cs b/tests/MarkItDown.Tests/RecordingPipeline.cs new file mode 100644 index 000000000..694f0c89b --- /dev/null +++ b/tests/MarkItDown.Tests/RecordingPipeline.cs @@ -0,0 +1,64 @@ +using System; +using System.Collections.Generic; +using System.Threading; +using System.Threading.Tasks; +using MarkItDown; + +namespace MarkItDown.Tests; + +internal sealed class RecordingPipeline : IConversionPipeline +{ + private readonly string message; + + public RecordingPipeline(string message = "Pipeline") + { + this.message = message; + } + + public bool Executed { get; private set; } + + public Task ExecuteAsync(StreamInfo streamInfo, ConversionArtifacts artifacts, IList segments, CancellationToken cancellationToken) + { + Executed = true; + + if (artifacts.Images.Count > 0) + { + var artifact = artifacts.Images[0]; + if (artifact.SegmentIndex is int index && index >= 0 && index < segments.Count) + { + var segment = segments[index]; + var updatedMarkdown = InjectAfterPlaceholder(segment.Markdown, artifact.PlaceholderMarkdown, Environment.NewLine + message); + var updatedSegment = new DocumentSegment( + updatedMarkdown, + segment.Type, + segment.Number, + segment.Label, + segment.StartTime, + segment.EndTime, + segment.Source, + segment.AdditionalMetadata); + segments[index] = updatedSegment; + artifact.DetailedDescription = message; + } + } + + return Task.CompletedTask; + } + + private static string InjectAfterPlaceholder(string markdown, string? placeholder, string injection) + { + if (string.IsNullOrWhiteSpace(placeholder)) + { + return markdown + injection; + } + + var index = markdown.IndexOf(placeholder, StringComparison.Ordinal); + if (index < 0) + { + return markdown + injection; + } + + var insertPosition = index + placeholder.Length; + return markdown.Insert(insertPosition, injection); + } +} From edfd1cb261f3d7929b067fb16115c614896db23f Mon Sep 17 00:00:00 2001 From: ksemenenko Date: Sat, 11 Oct 2025 18:03:45 +0200 Subject: [PATCH 2/6] middleware: tighten image enrichment loops --- .../Middleware/AiImageEnrichmentMiddleware.cs | 86 +++++++++++++++++-- 1 file changed, 77 insertions(+), 9 deletions(-) diff --git a/src/MarkItDown/Conversion/Middleware/AiImageEnrichmentMiddleware.cs b/src/MarkItDown/Conversion/Middleware/AiImageEnrichmentMiddleware.cs index 4afbb6995..443be6917 100644 --- a/src/MarkItDown/Conversion/Middleware/AiImageEnrichmentMiddleware.cs +++ b/src/MarkItDown/Conversion/Middleware/AiImageEnrichmentMiddleware.cs @@ -1,7 +1,8 @@ using System; +using System.Buffers; using System.Collections.Generic; using System.Globalization; -using System.Linq; +using System.Runtime.InteropServices; using System.Text; using System.Text.Json; using System.Threading; @@ -35,13 +36,13 @@ public async Task InvokeAsync(ConversionPipelineContext context, CancellationTok return; } - foreach (var image in context.Artifacts.Images) + async Task ProcessImageAsync(ImageArtifact image) { cancellationToken.ThrowIfCancellationRequested(); if (image.DetailedDescription is not null) { - continue; + return; } var prompt = BuildPrompt(context.StreamInfo, image); @@ -61,12 +62,12 @@ public async Task InvokeAsync(ConversionPipelineContext context, CancellationTok catch (Exception ex) { context.Logger?.LogWarning(ex, "Image enrichment failed for {Label}", image.Label ?? image.PageNumber?.ToString(CultureInfo.InvariantCulture)); - continue; + return; } if (response is null) { - continue; + return; } ImageInsight? insight = null; @@ -90,7 +91,7 @@ public async Task InvokeAsync(ConversionPipelineContext context, CancellationTok if (string.IsNullOrWhiteSpace(markdown)) { - continue; + return; } image.DetailedDescription = markdown.Trim(); @@ -102,6 +103,21 @@ public async Task InvokeAsync(ConversionPipelineContext context, CancellationTok image.Metadata["detailedDescription"] = image.DetailedDescription; } + + if (context.Artifacts.Images is List list) + { + for (var i = 0; i < list.Count; i++) + { + await ProcessImageAsync(list[i]).ConfigureAwait(false); + } + } + else + { + foreach (var image in context.Artifacts.Images) + { + await ProcessImageAsync(image).ConfigureAwait(false); + } + } } private static string BuildPrompt(StreamInfo streamInfo, ImageArtifact image) @@ -127,11 +143,34 @@ private static string BuildPrompt(StreamInfo streamInfo, ImageArtifact image) builder.AppendLine($"- MimeType: {image.ContentType ?? "unknown"}"); builder.AppendLine(); - var base64 = Convert.ToBase64String(image.Data); builder.Append("ImagePayload: data:"); builder.Append(image.ContentType ?? "application/octet-stream"); builder.Append(";base64,"); - builder.Append(base64); + + var base64Length = checked(((image.Data.Length + 2) / 3) * 4); + char[]? rented = null; + Span buffer = base64Length <= 4096 + ? stackalloc char[base64Length] + : (rented = ArrayPool.Shared.Rent(base64Length)); + + try + { + if (Convert.TryToBase64Chars(image.Data, buffer, out var charsWritten)) + { + builder.Append(buffer[..charsWritten]); + } + else + { + builder.Append(Convert.ToBase64String(image.Data)); + } + } + finally + { + if (rented is not null) + { + ArrayPool.Shared.Return(rented); + } + } return builder.ToString(); } @@ -300,10 +339,39 @@ public string ToMarkdown() if (KeyFindings.Count > 0) { + var originalLength = builder.Length; + var anyFinding = false; builder.AppendLine().AppendLine("Key findings:"); - foreach (var finding in KeyFindings.Where(static f => !string.IsNullOrWhiteSpace(f))) + + void AppendFinding(string? finding) { + if (string.IsNullOrWhiteSpace(finding)) + { + return; + } + builder.Append("- ").AppendLine(finding.Trim()); + anyFinding = true; + } + + if (KeyFindings is List list) + { + foreach (var finding in CollectionsMarshal.AsSpan(list)) + { + AppendFinding(finding); + } + } + else + { + foreach (var finding in KeyFindings) + { + AppendFinding(finding); + } + } + + if (!anyFinding) + { + builder.Length = originalLength; } } From 9fcba23a55c12adf3b77738b5c7e7451861c7581 Mon Sep 17 00:00:00 2001 From: ksemenenko Date: Sat, 11 Oct 2025 18:15:55 +0200 Subject: [PATCH 3/6] pdf: guard snapshot rendering failures --- src/MarkItDown/Converters/PdfConverter.cs | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/src/MarkItDown/Converters/PdfConverter.cs b/src/MarkItDown/Converters/PdfConverter.cs index 191ccac82..6d3c3da60 100644 --- a/src/MarkItDown/Converters/PdfConverter.cs +++ b/src/MarkItDown/Converters/PdfConverter.cs @@ -434,7 +434,23 @@ private async Task AppendMissingPageSnapshotsAsync( return; } - var renderedPages = await imageRenderer.RenderImagesAsync(pdfBytes, cancellationToken).ConfigureAwait(false); + IReadOnlyList renderedPages; + + try + { + renderedPages = await imageRenderer.RenderImagesAsync(pdfBytes, cancellationToken).ConfigureAwait(false); + } + catch (OperationCanceledException) + { + throw; + } + catch + { + // Rendering support is optional for document intelligence; ignore failures + // so that conversions can still succeed when the renderer is unavailable. + return; + } + if (renderedPages.Count == 0) { return; From f373de0532e2a50f6ef8f2cfefbd8d2409118653 Mon Sep 17 00:00:00 2001 From: ksemenenko Date: Sat, 11 Oct 2025 18:16:01 +0200 Subject: [PATCH 4/6] pdf: tolerate missing renderer when using pdfpig --- src/MarkItDown/Converters/PdfConverter.cs | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/MarkItDown/Converters/PdfConverter.cs b/src/MarkItDown/Converters/PdfConverter.cs index 6d3c3da60..33ac8b37e 100644 --- a/src/MarkItDown/Converters/PdfConverter.cs +++ b/src/MarkItDown/Converters/PdfConverter.cs @@ -405,7 +405,22 @@ private PdfExtractionResult BuildExtractionFromExtractedText(IReadOnlyList BuildExtractionFromPdfPigAsync(byte[] pdfBytes, StreamInfo streamInfo, CancellationToken cancellationToken) { var pages = await textExtractor.ExtractTextAsync(pdfBytes, cancellationToken).ConfigureAwait(false); - var pageImages = await imageRenderer.RenderImagesAsync(pdfBytes, cancellationToken).ConfigureAwait(false); + + IReadOnlyList pageImages; + + try + { + pageImages = await imageRenderer.RenderImagesAsync(pdfBytes, cancellationToken).ConfigureAwait(false); + } + catch (OperationCanceledException) + { + throw; + } + catch + { + pageImages = Array.Empty(); + } + return BuildExtractionFromExtractedText(pages, pageImages, streamInfo); } From 4b3d09ca44fae299b4e36248a09b5ed674b10cf0 Mon Sep 17 00:00:00 2001 From: ksemenenko Date: Sat, 11 Oct 2025 18:18:03 +0200 Subject: [PATCH 5/6] pdf: tolerate missing renderer when using pdfpig --- src/MarkItDown/Converters/PdfConverter.cs | 35 +++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/src/MarkItDown/Converters/PdfConverter.cs b/src/MarkItDown/Converters/PdfConverter.cs index 191ccac82..33ac8b37e 100644 --- a/src/MarkItDown/Converters/PdfConverter.cs +++ b/src/MarkItDown/Converters/PdfConverter.cs @@ -405,7 +405,22 @@ private PdfExtractionResult BuildExtractionFromExtractedText(IReadOnlyList BuildExtractionFromPdfPigAsync(byte[] pdfBytes, StreamInfo streamInfo, CancellationToken cancellationToken) { var pages = await textExtractor.ExtractTextAsync(pdfBytes, cancellationToken).ConfigureAwait(false); - var pageImages = await imageRenderer.RenderImagesAsync(pdfBytes, cancellationToken).ConfigureAwait(false); + + IReadOnlyList pageImages; + + try + { + pageImages = await imageRenderer.RenderImagesAsync(pdfBytes, cancellationToken).ConfigureAwait(false); + } + catch (OperationCanceledException) + { + throw; + } + catch + { + pageImages = Array.Empty(); + } + return BuildExtractionFromExtractedText(pages, pageImages, streamInfo); } @@ -434,7 +449,23 @@ private async Task AppendMissingPageSnapshotsAsync( return; } - var renderedPages = await imageRenderer.RenderImagesAsync(pdfBytes, cancellationToken).ConfigureAwait(false); + IReadOnlyList renderedPages; + + try + { + renderedPages = await imageRenderer.RenderImagesAsync(pdfBytes, cancellationToken).ConfigureAwait(false); + } + catch (OperationCanceledException) + { + throw; + } + catch + { + // Rendering support is optional for document intelligence; ignore failures + // so that conversions can still succeed when the renderer is unavailable. + return; + } + if (renderedPages.Count == 0) { return; From c8ad6d1e3dc4bf157b8789d7887494b36169d80b Mon Sep 17 00:00:00 2001 From: ksemenenko Date: Sat, 11 Oct 2025 18:23:11 +0200 Subject: [PATCH 6/6] Update src/MarkItDown/Converters/PdfConverter.cs Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/MarkItDown/Converters/PdfConverter.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/MarkItDown/Converters/PdfConverter.cs b/src/MarkItDown/Converters/PdfConverter.cs index 33ac8b37e..b0c0562f4 100644 --- a/src/MarkItDown/Converters/PdfConverter.cs +++ b/src/MarkItDown/Converters/PdfConverter.cs @@ -416,7 +416,7 @@ private async Task BuildExtractionFromPdfPigAsync(byte[] pd { throw; } - catch + catch (Exception) { pageImages = Array.Empty(); }