managedcode · KSemenenko · Oct 11, 2025 · Oct 11, 2025 · Oct 11, 2025 · Oct 11, 2025
diff --git a/README.md b/README.md
@@ -56,10 +56,12 @@ This is a high-fidelity C# port of Microsoft's original [MarkItDown Python libra
 
 ✨ **Modern .NET** - Targets .NET 9.0 with up-to-date language features  
 📦 **NuGet Package** - Drop-in dependency for libraries and automation pipelines  
-🔄 **Async/Await** - Fully asynchronous pipeline for responsive apps  
-🧠 **LLM-Optimized** - Markdown tailored for AI ingestion and summarisation  
-🔧 **Extensible** - Register custom converters or plug additional caption/transcription services  
-🧭 **Smart Detection** - Automatic MIME, charset, and file-type guessing (including data/file URIs)  
+🔄 **Async/Await** - Fully asynchronous pipeline for responsive apps  
+🧠 **LLM-Optimized** - Markdown tailored for AI ingestion and summarisation  
+🔧 **Extensible** - Register custom converters or plug additional caption/transcription services  
+🧩 **Conversion middleware** - Compose post-processing steps with `IConversionMiddleware` (AI enrichment ready)  
+📂 **Raw artifacts API** - Inspect text blocks, tables, and images via `DocumentConverterResult.Artifacts`  
+🧭 **Smart Detection** - Automatic MIME, charset, and file-type guessing (including data/file URIs)  
 ⚡ **High Performance** - Stream-friendly, minimal allocations, zero temp files
 
 ## 📋 Format Support
@@ -102,11 +104,12 @@ This is a high-fidelity C# port of Microsoft's original [MarkItDown Python libra
 - Header detection based on formatting
 - List item recognition
 - Title extraction from document content
+- Page snapshot artifacts ensure every page can be sent through AI enrichment (OCR, diagram-to-Mermaid, chart narration) even when the PDF exposes selectable text
 
 ### Office Documents (DOCX/XLSX/PPTX)
-- **Word (.docx)**: Headers, paragraphs, tables, bold/italic formatting
+- **Word (.docx)**: Headers, paragraphs, tables, bold/italic formatting, and embedded images captured for AI enrichment (OCR, Mermaid-ready diagrams)
 - **Excel (.xlsx)**: Spreadsheet data as Markdown tables with sheet organization
-- **PowerPoint (.pptx)**: Slide-by-slide content with title recognition
+- **PowerPoint (.pptx)**: Slide-by-slide content with title recognition plus image artifacts primed for detailed AI captions and diagrams
 
 ### CSV Conversion Features
 - Automatic table formatting with headers
@@ -1056,6 +1059,17 @@ var result = await markItDown.ConvertAsync("document.pdf");
 Console.WriteLine(result.Markdown);
 ```
 
+### .NET SDK Setup
+
+MarkItDown targets .NET 9.0. If your environment does not have the required SDK, run the helper script once:
+
+```bash
+./eng/install-dotnet.sh
+```
+
+The script installs the SDK into `$DOTNET_ROOT` when that variable is set (otherwise `~/.dotnet`) using the official `dotnet-install` bootstrapper and prints the environment
+variables to add to your shell profile so the `dotnet` CLI is available in subsequent sessions.
+
 ### Building from Source
 
 ```bash
@@ -1084,6 +1098,10 @@ The command emits standard test results plus a Cobertura coverage report at
 [ReportGenerator](https://github.com/danielpalme/ReportGenerator) can turn this into
 HTML or Markdown dashboards.
 
+> ✅ The regression suite now exercises DOCX and PPTX conversions with embedded imagery, ensuring conversion middleware runs and enriched descriptions remain attached to the composed Markdown.
+>
+> ✅ Additional image-placement regressions verify that AI-generated captions are injected immediately after each source placeholder for DOCX, PPTX, and PDF outputs.
+
 ### Project Structure
 
 ```
@@ -1218,6 +1236,31 @@ var options = new MarkItDownOptions
 var markItDown = new MarkItDown(options);
 ```
 
+### Conversion Middleware & Raw Artifacts
+
+Every conversion now exposes the raw extraction artifacts that feed the Markdown composer. Use `DocumentConverterResult.Artifacts` to inspect page text, tables, or embedded images before they are flattened into Markdown. You can plug in additional processing by registering `IConversionMiddleware` instances through `MarkItDownOptions.ConversionMiddleware`. Middleware executes after extraction and can mutate segments, enrich metadata, or call external AI services. When an `IChatClient` is supplied and `EnableAiImageEnrichment` remains `true` (default), MarkItDown automatically adds the built-in `AiImageEnrichmentMiddleware` to describe charts, diagrams, and other visuals. The middleware keeps enriched prose anchored to the exact Markdown placeholder emitted during extraction, ensuring captions, Mermaid diagrams, and OCR text land beside the original image instead of drifting to the end of the section.
+
+```csharp
+var options = new MarkItDownOptions
+{
+    AiModels = new StaticAiModelProvider(chatClient: myChatClient, speechToTextClient: null),
+    ConversionMiddleware = new IConversionMiddleware[]
+    {
+        new MyDomainSpecificMiddleware()
+    }
+};
+
+var markItDown = new MarkItDown(options);
+var result = await markItDown.ConvertAsync("docs/diagram.docx");
+
+foreach (var image in result.Artifacts.Images)
+{
+    Console.WriteLine($"Image {image.Label}: {image.DetailedDescription}");
+}
+```
+
+Set `EnableAiImageEnrichment` to `false` when you need a completely custom pipeline with no default AI step.
+
 ### Production Configuration with Error Handling
 
 ```csharp

diff --git a/eng/install-dotnet.sh b/eng/install-dotnet.sh
@@ -0,0 +1,40 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+CHANNEL="9.0"
+INSTALL_DIR="${DOTNET_ROOT:-$HOME/.dotnet}"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+RESOLVED_INSTALL_DIR="${INSTALL_DIR}"
+TEMP_SCRIPT="${SCRIPT_DIR}/dotnet-install.sh"
+
+cleanup() {
+  rm -f "${TEMP_SCRIPT}"
+}
+trap cleanup EXIT
+
+if ! command -v wget >/dev/null 2>&1 && ! command -v curl >/dev/null 2>&1; then
+  echo "Either wget or curl is required to download dotnet-install.sh" >&2
+  exit 1
+fi
+
+DOWNLOAD_TOOL="wget"
+DOWNLOAD_ARGS=("-q" "-O")
+URL="https://dot.net/v1/dotnet-install.sh"
+
+if command -v curl >/dev/null 2>&1; then
+  DOWNLOAD_TOOL="curl"
+  DOWNLOAD_ARGS=("-sSL" "-o")
+fi
+
+${DOWNLOAD_TOOL} "${DOWNLOAD_ARGS[@]}" "${TEMP_SCRIPT}" "${URL}"
+chmod +x "${TEMP_SCRIPT}"
+
+"${TEMP_SCRIPT}" --channel "${CHANNEL}" --install-dir "${INSTALL_DIR}" --no-path
+
+cat <<EON
+Add the following to your shell profile to use the installed .NET SDK:
+
+    export DOTNET_ROOT="${RESOLVED_INSTALL_DIR}"
+    export PATH="\$DOTNET_ROOT:\$PATH"
+
+EON
diff --git a/src/MarkItDown/Conversion/ConversionArtifacts.cs b/src/MarkItDown/Conversion/ConversionArtifacts.cs
@@ -0,0 +1,173 @@
+using System.Collections.ObjectModel;
+
+namespace MarkItDown;
+
+/// <summary>
+/// Represents the raw artifacts extracted during conversion prior to Markdown composition.
+/// </summary>
+public sealed class ConversionArtifacts
+{
+    // These shared read-only collections must be declared BEFORE Empty.
+    // Static initializers execute in textual order, so if Empty were initialized
+    // first its constructor would capture these fields while they are still null.
+    private static readonly IList<TextArtifact> EmptyTextBlocks = new ReadOnlyCollection<TextArtifact>(Array.Empty<TextArtifact>());
+    private static readonly IList<TableArtifact> EmptyTables = new ReadOnlyCollection<TableArtifact>(Array.Empty<TableArtifact>());
+    private static readonly IList<ImageArtifact> EmptyImages = new ReadOnlyCollection<ImageArtifact>(Array.Empty<ImageArtifact>());
+    private static readonly IDictionary<string, string> EmptyMetadata = new ReadOnlyDictionary<string, string>(new Dictionary<string, string>());
+
+    /// <summary>
+    /// Initializes a new instance of the <see cref="ConversionArtifacts"/> class
+    /// with empty, mutable collections.
+    /// </summary>
+    public ConversionArtifacts()
+    {
+        TextBlocks = new List<TextArtifact>();
+        Tables = new List<TableArtifact>();
+        Images = new List<ImageArtifact>();
+        Metadata = new Dictionary<string, string>();
+    }
+
+    // Backs the shared Empty instance. The bool parameter is unused; it only
+    // distinguishes this overload from the public parameterless constructor.
+    private ConversionArtifacts(bool _)
+    {
+        TextBlocks = EmptyTextBlocks;
+        Tables = EmptyTables;
+        Images = EmptyImages;
+        Metadata = EmptyMetadata;
+    }
+
+    /// <summary>
+    /// Gets a reusable empty instance whose collections are shared read-only wrappers.
+    /// </summary>
+    public static ConversionArtifacts Empty { get; } = new(true);
+
+    /// <summary>
+    /// Gets the raw text artifacts captured from the source.
+    /// </summary>
+    public IList<TextArtifact> TextBlocks { get; }
+
+    /// <summary>
+    /// Gets the tabular artifacts captured from the source.
+    /// </summary>
+    public IList<TableArtifact> Tables { get; }
+
+    /// <summary>
+    /// Gets the image artifacts captured from the source.
+    /// </summary>
+    public IList<ImageArtifact> Images { get; }
+
+    /// <summary>
+    /// Gets conversion-level metadata surfaced by the converter.
+    /// </summary>
+    public IDictionary<string, string> Metadata { get; }
+}
+
+/// <summary>
+/// Represents a block of text extracted from the source document.
+/// </summary>
+public sealed class TextArtifact
+{
+    /// <summary>
+    /// Creates a text artifact. A <c>null</c> <paramref name="text"/> is stored as an empty string.
+    /// </summary>
+    /// <param name="text">The extracted text.</param>
+    /// <param name="pageNumber">Optional page number associated with the text.</param>
+    /// <param name="source">Optional source identifier.</param>
+    /// <param name="label">Optional label.</param>
+    public TextArtifact(string text, int? pageNumber = null, string? source = null, string? label = null)
+    {
+        (Text, PageNumber, Source, Label) = (text ?? string.Empty, pageNumber, source, label);
+    }
+
+    /// <summary>Gets or sets the text content.</summary>
+    public string Text { get; set; }
+
+    /// <summary>Gets or sets the page number, when one was supplied.</summary>
+    public int? PageNumber { get; set; }
+
+    /// <summary>Gets or sets the source identifier.</summary>
+    public string? Source { get; set; }
+
+    /// <summary>Gets or sets the label.</summary>
+    public string? Label { get; set; }
+}
+
+/// <summary>
+/// Represents tabular content extracted from the source document.
+/// </summary>
+public sealed class TableArtifact
+{
+    /// <summary>
+    /// Creates a table artifact that wraps the supplied row collection.
+    /// </summary>
+    /// <param name="rows">The table rows; the same instance is exposed through <see cref="Rows"/>.</param>
+    /// <param name="pageNumber">Optional page number associated with the table.</param>
+    /// <param name="source">Optional source identifier.</param>
+    /// <param name="label">Optional label.</param>
+    /// <exception cref="ArgumentNullException"><paramref name="rows"/> is <c>null</c>.</exception>
+    public TableArtifact(IList<IList<string>> rows, int? pageNumber = null, string? source = null, string? label = null)
+    {
+        if (rows is null)
+        {
+            throw new ArgumentNullException(nameof(rows));
+        }
+
+        Rows = rows;
+        Label = label;
+        Source = source;
+        PageNumber = pageNumber;
+    }
+
+    /// <summary>Gets the table rows.</summary>
+    public IList<IList<string>> Rows { get; }
+
+    /// <summary>Gets or sets the page number, when one was supplied.</summary>
+    public int? PageNumber { get; set; }
+
+    /// <summary>Gets or sets the source identifier.</summary>
+    public string? Source { get; set; }
+
+    /// <summary>Gets or sets the label.</summary>
+    public string? Label { get; set; }
+}
+
+/// <summary>
+/// Represents an image extracted from the source document.
+/// </summary>
+public sealed class ImageArtifact
+{
+    /// <summary>
+    /// Creates an image artifact around the supplied binary payload.
+    /// </summary>
+    /// <param name="data">The image bytes; the same array is exposed through <see cref="Data"/>.</param>
+    /// <param name="contentType">Optional content type of the image.</param>
+    /// <param name="pageNumber">Optional page number associated with the image.</param>
+    /// <param name="source">Optional source identifier.</param>
+    /// <param name="label">Optional label.</param>
+    /// <exception cref="ArgumentNullException"><paramref name="data"/> is <c>null</c>.</exception>
+    public ImageArtifact(byte[] data, string? contentType = null, int? pageNumber = null, string? source = null, string? label = null)
+    {
+        if (data is null)
+        {
+            throw new ArgumentNullException(nameof(data));
+        }
+
+        Data = data;
+        (ContentType, PageNumber, Source, Label) = (contentType, pageNumber, source, label);
+    }
+
+    /// <summary>
+    /// Gets the raw binary data for the image.
+    /// </summary>
+    public byte[] Data { get; }
+
+    /// <summary>
+    /// Gets or sets the content type associated with the image.
+    /// </summary>
+    public string? ContentType { get; set; }
+
+    /// <summary>
+    /// Gets or sets the page number that owns the image, when applicable.
+    /// </summary>
+    public int? PageNumber { get; set; }
+
+    /// <summary>
+    /// Gets or sets the logical source identifier for the image.
+    /// </summary>
+    public string? Source { get; set; }
+
+    /// <summary>
+    /// Gets or sets the friendly label for the image.
+    /// </summary>
+    public string? Label { get; set; }
+
+    /// <summary>
+    /// Gets or sets the enriched description generated for the image.
+    /// </summary>
+    public string? DetailedDescription { get; set; }
+
+    /// <summary>
+    /// Gets or sets a Mermaid diagram representation when the image depicts structured data.
+    /// </summary>
+    public string? MermaidDiagram { get; set; }
+
+    /// <summary>
+    /// Gets or sets additional textual extraction (such as OCR output).
+    /// </summary>
+    public string? RawText { get; set; }
+
+    /// <summary>
+    /// Gets the metadata dictionary for this image artifact; it starts out empty.
+    /// </summary>
+    public IDictionary<string, string> Metadata { get; } = new Dictionary<string, string>();
+
+    /// <summary>
+    /// Gets or sets the segment index that references this artifact within the composed output.
+    /// </summary>
+    public int? SegmentIndex { get; set; }
+
+    /// <summary>
+    /// Gets or sets the Markdown placeholder that was emitted during extraction for this image.
+    /// </summary>
+    public string? PlaceholderMarkdown { get; set; }
+}
diff --git a/src/MarkItDown/Conversion/ConversionPipeline.cs b/src/MarkItDown/Conversion/ConversionPipeline.cs
@@ -0,0 +1,51 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Threading;
+using System.Threading.Tasks;
+using MarkItDown.Intelligence;
+using Microsoft.Extensions.Logging;
+
+namespace MarkItDown;
+
+/// <summary>
+/// Sequential middleware pipeline that executes configured <see cref="IConversionMiddleware"/> components.
+/// </summary>
+public sealed class ConversionPipeline : IConversionPipeline
+{
+    private readonly IReadOnlyList<IConversionMiddleware> middlewares;
+    private readonly IAiModelProvider aiModels;
+    private readonly ILogger? logger;
+
+    /// <summary>
+    /// Gets a shared pipeline that has no middleware, no AI models, and no logger.
+    /// </summary>
+    public static IConversionPipeline Empty { get; } = new ConversionPipeline(Array.Empty<IConversionMiddleware>(), NullAiModelProvider.Instance, logger: null);
+
+    /// <summary>
+    /// Initializes a new instance of the <see cref="ConversionPipeline"/> class.
+    /// </summary>
+    /// <param name="middlewares">The middleware to run, in order. The sequence is copied at construction time.</param>
+    /// <param name="aiModels">The AI model provider handed to each middleware; <see cref="NullAiModelProvider.Instance"/> is used when <c>null</c>.</param>
+    /// <param name="logger">Optional logger that receives a warning when a middleware fails.</param>
+    /// <exception cref="ArgumentNullException"><paramref name="middlewares"/> is <c>null</c>.</exception>
+    public ConversionPipeline(IEnumerable<IConversionMiddleware> middlewares, IAiModelProvider aiModels, ILogger? logger)
+    {
+        this.middlewares = (middlewares ?? throw new ArgumentNullException(nameof(middlewares))).ToArray();
+        this.aiModels = aiModels ?? NullAiModelProvider.Instance;
+        this.logger = logger;
+    }
+
+    /// <summary>
+    /// Runs every configured middleware sequentially against a shared pipeline context.
+    /// </summary>
+    /// <remarks>
+    /// A failing middleware is logged as a warning and skipped so the remaining middleware still run.
+    /// Cancellation requested through <paramref name="cancellationToken"/> is never swallowed.
+    /// </remarks>
+    /// <param name="streamInfo">Information about the stream being converted.</param>
+    /// <param name="artifacts">The artifacts extracted during conversion.</param>
+    /// <param name="segments">The document segments that middleware may inspect or mutate.</param>
+    /// <param name="cancellationToken">Token used to cancel the pipeline.</param>
+    /// <exception cref="OperationCanceledException">The <paramref name="cancellationToken"/> was cancelled.</exception>
+    public async Task ExecuteAsync(StreamInfo streamInfo, ConversionArtifacts artifacts, IList<DocumentSegment> segments, CancellationToken cancellationToken)
+    {
+        if (middlewares.Count == 0)
+        {
+            return;
+        }
+
+        var context = new ConversionPipelineContext(streamInfo, artifacts, segments, aiModels, logger);
+        foreach (var middleware in middlewares)
+        {
+            cancellationToken.ThrowIfCancellationRequested();
+
+            try
+            {
+                await middleware.InvokeAsync(context, cancellationToken).ConfigureAwait(false);
+            }
+            catch (Exception ex) when (!(ex is OperationCanceledException && cancellationToken.IsCancellationRequested))
+            {
+                // Best-effort: one failing middleware must not abort the conversion.
+                // The filter lets caller-requested cancellation propagate instead of
+                // being logged as a failure (and lost entirely on the last middleware).
+                logger?.LogWarning(ex, "Conversion middleware {Middleware} failed", middleware.GetType().Name);
+            }
+        }
+    }
+}