Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 27 additions & 25 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ dotnet add package ManagedCode.MarkItDown
using MarkItDown;

// Create converter instance
var markItDown = new MarkItDown();
var markItDown = new MarkItDownClient();

// Convert any file to Markdown
var result = await markItDown.ConvertAsync("document.pdf");
Expand All @@ -218,7 +218,7 @@ using Microsoft.Extensions.Logging;
// Set up logging to track conversion progress
using var loggerFactory = LoggerFactory.Create(builder => builder.AddConsole());
var logger = loggerFactory.CreateLogger<MarkItDownClient>();
var markItDown = new MarkItDown(logger: logger);
var markItDown = new MarkItDownClient(logger: logger);

// Convert documents for vector database ingestion
string[] documents = { "report.pdf", "data.xlsx", "webpage.html" };
Expand All @@ -245,7 +245,7 @@ foreach (var doc in documents)
```csharp
using MarkItDown;

var markItDown = new MarkItDown();
var markItDown = new MarkItDownClient();
var emailFolder = @"C:\Emails\Exports";
var outputFolder = @"C:\ProcessedEmails";

Expand All @@ -271,7 +271,7 @@ using Microsoft.Extensions.Logging;
using var loggerFactory = LoggerFactory.Create(builder => builder.AddConsole());
using var httpClient = new HttpClient();

var markItDown = new MarkItDown(
var markItDown = new MarkItDownClient(
logger: loggerFactory.CreateLogger<MarkItDownClient>(),
httpClient: httpClient);

Expand Down Expand Up @@ -310,7 +310,7 @@ foreach (var url in urls)
using MarkItDown;

// Convert a DOCX file and print the Markdown
var markItDown = new MarkItDown();
var markItDown = new MarkItDownClient();
DocumentConverterResult result = await markItDown.ConvertAsync("report.docx");
Console.WriteLine(result.Markdown);
```
Expand All @@ -329,7 +329,7 @@ var streamInfo = new StreamInfo(
charset: Encoding.UTF8,
fileName: "invoice.html");

var markItDown = new MarkItDown();
var markItDown = new MarkItDownClient();
var result = await markItDown.ConvertAsync(stream, streamInfo);
Console.WriteLine(result.Title);
```
Expand All @@ -340,7 +340,7 @@ Console.WriteLine(result.Title);
using MarkItDown;

// Convert an EML file to Markdown
var markItDown = new MarkItDown();
var markItDown = new MarkItDownClient();
DocumentConverterResult result = await markItDown.ConvertAsync("message.eml");

// The result includes email headers and content
Expand Down Expand Up @@ -369,7 +369,7 @@ using Microsoft.Extensions.Logging;
using var loggerFactory = LoggerFactory.Create(static builder => builder.AddConsole());
using var httpClient = new HttpClient();

var markItDown = new MarkItDown(
var markItDown = new MarkItDownClient(
logger: loggerFactory.CreateLogger<MarkItDownClient>(),
httpClient: httpClient);

Expand Down Expand Up @@ -462,7 +462,7 @@ var options = new MarkItDownOptions
}
};

var markItDown = new MarkItDown(options);
var markItDown = new MarkItDownClient(options);

// Segments are still available programmatically even when annotations are disabled.
```
Expand Down Expand Up @@ -494,7 +494,7 @@ public sealed class MyCustomConverter : IDocumentConverter
}
}

var markItDown = new MarkItDown();
var markItDown = new MarkItDownClient();
markItDown.RegisterConverter(new MyCustomConverter());
```

Expand Down Expand Up @@ -546,7 +546,7 @@ public class DocumentProcessor
public DocumentProcessor(ILogger<DocumentProcessor> logger)
{
_logger = logger;
_markItDown = new MarkItDown(logger: logger);
_markItDown = new MarkItDownClient(logger: logger);
}

public async Task<List<ProcessedDocument>> ProcessDirectoryAsync(
Expand Down Expand Up @@ -611,7 +611,7 @@ public class DocumentIndexer
public DocumentIndexer(IVectorStore vectorStore)
{
_vectorStore = vectorStore;
_markItDown = new MarkItDown();
_markItDown = new MarkItDownClient();
}

public async Task IndexDocumentAsync<T>(string filePath) where T : class
Expand Down Expand Up @@ -690,7 +690,7 @@ public class DocumentConversionFunction
public DocumentConversionFunction(ILogger<DocumentConversionFunction> logger)
{
_logger = logger;
_markItDown = new MarkItDown(logger: logger);
_markItDown = new MarkItDownClient(logger: logger);
}

[Function("ConvertDocument")]
Expand Down Expand Up @@ -812,6 +812,8 @@ MarkItDown exposes optional abstractions for running documents through cloud ser

The `AzureIntelligenceOptions`, `GoogleIntelligenceOptions`, and `AwsIntelligenceOptions` helpers wire the respective cloud Document AI/Vision/Speech stacks without forcing the dependency on consumers. You can still bring your own implementation by assigning the provider interfaces directly on `MarkItDownOptions`.

`MarkItDownClient` emits structured `ILogger` events and OpenTelemetry spans by default. Toggle instrumentation with `MarkItDownOptions.EnableTelemetry`, supply a custom `ActivitySource`/`Meter`, or provide a `LoggerFactory` to integrate with your application's logging pipeline.

#### Azure AI setup (keys and managed identity)

- **Docs**: [Document Intelligence](https://learn.microsoft.com/azure/ai-services/document-intelligence/), [Computer Vision Image Analysis](https://learn.microsoft.com/azure/ai-services/computer-vision/overview-image-analysis), [Video Indexer authentication](https://learn.microsoft.com/azure/azure-video-indexer/video-indexer-get-started/connect-to-azure).
Expand Down Expand Up @@ -943,7 +945,7 @@ For LLM-style post-processing, assign `MarkItDownOptions.AiModels` with an `IAiM
```csharp
using MarkItDown;

var markItDown = new MarkItDown();
var markItDown = new MarkItDownClient();

try
{
Expand Down Expand Up @@ -1010,7 +1012,7 @@ using var httpClient = new HttpClient();
httpClient.Timeout = TimeSpan.FromSeconds(30);
httpClient.DefaultRequestHeaders.Add("User-Agent", "MarkItDown/1.0");

var markItDown = new MarkItDown(httpClient: httpClient);
var markItDown = new MarkItDownClient(httpClient: httpClient);
```

**Logging for Diagnostics:**
Expand All @@ -1021,7 +1023,7 @@ using var loggerFactory = LoggerFactory.Create(builder =>
builder.AddConsole().SetMinimumLevel(LogLevel.Debug));

var logger = loggerFactory.CreateLogger<MarkItDownClient>();
var markItDown = new MarkItDown(logger: logger);
var markItDown = new MarkItDownClient(logger: logger);

// Now you'll see detailed conversion progress in console output
```
Expand All @@ -1034,7 +1036,7 @@ If you're familiar with the original Python library, here are the key difference

| Python | C#/.NET | Notes |
|---------|---------|--------|
| `MarkItDown()` | `new MarkItDown()` | Similar constructor |
| `MarkItDown()` | `new MarkItDownClient()` | Similar constructor |
| `markitdown.convert("file.pdf")` | `await markItDown.ConvertAsync("file.pdf")` | Async pattern |
| `markitdown.convert(stream, file_extension=".pdf")` | `await markItDown.ConvertAsync(stream, streamInfo)` | StreamInfo object |
| `markitdown.convert_url("https://...")` | `await markItDown.ConvertFromUrlAsync("https://...")` | Async URL conversion |
Expand All @@ -1046,15 +1048,15 @@ If you're familiar with the original Python library, here are the key difference
```python
# Python version
import markitdown
md = markitdown.MarkItDown()
md = markitdown.MarkItDown()
result = md.convert("document.pdf")
print(result.text_content)
```

```csharp
// C# version
using MarkItDown;
var markItDown = new MarkItDown();
var markItDown = new MarkItDownClient();
var result = await markItDown.ConvertAsync("document.pdf");
Console.WriteLine(result.Markdown);
```
Expand Down Expand Up @@ -1170,7 +1172,7 @@ Performance will vary based on your specific documents and environment. For prod

```csharp
// 1. Reuse MarkItDownClient instances (they're thread-safe)
var markItDown = new MarkItDown();
var markItDown = new MarkItDownClient();
await Task.WhenAll(
markItDown.ConvertAsync("file1.pdf"),
markItDown.ConvertAsync("file2.docx"),
Expand All @@ -1183,7 +1185,7 @@ var result = await markItDown.ConvertAsync("large-file.pdf", cancellationToken:

// 3. Configure HttpClient for web content (reuse connections)
using var httpClient = new HttpClient();
var markItDown = new MarkItDown(httpClient: httpClient);
var markItDown = new MarkItDownClient(httpClient: httpClient);

// 4. Pre-specify StreamInfo to skip format detection
var streamInfo = new StreamInfo(mimeType: "application/pdf", extension: ".pdf");
Expand All @@ -1202,7 +1204,7 @@ var options = new MarkItDownOptions
ExifToolPath = "/usr/local/bin/exiftool" // Path to exiftool binary (optional)
};

var markItDown = new MarkItDown(options);
var markItDown = new MarkItDownClient(options);
```

### Advanced AI Integration
Expand Down Expand Up @@ -1233,7 +1235,7 @@ var options = new MarkItDownOptions
}
};

var markItDown = new MarkItDown(options);
var markItDown = new MarkItDownClient(options);
```

### Conversion Middleware & Raw Artifacts
Expand All @@ -1250,7 +1252,7 @@ var options = new MarkItDownOptions
}
};

var markItDown = new MarkItDown(options);
var markItDown = new MarkItDownClient(options);
var result = await markItDown.ConvertAsync("docs/diagram.docx");

foreach (var image in result.Artifacts.Images)
Expand Down Expand Up @@ -1294,7 +1296,7 @@ var options = new MarkItDownOptions
}
};

var markItDown = new MarkItDown(options, logger, httpClientFactory.CreateClient());
var markItDown = new MarkItDownClient(options, logger, httpClientFactory.CreateClient());
```

## 📄 License
Expand Down
45 changes: 45 additions & 0 deletions docs/MetaMD.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# MetaMD (MMD)

MetaMD is a Markdown profile that layers structured metadata and citation-aware rendering on top of CommonMark. Files typically use the `.metamd` extension (optionally `.metamd.md`) and begin with a JSON front matter block delimited by `+++` fences.

## Front Matter Schema

```json
{
"title": "Document title",
"abstract": "Optional abstract text.",
"contributors": ["Name", "Name"],
"affiliations": ["Organisation"],
"keywords": ["term", "term"],
"references": [
{
"id": "unique-id",
"title": "Reference title",
"authors": ["Author"],
"url": "https://example.com/reference"
}
]
}
```

All properties are optional. Unknown properties are ignored by the converter.

## Reference Syntax

Inline citations use `[@id]`. During conversion each citation is replaced with a Markdown link if a URL is present, or bold text when the reference has no URL. Referenced entries are collected and emitted in a `## References` section at the end of the document, preserving author lists and links.

## Diagram Blocks

MetaMD supports lightweight diagram embedding via custom blocks:

```
:::diagram type="mermaid"
<diagram body>
:::
```

The converter rewrites these blocks as fenced code blocks using the requested diagram type (e.g., `mermaid`, `dot`, `plantuml`).

## Compatibility

Because MetaMD is a superset of Markdown, downstream tools that do not recognise the front matter or diagram directives still render the body content. The .NET converter automatically recognises `.metamd` and `.metamd.md` files, extracts metadata into headings, and normalises references for consistent Markdown output.
102 changes: 102 additions & 0 deletions src/MarkItDown/Converters/AsciiDocConverter.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Threading.Tasks;
using ManagedCode.MimeTypes;

namespace MarkItDown.Converters;

/// <summary>
/// Converter for AsciiDoc documents.
/// </summary>
/// <remarks>
/// This is a lightweight, line-oriented translation rather than a full AsciiDoc parser.
/// It handles section titles (<c>=</c> runs followed by a space), single-line list items
/// (<c>*</c>, <c>-</c>, <c>.</c> markers followed by a space) and constrained bold/italic
/// emphasis. Every other line is passed through with only inline emphasis rewritten.
/// </remarks>
public sealed class AsciiDocConverter : IDocumentConverter
{
    private static readonly IReadOnlyCollection<string> Extensions = new[]
    {
        ".adoc",
        ".asciidoc",
    };

    private static readonly IReadOnlyCollection<string> MimeTypes = new[]
    {
        MimeHelper.GetMimeType(".adoc") ?? MimeTypeUtilities.Compose("text", "asciidoc"),
        MimeHelper.GetMimeType(".asciidoc") ?? MimeTypeUtilities.Compose("text", "asciidoc"),
    };

    // Both emphasis patterns have two alternatives:
    //   1. a backtick code span, matched first so that emphasis markers inside inline code are
    //      consumed and left untouched (the "text" group does not participate in that case);
    //   2. a constrained emphasis run: the delimiter must not be glued to a letter, digit or another
    //      delimiter on the outside, and the text must not start or end with whitespace. This keeps
    //      snake_case identifiers, arithmetic such as 2*3*4 and already-doubled "**x**" intact.
    private static readonly Regex Bold = new(
        @"`[^`]+`|(?<![\p{L}\p{N}*])\*(?<text>[^\s*](?:[^*]*[^\s*])?)\*(?![\p{L}\p{N}*])",
        RegexOptions.Compiled);

    private static readonly Regex Italic = new(
        @"`[^`]+`|(?<![\p{L}\p{N}_])_(?<text>[^\s_](?:[^_]*[^\s_])?)_(?![\p{L}\p{N}_])",
        RegexOptions.Compiled);

    /// <summary>Gets the converter priority value (160).</summary>
    public int Priority => 160;

    /// <summary>
    /// Determines whether the stream metadata identifies an AsciiDoc document.
    /// </summary>
    /// <param name="streamInfo">Metadata (extension and MIME type) describing the input.</param>
    /// <returns>
    /// <c>true</c> when the lower-cased extension is <c>.adoc</c>/<c>.asciidoc</c>, or when either the
    /// normalized or the raw MIME type matches one of the known AsciiDoc MIME types.
    /// </returns>
    public bool AcceptsInput(StreamInfo streamInfo)
    {
        var normalizedMime = MimeTypeUtilities.NormalizeMime(streamInfo);
        var extension = streamInfo.Extension?.ToLowerInvariant();
        if (extension is not null && Extensions.Contains(extension))
        {
            return true;
        }

        return MimeTypeUtilities.MatchesAny(normalizedMime, MimeTypes)
            || MimeTypeUtilities.MatchesAny(streamInfo.MimeType, MimeTypes);
    }

    /// <summary>
    /// Determines whether this converter accepts the input; the decision is based on
    /// <paramref name="streamInfo"/> only, the stream content is not inspected.
    /// </summary>
    public bool Accepts(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken = default) => AcceptsInput(streamInfo);

    /// <summary>
    /// Reads the whole stream as text and converts the AsciiDoc content to Markdown.
    /// </summary>
    /// <param name="stream">Input stream; it is left open after reading.</param>
    /// <param name="streamInfo">Metadata describing the input.</param>
    /// <param name="cancellationToken">Token used to cancel the read.</param>
    /// <returns>
    /// A result built from the generated Markdown and <c>streamInfo.FileName</c>
    /// (passed as the second constructor argument).
    /// </returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="stream"/> or <paramref name="streamInfo"/> is <c>null</c>.
    /// </exception>
    public async Task<DocumentConverterResult> ConvertAsync(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken = default)
    {
        ArgumentNullException.ThrowIfNull(stream);
        ArgumentNullException.ThrowIfNull(streamInfo);

        // UTF-8 is the fallback; a byte-order mark, when present, overrides it.
        using var reader = new StreamReader(stream, Encoding.UTF8, detectEncodingFromByteOrderMarks: true, leaveOpen: true);
        var content = await reader.ReadToEndAsync(cancellationToken).ConfigureAwait(false);
        var markdown = ConvertToMarkdown(content);
        return new DocumentConverterResult(markdown, streamInfo.FileName);
    }

    /// <summary>
    /// Translates AsciiDoc text to Markdown one line at a time.
    /// </summary>
    private static string ConvertToMarkdown(string adoc)
    {
        var lines = adoc.Replace("\r\n", "\n").Split('\n');
        var builder = new StringBuilder();
        foreach (var line in lines)
        {
            var trimmed = line.TrimEnd();
            if (string.IsNullOrWhiteSpace(trimmed))
            {
                builder.AppendLine();
                continue;
            }

            if (TryConvertHeading(trimmed, out var heading))
            {
                builder.AppendLine(heading);
                continue;
            }

            if (TryConvertListItem(trimmed, out var listItem))
            {
                builder.AppendLine(listItem);
                continue;
            }

            builder.AppendLine(ConvertInline(trimmed));
        }

        return builder.ToString().Trim();
    }

    /// <summary>
    /// Converts a section title (<c>= Title</c>, <c>== Section</c>, ...) to an ATX heading.
    /// </summary>
    /// <remarks>
    /// A run of <c>=</c> only counts as a title when it is followed by whitespace, so block
    /// delimiters such as <c>====</c> are not turned into empty headings.
    /// </remarks>
    private static bool TryConvertHeading(string line, out string heading)
    {
        heading = string.Empty;

        var level = CountLeading(line, '=');
        if (level == 0 || level >= line.Length || !char.IsWhiteSpace(line[level]))
        {
            return false;
        }

        // The line is already right-trimmed, so whitespace after the marker implies title text follows.
        heading = new string('#', Math.Clamp(level, 1, 6)) + " " + ConvertInline(line[level..].Trim());
        return true;
    }

    /// <summary>
    /// Converts a list item (<c>* item</c>, <c>- item</c>, <c>. item</c>) to a Markdown list item.
    /// Repeated <c>*</c> or <c>.</c> markers denote nesting and are rendered as indentation.
    /// </summary>
    /// <remarks>
    /// The marker must be followed by whitespace. Lines such as <c>*bold* text</c>, <c>.Block title</c>
    /// or <c>----</c> are therefore not list items and fall through to the caller.
    /// </remarks>
    private static bool TryConvertListItem(string line, out string listItem)
    {
        listItem = string.Empty;

        var markerChar = line[0];
        if (markerChar != '*' && markerChar != '-' && markerChar != '.')
        {
            return false;
        }

        var depth = CountLeading(line, markerChar);
        if (depth >= line.Length || !char.IsWhiteSpace(line[depth]))
        {
            return false;
        }

        // Only '*' and '.' repeat to express nesting; "-- text" is not a list marker.
        if (markerChar == '-' && depth != 1)
        {
            return false;
        }

        var marker = markerChar == '.' ? "1." : "-";

        // Indent nested items by the width of the parent marker plus its trailing space.
        var indent = new string(' ', (depth - 1) * (marker.Length + 1));
        listItem = indent + marker + " " + ConvertInline(line[depth..].Trim());
        return true;
    }

    /// <summary>
    /// Rewrites constrained AsciiDoc emphasis (<c>*bold*</c>, <c>_italic_</c>) to Markdown
    /// (<c>**bold**</c>, <c>*italic*</c>). Backtick code spans use the same syntax in both
    /// formats and are copied through unchanged.
    /// </summary>
    private static string ConvertInline(string text)
    {
        var converted = Bold.Replace(text, m => WrapEmphasis(m, "**"));
        return Italic.Replace(converted, m => WrapEmphasis(m, "*"));
    }

    /// <summary>
    /// Wraps the captured emphasis text in <paramref name="delimiter"/>; when the match is a
    /// code span (no "text" capture) the original match is returned verbatim.
    /// </summary>
    private static string WrapEmphasis(Match match, string delimiter)
    {
        var text = match.Groups["text"];
        return text.Success ? delimiter + text.Value + delimiter : match.Value;
    }

    /// <summary>Counts how many times <paramref name="marker"/> repeats at the start of <paramref name="line"/>.</summary>
    private static int CountLeading(string line, char marker)
    {
        var count = 0;
        while (count < line.Length && line[count] == marker)
        {
            count++;
        }

        return count;
    }
}
Loading