From e797d2141473c843f23dad07e02815702f55a7dd Mon Sep 17 00:00:00 2001 From: ksemenenko Date: Sun, 12 Oct 2025 14:06:39 +0200 Subject: [PATCH 1/3] tests: embed odt fixture --- README.md | 52 +-- docs/MetaMD.md | 45 +++ .../Converters/AsciiDocConverter.cs | 102 ++++++ src/MarkItDown/Converters/BibTexConverter.cs | 131 ++++++++ src/MarkItDown/Converters/CslJsonConverter.cs | 151 +++++++++ src/MarkItDown/Converters/CsvConverter.cs | 11 +- src/MarkItDown/Converters/DjotConverter.cs | 51 +++ src/MarkItDown/Converters/DocBookConverter.cs | 131 ++++++++ .../Converters/EndNoteXmlConverter.cs | 92 ++++++ src/MarkItDown/Converters/Fb2Converter.cs | 90 +++++ .../Converters/GraphvizConverter.cs | 54 +++ src/MarkItDown/Converters/JatsConverter.cs | 132 ++++++++ src/MarkItDown/Converters/LatexConverter.cs | 105 ++++++ src/MarkItDown/Converters/MermaidConverter.cs | 54 +++ src/MarkItDown/Converters/MetaMdConverter.cs | 248 ++++++++++++++ src/MarkItDown/Converters/OdtConverter.cs | 128 ++++++++ src/MarkItDown/Converters/OpmlConverter.cs | 76 +++++ src/MarkItDown/Converters/OrgConverter.cs | 97 ++++++ .../Converters/PlantUmlConverter.cs | 55 ++++ src/MarkItDown/Converters/RisConverter.cs | 146 +++++++++ src/MarkItDown/Converters/RstConverter.cs | 115 +++++++ src/MarkItDown/Converters/RtfConverter.cs | 184 +++++++++++ .../Converters/StructuredXmlConverterBase.cs | 98 ++++++ src/MarkItDown/Converters/TextileConverter.cs | 97 ++++++ src/MarkItDown/Converters/TikzConverter.cs | 53 +++ src/MarkItDown/Converters/TypstConverter.cs | 90 +++++ .../Converters/WikiMarkupConverter.cs | 103 ++++++ src/MarkItDown/MarkItDown.csproj | 2 +- .../{MarkItDown.cs => MarkItDownClient.cs} | 307 ++++++++++++++---- src/MarkItDown/MarkItDownDiagnostics.cs | 47 +++ src/MarkItDown/MarkItDownOptions.cs | 25 +- src/MarkItDown/MimeMapping.cs | 125 +++++-- src/MarkItDown/MimeTypeUtilities.cs | 5 + tests/MarkItDown.Tests/EmlConverterTests.cs | 2 +- .../MarkItDownClientTelemetryTests.cs | 158 +++++++++ 
.../MarkItDownIntegrationTests.cs | 42 +-- tests/MarkItDown.Tests/MarkItDownTests.cs | 12 +- tests/MarkItDown.Tests/NewConverterTests.cs | 2 +- tests/MarkItDown.Tests/NewConvertersTests.cs | 2 +- .../NewFormatsConverterTests.cs | 227 +++++++++++++ .../StreamInfoDetectionTests.cs | 6 +- tests/MarkItDown.Tests/TestCoveragePlan.md | 1 + tests/MarkItDown.Tests/TestFiles/sample.adoc | 6 + tests/MarkItDown.Tests/TestFiles/sample.bib | 7 + .../MarkItDown.Tests/TestFiles/sample.csljson | 13 + tests/MarkItDown.Tests/TestFiles/sample.dj | 3 + .../MarkItDown.Tests/TestFiles/sample.docbook | 12 + tests/MarkItDown.Tests/TestFiles/sample.dot | 3 + .../TestFiles/sample.endnote.xml | 26 ++ tests/MarkItDown.Tests/TestFiles/sample.fb2 | 16 + tests/MarkItDown.Tests/TestFiles/sample.jats | 16 + .../MarkItDown.Tests/TestFiles/sample.mermaid | 2 + .../MarkItDown.Tests/TestFiles/sample.metamd | 24 ++ tests/MarkItDown.Tests/TestFiles/sample.opml | 12 + tests/MarkItDown.Tests/TestFiles/sample.org | 5 + tests/MarkItDown.Tests/TestFiles/sample.puml | 3 + tests/MarkItDown.Tests/TestFiles/sample.ris | 8 + tests/MarkItDown.Tests/TestFiles/sample.rst | 7 + tests/MarkItDown.Tests/TestFiles/sample.rtf | 4 + tests/MarkItDown.Tests/TestFiles/sample.tex | 8 + .../MarkItDown.Tests/TestFiles/sample.textile | 4 + tests/MarkItDown.Tests/TestFiles/sample.tikz | 3 + tests/MarkItDown.Tests/TestFiles/sample.tsv | 3 + tests/MarkItDown.Tests/TestFiles/sample.typ | 3 + tests/MarkItDown.Tests/TestFiles/sample.wiki | 6 + 65 files changed, 3696 insertions(+), 152 deletions(-) create mode 100644 docs/MetaMD.md create mode 100644 src/MarkItDown/Converters/AsciiDocConverter.cs create mode 100644 src/MarkItDown/Converters/BibTexConverter.cs create mode 100644 src/MarkItDown/Converters/CslJsonConverter.cs create mode 100644 src/MarkItDown/Converters/DjotConverter.cs create mode 100644 src/MarkItDown/Converters/DocBookConverter.cs create mode 100644 src/MarkItDown/Converters/EndNoteXmlConverter.cs create mode 100644 
src/MarkItDown/Converters/Fb2Converter.cs create mode 100644 src/MarkItDown/Converters/GraphvizConverter.cs create mode 100644 src/MarkItDown/Converters/JatsConverter.cs create mode 100644 src/MarkItDown/Converters/LatexConverter.cs create mode 100644 src/MarkItDown/Converters/MermaidConverter.cs create mode 100644 src/MarkItDown/Converters/MetaMdConverter.cs create mode 100644 src/MarkItDown/Converters/OdtConverter.cs create mode 100644 src/MarkItDown/Converters/OpmlConverter.cs create mode 100644 src/MarkItDown/Converters/OrgConverter.cs create mode 100644 src/MarkItDown/Converters/PlantUmlConverter.cs create mode 100644 src/MarkItDown/Converters/RisConverter.cs create mode 100644 src/MarkItDown/Converters/RstConverter.cs create mode 100644 src/MarkItDown/Converters/RtfConverter.cs create mode 100644 src/MarkItDown/Converters/StructuredXmlConverterBase.cs create mode 100644 src/MarkItDown/Converters/TextileConverter.cs create mode 100644 src/MarkItDown/Converters/TikzConverter.cs create mode 100644 src/MarkItDown/Converters/TypstConverter.cs create mode 100644 src/MarkItDown/Converters/WikiMarkupConverter.cs rename src/MarkItDown/{MarkItDown.cs => MarkItDownClient.cs} (65%) create mode 100644 src/MarkItDown/MarkItDownDiagnostics.cs create mode 100644 tests/MarkItDown.Tests/MarkItDownClientTelemetryTests.cs create mode 100644 tests/MarkItDown.Tests/NewFormatsConverterTests.cs create mode 100644 tests/MarkItDown.Tests/TestFiles/sample.adoc create mode 100644 tests/MarkItDown.Tests/TestFiles/sample.bib create mode 100644 tests/MarkItDown.Tests/TestFiles/sample.csljson create mode 100644 tests/MarkItDown.Tests/TestFiles/sample.dj create mode 100644 tests/MarkItDown.Tests/TestFiles/sample.docbook create mode 100644 tests/MarkItDown.Tests/TestFiles/sample.dot create mode 100644 tests/MarkItDown.Tests/TestFiles/sample.endnote.xml create mode 100644 tests/MarkItDown.Tests/TestFiles/sample.fb2 create mode 100644 tests/MarkItDown.Tests/TestFiles/sample.jats create mode 
100644 tests/MarkItDown.Tests/TestFiles/sample.mermaid create mode 100644 tests/MarkItDown.Tests/TestFiles/sample.metamd create mode 100644 tests/MarkItDown.Tests/TestFiles/sample.opml create mode 100644 tests/MarkItDown.Tests/TestFiles/sample.org create mode 100644 tests/MarkItDown.Tests/TestFiles/sample.puml create mode 100644 tests/MarkItDown.Tests/TestFiles/sample.ris create mode 100644 tests/MarkItDown.Tests/TestFiles/sample.rst create mode 100644 tests/MarkItDown.Tests/TestFiles/sample.rtf create mode 100644 tests/MarkItDown.Tests/TestFiles/sample.tex create mode 100644 tests/MarkItDown.Tests/TestFiles/sample.textile create mode 100644 tests/MarkItDown.Tests/TestFiles/sample.tikz create mode 100644 tests/MarkItDown.Tests/TestFiles/sample.tsv create mode 100644 tests/MarkItDown.Tests/TestFiles/sample.typ create mode 100644 tests/MarkItDown.Tests/TestFiles/sample.wiki diff --git a/README.md b/README.md index 6a79bdd2c..550a72b55 100644 --- a/README.md +++ b/README.md @@ -199,7 +199,7 @@ dotnet add package ManagedCode.MarkItDown using MarkItDown; // Create converter instance -var markItDown = new MarkItDown(); +var markItDown = new MarkItDownClient(); // Convert any file to Markdown var result = await markItDown.ConvertAsync("document.pdf"); @@ -218,7 +218,7 @@ using Microsoft.Extensions.Logging; // Set up logging to track conversion progress using var loggerFactory = LoggerFactory.Create(builder => builder.AddConsole()); var logger = loggerFactory.CreateLogger(); -var markItDown = new MarkItDown(logger: logger); +var markItDown = new MarkItDownClient(logger: logger); // Convert documents for vector database ingestion string[] documents = { "report.pdf", "data.xlsx", "webpage.html" }; @@ -245,7 +245,7 @@ foreach (var doc in documents) ```csharp using MarkItDown; -var markItDown = new MarkItDown(); +var markItDown = new MarkItDownClient(); var emailFolder = @"C:\Emails\Exports"; var outputFolder = @"C:\ProcessedEmails"; @@ -271,7 +271,7 @@ using 
Microsoft.Extensions.Logging; using var loggerFactory = LoggerFactory.Create(builder => builder.AddConsole()); using var httpClient = new HttpClient(); -var markItDown = new MarkItDown( +var markItDown = new MarkItDownClient( logger: loggerFactory.CreateLogger(), httpClient: httpClient); @@ -310,7 +310,7 @@ foreach (var url in urls) using MarkItDown; // Convert a DOCX file and print the Markdown -var markItDown = new MarkItDown(); +var markItDown = new MarkItDownClient(); DocumentConverterResult result = await markItDown.ConvertAsync("report.docx"); Console.WriteLine(result.Markdown); ``` @@ -329,7 +329,7 @@ var streamInfo = new StreamInfo( charset: Encoding.UTF8, fileName: "invoice.html"); -var markItDown = new MarkItDown(); +var markItDown = new MarkItDownClient(); var result = await markItDown.ConvertAsync(stream, streamInfo); Console.WriteLine(result.Title); ``` @@ -340,7 +340,7 @@ Console.WriteLine(result.Title); using MarkItDown; // Convert an EML file to Markdown -var markItDown = new MarkItDown(); +var markItDown = new MarkItDownClient(); DocumentConverterResult result = await markItDown.ConvertAsync("message.eml"); // The result includes email headers and content @@ -369,7 +369,7 @@ using Microsoft.Extensions.Logging; using var loggerFactory = LoggerFactory.Create(static builder => builder.AddConsole()); using var httpClient = new HttpClient(); -var markItDown = new MarkItDown( +var markItDown = new MarkItDownClient( logger: loggerFactory.CreateLogger(), httpClient: httpClient); @@ -462,7 +462,7 @@ var options = new MarkItDownOptions } }; -var markItDown = new MarkItDown(options); +var markItDown = new MarkItDownClient(options); // Segments are still available programmatically even when annotations are disabled. 
``` @@ -494,7 +494,7 @@ public sealed class MyCustomConverter : IDocumentConverter } } -var markItDown = new MarkItDown(); +var markItDown = new MarkItDownClient(); markItDown.RegisterConverter(new MyCustomConverter()); ``` @@ -546,7 +546,7 @@ public class DocumentProcessor public DocumentProcessor(ILogger logger) { _logger = logger; - _markItDown = new MarkItDown(logger: logger); + _markItDown = new MarkItDownClient(logger: logger); } public async Task> ProcessDirectoryAsync( @@ -611,7 +611,7 @@ public class DocumentIndexer public DocumentIndexer(IVectorStore vectorStore) { _vectorStore = vectorStore; - _markItDown = new MarkItDown(); + _markItDown = new MarkItDownClient(); } public async Task IndexDocumentAsync(string filePath) where T : class @@ -690,7 +690,7 @@ public class DocumentConversionFunction public DocumentConversionFunction(ILogger logger) { _logger = logger; - _markItDown = new MarkItDown(logger: logger); + _markItDown = new MarkItDownClient(logger: logger); } [Function("ConvertDocument")] @@ -812,6 +812,8 @@ MarkItDown exposes optional abstractions for running documents through cloud ser The `AzureIntelligenceOptions`, `GoogleIntelligenceOptions`, and `AwsIntelligenceOptions` helpers wire the respective cloud Document AI/Vision/Speech stacks without forcing the dependency on consumers. You can still bring your own implementation by assigning the provider interfaces directly on `MarkItDownOptions`. +`MarkItDownClient` emits structured `ILogger` events and OpenTelemetry spans by default. Toggle instrumentation with `MarkItDownOptions.EnableTelemetry`, supply a custom `ActivitySource`/`Meter`, or provide a `LoggerFactory` to integrate with your application's logging pipeline. 
+ #### Azure AI setup (keys and managed identity) - **Docs**: [Document Intelligence](https://learn.microsoft.com/azure/ai-services/document-intelligence/), [Computer Vision Image Analysis](https://learn.microsoft.com/azure/ai-services/computer-vision/overview-image-analysis), [Video Indexer authentication](https://learn.microsoft.com/azure/azure-video-indexer/video-indexer-get-started/connect-to-azure). @@ -943,7 +945,7 @@ For LLM-style post-processing, assign `MarkItDownOptions.AiModels` with an `IAiM ```csharp using MarkItDown; -var markItDown = new MarkItDown(); +var markItDown = new MarkItDownClient(); try { @@ -1010,7 +1012,7 @@ using var httpClient = new HttpClient(); httpClient.Timeout = TimeSpan.FromSeconds(30); httpClient.DefaultRequestHeaders.Add("User-Agent", "MarkItDown/1.0"); -var markItDown = new MarkItDown(httpClient: httpClient); +var markItDown = new MarkItDownClient(httpClient: httpClient); ``` **Logging for Diagnostics:** @@ -1021,7 +1023,7 @@ using var loggerFactory = LoggerFactory.Create(builder => builder.AddConsole().SetMinimumLevel(LogLevel.Debug)); var logger = loggerFactory.CreateLogger(); -var markItDown = new MarkItDown(logger: logger); +var markItDown = new MarkItDownClient(logger: logger); // Now you'll see detailed conversion progress in console output ``` @@ -1034,7 +1036,7 @@ If you're familiar with the original Python library, here are the key difference | Python | C#/.NET | Notes | |---------|---------|--------| -| `MarkItDown()` | `new MarkItDown()` | Similar constructor | +| `MarkItDown()` | `new MarkItDownClient()` | Similar constructor | | `markitdown.convert("file.pdf")` | `await markItDown.ConvertAsync("file.pdf")` | Async pattern | | `markitdown.convert(stream, file_extension=".pdf")` | `await markItDown.ConvertAsync(stream, streamInfo)` | StreamInfo object | | `markitdown.convert_url("https://...")` | `await markItDown.ConvertFromUrlAsync("https://...")` | Async URL conversion | @@ -1046,7 +1048,7 @@ If you're 
familiar with the original Python library, here are the key difference ```python # Python version import markitdown -md = markitdown.MarkItDown() +md = markitdown.MarkItDown() result = md.convert("document.pdf") print(result.text_content) ``` @@ -1054,7 +1056,7 @@ print(result.text_content) ```csharp // C# version using MarkItDown; -var markItDown = new MarkItDown(); +var markItDown = new MarkItDownClient(); var result = await markItDown.ConvertAsync("document.pdf"); Console.WriteLine(result.Markdown); ``` @@ -1170,7 +1172,7 @@ Performance will vary based on your specific documents and environment. For prod ```csharp // 1. Reuse MarkItDown instances (they're thread-safe) -var markItDown = new MarkItDown(); +var markItDown = new MarkItDownClient(); await Task.WhenAll( markItDown.ConvertAsync("file1.pdf"), markItDown.ConvertAsync("file2.docx"), @@ -1183,7 +1185,7 @@ var result = await markItDown.ConvertAsync("large-file.pdf", cancellationToken: // 3. Configure HttpClient for web content (reuse connections) using var httpClient = new HttpClient(); -var markItDown = new MarkItDown(httpClient: httpClient); +var markItDown = new MarkItDownClient(httpClient: httpClient); // 4. 
Pre-specify StreamInfo to skip format detection var streamInfo = new StreamInfo(mimeType: "application/pdf", extension: ".pdf"); @@ -1202,7 +1204,7 @@ var options = new MarkItDownOptions ExifToolPath = "/usr/local/bin/exiftool" // Path to exiftool binary (optional) }; -var markItDown = new MarkItDown(options); +var markItDown = new MarkItDownClient(options); ``` ### Advanced AI Integration @@ -1233,7 +1235,7 @@ var options = new MarkItDownOptions } }; -var markItDown = new MarkItDown(options); +var markItDown = new MarkItDownClient(options); ``` ### Conversion Middleware & Raw Artifacts @@ -1250,7 +1252,7 @@ var options = new MarkItDownOptions } }; -var markItDown = new MarkItDown(options); +var markItDown = new MarkItDownClient(options); var result = await markItDown.ConvertAsync("docs/diagram.docx"); foreach (var image in result.Artifacts.Images) @@ -1294,7 +1296,7 @@ var options = new MarkItDownOptions } }; -var markItDown = new MarkItDown(options, logger, httpClientFactory.CreateClient()); +var markItDown = new MarkItDownClient(options, logger, httpClientFactory.CreateClient()); ``` ## 📄 License diff --git a/docs/MetaMD.md b/docs/MetaMD.md new file mode 100644 index 000000000..ad86f0c7a --- /dev/null +++ b/docs/MetaMD.md @@ -0,0 +1,45 @@ +# MetaMD (MMD) + +MetaMD is a Markdown profile that layers structured metadata and citation-aware rendering on top of CommonMark. Files typically use the `.metamd` extension (optionally `.metamd.md`) and begin with a JSON front matter block delimited by `+++` fences. + +## Front Matter Schema + +```json +{ + "title": "Document title", + "abstract": "Optional abstract text.", + "contributors": ["Name", "Name"], + "affiliations": ["Organisation"], + "keywords": ["term", "term"], + "references": [ + { + "id": "unique-id", + "title": "Reference title", + "authors": ["Author"], + "url": "https://example.com/reference" + } + ] +} +``` + +All properties are optional. Unknown properties are ignored by the converter. 
+ +## Reference Syntax + +Inline citations use `[@id]`. During conversion each citation is replaced with a Markdown link if a URL is present, or bold text when the reference has no URL. Referenced entries are collected and emitted in a `## References` section at the end of the document, preserving author lists and links. + +## Diagram Blocks + +MetaMD supports lightweight diagram embedding via custom blocks: + +``` +:::diagram type="mermaid" + +::: +``` + +The converter rewrites these blocks as fenced code blocks using the requested diagram type (e.g., `mermaid`, `dot`, `plantuml`). + +## Compatibility + +Because MetaMD is a superset of Markdown, downstream tools that do not recognise the front matter or diagram directives still render the body content. The .NET converter automatically recognises `.metamd` and `.metamd.md` files, extracts metadata into headings, and normalises references for consistent Markdown output. diff --git a/src/MarkItDown/Converters/AsciiDocConverter.cs b/src/MarkItDown/Converters/AsciiDocConverter.cs new file mode 100644 index 000000000..ba2ea309d --- /dev/null +++ b/src/MarkItDown/Converters/AsciiDocConverter.cs @@ -0,0 +1,102 @@ +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Text; +using System.Text.RegularExpressions; +using System.Threading; +using System.Threading.Tasks; +using ManagedCode.MimeTypes; + +namespace MarkItDown.Converters; + +/// +/// Converter for AsciiDoc documents. +/// +public sealed class AsciiDocConverter : IDocumentConverter +{ + private static readonly IReadOnlyCollection Extensions = new[] + { + ".adoc", + ".asciidoc", + }; + + private static readonly IReadOnlyCollection MimeTypes = new[] + { + MimeHelper.GetMimeType(".adoc") ?? MimeTypeUtilities.Compose("text", "asciidoc"), + MimeHelper.GetMimeType(".asciidoc") ?? 
MimeTypeUtilities.Compose("text", "asciidoc"), + }; + + private static readonly Regex Bold = new("\\*(?[^*]+)\\*", RegexOptions.Compiled); + private static readonly Regex Italic = new("_(?[^_]+)_", RegexOptions.Compiled); + private static readonly Regex Monospace = new("`(?[^`]+)`", RegexOptions.Compiled); + + public int Priority => 160; + + public bool AcceptsInput(StreamInfo streamInfo) + { + var normalizedMime = MimeTypeUtilities.NormalizeMime(streamInfo); + var extension = streamInfo.Extension?.ToLowerInvariant(); + if (extension is not null && Extensions.Contains(extension)) + { + return true; + } + + return MimeTypeUtilities.MatchesAny(normalizedMime, MimeTypes) + || MimeTypeUtilities.MatchesAny(streamInfo.MimeType, MimeTypes); + } + + public bool Accepts(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken = default) => AcceptsInput(streamInfo); + + public async Task ConvertAsync(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken = default) + { + using var reader = new StreamReader(stream, Encoding.UTF8, detectEncodingFromByteOrderMarks: true, leaveOpen: true); + var content = await reader.ReadToEndAsync(cancellationToken).ConfigureAwait(false); + var markdown = ConvertToMarkdown(content); + return new DocumentConverterResult(markdown, streamInfo.FileName); + } + + private static string ConvertToMarkdown(string adoc) + { + var lines = adoc.Replace("\r\n", "\n").Split('\n'); + var builder = new StringBuilder(); + foreach (var line in lines) + { + var trimmed = line.TrimEnd(); + if (string.IsNullOrWhiteSpace(trimmed)) + { + builder.AppendLine(); + continue; + } + + if (trimmed.StartsWith("= ")) + { + var level = trimmed.TakeWhile(c => c == '=').Count(); + builder.AppendLine(new string('#', Math.Clamp(level, 1, 6)) + " " + trimmed[level..].Trim()); + continue; + } + + if (trimmed.StartsWith("==")) + { + var level = trimmed.TakeWhile(c => c == '=').Count(); + builder.AppendLine(new string('#', Math.Clamp(level, 1, 6)) 
+ " " + trimmed[level..].Trim()); + continue; + } + + if (trimmed.StartsWith("*") || trimmed.StartsWith("-") || trimmed.StartsWith(".")) + { + var marker = trimmed[0] == '.' ? "1." : "-"; + builder.AppendLine(marker + " " + trimmed[1..].Trim()); + continue; + } + + var converted = Bold.Replace(trimmed, m => "**" + m.Groups["text"].Value + "**"); + converted = Italic.Replace(converted, m => "*" + m.Groups["text"].Value + "*"); + converted = Monospace.Replace(converted, m => "`" + m.Groups["text"].Value + "`"); + + builder.AppendLine(converted); + } + + return builder.ToString().Trim(); + } +} diff --git a/src/MarkItDown/Converters/BibTexConverter.cs b/src/MarkItDown/Converters/BibTexConverter.cs new file mode 100644 index 000000000..3c34a91ab --- /dev/null +++ b/src/MarkItDown/Converters/BibTexConverter.cs @@ -0,0 +1,131 @@ +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Text; +using System.Text.RegularExpressions; +using System.Threading; +using System.Threading.Tasks; +using ManagedCode.MimeTypes; + +namespace MarkItDown.Converters; + +/// +/// Converter for BibTeX bibliographies. +/// +public sealed class BibTexConverter : IDocumentConverter +{ + private static readonly IReadOnlyCollection Extensions = new[] + { + ".bib", + ".bibtex", + }; + + private static readonly IReadOnlyCollection MimeTypes = new[] + { + MimeHelper.GetMimeType(".bib") ?? MimeTypeUtilities.Compose("text", "x-bibtex"), + MimeHelper.GetMimeType(".bibtex") ?? 
MimeTypeUtilities.Compose("text", "x-bibtex"), + }; + + private static readonly Regex EntryRegex = new("@(?\\w+)\\s*\\{\\s*(?[^,]+),(?.*?)\\}\\s*(?=@|\\z)", RegexOptions.Singleline | RegexOptions.Compiled); + private static readonly Regex FieldRegex = new("(?[A-Za-z]+)\\s*=\\s*(\\{(?[^{}]*)\\}|\"(?[^\"]*)\"|(?[^,]+))", RegexOptions.Singleline | RegexOptions.Compiled); + + public int Priority => 140; + + public bool AcceptsInput(StreamInfo streamInfo) + { + var normalizedMime = MimeTypeUtilities.NormalizeMime(streamInfo); + var extension = streamInfo.Extension?.ToLowerInvariant(); + if (extension is not null && Extensions.Contains(extension)) + { + return true; + } + + return MimeTypeUtilities.MatchesAny(normalizedMime, MimeTypes) + || MimeTypeUtilities.MatchesAny(streamInfo.MimeType, MimeTypes); + } + + public bool Accepts(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken = default) => AcceptsInput(streamInfo); + + public async Task ConvertAsync(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken = default) + { + using var reader = new StreamReader(stream, Encoding.UTF8, detectEncodingFromByteOrderMarks: true, leaveOpen: true); + var content = await reader.ReadToEndAsync(cancellationToken).ConfigureAwait(false); + return new DocumentConverterResult(RenderMarkdown(content), streamInfo.FileName); + } + + private static string RenderMarkdown(string bibtex) + { + var builder = new StringBuilder(); + foreach (Match match in EntryRegex.Matches(bibtex)) + { + var type = match.Groups["type"].Value; + var key = match.Groups["key"].Value; + var fields = ParseFields(match.Groups["fields"].Value); + var title = fields.TryGetValue("title", out var rawTitle) ? rawTitle : key; + var authors = fields.TryGetValue("author", out var rawAuthors) ? FormatAuthors(rawAuthors) : null; + var year = fields.TryGetValue("year", out var rawYear) ? rawYear : null; + var venue = fields.TryGetValue("journal", out var rawJournal) ? 
rawJournal : fields.TryGetValue("booktitle", out var rawBookTitle) ? rawBookTitle : null; + var url = fields.TryGetValue("url", out var rawUrl) ? rawUrl : null; + + var lineBuilder = new StringBuilder(); + lineBuilder.Append("- **"); + lineBuilder.Append(title.Trim('{', '}', '"')); + lineBuilder.Append("**"); + + if (!string.IsNullOrEmpty(authors)) + { + lineBuilder.Append(" — "); + lineBuilder.Append(authors); + } + + if (!string.IsNullOrEmpty(venue)) + { + lineBuilder.Append(", "); + lineBuilder.Append(venue); + } + + if (!string.IsNullOrEmpty(year)) + { + lineBuilder.Append(" (" + year + ")"); + } + + if (!string.IsNullOrEmpty(url)) + { + lineBuilder.Append($" [link]({url})"); + } + + lineBuilder.Append(" — ``"); + lineBuilder.Append(type); + lineBuilder.Append("``"); + + builder.AppendLine(lineBuilder.ToString()); + } + + return builder.ToString().Trim(); + } + + private static Dictionary ParseFields(string body) + { + var result = new Dictionary(StringComparer.OrdinalIgnoreCase); + foreach (Match field in FieldRegex.Matches(body)) + { + var name = field.Groups["name"].Value; + var value = field.Groups["value"].Success ? 
field.Groups["value"].Value : field.Groups["bare"].Value; + if (!string.IsNullOrWhiteSpace(name)) + { + result[name.Trim()] = value.Trim(); + } + } + + return result; + } + + private static string FormatAuthors(string rawAuthors) + { + var authors = rawAuthors.Split(new[] { " and " }, StringSplitOptions.RemoveEmptyEntries) + .Select(a => a.Trim('{', '}', ' ', '\t', '\n', '\r')) + .ToArray(); + return string.Join(", ", authors); + } +} diff --git a/src/MarkItDown/Converters/CslJsonConverter.cs b/src/MarkItDown/Converters/CslJsonConverter.cs new file mode 100644 index 000000000..035537c6d --- /dev/null +++ b/src/MarkItDown/Converters/CslJsonConverter.cs @@ -0,0 +1,151 @@ +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Text; +using System.Text.Json; +using System.Threading; +using System.Threading.Tasks; +using ManagedCode.MimeTypes; + +namespace MarkItDown.Converters; + +/// +/// Converter for CSL JSON bibliographies. +/// +public sealed class CslJsonConverter : IDocumentConverter +{ + private static readonly IReadOnlyCollection Extensions = new[] + { + ".csljson", + }; + + private static readonly string DefaultCslMime = MimeHelper.GetMimeType(".csljson") ?? string.Empty; + + private static readonly IReadOnlyCollection MimeTypes = new[] + { + string.IsNullOrWhiteSpace(DefaultCslMime) || DefaultCslMime.StartsWith(MimeHelper.JSON, StringComparison.OrdinalIgnoreCase) + ? 
MimeTypeUtilities.Compose(MimeHelper.APPLICATION, "vnd.citationstyles.csl+json") + : DefaultCslMime, + MimeTypeUtilities.Compose(MimeHelper.APPLICATION, "vnd.citationstyles.csl+json"), + MimeTypeUtilities.Compose(MimeHelper.APPLICATION, "citeproc+json"), + }; + + public int Priority => 138; + + public bool AcceptsInput(StreamInfo streamInfo) + { + var normalizedMime = MimeTypeUtilities.NormalizeMime(streamInfo); + var extension = streamInfo.Extension?.ToLowerInvariant(); + if (extension is not null && Extensions.Contains(extension)) + { + return true; + } + + return MimeTypeUtilities.MatchesAny(normalizedMime, MimeTypes) + || MimeTypeUtilities.MatchesAny(streamInfo.MimeType, MimeTypes); + } + + public bool Accepts(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken = default) => AcceptsInput(streamInfo); + + public async Task ConvertAsync(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken = default) + { + using var reader = new StreamReader(stream, Encoding.UTF8, detectEncodingFromByteOrderMarks: true, leaveOpen: true); + var content = await reader.ReadToEndAsync(cancellationToken).ConfigureAwait(false); + return new DocumentConverterResult(RenderMarkdown(content), streamInfo.FileName); + } + + private static string RenderMarkdown(string json) + { + using var document = JsonDocument.Parse(json); + var builder = new StringBuilder(); + + if (document.RootElement.ValueKind == JsonValueKind.Array) + { + foreach (var element in document.RootElement.EnumerateArray()) + { + builder.AppendLine(RenderEntry(element)); + } + } + else if (document.RootElement.ValueKind == JsonValueKind.Object) + { + builder.AppendLine(RenderEntry(document.RootElement)); + } + + return builder.ToString().Trim(); + } + + private static string RenderEntry(JsonElement element) + { + var title = element.TryGetProperty("title", out var titleProperty) + ? titleProperty.GetString() + : element.TryGetProperty("id", out var idProperty) ? 
idProperty.GetString() : "Untitled"; + var year = element.TryGetProperty("issued", out var issued) + && issued.TryGetProperty("date-parts", out var dateParts) + && dateParts.ValueKind == JsonValueKind.Array + && dateParts.GetArrayLength() > 0 + && dateParts[0].ValueKind == JsonValueKind.Array + && dateParts[0].GetArrayLength() > 0 + ? dateParts[0][0].GetRawText() + : null; + var container = element.TryGetProperty("container-title", out var containerTitle) + ? containerTitle.GetString() + : null; + var url = element.TryGetProperty("URL", out var urlProperty) + ? urlProperty.GetString() + : element.TryGetProperty("DOI", out var doiProperty) ? "https://doi.org/" + doiProperty.GetString() : null; + + var authors = new List(); + if (element.TryGetProperty("author", out var authorArray) && authorArray.ValueKind == JsonValueKind.Array) + { + foreach (var author in authorArray.EnumerateArray()) + { + var parts = new List(); + if (author.TryGetProperty("given", out var given)) + { + parts.Add(given.GetString() ?? string.Empty); + } + + if (author.TryGetProperty("family", out var family)) + { + parts.Add(family.GetString() ?? 
string.Empty); + } + + var formatted = string.Join(" ", parts.Where(p => !string.IsNullOrWhiteSpace(p)).Select(p => p.Trim())); + if (!string.IsNullOrWhiteSpace(formatted)) + { + authors.Add(formatted); + } + } + } + + var lineBuilder = new StringBuilder(); + lineBuilder.Append("- **"); + lineBuilder.Append(title); + lineBuilder.Append("**"); + + if (authors.Count > 0) + { + lineBuilder.Append(" — "); + lineBuilder.Append(string.Join(", ", authors)); + } + + if (!string.IsNullOrWhiteSpace(container)) + { + lineBuilder.Append(", "); + lineBuilder.Append(container); + } + + if (!string.IsNullOrWhiteSpace(year)) + { + lineBuilder.Append(" (" + year + ")"); + } + + if (!string.IsNullOrWhiteSpace(url)) + { + lineBuilder.Append($" [link]({url})"); + } + + return lineBuilder.ToString(); + } +} diff --git a/src/MarkItDown/Converters/CsvConverter.cs b/src/MarkItDown/Converters/CsvConverter.cs index 08057f83b..df853275d 100644 --- a/src/MarkItDown/Converters/CsvConverter.cs +++ b/src/MarkItDown/Converters/CsvConverter.cs @@ -16,12 +16,15 @@ public sealed class CsvConverter : IDocumentConverter { private static readonly HashSet AcceptedExtensions = new(StringComparer.OrdinalIgnoreCase) { - ".csv" + ".csv", + ".tsv", + ".tab", }; private static readonly IReadOnlyCollection AcceptedMimeTypePrefixes = new List { MimeHelper.CSV, + MimeTypeUtilities.WithSubtype(MimeHelper.CSV, "tab-separated-values"), MimeTypeUtilities.WithType(MimeHelper.CSV, "application"), }; @@ -53,11 +56,17 @@ public async Task ConvertAsync(Stream stream, StreamInf stream.Position = 0; using var reader = new StreamReader(stream, streamInfo.Charset ?? 
Encoding.UTF8, detectEncodingFromByteOrderMarks: true, leaveOpen: true); + var normalizedMime = MimeTypeUtilities.NormalizeMime(streamInfo); + var isTabSeparated = string.Equals(streamInfo.Extension, ".tsv", StringComparison.OrdinalIgnoreCase) + || string.Equals(streamInfo.Extension, ".tab", StringComparison.OrdinalIgnoreCase) + || string.Equals(normalizedMime, MimeTypeUtilities.WithSubtype(MimeHelper.CSV, "tab-separated-values"), StringComparison.OrdinalIgnoreCase); + using var sepReader = await Sep.Reader(options => options with { HasHeader = true, Unescape = true, Trim = SepTrim.All, + Sep = new Sep(isTabSeparated ? '\t' : ','), }).FromAsync(reader, cancellationToken).ConfigureAwait(false); var rows = new List(); diff --git a/src/MarkItDown/Converters/DjotConverter.cs b/src/MarkItDown/Converters/DjotConverter.cs new file mode 100644 index 000000000..34e8b3e06 --- /dev/null +++ b/src/MarkItDown/Converters/DjotConverter.cs @@ -0,0 +1,51 @@ +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Text; +using System.Threading; +using System.Threading.Tasks; +using ManagedCode.MimeTypes; + +namespace MarkItDown.Converters; + +/// +/// Minimal converter for Djot documents, treated as Markdown-compatible text. +/// +public sealed class DjotConverter : IDocumentConverter +{ + private static readonly IReadOnlyCollection Extensions = new[] + { + ".dj", + ".djot", + }; + + private static readonly IReadOnlyCollection MimeTypes = new[] + { + MimeHelper.GetMimeType(".dj") ?? MimeTypeUtilities.Compose("text", "djot"), + MimeHelper.GetMimeType(".djot") ?? 
MimeTypeUtilities.Compose("text", "djot"), + }; + + public int Priority => 154; + + public bool AcceptsInput(StreamInfo streamInfo) + { + var normalizedMime = MimeTypeUtilities.NormalizeMime(streamInfo); + var extension = streamInfo.Extension?.ToLowerInvariant(); + if (extension is not null && Extensions.Contains(extension)) + { + return true; + } + + return MimeTypeUtilities.MatchesAny(normalizedMime, MimeTypes) + || MimeTypeUtilities.MatchesAny(streamInfo.MimeType, MimeTypes); + } + + public bool Accepts(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken = default) => AcceptsInput(streamInfo); + + public async Task ConvertAsync(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken = default) + { + using var reader = new StreamReader(stream, Encoding.UTF8, detectEncodingFromByteOrderMarks: true, leaveOpen: true); + var content = await reader.ReadToEndAsync(cancellationToken).ConfigureAwait(false); + return new DocumentConverterResult(content.Trim(), streamInfo.FileName); + } +} diff --git a/src/MarkItDown/Converters/DocBookConverter.cs b/src/MarkItDown/Converters/DocBookConverter.cs new file mode 100644 index 000000000..c28c3e211 --- /dev/null +++ b/src/MarkItDown/Converters/DocBookConverter.cs @@ -0,0 +1,131 @@ +using System; +using System.Collections.Generic; +using System.Text; +using System.Xml.Linq; +using ManagedCode.MimeTypes; + +namespace MarkItDown.Converters; + +/// +/// Converter for DocBook XML documents. +/// +public sealed class DocBookConverter : StructuredXmlConverterBase +{ + private static readonly IReadOnlyCollection Extensions = new[] + { + ".docbook", + ".dbk", + }; + + private static readonly string DocBookMime = MimeHelper.GetMimeType(".docbook") + ?? 
MimeTypeUtilities.WithSubtype(MimeHelper.XML, "docbook+xml"); + + private static readonly IReadOnlyCollection MimeTypes = new[] + { + DocBookMime, + MimeHelper.XML, + }; + + protected override IReadOnlyCollection AcceptedExtensions => Extensions; + + protected override IReadOnlyCollection AcceptedMimeTypes => MimeTypes; + + protected override string RenderMarkdown(XDocument document) + { + var builder = new StringBuilder(); + if (document.Root is null) + { + return string.Empty; + } + + RenderElement(document.Root, builder, 1); + return builder.ToString().Trim(); + } + + private static void RenderElement(XElement element, StringBuilder builder, int level) + { + var name = element.Name.LocalName.ToLowerInvariant(); + switch (name) + { + case "book": + case "article": + case "chapter": + case "section": + case "sect1": + case "sect2": + case "sect3": + case "sect4": + case "sect5": + foreach (var child in element.Elements()) + { + if (child.Name.LocalName.Equals("title", StringComparison.OrdinalIgnoreCase)) + { + var headingLevel = Math.Min(level, 6); + builder.AppendLine(new string('#', headingLevel) + " " + RenderTextNodes(child.Nodes())); + builder.AppendLine(); + } + else + { + RenderElement(child, builder, level + 1); + } + } + + break; + case "title": + case "subtitle": + builder.AppendLine(new string('#', Math.Min(level, 6)) + " " + RenderTextNodes(element.Nodes())); + builder.AppendLine(); + break; + case "para": + case "simpara": + case "p": + builder.AppendLine(RenderTextNodes(element.Nodes())); + builder.AppendLine(); + break; + case "itemizedlist": + case "orderedlist": + RenderList(element, builder, level, name == "itemizedlist" ? 
"-" : "1."); + break; + case "listitem": + builder.Append(RenderTextNodes(element.Nodes())); + break; + case "emphasis": + case "bold": + case "link": + builder.Append(RenderTextNodes(new[] { element })); + break; + default: + foreach (var child in element.Elements()) + { + RenderElement(child, builder, level); + } + + if (!element.HasElements) + { + var text = element.Value?.Trim(); + if (!string.IsNullOrEmpty(text)) + { + builder.AppendLine(text); + builder.AppendLine(); + } + } + + break; + } + } + + private static void RenderList(XElement element, StringBuilder builder, int level, string bullet) + { + var index = 0; + foreach (var item in element.Elements("listitem")) + { + var marker = bullet == "1." ? $"{++index}." : "-"; + builder.Append(new string(' ', (level - 1) * 2)); + builder.Append(marker); + builder.Append(' '); + builder.AppendLine(RenderTextNodes(item.Nodes())); + } + + builder.AppendLine(); + } +} diff --git a/src/MarkItDown/Converters/EndNoteXmlConverter.cs b/src/MarkItDown/Converters/EndNoteXmlConverter.cs new file mode 100644 index 000000000..be75154e4 --- /dev/null +++ b/src/MarkItDown/Converters/EndNoteXmlConverter.cs @@ -0,0 +1,92 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Xml.Linq; +using ManagedCode.MimeTypes; + +namespace MarkItDown.Converters; + +/// +/// Converter for EndNote XML exports. +/// +public sealed class EndNoteXmlConverter : StructuredXmlConverterBase +{ + private static readonly IReadOnlyCollection Extensions = new[] + { + ".enl", + ".endnote", + ".endnote.xml", + }; + + private static readonly string EndNoteMime = MimeHelper.GetMimeType(".enl") + ?? 
MimeTypeUtilities.WithSubtype(MimeHelper.XML, "endnote+xml"); + + private static readonly IReadOnlyCollection MimeTypes = new[] + { + EndNoteMime, + MimeHelper.XML, + }; + + protected override IReadOnlyCollection AcceptedExtensions => Extensions; + + protected override IReadOnlyCollection AcceptedMimeTypes => MimeTypes; + + protected override string RenderMarkdown(XDocument document) + { + if (document.Root is null) + { + return string.Empty; + } + + var ns = document.Root.GetDefaultNamespace(); + var records = document.Root.Elements(ns + "record"); + var builder = new StringBuilder(); + foreach (var record in records) + { + var title = record.Element(ns + "titles")?.Element(ns + "title")?.Value + ?? record.Element(ns + "titles")?.Element(ns + "secondary-title")?.Value + ?? "Untitled"; + var authors = record + .Element(ns + "contributors")? + .Elements() + .SelectMany(e => e.Elements(ns + "name")) + .Select(e => e.Value) + .Where(v => !string.IsNullOrWhiteSpace(v)) + .ToList() ?? new List(); + var year = record.Element(ns + "dates")?.Element(ns + "year")?.Value; + var journal = record.Element(ns + "periodical")?.Element(ns + "full-title")?.Value; + var url = record.Element(ns + "urls")?.Element(ns + "related-urls")?.Elements(ns + "url")?.FirstOrDefault()?.Value; + + builder.Append("- **"); + builder.Append(title.Trim()); + builder.Append("**"); + + if (authors.Count > 0) + { + builder.Append(" — "); + builder.Append(string.Join(", ", authors)); + } + + if (!string.IsNullOrWhiteSpace(journal)) + { + builder.Append(", "); + builder.Append(journal); + } + + if (!string.IsNullOrWhiteSpace(year)) + { + builder.Append(" (" + year + ")"); + } + + if (!string.IsNullOrWhiteSpace(url)) + { + builder.Append($" [link]({url})"); + } + + builder.AppendLine(); + } + + return builder.ToString().Trim(); + } +} diff --git a/src/MarkItDown/Converters/Fb2Converter.cs b/src/MarkItDown/Converters/Fb2Converter.cs new file mode 100644 index 000000000..86a754b24 --- /dev/null +++ 
b/src/MarkItDown/Converters/Fb2Converter.cs @@ -0,0 +1,90 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Xml.Linq; +using ManagedCode.MimeTypes; + +namespace MarkItDown.Converters; + +/// +/// Converter for FictionBook (FB2) XML documents. +/// +public sealed class Fb2Converter : StructuredXmlConverterBase +{ + private static readonly IReadOnlyCollection Extensions = new[] + { + ".fb2", + }; + + private static readonly string Fb2Mime = MimeHelper.GetMimeType(".fb2") + ?? MimeTypeUtilities.WithSubtype(MimeHelper.XML, "fb2+xml"); + + private static readonly IReadOnlyCollection MimeTypes = new[] + { + Fb2Mime, + MimeHelper.XML, + }; + + protected override IReadOnlyCollection AcceptedExtensions => Extensions; + + protected override IReadOnlyCollection AcceptedMimeTypes => MimeTypes; + + protected override string? ExtractTitle(XDocument document) + { + var ns = document.Root?.GetDefaultNamespace() ?? XNamespace.None; + return document.Root? + .Element(ns + "description")? + .Element(ns + "title-info")? + .Element(ns + "book-title")? 
+ .Value?.Trim(); + } + + protected override string RenderMarkdown(XDocument document) + { + if (document.Root is null) + { + return string.Empty; + } + + var builder = new StringBuilder(); + var ns = document.Root.GetDefaultNamespace(); + foreach (var body in document.Root.Elements(ns + "body")) + { + var title = body.Element(ns + "title")?.Elements(ns + "p").FirstOrDefault()?.Value; + if (!string.IsNullOrWhiteSpace(title)) + { + builder.AppendLine($"# {title.Trim()}"); + builder.AppendLine(); + } + + foreach (var section in body.Elements(ns + "section")) + { + RenderSection(section, builder, 2, ns); + } + } + + return builder.ToString().Trim(); + } + + private static void RenderSection(XElement section, StringBuilder builder, int level, XNamespace ns) + { + var title = section.Element(ns + "title")?.Elements(ns + "p").FirstOrDefault()?.Value; + if (!string.IsNullOrWhiteSpace(title)) + { + builder.AppendLine(new string('#', Math.Min(level, 6)) + " " + title.Trim()); + builder.AppendLine(); + } + + foreach (var paragraph in section.Elements(ns + "p")) + { + builder.AppendLine(RenderTextNodes(paragraph.Nodes())); + builder.AppendLine(); + } + + foreach (var subsection in section.Elements(ns + "section")) + { + RenderSection(subsection, builder, level + 1, ns); + } + } +} diff --git a/src/MarkItDown/Converters/GraphvizConverter.cs b/src/MarkItDown/Converters/GraphvizConverter.cs new file mode 100644 index 000000000..1201eb726 --- /dev/null +++ b/src/MarkItDown/Converters/GraphvizConverter.cs @@ -0,0 +1,54 @@ +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Text; +using System.Threading; +using System.Threading.Tasks; +using ManagedCode.MimeTypes; + +namespace MarkItDown.Converters; + +/// +/// Converter for Graphviz DOT diagrams. 
+/// +public sealed class GraphvizConverter : IDocumentConverter +{ + private static readonly IReadOnlyCollection Extensions = new[] + { + ".dot", + ".gv", + }; + + private static readonly IReadOnlyCollection MimeTypes = new[] + { + MimeHelper.GetMimeType(".dot") ?? MimeTypeUtilities.Compose("text", "vnd.graphviz"), + }; + + public int Priority => 129; + + public bool AcceptsInput(StreamInfo streamInfo) + { + var normalizedMime = MimeTypeUtilities.NormalizeMime(streamInfo); + var extension = streamInfo.Extension?.ToLowerInvariant(); + if (extension is not null && Extensions.Contains(extension)) + { + return true; + } + + return MimeTypeUtilities.MatchesAny(normalizedMime, MimeTypes) + || MimeTypeUtilities.MatchesAny(streamInfo.MimeType, MimeTypes); + } + + public bool Accepts(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken = default) => AcceptsInput(streamInfo); + + public async Task ConvertAsync(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken = default) + { + using var reader = new StreamReader(stream, Encoding.UTF8, detectEncodingFromByteOrderMarks: true, leaveOpen: true); + var content = await reader.ReadToEndAsync(cancellationToken).ConfigureAwait(false); + var fenced = new StringBuilder(); + fenced.AppendLine("```dot"); + fenced.AppendLine(content.Trim()); + fenced.AppendLine("```"); + return new DocumentConverterResult(fenced.ToString().Trim(), streamInfo.FileName); + } +} diff --git a/src/MarkItDown/Converters/JatsConverter.cs b/src/MarkItDown/Converters/JatsConverter.cs new file mode 100644 index 000000000..cdc010315 --- /dev/null +++ b/src/MarkItDown/Converters/JatsConverter.cs @@ -0,0 +1,132 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Xml.Linq; +using ManagedCode.MimeTypes; + +namespace MarkItDown.Converters; + +/// +/// Converter for JATS XML documents. 
+/// +public sealed class JatsConverter : StructuredXmlConverterBase +{ + private static readonly IReadOnlyCollection Extensions = new[] + { + ".jats", + ".bits", + }; + + private static readonly string JatsMime = MimeHelper.GetMimeType(".jats") + ?? MimeTypeUtilities.WithSubtype(MimeHelper.XML, "jats+xml"); + + private static readonly IReadOnlyCollection MimeTypes = new[] + { + JatsMime, + MimeHelper.XML, + }; + + protected override IReadOnlyCollection AcceptedExtensions => Extensions; + + protected override IReadOnlyCollection AcceptedMimeTypes => MimeTypes; + + protected override string RenderMarkdown(XDocument document) + { + if (document.Root is null) + { + return string.Empty; + } + + var builder = new StringBuilder(); + RenderElement(document.Root, builder, 1); + return builder.ToString().Trim(); + } + + protected override string? ExtractTitle(XDocument document) + { + var articleTitle = document + .Root? + .Descendants(document.Root.GetDefaultNamespace() + "article-title") + .FirstOrDefault(); + + return articleTitle?.Value?.Trim() ?? base.ExtractTitle(document); + } + + private static void RenderElement(XElement element, StringBuilder builder, int level) + { + var localName = element.Name.LocalName.ToLowerInvariant(); + switch (localName) + { + case "article": + case "front": + case "body": + case "sec": + foreach (var child in element.Elements()) + { + if (child.Name.LocalName.Equals("title", StringComparison.OrdinalIgnoreCase)) + { + var headingLevel = Math.Min(level, 6); + builder.AppendLine(new string('#', headingLevel) + " " + RenderTextNodes(child.Nodes())); + builder.AppendLine(); + } + else + { + var nextLevel = child.Name.LocalName.Equals("sec", StringComparison.OrdinalIgnoreCase) + ? 
level + 1 + : level; + RenderElement(child, builder, nextLevel); + } + } + + break; + case "title": + builder.AppendLine(new string('#', Math.Min(level, 6)) + " " + RenderTextNodes(element.Nodes())); + builder.AppendLine(); + break; + case "p": + case "para": + builder.AppendLine(RenderTextNodes(element.Nodes())); + builder.AppendLine(); + break; + case "list" when element.Attribute("list-type")?.Value == "bullet": + case "list" when element.Attribute("list-type")?.Value == "order": + case "list": + RenderList(element, builder, level); + break; + default: + foreach (var child in element.Elements()) + { + RenderElement(child, builder, level); + } + + if (!element.HasElements) + { + var value = element.Value?.Trim(); + if (!string.IsNullOrEmpty(value)) + { + builder.AppendLine(value); + builder.AppendLine(); + } + } + + break; + } + } + + private static void RenderList(XElement element, StringBuilder builder, int level) + { + var ordered = string.Equals(element.Attribute("list-type")?.Value, "order", StringComparison.OrdinalIgnoreCase); + var marker = ordered ? 1 : 0; + foreach (var item in element.Elements(element.Name.Namespace + "list-item")) + { + var prefix = ordered ? $"{marker++}." : "-"; + builder.Append(new string(' ', (level - 1) * 2)); + builder.Append(prefix); + builder.Append(' '); + builder.AppendLine(RenderTextNodes(item.Nodes())); + } + + builder.AppendLine(); + } +} diff --git a/src/MarkItDown/Converters/LatexConverter.cs b/src/MarkItDown/Converters/LatexConverter.cs new file mode 100644 index 000000000..48cfd5c1b --- /dev/null +++ b/src/MarkItDown/Converters/LatexConverter.cs @@ -0,0 +1,105 @@ +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Text; +using System.Text.RegularExpressions; +using System.Threading; +using System.Threading.Tasks; +using ManagedCode.MimeTypes; + +namespace MarkItDown.Converters; + +/// +/// Simplistic converter for LaTeX documents. 
+/// +public sealed class LatexConverter : IDocumentConverter +{ + private static readonly IReadOnlyCollection<string> Extensions = new[] + { + ".tex", + ".latex", + }; + + private static readonly IReadOnlyCollection<string> MimeTypes = new[] + { + MimeHelper.GetMimeType(".tex") ?? MimeTypeUtilities.Compose("text", "x-tex"), + MimeHelper.GetMimeType(".latex") ?? MimeTypeUtilities.Compose("text", "x-latex"), + MimeTypeUtilities.WithType(MimeHelper.XML, "x-tex"), + }; + + private static readonly Regex SectionRegex = new(@"\\(?<level>sub)*section\{(?<title>[^}]*)\}", RegexOptions.Compiled); + private static readonly Regex BoldRegex = new(@"\\textbf\{(?<text>[^}]*)\}", RegexOptions.Compiled); + private static readonly Regex EmphRegex = new(@"\\emph\{(?<text>[^}]*)\}", RegexOptions.Compiled); + private static readonly Regex TitleRegex = new(@"\\title\{(?<title>[^}]*)\}", RegexOptions.Compiled); + + public int Priority => 170; + + public bool AcceptsInput(StreamInfo streamInfo) + { + var normalizedMime = MimeTypeUtilities.NormalizeMime(streamInfo); + var extension = streamInfo.Extension?.ToLowerInvariant(); + if (extension is not null && Extensions.Contains(extension)) + { + return true; + } + + return MimeTypeUtilities.MatchesAny(normalizedMime, MimeTypes) + || MimeTypeUtilities.MatchesAny(streamInfo.MimeType, MimeTypes); + } + + public bool Accepts(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken = default) => AcceptsInput(streamInfo); + + public async Task<DocumentConverterResult> ConvertAsync(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken = default) + { + using var reader = new StreamReader(stream, Encoding.UTF8, detectEncodingFromByteOrderMarks: true, leaveOpen: true); + var content = await reader.ReadToEndAsync(cancellationToken).ConfigureAwait(false); + var markdown = ConvertToMarkdown(content); + var titleMatch = TitleRegex.Match(content); + var title = titleMatch.Success ?
titleMatch.Groups["title"].Value.Trim() : streamInfo.FileName; + return new DocumentConverterResult(markdown, title); + } + + private static string ConvertToMarkdown(string latex) + { + var normalized = latex.Replace("\r\n", "\n"); + normalized = TitleRegex.Replace(normalized, string.Empty); + normalized = SectionRegex.Replace(normalized, match => + { + var levelToken = match.Groups["level"].Value; + var level = string.IsNullOrEmpty(levelToken) ? 1 : levelToken.Length / 3 + 2; + level = Math.Clamp(level, 1, 6); + return "\n" + new string('#', level) + " " + match.Groups["title"].Value.Trim() + "\n"; + }); + + normalized = Regex.Replace(normalized, @"\\subsubsection\{([^}]*)\}", m => "\n#### " + m.Groups[1].Value.Trim() + "\n"); + normalized = BoldRegex.Replace(normalized, m => "**" + m.Groups["text"].Value + "**"); + normalized = EmphRegex.Replace(normalized, m => "*" + m.Groups["text"].Value + "*"); + normalized = Regex.Replace(normalized, @"\\begin\{itemize\}", "\n"); + normalized = Regex.Replace(normalized, @"\\end\{itemize\}", "\n"); + normalized = Regex.Replace(normalized, @"\\begin\{enumerate\}", "\n"); + normalized = Regex.Replace(normalized, @"\\end\{enumerate\}", "\n"); + normalized = Regex.Replace(normalized, @"\\item\s+", "\n- "); + normalized = Regex.Replace(normalized, @"\\\\", "\n"); + normalized = Regex.Replace(normalized, @"\\\[[^\]]*\\\]", string.Empty); + normalized = Regex.Replace(normalized, @"\\cite\{[^}]*\}", string.Empty); + normalized = Regex.Replace(normalized, @"\\[a-zA-Z]+", string.Empty); + + var lines = normalized.Split('\n'); + var builder = new StringBuilder(); + foreach (var line in lines) + { + var trimmed = line.TrimEnd(); + if (string.IsNullOrWhiteSpace(trimmed)) + { + builder.AppendLine(); + } + else + { + builder.AppendLine(trimmed); + } + } + + return builder.ToString().Trim(); + } +} diff --git a/src/MarkItDown/Converters/MermaidConverter.cs b/src/MarkItDown/Converters/MermaidConverter.cs new file mode 100644 index 
000000000..44a93c1a1 --- /dev/null +++ b/src/MarkItDown/Converters/MermaidConverter.cs @@ -0,0 +1,54 @@ +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Text; +using System.Threading; +using System.Threading.Tasks; +using ManagedCode.MimeTypes; + +namespace MarkItDown.Converters; + +/// <summary> +/// Converter for Mermaid diagram sources. +/// </summary> +public sealed class MermaidConverter : IDocumentConverter +{ + private static readonly IReadOnlyCollection<string> Extensions = new[] + { + ".mermaid", + ".mmd", + }; + + private static readonly IReadOnlyCollection<string> MimeTypes = new[] + { + MimeHelper.GetMimeType(".mermaid") ?? MimeTypeUtilities.Compose("text", "x-mermaid"), + }; + + public int Priority => 130; + + public bool AcceptsInput(StreamInfo streamInfo) + { + var normalizedMime = MimeTypeUtilities.NormalizeMime(streamInfo); + var extension = streamInfo.Extension?.ToLowerInvariant(); + if (extension is not null && Extensions.Contains(extension)) + { + return true; + } + + return MimeTypeUtilities.MatchesAny(normalizedMime, MimeTypes) + || MimeTypeUtilities.MatchesAny(streamInfo.MimeType, MimeTypes); + } + + public bool Accepts(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken = default) => AcceptsInput(streamInfo); + + public async Task<DocumentConverterResult> ConvertAsync(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken = default) + { + using var reader = new StreamReader(stream, Encoding.UTF8, detectEncodingFromByteOrderMarks: true, leaveOpen: true); + var content = await reader.ReadToEndAsync(cancellationToken).ConfigureAwait(false); + var fenced = new StringBuilder(); + fenced.AppendLine("```mermaid"); + fenced.AppendLine(content.Trim()); + fenced.AppendLine("```"); + return new DocumentConverterResult(fenced.ToString().Trim(), streamInfo.FileName); + } +} diff --git a/src/MarkItDown/Converters/MetaMdConverter.cs b/src/MarkItDown/Converters/MetaMdConverter.cs 
new file mode 100644 index 000000000..4d0f22a7d --- /dev/null +++ b/src/MarkItDown/Converters/MetaMdConverter.cs @@ -0,0 +1,248 @@ +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Text; +using System.Text.Json; +using System.Text.RegularExpressions; +using System.Threading; +using System.Threading.Tasks; +using ManagedCode.MimeTypes; + +namespace MarkItDown.Converters; + +/// <summary> +/// Converter for MetaMD documents that carry structured metadata and references. +/// </summary> +public sealed class MetaMdConverter : IDocumentConverter +{ + private static readonly IReadOnlyCollection<string> Extensions = new[] + { + ".metamd", + ".metamd.md", + }; + + private static readonly IReadOnlyCollection<string> MimeTypes = new[] + { + MimeHelper.GetMimeType(".metamd") ?? MimeTypeUtilities.Compose(MimeHelper.TEXT, "x-metamd"), + }; + + private static readonly Regex FrontMatterPattern = new("^\\+\\+\\+\\s*\\n(?<meta>.*?)(?:\\n\\+\\+\\+\\s*\\n)(?<body>[\\s\\S]*)$", RegexOptions.Compiled | RegexOptions.Singleline); + private static readonly Regex ReferencePattern = new("\\[@(?<id>[^\\]]+)\\]", RegexOptions.Compiled); + private static readonly Regex DiagramPattern = new(":::diagram\\s+type=\"(?<type>[^\"]+)\"\\s*\\n(?<content>[\\s\\S]*?)\\n:::", RegexOptions.Compiled); + + public int Priority => 145; + + public bool AcceptsInput(StreamInfo streamInfo) + { + var normalizedMime = MimeTypeUtilities.NormalizeMime(streamInfo); + if (!string.IsNullOrEmpty(streamInfo.FileName) && streamInfo.FileName.EndsWith(".metamd.md", StringComparison.OrdinalIgnoreCase)) + { + return true; + } + + var extension = streamInfo.Extension?.ToLowerInvariant(); + if (extension is not null && Extensions.Contains(extension)) + { + return true; + } + + return MimeTypeUtilities.MatchesAny(normalizedMime, MimeTypes) + || MimeTypeUtilities.MatchesAny(streamInfo.MimeType, MimeTypes); + } + + public bool Accepts(Stream stream, StreamInfo streamInfo, 
CancellationToken cancellationToken = default) => AcceptsInput(streamInfo); + + public async Task<DocumentConverterResult> ConvertAsync(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken = default) + { + using var reader = new StreamReader(stream, Encoding.UTF8, detectEncodingFromByteOrderMarks: true, leaveOpen: true); + var content = await reader.ReadToEndAsync(cancellationToken).ConfigureAwait(false); + var (metadata, body) = ParseDocument(content); + var (processedBody, usedReferences) = ProcessBody(body, metadata.References); + var markdown = ComposeMarkdown(metadata, processedBody, usedReferences); + return new DocumentConverterResult(markdown, metadata.Title ?? streamInfo.FileName); + } + + private static (MetaMdMetadata Metadata, string Body) ParseDocument(string content) + { + var match = FrontMatterPattern.Match(content); + if (!match.Success) + { + return (MetaMdMetadata.Empty, content); + } + + var meta = match.Groups["meta"].Value; + var body = match.Groups["body"].Value; + + try + { + using var document = JsonDocument.Parse(meta, new JsonDocumentOptions + { + AllowTrailingCommas = true, + CommentHandling = JsonCommentHandling.Skip, + }); + + return (MetaMdMetadata.FromJson(document.RootElement), body); + } + catch (JsonException) + { + return (MetaMdMetadata.Empty, body); + } + } + + private static (string Body, List<MetaMdReference> UsedReferences) ProcessBody(string body, IReadOnlyDictionary<string, MetaMdReference> references) + { + var used = new List<MetaMdReference>(); + var replacedDiagrams = DiagramPattern.Replace(body, match => + { + var type = match.Groups["type"].Value; + var content = match.Groups["content"].Value.Trim(); + return $"```{type}\n{content}\n```"; + }); + + var replacedReferences = ReferencePattern.Replace(replacedDiagrams, match => + { + var id = match.Groups["id"].Value.Trim(); + if (references.TryGetValue(id, out var reference)) + { + if (used.All(r => !string.Equals(r.Id, reference.Id, 
StringComparison.OrdinalIgnoreCase))) + { + used.Add(reference); + } + + return reference.Url is not null + ? $"[{reference.Title}]({reference.Url})" + : $"**{reference.Title}**"; + } + + return match.Value; + }); + + return (replacedReferences, used); + } + + private static string ComposeMarkdown(MetaMdMetadata metadata, string body, IReadOnlyCollection<MetaMdReference> references) + { + var builder = new StringBuilder(); + if (!string.IsNullOrWhiteSpace(metadata.Title)) + { + builder.AppendLine("# " + metadata.Title!.Trim()); + builder.AppendLine(); + } + + if (metadata.Contributors.Count > 0) + { + builder.AppendLine("**Contributors:** " + string.Join(", ", metadata.Contributors)); + } + + if (metadata.Affiliations.Count > 0) + { + builder.AppendLine("**Affiliations:** " + string.Join(", ", metadata.Affiliations)); + } + + if (metadata.Keywords.Count > 0) + { + builder.AppendLine("**Keywords:** " + string.Join(", ", metadata.Keywords)); + } + + if (!string.IsNullOrWhiteSpace(metadata.Abstract)) + { + builder.AppendLine(); + builder.AppendLine(metadata.Abstract.Trim()); + } + + if (builder.Length > 0) + { + builder.AppendLine(); + } + + builder.AppendLine(body.Trim()); + + if (references.Count > 0) + { + builder.AppendLine(); + builder.AppendLine("## References"); + foreach (var reference in references) + { + builder.Append("- **"); + builder.Append(reference.Title); + builder.Append("**"); + if (reference.Authors.Count > 0) + { + builder.Append(" — "); + builder.Append(string.Join(", ", reference.Authors)); + } + + if (!string.IsNullOrWhiteSpace(reference.Url)) + { + builder.Append($" [link]({reference.Url})"); + } + + builder.AppendLine(); + } + } + + return builder.ToString().Trim(); + } + + private readonly record struct MetaMdMetadata( + string? Title, + string? 
Abstract, + IReadOnlyList<string> Contributors, + IReadOnlyList<string> Affiliations, + IReadOnlyList<string> Keywords, + IReadOnlyDictionary<string, MetaMdReference> References) + { + public static MetaMdMetadata Empty { get; } = new( + null, + null, + Array.Empty<string>(), + Array.Empty<string>(), + Array.Empty<string>(), + new Dictionary<string, MetaMdReference>(StringComparer.OrdinalIgnoreCase)); + + public static MetaMdMetadata FromJson(JsonElement element) + { + var title = element.TryGetProperty("title", out var titleElement) ? titleElement.GetString() : null; + var abstractText = element.TryGetProperty("abstract", out var abstractElement) ? abstractElement.GetString() : null; + var contributors = element.TryGetProperty("contributors", out var contributorsElement) + ? contributorsElement.EnumerateArray().Select(e => e.GetString() ?? string.Empty).Where(s => !string.IsNullOrWhiteSpace(s)).ToList() + : new List<string>(); + var affiliations = element.TryGetProperty("affiliations", out var affiliationsElement) + ? affiliationsElement.EnumerateArray().Select(e => e.GetString() ?? string.Empty).Where(s => !string.IsNullOrWhiteSpace(s)).ToList() + : new List<string>(); + var keywords = element.TryGetProperty("keywords", out var keywordsElement) + ? keywordsElement.EnumerateArray().Select(e => e.GetString() ?? 
string.Empty).Where(s => !string.IsNullOrWhiteSpace(s)).ToList() + : new List<string>(); + + var references = new Dictionary<string, MetaMdReference>(StringComparer.OrdinalIgnoreCase); + if (element.TryGetProperty("references", out var referencesElement) && referencesElement.ValueKind == JsonValueKind.Array) + { + foreach (var referenceElement in referencesElement.EnumerateArray()) + { + var reference = MetaMdReference.FromJson(referenceElement); + if (!string.IsNullOrWhiteSpace(reference.Id)) + { + references[reference.Id] = reference; + } + } + } + + return new MetaMdMetadata(title, abstractText, contributors, affiliations, keywords, references); + } + } + + private readonly record struct MetaMdReference(string Id, string Title, IReadOnlyList<string> Authors, string? Url) + { + public static MetaMdReference FromJson(JsonElement element) + { + var id = element.TryGetProperty("id", out var idProperty) ? idProperty.GetString() ?? string.Empty : string.Empty; + var title = element.TryGetProperty("title", out var titleProperty) ? titleProperty.GetString() ?? id : id; + var authors = element.TryGetProperty("authors", out var authorsElement) && authorsElement.ValueKind == JsonValueKind.Array + ? authorsElement.EnumerateArray().Select(e => e.GetString() ?? string.Empty).Where(s => !string.IsNullOrWhiteSpace(s)).ToList() + : new List<string>(); + var url = element.TryGetProperty("url", out var urlProperty) ? 
urlProperty.GetString() : null; + return new MetaMdReference(id, title, authors, url); + } + } +} diff --git a/src/MarkItDown/Converters/OdtConverter.cs b/src/MarkItDown/Converters/OdtConverter.cs new file mode 100644 index 000000000..31387bc47 --- /dev/null +++ b/src/MarkItDown/Converters/OdtConverter.cs @@ -0,0 +1,128 @@ +using System; +using System.Collections.Generic; +using System.IO; +using System.IO.Compression; +using System.Linq; +using System.Text; +using System.Threading; +using System.Threading.Tasks; +using System.Xml.Linq; +using ManagedCode.MimeTypes; + +namespace MarkItDown.Converters; + +/// <summary> +/// Converter for OpenDocument Text (ODT) packages. +/// </summary> +public sealed class OdtConverter : IDocumentConverter +{ + private const string ContentEntryName = "content.xml"; + + private static readonly IReadOnlyCollection<string> Extensions = new[] + { + ".odt", + }; + + private static readonly string OdtMime = MimeHelper.GetMimeType(".odt") + ?? MimeTypeUtilities.Compose("application", "vnd.oasis.opendocument.text"); + + private static readonly IReadOnlyCollection<string> MimeTypes = new[] + { + OdtMime, + }; + + public int Priority => 180; + + public bool AcceptsInput(StreamInfo streamInfo) + { + var normalizedMime = MimeTypeUtilities.NormalizeMime(streamInfo); + var extension = streamInfo.Extension?.ToLowerInvariant(); + + if (extension is not null && Extensions.Contains(extension)) + { + return true; + } + + return MimeTypeUtilities.MatchesAny(normalizedMime, MimeTypes) + || MimeTypeUtilities.MatchesAny(streamInfo.MimeType, MimeTypes); + } + + public bool Accepts(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken = default) + { + return AcceptsInput(streamInfo); + } + + public async Task<DocumentConverterResult> ConvertAsync(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken = default) + { + if (!stream.CanSeek) + { + throw new FileConversionException("ODT conversion requires a seekable 
/// <summary>
/// Renders the headings and paragraphs found directly under <c>office:body/office:text</c>
/// as Markdown.
/// </summary>
/// <param name="document">The parsed <c>content.xml</c> of the ODT package.</param>
/// <returns>The Markdown text, or an empty string when the document has no text body.</returns>
private static string RenderMarkdown(XDocument document)
{
    var officeNs = document.Root?.Name.Namespace ?? XNamespace.None;
    var textNs = (XNamespace)"urn:oasis:names:tc:opendocument:xmlns:text:1.0";
    var body = document.Root?
        .Element(officeNs + "body")?
        .Element(officeNs + "text");

    if (body is null)
    {
        return string.Empty;
    }

    var builder = new StringBuilder();

    // Only direct children are visited; any other element kind is skipped.
    foreach (var node in body.Elements())
    {
        if (node.Name == textNs + "h")
        {
            var heading = StructuredXmlConverterBase.RenderTextNodes(node.Nodes()).Trim();
            if (heading.Length == 0)
            {
                // An empty text:h would otherwise produce a bare "# " line.
                continue;
            }

            var level = 1;
            var levelAttribute = node.Attribute(textNs + "outline-level");
            if (levelAttribute is not null
                && int.TryParse(
                    levelAttribute.Value,
                    System.Globalization.NumberStyles.Integer,
                    System.Globalization.CultureInfo.InvariantCulture,
                    out var parsed))
            {
                // Markdown supports six heading levels.
                level = Math.Clamp(parsed, 1, 6);
            }

            builder.Append('#', level).Append(' ').AppendLine(heading);
            builder.AppendLine();
        }
        else if (node.Name == textNs + "p")
        {
            var text = StructuredXmlConverterBase.RenderTextNodes(node.Nodes());
            if (!string.IsNullOrWhiteSpace(text))
            {
                builder.AppendLine(text.Trim());
                builder.AppendLine();
            }
        }
    }

    return builder.ToString().Trim();
}
/// <summary>
/// Converter for OPML outline documents. Each <c>outline</c> element becomes a Markdown
/// bullet, indented two spaces per nesting level.
/// </summary>
public sealed class OpmlConverter : StructuredXmlConverterBase
{
    private static readonly IReadOnlyCollection<string> Extensions = new[]
    {
        ".opml",
    };

    private static readonly string OpmlMime = MimeHelper.GetMimeType(".opml")
        ?? MimeTypeUtilities.WithSubtype(MimeHelper.XML, "opml+xml");

    private static readonly IReadOnlyCollection<string> MimeTypes = new[]
    {
        OpmlMime,
        MimeHelper.XML,
    };

    protected override IReadOnlyCollection<string> AcceptedExtensions => Extensions;

    protected override IReadOnlyCollection<string> AcceptedMimeTypes => MimeTypes;

    /// <summary>
    /// Renders every outline under the document's <c>body</c> element.
    /// </summary>
    /// <param name="document">The parsed OPML document.</param>
    /// <returns>A nested Markdown bullet list, or an empty string when there is no body.</returns>
    protected override string RenderMarkdown(XDocument document)
    {
        // The body element is matched by local name, ignoring case and namespace.
        var body = document.Root?
            .Elements()
            .FirstOrDefault(e => e.Name.LocalName.Equals("body", StringComparison.OrdinalIgnoreCase));

        if (body is null)
        {
            return string.Empty;
        }

        var builder = new StringBuilder();
        foreach (var outline in body.Elements())
        {
            RenderOutline(outline, builder, 0);
        }

        return builder.ToString().Trim();
    }

    /// <summary>
    /// Writes one outline and its descendants. The label comes from the <c>text</c> attribute,
    /// falling back to <c>title</c> when <c>text</c> is missing or blank.
    /// </summary>
    private static void RenderOutline(XElement outline, StringBuilder builder, int depth)
    {
        var label = outline.Attribute("text")?.Value;
        if (string.IsNullOrWhiteSpace(label))
        {
            label = outline.Attribute("title")?.Value;
        }

        var childDepth = depth;
        if (!string.IsNullOrWhiteSpace(label))
        {
            builder.Append(' ', depth * 2).Append("- ").AppendLine(label.Trim());
            childDepth = depth + 1;
        }

        // An unlabeled outline emits no bullet, but its children are still rendered
        // (at the parent's depth) instead of being dropped with it.
        foreach (var child in outline.Elements(outline.Name.Namespace + "outline"))
        {
            RenderOutline(child, builder, childDepth);
        }
    }
}
// Emphasis markers are only honoured at word boundaries and around non-blank content, so
// arithmetic ("2 * 3 * 4"), paths ("/usr/bin/ls") and URLs ("http://host/a/b") stay untouched.
private static readonly Regex Bold = new(@"(?<![\w*])\*(?<text>[^\s*](?:[^*]*[^\s*])?)\*(?![\w*])", RegexOptions.Compiled);
private static readonly Regex Italic = new(@"(?<![\w/:])/(?<text>[^\s/](?:[^/]*[^\s/])?)/(?![\w/])", RegexOptions.Compiled);

// Org marks verbatim text as =text= and code as ~text~. The first alternative keeps the
// =`text`= form that this converter accepted before.
private static readonly Regex Code = new(
    @"=`(?<text>[^`]+)`=|(?<![\w=~])(?<delimiter>[=~])(?<text>[^\s=~`](?:[^=~]*[^\s=~`])?)\k<delimiter>(?![\w=~])",
    RegexOptions.Compiled);

// A headline is one or more stars followed by whitespace; "*bold* text" is not a headline.
private static readonly Regex Headline = new(@"^(?<stars>\*+)\s+(?<text>.+)$", RegexOptions.Compiled);
private static readonly Regex OrderedItem = new(@"^(?<number>\d+)[.)]\s+(?<text>.+)$", RegexOptions.Compiled);

public int Priority => 155;

/// <summary>
/// Accepts the stream when its extension or MIME type identifies an Org document.
/// </summary>
public bool AcceptsInput(StreamInfo streamInfo)
{
    var normalizedMime = MimeTypeUtilities.NormalizeMime(streamInfo);
    var extension = streamInfo.Extension?.ToLowerInvariant();
    if (extension is not null && Extensions.Contains(extension))
    {
        return true;
    }

    return MimeTypeUtilities.MatchesAny(normalizedMime, MimeTypes)
        || MimeTypeUtilities.MatchesAny(streamInfo.MimeType, MimeTypes);
}

public bool Accepts(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken = default) => AcceptsInput(streamInfo);

/// <summary>
/// Reads the stream as text (UTF-8 unless a byte-order mark says otherwise) and converts it
/// to Markdown. The stream is left open.
/// </summary>
public async Task<DocumentConverterResult> ConvertAsync(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken = default)
{
    using var reader = new StreamReader(stream, Encoding.UTF8, detectEncodingFromByteOrderMarks: true, leaveOpen: true);
    var content = await reader.ReadToEndAsync(cancellationToken).ConfigureAwait(false);
    var markdown = ConvertToMarkdown(content);
    return new DocumentConverterResult(markdown, streamInfo.FileName);
}

/// <summary>
/// Converts Org text line by line: headlines become ATX headings, "-"/"+" items become
/// bullets, numbered items keep their number, and inline markup is translated everywhere.
/// </summary>
/// <param name="org">The Org source text.</param>
/// <returns>The Markdown rendering with surrounding whitespace trimmed.</returns>
private static string ConvertToMarkdown(string org)
{
    var builder = new StringBuilder();
    foreach (var line in org.Replace("\r\n", "\n").Split('\n'))
    {
        var trimmed = line.TrimEnd();
        if (string.IsNullOrWhiteSpace(trimmed))
        {
            builder.AppendLine();
            continue;
        }

        var headline = Headline.Match(trimmed);
        if (headline.Success)
        {
            var level = Math.Clamp(headline.Groups["stars"].Length, 1, 6);
            builder.Append('#', level).Append(' ').AppendLine(ConvertInline(headline.Groups["text"].Value.Trim()));
            continue;
        }

        if (trimmed.StartsWith("- ", StringComparison.Ordinal) || trimmed.StartsWith("+ ", StringComparison.Ordinal))
        {
            builder.Append("- ").AppendLine(ConvertInline(trimmed[2..].Trim()));
            continue;
        }

        var ordered = OrderedItem.Match(trimmed);
        if (ordered.Success)
        {
            builder.Append(ordered.Groups["number"].Value).Append(". ").AppendLine(ConvertInline(ordered.Groups["text"].Value.Trim()));
            continue;
        }

        builder.AppendLine(ConvertInline(trimmed));
    }

    return builder.ToString().Trim();
}

/// <summary>
/// Translates inline markup. Code spans are cut out first so that emphasis rules never
/// rewrite characters inside them.
/// </summary>
private static string ConvertInline(string text)
{
    var result = new StringBuilder();
    var position = 0;
    foreach (Match code in Code.Matches(text))
    {
        result.Append(ConvertEmphasis(text[position..code.Index]));
        result.Append('`').Append(code.Groups["text"].Value).Append('`');
        position = code.Index + code.Length;
    }

    result.Append(ConvertEmphasis(text[position..]));
    return result.ToString();
}

/// <summary>Maps Org *bold* to Markdown **bold** and /italic/ to *italic*.</summary>
private static string ConvertEmphasis(string text)
{
    var converted = Bold.Replace(text, m => "**" + m.Groups["text"].Value + "**");
    return Italic.Replace(converted, m => "*" + m.Groups["text"].Value + "*");
}
/// <summary>
/// Reads the diagram source as text (UTF-8 unless a byte-order mark says otherwise) and
/// returns it inside a fenced <c>plantuml</c> code block. The stream is left open.
/// </summary>
public async Task<DocumentConverterResult> ConvertAsync(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken = default)
{
    string source;
    using (var reader = new StreamReader(stream, Encoding.UTF8, detectEncodingFromByteOrderMarks: true, leaveOpen: true))
    {
        source = await reader.ReadToEndAsync(cancellationToken).ConfigureAwait(false);
    }

    var markdown = new StringBuilder()
        .AppendLine("```plantuml")
        .AppendLine(source.Trim())
        .AppendLine("```")
        .ToString()
        .Trim();

    return new DocumentConverterResult(markdown, streamInfo.FileName);
}
/// <summary>
/// Converter for RIS bibliographic records. Each record becomes one Markdown bullet with
/// title, authors, journal, year and link.
/// </summary>
public sealed class RisConverter : IDocumentConverter
{
    private static readonly IReadOnlyCollection<string> Extensions = new[]
    {
        ".ris",
    };

    private static readonly IReadOnlyCollection<string> MimeTypes = new[]
    {
        MimeHelper.GetMimeType(".ris") ?? MimeTypeUtilities.Compose("application", "ris"),
    };

    public int Priority => 139;

    /// <summary>
    /// Accepts the stream when its extension or MIME type identifies an RIS file.
    /// </summary>
    public bool AcceptsInput(StreamInfo streamInfo)
    {
        var normalizedMime = MimeTypeUtilities.NormalizeMime(streamInfo);
        var extension = streamInfo.Extension?.ToLowerInvariant();
        if (extension is not null && Extensions.Contains(extension))
        {
            return true;
        }

        return MimeTypeUtilities.MatchesAny(normalizedMime, MimeTypes)
            || MimeTypeUtilities.MatchesAny(streamInfo.MimeType, MimeTypes);
    }

    public bool Accepts(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken = default) => AcceptsInput(streamInfo);

    /// <summary>
    /// Reads the stream as text (UTF-8 unless a byte-order mark says otherwise) and renders
    /// the records it contains. The stream is left open.
    /// </summary>
    public async Task<DocumentConverterResult> ConvertAsync(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken = default)
    {
        using var reader = new StreamReader(stream, Encoding.UTF8, detectEncodingFromByteOrderMarks: true, leaveOpen: true);
        var content = await reader.ReadToEndAsync(cancellationToken).ConfigureAwait(false);
        return new DocumentConverterResult(RenderMarkdown(content), streamInfo.FileName);
    }

    /// <summary>
    /// Renders one bullet per record: <c>- **Title** — Authors, Journal (Year) [link](url)</c>.
    /// </summary>
    private static string RenderMarkdown(string ris)
    {
        var builder = new StringBuilder();
        foreach (var entry in ParseEntries(ris))
        {
            // Fall back on the alternate tag when the preferred one is present but empty.
            var title = FirstValue(entry, "TI", "T1") ?? "Untitled";
            var journal = FirstValue(entry, "JO", "JF");
            var year = FirstValue(entry, "PY");
            var url = FirstValue(entry, "UR");

            builder.Append("- **").Append(title.Trim()).Append("**");

            if (entry.TryGetValue("AU", out var authors) && authors.Count > 0)
            {
                builder.Append(" — ").Append(string.Join(", ", authors.Select(a => a.Trim())));
            }

            if (journal is not null)
            {
                builder.Append(", ").Append(journal.Trim());
            }

            if (year is not null)
            {
                builder.Append(" (").Append(year.Trim()).Append(')');
            }

            if (url is not null)
            {
                builder.Append($" [link]({url})");
            }

            builder.AppendLine();
        }

        return builder.ToString().Trim();
    }

    /// <summary>
    /// Splits the text into records. A record starts at a <c>TY</c> line and ends at
    /// <c>ER</c>; tagged lines outside a record and untagged lines are ignored.
    /// </summary>
    private static List<Dictionary<string, List<string>>> ParseEntries(string ris)
    {
        var entries = new List<Dictionary<string, List<string>>>();
        Dictionary<string, List<string>>? current = null;

        using var reader = new StringReader(ris);
        string? line;
        while ((line = reader.ReadLine()) is not null)
        {
            if (!TryParseTaggedLine(line, out var tag, out var value))
            {
                continue;
            }

            if (tag.Equals("TY", StringComparison.OrdinalIgnoreCase))
            {
                current = new Dictionary<string, List<string>>(StringComparer.OrdinalIgnoreCase);
                entries.Add(current);
                AddValue(current, tag, value);
            }
            else if (tag.Equals("ER", StringComparison.OrdinalIgnoreCase))
            {
                current = null;
            }
            else if (current is not null)
            {
                AddValue(current, tag, value);
            }
        }

        return entries;
    }

    /// <summary>
    /// Recognises a line of the form <c>XX  - value</c>: a two-character tag, two spaces and a
    /// dash. The check is positional so text that merely contains " - " is not taken for a
    /// tag, and a terminator whose trailing space was stripped ("ER  -") is still accepted.
    /// </summary>
    private static bool TryParseTaggedLine(string line, out string tag, out string value)
    {
        tag = string.Empty;
        value = string.Empty;

        var candidate = line.TrimEnd();
        if (candidate.Length < 5
            || !char.IsLetter(candidate[0])
            || !char.IsLetterOrDigit(candidate[1])
            || candidate[2] != ' '
            || candidate[3] != ' '
            || candidate[4] != '-'
            || (candidate.Length > 5 && candidate[5] != ' '))
        {
            return false;
        }

        tag = candidate[..2];
        value = candidate.Length > 6 ? candidate[6..].Trim() : string.Empty;
        return true;
    }

    /// <summary>
    /// Registers the tag on the entry and stores the value unless it is blank.
    /// </summary>
    private static void AddValue(Dictionary<string, List<string>> entry, string tag, string value)
    {
        if (!entry.TryGetValue(tag, out var list))
        {
            list = new List<string>();
            entry[tag] = list;
        }

        if (!string.IsNullOrWhiteSpace(value))
        {
            list.Add(value);
        }
    }

    /// <summary>
    /// Returns the first stored value of the first tag that has one, or <c>null</c>.
    /// </summary>
    private static string? FirstValue(Dictionary<string, List<string>> entry, params string[] tags)
    {
        foreach (var tag in tags)
        {
            if (entry.TryGetValue(tag, out var values) && values.Count > 0)
            {
                return values[0];
            }
        }

        return null;
    }
}
/// <summary>
/// Converts reStructuredText line by line. Section titles (underlined, or overlined and
/// underlined) become ATX headings, explicit-markup lines starting with ".. " are dropped,
/// inline literals/strong/emphasis are translated and "::" is reduced to ":".
/// </summary>
/// <param name="rst">The reStructuredText source.</param>
/// <returns>The Markdown rendering with surrounding whitespace trimmed.</returns>
private static string ConvertToMarkdown(string rst)
{
    var lines = rst.Replace("\r\n", "\n").Split('\n');
    var builder = new StringBuilder();
    for (var i = 0; i < lines.Length; i++)
    {
        var line = lines[i];

        // Overlined title: adornment, title text, then the same adornment again.
        if (i + 2 < lines.Length && IsUnderline(line, 1))
        {
            var overlinedTitle = lines[i + 1].Trim();
            if (overlinedTitle.Length > 0
                && !IsUnderline(lines[i + 1], 1)
                && IsUnderline(lines[i + 2], overlinedTitle.Length)
                && lines[i + 2][0] == line[0])
            {
                builder.AppendLine(new string('#', DetermineHeadingLevel(line)) + " " + overlinedTitle);
                builder.AppendLine();
                i += 2; // Skip title and underline
                continue;
            }
        }

        var titleLength = line.TrimEnd().Length;
        if (titleLength > 0 && i + 1 < lines.Length && IsUnderline(lines[i + 1], titleLength))
        {
            var level = DetermineHeadingLevel(lines[i + 1]);
            builder.AppendLine(new string('#', level) + " " + line.Trim());
            builder.AppendLine();
            i++; // Skip underline
            continue;
        }

        if (line.StartsWith(".. ", StringComparison.Ordinal))
        {
            continue;
        }

        var converted = line;
        converted = InlineLiteral.Replace(converted, m => "`" + m.Groups["text"].Value + "`");
        converted = Bold.Replace(converted, m => "**" + m.Groups["text"].Value + "**");
        converted = Italic.Replace(converted, m => "*" + m.Groups["text"].Value + "*");
        converted = converted.Replace("::", ":");

        builder.AppendLine(converted);
    }

    return builder.ToString().Trim();
}

/// <summary>
/// Tells whether <paramref name="candidate"/> is a section adornment at least
/// <paramref name="length"/> characters long.
/// </summary>
/// <remarks>
/// The whole line must be one repeated adornment character. Checking only a prefix would turn
/// a short line followed by "**bold** text" or "- item" into a heading and swallow that line.
/// </remarks>
private static bool IsUnderline(string candidate, int length)
{
    var adornment = candidate.TrimEnd();
    if (length <= 0 || adornment.Length < length)
    {
        return false;
    }

    var marker = adornment[0];
    if (marker is not ('=' or '-' or '~' or '^' or '"' or '`' or '.' or '*' or '#' or '+'))
    {
        return false;
    }

    return adornment.All(ch => ch == marker);
}

/// <summary>
/// Maps an adornment character to a heading level using a fixed table; unknown characters
/// map to level 2.
/// </summary>
private static int DetermineHeadingLevel(string underline)
{
    // NOTE(review): reStructuredText derives levels from the order in which adornment styles
    // first appear, not from the character itself - confirm the fixed table is intended.
    var ch = underline.Trim().FirstOrDefault();
    return ch switch
    {
        '#' => 1,
        '=' => 1,
        '-' => 2,
        '~' => 3,
        '^' => 4,
        '"' => 5,
        _ => 2,
    };
}
// Windows-1252 characters for bytes 0x80-0x9F. Bytes 0xA0-0xFF are identical to Latin-1.
private static readonly char[] Windows1252Extras =
{
    '\u20AC', '\u0081', '\u201A', '\u0192', '\u201E', '\u2026', '\u2020', '\u2021',
    '\u02C6', '\u2030', '\u0160', '\u2039', '\u0152', '\u008D', '\u017D', '\u008F',
    '\u0090', '\u2018', '\u2019', '\u201C', '\u201D', '\u2022', '\u2013', '\u2014',
    '\u02DC', '\u2122', '\u0161', '\u203A', '\u0153', '\u009D', '\u017E', '\u0178',
};

// Destinations whose content is not body text.
private static readonly HashSet<string> NonTextDestinations = new(StringComparer.OrdinalIgnoreCase)
{
    "fonttbl", "colortbl", "stylesheet", "info", "pict",
};

/// <summary>
/// Extracts the visible text of an RTF document.
/// </summary>
/// <param name="rtf">The raw RTF source.</param>
/// <returns>The plain text with paragraph and line breaks as newlines, trimmed.</returns>
/// <remarks>
/// Handles <c>\par</c>, <c>\line</c>, <c>\tab</c>, the quote/dash/bullet control words,
/// <c>\'hh</c> escapes and <c>\uN</c> escapes including their <c>\ucN</c> fallback characters.
/// Groups introduced by <c>\*</c> and the font, colour, style, info and picture destinations
/// are skipped. <c>\'hh</c> bytes are decoded as Windows-1252; other code pages are not
/// interpreted.
/// </remarks>
private static string ExtractText(string rtf)
{
    var builder = new StringBuilder();

    // Group state restored on '}': whether the group is hidden, and its \uc value.
    var groups = new Stack<(bool Hidden, int FallbackLength)>();
    var hidden = false;
    var fallbackLength = 1;

    // Number of fallback characters still to discard after the last \uN.
    var pendingFallback = 0;

    void Emit(char value)
    {
        if (pendingFallback > 0)
        {
            pendingFallback--;
            return;
        }

        if (!hidden)
        {
            builder.Append(value);
        }
    }

    var i = 0;
    while (i < rtf.Length)
    {
        var ch = rtf[i];
        switch (ch)
        {
            case '{':
                groups.Push((hidden, fallbackLength));
                pendingFallback = 0;
                i++;
                break;
            case '}':
                if (groups.Count > 0)
                {
                    (hidden, fallbackLength) = groups.Pop();
                }

                pendingFallback = 0;
                i++;
                break;
            case '\\':
                i++;
                if (i >= rtf.Length)
                {
                    break;
                }

                var next = rtf[i];
                if (next is '\\' or '{' or '}')
                {
                    Emit(next);
                    i++;
                }
                else if (next == '\'')
                {
                    i++;
                    if (i + 1 < rtf.Length)
                    {
                        if (byte.TryParse(rtf.AsSpan(i, 2), NumberStyles.HexNumber, CultureInfo.InvariantCulture, out var value))
                        {
                            Emit(value is >= 0x80 and <= 0x9F ? Windows1252Extras[value - 0x80] : (char)value);
                        }

                        i += 2;
                    }
                }
                else if (next == '*')
                {
                    // "\*" marks a destination that readers may ignore.
                    hidden = true;
                    i++;
                }
                else if (next == '~')
                {
                    Emit('\u00A0');
                    i++;
                }
                else if (next == '_')
                {
                    Emit('-');
                    i++;
                }
                else if (next is '\r' or '\n')
                {
                    // A backslash before a line break is equivalent to \par.
                    if (!hidden)
                    {
                        builder.AppendLine();
                    }

                    i++;
                }
                else
                {
                    var (word, argument) = ReadControlWord(rtf, ref i);

                    // A control word ends any fallback run instead of being discarded with it.
                    pendingFallback = 0;
                    if (word is null)
                    {
                        // Remaining control symbols (\-, \|, \:, ...) carry no text.
                        i++;
                        break;
                    }

                    switch (word.ToLowerInvariant())
                    {
                        case "par":
                        case "line":
                            if (!hidden)
                            {
                                builder.AppendLine();
                            }

                            break;
                        case "tab":
                            Emit('\t');
                            break;
                        case "emdash":
                            Emit('\u2014');
                            break;
                        case "endash":
                            Emit('\u2013');
                            break;
                        case "bullet":
                            Emit('\u2022');
                            break;
                        case "lquote":
                            Emit('\u2018');
                            break;
                        case "rquote":
                            Emit('\u2019');
                            break;
                        case "ldblquote":
                            Emit('\u201C');
                            break;
                        case "rdblquote":
                            Emit('\u201D');
                            break;
                        case "u" when argument is not null:
                            // The argument is a signed 16-bit UTF-16 code unit: values above
                            // 32767 are written negative and astral characters arrive as two
                            // surrogate escapes, so keep the low 16 bits as-is.
                            Emit(unchecked((char)argument.Value));
                            pendingFallback = fallbackLength;
                            break;
                        case "uc" when argument is not null:
                            fallbackLength = Math.Max(0, argument.Value);
                            break;
                        default:
                            if (NonTextDestinations.Contains(word))
                            {
                                hidden = true;
                            }

                            break;
                    }
                }

                break;
            default:
                // Raw CR/LF and other control characters are formatting of the RTF source.
                if (!char.IsControl(ch))
                {
                    Emit(ch);
                }

                i++;
                break;
        }
    }

    return builder.ToString().Replace("\r\n\r\n", "\n\n").Trim();
}

/// <summary>
/// Reads a control word (ASCII letters), its optional signed numeric argument and the single
/// space that may terminate it, advancing <paramref name="index"/> past all of them.
/// </summary>
/// <returns>
/// The word and argument; both are <c>null</c> and <paramref name="index"/> is unchanged when
/// no letters are found at the current position.
/// </returns>
private static (string? Word, int? Argument) ReadControlWord(string rtf, ref int index)
{
    var start = index;
    while (index < rtf.Length && rtf[index] is (>= 'a' and <= 'z') or (>= 'A' and <= 'Z'))
    {
        index++;
    }

    if (index == start)
    {
        return (null, null);
    }

    var word = rtf[start..index];

    // A '-' is part of the argument only when a digit follows; otherwise it is text.
    var negative = index + 1 < rtf.Length && rtf[index] == '-' && rtf[index + 1] is >= '0' and <= '9';
    if (negative)
    {
        index++;
    }

    var digitsStart = index;
    while (index < rtf.Length && rtf[index] is >= '0' and <= '9')
    {
        index++;
    }

    int? argument = null;
    if (index > digitsStart
        && int.TryParse(rtf.AsSpan(digitsStart, index - digitsStart), NumberStyles.None, CultureInfo.InvariantCulture, out var magnitude))
    {
        argument = negative ? -magnitude : magnitude;
    }

    if (index < rtf.Length && rtf[index] == ' ')
    {
        index++;
    }

    return (word, argument);
}
/// <summary>
/// Concatenates the text of the given nodes, translating known inline elements to Markdown.
/// Nodes other than text and elements are ignored.
/// </summary>
/// <param name="nodes">The child nodes to render, in document order.</param>
/// <returns>The inline Markdown text.</returns>
internal static string RenderTextNodes(IEnumerable<XNode> nodes)
{
    var builder = new StringBuilder();
    foreach (var node in nodes)
    {
        switch (node)
        {
            case XText text:
                builder.Append(text.Value);
                break;
            case XElement element:
                builder.Append(RenderElementInline(element));
                break;
        }
    }

    return builder.ToString();
}

/// <summary>
/// Renders one inline element. Elements are matched by lower-cased local name, so the
/// namespace is irrelevant; unknown elements contribute only their content.
/// </summary>
private static string RenderElementInline(XElement element)
{
    var content = RenderTextNodes(element.Nodes());
    var name = element.Name.LocalName.ToLowerInvariant();
    switch (name)
    {
        case "emphasis":
            // DocBook marks bold as <emphasis role="bold"> / role="strong".
            var role = element.Attribute("role")?.Value;
            return role is not null
                && (role.Equals("bold", System.StringComparison.OrdinalIgnoreCase)
                    || role.Equals("strong", System.StringComparison.OrdinalIgnoreCase))
                ? Wrap("**", content)
                : Wrap("*", content);
        case "em":
        case "i":
            return Wrap("*", content);
        case "bold":
        case "strong":
        case "b":
            return Wrap("**", content);
        case "sub":
            return Wrap("~", content);
        case "sup":
            return Wrap("^", content);
        case "link":
        case "a":
        case "ulink":
        case "ext-link":
            return RenderLink(element, content);
        default:
            return content;
    }
}

/// <summary>Surrounds non-blank content with the marker; blank content is returned as-is.</summary>
private static string Wrap(string marker, string content)
{
    return string.IsNullOrWhiteSpace(content) ? content : marker + content + marker;
}

/// <summary>
/// Renders a hyperlink. The target is the first attribute whose local name is "href"
/// (so namespaced forms such as xlink:href match), falling back to a "url" attribute.
/// </summary>
private static string RenderLink(XElement element, string content)
{
    string? href = null;
    string? url = null;
    foreach (var attribute in element.Attributes())
    {
        if (attribute.IsNamespaceDeclaration)
        {
            continue;
        }

        var localName = attribute.Name.LocalName;
        if (href is null && localName == "href")
        {
            href = attribute.Value;
        }
        else if (url is null && localName == "url")
        {
            url = attribute.Value;
        }
    }

    var target = href ?? url;
    if (string.IsNullOrWhiteSpace(target))
    {
        return content;
    }

    // A link without text shows its own target rather than rendering as "[](...)".
    var label = string.IsNullOrWhiteSpace(content) ? target : content;
    return $"[{label}]({target})";
}
/// <summary>
/// Converter for Textile documents. Headings (<c>h1.</c>-<c>h6.</c>), bullet and numbered
/// lists (including nested markers) and strong/emphasis spans are mapped to Markdown.
/// </summary>
public sealed class TextileConverter : IDocumentConverter
{
    private static readonly IReadOnlyCollection<string> Extensions = new[]
    {
        ".textile",
    };

    private static readonly IReadOnlyCollection<string> MimeTypes = new[]
    {
        MimeHelper.GetMimeType(".textile") ?? MimeTypeUtilities.Compose("text", "textile"),
    };

    private static readonly Regex Heading = new("^h(?<level>[1-6])\\.\\s*(?<text>.+)$", RegexOptions.Compiled);

    // One or more "*" (bullets) or "#" (numbered) followed by whitespace; the marker count is
    // the nesting depth.
    private static readonly Regex ListItem = new(@"^(?<markers>\*+|#+)\s+(?<text>.+)$", RegexOptions.Compiled);

    // Markers only count at word boundaries and around non-blank content, so snake_case
    // names, "2 * 3 * 4" and already-doubled "**bold**" are left alone.
    private static readonly Regex Bold = new(@"(?<![\w*])\*(?<text>[^\s*](?:[^*]*[^\s*])?)\*(?![\w*])", RegexOptions.Compiled);
    private static readonly Regex Italic = new(@"(?<![\w_])_(?<text>[^\s_](?:[^_]*[^\s_])?)_(?![\w_])", RegexOptions.Compiled);

    public int Priority => 152;

    /// <summary>
    /// Accepts the stream when its extension or MIME type identifies a Textile document.
    /// </summary>
    public bool AcceptsInput(StreamInfo streamInfo)
    {
        var normalizedMime = MimeTypeUtilities.NormalizeMime(streamInfo);
        var extension = streamInfo.Extension?.ToLowerInvariant();
        if (extension is not null && Extensions.Contains(extension))
        {
            return true;
        }

        return MimeTypeUtilities.MatchesAny(normalizedMime, MimeTypes)
            || MimeTypeUtilities.MatchesAny(streamInfo.MimeType, MimeTypes);
    }

    public bool Accepts(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken = default) => AcceptsInput(streamInfo);

    /// <summary>
    /// Reads the stream as text (UTF-8 unless a byte-order mark says otherwise) and converts
    /// it to Markdown. The stream is left open.
    /// </summary>
    public async Task<DocumentConverterResult> ConvertAsync(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken = default)
    {
        using var reader = new StreamReader(stream, Encoding.UTF8, detectEncodingFromByteOrderMarks: true, leaveOpen: true);
        var content = await reader.ReadToEndAsync(cancellationToken).ConfigureAwait(false);
        return new DocumentConverterResult(ConvertToMarkdown(content), streamInfo.FileName);
    }

    /// <summary>
    /// Converts Textile text line by line.
    /// </summary>
    /// <param name="textile">The Textile source.</param>
    /// <returns>The Markdown rendering with surrounding whitespace trimmed.</returns>
    private static string ConvertToMarkdown(string textile)
    {
        var builder = new StringBuilder();
        foreach (var line in textile.Replace("\r\n", "\n").Split('\n'))
        {
            var trimmed = line.TrimEnd();
            if (string.IsNullOrWhiteSpace(trimmed))
            {
                builder.AppendLine();
                continue;
            }

            var heading = Heading.Match(trimmed);
            if (heading.Success)
            {
                // The pattern only admits levels 1-6, so no clamping is needed.
                var level = int.Parse(heading.Groups["level"].Value, CultureInfo.InvariantCulture);
                builder.Append('#', level).Append(' ').AppendLine(ConvertInline(heading.Groups["text"].Value.Trim()));
                continue;
            }

            var item = ListItem.Match(trimmed);
            if (item.Success)
            {
                var markers = item.Groups["markers"].Value;
                var ordered = markers[0] == '#';

                // Markdown nests bullets under two spaces and numbered items under three.
                var indent = (markers.Length - 1) * (ordered ? 3 : 2);
                builder.Append(' ', indent)
                    .Append(ordered ? "1. " : "- ")
                    .AppendLine(ConvertInline(item.Groups["text"].Value.Trim()));
                continue;
            }

            builder.AppendLine(ConvertInline(trimmed));
        }

        return builder.ToString().Trim();
    }

    /// <summary>Maps Textile *strong* to **strong** and _emphasis_ to *emphasis*.</summary>
    private static string ConvertInline(string text)
    {
        var converted = Bold.Replace(text, m => "**" + m.Groups["text"].Value + "**");
        return Italic.Replace(converted, m => "*" + m.Groups["text"].Value + "*");
    }
}
/// <summary>
/// Reads the TikZ source as text (UTF-8 unless a byte-order mark says otherwise) and returns
/// it inside a fenced <c>latex</c> code block. The stream is left open.
/// </summary>
public async Task<DocumentConverterResult> ConvertAsync(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken = default)
{
    string source;
    using (var reader = new StreamReader(stream, Encoding.UTF8, detectEncodingFromByteOrderMarks: true, leaveOpen: true))
    {
        source = await reader.ReadToEndAsync(cancellationToken).ConfigureAwait(false);
    }

    var markdown = new StringBuilder()
        .AppendLine("```latex")
        .AppendLine(source.Trim())
        .AppendLine("```")
        .ToString()
        .Trim();

    return new DocumentConverterResult(markdown, streamInfo.FileName);
}
/// <summary>
/// Converter for Typst documents using heuristic mapping to Markdown.
/// </summary>
public sealed class TypstConverter : IDocumentConverter
{
    private static readonly IReadOnlyCollection<string> Extensions = new[] { ".typ", ".typst" };

    private static readonly IReadOnlyCollection<string> MimeTypes = new[]
    {
        MimeHelper.GetMimeType(".typ") ?? MimeTypeUtilities.Compose("text", "typst"),
        MimeHelper.GetMimeType(".typst") ?? MimeTypeUtilities.Compose("text", "typst"),
    };

    public int Priority => 153;

    /// <summary>
    /// Accepts the stream when its extension or MIME type identifies a Typst document.
    /// </summary>
    public bool AcceptsInput(StreamInfo streamInfo)
    {
        var mime = MimeTypeUtilities.NormalizeMime(streamInfo);

        if (streamInfo.Extension?.ToLowerInvariant() is { } suffix && Extensions.Contains(suffix))
        {
            return true;
        }

        if (MimeTypeUtilities.MatchesAny(mime, MimeTypes))
        {
            return true;
        }

        return MimeTypeUtilities.MatchesAny(streamInfo.MimeType, MimeTypes);
    }

    public bool Accepts(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken = default)
    {
        return AcceptsInput(streamInfo);
    }

    /// <summary>
    /// Reads the stream as text (UTF-8 unless a byte-order mark says otherwise) and converts
    /// it to Markdown. The stream is left open.
    /// </summary>
    public async Task<DocumentConverterResult> ConvertAsync(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken = default)
    {
        using var reader = new StreamReader(stream, Encoding.UTF8, detectEncodingFromByteOrderMarks: true, leaveOpen: true);
        var source = await reader.ReadToEndAsync(cancellationToken).ConfigureAwait(false);
        var markdown = ConvertToMarkdown(source);
        return new DocumentConverterResult(markdown, streamInfo.FileName);
    }

    /// <summary>
    /// Converts the document one line at a time and trims the overall result.
    /// </summary>
    private static string ConvertToMarkdown(string typst)
    {
        var output = new StringBuilder();
        foreach (var raw in typst.Replace("\r\n", "\n").Split('\n'))
        {
            output.AppendLine(ConvertLine(raw.Trim()));
        }

        return output.ToString().Trim();
    }

    /// <summary>
    /// Maps a single trimmed line: leading "=" runs become headings, a leading "#" becomes a
    /// bullet, and "#bold" / "#italic" are replaced by Markdown markers elsewhere.
    /// </summary>
    private static string ConvertLine(string text)
    {
        if (string.IsNullOrWhiteSpace(text))
        {
            return string.Empty;
        }

        if (text.StartsWith("= "))
        {
            return "# " + text[2..].Trim();
        }

        if (text.StartsWith("=="))
        {
            var depth = 0;
            while (depth < text.Length && text[depth] == '=')
            {
                depth++;
            }

            return new string('#', Math.Clamp(depth, 1, 6)) + " " + text[depth..].Trim();
        }

        if (text.StartsWith("#"))
        {
            return "- " + text[1..].Trim();
        }

        return text.Replace("#bold", "**").Replace("#italic", "*");
    }
}
MimeTypeUtilities.Compose("text", "x-mediawiki"), + }; + + private static readonly Regex LinkPattern = new(@"\[\[(?<target>[^|\]]+)(\|(?<text>[^\]]+))?\]\]", RegexOptions.Compiled); + + public int Priority => 151; + + public bool AcceptsInput(StreamInfo streamInfo) + { + var normalizedMime = MimeTypeUtilities.NormalizeMime(streamInfo); + var extension = streamInfo.Extension?.ToLowerInvariant(); + if (extension is not null && Extensions.Contains(extension)) + { + return true; + } + + return MimeTypeUtilities.MatchesAny(normalizedMime, MimeTypes) + || MimeTypeUtilities.MatchesAny(streamInfo.MimeType, MimeTypes); + } + + public bool Accepts(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken = default) => AcceptsInput(streamInfo); + + public async Task<DocumentConverterResult> ConvertAsync(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken = default) + { + using var reader = new StreamReader(stream, Encoding.UTF8, detectEncodingFromByteOrderMarks: true, leaveOpen: true); + var content = await reader.ReadToEndAsync(cancellationToken).ConfigureAwait(false); + return new DocumentConverterResult(ConvertToMarkdown(content), streamInfo.FileName); + } + + private static string ConvertToMarkdown(string wiki) + { + var lines = wiki.Replace("\r\n", "\n").Split('\n'); + var builder = new StringBuilder(); + foreach (var line in lines) + { + var trimmed = line.TrimEnd(); + if (string.IsNullOrWhiteSpace(trimmed)) + { + builder.AppendLine(); + continue; + } + + if (trimmed.StartsWith("=")) + { + var level = trimmed.TakeWhile(c => c == '=').Count(); + var text = trimmed.Trim('=').Trim(); + builder.AppendLine(new string('#', Math.Clamp(level, 1, 6)) + " " + text); + continue; + } + + if (trimmed.StartsWith("*")) + { + builder.AppendLine("- " + trimmed.TrimStart('*').Trim()); + continue; + } + + if (trimmed.StartsWith("#")) + { + builder.AppendLine("1. 
" + trimmed.TrimStart('#').Trim()); + continue; + } + + var converted = LinkPattern.Replace(trimmed, m => + { + var target = m.Groups["target"].Value.Trim(); + var text = m.Groups["text"].Success ? m.Groups["text"].Value.Trim() : target; + return $"[{text}]({target})"; + }); + + builder.AppendLine(converted.Replace("'''''", "**").Replace("'''", "**").Replace("''", "*")); + } + + return builder.ToString().Trim(); + } +} diff --git a/src/MarkItDown/MarkItDown.csproj b/src/MarkItDown/MarkItDown.csproj index c7892024b..d178389a5 100644 --- a/src/MarkItDown/MarkItDown.csproj +++ b/src/MarkItDown/MarkItDown.csproj @@ -16,7 +16,7 @@ <PackageReference Include="Google.Cloud.DocumentAI.V1" Version="3.21.0" /> <PackageReference Include="Google.Cloud.Speech.V1" Version="3.8.0" /> <PackageReference Include="Google.Cloud.Vision.V1" Version="3.7.0" /> - <PackageReference Include="ManagedCode.MimeTypes" Version="1.0.3" /> + <PackageReference Include="ManagedCode.MimeTypes" Version="1.0.4" /> <PackageReference Include="Sep" Version="0.11.1" /> <PackageReference Include="Microsoft.Extensions.AI" Version="9.9.1" /> <PackageReference Include="Azure.AI.FormRecognizer" Version="4.1.0" /> diff --git a/src/MarkItDown/MarkItDown.cs b/src/MarkItDown/MarkItDownClient.cs similarity index 65% rename from src/MarkItDown/MarkItDown.cs rename to src/MarkItDown/MarkItDownClient.cs index c7e5029c0..b7c773c34 100644 --- a/src/MarkItDown/MarkItDown.cs +++ b/src/MarkItDown/MarkItDownClient.cs @@ -1,8 +1,17 @@ +using System.Collections.Generic; +using System.Diagnostics; +using System.Diagnostics.Metrics; +using System.IO; +using System.Linq; +using System.Net.Http; using System.Text; +using System.Threading; +using System.Threading.Tasks; using Azure; using Azure.AI.FormRecognizer.DocumentAnalysis; using Azure.Core; using Azure.Identity; +using ManagedCode.MimeTypes; using Microsoft.Extensions.Logging; using MarkItDown.Conversion.Middleware; using MarkItDown.Converters; @@ -18,7 +27,7 @@ namespace 
MarkItDown; /// An extremely simple text-based document reader, suitable for LLM use. /// This reader will convert common file-types or webpages to Markdown. /// </summary> -public sealed class MarkItDown +public sealed class MarkItDownClient { private readonly List<ConverterRegistration> _converters; private readonly ILogger? _logger; @@ -26,31 +35,40 @@ public sealed class MarkItDown private readonly MarkItDownOptions _options; private readonly IntelligenceProviderHub _intelligenceProviders; private readonly IConversionPipeline _conversionPipeline; + private readonly ActivitySource _activitySource; + private readonly Counter<long>? _conversionCounter; + private readonly Counter<long>? _conversionFailureCounter; /// <summary> - /// Initialize a new instance of MarkItDown. + /// Initialize a new instance of <see cref="MarkItDownClient"/>. /// </summary> /// <param name="logger">Optional logger for diagnostic information.</param> /// <param name="httpClient">Optional HTTP client for downloading web content.</param> - public MarkItDown(ILogger? logger = null, HttpClient? httpClient = null) + public MarkItDownClient(ILogger? logger = null, HttpClient? httpClient = null) : this(null, logger, httpClient) { } /// <summary> - /// Initialize a new instance of MarkItDown with advanced configuration options. + /// Initialize a new instance of <see cref="MarkItDownClient"/> with advanced configuration options. /// </summary> /// <param name="options">Configuration overrides for the converter. When <see langword="null"/> defaults are used.</param> /// <param name="logger">Optional logger for diagnostic information.</param> /// <param name="httpClient">Optional HTTP client for downloading web content.</param> - public MarkItDown(MarkItDownOptions? options, ILogger? logger = null, HttpClient? httpClient = null) + public MarkItDownClient(MarkItDownOptions? options, ILogger? logger = null, HttpClient? httpClient = null) { _options = options ?? 
new MarkItDownOptions(); - _logger = logger; + _logger = logger ?? _options.LoggerFactory?.CreateLogger<MarkItDownClient>(); _httpClient = httpClient; _converters = []; _intelligenceProviders = InitializeIntelligenceProviders(); _conversionPipeline = BuildConversionPipeline(); + _activitySource = _options.ActivitySource ?? MarkItDownDiagnostics.DefaultActivitySource; + + if (_options.EnableTelemetry) + { + (_conversionCounter, _conversionFailureCounter) = MarkItDownDiagnostics.ResolveCounters(_options.Meter); + } if (_options.EnableBuiltins) { @@ -122,8 +140,13 @@ public async Task<DocumentConverterResult> ConvertAsync(Stream stream, StreamInf if (!stream.CanSeek) throw new ArgumentException("Stream must support seeking.", nameof(stream)); + using var activity = StartActivity(MarkItDownDiagnostics.ActivityNameConvertStream, streamInfo); + var source = DescribeSource(streamInfo); + var exceptions = new List<Exception>(); var guesses = StreamInfoGuesser.Guess(stream, streamInfo); + activity?.SetTag("markitdown.guess.count", guesses.Count); + _logger?.LogInformation("Converting {Source} with {GuessCount} candidate formats", source, guesses.Count); foreach (var guess in guesses) { @@ -143,27 +166,40 @@ public async Task<DocumentConverterResult> ConvertAsync(Stream stream, StreamInf continue; } + var converterName = registration.Converter.GetType().Name; _logger?.LogDebug("Using converter {ConverterType} for {MimeType} {Extension}", - registration.Converter.GetType().Name, + converterName, guess.MimeType, guess.Extension); + activity?.SetTag("markitdown.converter", converterName); + activity?.SetTag("markitdown.detected.mime", guess.MimeType); + activity?.SetTag("markitdown.detected.extension", guess.Extension); + if (stream.CanSeek) { stream.Position = 0; } - return await registration.Converter.ConvertAsync(stream, guess, cancellationToken).ConfigureAwait(false); + var result = await registration.Converter.ConvertAsync(stream, guess, 
cancellationToken).ConfigureAwait(false); + RecordSuccess(guess); + activity?.SetStatus(ActivityStatusCode.Ok); + _logger?.LogInformation("Converted {Source} using {ConverterType}", source, converterName); + return result; } catch (Exception ex) { - _logger?.LogWarning(ex, "Converter {ConverterType} failed", registration.Converter.GetType().Name); + var converterName = registration.Converter.GetType().Name; + RecordFailure(converterName, guess); + _logger?.LogWarning(ex, "Converter {ConverterType} failed for {Source}", converterName, source); exceptions.Add(ex); } } } var message = $"No converter available for file type. MimeType: {streamInfo.MimeType}, Extension: {streamInfo.Extension}"; + activity?.SetStatus(ActivityStatusCode.Error, message); + _logger?.LogWarning("No converter could handle {Source} (MimeType: {MimeType}, Extension: {Extension})", source, streamInfo.MimeType, streamInfo.Extension); if (exceptions.Count > 0) { @@ -184,22 +220,64 @@ public async Task<DocumentConverterResult> ConvertFromUrlAsync(string url, Strea if (_httpClient is null) throw new InvalidOperationException("HTTP client is required for URL conversion. Provide one in the constructor."); - using var response = await _httpClient.GetAsync(url, cancellationToken); - response.EnsureSuccessStatusCode(); + using var activity = StartActivity(MarkItDownDiagnostics.ActivityNameConvertUrl); + activity?.SetTag("markitdown.url", url); - using var contentStream = await response.Content.ReadAsStreamAsync(cancellationToken); - var streamInfo = CreateStreamInfoFromUrl(url, response); - if (streamInfoOverride is not null) + Activity? 
downloadActivity = null; + try { - streamInfo = streamInfo.CopyWith(streamInfoOverride); - } + _logger?.LogInformation("Downloading {Url}", url); + downloadActivity = StartActivity(MarkItDownDiagnostics.ActivityNameDownload); + downloadActivity?.SetTag("http.url", url); + + using var response = await _httpClient.GetAsync(url, cancellationToken).ConfigureAwait(false); + downloadActivity?.SetTag("http.status_code", (int)response.StatusCode); + activity?.SetTag("http.status_code", (int)response.StatusCode); + response.EnsureSuccessStatusCode(); + + using var contentStream = await response.Content.ReadAsStreamAsync(cancellationToken).ConfigureAwait(false); + var streamInfo = CreateStreamInfoFromUrl(url, response); + if (streamInfoOverride is not null) + { + streamInfo = streamInfo.CopyWith(streamInfoOverride); + } - // Copy to memory stream to ensure we can seek - using var memoryStream = new MemoryStream(); - await contentStream.CopyToAsync(memoryStream, cancellationToken); - memoryStream.Position = 0; + if (!string.IsNullOrWhiteSpace(streamInfo.FileName)) + { + activity?.SetTag("markitdown.filename", streamInfo.FileName); + } - return await ConvertAsync(memoryStream, streamInfo, cancellationToken); + if (!string.IsNullOrWhiteSpace(streamInfo.MimeType)) + { + activity?.SetTag("markitdown.mime", streamInfo.MimeType); + } + + if (!string.IsNullOrWhiteSpace(streamInfo.Extension)) + { + activity?.SetTag("markitdown.extension", streamInfo.Extension); + } + + activity?.SetTag("content.length", response.Content.Headers.ContentLength ?? 
0); + + using var memoryStream = new MemoryStream(); + await contentStream.CopyToAsync(memoryStream, cancellationToken).ConfigureAwait(false); + memoryStream.Position = 0; + + downloadActivity?.SetStatus(ActivityStatusCode.Ok); + activity?.SetStatus(ActivityStatusCode.Ok); + return await ConvertAsync(memoryStream, streamInfo, cancellationToken).ConfigureAwait(false); + } + catch (Exception ex) + { + downloadActivity?.SetStatus(ActivityStatusCode.Error, ex.Message); + activity?.SetStatus(ActivityStatusCode.Error, ex.Message); + _logger?.LogError(ex, "Failed to download or convert {Url}", url); + throw; + } + finally + { + downloadActivity?.Dispose(); + } } /// <summary> @@ -394,6 +472,29 @@ private IEnumerable<IDocumentConverter> CreateBuiltInConverters() new BingSerpConverter(), new RssFeedConverter(), new JsonConverter(), + new MetaMdConverter(), + new DocBookConverter(), + new JatsConverter(), + new OpmlConverter(), + new Fb2Converter(), + new EndNoteXmlConverter(), + new BibTexConverter(), + new RisConverter(), + new CslJsonConverter(), + new OdtConverter(), + new RtfConverter(), + new LatexConverter(), + new RstConverter(), + new AsciiDocConverter(), + new OrgConverter(), + new DjotConverter(), + new TypstConverter(), + new TextileConverter(), + new WikiMarkupConverter(), + new MermaidConverter(), + new GraphvizConverter(), + new PlantUmlConverter(), + new TikzConverter(), new JupyterNotebookConverter(), new CsvConverter(), new EpubConverter(), @@ -422,6 +523,29 @@ private IEnumerable<IDocumentConverter> CreateZipInnerConverters(Func<IDocumentC new BingSerpConverter(), new RssFeedConverter(), new JsonConverter(), + new MetaMdConverter(), + new DocBookConverter(), + new JatsConverter(), + new OpmlConverter(), + new Fb2Converter(), + new EndNoteXmlConverter(), + new BibTexConverter(), + new RisConverter(), + new CslJsonConverter(), + new OdtConverter(), + new RtfConverter(), + new LatexConverter(), + new RstConverter(), + new AsciiDocConverter(), + new 
OrgConverter(), + new DjotConverter(), + new TypstConverter(), + new TextileConverter(), + new WikiMarkupConverter(), + new MermaidConverter(), + new GraphvizConverter(), + new PlantUmlConverter(), + new TikzConverter(), new JupyterNotebookConverter(), new CsvConverter(), new EmlConverter(), @@ -436,6 +560,85 @@ private IEnumerable<IDocumentConverter> CreateZipInnerConverters(Func<IDocumentC }; } + private Activity? StartActivity(string name, StreamInfo? streamInfo = null) + { + if (!_options.EnableTelemetry) + { + return null; + } + + var activity = _activitySource.StartActivity(name, ActivityKind.Internal); + if (activity is not null && streamInfo is not null) + { + if (!string.IsNullOrWhiteSpace(streamInfo.MimeType)) + { + activity.SetTag("markitdown.mime", streamInfo.MimeType); + } + + if (!string.IsNullOrWhiteSpace(streamInfo.Extension)) + { + activity.SetTag("markitdown.extension", streamInfo.Extension); + } + + if (!string.IsNullOrWhiteSpace(streamInfo.FileName)) + { + activity.SetTag("markitdown.filename", streamInfo.FileName); + } + + if (!string.IsNullOrWhiteSpace(streamInfo.Url)) + { + activity.SetTag("markitdown.url", streamInfo.Url); + } + } + + return activity; + } + + private void RecordSuccess(StreamInfo guess) + { + if (_conversionCounter is null) + { + return; + } + + _conversionCounter.Add(1, + new KeyValuePair<string, object?>("markitdown.mime", guess.MimeType ?? string.Empty), + new KeyValuePair<string, object?>("markitdown.extension", guess.Extension ?? string.Empty)); + } + + private void RecordFailure(string converterName, StreamInfo guess) + { + if (_conversionFailureCounter is null) + { + return; + } + + _conversionFailureCounter.Add(1, + new KeyValuePair<string, object?>("markitdown.converter", converterName), + new KeyValuePair<string, object?>("markitdown.mime", guess.MimeType ?? string.Empty), + new KeyValuePair<string, object?>("markitdown.extension", guess.Extension ?? 
string.Empty)); + } + + private static string DescribeSource(StreamInfo streamInfo) + { + if (!string.IsNullOrWhiteSpace(streamInfo.FileName)) + { + return streamInfo.FileName!; + } + + if (!string.IsNullOrWhiteSpace(streamInfo.LocalPath)) + { + return streamInfo.LocalPath!; + } + + if (!string.IsNullOrWhiteSpace(streamInfo.Url)) + { + return streamInfo.Url!; + } + + return "stream"; + } + private async Task<DocumentConverterResult> ConvertFileInternalAsync(string filePath, StreamInfo? overrides, CancellationToken cancellationToken) { if (!File.Exists(filePath)) @@ -443,6 +646,10 @@ private async Task<DocumentConverterResult> ConvertFileInternalAsync(string file throw new FileNotFoundException($"File not found: {filePath}"); } + using var activity = StartActivity(MarkItDownDiagnostics.ActivityNameConvertFile); + activity?.SetTag("markitdown.path", filePath); + _logger?.LogInformation("Converting file {FilePath}", filePath); + using var fileStream = File.OpenRead(filePath); var streamInfo = CreateStreamInfoFromFile(filePath); if (overrides is not null) @@ -450,7 +657,17 @@ private async Task<DocumentConverterResult> ConvertFileInternalAsync(string file streamInfo = streamInfo.CopyWith(overrides); } - return await ConvertAsync(fileStream, streamInfo, cancellationToken).ConfigureAwait(false); + try + { + var result = await ConvertAsync(fileStream, streamInfo, cancellationToken).ConfigureAwait(false); + activity?.SetStatus(ActivityStatusCode.Ok); + return result; + } + catch (Exception ex) + { + activity?.SetStatus(ActivityStatusCode.Error, ex.Message); + throw; + } } private static StreamInfo CreateStreamInfoFromFile(string filePath) @@ -492,41 +709,19 @@ private static StreamInfo CreateStreamInfoFromUrl(string url, HttpResponseMessag } private static string? GetMimeTypeFromExtension(string? 
extension) + { + if (string.IsNullOrWhiteSpace(extension)) { - return extension?.ToLowerInvariant() switch - { - ".txt" => "text/plain", - ".md" => "text/markdown", - ".markdown" => "text/markdown", - ".html" => "text/html", - ".htm" => "text/html", - ".json" => "application/json", - ".jsonl" => "application/json", - ".ndjson" => "application/json", - ".ipynb" => "application/x-ipynb+json", - ".xml" => "application/xml", - ".xsd" => "application/xml", - ".xsl" => "application/xml", - ".xslt" => "application/xml", - ".rss" => "application/rss+xml", - ".atom" => "application/atom+xml", - ".csv" => "text/csv", - ".zip" => "application/zip", - ".epub" => "application/epub+zip", - ".pdf" => "application/pdf", - ".docx" => "application/vnd.openxmlformats-officedocument.wordprocessingml.document", - ".xlsx" => "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", - ".pptx" => "application/vnd.openxmlformats-officedocument.presentationml.presentation", - ".jpg" => "image/jpeg", - ".jpeg" => "image/jpeg", - ".png" => "image/png", - ".gif" => "image/gif", - ".bmp" => "image/bmp", - ".tiff" => "image/tiff", - ".tif" => "image/tiff", - ".webp" => "image/webp", - _ => MimeMapping.GetMimeType(extension) - }; + return null; + } + + var mime = MimeMapping.GetMimeType(extension); + if (!string.IsNullOrWhiteSpace(mime)) + { + return mime; + } + + return MimeHelper.GetMimeType(extension); } private static Encoding? TryGetEncoding(string? 
charset) diff --git a/src/MarkItDown/MarkItDownDiagnostics.cs b/src/MarkItDown/MarkItDownDiagnostics.cs new file mode 100644 index 000000000..087bce1b7 --- /dev/null +++ b/src/MarkItDown/MarkItDownDiagnostics.cs @@ -0,0 +1,47 @@ +using System.Diagnostics; +using System.Diagnostics.Metrics; +using System.Runtime.CompilerServices; + +namespace MarkItDown; + +internal static class MarkItDownDiagnostics +{ + public const string ActivitySourceName = "ManagedCode.MarkItDown"; + private const string Version = "1.0.0"; + + public const string ActivityNameConvertStream = "markitdown.convert.stream"; + public const string ActivityNameConvertFile = "markitdown.convert.file"; + public const string ActivityNameConvertUrl = "markitdown.convert.url"; + public const string ActivityNameDownload = "markitdown.download"; + + public static ActivitySource DefaultActivitySource { get; } = new(ActivitySourceName, Version); + public static Meter DefaultMeter { get; } = new(ActivitySourceName, Version); + + public static Counter<long> ConversionsCounter { get; } = DefaultMeter.CreateCounter<long>("markitdown.conversions"); + public static Counter<long> ConversionFailuresCounter { get; } = DefaultMeter.CreateCounter<long>("markitdown.conversion.failures"); + + private static readonly ConditionalWeakTable<Meter, MeterCounters> CachedCounters = new(); + + public static (Counter<long> Success, Counter<long> Failure) ResolveCounters(Meter? 
meter) + { + if (meter is null) + { + return (ConversionsCounter, ConversionFailuresCounter); + } + + var counters = CachedCounters.GetValue(meter, static m => new MeterCounters(m)); + return (counters.Success, counters.Failure); + } + + private sealed class MeterCounters + { + public MeterCounters(Meter meter) + { + Success = meter.CreateCounter<long>("markitdown.conversions"); + Failure = meter.CreateCounter<long>("markitdown.conversion.failures"); + } + + public Counter<long> Success { get; } + public Counter<long> Failure { get; } + } +} diff --git a/src/MarkItDown/MarkItDownOptions.cs b/src/MarkItDown/MarkItDownOptions.cs index c1c5c3b59..f6cd2c941 100644 --- a/src/MarkItDown/MarkItDownOptions.cs +++ b/src/MarkItDown/MarkItDownOptions.cs @@ -1,14 +1,17 @@ using System; using System.Collections.Generic; +using System.Diagnostics; +using System.Diagnostics.Metrics; using MarkItDown.Intelligence; using MarkItDown.Intelligence.Providers.Aws; using MarkItDown.Intelligence.Providers.Azure; using MarkItDown.Intelligence.Providers.Google; +using Microsoft.Extensions.Logging; namespace MarkItDown; /// <summary> -/// Configurable options for <see cref="MarkItDown"/> that mirror the flexibility of the Python implementation. +/// Configurable options for <see cref="MarkItDownClient"/> that mirror the flexibility of the Python implementation. /// </summary> public sealed record MarkItDownOptions { @@ -93,4 +96,24 @@ public sealed record MarkItDownOptions /// Gets or sets a value indicating whether AI-based image enrichment should be enabled when a chat client is present. /// </summary> public bool EnableAiImageEnrichment { get; init; } = true; + + /// <summary> + /// Gets or sets a value indicating whether OpenTelemetry instrumentation should be emitted. Defaults to <see langword="true"/>. + /// </summary> + public bool EnableTelemetry { get; init; } = true; + + /// <summary> + /// Optional <see cref="ActivitySource"/> to use when creating telemetry spans. 
When <see langword="null"/> a shared source is used. + /// </summary> + public ActivitySource? ActivitySource { get; init; } + + /// <summary> + /// Optional <see cref="Meter"/> used to emit metric counters. When <see langword="null"/> a shared meter is used. + /// </summary> + public Meter? Meter { get; init; } + + /// <summary> + /// Optional logger factory used when explicit loggers are not supplied to <see cref="MarkItDownClient"/>. + /// </summary> + public ILoggerFactory? LoggerFactory { get; init; } } diff --git a/src/MarkItDown/MimeMapping.cs b/src/MarkItDown/MimeMapping.cs index 67b613dbb..2508455f5 100644 --- a/src/MarkItDown/MimeMapping.cs +++ b/src/MarkItDown/MimeMapping.cs @@ -1,45 +1,92 @@ namespace MarkItDown; using System.Linq; +using ManagedCode.MimeTypes; internal static class MimeMapping -{ +{ + private const string OctetStream = "application/octet-stream"; + private static readonly Dictionary<string, string> ExtensionToMime = new(StringComparer.OrdinalIgnoreCase) { - [".txt"] = "text/plain", - [".md"] = "text/markdown", - [".markdown"] = "text/markdown", - [".html"] = "text/html", - [".htm"] = "text/html", - [".xhtml"] = "application/xhtml+xml", - [".json"] = "application/json", - [".jsonl"] = "application/json", - [".ndjson"] = "application/json", - [".ipynb"] = "application/x-ipynb+json", - [".xml"] = "application/xml", - [".rss"] = "application/rss+xml", - [".atom"] = "application/atom+xml", - [".csv"] = "text/csv", - [".zip"] = "application/zip", - [".epub"] = "application/epub+zip", - [".pdf"] = "application/pdf", - [".docx"] = "application/vnd.openxmlformats-officedocument.wordprocessingml.document", - [".xlsx"] = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", - [".pptx"] = "application/vnd.openxmlformats-officedocument.presentationml.presentation", - [".jpg"] = "image/jpeg", - [".jpeg"] = "image/jpeg", - [".png"] = "image/png", - [".gif"] = "image/gif", - [".bmp"] = "image/bmp", - [".tiff"] = "image/tiff", - [".tif"] 
= "image/tiff", - [".webp"] = "image/webp", - [".wav"] = "audio/x-wav", - [".mp3"] = "audio/mpeg", - [".m4a"] = "audio/mp4", - [".mp4"] = "video/mp4", - [".msg"] = "application/vnd.ms-outlook", - [".eml"] = "message/rfc822", + [".adoc"] = GetMimeOrDefault(".adoc", MimeTypeUtilities.Compose(MimeHelper.TEXT, "asciidoc")), + [".asciidoc"] = GetMimeOrDefault(".asciidoc", MimeTypeUtilities.Compose(MimeHelper.TEXT, "asciidoc")), + [".atom"] = MimeHelper.ATOM, + [".bib"] = GetMimeOrDefault(".bib", MimeTypeUtilities.Compose(MimeHelper.TEXT, "x-bibtex")), + [".bibtex"] = GetMimeOrDefault(".bibtex", MimeTypeUtilities.Compose(MimeHelper.TEXT, "x-bibtex")), + [".bits"] = MimeTypeUtilities.WithSubtype(MimeHelper.XML, "bits+xml"), + [".bmp"] = MimeHelper.BMP, + [".metamd"] = GetMimeOrDefault(".metamd", MimeTypeUtilities.Compose(MimeHelper.TEXT, "x-metamd")), + [".creole"] = MimeTypeUtilities.Compose(MimeHelper.TEXT, "x-creole"), + [".csv"] = MimeHelper.CSV, + [".csljson"] = GetMimeOrDefault(".csljson", MimeTypeUtilities.Compose(MimeHelper.APPLICATION, "vnd.citationstyles.csl+json")), + [".dbk"] = MimeTypeUtilities.WithSubtype(MimeHelper.XML, "docbook+xml"), + [".dj"] = MimeTypeUtilities.Compose(MimeHelper.TEXT, "x-djot"), + [".djot"] = MimeTypeUtilities.Compose(MimeHelper.TEXT, "x-djot"), + [".docbook"] = MimeTypeUtilities.WithSubtype(MimeHelper.XML, "docbook+xml"), + [".docx"] = MimeHelper.DOCX, + [".dot"] = MimeTypeUtilities.Compose(MimeHelper.TEXT, "vnd.graphviz"), + [".dokuwiki"] = MimeTypeUtilities.Compose(MimeHelper.TEXT, "x-dokuwiki"), + [".eml"] = MimeHelper.EML, + [".endnote"] = GetMimeOrDefault(".endnote", MimeTypeUtilities.Compose(MimeHelper.APPLICATION, "x-endnote-refer")), + [".enl"] = GetMimeOrDefault(".enl", MimeTypeUtilities.Compose(MimeHelper.APPLICATION, "x-endnote-refer")), + [".epub"] = MimeHelper.EPUB, + [".fb2"] = MimeTypeUtilities.Compose(MimeHelper.APPLICATION, "fb2+zip"), + [".gif"] = MimeHelper.GIF, + [".gv"] = MimeTypeUtilities.Compose(MimeHelper.TEXT, 
"vnd.graphviz"), + [".htm"] = MimeHelper.HTML, + [".html"] = MimeHelper.HTML, + [".ipynb"] = GetMimeOrDefault(".ipynb", MimeTypeUtilities.Compose(MimeHelper.APPLICATION, "x-ipynb+json")), + [".jats"] = MimeTypeUtilities.WithSubtype(MimeHelper.XML, "jats+xml"), + [".jpg"] = MimeHelper.JPEG, + [".jpeg"] = MimeHelper.JPEG, + [".json"] = MimeHelper.JSON, + [".jsonl"] = MimeHelper.JSON, + [".latex"] = MimeTypeUtilities.Compose(MimeHelper.TEXT, "x-latex"), + [".markdown"] = MimeHelper.MARKDOWN, + [".md"] = MimeHelper.MARKDOWN, + [".mediawiki"] = MimeTypeUtilities.Compose(MimeHelper.TEXT, "x-mediawiki"), + [".mermaid"] = MimeTypeUtilities.Compose(MimeHelper.TEXT, "x-mermaid"), + [".mmd"] = MimeTypeUtilities.Compose(MimeHelper.TEXT, "x-mermaid"), + [".m4a"] = MimeHelper.MP4A, + [".msg"] = MimeHelper.MSG, + [".mp3"] = MimeHelper.MP3, + [".mp4"] = MimeHelper.MP4, + [".ndjson"] = MimeHelper.JSON, + [".odt"] = GetMimeOrDefault(".odt", MimeTypeUtilities.Compose(MimeHelper.APPLICATION, "vnd.oasis.opendocument.text")), + [".opml"] = GetMimeOrDefault(".opml", MimeTypeUtilities.Compose(MimeHelper.TEXT, "x-opml")), + [".org"] = MimeTypeUtilities.Compose(MimeHelper.TEXT, "x-org"), + [".pdf"] = MimeHelper.PDF, + [".plantuml"] = MimeTypeUtilities.Compose(MimeHelper.TEXT, "x-plantuml"), + [".png"] = MimeHelper.PNG, + [".pptx"] = MimeHelper.PPTX, + [".puml"] = MimeTypeUtilities.Compose(MimeHelper.TEXT, "x-plantuml"), + [".rest"] = MimeTypeUtilities.Compose(MimeHelper.TEXT, "prs.fallenstein.rst"), + [".ris"] = GetMimeOrDefault(".ris", MimeTypeUtilities.Compose(MimeHelper.APPLICATION, "x-research-info-systems")), + [".rst"] = MimeTypeUtilities.Compose(MimeHelper.TEXT, "prs.fallenstein.rst"), + [".rss"] = MimeHelper.RSS, + [".rtf"] = MimeHelper.RTF, + [".tab"] = MimeTypeUtilities.WithSubtype(MimeHelper.CSV, "tab-separated-values"), + [".tex"] = MimeTypeUtilities.Compose(MimeHelper.TEXT, "x-tex"), + [".textile"] = MimeTypeUtilities.Compose(MimeHelper.TEXT, "x-textile"), + [".tif"] = 
MimeHelper.TIFF, + [".tiff"] = MimeHelper.TIFF, + [".tikz"] = MimeTypeUtilities.Compose(MimeHelper.TEXT, "x-tikz"), + [".tsv"] = MimeTypeUtilities.WithSubtype(MimeHelper.CSV, "tab-separated-values"), + [".typ"] = MimeTypeUtilities.Compose(MimeHelper.TEXT, "x-typst"), + [".typst"] = MimeTypeUtilities.Compose(MimeHelper.TEXT, "x-typst"), + [".txt"] = MimeHelper.TEXT, + [".wav"] = MimeHelper.WAV, + [".webp"] = MimeHelper.WEBP, + [".wiki"] = MimeTypeUtilities.Compose(MimeHelper.TEXT, "x-wiki"), + [".wsd"] = MimeTypeUtilities.Compose(MimeHelper.TEXT, "x-plantuml"), + [".xhtml"] = MimeHelper.XHTML, + [".xlsx"] = MimeHelper.XLSX, + [".xml"] = MimeHelper.XML, + [".xsl"] = MimeHelper.XML, + [".xsd"] = MimeHelper.XML, + [".xslt"] = MimeHelper.XML, + [".zip"] = MimeHelper.ZIP, }; private static readonly Dictionary<string, string> MimeToExtension = ExtensionToMime @@ -66,4 +113,12 @@ internal static class MimeMapping return MimeToExtension.TryGetValue(mimeType, out var extension) ? extension : null; } + + private static string GetMimeOrDefault(string extension, string fallbackMime) + { + var mime = MimeHelper.GetMimeType(extension); + return string.IsNullOrWhiteSpace(mime) || string.Equals(mime, OctetStream, StringComparison.OrdinalIgnoreCase) + ? fallbackMime + : mime; + } } diff --git a/src/MarkItDown/MimeTypeUtilities.cs b/src/MarkItDown/MimeTypeUtilities.cs index ba6cf020b..d7fc054b1 100644 --- a/src/MarkItDown/MimeTypeUtilities.cs +++ b/src/MarkItDown/MimeTypeUtilities.cs @@ -47,6 +47,11 @@ public static bool MatchesAny(string? 
candidate, IEnumerable<string> acceptedPre return false; } + if (string.Equals(candidate, OctetStream, StringComparison.OrdinalIgnoreCase)) + { + return false; + } + foreach (var prefix in acceptedPrefixes) { if (candidate.StartsWith(prefix, StringComparison.OrdinalIgnoreCase)) diff --git a/tests/MarkItDown.Tests/EmlConverterTests.cs b/tests/MarkItDown.Tests/EmlConverterTests.cs index 133cf0865..b08f775b8 100644 --- a/tests/MarkItDown.Tests/EmlConverterTests.cs +++ b/tests/MarkItDown.Tests/EmlConverterTests.cs @@ -141,7 +141,7 @@ public async Task ConvertAsync_EmailWithHtmlContent_ConvertsHtmlToMarkdown() public async Task MarkItDown_ConvertAsync_EmlFile_WorksEndToEnd() { // Arrange - var markItDown = new global::MarkItDown.MarkItDown(); + var markItDown = new global::MarkItDown.MarkItDownClient(); var bytes = Encoding.UTF8.GetBytes(SampleEmail); using var stream = new MemoryStream(bytes); var streamInfo = new StreamInfo(mimeType: "message/rfc822", extension: ".eml"); diff --git a/tests/MarkItDown.Tests/MarkItDownClientTelemetryTests.cs b/tests/MarkItDown.Tests/MarkItDownClientTelemetryTests.cs new file mode 100644 index 000000000..8ba38f145 --- /dev/null +++ b/tests/MarkItDown.Tests/MarkItDownClientTelemetryTests.cs @@ -0,0 +1,158 @@ +using System.Collections.Generic; +using System.Diagnostics; +using System.Diagnostics.Metrics; +using System.IO; +using System.Linq; +using System.Text; +using System.Threading; +using System.Threading.Tasks; +using MarkItDown; +using MarkItDown.Converters; +using Microsoft.Extensions.Logging; +using Shouldly; +using Xunit; + +namespace MarkItDown.Tests; + +public sealed class MarkItDownClientTelemetryTests +{ + [Fact] + public async Task ConvertAsync_EmitsActivityAndMetrics() + { + using var activitySource = new ActivitySource("Test.MarkItDown", "1.0"); + using var meter = new Meter("Test.MarkItDown", "1.0"); + var logger = new TestLogger(); + var options = new MarkItDownOptions + { + EnableBuiltins = false, + ActivitySource = 
activitySource, + Meter = meter + }; + + var client = new MarkItDownClient(options, logger); + client.RegisterConverter(new FakeConverter()); + + using var stream = new MemoryStream(Encoding.UTF8.GetBytes("hello")); + var info = new StreamInfo("text/plain", ".txt", fileName: "sample.txt"); + + var activities = new List<Activity>(); + using var listener = new ActivityListener + { + ShouldListenTo = source => source.Name == activitySource.Name, + Sample = (ref ActivityCreationOptions<ActivityContext> _) => ActivitySamplingResult.AllData, + ActivityStarted = activity => activities.Add(activity) + }; + ActivitySource.AddActivityListener(listener); + + var measurements = new List<long>(); + using var meterListener = new MeterListener(); + meterListener.InstrumentPublished = (instrument, l) => + { + if (instrument.Meter == meter && (instrument.Name == "markitdown.conversions" || instrument.Name == "markitdown.conversion.failures")) + { + l.EnableMeasurementEvents(instrument); + } + }; + meterListener.SetMeasurementEventCallback<long>((instrument, measurement, tags, state) => + { + if (instrument.Name == "markitdown.conversions") + { + measurements.Add(measurement); + } + }); + meterListener.Start(); + + var result = await client.ConvertAsync(stream, info); + result.Markdown.ShouldBe("ok"); + + activities.ShouldNotBeEmpty(); + activities.Any(a => a.OperationName == MarkItDownDiagnostics.ActivityNameConvertStream).ShouldBeTrue(); + measurements.ShouldContain(1); + } + + [Fact] + public async Task ConvertAsync_EmitsStructuredLogs() + { + var logger = new TestLogger(); + var options = new MarkItDownOptions + { + EnableBuiltins = false, + LoggerFactory = new TestLoggerFactory(logger) + }; + + var client = new MarkItDownClient(options); + client.RegisterConverter(new FakeConverter()); + + using var stream = new MemoryStream(Encoding.UTF8.GetBytes("hello")); + var info = new StreamInfo("text/plain", ".txt", fileName: "structured.txt"); + + await client.ConvertAsync(stream, 
info); + + logger.Entries.Any(entry => entry.Level == LogLevel.Information && entry.Message.Contains("Converted") && entry.Properties?.Any(p => p.Key == "Source" && (string?)p.Value == "structured.txt") == true) + .ShouldBeTrue(); + } + + private sealed class FakeConverter : IDocumentConverter + { + public int Priority => 0; + + public bool Accepts(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken = default) => true; + + public bool AcceptsInput(StreamInfo streamInfo) => true; + + public Task<DocumentConverterResult> ConvertAsync(Stream stream, StreamInfo streamInfo, CancellationToken cancellationToken = default) + { + return Task.FromResult(new DocumentConverterResult("ok", streamInfo.FileName)); + } + } + + private sealed class TestLoggerFactory : ILoggerFactory + { + private readonly ILogger logger; + + public TestLoggerFactory(ILogger logger) + { + this.logger = logger; + } + + public void AddProvider(ILoggerProvider provider) + { + } + + public ILogger CreateLogger(string categoryName) => logger; + + public void Dispose() + { + } + } + + private sealed class TestLogger : ILogger + { + private sealed class NullScope : IDisposable + { + public static NullScope Instance { get; } = new NullScope(); + public void Dispose() + { + } + } + + public List<LogEntry> Entries { get; } = new(); + + public IDisposable BeginScope<TState>(TState state) where TState : notnull => NullScope.Instance; + + public bool IsEnabled(LogLevel logLevel) => true; + + public void Log<TState>(LogLevel logLevel, EventId eventId, TState state, Exception? exception, Func<TState, Exception?, string> formatter) + { + IReadOnlyList<KeyValuePair<string, object?>>? 
properties = null; + if (state is IReadOnlyList<KeyValuePair<string, object?>> structured) + { + properties = structured; + } + + Entries.Add(new LogEntry(logLevel, formatter(state, exception), properties)); + } + + public sealed record LogEntry(LogLevel Level, string Message, IReadOnlyList<KeyValuePair<string, object?>>? Properties); + } +} diff --git a/tests/MarkItDown.Tests/MarkItDownIntegrationTests.cs b/tests/MarkItDown.Tests/MarkItDownIntegrationTests.cs index 932fec59c..171572a75 100644 --- a/tests/MarkItDown.Tests/MarkItDownIntegrationTests.cs +++ b/tests/MarkItDown.Tests/MarkItDownIntegrationTests.cs @@ -23,7 +23,7 @@ public class MarkItDownIntegrationTests public async Task ConvertAsync_WithValidFile_ReturnsSuccess() { // Arrange - var markItDown = new global::MarkItDown.MarkItDown(); + var markItDown = new global::MarkItDown.MarkItDownClient(); var htmlContent = "<html><body><h1>Test Header</h1><p>Test content</p></body></html>"; var bytes = Encoding.UTF8.GetBytes(htmlContent); @@ -44,7 +44,7 @@ public async Task ConvertAsync_WithValidFile_ReturnsSuccess() public async Task ConvertAsync_WithCancellationToken_RespectsToken() { // Arrange - var markItDown = new global::MarkItDown.MarkItDown(); + var markItDown = new global::MarkItDown.MarkItDownClient(); var content = "Simple text content for testing"; var bytes = Encoding.UTF8.GetBytes(content); var cts = new CancellationTokenSource(); @@ -72,7 +72,7 @@ ufe.InnerException is AggregateException ae && public async Task ConvertAsync_WithLargeContent_ProcessesCorrectly() { // Arrange - var markItDown = new global::MarkItDown.MarkItDown(); + var markItDown = new global::MarkItDown.MarkItDownClient(); var largeContent = new StringBuilder(); // Create a large HTML document @@ -105,7 +105,7 @@ public async Task ConvertAsync_WithLargeContent_ProcessesCorrectly() public async Task ConvertAsync_StreamNotSeekable_HandlesCorrectly() { // Arrange - var markItDown = new global::MarkItDown.MarkItDown(); + var markItDown 
= new global::MarkItDown.MarkItDownClient(); var content = "Test content for non-seekable stream"; var bytes = Encoding.UTF8.GetBytes(content); @@ -121,7 +121,7 @@ await Assert.ThrowsAsync<ArgumentException>( public async Task ConvertAsync_EmptyStream_ReturnsEmptyResult() { // Arrange - var markItDown = new global::MarkItDown.MarkItDown(); + var markItDown = new global::MarkItDown.MarkItDownClient(); using var stream = new MemoryStream(); var streamInfo = new StreamInfo(extension: ".txt"); @@ -137,7 +137,7 @@ public async Task ConvertAsync_EmptyStream_ReturnsEmptyResult() public async Task ConvertAsync_BinaryContent_ThrowsUnsupportedFormatException() { // Arrange - var markItDown = new global::MarkItDown.MarkItDown(); + var markItDown = new global::MarkItDown.MarkItDownClient(); var binaryData = new byte[] { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 }; // Random binary data using var stream = new MemoryStream(binaryData); @@ -152,7 +152,7 @@ await Assert.ThrowsAsync<UnsupportedFormatException>( public async Task ConvertAsync_JsonContent_ReturnsFormattedOutput() { // Arrange - var markItDown = new global::MarkItDown.MarkItDown(); + var markItDown = new global::MarkItDown.MarkItDownClient(); var jsonContent = "{\"name\": \"test\", \"value\": 123, \"nested\": {\"key\": \"value\"}}"; var bytes = Encoding.UTF8.GetBytes(jsonContent); @@ -175,7 +175,7 @@ public async Task ConvertAsync_JsonContent_ReturnsFormattedOutput() public async Task ConvertAsync_MarkdownContent_ReturnsAsIs() { // Arrange - var markItDown = new global::MarkItDown.MarkItDown(); + var markItDown = new global::MarkItDown.MarkItDownClient(); var markdownContent = "# Header\n\nThis is **bold** and *italic* text.\n\n- List item 1\n- List item 2\n"; var bytes = Encoding.UTF8.GetBytes(markdownContent); @@ -194,7 +194,7 @@ public async Task ConvertAsync_MarkdownContent_ReturnsAsIs() public async Task ConvertAsync_ComplexHtmlWithTables_ConvertsCorrectly() { // Arrange - var markItDown = new 
global::MarkItDown.MarkItDown(); + var markItDown = new global::MarkItDown.MarkItDownClient(); var htmlContent = @" <html> <body> @@ -237,7 +237,7 @@ public async Task ConvertAsync_ComplexHtmlWithTables_ConvertsCorrectly() [MemberData(nameof(GeneralVectors))] public async Task Convert_FilePath_VectorExpectations(FileTestVector vector) { - var markItDown = new global::MarkItDown.MarkItDown(); + var markItDown = new global::MarkItDown.MarkItDownClient(); var path = TestAssetLoader.GetAssetPath(vector.FileName); var result = await markItDown.ConvertAsync(path); AssertVectorOutput(vector, result); @@ -247,7 +247,7 @@ public async Task Convert_FilePath_VectorExpectations(FileTestVector vector) [MemberData(nameof(GeneralVectors))] public async Task Convert_StreamWithHints_VectorExpectations(FileTestVector vector) { - var markItDown = new global::MarkItDown.MarkItDown(); + var markItDown = new global::MarkItDown.MarkItDownClient(); var path = TestAssetLoader.GetAssetPath(vector.FileName); await using var stream = File.OpenRead(path); var streamInfo = new StreamInfo( @@ -268,7 +268,7 @@ public async Task Convert_StreamWithoutHints_VectorExpectations(FileTestVector v return; } - var markItDown = new global::MarkItDown.MarkItDown(); + var markItDown = new global::MarkItDown.MarkItDownClient(); var path = TestAssetLoader.GetAssetPath(vector.FileName); await using var stream = File.OpenRead(path); @@ -280,7 +280,7 @@ public async Task Convert_StreamWithoutHints_VectorExpectations(FileTestVector v [MemberData(nameof(GeneralVectors))] public async Task Convert_FileUri_VectorExpectations(FileTestVector vector) { - var markItDown = new global::MarkItDown.MarkItDown(); + var markItDown = new global::MarkItDown.MarkItDownClient(); var uri = new Uri(TestAssetLoader.GetAssetPath(vector.FileName)).AbsoluteUri; var result = await markItDown.ConvertUriAsync(uri, streamInfo: null); AssertVectorOutput(vector, result); @@ -295,7 +295,7 @@ public async Task 
Convert_DataUri_VectorExpectations(FileTestVector vector) return; } - var markItDown = new global::MarkItDown.MarkItDown(); + var markItDown = new global::MarkItDown.MarkItDownClient(); var bytes = await File.ReadAllBytesAsync(TestAssetLoader.GetAssetPath(vector.FileName)); var base64 = Convert.ToBase64String(bytes); var mime = vector.MimeType ?? "application/octet-stream"; @@ -309,7 +309,7 @@ public async Task Convert_DataUri_VectorExpectations(FileTestVector vector) public async Task ConvertAsync_MultipleConcurrentCalls_HandlesCorrectly() { // Arrange - var markItDown = new global::MarkItDown.MarkItDown(); + var markItDown = new global::MarkItDown.MarkItDownClient(); var tasks = new Task<DocumentConverterResult>[10]; for (int i = 0; i < 10; i++) @@ -338,7 +338,7 @@ public async Task ConvertAsync_MultipleConcurrentCalls_HandlesCorrectly() public void RegisterConverter_CustomConverter_AddsToList() { // Arrange - var markItDown = new global::MarkItDown.MarkItDown(); + var markItDown = new global::MarkItDown.MarkItDownClient(); var customConverter = new TestCustomConverter(); // Act @@ -354,7 +354,7 @@ public void RegisterConverter_CustomConverter_AddsToList() public void RegisterConverter_NullConverter_ThrowsArgumentNullException() { // Arrange - var markItDown = new global::MarkItDown.MarkItDown(); + var markItDown = new global::MarkItDown.MarkItDownClient(); // Act & Assert Assert.Throws<ArgumentNullException>(() => markItDown.RegisterConverter(null!)); @@ -417,7 +417,7 @@ public Task<DocumentConverterResult> ConvertAsync(Stream stream, StreamInfo stre public async Task Convert_FilePath_UnsupportedThrows(string fileName, StreamInfo streamInfo) { Assert.NotNull(streamInfo); - var markItDown = new global::MarkItDown.MarkItDown(); + var markItDown = new global::MarkItDown.MarkItDownClient(); var path = TestAssetLoader.GetAssetPath(fileName); await Assert.ThrowsAsync<UnsupportedFormatException>(() => markItDown.ConvertAsync(path)); @@ -427,7 +427,7 @@ public async 
Task Convert_FilePath_UnsupportedThrows(string fileName, StreamInfo [MemberData(nameof(UnsupportedVectors))] public async Task Convert_StreamWithHints_UnsupportedThrows(string fileName, StreamInfo streamInfo) { - var markItDown = new global::MarkItDown.MarkItDown(); + var markItDown = new global::MarkItDown.MarkItDownClient(); await using var stream = TestAssetLoader.OpenAsset(fileName); await Assert.ThrowsAsync<UnsupportedFormatException>(() => markItDown.ConvertAsync(stream, streamInfo)); @@ -438,7 +438,7 @@ public async Task Convert_StreamWithHints_UnsupportedThrows(string fileName, Str public async Task Convert_StreamWithoutHints_UnsupportedThrows(string fileName, StreamInfo streamInfo) { Assert.NotNull(streamInfo); - var markItDown = new global::MarkItDown.MarkItDown(); + var markItDown = new global::MarkItDown.MarkItDownClient(); await using var stream = TestAssetLoader.OpenAsset(fileName); await Assert.ThrowsAsync<UnsupportedFormatException>(() => markItDown.ConvertAsync(stream, new StreamInfo())); @@ -448,7 +448,7 @@ public async Task Convert_StreamWithoutHints_UnsupportedThrows(string fileName, [MemberData(nameof(UnsupportedVectors))] public async Task Convert_DataUri_UnsupportedThrows(string fileName, StreamInfo streamInfo) { - var markItDown = new global::MarkItDown.MarkItDown(); + var markItDown = new global::MarkItDown.MarkItDownClient(); var bytes = await File.ReadAllBytesAsync(TestAssetLoader.GetAssetPath(fileName)); var dataUri = $"data:{streamInfo.MimeType ?? 
"application/octet-stream"};base64,{Convert.ToBase64String(bytes)}"; diff --git a/tests/MarkItDown.Tests/MarkItDownTests.cs b/tests/MarkItDown.Tests/MarkItDownTests.cs index ca4384605..a25e9309e 100644 --- a/tests/MarkItDown.Tests/MarkItDownTests.cs +++ b/tests/MarkItDown.Tests/MarkItDownTests.cs @@ -9,7 +9,7 @@ public class MarkItDownTests public async Task ConvertAsync_PlainTextFile_ReturnsCorrectMarkdown() { // Arrange - var markItDown = new global::MarkItDown.MarkItDown(); + var markItDown = new global::MarkItDown.MarkItDownClient(); var content = "This is a test file.\nWith multiple lines.\n"; var bytes = Encoding.UTF8.GetBytes(content); using var stream = new MemoryStream(bytes); @@ -26,7 +26,7 @@ public async Task ConvertAsync_PlainTextFile_ReturnsCorrectMarkdown() public async Task ConvertAsync_MarkdownFile_ReturnsCorrectMarkdown() { // Arrange - var markItDown = new global::MarkItDown.MarkItDown(); + var markItDown = new global::MarkItDown.MarkItDownClient(); var content = "# Header\n\nThis is **bold** text.\n"; var bytes = Encoding.UTF8.GetBytes(content); using var stream = new MemoryStream(bytes); @@ -43,7 +43,7 @@ public async Task ConvertAsync_MarkdownFile_ReturnsCorrectMarkdown() public async Task ConvertAsync_UnsupportedFormat_ThrowsUnsupportedFormatException() { // Arrange - var markItDown = new global::MarkItDown.MarkItDown(); + var markItDown = new global::MarkItDown.MarkItDownClient(); using var stream = new MemoryStream([0x50, 0x4B, 0x03, 0x04]); // ZIP file signature var streamInfo = new StreamInfo(mimeType: "application/zip", extension: ".zip"); @@ -56,7 +56,7 @@ await Assert.ThrowsAsync<UnsupportedFormatException>( public async Task ConvertAsync_NonSeekableStream_ThrowsArgumentException() { // Arrange - var markItDown = new global::MarkItDown.MarkItDown(); + var markItDown = new global::MarkItDown.MarkItDownClient(); var nonSeekableStream = new NonSeekableMemoryStream([1, 2, 3]); var streamInfo = new StreamInfo(extension: ".txt"); @@ -69,7 
+69,7 @@ await Assert.ThrowsAsync<ArgumentException>( public void RegisterConverter_CustomConverter_AddsToConverterList() { // Arrange - var markItDown = new global::MarkItDown.MarkItDown(); + var markItDown = new global::MarkItDown.MarkItDownClient(); var customConverter = new TestConverter(); // Act @@ -86,7 +86,7 @@ public void RegisterConverter_CustomConverter_AddsToConverterList() public async Task ConvertAsync_FileNotFound_ThrowsFileNotFoundException() { // Arrange - var markItDown = new global::MarkItDown.MarkItDown(); + var markItDown = new global::MarkItDown.MarkItDownClient(); var nonExistentFile = "nonexistent.txt"; // Act & Assert diff --git a/tests/MarkItDown.Tests/NewConverterTests.cs b/tests/MarkItDown.Tests/NewConverterTests.cs index f87c883c3..aa9764fed 100644 --- a/tests/MarkItDown.Tests/NewConverterTests.cs +++ b/tests/MarkItDown.Tests/NewConverterTests.cs @@ -245,7 +245,7 @@ public void AllNewConverters_HaveValidPriorities() public void MarkItDown_RegistersNewConverters_CanHandleNewFormats(string extension, string mimeType) { // Arrange - var markItDown = new global::MarkItDown.MarkItDown(); + var markItDown = new global::MarkItDown.MarkItDownClient(); var converters = markItDown.GetRegisteredConverters(); // Act diff --git a/tests/MarkItDown.Tests/NewConvertersTests.cs b/tests/MarkItDown.Tests/NewConvertersTests.cs index a545e0a4f..f686bbff0 100644 --- a/tests/MarkItDown.Tests/NewConvertersTests.cs +++ b/tests/MarkItDown.Tests/NewConvertersTests.cs @@ -155,7 +155,7 @@ public void AllNewConverters_HaveCorrectPriorities() public void MarkItDown_RegistersNewConverters_CanHandleNewFormats(string extension, string mimeType) { // Arrange - var markItDown = new global::MarkItDown.MarkItDown(); + var markItDown = new global::MarkItDown.MarkItDownClient(); var registeredConverters = markItDown.GetRegisteredConverters(); // Act diff --git a/tests/MarkItDown.Tests/NewFormatsConverterTests.cs b/tests/MarkItDown.Tests/NewFormatsConverterTests.cs new file 
mode 100644 index 000000000..4763086f1 --- /dev/null +++ b/tests/MarkItDown.Tests/NewFormatsConverterTests.cs @@ -0,0 +1,227 @@ +using System; +using System.IO; +using System.Threading.Tasks; +using MarkItDown; +using Shouldly; +using Xunit; + +namespace MarkItDown.Tests; + +public sealed class NewFormatsConverterTests +{ + private const string SampleOdtBase64 = "UEsDBBQAAAAIAABbTFuvsRNW0gAAAJgBAAALAAAAY29udGVudC54bWyNkDFuwzAMRXefgtDupN0KwVaWImuHOgdQZToWIJOCJAfJ7SvZTtpm6kTw6z/qk83hOjm4YIiWqRWvuxcBSIZ7S+dWnLpj/SYOqmp4GKxB2bOZJ6RUG6aUK2SaolxfWzEHkqyjjZL0hFEmI9kj3Sn52y3LX9U2IOE1/Rcv3gVWFcA92Bf3t9L/KMW2KllbmBGWwnNylrB2eEGXVxbqU0/eIXy8d81+df4FvepGG8HroM9B+xEM53gwBJ5AU+EgLiN2G++3KPunLA9hjfton8+qqm9QSwECFAMUAAAACAAAW0xbr7ETVtIAAACYAQAACwAAAAAAAAAAAAAAgAEAAAAAY29udGVudC54bWxQSwUGAAAAAAEAAQA5AAAA+wAAAAAA"; + [Fact] + public async Task DocBookConverter_ProducesSectionHeadings() + { + var result = await ConvertAsync("sample.docbook"); + result.Markdown.ShouldContain("## Details"); + } + + [Fact] + public async Task JatsConverter_UsesArticleTitle() + { + var result = await ConvertAsync("sample.jats"); + result.Markdown.ShouldContain("Sample JATS Article"); + result.Markdown.ShouldContain("## Background"); + } + + [Fact] + public async Task OpmlConverter_RendersOutline() + { + var result = await ConvertAsync("sample.opml"); + result.Markdown.ShouldContain("- Parent"); + result.Markdown.ShouldContain("Child 1"); + } + + [Fact] + public async Task Fb2Converter_WritesSectionHeading() + { + var result = await ConvertAsync("sample.fb2"); + result.Markdown.ShouldContain("## Section Heading"); + result.Markdown.ShouldContain("FictionBook paragraphs"); + } + + [Fact] + public async Task OdtConverter_LoadsContentXml() + { + var markItDown = new MarkItDownClient(); + var bytes = Convert.FromBase64String(SampleOdtBase64); + var tempPath = Path.Combine(Path.GetTempPath(), $"markitdown-{Guid.NewGuid():N}.odt"); + await File.WriteAllBytesAsync(tempPath, bytes); + + try + { + var result = 
await markItDown.ConvertAsync(tempPath); + result.Markdown.ShouldContain("Sample ODT"); + result.Markdown.ShouldContain("paragraph comes from an ODT"); + } + finally + { + if (File.Exists(tempPath)) + { + File.Delete(tempPath); + } + } + } + + [Fact] + public async Task RtfConverter_ExtractsPlainText() + { + var result = await ConvertAsync("sample.rtf"); + result.Markdown.ShouldContain("Sample RTF Document"); + result.Markdown.ShouldContain("italics"); + } + + [Fact] + public async Task LatexConverter_ConvertsSections() + { + var result = await ConvertAsync("sample.tex"); + result.Markdown.ShouldContain("# Overview"); + result.Markdown.ShouldContain("- First item"); + } + + [Fact] + public async Task RstConverter_ConvertsHeadings() + { + var result = await ConvertAsync("sample.rst"); + result.Markdown.ShouldContain("# Sample RST Heading"); + result.Markdown.ShouldContain("`inline code`"); + } + + [Fact] + public async Task AsciiDocConverter_ConvertsHeadings() + { + var result = await ConvertAsync("sample.adoc"); + result.Markdown.ShouldContain("# Sample AsciiDoc"); + result.Markdown.ShouldContain("**bold**"); + } + + [Fact] + public async Task OrgConverter_ConvertsHeadings() + { + var result = await ConvertAsync("sample.org"); + result.Markdown.ShouldContain("# Sample Org"); + result.Markdown.ShouldContain("- First bullet"); + } + + [Fact] + public async Task DjotConverter_PassesThroughContent() + { + var result = await ConvertAsync("sample.dj"); + result.Markdown.ShouldContain("Sample Djot"); + } + + [Fact] + public async Task TypstConverter_ConvertsHeading() + { + var result = await ConvertAsync("sample.typ"); + result.Markdown.ShouldContain("# Sample Typst"); + result.Markdown.ShouldContain("- item one"); + } + + [Fact] + public async Task TextileConverter_ConvertsHeading() + { + var result = await ConvertAsync("sample.textile"); + result.Markdown.ShouldContain("# Sample Textile"); + result.Markdown.ShouldContain("1. 
Numbered one"); + } + + [Fact] + public async Task WikiMarkupConverter_ConvertsLink() + { + var result = await ConvertAsync("sample.wiki"); + result.Markdown.ShouldContain("# Sample Wiki"); + result.Markdown.ShouldContain("[Example](https://example.com)"); + } + + [Fact] + public async Task BibTexConverter_RendersBibliography() + { + var result = await ConvertAsync("sample.bib"); + result.Markdown.ShouldContain("Sample Entry"); + result.Markdown.ShouldContain("Ada Lovelace"); + } + + [Fact] + public async Task RisConverter_RendersEntries() + { + var result = await ConvertAsync("sample.ris"); + result.Markdown.ShouldContain("Sample RIS Entry"); + result.Markdown.ShouldContain("https://example.com/ris"); + } + + [Fact] + public async Task EndNoteXmlConverter_RendersAuthors() + { + var result = await ConvertAsync("sample.endnote.xml"); + result.Markdown.ShouldContain("Sample EndNote"); + result.Markdown.ShouldContain("Test Researcher"); + } + + [Fact] + public async Task CslJsonConverter_RendersReference() + { + var result = await ConvertAsync("sample.csljson"); + result.Markdown.ShouldContain("CSL JSON Entry"); + result.Markdown.ShouldContain("https://example.com/csl"); + } + + [Fact] + public async Task CsvConverter_SupportsTsv() + { + var markItDown = new MarkItDownClient(); + var path = TestAssetLoader.GetAssetPath("sample.tsv"); + var result = await markItDown.ConvertAsync(path); + result.Markdown.ShouldContain("| Name | Value |"); + result.Markdown.ShouldContain("| Alpha | 1 |"); + } + + [Fact] + public async Task MermaidConverter_WrapsFencedBlock() + { + var result = await ConvertAsync("sample.mermaid"); + result.Markdown.ShouldStartWith("```mermaid"); + result.Markdown.ShouldContain("A[Start]"); + } + + [Fact] + public async Task GraphvizConverter_WrapsFencedBlock() + { + var result = await ConvertAsync("sample.dot"); + result.Markdown.ShouldStartWith("```dot"); + result.Markdown.ShouldContain("A -> B"); + } + + [Fact] + public async Task 
PlantUmlConverter_WrapsFencedBlock() + { + var result = await ConvertAsync("sample.puml"); + result.Markdown.ShouldStartWith("```plantuml"); + result.Markdown.ShouldContain("Alice"); + } + + [Fact] + public async Task TikzConverter_WrapsLatexBlock() + { + var result = await ConvertAsync("sample.tikz"); + result.Markdown.ShouldStartWith("```latex"); + result.Markdown.ShouldContain("\\draw"); + } + + [Fact] + public async Task MetaMdConverter_ExpandsMetadataAndReferences() + { + var result = await ConvertAsync("sample.metamd"); + result.Markdown.ShouldContain("# MetaMD Sample"); + result.Markdown.ShouldContain("Sketch of the Analytical Engine"); + result.Markdown.ShouldContain("```mermaid"); + } + + private static async Task<DocumentConverterResult> ConvertAsync(string fileName) + { + var markItDown = new MarkItDownClient(); + var path = TestAssetLoader.GetAssetPath(fileName); + return await markItDown.ConvertAsync(path); + } +} diff --git a/tests/MarkItDown.Tests/StreamInfoDetectionTests.cs b/tests/MarkItDown.Tests/StreamInfoDetectionTests.cs index a6e7f2001..cb27a088a 100644 --- a/tests/MarkItDown.Tests/StreamInfoDetectionTests.cs +++ b/tests/MarkItDown.Tests/StreamInfoDetectionTests.cs @@ -42,7 +42,7 @@ public void Constructor_WithMimeTypeAndExtension_SetsPropertiesCorrectly() [Fact] public void GuessStreamInfo_HtmlContent_DetectsCorrectMimeType() { - var markItDown = new global::MarkItDown.MarkItDown(); + var markItDown = new global::MarkItDown.MarkItDownClient(); var htmlContent = "<!DOCTYPE html><html><head><title>Test

Test

"; var bytes = Encoding.UTF8.GetBytes(htmlContent); @@ -57,7 +57,7 @@ public void GuessStreamInfo_HtmlContent_DetectsCorrectMimeType() [Fact] public void GuessStreamInfo_PdfSignature_DetectsPdfMimeType() { - var markItDown = new global::MarkItDown.MarkItDown(); + var markItDown = new global::MarkItDown.MarkItDownClient(); var pdfSignature = new byte[] { 0x25, 0x50, 0x44, 0x46, 0x2D }; // %PDF- using var stream = new MemoryStream(pdfSignature); @@ -69,7 +69,7 @@ public void GuessStreamInfo_PdfSignature_DetectsPdfMimeType() [Fact] public void GuessStreamInfo_JsonContent_DetectsJsonMimeType() { - var markItDown = new global::MarkItDown.MarkItDown(); + var markItDown = new global::MarkItDown.MarkItDownClient(); var jsonContent = "{\"test\": \"value\", \"number\": 123}"; var bytes = Encoding.UTF8.GetBytes(jsonContent); diff --git a/tests/MarkItDown.Tests/TestCoveragePlan.md b/tests/MarkItDown.Tests/TestCoveragePlan.md index 70bcb2a4e..d94bd1d00 100644 --- a/tests/MarkItDown.Tests/TestCoveragePlan.md +++ b/tests/MarkItDown.Tests/TestCoveragePlan.md @@ -79,6 +79,7 @@ To reach ≥90% line coverage without shipping native/tooling dependencies, intr - Copied upstream fixtures into `tests/MarkItDown.Tests/TestFiles/`. - Added dependency seams plus targeted unit tests for Audio and PDF converters. - Raised overall line coverage to ~44% (from ~41%). +- Landed fixtures and regression tests for DocBook, JATS, OPML, FB2, ODT, citation formats (BibTeX/RIS/EndNote/CSL JSON), plain-text markups (LaTeX/rST/AsciiDoc/Org/Djot/Typst/Textile/Wiki), diagram syntaxes (Mermaid/Graphviz/PlantUML/TikZ), TSV tables, and the MetaMD profile. ## 6. Next Deliverables diff --git a/tests/MarkItDown.Tests/TestFiles/sample.adoc b/tests/MarkItDown.Tests/TestFiles/sample.adoc new file mode 100644 index 000000000..1ade5207e --- /dev/null +++ b/tests/MarkItDown.Tests/TestFiles/sample.adoc @@ -0,0 +1,6 @@ += Sample AsciiDoc + +This paragraph includes *bold* and _italic_ text. 
+ +* Item one +* Item two diff --git a/tests/MarkItDown.Tests/TestFiles/sample.bib b/tests/MarkItDown.Tests/TestFiles/sample.bib new file mode 100644 index 000000000..049f0cd25 --- /dev/null +++ b/tests/MarkItDown.Tests/TestFiles/sample.bib @@ -0,0 +1,7 @@ +@article{sample, + title = {Sample Entry}, + author = {Ada Lovelace and Grace Hopper}, + year = {2024}, + journal = {Journal of Testing}, + url = {https://example.com/sample} +} diff --git a/tests/MarkItDown.Tests/TestFiles/sample.csljson b/tests/MarkItDown.Tests/TestFiles/sample.csljson new file mode 100644 index 000000000..de333e788 --- /dev/null +++ b/tests/MarkItDown.Tests/TestFiles/sample.csljson @@ -0,0 +1,13 @@ +[ + { + "id": "sample", + "title": "CSL JSON Entry", + "container-title": "CSL Journal", + "issued": { "date-parts": [[2021]] }, + "URL": "https://example.com/csl", + "author": [ + { "given": "Ada", "family": "Lovelace" }, + { "given": "Grace", "family": "Hopper" } + ] + } +] diff --git a/tests/MarkItDown.Tests/TestFiles/sample.dj b/tests/MarkItDown.Tests/TestFiles/sample.dj new file mode 100644 index 000000000..82186d29a --- /dev/null +++ b/tests/MarkItDown.Tests/TestFiles/sample.dj @@ -0,0 +1,3 @@ +# Sample Djot + +Paragraph with **bold** text. diff --git a/tests/MarkItDown.Tests/TestFiles/sample.docbook b/tests/MarkItDown.Tests/TestFiles/sample.docbook new file mode 100644 index 000000000..57948a1e4 --- /dev/null +++ b/tests/MarkItDown.Tests/TestFiles/sample.docbook @@ -0,0 +1,12 @@ + + + Sample DocBook + + Introduction + This is an introductory paragraph. +
+ Details + DocBook content becomes Markdown. +
+
+
diff --git a/tests/MarkItDown.Tests/TestFiles/sample.dot b/tests/MarkItDown.Tests/TestFiles/sample.dot new file mode 100644 index 000000000..4eeda4f33 --- /dev/null +++ b/tests/MarkItDown.Tests/TestFiles/sample.dot @@ -0,0 +1,3 @@ +digraph G { + A -> B; +} diff --git a/tests/MarkItDown.Tests/TestFiles/sample.endnote.xml b/tests/MarkItDown.Tests/TestFiles/sample.endnote.xml new file mode 100644 index 000000000..d489369f2 --- /dev/null +++ b/tests/MarkItDown.Tests/TestFiles/sample.endnote.xml @@ -0,0 +1,26 @@ + + + + + Sample EndNote + + + + + Test Researcher + + + + + 2022 + + + EndNote Journal + + + + https://example.com/endnote + + + + diff --git a/tests/MarkItDown.Tests/TestFiles/sample.fb2 b/tests/MarkItDown.Tests/TestFiles/sample.fb2 new file mode 100644 index 000000000..d89a4f864 --- /dev/null +++ b/tests/MarkItDown.Tests/TestFiles/sample.fb2 @@ -0,0 +1,16 @@ + + + + + Sample FB2 + TestAuthor + + + + <p>Body Title</p> +
+ <p>Section Heading</p> +

FictionBook paragraphs become Markdown paragraphs.

+
+ +
diff --git a/tests/MarkItDown.Tests/TestFiles/sample.jats b/tests/MarkItDown.Tests/TestFiles/sample.jats new file mode 100644 index 000000000..1a9690476 --- /dev/null +++ b/tests/MarkItDown.Tests/TestFiles/sample.jats @@ -0,0 +1,16 @@ + +
+ + + + Sample JATS Article + + + + + + Background +

The JATS body supports sections.

+
+ +
diff --git a/tests/MarkItDown.Tests/TestFiles/sample.mermaid b/tests/MarkItDown.Tests/TestFiles/sample.mermaid new file mode 100644 index 000000000..21655cea0 --- /dev/null +++ b/tests/MarkItDown.Tests/TestFiles/sample.mermaid @@ -0,0 +1,2 @@ +graph TD + A[Start] --> B{Decision} diff --git a/tests/MarkItDown.Tests/TestFiles/sample.metamd b/tests/MarkItDown.Tests/TestFiles/sample.metamd new file mode 100644 index 000000000..00ca61e29 --- /dev/null +++ b/tests/MarkItDown.Tests/TestFiles/sample.metamd @@ -0,0 +1,24 @@ ++++ +{ + "title": "MetaMD Sample", + "abstract": "Short abstract about the document.", + "contributors": ["Ada Lovelace"], + "affiliations": ["Analytical Engines"], + "keywords": ["references", "markdown"], + "references": [ + { + "id": "lovelace1843", + "title": "Sketch of the Analytical Engine", + "authors": ["Ada Lovelace"], + "url": "https://example.com/lovelace" + } + ] +} ++++ + +The MetaMD body references [@lovelace1843]. + +:::diagram type="mermaid" +graph TD + Start --> End +::: diff --git a/tests/MarkItDown.Tests/TestFiles/sample.opml b/tests/MarkItDown.Tests/TestFiles/sample.opml new file mode 100644 index 000000000..685e8979d --- /dev/null +++ b/tests/MarkItDown.Tests/TestFiles/sample.opml @@ -0,0 +1,12 @@ + + + + Sample Outline + + + + + + + + diff --git a/tests/MarkItDown.Tests/TestFiles/sample.org b/tests/MarkItDown.Tests/TestFiles/sample.org new file mode 100644 index 000000000..e2a714cb3 --- /dev/null +++ b/tests/MarkItDown.Tests/TestFiles/sample.org @@ -0,0 +1,5 @@ +* Sample Org +** Nested Heading +Some /italic/ and *bold* text. 
+- First bullet +- Second bullet diff --git a/tests/MarkItDown.Tests/TestFiles/sample.puml b/tests/MarkItDown.Tests/TestFiles/sample.puml new file mode 100644 index 000000000..1b0da9d2e --- /dev/null +++ b/tests/MarkItDown.Tests/TestFiles/sample.puml @@ -0,0 +1,3 @@ +@startuml +Alice -> Bob: Hello +@enduml diff --git a/tests/MarkItDown.Tests/TestFiles/sample.ris b/tests/MarkItDown.Tests/TestFiles/sample.ris new file mode 100644 index 000000000..d78d00108 --- /dev/null +++ b/tests/MarkItDown.Tests/TestFiles/sample.ris @@ -0,0 +1,8 @@ +TY - JOUR +TI - Sample RIS Entry +AU - Lovelace, Ada +AU - Hopper, Grace +JO - Journal Testing +PY - 2023 +UR - https://example.com/ris +ER - diff --git a/tests/MarkItDown.Tests/TestFiles/sample.rst b/tests/MarkItDown.Tests/TestFiles/sample.rst new file mode 100644 index 000000000..c8f5cb420 --- /dev/null +++ b/tests/MarkItDown.Tests/TestFiles/sample.rst @@ -0,0 +1,7 @@ +Sample RST Heading +================== + +This is *emphasized* text and ``inline code``. + +- Bullet one +- Bullet two diff --git a/tests/MarkItDown.Tests/TestFiles/sample.rtf b/tests/MarkItDown.Tests/TestFiles/sample.rtf new file mode 100644 index 000000000..e191fd498 --- /dev/null +++ b/tests/MarkItDown.Tests/TestFiles/sample.rtf @@ -0,0 +1,4 @@ +{\rtf1\ansi\deff0{\fonttbl{\f0 Arial;}} +\fs24\b Sample RTF Document\b0\par +This is plain text with \i italics\i0.\par +} diff --git a/tests/MarkItDown.Tests/TestFiles/sample.tex b/tests/MarkItDown.Tests/TestFiles/sample.tex new file mode 100644 index 000000000..d0dc67b88 --- /dev/null +++ b/tests/MarkItDown.Tests/TestFiles/sample.tex @@ -0,0 +1,8 @@ +\title{Sample LaTeX} +\section{Overview} +This text should become markdown. 
+\subsection{List} +\begin{itemize} +\item First item +\item Second item +\end{itemize} diff --git a/tests/MarkItDown.Tests/TestFiles/sample.textile b/tests/MarkItDown.Tests/TestFiles/sample.textile new file mode 100644 index 000000000..0b8bb5992 --- /dev/null +++ b/tests/MarkItDown.Tests/TestFiles/sample.textile @@ -0,0 +1,4 @@ +h1. Sample Textile + +* Bullet one +# Numbered one diff --git a/tests/MarkItDown.Tests/TestFiles/sample.tikz b/tests/MarkItDown.Tests/TestFiles/sample.tikz new file mode 100644 index 000000000..591934f95 --- /dev/null +++ b/tests/MarkItDown.Tests/TestFiles/sample.tikz @@ -0,0 +1,3 @@ +\begin{tikzpicture} + \draw (0,0) -- (1,1); +\end{tikzpicture} diff --git a/tests/MarkItDown.Tests/TestFiles/sample.tsv b/tests/MarkItDown.Tests/TestFiles/sample.tsv new file mode 100644 index 000000000..a40fede50 --- /dev/null +++ b/tests/MarkItDown.Tests/TestFiles/sample.tsv @@ -0,0 +1,3 @@ +Name Value +Alpha 1 +Beta 2 diff --git a/tests/MarkItDown.Tests/TestFiles/sample.typ b/tests/MarkItDown.Tests/TestFiles/sample.typ new file mode 100644 index 000000000..1fe2cbd23 --- /dev/null +++ b/tests/MarkItDown.Tests/TestFiles/sample.typ @@ -0,0 +1,3 @@ += Sample Typst +# item one +# item two diff --git a/tests/MarkItDown.Tests/TestFiles/sample.wiki b/tests/MarkItDown.Tests/TestFiles/sample.wiki new file mode 100644 index 000000000..37d0d33af --- /dev/null +++ b/tests/MarkItDown.Tests/TestFiles/sample.wiki @@ -0,0 +1,6 @@ += Sample Wiki = + +* Bullet +# Numbered + +Link to [[https://example.com|Example]]. 
From 01a921390a5b6804a6f10dc80fdd49a3b7c0d2a8 Mon Sep 17 00:00:00 2001 From: ksemenenko Date: Sun, 12 Oct 2025 14:09:17 +0200 Subject: [PATCH 2/3] Update src/MarkItDown/Converters/RtfConverter.cs Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/MarkItDown/Converters/RtfConverter.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/MarkItDown/Converters/RtfConverter.cs b/src/MarkItDown/Converters/RtfConverter.cs index 0862b81e8..28b7bc0f6 100644 --- a/src/MarkItDown/Converters/RtfConverter.cs +++ b/src/MarkItDown/Converters/RtfConverter.cs @@ -97,7 +97,7 @@ private static string ExtractText(string rtf) if (i + 1 < rtf.Length) { var hex = rtf.Substring(i, 2); - if (byte.TryParse(hex, System.Globalization.NumberStyles.HexNumber, null, out var value)) + if (byte.TryParse(hex, System.Globalization.NumberStyles.HexNumber, CultureInfo.InvariantCulture, out var value)) { builder.Append(Encoding.Default.GetString(new[] { value })); } From 94126f8123079d18b05d7e811ec91cf11266794b Mon Sep 17 00:00:00 2001 From: ksemenenko Date: Sun, 12 Oct 2025 14:09:28 +0200 Subject: [PATCH 3/3] Update src/MarkItDown/Converters/TextileConverter.cs Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/MarkItDown/Converters/TextileConverter.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/MarkItDown/Converters/TextileConverter.cs b/src/MarkItDown/Converters/TextileConverter.cs index f2905d887..b20564616 100644 --- a/src/MarkItDown/Converters/TextileConverter.cs +++ b/src/MarkItDown/Converters/TextileConverter.cs @@ -70,7 +70,7 @@ private static string ConvertToMarkdown(string textile) var headingMatch = Heading.Match(trimmed); if (headingMatch.Success) { - var level = int.Parse(headingMatch.Groups["level"].Value, System.Globalization.CultureInfo.InvariantCulture); + var level = int.Parse(headingMatch.Groups["level"].Value, CultureInfo.InvariantCulture); builder.AppendLine(new string('#', 
Math.Clamp(level, 1, 6)) + " " + headingMatch.Groups["text"].Value.Trim()); continue; }