From 62c2be25929230883f7f978d307cf167e6dabdb3 Mon Sep 17 00:00:00 2001
From: Sahil
Date: Thu, 25 Dec 2025 22:22:13 +0530
Subject: [PATCH 1/5] feat: markitdown implementation

---
Review notes (text between "---" and the diffstat is not part of the commit):
- properties.from listed "powerpoint" and "excel", which are not file
  extensions; they are replaced by "xlsx" and "xls".
- TSDoc was added to markitdown.ts, so the hunk size and diffstat changed.
  The blob id on its "index" line still refers to the pre-review content.

 src/converters/main.ts       |  5 +++
 src/converters/markitdown.ts | 59 ++++++++++++++++++++++++++++++++++++
 2 files changed, 64 insertions(+)
 create mode 100644 src/converters/markitdown.ts

diff --git a/src/converters/main.ts b/src/converters/main.ts
index 96052076..7b9c1cfb 100644
--- a/src/converters/main.ts
+++ b/src/converters/main.ts
@@ -23,6 +23,7 @@ import { convert as convertresvg, properties as propertiesresvg } from "./resvg"
 import { convert as convertImage, properties as propertiesImage } from "./vips";
 import { convert as convertVtracer, properties as propertiesVtracer } from "./vtracer";
 import { convert as convertxelatex, properties as propertiesxelatex } from "./xelatex";
+import { convert as convertMarkitdown, properties as propertiesMarkitdown } from "./markitdown";
 
 // This should probably be reconstructed so that the functions are not imported instead the functions hook into this to make the converters more modular
 
@@ -127,6 +128,10 @@ const properties: Record<
     properties: propertiesVtracer,
     converter: convertVtracer,
   },
+  MarkitDown: {
+    properties: propertiesMarkitdown,
+    converter: convertMarkitdown,
+  },
 };
 
 function chunks<T>(arr: T[], size: number): T[][] {
diff --git a/src/converters/markitdown.ts b/src/converters/markitdown.ts
new file mode 100644
index 00000000..32557dd3
--- /dev/null
+++ b/src/converters/markitdown.ts
@@ -0,0 +1,59 @@
+import { execFile as execFileOriginal } from "node:child_process";
+import { ExecFileFn } from "./types";
+
+/**
+ * Input and output formats offered by the MarkItDown converter.
+ *
+ * NOTE(review): entries are presumably compared against file extensions by
+ * main.ts (confirm), so spreadsheets are listed as "xlsx"/"xls" rather than
+ * "excel", and presentations only as "pptx" rather than "powerpoint".
+ */
+export const properties = {
+  from: {
+    document: ["pdf", "docx", "pptx", "xlsx", "xls", "html"],
+  },
+  to: {
+    document: ["md"],
+  },
+};
+
+/**
+ * Converts a document to Markdown by running `markitdown <filePath> -o <targetPath>`.
+ *
+ * @param filePath - Path of the input document, passed to the CLI unchanged.
+ * @param fileType - Not used by this converter.
+ * @param convertTo - Not used by this converter; "md" is the only declared target.
+ * @param targetPath - Path handed to the CLI's `-o` option.
+ * @param options - Not used by this converter.
+ * @param execFile - Process launcher; defaults to `execFile` from `node:child_process`.
+ * @returns A promise that resolves to "Done" when `execFile` reports no error and
+ *   rejects with a string of the form `markitdown error: <err>` otherwise.
+ */
+export async function convert(
+  filePath: string,
+  fileType: string,
+  convertTo: string,
+  targetPath: string,
+  options?: unknown,
+  execFile: ExecFileFn = execFileOriginal,
+): Promise<string> {
+  return new Promise((resolve, reject) => {
+    execFile("markitdown", [filePath, "-o", targetPath], (err, stdout, stderr) => {
+      if (err) {
+        reject(`markitdown error: ${err}`);
+        return;
+      }
+
+      // Relay the tool's stdout/stderr output to the console.
+      if (stdout) {
+        console.log(`stdout: ${stdout}`);
+      }
+
+      if (stderr) {
+        console.error(`stderr: ${stderr}`);
+      }
+
+      resolve("Done");
+    });
+  });
+}

From 42f02a227cefda5b1af450113cb9d6b4073fd715 Mon Sep 17 00:00:00 2001
From: Sahil
Date: Thu, 25 Dec 2025 23:04:23 +0530
Subject: [PATCH 2/5] fix: code review and docker file:

---
 Dockerfile             | 10 ++++++++++
 src/converters/main.ts |  2 +-
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index bbf2327f..e8dc9cb6 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -77,6 +77,16 @@ RUN apt-get update && apt-get install -y \
   --no-install-recommends \
   && rm -rf /var/lib/apt/lists/*
 
+# Install Python, pipx, and MarkItDown (PEP 668 compliant)
+RUN apt-get update && apt-get install -y \
+  python3 \
+  python3-pip \
+  pipx \
+  && pipx install "markitdown[all]" \
+  && pipx ensurepath \
+  && rm -rf /var/lib/apt/lists/*
+
+
 # Install VTracer binary
 RUN ARCH=$(uname -m) && \
   if [ "$ARCH" = "aarch64" ]; then \
diff --git a/src/converters/main.ts b/src/converters/main.ts
index 7b9c1cfb..d9ec8e8b 100644
--- a/src/converters/main.ts
+++ b/src/converters/main.ts
@@ -128,7 +128,7 @@ const properties: Record<
     properties: propertiesVtracer,
     converter: convertVtracer,
   },
-  MarkitDown: {
+  markitDown: {
     properties: propertiesMarkitdown,
     converter: convertMarkitdown,
   },

From fec4a236188d688c8d433119a49901a9b0133cc7 Mon Sep 17 00:00:00 2001
From: Sahil
Date: Fri, 26 Dec 2025 21:24:51 +0530
Subject: [PATCH 3/5] fix: add markitdown PATH in container

---
 Dockerfile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Dockerfile b/Dockerfile
index e8dc9cb6..8f5f6d89 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -86,6 +86,8 @@ RUN apt-get update && apt-get install -y \
   && pipx ensurepath \
   && rm -rf /var/lib/apt/lists/*
 
+# Add pipx bin directory to PATH
+ENV PATH="/root/.local/bin:${PATH}"
 
 # Install VTracer binary
 RUN ARCH=$(uname -m) && \

From 2ccf4ee4e60ad0637a03e68d29eb3660d58ace65 Mon Sep 17 00:00:00 2001
From: Sahil
Date: Fri, 26 Dec 2025 22:49:40 +0530
Subject: [PATCH 4/5] fix: feedback changes

---
 Dockerfile | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 8f5f6d89..c7e4b353 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -74,16 +74,11 @@ RUN apt-get update && apt-get install -y \
   texlive-latex-extra \
   texlive-latex-recommended \
   texlive-xetex \
-  --no-install-recommends \
-  && rm -rf /var/lib/apt/lists/*
-
-# Install Python, pipx, and MarkItDown (PEP 668 compliant)
-RUN apt-get update && apt-get install -y \
   python3 \
   python3-pip \
   pipx \
+  --no-install-recommends \
   && pipx install "markitdown[all]" \
-  && pipx ensurepath \
   && rm -rf /var/lib/apt/lists/*
 
 # Add pipx bin directory to PATH

From f2dcc7f0a2f1199d6e5d7e9f3ef29f34a8879c4b Mon Sep 17 00:00:00 2001
From: Sahil
Date: Sat, 27 Dec 2025 12:54:51 +0530
Subject: [PATCH 5/5] en: readme changed

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 0d1fa9ee..aa4e6802 100644
--- a/README.md
+++ b/README.md
@@ -45,6 +45,7 @@ A self-hosted online file converter. Supports over a thousand different formats.
 | [FFmpeg](https://ffmpeg.org/) | Video | ~472 | ~199 |
 | [Potrace](https://potrace.sourceforge.net/) | Raster to vector | 4 | 11 |
 | [VTracer](https://github.com/visioncortex/vtracer) | Raster to vector | 8 | 1 |
+| [Markitdown](https://github.com/microsoft/markitdown) | Documents | 6 | 1 |