diff --git a/Dockerfile b/Dockerfile index bbf2327f..c7e4b353 100644 --- a/Dockerfile +++ b/Dockerfile @@ -74,9 +74,16 @@ RUN apt-get update && apt-get install -y \ texlive-latex-extra \ texlive-latex-recommended \ texlive-xetex \ + python3 \ + python3-pip \ + pipx \ --no-install-recommends \ + && pipx install "markitdown[all]" \ && rm -rf /var/lib/apt/lists/* +# Add pipx bin directory to PATH +ENV PATH="/root/.local/bin:${PATH}" + # Install VTracer binary RUN ARCH=$(uname -m) && \ if [ "$ARCH" = "aarch64" ]; then \ diff --git a/README.md b/README.md index 0d1fa9ee..aa4e6802 100644 --- a/README.md +++ b/README.md @@ -45,6 +45,7 @@ A self-hosted online file converter. Supports over a thousand different formats. | [FFmpeg](https://ffmpeg.org/) | Video | ~472 | ~199 | | [Potrace](https://potrace.sourceforge.net/) | Raster to vector | 4 | 11 | | [VTracer](https://github.com/visioncortex/vtracer) | Raster to vector | 8 | 1 | +| [Markitdown](https://github.com/microsoft/markitdown) | Documents | 6 | 1 | diff --git a/src/converters/main.ts b/src/converters/main.ts index 96052076..d9ec8e8b 100644 --- a/src/converters/main.ts +++ b/src/converters/main.ts @@ -23,6 +23,7 @@ import { convert as convertresvg, properties as propertiesresvg } from "./resvg" import { convert as convertImage, properties as propertiesImage } from "./vips"; import { convert as convertVtracer, properties as propertiesVtracer } from "./vtracer"; import { convert as convertxelatex, properties as propertiesxelatex } from "./xelatex"; +import { convert as convertMarkitdown, properties as propertiesMarkitdown } from "./markitdown"; // This should probably be reconstructed so that the functions are not imported instead the functions hook into this to make the converters more modular @@ -127,6 +128,10 @@ const properties: Record< properties: propertiesVtracer, converter: convertVtracer, }, + markitDown: { + properties: propertiesMarkitdown, + converter: convertMarkitdown, + }, }; function chunks(arr: T[], 
import { execFile as execFileOriginal } from "node:child_process";
import type { ExecFileFn } from "./types";

/**
 * Formats advertised by the markitdown converter: `from` lists the accepted
 * input identifiers and `to` the single output it produces (Markdown).
 *
 * NOTE(review): "powerpoint" and "excel" look like format names rather than
 * file extensions — confirm how the caller matches these identifiers.
 */
export const properties = {
  from: {
    document: ["pdf", "powerpoint", "excel", "docx", "pptx", "html"],
  },
  to: {
    document: ["md"],
  },
};

/**
 * Converts a document to Markdown by running `markitdown <filePath> -o <targetPath>`.
 *
 * @param filePath - Path of the input document, passed as the first CLI argument.
 * @param fileType - Not used by this converter (presumably kept so the signature
 *   matches the other converters — confirm against the registry in main.ts).
 * @param convertTo - Not used by this converter; the output is always written
 *   via `-o targetPath`.
 * @param targetPath - Path the CLI is told to write its output to.
 * @param options - Not used by this converter.
 * @param execFile - Process launcher; defaults to Node's `execFile` and can be
 *   replaced (for example in tests) with any compatible function.
 * @returns A promise that resolves to the string `"Done"` when the command
 *   reports no error.
 * @throws Rejects with the string `markitdown error: <err>` when the launcher
 *   reports an error. The rejection value is a string, not an `Error`; this is
 *   kept unchanged because callers may depend on it.
 */
export async function convert(
  filePath: string,
  fileType: string,
  convertTo: string,
  targetPath: string,
  options?: unknown,
  execFile: ExecFileFn = execFileOriginal,
): Promise<string> {
  return new Promise<string>((resolve, reject) => {
    // Arguments are passed as an array (no shell), so paths are not re-parsed
    // or word-split by a shell.
    execFile("markitdown", [filePath, "-o", targetPath], (err, stdout, stderr) => {
      if (err) {
        reject(`markitdown error: ${err}`);
        return;
      }

      // Output on either stream is only logged; it does not affect the result.
      if (stdout) {
        console.log(`stdout: ${stdout}`);
      }

      if (stderr) {
        console.error(`stderr: ${stderr}`);
      }

      resolve("Done");
    });
  });
}