From 7b4a7548bbf0fd15980a716d9c9b1e8259e5b575 Mon Sep 17 00:00:00 2001 From: pc-style Date: Tue, 27 Jan 2026 15:05:11 +0100 Subject: [PATCH] feat: 10x upgrade with parallel processing, vision, streaming, watch mode, and more - Add parallel processing with configurable concurrency (p-limit) - Add progress bar with ETA for multi-file operations (cli-progress) - Add stdin/stdout pipe support for shell integration - Add vision mode to send PDFs as images to Gemini - Add streaming output for real-time AI responses - Add watch mode to auto-convert new PDFs in a directory - Add glob pattern support (e.g., docs/**/*.pdf) - Add hash-based caching to skip already-processed files - Add custom prompts and built-in templates (invoice, table, summary, code) - Add output formats: markdown, json, html, text - Add image extraction from PDFs - Add templates and cache management commands Amp-Thread-ID: https://ampcode.com/threads/T-019bff71-9c6b-71d1-b614-e480b6560ebf Co-authored-by: Amp --- .gitignore | 3 + AGENTS.md | 21 +++ bun.lock | 43 +++++- package.json | 5 + src/commands/convert.ts | 312 ++++++++++++++++++++++++++++++++-------- src/commands/watch.ts | 99 +++++++++++++ src/index.ts | 35 ++++- src/utils/cache.ts | 35 +++++ src/utils/formats.ts | 70 +++++++++ src/utils/gemini.ts | 113 +++++++++++++-- src/utils/images.ts | 72 ++++++++++ src/utils/templates.ts | 55 +++++++ 12 files changed, 789 insertions(+), 74 deletions(-) create mode 100644 AGENTS.md create mode 100644 src/commands/watch.ts create mode 100644 src/utils/cache.ts create mode 100644 src/utils/formats.ts create mode 100644 src/utils/images.ts create mode 100644 src/utils/templates.ts diff --git a/.gitignore b/.gitignore index 0d1e7a8..8c57158 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,9 @@ bun.lockb bin/ pdf2md +# test folder +test/ + # IDE .vscode/ .idea/ diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..b1765b5 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,21 @@ +# AGENTS.md + +## Commands +- **Build**: `bun run build` - compiles to `bin/pdf2md` +- **Dev**: `bun run ./src/index.ts [args]` - run without compiling +- **Type check**: `bun tsc --noEmit` +- No test framework configured + +## Architecture +- CLI tool using Commander.js with subcommands in `src/commands/` +- Entry: `src/index.ts` → commands: `convert` (default), `config` +- Utils: `src/utils/` - pdf extraction (unpdf), Gemini AI, config (conf) +- Compiles to standalone binary via `bun build --compile` + +## Code Style +- **Runtime**: Bun (never npm/pnpm/yarn/node) +- **Module**: ESM with `.js` extensions in imports (even for .ts files) +- **Types**: Strict TypeScript, explicit error typing (`error: any`) +- **Imports**: node builtins with `node:` prefix, named exports preferred +- **Patterns**: async/await, ora spinners for progress, chalk for colors +- **Error handling**: try/catch with spinner.fail(), process.exit(1) for fatal errors diff --git a/bun.lock b/bun.lock index d3f7e7b..34b3191 100644 --- a/bun.lock +++ b/bun.lock @@ -7,13 +7,18 @@ "dependencies": { "@google/generative-ai": "^0.24.1", "chalk": "^5.6.2", + "chokidar": "^5.0.0", + "cli-progress": "^3.12.0", "commander": "^14.0.2", "conf": "^15.0.2", + "glob": "^13.0.0", "ora": "^9.0.0", + "p-limit": "^7.2.0", "unpdf": "^1.4.0", }, "devDependencies": { "@types/bun": "latest", + "@types/cli-progress": "^3.11.6", "typescript": "^5", }, }, @@ -21,8 +26,14 @@ "packages": { "@google/generative-ai": ["@google/generative-ai@0.24.1", "", {}, "sha512-MqO+MLfM6kjxcKoy0p1wRzG3b4ZZXtPI+z2IE26UogS2Cm/XHO+7gGRBh6gcJsOiIVoH93UwKvW4HdgiOZCy9Q=="], + "@isaacs/balanced-match": ["@isaacs/balanced-match@4.0.1", "", {}, "sha512-yzMTt9lEb8Gv7zRioUilSglI0c0smZ9k5D65677DLWLtWJaXIS3CqcGyUFByYKlnUj6TkjLVs54fBl6+TiGQDQ=="], + + "@isaacs/brace-expansion": ["@isaacs/brace-expansion@5.0.0", "", { "dependencies": { "@isaacs/balanced-match": "^4.0.1" } }, "sha512-ZT55BDLV0yv0RBm2czMiZ+SqCGO7AvmOM3G/w2xhVPH+te0aKgFjmBvGlL1dH+ql2tgGO3MVrbb3jCKyvpgnxA=="], + "@types/bun": ["@types/bun@1.3.5", "", { "dependencies": { "bun-types": "1.3.5" } }, "sha512-RnygCqNrd3srIPEWBd5LFeUYG7plCoH2Yw9WaZGyNmdTEei+gWaHqydbaIRkIkcbXwhBT94q78QljxN0Sk838w=="], + "@types/cli-progress": ["@types/cli-progress@3.11.6", "", { "dependencies": { "@types/node": "*" } }, "sha512-cE3+jb9WRlu+uOSAugewNpITJDt1VF8dHOopPO4IABFc3SXYL5WE/+PTz/FCdZRRfIujiWW3n3aMbv1eIGVRWA=="], + "@types/node": ["@types/node@25.0.3", "", { "dependencies": { "undici-types": "~7.16.0" } }, "sha512-W609buLVRVmeW693xKfzHeIV6nJGGz98uCPfeXI1ELMLXVeKYZ9m15fAMSaUPBHYLGFsVRcMmSCksQOrZV9BYA=="], "ajv": ["ajv@8.17.1", "", { "dependencies": { "fast-deep-equal": "^3.1.3", "fast-uri": "^3.0.1", "json-schema-traverse": "^1.0.0", "require-from-string": "^2.0.2" } }, "sha512-B/gBuNg5SiMTrPkC+A2+cW0RszwxYmn6VYxB/inlBStS5nx6xHIt/ehKRhIMhqusl7a8LjQoZnjCs5vhwxOQ1g=="], @@ -37,8 +48,12 @@ "chalk": ["chalk@5.6.2", "", {}, "sha512-7NzBL0rN6fMUW+f7A6Io4h40qQlG+xGmtMxfbnH/K7TAtt8JQWVQK+6g0UXKMeVJoyV5EkkNsErQ8pVD3bLHbA=="], + "chokidar": ["chokidar@5.0.0", "", { "dependencies": { "readdirp": "^5.0.0" } }, "sha512-TQMmc3w+5AxjpL8iIiwebF73dRDF4fBIieAqGn9RGCWaEVwQ6Fb2cGe31Yns0RRIzii5goJ1Y7xbMwo1TxMplw=="], + "cli-cursor": ["cli-cursor@5.0.0", "", { "dependencies": { "restore-cursor": "^5.0.0" } }, "sha512-aCj4O5wKyszjMmDT4tZj93kxyydN/K5zPWSCe6/0AV/AA1pqe5ZBIw0a2ZfPQV7lL5/yb5HsUreJ6UFAF1tEQw=="], + "cli-progress": ["cli-progress@3.12.0", "", { "dependencies": { "string-width": "^4.2.3" } }, "sha512-tRkV3HJ1ASwm19THiiLIXLO7Im7wlTuKnvkYaTkyoAPefqjNg7W7DHKUlGRxy9vxDvbyCYQkQozvptuMkGCg8A=="], + "cli-spinners": ["cli-spinners@3.3.0", "", {}, "sha512-/+40ljC3ONVnYIttjMWrlL51nItDAbBrq2upN8BPyvGU/2n5Oxw3tbNwORCaNuNqLJnxGqOfjUuhsv7l5Q4IsQ=="], "commander": ["commander@14.0.2", "", {}, "sha512-TywoWNNRbhoD0BXs1P3ZEScW8W5iKrnbithIl0YH+uCmBd0QpPOA8yc82DS3BIE5Ma6FnBVUsJ7wVUDz4dvOWQ=="], @@ -49,6 +64,8 @@ "dot-prop": ["dot-prop@10.1.0", "", { "dependencies": { "type-fest": "^5.0.0" } }, "sha512-MVUtAugQMOff5RnBy2d9N31iG0lNwg1qAoAOn7pOK5wf94WIaE3My2p3uwTQuvS2AcqchkcR3bHByjaM0mmi7Q=="], + "emoji-regex": ["emoji-regex@8.0.0", "", {}, "sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A=="], + "env-paths": ["env-paths@3.0.0", "", {}, "sha512-dtJUTepzMW3Lm/NPxRf3wP4642UWhjL2sQxc+ym2YMj1m/H2zDNQOlezafzkHwn6sMstjHTwG6iQQsctDW/b1A=="], "fast-deep-equal": ["fast-deep-equal@3.1.3", "", {}, "sha512-f3qQ9oQy9j2AhBe/H9VC91wLmKBCCU/gDOnKNAYG5hswO7BLKj09Hc5HYNz9cGI++xlpDCIgDaitVs03ATR84Q=="], @@ -57,6 +74,10 @@ "get-east-asian-width": ["get-east-asian-width@1.4.0", "", {}, "sha512-QZjmEOC+IT1uk6Rx0sX22V6uHWVwbdbxf1faPqJ1QhLdGgsRGCZoyaQBm/piRdJy/D2um6hM1UP7ZEeQ4EkP+Q=="], + "glob": ["glob@13.0.0", "", { "dependencies": { "minimatch": "^10.1.1", "minipass": "^7.1.2", "path-scurry": "^2.0.0" } }, "sha512-tvZgpqk6fz4BaNZ66ZsRaZnbHvP/jG3uKJvAZOwEVUL4RTA5nJeeLYfyN9/VA8NX/V3IBG+hkeuGpKjvELkVhA=="], + + "is-fullwidth-code-point": ["is-fullwidth-code-point@3.0.0", "", {}, "sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg=="], + "is-interactive": ["is-interactive@2.0.0", "", {}, "sha512-qP1vozQRI+BMOPcjFzrjXuQvdak2pHNUMZoeG2eRbiSqyvbEf/wQtEOTOX1guk6E3t36RkaqiSt8A/6YElNxLQ=="], "is-unicode-supported": ["is-unicode-supported@2.1.0", "", {}, "sha512-mE00Gnza5EEB3Ds0HfMyllZzbBrmLOX3vfWoj9A9PEnTfratQ/BcaJOuMhnkhjXvb2+FkY3VuHqtAGpTPmglFQ=="], @@ -67,12 +88,24 @@ "log-symbols": ["log-symbols@7.0.1", "", { "dependencies": { "is-unicode-supported": "^2.0.0", "yoctocolors": "^2.1.1" } }, "sha512-ja1E3yCr9i/0hmBVaM0bfwDjnGy8I/s6PP4DFp+yP+a+mrHO4Rm7DtmnqROTUkHIkqffC84YY7AeqX6oFk0WFg=="], + "lru-cache": ["lru-cache@11.2.5", "", {}, "sha512-vFrFJkWtJvJnD5hg+hJvVE8Lh/TcMzKnTgCWmtBipwI5yLX/iX+5UB2tfuyODF5E7k9xEzMdYgGqaSb1c0c5Yw=="], + "mimic-function": ["mimic-function@5.0.1", "", {}, "sha512-VP79XUPxV2CigYP3jWwAUFSku2aKqBH7uTAapFWCBqutsbmDo96KY5o8uh6U+/YSIn5OxJnXp73beVkpqMIGhA=="], + "minimatch": ["minimatch@10.1.1", "", { "dependencies": { "@isaacs/brace-expansion": "^5.0.0" } }, "sha512-enIvLvRAFZYXJzkCYG5RKmPfrFArdLv+R+lbQ53BmIMLIry74bjKzX6iHAm8WYamJkhSSEabrWN5D97XnKObjQ=="], + + "minipass": ["minipass@7.1.2", "", {}, "sha512-qOOzS1cBTWYF4BH8fVePDBOO9iptMnGUEZwNc/cMWnTV2nVLZ7VoNWEPHkYczZA0pdoA7dl6e7FL659nX9S2aw=="], + "onetime": ["onetime@7.0.0", "", { "dependencies": { "mimic-function": "^5.0.0" } }, "sha512-VXJjc87FScF88uafS3JllDgvAm+c/Slfz06lorj2uAY34rlUu0Nt+v8wreiImcrgAjjIHp1rXpTDlLOGw29WwQ=="], "ora": ["ora@9.0.0", "", { "dependencies": { "chalk": "^5.6.2", "cli-cursor": "^5.0.0", "cli-spinners": "^3.2.0", "is-interactive": "^2.0.0", "is-unicode-supported": "^2.1.0", "log-symbols": "^7.0.1", "stdin-discarder": "^0.2.2", "string-width": "^8.1.0", "strip-ansi": "^7.1.2" } }, "sha512-m0pg2zscbYgWbqRR6ABga5c3sZdEon7bSgjnlXC64kxtxLOyjRcbbUkLj7HFyy/FTD+P2xdBWu8snGhYI0jc4A=="], + "p-limit": ["p-limit@7.2.0", "", { "dependencies": { "yocto-queue": "^1.2.1" } }, "sha512-ATHLtwoTNDloHRFFxFJdHnG6n2WUeFjaR8XQMFdKIv0xkXjrER8/iG9iu265jOM95zXHAfv9oTkqhrfbIzosrQ=="], + + "path-scurry": ["path-scurry@2.0.1", "", { "dependencies": { "lru-cache": "^11.0.0", "minipass": "^7.1.2" } }, "sha512-oWyT4gICAu+kaA7QWk/jvCHWarMKNs6pXOGWKDTr7cw4IGcUbW+PeTfbaQiLGheFRpjo6O9J0PmyMfQPjH71oA=="], + + "readdirp": ["readdirp@5.0.0", "", {}, "sha512-9u/XQ1pvrQtYyMpZe7DXKv2p5CNvyVwzUB6uhLAnQwHMSgKMBR62lc7AHljaeteeHXn11XTAaLLUVZYVZyuRBQ=="], + "require-from-string": ["require-from-string@2.0.2", "", {}, "sha512-Xf0nWe6RseziFMu+Ap9biiUbmplq6S9/p+7w7YXP/JBHhrUDDUhwa+vANyubuqfZWTveU//DYVGsDG7RKL/vEw=="], "restore-cursor": ["restore-cursor@5.1.0", "", { "dependencies": { "onetime": "^7.0.0", "signal-exit": "^4.1.0" } }, "sha512-oMA2dcrw6u0YfxJQXm342bFKX/E4sG9rbTzO9ptUcR/e8A33cHuvStiYOwH7fszkZlZ1z/ta9AAoPk2F4qIOHA=="], @@ -83,7 +116,7 @@ "stdin-discarder": ["stdin-discarder@0.2.2", "", {}, "sha512-UhDfHmA92YAlNnCfhmq0VeNL5bDbiZGg7sZ2IvPsXubGkiNa9EC+tUTsjBRsYUAz87btI6/1wf4XoVvQ3uRnmQ=="], - "string-width": ["string-width@8.1.0", "", { "dependencies": { "get-east-asian-width": "^1.3.0", "strip-ansi": "^7.1.0" } }, "sha512-Kxl3KJGb/gxkaUMOjRsQ8IrXiGW75O4E3RPjFIINOVH8AMl2SQ/yWdTzWwF3FevIX9LcMAjJW+GRwAlAbTSXdg=="], + "string-width": ["string-width@4.2.3", "", { "dependencies": { "emoji-regex": "^8.0.0", "is-fullwidth-code-point": "^3.0.0", "strip-ansi": "^6.0.1" } }, "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g=="], "strip-ansi": ["strip-ansi@7.1.2", "", { "dependencies": { "ansi-regex": "^6.0.1" } }, "sha512-gmBGslpoQJtgnMAvOVqGZpEz9dyoKTCzy2nfz/n8aIFhN/jCE/rCmcxabB6jOOHV+0WNnylOxaxBQPSvcWklhA=="], @@ -105,6 +138,14 @@ "when-exit": ["when-exit@2.1.5", "", {}, "sha512-VGkKJ564kzt6Ms1dbgPP/yuIoQCrsFAnRbptpC5wOEsDaNsbCB2bnfnaA8i/vRs5tjUSEOtIuvl9/MyVsvQZCg=="], + "yocto-queue": ["yocto-queue@1.2.2", "", {}, "sha512-4LCcse/U2MHZ63HAJVE+v71o7yOdIe4cZ70Wpf8D/IyjDKYQLV5GD46B+hSTjJsvV5PztjvHoU580EftxjDZFQ=="], + "yoctocolors": ["yoctocolors@2.1.2", "", {}, "sha512-CzhO+pFNo8ajLM2d2IW/R93ipy99LWjtwblvC1RsoSUMZgyLbYFr221TnSNT7GjGdYui6P459mw9JH/g/zW2ug=="], + + "ora/string-width": ["string-width@8.1.0", "", { "dependencies": { "get-east-asian-width": "^1.3.0", "strip-ansi": "^7.1.0" } }, "sha512-Kxl3KJGb/gxkaUMOjRsQ8IrXiGW75O4E3RPjFIINOVH8AMl2SQ/yWdTzWwF3FevIX9LcMAjJW+GRwAlAbTSXdg=="], + + "string-width/strip-ansi": ["strip-ansi@6.0.1", "", { "dependencies": { "ansi-regex": "^5.0.1" } }, "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A=="], + + "string-width/strip-ansi/ansi-regex": ["ansi-regex@5.0.1", "", {}, "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ=="], } } diff --git a/package.json b/package.json index fa2837d..8a90469 100644 --- a/package.json +++ b/package.json @@ -16,13 +16,18 @@ "dependencies": { "@google/generative-ai": "^0.24.1", "chalk": "^5.6.2", + "chokidar": "^5.0.0", + "cli-progress": "^3.12.0", "commander": "^14.0.2", "conf": "^15.0.2", + "glob": "^13.0.0", "ora": "^9.0.0", + "p-limit": "^7.2.0", "unpdf": "^1.4.0" }, "devDependencies": { "@types/bun": "latest", + "@types/cli-progress": "^3.11.6", "typescript": "^5" } } diff --git a/src/commands/convert.ts b/src/commands/convert.ts index 599d438..67dc142 100644 --- a/src/commands/convert.ts +++ b/src/commands/convert.ts @@ -3,98 +3,284 @@ import path from 'node:path'; import { Command } from 'commander'; import ora from 'ora'; import chalk from 'chalk'; +import { glob } from 'glob'; +import pLimit from 'p-limit'; +import cliProgress from 'cli-progress'; import { extractPdfText } from '../utils/pdf.js'; -import { convertToMarkdown } from '../utils/gemini.js'; +import { convertToMarkdown, convertToMarkdownStream, convertPdfWithVision } from '../utils/gemini.js'; import { getApiKey } from '../utils/config.js'; +import { isCached, setCached } from '../utils/cache.js'; +import { getPrompt, listTemplates } from '../utils/templates.js'; +import { formatOutput, getExtension, type OutputFormat } from '../utils/formats.js'; +import { extractImages } from '../utils/images.js'; + +interface ConvertOptions { + mode: 'standalone' | 'ai' | 'vision'; + format: OutputFormat; + output?: string; + apiKey?: string; + template: string; + prompt?: string; + concurrency: number; + cache: boolean; + stream: boolean; + stdout: boolean; + extractImages: boolean; +} + +async function processFile( + filePath: string, + options: ConvertOptions, + apiKey: string | null, + outputPath: string, + showSpinner: boolean = true +): Promise<{ success: boolean; skipped: boolean }> { + const spinner = showSpinner ? ora(`Processing ${path.basename(filePath)}...`).start() : null; + + if (options.cache && outputPath && isCached(filePath, outputPath)) { + spinner?.succeed(chalk.gray(`Skipped (cached): ${path.basename(filePath)}`)); + return { success: true, skipped: true }; + } -async function processFile(filePath: string, mode: 'standalone' | 'ai', apiKey: string | null, output?: string) { - const spinner = ora(`Processing ${path.basename(filePath)}...`).start(); - try { - const text = await extractPdfText(filePath); - - let result = text; - if (mode === 'ai') { - if (!apiKey) { - spinner.fail('API Key is required for AI mode. Use "pdf2md config --key " or pass --api-key.'); - return false; + let result: string; + const prompt = options.prompt || getPrompt(options.template); + + if (options.mode === 'standalone') { + result = await extractPdfText(filePath); + } else if (options.mode === 'vision') { + if (!apiKey) throw new Error('API Key required for vision mode'); + + if (options.stream && !options.stdout) { + spinner?.stop(); + process.stdout.write(chalk.blue(`\n--- ${path.basename(filePath)} ---\n`)); + result = await convertPdfWithVision(filePath, apiKey, { + prompt, + stream: true, + onChunk: (chunk: string) => process.stdout.write(chunk), + }); + process.stdout.write('\n'); + } else { + if (spinner) spinner.text = `Vision processing ${path.basename(filePath)}...`; + result = await convertPdfWithVision(filePath, apiKey, { prompt }); + } + } else { + if (!apiKey) throw new Error('API Key required for AI mode'); + + const text = await extractPdfText(filePath); + + if (options.stream && !options.stdout) { + spinner?.stop(); + process.stdout.write(chalk.blue(`\n--- ${path.basename(filePath)} ---\n`)); + result = await convertToMarkdownStream(text, apiKey, (chunk: string) => process.stdout.write(chunk), { prompt }); + process.stdout.write('\n'); + } else { + if (spinner) spinner.text = `AI processing ${path.basename(filePath)}...`; + result = await convertToMarkdown(text, apiKey, { prompt }); + } + } + + if (options.extractImages && outputPath) { + const imagesDir = outputPath.replace(/\.[^.]+$/, '_images'); + const extracted = await extractImages(filePath, imagesDir); + if (extracted.length > 0 && spinner) { + spinner.text = `Extracted ${extracted.length} images`; } - spinner.text = `Generative AI processing for ${path.basename(filePath)}...`; - try { - result = await convertToMarkdown(text, apiKey); - } catch (e: any) { - spinner.fail(`AI processing failed: ${e.message}`); - return false; + } + + const output = formatOutput(result, options.format, { + source: filePath, + processedAt: new Date().toISOString(), + }); + + if (options.stdout) { + process.stdout.write(output); + } else { + fs.mkdirSync(path.dirname(outputPath), { recursive: true }); + fs.writeFileSync(outputPath, output); + + if (options.cache) { + setCached(filePath, outputPath); } + + spinner?.succeed(`Saved: ${outputPath}`); } - const outputPath = output || filePath.replace(/\.pdf$/i, '.md'); - fs.mkdirSync(path.dirname(outputPath), { recursive: true }); - fs.writeFileSync(outputPath, result); - - spinner.succeed(`Saved to ${outputPath}`); - return true; + return { success: true, skipped: false }; } catch (error: any) { - spinner.fail(`Error processing ${filePath}: ${error.message}`); - return false; + spinner?.fail(`Error: ${path.basename(filePath)} - ${error.message}`); + return { success: false, skipped: false }; } } -async function processDirectory(dirPath: string, mode: 'standalone' | 'ai', apiKey: string | null) { - const files = fs.readdirSync(dirPath, { recursive: true }) as string[]; - const pdfFiles = files.filter(f => f.toLowerCase().endsWith('.pdf')); - - if (pdfFiles.length === 0) { - console.log(chalk.yellow('No PDF files found in directory.')); - return; +async function processMultipleFiles( + files: string[], + options: ConvertOptions, + apiKey: string | null +): Promise { + const limit = pLimit(options.concurrency); + + const progressBar = new cliProgress.SingleBar({ + format: '{bar} {percentage}% | ETA: {eta}s | {value}/{total} files | {status}', + barCompleteChar: '█', + barIncompleteChar: '░', + hideCursor: true, + }, cliProgress.Presets.shades_classic); + + console.log(chalk.blue(`\nProcessing ${files.length} PDF files (concurrency: ${options.concurrency})\n`)); + progressBar.start(files.length, 0, { status: 'Starting...' }); + + let completed = 0; + let succeeded = 0; + let skipped = 0; + + const tasks = files.map((file) => + limit(async () => { + const ext = getExtension(options.format); + const outputPath = file.replace(/\.pdf$/i, ext); + + progressBar.update(completed, { status: path.basename(file) }); + + const result = await processFile(file, options, apiKey, outputPath, false); + + completed++; + if (result.success) succeeded++; + if (result.skipped) skipped++; + + progressBar.update(completed, { status: `Done: ${path.basename(file)}` }); + + return result; + }) + ); + + await Promise.all(tasks); + progressBar.stop(); + + console.log(chalk.green(`\n✓ Completed: ${succeeded}/${files.length} files processed`)); + if (skipped > 0) { + console.log(chalk.gray(` ⏭️ ${skipped} files skipped (cached)`)); } +} - console.log(chalk.blue(`Found ${pdfFiles.length} PDF files in ${dirPath}`)); - - let successCount = 0; - for (const file of pdfFiles) { - const fullPath = path.join(dirPath, file); - // For directory processing, output is always side-by-side - if (await processFile(fullPath, mode, apiKey)) { - successCount++; - } +async function readStdin(): Promise { + const chunks: Buffer[] = []; + for await (const chunk of process.stdin) { + chunks.push(chunk as Buffer); } - - console.log(chalk.green(`\nCompleted! ${successCount}/${pdfFiles.length} files processed.`)); + return Buffer.concat(chunks); } export const convertCommand = new Command('convert') .description('Convert PDF to Markdown') - .argument('[input]', 'Input file or directory') - .option('-m, --mode ', 'Mode: standalone (text extract) or ai (Gemini)', 'standalone') - .option('-o, --output ', 'Output file path (only for single file)') - .option('-k, --api-key ', 'Gemini 3 Flash Preview API Key (overrides config)') - .action(async (input, options, command) => { - if (!input) { - command.help(); + .argument('[input]', 'Input file, directory, or glob pattern (omit to read from stdin)') + .option('-m, --mode ', 'Mode: standalone, ai, or vision', 'standalone') + .option('-f, --format ', 'Output format: markdown, json, html, text', 'markdown') + .option('-o, --output ', 'Output file path (single file only)') + .option('-k, --api-key ', 'Gemini API Key') + .option('-t, --template ', 'Prompt template (default, invoice, table, summary, code)', 'default') + .option('-p, --prompt ', 'Custom prompt (overrides template)') + .option('-c, --concurrency ', 'Parallel processing limit', '3') + .option('--cache', 'Skip already-processed files') + .option('--stream', 'Stream AI response in real-time') + .option('--stdout', 'Output to stdout instead of file') + .option('--extract-images', 'Extract images from PDF') + .option('--list-templates', 'Show available prompt templates') + .action(async (input: string | undefined, opts: { + mode: string; + format: string; + output?: string; + apiKey?: string; + template: string; + prompt?: string; + concurrency: string; + cache?: boolean; + stream?: boolean; + stdout?: boolean; + extractImages?: boolean; + listTemplates?: boolean; + }) => { + if (opts.listTemplates) { + console.log(chalk.blue('Available templates:')); + listTemplates().forEach((t) => console.log(` - ${t}`)); return; } + const options: ConvertOptions = { + mode: opts.mode as ConvertOptions['mode'], + format: opts.format as OutputFormat, + output: opts.output, + apiKey: opts.apiKey, + template: opts.template, + prompt: opts.prompt, + concurrency: parseInt(opts.concurrency, 10), + cache: opts.cache || false, + stream: opts.stream || false, + stdout: opts.stdout || false, + extractImages: opts.extractImages || false, + }; + const apiKey = options.apiKey || getApiKey(); - const mode = options.mode; - // Check if input exists + if (!input) { + if (!process.stdin.isTTY) { + const spinner = ora('Reading from stdin...').start(); + try { + const buffer = await readStdin(); + const tmpPath = `/tmp/pdf2md-stdin-${Date.now()}.pdf`; + fs.writeFileSync(tmpPath, buffer); + + spinner.text = 'Processing...'; + const ext = getExtension(options.format); + const outputPath = options.output || (options.stdout ? '' : `output${ext}`); + + await processFile(tmpPath, { ...options, stdout: options.stdout || !options.output }, apiKey, outputPath); + fs.unlinkSync(tmpPath); + } catch (error: any) { + spinner.fail(`Error: ${error.message}`); + process.exit(1); + } + return; + } + + console.log(chalk.yellow('No input provided. Use --help for usage.')); + return; + } + + if (input.includes('*')) { + const files = await glob(input, { nodir: true }); + const pdfFiles = files.filter((f: string) => f.toLowerCase().endsWith('.pdf')); + + if (pdfFiles.length === 0) { + console.log(chalk.yellow('No PDF files matched the pattern.')); + return; + } + + await processMultipleFiles(pdfFiles, options, apiKey); + return; + } + if (!fs.existsSync(input)) { - console.error(chalk.red(`Error: Input "${input}" not found.`)); - process.exit(1); + console.error(chalk.red(`Error: "${input}" not found.`)); + process.exit(1); } const stats = fs.statSync(input); - + if (stats.isFile()) { - await processFile(input, mode, apiKey, options.output); + const ext = getExtension(options.format); + const outputPath = options.output || input.replace(/\.pdf$/i, ext); + await processFile(input, options, apiKey, outputPath); } else if (stats.isDirectory()) { - if (options.output) { - console.warn(chalk.yellow('Warning: --output is ignored when processing a directory.')); - } - await processDirectory(input, mode, apiKey); - } else { - console.error(chalk.red('Error: Input is not a file or directory.')); - process.exit(1); + const files = fs.readdirSync(input, { recursive: true }) as string[]; + const pdfFiles = files + .filter((f) => f.toLowerCase().endsWith('.pdf')) + .map((f) => path.join(input, f)); + + if (pdfFiles.length === 0) { + console.log(chalk.yellow('No PDF files found in directory.')); + return; + } + + await processMultipleFiles(pdfFiles, options, apiKey); } }); diff --git a/src/commands/watch.ts b/src/commands/watch.ts new file mode 100644 index 0000000..9cebe1a --- /dev/null +++ b/src/commands/watch.ts @@ -0,0 +1,99 @@ +import fs from 'node:fs'; +import path from 'node:path'; +import { Command } from 'commander'; +import chalk from 'chalk'; +import chokidar from 'chokidar'; +import ora from 'ora'; +import { extractPdfText } from '../utils/pdf.js'; +import { convertToMarkdown, convertPdfWithVision } from '../utils/gemini.js'; +import { getApiKey } from '../utils/config.js'; +import { isCached, setCached } from '../utils/cache.js'; +import { getPrompt } from '../utils/templates.js'; +import { formatOutput, getExtension, type OutputFormat } from '../utils/formats.js'; + +export const watchCommand = new Command('watch') + .description('Watch a directory for new PDFs and auto-convert them') + .argument('', 'Directory to watch') + .option('-m, --mode ', 'Mode: standalone, ai, or vision', 'ai') + .option('-f, --format ', 'Output format: markdown, json, html, text', 'markdown') + .option('-k, --api-key ', 'Gemini API Key') + .option('-t, --template ', 'Prompt template or custom prompt', 'default') + .option('--no-cache', 'Disable caching (reprocess all files)') + .action(async (directory, options) => { + const apiKey = options.apiKey || getApiKey(); + const format = options.format as OutputFormat; + const prompt = getPrompt(options.template); + + if (!fs.existsSync(directory)) { + console.error(chalk.red(`Directory "${directory}" not found.`)); + process.exit(1); + } + + if (options.mode !== 'standalone' && !apiKey) { + console.error(chalk.red('API Key required for AI/vision mode. Use "pdf2md config --key ".')); + process.exit(1); + } + + console.log(chalk.blue(`👀 Watching ${directory} for PDF files...`)); + console.log(chalk.gray(` Mode: ${options.mode} | Format: ${format} | Template: ${options.template}`)); + console.log(chalk.gray(' Press Ctrl+C to stop.\n')); + + const watcher = chokidar.watch(path.join(directory, '**/*.pdf'), { + persistent: true, + ignoreInitial: false, + awaitWriteFinish: { + stabilityThreshold: 1000, + pollInterval: 100, + }, + }); + + const processFile = async (filePath: string) => { + const ext = getExtension(format); + const outputPath = filePath.replace(/\.pdf$/i, ext); + + if (options.cache && isCached(filePath, outputPath)) { + console.log(chalk.gray(`⏭️ Skipped (cached): ${path.basename(filePath)}`)); + return; + } + + const spinner = ora(`Processing ${path.basename(filePath)}...`).start(); + + try { + let result: string; + + if (options.mode === 'standalone') { + result = await extractPdfText(filePath); + } else if (options.mode === 'vision') { + result = await convertPdfWithVision(filePath, apiKey!, { prompt }); + } else { + const text = await extractPdfText(filePath); + result = await convertToMarkdown(text, apiKey!, { prompt }); + } + + const output = formatOutput(result, format, { + source: filePath, + processedAt: new Date().toISOString(), + }); + + fs.mkdirSync(path.dirname(outputPath), { recursive: true }); + fs.writeFileSync(outputPath, output); + + if (options.cache) { + setCached(filePath, outputPath); + } + + spinner.succeed(`Saved: ${path.basename(outputPath)}`); + } catch (error: any) { + spinner.fail(`Error: ${error.message}`); + } + }; + + watcher.on('add', processFile); + watcher.on('change', processFile); + + process.on('SIGINT', () => { + console.log(chalk.yellow('\n\nStopping watcher...')); + watcher.close(); + process.exit(0); + }); + }); diff --git a/src/index.ts b/src/index.ts index acc1687..bdbb5c0 100644 --- a/src/index.ts +++ b/src/index.ts @@ -2,14 +2,45 @@ import { Command } from 'commander'; import { convertCommand } from './commands/convert.js'; import { configCommand } from './commands/config.js'; +import { watchCommand } from './commands/watch.js'; +import { clearCache } from './utils/cache.js'; +import { listTemplates } from './utils/templates.js'; +import chalk from 'chalk'; import packageJson from '../package.json' with { type: "json" }; const program = new Command(); program - .description('Convert PDFs to Markdown using Bun and Gemini 3 Flash Preview AI') + .description('Convert PDFs to Markdown using Bun and Gemini AI') .version(packageJson.version) .addCommand(convertCommand, { isDefault: true }) - .addCommand(configCommand); + .addCommand(configCommand) + .addCommand(watchCommand); + +// Templates subcommand +program + .command('templates') + .description('List available prompt templates') + .action(() => { + console.log(chalk.blue('Available templates:\n')); + listTemplates().forEach((t) => { + console.log(` ${chalk.green(t)}`); + }); + console.log(chalk.gray('\nUse with: pdf2md --template ')); + }); + +// Cache subcommand +program + .command('cache') + .description('Manage the file cache') + .option('--clear', 'Clear all cached entries') + .action((opts) => { + if (opts.clear) { + clearCache(); + console.log(chalk.green('Cache cleared.')); + } else { + console.log('Use --clear to clear the cache.'); + } + }); program.parse(); diff --git a/src/utils/cache.ts b/src/utils/cache.ts new file mode 100644 index 0000000..45a97e2 --- /dev/null +++ b/src/utils/cache.ts @@ -0,0 +1,35 @@ +import fs from 'node:fs'; +import path from 'node:path'; +import crypto from 'node:crypto'; +import Conf from 'conf'; + +const cacheStore = new Conf>({ + projectName: 'pdf2md-cli', + configName: 'cache', +}); + +export function getFileHash(filePath: string): string { + const content = fs.readFileSync(filePath); + return crypto.createHash('sha256').update(content).digest('hex'); +} + +export function isCached(filePath: string, outputPath: string): boolean { + const hash = getFileHash(filePath); + const cacheKey = `${path.resolve(filePath)}:${path.resolve(outputPath)}`; + const cachedHash = cacheStore.get(cacheKey); + + if (cachedHash === hash && fs.existsSync(outputPath)) { + return true; + } + return false; +} + +export function setCached(filePath: string, outputPath: string): void { + const hash = getFileHash(filePath); + const cacheKey = `${path.resolve(filePath)}:${path.resolve(outputPath)}`; + cacheStore.set(cacheKey, hash); +} + +export function clearCache(): void { + cacheStore.clear(); +} diff --git a/src/utils/formats.ts b/src/utils/formats.ts new file mode 100644 index 0000000..0392a3b --- /dev/null +++ b/src/utils/formats.ts @@ -0,0 +1,70 @@ +export type OutputFormat = 'markdown' | 'json' | 'html' | 'text'; + +export function getExtension(format: OutputFormat): string { + switch (format) { + case 'markdown': return '.md'; + case 'json': return '.json'; + case 'html': return '.html'; + case 'text': return '.txt'; + } +} + +export function formatOutput(content: string, format: OutputFormat, metadata?: { source: string; processedAt: string }): string { + switch (format) { + case 'markdown': + return content; + + case 'json': + return JSON.stringify({ + content, + metadata: metadata || {}, + }, null, 2); + + case 'html': + return ` + + + + + PDF Conversion + + + +${markdownToHtml(content)} + +`; + + case 'text': + return content + .replace(/#{1,6}\s*/g, '') + .replace(/\*\*([^*]+)\*\*/g, '$1') + .replace(/\*([^*]+)\*/g, '$1') + .replace(/`([^`]+)`/g, '$1') + .replace(/```[\s\S]*?```/g, (match) => match.replace(/```\w*\n?/g, '').trim()) + .replace(/\[([^\]]+)\]\([^)]+\)/g, '$1'); + } +} + +function markdownToHtml(md: string): string { + return md + .replace(/^### (.*$)/gm, '

$1

') + .replace(/^## (.*$)/gm, '

$1

') + .replace(/^# (.*$)/gm, '

$1

') + .replace(/\*\*([^*]+)\*\*/g, '$1') + .replace(/\*([^*]+)\*/g, '$1') + .replace(/`([^`]+)`/g, '$1') + .replace(/```(\w*)\n([\s\S]*?)```/g, '
$2
') + .replace(/^\- (.*$)/gm, '
  • $1
  • ') + .replace(/(
  • .*<\/li>\n?)+/g, '
      $&
    ') + .replace(/\n\n/g, '

    ') + .replace(/^(.+)$/gm, (match) => { + if (match.startsWith('<')) return match; + return `

    ${match}

    `; + }); +} diff --git a/src/utils/gemini.ts b/src/utils/gemini.ts index bfba349..7985e8a 100644 --- a/src/utils/gemini.ts +++ b/src/utils/gemini.ts @@ -1,22 +1,119 @@ -import { GoogleGenerativeAI } from '@google/generative-ai'; +import fs from 'node:fs'; +import { GoogleGenerativeAI, type GenerateContentStreamResult } from '@google/generative-ai'; +import { getDocumentProxy, renderPageAsImage } from 'unpdf'; -export async function convertToMarkdown(text: string, apiKey: string, modelName: string = 'gemini-3-flash-preview'): Promise { - const genAI = new GoogleGenerativeAI(apiKey); - const model = genAI.getGenerativeModel({ model: modelName }); - - const prompt = `Convert the following raw PDF text into well-formatted markdown. +const DEFAULT_PROMPT = `Convert the following into well-formatted markdown. Rules: - Use appropriate heading levels - Format lists properly - Preserve code blocks if present - Add proper spacing - Make it readable and well-structured -- Do not output any preamble or explanation, just the markdown. +- Do not output any preamble or explanation, just the markdown.`; + +export async function convertToMarkdown( + text: string, + apiKey: string, + options: { + modelName?: string; + prompt?: string; + } = {} +): Promise { + const { modelName = 'gemini-2.0-flash', prompt = DEFAULT_PROMPT } = options; + + const genAI = new GoogleGenerativeAI(apiKey); + const model = genAI.getGenerativeModel({ model: modelName }); + + const fullPrompt = `${prompt} Raw text: ${text}`; - const result = await model.generateContent(prompt); + const result = await model.generateContent(fullPrompt); const response = await result.response; return response.text(); } + +export async function convertToMarkdownStream( + text: string, + apiKey: string, + onChunk: (chunk: string) => void, + options: { + modelName?: string; + prompt?: string; + } = {} +): Promise { + const { modelName = 'gemini-2.0-flash', prompt = DEFAULT_PROMPT } = options; + + const genAI = new GoogleGenerativeAI(apiKey); + const model = genAI.getGenerativeModel({ model: modelName }); + + const fullPrompt = `${prompt} + +Raw text: +${text}`; + + const result = await model.generateContentStream(fullPrompt); + + let fullText = ''; + for await (const chunk of result.stream) { + const chunkText = chunk.text(); + fullText += chunkText; + onChunk(chunkText); + } + + return fullText; +} + +export async function convertPdfWithVision( + filePath: string, + apiKey: string, + options: { + modelName?: string; + prompt?: string; + stream?: boolean; + onChunk?: (chunk: string) => void; + } = {} +): Promise { + const { modelName = 'gemini-2.0-flash', prompt = DEFAULT_PROMPT, stream = false, onChunk } = options; + + const genAI = new GoogleGenerativeAI(apiKey); + const model = genAI.getGenerativeModel({ model: modelName }); + + // Load PDF and render pages as images + const dataBuffer = fs.readFileSync(filePath); + const pdf = await getDocumentProxy(new Uint8Array(dataBuffer)); + + const imageParts: { inlineData: { data: string; mimeType: string } }[] = []; + + for (let i = 1; i <= pdf.numPages; i++) { + const imageResult = await renderPageAsImage(pdf, i, { scale: 2 }); + // renderPageAsImage returns an ArrayBuffer + const base64 = Buffer.from(imageResult).toString('base64'); + imageParts.push({ + inlineData: { + data: base64, + mimeType: 'image/png', + }, + }); + } + + const fullPrompt = `${prompt} + +Convert this PDF document (${pdf.numPages} pages shown as images) to markdown.`; + + if (stream && onChunk) { + const result = await model.generateContentStream([fullPrompt, ...imageParts]); + let fullText = ''; + for await (const chunk of result.stream) { + const chunkText = chunk.text(); + fullText += chunkText; + onChunk(chunkText); + } + return fullText; + } else { + const result = await model.generateContent([fullPrompt, ...imageParts]); + const response = await result.response; + return response.text(); + } +} diff --git a/src/utils/images.ts b/src/utils/images.ts new file mode 100644 index 0000000..2f01ff0 --- /dev/null +++ b/src/utils/images.ts @@ -0,0 +1,72 @@ +import fs from 'node:fs'; +import path from 'node:path'; +import { getDocumentProxy } from 'unpdf'; + +export interface ExtractedImage { + name: string; + data: Uint8Array; + width: number; + height: number; +} + +export async function extractImages(filePath: string, outputDir: string): Promise { + const dataBuffer = fs.readFileSync(filePath); + const pdf = await getDocumentProxy(new Uint8Array(dataBuffer)); + + const savedPaths: string[] = []; + fs.mkdirSync(outputDir, { recursive: true }); + + let imageCount = 0; + + for (let i = 1; i <= pdf.numPages; i++) { + const page = await pdf.getPage(i); + const ops = await page.getOperatorList(); + + // Look for image objects in the operator list + for (let j = 0; j < ops.fnArray.length; j++) { + // OPS.paintImageXObject = 85 + if (ops.fnArray[j] === 85) { + const imgName = ops.argsArray[j][0]; + try { + const img = await page.objs.get(imgName); + if (img && img.data) { + imageCount++; + const fileName = `image-${imageCount}.png`; + const outputPath = path.join(outputDir, fileName); + + // Create a simple PNG from raw image data + // For simplicity, save as raw data - users can convert + const rawPath = path.join(outputDir, `image-${imageCount}.raw`); + fs.writeFileSync(rawPath, Buffer.from(img.data)); + savedPaths.push(rawPath); + } + } catch { + // Skip images that can't be extracted + } + } + } + } + + return savedPaths; +} + +export async function hasImages(filePath: string): Promise { + try { + const dataBuffer = fs.readFileSync(filePath); + const pdf = await getDocumentProxy(new Uint8Array(dataBuffer)); + + for (let i = 1; i <= pdf.numPages; i++) { + const page = await pdf.getPage(i); + const ops = await page.getOperatorList(); + + for (let j = 0; j < ops.fnArray.length; j++) { + if (ops.fnArray[j] === 85) { + return true; + } + } + } + return false; + } catch { + return false; + } +} diff --git a/src/utils/templates.ts b/src/utils/templates.ts new file mode 100644 index 0000000..5048a43 --- /dev/null +++ b/src/utils/templates.ts @@ -0,0 +1,55 @@ +export const TEMPLATES: Record = { + default: `Convert the following raw PDF text into well-formatted markdown. +Rules: +- Use appropriate heading levels +- Format lists properly +- Preserve code blocks if present +- Add proper spacing +- Make it readable and well-structured +- Do not output any preamble or explanation, just the markdown.`, + + invoice: `Extract invoice data from the following PDF text. +Format as markdown with: +- Invoice number, date, due date as headers +- Vendor and customer info in sections +- Line items as a markdown table +- Totals clearly formatted +- Do not output any preamble, just the structured markdown.`, + + table: `Extract all tables from the following PDF text. +Rules: +- Convert each table to proper markdown table format +- Preserve column headers +- Maintain data alignment +- If no tables found, state "No tables found" +- Do not output any preamble or explanation.`, + + summary: `Summarize the following PDF text into a concise markdown document. +Rules: +- Create a brief executive summary +- List key points as bullet points +- Keep it under 500 words +- Use appropriate headings +- Do not output any preamble.`, + + code: `Extract and format code from the following PDF text. +Rules: +- Identify code blocks and wrap in appropriate markdown code fences +- Try to detect the programming language +- Preserve indentation +- Add brief comments for context if helpful +- Do not output any preamble.`, +}; + +export function getPrompt(templateOrCustom: string): string { + // Check if it's a built-in template name + if (TEMPLATES[templateOrCustom]) { + return TEMPLATES[templateOrCustom]; + } + // Otherwise treat as custom prompt + return templateOrCustom; +} + +export function listTemplates(): string[] { + return Object.keys(TEMPLATES); +}