|
| 1 | +const path = require('path'); |
| 2 | + |
/**
 * Lookup table from lower-cased file extension (including the leading dot)
 * to the payload kind and MIME type used when building the OCR request.
 *
 * `kind` selects the request shape: 'document' entries are sent as
 * `document_url`, 'image' entries as `image_url`.
 *
 * The table is frozen (shallowly) so entries cannot be added, removed or
 * replaced at runtime by code that shares this module.
 */
const MIME_TYPES = Object.freeze({
  // Documents → document_url
  '.pdf': { kind: 'document', mime: 'application/pdf' },
  '.docx': { kind: 'document', mime: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' },
  '.pptx': { kind: 'document', mime: 'application/vnd.openxmlformats-officedocument.presentationml.presentation' },
  '.txt': { kind: 'document', mime: 'text/plain' },
  '.epub': { kind: 'document', mime: 'application/epub+zip' },
  '.xml': { kind: 'document', mime: 'application/xml' },
  '.rtf': { kind: 'document', mime: 'application/rtf' },
  '.odt': { kind: 'document', mime: 'application/vnd.oasis.opendocument.text' },
  '.bib': { kind: 'document', mime: 'application/x-bibtex' },
  '.fb2': { kind: 'document', mime: 'application/x-fictionbook+xml' },
  '.ipynb': { kind: 'document', mime: 'application/x-ipynb+json' },
  '.tex': { kind: 'document', mime: 'application/x-tex' },
  '.opml': { kind: 'document', mime: 'text/x-opml' },
  '.1': { kind: 'document', mime: 'application/x-troff-man' },
  '.man': { kind: 'document', mime: 'application/x-troff-man' },
  // Images → image_url
  '.jpg': { kind: 'image', mime: 'image/jpeg' },
  '.jpeg': { kind: 'image', mime: 'image/jpeg' },
  '.png': { kind: 'image', mime: 'image/png' },
  '.avif': { kind: 'image', mime: 'image/avif' },
  '.tiff': { kind: 'image', mime: 'image/tiff' },
  // '.tif' is the common three-letter spelling of the same TIFF format.
  '.tif': { kind: 'image', mime: 'image/tiff' },
  '.gif': { kind: 'image', mime: 'image/gif' },
  '.heic': { kind: 'image', mime: 'image/heic' },
  '.heif': { kind: 'image', mime: 'image/heif' },
  '.bmp': { kind: 'image', mime: 'image/bmp' },
  '.webp': { kind: 'image', mime: 'image/webp' },
});
| 32 | + |
| 33 | +module.exports = async function mistralOcr({ filename }, ctx) { |
| 34 | + const apiKey = await ctx.read("MISTRAL_KEY"); |
| 35 | + if (!apiKey) { |
| 36 | + throw new Error("MISTRAL_KEY is not set"); |
| 37 | + } |
| 38 | + |
| 39 | + const ext = path.extname(filename).toLowerCase(); |
| 40 | + const typeInfo = MIME_TYPES[ext]; |
| 41 | + if (!typeInfo) { |
| 42 | + throw new Error(`Unsupported file extension: ${ext}`); |
| 43 | + } |
| 44 | + |
| 45 | + const buf = await ctx.read(filename); |
| 46 | + const base64 = buf.toString('base64'); |
| 47 | + const dataUrl = `data:${typeInfo.mime};base64,${base64}`; |
| 48 | + |
| 49 | + const document = typeInfo.kind === 'image' |
| 50 | + ? { type: 'image_url', image_url: dataUrl } |
| 51 | + : { type: 'document_url', document_url: dataUrl }; |
| 52 | + |
| 53 | + const response = await fetch("https://api.mistral.ai/v1/ocr", { |
| 54 | + method: "POST", |
| 55 | + headers: { |
| 56 | + "Content-Type": "application/json", |
| 57 | + "Authorization": `Bearer ${apiKey}`, |
| 58 | + }, |
| 59 | + body: JSON.stringify({ |
| 60 | + model: "mistral-ocr-latest", |
| 61 | + document, |
| 62 | + include_image_base64: true, |
| 63 | + }), |
| 64 | + }); |
| 65 | + |
| 66 | + if (!response.ok) { |
| 67 | + const errorText = await response.text(); |
| 68 | + throw new Error(`Mistral OCR error: ${response.status} ${response.statusText} - ${errorText}`); |
| 69 | + } |
| 70 | + |
| 71 | + const result = await response.json(); |
| 72 | + |
| 73 | + return result.pages.map(page => page.markdown).join("\n\n"); |
| 74 | +}; |
0 commit comments