Skip to content

Commit 852945b

Browse files
authored
feat(mistral-OCR): added mistral tool and block for parsing pdfs (#217)
1 parent dcd3b30 commit 852945b

File tree

6 files changed

+736
-4
lines changed

6 files changed

+736
-4
lines changed

sim/blocks/blocks/mistral-parse.ts

Lines changed: 199 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,199 @@
1+
import { MistralParserOutput } from '@/tools/mistral/parser'
2+
import { BlockConfig } from '../types'
3+
import { MistralIcon } from '@/components/icons'
4+
5+
/** URL protocols accepted for the PDF document location. */
const MISTRAL_PARSE_ALLOWED_PROTOCOLS = ['http:', 'https:']

/**
 * Returns the trimmed string, or throws `message` when the value is missing,
 * not a string, or blank. Guarding on the type keeps a non-string value from
 * surfacing as an opaque "trim is not a function" TypeError.
 */
function requireNonEmptyString(value: unknown, message: string): string {
  if (typeof value !== 'string' || value.trim() === '') {
    throw new Error(message)
  }
  return value.trim()
}

/**
 * Parses `rawUrl` and returns its normalized string form.
 *
 * @throws Error prefixed with "Invalid URL format:" when the value cannot be
 *   parsed as a URL or does not use HTTP/HTTPS.
 */
function validateDocumentUrl(rawUrl: string): string {
  try {
    const url = new URL(rawUrl)
    if (!MISTRAL_PARSE_ALLOWED_PROTOCOLS.includes(url.protocol)) {
      throw new Error(`URL must use HTTP or HTTPS protocol. Found: ${url.protocol}`)
    }
    return url.toString()
  } catch (error) {
    const errorMessage = error instanceof Error ? error.message : String(error)
    throw new Error(`Invalid URL format: ${errorMessage}`)
  }
}

/**
 * Converts the "Specific Pages" input into a list of non-negative integers.
 *
 * Accepts a comma-separated string (the form produced by the sub-block input),
 * a single number, or an array of either. Empty tokens are ignored; an input
 * with no tokens yields `undefined`, meaning "no page filter".
 *
 * Each token must consist of digits only. `parseInt` is deliberately not used
 * because it accepts trailing garbage ("1abc", "1.5" and "2-5" parse as 1, 1
 * and 2), which would silently select the wrong pages.
 *
 * @throws Error prefixed with "Page number format error:" on any invalid token.
 */
function parsePageSelection(pages: unknown): number[] | undefined {
  let rawTokens: unknown[]
  if (Array.isArray(pages)) {
    rawTokens = pages
  } else if (typeof pages === 'string' || typeof pages === 'number') {
    rawTokens = String(pages).split(',')
  } else if (!pages) {
    // undefined / null / false: the field was left empty
    return undefined
  } else {
    throw new Error(`Page number format error: Invalid page number: ${String(pages)}`)
  }

  const tokens = rawTokens
    .map((token) => String(token).trim())
    .filter((token) => token.length > 0)

  if (tokens.length === 0) {
    return undefined
  }

  return tokens.map((token) => {
    const pageNumber = Number(token)
    if (!/^\d+$/.test(token) || !Number.isSafeInteger(pageNumber)) {
      throw new Error(`Page number format error: Invalid page number: ${token}`)
    }
    return pageNumber
  })
}

/**
 * Block definition for the Mistral PDF parser.
 *
 * Collects a document URL, an output format, an optional page selection and an
 * API key from the UI, validates them in `tools.config.params`, and forwards
 * the normalized values to the `mistral_parser` tool.
 */
export const MistralParseBlock: BlockConfig<MistralParserOutput> = {
  type: 'mistral_parse',
  name: 'Mistral PDF Parser',
  description: 'Extract text from PDF documents',
  longDescription:
    'Extract text and structure from PDF documents using Mistral\'s OCR API. Enter a URL to a PDF document, configure processing options, and get the content in your preferred format.',
  category: 'tools',
  bgColor: '#000000',
  icon: MistralIcon,
  subBlocks: [
    {
      id: 'filePath',
      title: 'PDF Document URL',
      type: 'short-input',
      layout: 'full',
      placeholder: 'Enter full URL to a PDF document (https://example.com/document.pdf)',
    },
    {
      id: 'resultType',
      title: 'Output Format',
      type: 'dropdown',
      layout: 'full',
      options: [
        { id: 'markdown', label: 'Markdown (Formatted)' },
        { id: 'text', label: 'Plain Text' },
        { id: 'json', label: 'JSON (Raw)' },
      ],
    },
    {
      id: 'pages',
      title: 'Specific Pages',
      type: 'short-input',
      layout: 'full',
      placeholder: 'e.g. 0,1,2 (leave empty for all pages)',
    },
    /*
     * Image-related parameters - temporarily disabled
     * Uncomment if PDF image extraction is needed
     *
    {
      id: 'includeImageBase64',
      title: 'Include PDF Images',
      type: 'switch',
      layout: 'half',
    },
    {
      id: 'imageLimit',
      title: 'Max Images',
      type: 'short-input',
      layout: 'half',
      placeholder: 'Maximum number of images to extract',
    },
    {
      id: 'imageMinSize',
      title: 'Min Image Size (px)',
      type: 'short-input',
      layout: 'half',
      placeholder: 'Min width/height in pixels',
    },
    */
    {
      id: 'apiKey',
      title: 'API Key',
      type: 'short-input',
      layout: 'full',
      placeholder: 'Enter your Mistral API key',
      password: true,
    },
  ],
  tools: {
    access: ['mistral_parser'],
    config: {
      tool: () => 'mistral_parser',
      /**
       * Validates the raw block inputs and builds the tool parameters.
       *
       * @throws Error when the API key or URL is missing, the URL is malformed
       *   or not HTTP(S), or a page token is not a non-negative integer.
       */
      params: (params) => {
        // Optional chaining covers a missing params object in a single place.
        const apiKey = requireNonEmptyString(params?.apiKey, 'Mistral API key is required')
        const rawUrl = requireNonEmptyString(params?.filePath, 'PDF Document URL is required')

        const filePath = validateDocumentUrl(rawUrl)
        const pages = parsePageSelection(params.pages)

        const parameters: Record<string, unknown> = {
          filePath,
          apiKey,
          // `||` rather than `??`: an empty string (no dropdown selection)
          // must also fall back to markdown.
          resultType: params.resultType || 'markdown',
        }

        // Only send a page filter when at least one page was requested.
        if (pages !== undefined) {
          parameters.pages = pages
        }

        /*
         * Image-related parameters - temporarily disabled
         * Uncomment if PDF image extraction is needed. imageLimit and
         * imageMinSize must each be validated as a positive integer
         * ('Image limit must be a positive number' /
         * 'Minimum image size must be a positive number') before forwarding.
         *
        if (typeof params.includeImageBase64 === 'boolean') {
          parameters.includeImageBase64 = params.includeImageBase64
        }

        if (imageLimit !== undefined) {
          parameters.imageLimit = imageLimit
        }

        if (imageMinSize !== undefined) {
          parameters.imageMinSize = imageMinSize
        }
        */

        return parameters
      },
    },
  },
  inputs: {
    filePath: { type: 'string', required: true },
    apiKey: { type: 'string', required: true },
    resultType: { type: 'string', required: false },
    pages: { type: 'string', required: false },
    // Image-related inputs - temporarily disabled
    // includeImageBase64: { type: 'boolean', required: false },
    // imageLimit: { type: 'string', required: false },
    // imageMinSize: { type: 'string', required: false },
  },
  outputs: {
    response: {
      type: {
        content: 'string',
        metadata: 'json',
      },
    },
  },
}

sim/blocks/index.ts

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import { GoogleDocsBlock } from './blocks/docs'
88
import { GoogleDriveBlock } from './blocks/drive'
99
import { EvaluatorBlock } from './blocks/evaluator'
1010
import { ExaBlock } from './blocks/exa'
11+
import { MistralParseBlock } from './blocks/mistral-parse'
1112
import { FileBlock } from './blocks/file'
1213
import { FirecrawlBlock } from './blocks/firecrawl'
1314
import { FunctionBlock } from './blocks/function'
@@ -42,11 +43,12 @@ export {
4243
AgentBlock,
4344
AirtableBlock,
4445
ApiBlock,
45-
FileBlock,
46+
MistralParseBlock,
4647
FunctionBlock,
4748
VisionBlock,
4849
FirecrawlBlock,
4950
// GuestyBlock,
51+
FileBlock,
5052
JinaBlock,
5153
TranslateBlock,
5254
SlackBlock,
@@ -86,8 +88,9 @@ const blocks: Record<string, BlockConfig> = {
8688
confluence: ConfluenceBlock,
8789
evaluator: EvaluatorBlock,
8890
exa: ExaBlock,
89-
file: FileBlock,
91+
mistral_parse: MistralParseBlock,
9092
firecrawl: FirecrawlBlock,
93+
file: FileBlock,
9194
function: FunctionBlock,
9295
github: GitHubBlock,
9396
gmail: GmailBlock,

sim/components/icons.tsx

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1819,3 +1819,59 @@ export function DocumentIcon(props: SVGProps<SVGSVGElement>) {
18191819
</svg>
18201820
)
18211821
}
1822+
1823+
/** Element id linking the icon's `<g clipPath>` to its `<clipPath>` definition. */
const MISTRAL_ICON_CLIP_ID = 'clip0_1621_58'

/**
 * Rectangles that make up the Mistral logo, in paint order (later entries are
 * drawn on top of earlier ones, so the order must be preserved).
 */
const MISTRAL_ICON_PATHS: ReadonlyArray<{ d: string; fill: string }> = [
  { d: 'M17.4541 0H21.8177V4.39481H17.4541V0Z', fill: 'black' },
  { d: 'M19.6367 0H24.0003V4.39481H19.6367V0Z', fill: '#F7D046' },
  {
    d: 'M0 0H4.36359V4.39481H0V0ZM0 4.39481H4.36359V8.78961H0V4.39481ZM0 8.78971H4.36359V13.1845H0V8.78971ZM0 13.1845H4.36359V17.5793H0V13.1845ZM0 17.5794H4.36359V21.9742H0V17.5794Z',
    fill: 'black',
  },
  { d: 'M2.18164 0H6.54523V4.39481H2.18164V0Z', fill: '#F7D046' },
  {
    d: 'M19.6362 4.39478H23.9998V8.78958H19.6362V4.39478ZM2.18164 4.39478H6.54523V8.78958H2.18164V4.39478Z',
    fill: '#F2A73B',
  },
  { d: 'M13.0908 4.39478H17.4544V8.78958H13.0908V4.39478Z', fill: 'black' },
  {
    d: 'M15.2732 4.39478H19.6368V8.78958H15.2732V4.39478ZM6.5459 4.39478H10.9095V8.78958H6.5459V4.39478Z',
    fill: '#F2A73B',
  },
  {
    d: 'M10.9096 8.78979H15.2732V13.1846H10.9096V8.78979ZM15.2732 8.78979H19.6368V13.1846H15.2732V8.78979ZM6.5459 8.78979H10.9096V13.1846H6.5459V8.78979Z',
    fill: '#EE792F',
  },
  { d: 'M8.72754 13.1846H13.0911V17.5794H8.72754V13.1846Z', fill: 'black' },
  { d: 'M10.9092 13.1846H15.2728V17.5794H10.9092V13.1846Z', fill: '#EB5829' },
  {
    d: 'M19.6362 8.78979H23.9998V13.1846H19.6362V8.78979ZM2.18164 8.78979H6.54523V13.1846H2.18164V8.78979Z',
    fill: '#EE792F',
  },
  { d: 'M17.4541 13.1846H21.8177V17.5794H17.4541V13.1846Z', fill: 'black' },
  { d: 'M19.6367 13.1846H24.0003V17.5794H19.6367V13.1846Z', fill: '#EB5829' },
  { d: 'M17.4541 17.5793H21.8177V21.9742H17.4541V17.5793Z', fill: 'black' },
  { d: 'M2.18164 13.1846H6.54523V17.5794H2.18164V13.1846Z', fill: '#EB5829' },
  {
    d: 'M19.6362 17.5793H23.9998V21.9742H19.6362V17.5793ZM2.18164 17.5793H6.54523V21.9742H2.18164V17.5793Z',
    fill: '#EA3326',
  },
]

/**
 * Mistral logo as an inline SVG.
 *
 * Caller props are spread first, so the fixed attributes listed after the
 * spread (width, height, viewBox, fill, xmlns, preserveAspectRatio) take
 * precedence over same-named props.
 *
 * @param props - SVG attributes forwarded to the root `<svg>` element.
 */
export function MistralIcon(props: SVGProps<SVGSVGElement>) {
  const clipReference = `url(#${MISTRAL_ICON_CLIP_ID})`

  return (
    <svg
      {...props}
      width="22"
      height="22"
      viewBox="1 0.5 24 22"
      fill="none"
      xmlns="http://www.w3.org/2000/svg"
      preserveAspectRatio="xMidYMid meet"
    >
      <g clipPath={clipReference}>
        {MISTRAL_ICON_PATHS.map((segment, position) => (
          <path key={position} d={segment.d} fill={segment.fill} />
        ))}
      </g>
      <defs>
        <clipPath id={MISTRAL_ICON_CLIP_ID}>
          <rect width="24" height="22" fill="white" />
        </clipPath>
      </defs>
    </svg>
  )
}

sim/tools/index.ts

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import { guestyGuestTool, guestyReservationTool } from './guesty'
2020
import { requestTool as httpRequest } from './http/request'
2121
import { contactsTool as hubspotContacts } from './hubspot/contacts'
2222
import { readUrlTool } from './jina/reader'
23+
import { mistralParserTool } from './mistral'
2324
import { notionReadTool, notionWriteTool } from './notion'
2425
import { dalleTool } from './openai/dalle'
2526
import { embeddingsTool as openAIEmbeddings } from './openai/embeddings'
@@ -116,6 +117,7 @@ export const tools: Record<string, ToolConfig> = {
116117
airtable_read: airtableReadTool,
117118
airtable_write: airtableWriteTool,
118119
airtable_update: airtableUpdateTool,
120+
mistral_parser: mistralParserTool,
119121
}
120122

121123
// Get a tool by its ID
@@ -295,7 +297,7 @@ function getCustomTool(customToolId: string): ToolConfig | undefined {
295297
},
296298

297299
// Response handling
298-
transformResponse: async (response: Response) => {
300+
transformResponse: async (response: Response, params: Record<string, any>) => {
299301
const data = await response.json()
300302

301303
if (!data.success) {
@@ -597,7 +599,7 @@ async function handleInternalRequest(
597599

598600
// Use the tool's response transformer if available
599601
if (tool.transformResponse) {
600-
return await tool.transformResponse(response)
602+
return await tool.transformResponse(response, params)
601603
}
602604

603605
// Default response handling

sim/tools/mistral/index.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
import { mistralParserTool } from './parser'
2+
3+
export { mistralParserTool }

0 commit comments

Comments
 (0)