feat(mistral-OCR): added mistral tool and block for parsing pdfs (#217)

emir-karabeg · web-flow · commit 852945b925f6 · 2025-04-02T01:05:51.000-07:00
diff --git a/sim/blocks/blocks/mistral-parse.ts b/sim/blocks/blocks/mistral-parse.ts
@@ -0,0 +1,199 @@
+import { MistralParserOutput } from '@/tools/mistral/parser'
+import { BlockConfig } from '../types'
+import { MistralIcon } from '@/components/icons'
+
+/**
+ * Block definition for the Mistral PDF parser.
+ *
+ * Collects a PDF URL, an output format, an optional page list and an API key
+ * from the editor, validates them, and maps them onto the parameters passed to
+ * the `mistral_parser` tool.
+ */
+export const MistralParseBlock: BlockConfig<MistralParserOutput> = {
+  type: 'mistral_parse',
+  name: 'Mistral PDF Parser',
+  description: 'Extract text from PDF documents',
+  longDescription:
+    'Extract text and structure from PDF documents using Mistral\'s OCR API. Enter a URL to a PDF document, configure processing options, and get the content in your preferred format.',
+  category: 'tools',
+  bgColor: '#000000',
+  icon: MistralIcon,
+  subBlocks: [
+    {
+      id: 'filePath',
+      title: 'PDF Document URL',
+      type: 'short-input',
+      layout: 'full',
+      placeholder: 'Enter full URL to a PDF document (https://example.com/document.pdf)',
+    },
+    {
+      id: 'resultType',
+      title: 'Output Format',
+      type: 'dropdown',
+      layout: 'full',
+      options: [
+        { id: 'markdown', label: 'Markdown (Formatted)' },
+        { id: 'text', label: 'Plain Text' },
+        { id: 'json', label: 'JSON (Raw)' }
+      ],
+    },
+    {
+      id: 'pages',
+      title: 'Specific Pages',
+      type: 'short-input',
+      layout: 'full',
+      placeholder: 'e.g. 0,1,2 (leave empty for all pages)',
+    },
+    /*
+     * Image-related parameters - temporarily disabled
+     * Uncomment if PDF image extraction is needed
+     *
+    {
+      id: 'includeImageBase64',
+      title: 'Include PDF Images',
+      type: 'switch',
+      layout: 'half',
+    },
+    {
+      id: 'imageLimit',
+      title: 'Max Images',
+      type: 'short-input',
+      layout: 'half',
+      placeholder: 'Maximum number of images to extract',
+    },
+    {
+      id: 'imageMinSize',
+      title: 'Min Image Size (px)',
+      type: 'short-input',
+      layout: 'half',
+      placeholder: 'Min width/height in pixels',
+    },
+    */
+    {
+      id: 'apiKey',
+      title: 'API Key',
+      type: 'short-input',
+      layout: 'full',
+      placeholder: 'Enter your Mistral API key',
+      password: true,
+    },
+  ],
+  tools: {
+    access: ['mistral_parser'],
+    config: {
+      tool: () => 'mistral_parser',
+      /**
+       * Validates the raw sub-block values and builds the tool parameters.
+       *
+       * @param params - Raw values keyed by sub-block id.
+       * @returns `filePath`, `apiKey` and `resultType` (defaulting to
+       *   `'markdown'`), plus `pages` when at least one page was given.
+       * @throws Error when the API key or URL is missing, the URL is malformed
+       *   or not HTTP(S), or a page entry is not a non-negative integer.
+       */
+      params: (params) => {
+        // Basic validation
+        if (!params || !params.apiKey || params.apiKey.trim() === '') {
+          throw new Error('Mistral API key is required');
+        }
+
+        // `params` is known to be truthy past the check above.
+        if (!params.filePath || params.filePath.trim() === '') {
+          throw new Error('PDF Document URL is required');
+        }
+
+        // Validate URL format
+        let validatedUrl: URL;
+        try {
+          // The URL constructor throws on malformed input.
+          validatedUrl = new URL(params.filePath.trim());
+
+          // Ensure URL is using HTTP or HTTPS protocol
+          if (!['http:', 'https:'].includes(validatedUrl.protocol)) {
+            throw new Error(`URL must use HTTP or HTTPS protocol. Found: ${validatedUrl.protocol}`);
+          }
+        } catch (error) {
+          // The protocol error thrown above is also caught and re-wrapped here.
+          const errorMessage = error instanceof Error ? error.message : String(error);
+          throw new Error(`Invalid URL format: ${errorMessage}`);
+        }
+
+        // Process pages input (convert from comma-separated string to array of numbers)
+        let pagesArray: number[] | undefined = undefined;
+        if (params.pages && params.pages.trim() !== '') {
+          try {
+            pagesArray = params.pages
+              .split(',')
+              .map((p: string) => p.trim())
+              .filter((p: string) => p.length > 0)
+              .map((p: string) => {
+                // Require digits only: parseInt on its own would silently
+                // truncate entries such as "1-3", "2.5" or "3abc".
+                if (!/^\d+$/.test(p)) {
+                  throw new Error(`Invalid page number: ${p}`);
+                }
+                const num = parseInt(p, 10);
+                // Reject digit strings too large to be represented exactly.
+                if (!Number.isSafeInteger(num)) {
+                  throw new Error(`Invalid page number: ${p}`);
+                }
+                return num;
+              });
+
+            // Input such as "," yields no entries; treat it as "all pages".
+            if (pagesArray && pagesArray.length === 0) {
+              pagesArray = undefined;
+            }
+          } catch (error: unknown) {
+            const errorMessage = error instanceof Error ? error.message : String(error);
+            throw new Error(`Page number format error: ${errorMessage}`);
+          }
+        }
+
+        // Return structured parameters for the tool
+        const parameters: Record<string, unknown> = {
+          filePath: validatedUrl.toString(),
+          apiKey: params.apiKey.trim(),
+          // `||` is intentional: an empty selection also falls back to markdown.
+          resultType: params.resultType || 'markdown',
+        };
+
+        // Add optional parameters if they're defined
+        if (pagesArray && pagesArray.length > 0) {
+          parameters.pages = pagesArray;
+        }
+
+        /*
+         * Image-related parameters - temporarily disabled
+         * Uncomment if PDF image extraction is needed
+         * (validation lives here so the disabled inputs cannot fail a run)
+         *
+        if (typeof params.includeImageBase64 === 'boolean') {
+          parameters.includeImageBase64 = params.includeImageBase64;
+        }
+
+        if (params.imageLimit && params.imageLimit.trim() !== '') {
+          const limit = parseInt(params.imageLimit, 10);
+          if (isNaN(limit) || limit <= 0) {
+            throw new Error('Image limit must be a positive number');
+          }
+          parameters.imageLimit = limit;
+        }
+
+        if (params.imageMinSize && params.imageMinSize.trim() !== '') {
+          const size = parseInt(params.imageMinSize, 10);
+          if (isNaN(size) || size <= 0) {
+            throw new Error('Minimum image size must be a positive number');
+          }
+          parameters.imageMinSize = size;
+        }
+        */
+
+        return parameters;
+      },
+    },
+  },
+  inputs: {
+    filePath: { type: 'string', required: true },
+    apiKey: { type: 'string', required: true },
+    resultType: { type: 'string', required: false },
+    pages: { type: 'string', required: false },
+    // Image-related inputs - temporarily disabled
+    // includeImageBase64: { type: 'boolean', required: false },
+    // imageLimit: { type: 'string', required: false },
+    // imageMinSize: { type: 'string', required: false },
+  },
+  outputs: {
+    response: {
+      type: {
+        content: 'string',
+        metadata: 'json',
+      },
+    },
+  },
+}
diff --git a/sim/blocks/index.ts b/sim/blocks/index.ts
@@ -8,6 +8,7 @@ import { GoogleDocsBlock } from './blocks/docs'
 import { GoogleDriveBlock } from './blocks/drive'
 import { EvaluatorBlock } from './blocks/evaluator'
 import { ExaBlock } from './blocks/exa'
+import { MistralParseBlock } from './blocks/mistral-parse'
 import { FileBlock } from './blocks/file'
 import { FirecrawlBlock } from './blocks/firecrawl'
 import { FunctionBlock } from './blocks/function'
@@ -42,11 +43,12 @@ export {
   AgentBlock,
   AirtableBlock,
   ApiBlock,
-  FileBlock,
+  MistralParseBlock,
   FunctionBlock,
   VisionBlock,
   FirecrawlBlock,
   // GuestyBlock,
+  FileBlock,
   JinaBlock,
   TranslateBlock,
   SlackBlock,
@@ -86,8 +88,9 @@ const blocks: Record<string, BlockConfig> = {
   confluence: ConfluenceBlock,
   evaluator: EvaluatorBlock,
   exa: ExaBlock,
-  file: FileBlock,
+  mistral_parse: MistralParseBlock,
   firecrawl: FirecrawlBlock,
+  file: FileBlock,
   function: FunctionBlock,
   github: GitHubBlock,
   gmail: GmailBlock,
diff --git a/sim/components/icons.tsx b/sim/components/icons.tsx
@@ -1819,3 +1819,59 @@ export function DocumentIcon(props: SVGProps<SVGSVGElement>) {
     </svg>
   )
 }
+
+/**
+ * Mistral logo rendered as an inline SVG.
+ *
+ * @param props - SVG attributes spread onto the root `<svg>` element. They are
+ *   spread first, so the `width`, `height`, `viewBox`, `fill`, `xmlns` and
+ *   `preserveAspectRatio` attributes written below take precedence over
+ *   same-named props.
+ * @returns The `<svg>` element containing the logo paths.
+ *
+ * NOTE(review): the clipPath id `clip0_1621_58` is hard-coded, so every
+ * rendered instance of this icon reuses the same id.
+ */
+export function MistralIcon(props: SVGProps<SVGSVGElement>) {
+  return (
+    <svg
+      {...props}
+      width="22"
+      height="22"
+      viewBox="1 0.5 24 22"
+      fill="none"
+      xmlns="http://www.w3.org/2000/svg"
+      preserveAspectRatio="xMidYMid meet"
+    >
+      <g clipPath="url(#clip0_1621_58)">
+        <path d="M17.4541 0H21.8177V4.39481H17.4541V0Z" fill="black" />
+        <path d="M19.6367 0H24.0003V4.39481H19.6367V0Z" fill="#F7D046" />
+        <path
+          d="M0 0H4.36359V4.39481H0V0ZM0 4.39481H4.36359V8.78961H0V4.39481ZM0 8.78971H4.36359V13.1845H0V8.78971ZM0 13.1845H4.36359V17.5793H0V13.1845ZM0 17.5794H4.36359V21.9742H0V17.5794Z"
+          fill="black"
+        />
+        <path d="M2.18164 0H6.54523V4.39481H2.18164V0Z" fill="#F7D046" />
+        <path
+          d="M19.6362 4.39478H23.9998V8.78958H19.6362V4.39478ZM2.18164 4.39478H6.54523V8.78958H2.18164V4.39478Z"
+          fill="#F2A73B"
+        />
+        <path d="M13.0908 4.39478H17.4544V8.78958H13.0908V4.39478Z" fill="black" />
+        <path
+          d="M15.2732 4.39478H19.6368V8.78958H15.2732V4.39478ZM6.5459 4.39478H10.9095V8.78958H6.5459V4.39478Z"
+          fill="#F2A73B"
+        />
+        <path
+          d="M10.9096 8.78979H15.2732V13.1846H10.9096V8.78979ZM15.2732 8.78979H19.6368V13.1846H15.2732V8.78979ZM6.5459 8.78979H10.9096V13.1846H6.5459V8.78979Z"
+          fill="#EE792F"
+        />
+        <path d="M8.72754 13.1846H13.0911V17.5794H8.72754V13.1846Z" fill="black" />
+        <path d="M10.9092 13.1846H15.2728V17.5794H10.9092V13.1846Z" fill="#EB5829" />
+        <path
+          d="M19.6362 8.78979H23.9998V13.1846H19.6362V8.78979ZM2.18164 8.78979H6.54523V13.1846H2.18164V8.78979Z"
+          fill="#EE792F"
+        />
+        <path d="M17.4541 13.1846H21.8177V17.5794H17.4541V13.1846Z" fill="black" />
+        <path d="M19.6367 13.1846H24.0003V17.5794H19.6367V13.1846Z" fill="#EB5829" />
+        <path d="M17.4541 17.5793H21.8177V21.9742H17.4541V17.5793Z" fill="black" />
+        <path d="M2.18164 13.1846H6.54523V17.5794H2.18164V13.1846Z" fill="#EB5829" />
+        <path
+          d="M19.6362 17.5793H23.9998V21.9742H19.6362V17.5793ZM2.18164 17.5793H6.54523V21.9742H2.18164V17.5793Z"
+          fill="#EA3326"
+        />
+      </g>
+      <defs>
+        <clipPath id="clip0_1621_58">
+          <rect width="24" height="22" fill="white" />
+        </clipPath>
+      </defs>
+    </svg>
+  )
+}
diff --git a/sim/tools/index.ts b/sim/tools/index.ts
@@ -20,6 +20,7 @@ import { guestyGuestTool, guestyReservationTool } from './guesty'
 import { requestTool as httpRequest } from './http/request'
 import { contactsTool as hubspotContacts } from './hubspot/contacts'
 import { readUrlTool } from './jina/reader'
+import { mistralParserTool } from './mistral'
 import { notionReadTool, notionWriteTool } from './notion'
 import { dalleTool } from './openai/dalle'
 import { embeddingsTool as openAIEmbeddings } from './openai/embeddings'
@@ -116,6 +117,7 @@ export const tools: Record<string, ToolConfig> = {
   airtable_read: airtableReadTool,
   airtable_write: airtableWriteTool,
   airtable_update: airtableUpdateTool,
+  mistral_parser: mistralParserTool,
 }
 
 // Get a tool by its ID
@@ -295,7 +297,7 @@ function getCustomTool(customToolId: string): ToolConfig | undefined {
     },
 
     // Response handling
-    transformResponse: async (response: Response) => {
+    transformResponse: async (response: Response, params: Record<string, any>) => {
       const data = await response.json()
 
       if (!data.success) {
@@ -597,7 +599,7 @@ async function handleInternalRequest(
 
     // Use the tool's response transformer if available
     if (tool.transformResponse) {
-      return await tool.transformResponse(response)
+      return await tool.transformResponse(response, params)
     }
 
     // Default response handling
diff --git a/sim/tools/mistral/index.ts b/sim/tools/mistral/index.ts
@@ -0,0 +1,3 @@
+import { mistralParserTool } from './parser'
+
+export { mistralParserTool } 
diff --git a/sim/tools/mistral/parser.ts b/sim/tools/mistral/parser.ts

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+import { mistralParserTool } from './parser'`
	`2`	`+`
	`3`	`+export { mistralParserTool }`