Skip to content

Commit 33c5c52

Browse files
authored
feat(parser): added URL upload for our file parser (#229)
* added URL upload for our file parser * resolved PR comments
1 parent 2de6f45 commit 33c5c52

File tree

4 files changed

+307
-101
lines changed

4 files changed

+307
-101
lines changed

sim/app/api/files/parse/route.ts

Lines changed: 137 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
import { NextRequest, NextResponse } from 'next/server'
2-
import { existsSync } from 'fs'
3-
import fs from 'fs'
4-
import { readFile, unlink, writeFile } from 'fs/promises'
2+
import * as binExt from 'binary-extensions'
3+
import { Buffer } from 'buffer'
4+
import { createHash } from 'crypto'
5+
import fsPromises, { readFile, unlink, writeFile } from 'fs/promises'
6+
import { tmpdir } from 'os'
57
import { join } from 'path'
68
import path from 'path'
79
import { isSupportedFileType, parseFile } from '@/lib/file-parsers'
@@ -12,6 +14,10 @@ import '@/lib/uploads/setup.server'
1214

1315
const logger = createLogger('FilesParseAPI')
1416

17+
// Constants for URL downloads
18+
const MAX_DOWNLOAD_SIZE_BYTES = 100 * 1024 * 1024 // 100 MB
19+
const DOWNLOAD_TIMEOUT_MS = 30000 // 30 seconds
20+
1521
interface ParseSuccessResult {
1622
success: true
1723
output: {
@@ -140,6 +146,11 @@ export async function POST(request: NextRequest) {
140146
async function parseFileSingle(filePath: string, fileType?: string): Promise<ParseResult> {
141147
logger.info('Parsing file:', filePath)
142148

149+
// Check if this is an external URL
150+
if (filePath.startsWith('http://') || filePath.startsWith('https://')) {
151+
return handleExternalUrl(filePath, fileType)
152+
}
153+
143154
// Check if this is an S3 path
144155
const isS3Path = filePath.includes('/api/files/serve/s3/')
145156

@@ -152,6 +163,118 @@ async function parseFileSingle(filePath: string, fileType?: string): Promise<Par
152163
return handleLocalFile(filePath, fileType)
153164
}
154165

166+
/**
 * Derive a file-extension-like token from a Content-Type header value.
 * Media-type parameters (e.g. `; charset=utf-8`) are stripped so that
 * `application/pdf; charset=binary` yields `pdf`. Returns `'unknown'` when
 * the header is empty or has no usable subtype.
 */
function extensionFromContentType(contentType: string): string {
  const [mediaType = ''] = contentType.split(';')
  const subtype = mediaType.trim().toLowerCase().split('/').pop()
  return subtype || 'unknown'
}

/**
 * Read a fetch response body into a Buffer, aborting as soon as more than
 * `maxBytes` have been received. This enforces the size limit even when the
 * server omits or misreports Content-Length.
 *
 * @throws Error when the body is larger than `maxBytes`.
 */
async function readBodyWithLimit(response: Response, maxBytes: number): Promise<Buffer> {
  const tooLarge = (): Error =>
    new Error(`File size exceeds the limit of ${prettySize(maxBytes)}.`)

  // No stream available: fall back to a single read and check afterwards.
  if (!response.body) {
    const whole = Buffer.from(await response.arrayBuffer())
    if (whole.length > maxBytes) {
      throw tooLarge()
    }
    return whole
  }

  const reader = response.body.getReader()
  const chunks: Uint8Array[] = []
  let received = 0

  for (;;) {
    const { done, value } = await reader.read()
    if (done) break

    received += value.byteLength
    if (received > maxBytes) {
      // Stop pulling data; a failure to cancel must not mask the size error.
      await reader.cancel().catch(() => undefined)
      throw tooLarge()
    }
    chunks.push(value)
  }

  return Buffer.concat(chunks, received)
}

/**
 * Handle an external URL by downloading the file and parsing it in memory.
 *
 * The download is bounded by DOWNLOAD_TIMEOUT_MS and MAX_DOWNLOAD_SIZE_BYTES.
 * The parser is chosen from the extension of the last URL path segment, or,
 * when that segment has no extension, from the response Content-Type.
 * The downloaded bytes are handed to the buffer-based handlers directly; no
 * temporary file is created because none of those handlers reads one.
 *
 * @param url - http(s) URL of the file to download
 * @param fileType - optional file type hint, forwarded to the handlers
 * @returns the handler's result, or `{ success: false, error, filePath: url }`
 *          when the download or parsing fails (this function does not throw)
 */
async function handleExternalUrl(url: string, fileType?: string): Promise<ParseResult> {
  logger.info(`Handling external URL: ${url}`)

  try {
    // An invalid URL throws here and is reported through the catch below.
    const originalFilename = new URL(url).pathname.split('/').pop() || 'download'

    // NOTE(security): `url` is caller-supplied and is fetched from the server,
    // so it can target internal hosts (SSRF). Consider rejecting private,
    // loopback and link-local addresses before fetching.
    logger.info(`Downloading file from URL: ${url}`)
    const response = await fetch(url, {
      method: 'GET',
      headers: {
        'User-Agent': 'SimStudio/1.0',
      },
      signal: AbortSignal.timeout(DOWNLOAD_TIMEOUT_MS),
    })

    if (!response.ok) {
      throw new Error(`Failed to download file: ${response.status} ${response.statusText}`)
    }

    // Fast rejection when the server announces an oversized body up front.
    const contentLength = response.headers.get('content-length')
    if (contentLength) {
      const announcedSize = Number.parseInt(contentLength, 10)
      if (announcedSize > MAX_DOWNLOAD_SIZE_BYTES) {
        throw new Error(
          `File size (${prettySize(announcedSize)}) exceeds the limit of ${prettySize(
            MAX_DOWNLOAD_SIZE_BYTES
          )}.`
        )
      }
    } else {
      logger.warn('Content-Length header missing, enforcing size limit while downloading.')
    }

    // The header is only advisory, so the limit is enforced again on the actual bytes.
    const fileBuffer = await readBodyWithLimit(response, MAX_DOWNLOAD_SIZE_BYTES)
    logger.info(`Downloaded ${fileBuffer.length} bytes from ${url}`)

    // Prefer the extension in the URL; fall back to the Content-Type subtype.
    const extension =
      path.extname(originalFilename).toLowerCase().substring(1) ||
      extensionFromContentType(response.headers.get('content-type') || '')

    try {
      if (extension === 'pdf') {
        return await handlePdfBuffer(fileBuffer, originalFilename, fileType, url)
      }
      if (extension === 'csv') {
        return await handleCsvBuffer(fileBuffer, originalFilename, fileType, url)
      }
      if (isSupportedFileType(extension)) {
        return await handleGenericTextBuffer(
          fileBuffer,
          originalFilename,
          extension,
          fileType,
          url
        )
      }
      return handleGenericBuffer(fileBuffer, originalFilename, extension, fileType)
    } catch (parseError) {
      logger.error(`Error parsing downloaded file: ${url}`, parseError)
      throw parseError
    }
  } catch (error: unknown) {
    logger.error(`Error handling external URL ${url}:`, error)

    const message = error instanceof Error ? error.message : String(error)
    // AbortSignal.timeout() rejects with an error whose name is 'TimeoutError'.
    const timedOut =
      typeof error === 'object' &&
      error !== null &&
      (error as { name?: unknown }).name === 'TimeoutError'

    return {
      success: false,
      error: timedOut
        ? `Download timed out after ${DOWNLOAD_TIMEOUT_MS / 1000} seconds.`
        : `Failed to download or process file from URL: ${message}`,
      filePath: url,
    }
  }
}
277+
155278
/**
156279
* Handle file stored in S3
157280
*/
@@ -358,7 +481,7 @@ function handleGenericBuffer(
358481
extension: string,
359482
fileType?: string
360483
): ParseResult {
361-
const isBinary = binaryExtensions.includes(extension)
484+
const isBinary = binExt.includes(extension)
362485
const content = isBinary
363486
? `[Binary ${extension.toUpperCase()} file - ${fileBuffer.length} bytes]`
364487
: fileBuffer.toString('utf-8')
@@ -428,7 +551,7 @@ async function handleLocalFile(filePath: string, fileType?: string): Promise<Par
428551

429552
// Make sure the file is actually a file that exists
430553
try {
431-
await fs.promises.access(localFilePath, fs.constants.R_OK)
554+
await fsPromises.access(localFilePath, fsPromises.constants.R_OK)
432555
} catch (error) {
433556
logger.error(`File access error: ${localFilePath}`, error)
434557
return {
@@ -439,7 +562,7 @@ async function handleLocalFile(filePath: string, fileType?: string): Promise<Par
439562
}
440563

441564
// Get file stats
442-
const stats = await fs.promises.stat(localFilePath)
565+
const stats = await fsPromises.stat(localFilePath)
443566
if (!stats.isFile()) {
444567
logger.error(`Not a file: ${localFilePath}`)
445568
return {
@@ -564,18 +687,21 @@ async function handleGenericFile(
564687
const fileSize = fileBuffer.length
565688

566689
// Determine if file should be treated as binary
567-
const isBinary = binaryExtensions.includes(extension)
690+
const isBinary = binExt.includes(extension)
568691

569692
// Parse content based on binary status
570-
const fileContent = isBinary
571-
? `[Binary ${extension.toUpperCase()} file - ${fileSize} bytes]`
572-
: await parseTextFile(fileBuffer)
693+
let content: string
694+
if (isBinary) {
695+
content = `[Binary ${extension.toUpperCase()} file - ${fileSize} bytes]`
696+
} else {
697+
content = await parseTextFile(fileBuffer)
698+
}
573699

574700
// Always return success: true for generic files (even unsupported ones)
575701
return {
576702
success: true,
577703
output: {
578-
content: fileContent,
704+
content,
579705
fileType: fileType || getMimeType(extension),
580706
size: fileSize,
581707
name: filename,

sim/blocks/blocks/file.ts

Lines changed: 96 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,60 +1,132 @@
11
import { DocumentIcon } from '@/components/icons'
2-
import { FileParserOutput } from '@/tools/file/parser'
3-
import { BlockConfig } from '../types'
2+
import { createLogger } from '@/lib/logs/console-logger'
3+
import { FileParserOutput } from '@/tools/file/types'
4+
import { BlockConfig, SubBlockConfig, SubBlockLayout, SubBlockType } from '../types'
5+
6+
const logger = createLogger('FileBlock')
7+
8+
const isProduction = process.env.NODE_ENV === 'production'
9+
const isS3Enabled = process.env.USE_S3 === 'true'
10+
const shouldEnableURLInput = isProduction || isS3Enabled
11+
12+
// Sub-block definitions. They are always defined here; FileBlock includes the
// input-method dropdown and the URL field only when shouldEnableURLInput is true.
// The `SubBlockConfig` annotations type-check the literals directly, so no
// `as` assertions are needed (an assertion would hide a misspelled literal).

/** Dropdown that lets the user choose between a file URL and uploaded files. */
const inputMethodBlock: SubBlockConfig = {
  id: 'inputMethod',
  title: 'Select Input Method',
  type: 'dropdown',
  layout: 'full',
  options: [
    { id: 'url', label: 'File URL' },
    { id: 'upload', label: 'Upload Files' },
  ],
}

/** Text input for the file URL; shown only when inputMethod is 'url'. */
const fileUrlBlock: SubBlockConfig = {
  id: 'filePath',
  title: 'File URL',
  type: 'short-input',
  layout: 'full',
  placeholder: 'Enter URL to a file (https://example.com/document.pdf)',
  condition: {
    field: 'inputMethod',
    value: 'url',
  },
}

/** File upload control accepting multiple PDF, CSV and DOCX files. */
const fileUploadBlock: SubBlockConfig = {
  id: 'file',
  title: 'Upload Files',
  type: 'file-upload',
  layout: 'full',
  acceptedTypes: '.pdf,.csv,.docx',
  multiple: true,
  maxSize: 100, // unit not visible here — presumably MB; confirm against the upload component
}
445

546
export const FileBlock: BlockConfig<FileParserOutput> = {
647
type: 'file',
748
name: 'File',
849
description: 'Read and parse multiple files',
950
longDescription:
10-
'Upload and extract contents from structured file formats including PDFs, CSV spreadsheets, and Word documents (DOCX). Specialized parsers extract text and metadata from each format. You can upload multiple files at once and access them individually or as a combined document.',
51+
'Upload and extract contents from structured file formats including PDFs, CSV spreadsheets, and Word documents (DOCX). ' +
52+
(shouldEnableURLInput
53+
? 'You can either provide a URL to a file or upload files directly. '
54+
: 'Upload files directly. ') +
55+
'Specialized parsers extract text and metadata from each format. You can upload multiple files at once and access them individually or as a combined document.',
1156
category: 'tools',
1257
bgColor: '#40916C',
1358
icon: DocumentIcon,
1459
subBlocks: [
60+
...(shouldEnableURLInput ? [inputMethodBlock, fileUrlBlock] : []),
1561
{
16-
id: 'file',
17-
title: 'Upload Files',
18-
type: 'file-upload',
19-
layout: 'full',
20-
acceptedTypes: '.pdf,.csv,.docx',
21-
multiple: true,
22-
maxSize: 100,
62+
...fileUploadBlock,
63+
...(shouldEnableURLInput ? { condition: { field: 'inputMethod', value: 'upload' } } : {}),
2364
},
2465
],
2566
tools: {
2667
access: ['file_parser'],
2768
config: {
2869
tool: () => 'file_parser',
2970
params: (params) => {
30-
console.log('File block params:', params)
71+
// Determine input method based on whether URL input is enabled
72+
const inputMethod = shouldEnableURLInput ? params.inputMethod || 'url' : 'upload'
73+
74+
if (inputMethod === 'url') {
75+
if (!params.filePath || params.filePath.trim() === '') {
76+
logger.error('Missing file URL')
77+
throw new Error('File URL is required')
78+
}
79+
80+
const fileUrl = params.filePath.trim()
3181

32-
// Handle case where 'file' is an array (multiple files)
33-
if (params.file && Array.isArray(params.file) && params.file.length > 0) {
34-
// Process all files by sending array of paths
35-
const filePaths = params.file.map((file) => file.path)
3682
return {
37-
filePath: filePaths.length === 1 ? filePaths[0] : filePaths,
83+
filePath: fileUrl,
3884
fileType: params.fileType || 'auto',
3985
}
4086
}
4187

42-
// Handle case where 'file' is a single file object
43-
if (params.file && params.file.path) {
44-
return {
45-
filePath: params.file.path,
46-
fileType: params.fileType || 'auto',
88+
// Handle file upload input (always possible, default if URL input disabled)
89+
if (inputMethod === 'upload') {
90+
// Handle case where 'file' is an array (multiple files)
91+
if (params.file && Array.isArray(params.file) && params.file.length > 0) {
92+
const filePaths = params.file.map((file) => file.path)
93+
94+
return {
95+
filePath: filePaths.length === 1 ? filePaths[0] : filePaths,
96+
fileType: params.fileType || 'auto',
97+
}
98+
}
99+
100+
// Handle case where 'file' is a single file object
101+
if (params.file && params.file.path) {
102+
return {
103+
filePath: params.file.path,
104+
fileType: params.fileType || 'auto',
105+
}
47106
}
107+
108+
// If no files, return error
109+
logger.error('No files provided for upload method')
110+
throw new Error('Please upload a file') // Changed error message slightly
48111
}
49112

50-
// If no files, return empty params
51-
return { filePath: '', fileType: params.fileType || 'auto' }
113+
// This part should ideally not be reached if logic above is correct
114+
logger.error(`Invalid configuration or state: ${inputMethod}`)
115+
throw new Error('Invalid configuration: Unable to determine input method')
52116
},
53117
},
54118
},
55119
inputs: {
120+
// Conditionally require inputMethod and filePath only if URL input is enabled
121+
...(shouldEnableURLInput
122+
? {
123+
inputMethod: { type: 'string', required: false }, // Not strictly required as it defaults
124+
filePath: { type: 'string', required: false }, // Required only if inputMethod is 'url' (validated in params)
125+
}
126+
: {}),
56127
fileType: { type: 'string', required: false },
57-
file: { type: 'json', required: true },
128+
// File input is always potentially needed, but only required if method is 'upload' (validated in params)
129+
file: { type: 'json', required: false },
58130
},
59131
outputs: {
60132
response: {

0 commit comments

Comments
 (0)