11import { NextRequest , NextResponse } from 'next/server'
2- import { existsSync } from 'fs'
3- import fs from 'fs'
4- import { readFile , unlink , writeFile } from 'fs/promises'
2+ import * as binExt from 'binary-extensions'
3+ import { Buffer } from 'buffer'
4+ import { createHash } from 'crypto'
5+ import fsPromises , { readFile , unlink , writeFile } from 'fs/promises'
6+ import { tmpdir } from 'os'
57import { join } from 'path'
68import path from 'path'
79import { isSupportedFileType , parseFile } from '@/lib/file-parsers'
@@ -12,6 +14,10 @@ import '@/lib/uploads/setup.server'
1214
1315const logger = createLogger ( 'FilesParseAPI' )
1416
17+ // Constants for URL downloads
18+ const MAX_DOWNLOAD_SIZE_BYTES = 100 * 1024 * 1024 // 100 MB
19+ const DOWNLOAD_TIMEOUT_MS = 30000 // 30 seconds
20+
1521interface ParseSuccessResult {
1622 success : true
1723 output : {
@@ -140,6 +146,11 @@ export async function POST(request: NextRequest) {
140146async function parseFileSingle ( filePath : string , fileType ?: string ) : Promise < ParseResult > {
141147 logger . info ( 'Parsing file:' , filePath )
142148
149+ // Check if this is an external URL
150+ if ( filePath . startsWith ( 'http://' ) || filePath . startsWith ( 'https://' ) ) {
151+ return handleExternalUrl ( filePath , fileType )
152+ }
153+
143154 // Check if this is an S3 path
144155 const isS3Path = filePath . includes ( '/api/files/serve/s3/' )
145156
@@ -152,6 +163,118 @@ async function parseFileSingle(filePath: string, fileType?: string): Promise<Par
152163 return handleLocalFile ( filePath , fileType )
153164}
154165
/**
 * Handle an external URL by downloading the file and parsing the downloaded bytes.
 *
 * The download is bounded by DOWNLOAD_TIMEOUT_MS and MAX_DOWNLOAD_SIZE_BYTES. The size
 * limit is checked against the Content-Length header when present and is also enforced
 * on the bytes actually received, so a missing or understated header cannot bypass it.
 *
 * The file extension is taken from the last URL path segment, falling back to the
 * subtype of the Content-Type header (parameters such as `; charset=utf-8` are ignored).
 *
 * @param url - Absolute http(s) URL to download
 * @param fileType - Optional MIME type override forwarded to the buffer handlers
 * @returns The handler's parse result, or `{ success: false, error, filePath: url }` when
 *          the download or parsing fails. This function does not throw.
 */
async function handleExternalUrl(url: string, fileType?: string): Promise<ParseResult> {
  logger.info(`Handling external URL: ${url}`)

  // NOTE(review): `url` is fetched server-side exactly as supplied. If it can originate
  // from untrusted callers, confirm that SSRF protections (blocking private/loopback
  // addresses) are applied before this point.

  try {
    // Create a unique filename for the temporary file
    const urlHash = createHash('md5').update(url).digest('hex')
    const urlObj = new URL(url)
    const originalFilename = urlObj.pathname.split('/').pop() || 'download'
    const tmpFilename = `${urlHash}-${originalFilename}`
    const tmpFilePath = path.join(tmpdir(), tmpFilename)

    // Download the file using native fetch
    logger.info(`Downloading file from URL to ${tmpFilePath}`)
    const response = await fetch(url, {
      method: 'GET',
      headers: {
        'User-Agent': 'SimStudio/1.0',
      },
      signal: AbortSignal.timeout(DOWNLOAD_TIMEOUT_MS),
    })

    if (!response.ok) {
      // Discard the unread body so the underlying connection is released promptly.
      await response.body?.cancel().catch(() => undefined)
      throw new Error(`Failed to download file: ${response.status} ${response.statusText}`)
    }

    // Fast rejection based on the declared size, before reading any content.
    const contentLength = response.headers.get('content-length')
    if (contentLength) {
      const declaredSize = Number.parseInt(contentLength, 10)
      // A malformed header parses to NaN and falls through to the streaming check below.
      if (Number.isFinite(declaredSize) && declaredSize > MAX_DOWNLOAD_SIZE_BYTES) {
        await response.body?.cancel().catch(() => undefined)
        throw new Error(
          `File size (${prettySize(declaredSize)}) exceeds the limit of ${prettySize(
            MAX_DOWNLOAD_SIZE_BYTES
          )}.`
        )
      }
    } else {
      logger.warn('Content-Length header missing, cannot verify file size before download.')
    }

    // Read the body while enforcing the limit on the bytes actually received.
    const fileBuffer = await readResponseBodyWithLimit(response, MAX_DOWNLOAD_SIZE_BYTES)

    // Determine file extension and type
    const contentType = response.headers.get('content-type') || ''
    const extension =
      path.extname(originalFilename).toLowerCase().substring(1) ||
      extensionFromContentType(contentType)

    try {
      // Write to temporary file (inside the try so a partial write is still cleaned up)
      await writeFile(tmpFilePath, fileBuffer)
      logger.info(`Downloaded ${fileBuffer.length} bytes to ${tmpFilePath}`)

      try {
        // Process based on file type
        if (extension === 'pdf') {
          return await handlePdfBuffer(fileBuffer, originalFilename, fileType, url)
        }
        if (extension === 'csv') {
          return await handleCsvBuffer(fileBuffer, originalFilename, fileType, url)
        }
        if (isSupportedFileType(extension)) {
          return await handleGenericTextBuffer(
            fileBuffer,
            originalFilename,
            extension,
            fileType,
            url
          )
        }
        return handleGenericBuffer(fileBuffer, originalFilename, extension, fileType)
      } catch (parseError) {
        logger.error(`Error parsing downloaded file: ${url}`, parseError)
        throw parseError
      }
    } finally {
      // Clean up the temporary file on every exit path; cleanup failure is non-fatal.
      try {
        await unlink(tmpFilePath)
        logger.info(`Deleted temporary file: ${tmpFilePath}`)
      } catch (cleanupError) {
        logger.warn(`Failed to delete temporary file ${tmpFilePath}:`, cleanupError)
      }
    }
  } catch (error) {
    logger.error(`Error handling external URL ${url}:`, error)

    // AbortSignal.timeout() rejects with an error whose name is 'TimeoutError'.
    const isTimeout = (error as { name?: unknown } | null)?.name === 'TimeoutError'
    const detail = error instanceof Error ? error.message : String(error)
    const errorMessage = isTimeout
      ? `Download timed out after ${DOWNLOAD_TIMEOUT_MS / 1000} seconds.`
      : `Failed to download or process file from URL: ${detail}`

    return {
      success: false,
      error: errorMessage,
      filePath: url,
    }
  }
}

/**
 * Read a fetch response body into a Buffer, aborting as soon as more than `maxBytes`
 * have been received.
 *
 * @throws Error when the body exceeds `maxBytes`
 */
async function readResponseBodyWithLimit(response: Response, maxBytes: number): Promise<Buffer> {
  const tooLarge = () => new Error(`File size exceeds the limit of ${prettySize(maxBytes)}.`)

  // No stream exposed (e.g. empty body): fall back to a buffered read and check afterwards.
  if (!response.body) {
    const buffered = Buffer.from(await response.arrayBuffer())
    if (buffered.length > maxBytes) {
      throw tooLarge()
    }
    return buffered
  }

  const reader = response.body.getReader()
  const chunks: Uint8Array[] = []
  let received = 0

  try {
    for (;;) {
      const { done, value } = await reader.read()
      if (done) break

      received += value.byteLength
      if (received > maxBytes) {
        // Stop the transfer instead of draining the rest of an oversized body.
        await reader.cancel().catch(() => undefined)
        throw tooLarge()
      }
      chunks.push(value)
    }
  } finally {
    reader.releaseLock()
  }

  return Buffer.concat(chunks, received)
}

/**
 * Derive a file extension from a Content-Type header value by taking the media subtype
 * and ignoring any parameters; returns 'unknown' when no subtype is available.
 */
function extensionFromContentType(contentType: string): string {
  const mediaType = (contentType.split(';')[0] ?? '').trim().toLowerCase()
  return mediaType.split('/').pop() || 'unknown'
}
277+
155278/**
156279 * Handle file stored in S3
157280 */
@@ -358,7 +481,7 @@ function handleGenericBuffer(
358481 extension : string ,
359482 fileType ?: string
360483) : ParseResult {
361- const isBinary = binaryExtensions . includes ( extension )
484+ const isBinary = binExt . includes ( extension )
362485 const content = isBinary
363486 ? `[Binary ${ extension . toUpperCase ( ) } file - ${ fileBuffer . length } bytes]`
364487 : fileBuffer . toString ( 'utf-8' )
@@ -428,7 +551,7 @@ async function handleLocalFile(filePath: string, fileType?: string): Promise<Par
428551
429552 // Make sure the file is actually a file that exists
430553 try {
431- await fs . promises . access ( localFilePath , fs . constants . R_OK )
554+ await fsPromises . access ( localFilePath , fsPromises . constants . R_OK )
432555 } catch ( error ) {
433556 logger . error ( `File access error: ${ localFilePath } ` , error )
434557 return {
@@ -439,7 +562,7 @@ async function handleLocalFile(filePath: string, fileType?: string): Promise<Par
439562 }
440563
441564 // Get file stats
442- const stats = await fs . promises . stat ( localFilePath )
565+ const stats = await fsPromises . stat ( localFilePath )
443566 if ( ! stats . isFile ( ) ) {
444567 logger . error ( `Not a file: ${ localFilePath } ` )
445568 return {
@@ -564,18 +687,21 @@ async function handleGenericFile(
564687 const fileSize = fileBuffer . length
565688
566689 // Determine if file should be treated as binary
567- const isBinary = binaryExtensions . includes ( extension )
690+ const isBinary = binExt . includes ( extension )
568691
569692 // Parse content based on binary status
570- const fileContent = isBinary
571- ? `[Binary ${ extension . toUpperCase ( ) } file - ${ fileSize } bytes]`
572- : await parseTextFile ( fileBuffer )
693+ let content : string
694+ if ( isBinary ) {
695+ content = `[Binary ${ extension . toUpperCase ( ) } file - ${ fileSize } bytes]`
696+ } else {
697+ content = await parseTextFile ( fileBuffer )
698+ }
573699
574700 // Always return success: true for generic files (even unsupported ones)
575701 return {
576702 success : true ,
577703 output : {
578- content : fileContent ,
704+ content,
579705 fileType : fileType || getMimeType ( extension ) ,
580706 size : fileSize ,
581707 name : filename ,
0 commit comments