diff --git a/api-reference/endpoint/parse.mdx b/api-reference/endpoint/parse.mdx new file mode 100644 index 00000000..8f7b1848 --- /dev/null +++ b/api-reference/endpoint/parse.mdx @@ -0,0 +1,4 @@ +--- +title: 'Parse' +openapi: '/api-reference/v2-openapi.json POST /parse' +--- diff --git a/api-reference/v2-introduction.mdx b/api-reference/v2-introduction.mdx index d58f0430..90a752d8 100644 --- a/api-reference/v2-introduction.mdx +++ b/api-reference/v2-introduction.mdx @@ -11,6 +11,9 @@ The Firecrawl API gives you programmatic access to web data. All endpoints share Extract content from any webpage in markdown or json format. + + Upload files and parse them into markdown or other formats. + Crawl entire websites, extract their content and metadata. diff --git a/api-reference/v2-openapi.json b/api-reference/v2-openapi.json index cf275f2f..9a52be83 100644 --- a/api-reference/v2-openapi.json +++ b/api-reference/v2-openapi.json @@ -131,6 +131,136 @@ } } }, + "/parse": { + "post": { + "summary": "Upload and parse a file", + "operationId": "parseFile", + "tags": ["Scraping"], + "security": [ + { + "bearerAuth": [] + } + ], + "requestBody": { + "required": true, + "content": { + "multipart/form-data": { + "schema": { + "type": "object", + "properties": { + "file": { + "type": "string", + "format": "binary", + "description": "The file bytes to parse. Supported extensions: .html, .htm, .pdf, .docx, .doc, .odt, .rtf, .xlsx, .xls." 
+ }, + "options": { + "$ref": "#/components/schemas/ParseOptions" + } + }, + "required": ["file"] + }, + "encoding": { + "options": { + "contentType": "application/json" + } + } + } + } + }, + "responses": { + "200": { + "description": "Successful response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ScrapeResponse" + } + } + } + }, + "400": { + "description": "Bad request", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "success": { + "type": "boolean", + "example": false + }, + "code": { + "type": "string", + "example": "BAD_REQUEST" + }, + "error": { + "type": "string", + "example": "Invalid multipart form-data request." + } + } + } + } + } + }, + "402": { + "description": "Payment required", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Payment required to access this resource." + } + } + } + } + } + }, + "429": { + "description": "Too many requests", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "error": { + "type": "string", + "example": "Request rate limit exceeded. Please wait and try again later." + } + } + } + } + } + }, + "500": { + "description": "Server error", + "content": { + "application/json": { + "schema": { + "type": "object", + "properties": { + "success": { + "type": "boolean", + "example": false + }, + "code": { + "type": "string", + "example": "UNKNOWN_ERROR" + }, + "error": { + "type": "string", + "example": "An unexpected error occurred on the server." + } + } + } + } + } + } + } + } + }, "/batch/scrape": { "post": { "summary": "Scrape multiple URLs and optionally extract information using an LLM", @@ -3183,6 +3313,205 @@ "description": "Output formats to include in the response. 
You can specify one or more formats, either as strings (e.g., `'markdown'`) or as objects with additional options (e.g., `{ type: 'json', schema: {...} }`). Some formats require specific options to be set. Example: `['markdown', { type: 'json', schema: {...} }]`.", "default": ["markdown"] }, + "ParseFormats": { + "type": "array", + "items": { + "oneOf": [ + { + "type": "object", + "title": "Markdown", + "properties": { + "type": { + "type": "string", + "enum": ["markdown"] + } + }, + "required": ["type"] + }, + { + "type": "object", + "title": "Summary", + "properties": { + "type": { + "type": "string", + "enum": ["summary"] + } + }, + "required": ["type"] + }, + { + "type": "object", + "title": "HTML", + "properties": { + "type": { + "type": "string", + "enum": ["html"] + } + }, + "required": ["type"] + }, + { + "type": "object", + "title": "Raw HTML", + "properties": { + "type": { + "type": "string", + "enum": ["rawHtml"] + } + }, + "required": ["type"] + }, + { + "type": "object", + "title": "Links", + "properties": { + "type": { + "type": "string", + "enum": ["links"] + } + }, + "required": ["type"] + }, + { + "type": "object", + "title": "Images", + "properties": { + "type": { + "type": "string", + "enum": ["images"] + } + }, + "required": ["type"] + }, + { + "type": "object", + "title": "JSON", + "properties": { + "type": { + "type": "string", + "enum": ["json"] + }, + "schema": { + "type": "object", + "description": "The schema to use for the JSON output. Must conform to [JSON Schema](https://json-schema.org/)." + }, + "prompt": { + "type": "string", + "description": "The prompt to use for the JSON output" + } + }, + "required": ["type"] + } + ] + }, + "description": "Output formats supported for `/parse` uploads. 
Browser-rendering formats and change tracking are not supported.", + "default": ["markdown"] + }, + "ParseOptions": { + "type": "object", + "description": "Optional parse options sent as JSON in the multipart `options` field.", + "properties": { + "formats": { + "$ref": "#/components/schemas/ParseFormats" + }, + "onlyMainContent": { + "type": "boolean", + "description": "Only return the main content of the page excluding headers, navs, footers, etc.", + "default": true + }, + "includeTags": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Tags to include in the output." + }, + "excludeTags": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Tags to exclude from the output." + }, + "headers": { + "type": "object", + "description": "Headers to send when additional network requests are required." + }, + "timeout": { + "type": "integer", + "description": "Timeout in milliseconds for the request. Default is 30000 (30 seconds). Maximum is 300000 (300 seconds).", + "default": 30000, + "maximum": 300000 + }, + "parsers": { + "type": "array", + "description": "Controls file parser behavior when relevant (for example PDF parser mode).", + "items": { + "oneOf": [ + { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": ["pdf"] + }, + "mode": { + "type": "string", + "enum": ["fast", "auto", "ocr"], + "default": "auto", + "description": "PDF parsing mode. \"fast\": text-only extraction. \"auto\": text-first with OCR fallback. \"ocr\": OCR on every page." + }, + "maxPages": { + "type": "integer", + "minimum": 1, + "maximum": 10000, + "description": "Maximum number of pages to parse from the PDF." 
+ } + }, + "required": ["type"], + "additionalProperties": false + } + ] + }, + "default": ["pdf"] + }, + "skipTlsVerification": { + "type": "boolean", + "description": "Skip TLS certificate verification when making requests.", + "default": true + }, + "removeBase64Images": { + "type": "boolean", + "description": "Remove base64-encoded images from output and keep alt text placeholders.", + "default": true + }, + "blockAds": { + "type": "boolean", + "description": "Enable ad and cookie popup blocking.", + "default": true + }, + "proxy": { + "type": "string", + "enum": ["basic", "auto"], + "description": "Proxy mode for parse uploads. `/parse` supports only `basic` and `auto`." + }, + "origin": { + "type": "string", + "description": "Origin identifier for analytics and logging.", + "default": "api" + }, + "integration": { + "type": "string", + "nullable": true, + "description": "Optional integration identifier." + }, + "zeroDataRetention": { + "type": "boolean", + "default": false, + "description": "If true, this will enable zero data retention for this parse. To enable this feature, please contact help@firecrawl.dev" + } + } + }, "ScrapeOptions": { "type": "object", "properties": { diff --git a/docs.json b/docs.json index 5d2e89fd..43671c8f 100755 --- a/docs.json +++ b/docs.json @@ -254,6 +254,12 @@ "api-reference/endpoint/map" ] }, + { + "group": "Parse Endpoints", + "pages": [ + "api-reference/endpoint/parse" + ] + }, { "group": "Crawl Endpoints", "pages": [ diff --git a/features/document-parsing.mdx b/features/document-parsing.mdx index 4b89c030..78660cbf 100644 --- a/features/document-parsing.mdx +++ b/features/document-parsing.mdx @@ -51,7 +51,58 @@ parsers: [{ type: "pdf" }] ## How to Use Document Parsing -Document parsing in Firecrawl works automatically when you provide a URL that points to a supported document type. The system will detect the file type based on the URL extension or content-type header and process it accordingly. 
+Document parsing in Firecrawl works in two ways: + +1. **URL-based parsing (`/v2/scrape`)**: provide a URL that points to a supported document type. +2. **File upload parsing (`/v2/parse`)**: upload file bytes directly with `multipart/form-data`. + +For URL-based parsing, Firecrawl detects file type from extension or content type automatically. + +### Upload documents with `/v2/parse` + +Use `/v2/parse` when the source document is local or not publicly accessible by URL. + + + +```bash cURL +curl -X POST "https://api.firecrawl.dev/v2/parse" \ + -H "Authorization: Bearer fc-YOUR-API-KEY" \ + -F 'options={"formats":["markdown"]}' \ + -F "file=@./document.docx;type=application/vnd.openxmlformats-officedocument.wordprocessingml.document" +``` + +```js Node +import Firecrawl from "@mendable/firecrawl-js"; + +const app = new Firecrawl({ apiKey: "fc-YOUR-API-KEY" }); + +const doc = await app.parse( + { + data: "
<html><body>Upload Parse</body></html>
", + filename: "upload.html", + contentType: "text/html", + }, + { formats: ["markdown"] }, +); + +console.log(doc.markdown); +``` + +```python Python +from firecrawl import Firecrawl +from firecrawl.v2.types import ParseOptions + +app = Firecrawl(api_key="fc-YOUR-API-KEY") +doc = app.parse( + b"<html><body>Upload Parse</body></html>", + filename="upload.html", + content_type="text/html", + options=ParseOptions(formats=["markdown"]), +) +print(doc.markdown) +``` + +
### Example: Scraping an Excel File diff --git a/sdks/java.mdx b/sdks/java.mdx index d1f3b142..e5783969 100644 --- a/sdks/java.mdx +++ b/sdks/java.mdx @@ -128,6 +128,36 @@ System.out.println(doc.getMarkdown()); System.out.println(doc.getMetadata().get("title")); ``` +### Parsing uploaded files + +The latest Java SDK package (`com.firecrawl:firecrawl-java`) supports direct file uploads to `/v2/parse`. +`parse` does not support `changeTracking` or browser-only options like `screenshot`, `branding`, `actions`, `waitFor`, `location`, and `mobile`. + +```java Java +import com.firecrawl.client.FirecrawlClient; +import com.firecrawl.models.Document; +import com.firecrawl.models.ParseFile; +import com.firecrawl.models.ParseOptions; +import java.nio.charset.StandardCharsets; +import java.util.List; + +FirecrawlClient client = FirecrawlClient.fromEnv(); + +ParseFile file = ParseFile.builder() + .filename("upload.html") + .content("
<html><body>Java Parse</body></html>
" + .getBytes(StandardCharsets.UTF_8)) + .contentType("text/html") + .build(); + +Document parsed = client.parse( + file, + ParseOptions.builder().formats(List.of("markdown")).build() +); + +System.out.println(parsed.getMarkdown()); +``` + #### JSON Extraction Extract structured data using the Extract endpoint by specifying a JSON schema and prompt: diff --git a/sdks/node.mdx b/sdks/node.mdx index 55ee9952..35f13f3d 100644 --- a/sdks/node.mdx +++ b/sdks/node.mdx @@ -41,6 +41,26 @@ To scrape a single URL with error handling, use the `scrapeUrl` method. It takes +### Parsing uploaded files + +Use `parse` when you want to upload a local file (`html`, `pdf`, `docx`, `xlsx`, etc.) instead of scraping by URL. +`parse` does not support `changeTracking` or browser-only options like `screenshot`, `branding`, `actions`, `waitFor`, `location`, and `mobile`. + +```js Node +const parsed = await firecrawl.parse( + { + data: "
<html><body>Node Parse</body></html>
", + filename: "upload.html", + contentType: "text/html", + }, + { + formats: ["markdown"], + }, +); + +console.log(parsed.markdown); +``` + ### Crawling a Website To crawl a website with error handling, use the `crawlUrl` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format. See [Pagination](#pagination) for auto/ manual pagination and limiting. diff --git a/sdks/python.mdx b/sdks/python.mdx index a9030101..de4bcdce 100644 --- a/sdks/python.mdx +++ b/sdks/python.mdx @@ -42,6 +42,24 @@ To scrape a single URL, use the `scrape` method. It takes the URL as a parameter +### Parsing uploaded files + +Use `parse` to upload local files (`html`, `pdf`, `docx`, `xlsx`, etc.) directly to `/v2/parse`. +`parse` does not support `changeTracking` or browser-only options like actions, wait_for, location, mobile, screenshot, and branding. + +```python Python +from firecrawl.v2.types import ParseOptions + +parsed = firecrawl.parse( + b"
<html><body>Python Parse</body></html>
", + filename="upload.html", + content_type="text/html", + options=ParseOptions(formats=["markdown"]), +) + +print(parsed.markdown) +``` + ### Crawl a Website To crawl a website, use the `crawl` method. It takes the starting URL and optional options as arguments. The options allow you to specify additional settings for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format. See [Pagination](#pagination) for auto/manual pagination and limiting. @@ -211,6 +229,18 @@ For async operations, use the `AsyncFirecrawl` class. Its methods mirror `Firecr +```python Python +from firecrawl import AsyncFirecrawl + +async_firecrawl = AsyncFirecrawl(api_key="fc-YOUR-API-KEY") + +parsed = await async_firecrawl.parse( + b"
<html><body>Async Parse</body></html>
", + filename="upload.html", + content_type="text/html", +) +``` + ## Browser Launch cloud browser sessions and execute code remotely. diff --git a/sdks/rust.mdx b/sdks/rust.mdx index edf268b2..8f6a44ca 100644 --- a/sdks/rust.mdx +++ b/sdks/rust.mdx @@ -39,6 +39,31 @@ To scrape a single URL, use the `scrape_url` method. It takes the URL as a param +### Parsing uploaded files (v2) + +The Rust SDK includes a v2 client with multipart file parsing support: +`parse` does not support `changeTracking` or browser-only options like `screenshot`, `branding`, `actions`, `waitFor`, `location`, and `mobile`. + +```rust Rust +use firecrawl::v2::{Client, ParseFile, ParseFormat, ParseOptions}; + +let client = Client::new("fc-YOUR-API-KEY")?; + +let file = ParseFile::from_bytes( + "upload.html", + b"
<html><body>Rust Parse</body></html>
".to_vec(), +) +.with_content_type("text/html"); + +let options = ParseOptions { + formats: Some(vec![ParseFormat::Markdown]), + ..Default::default() +}; + +let doc = client.parse(file, Some(options)).await?; +println!("{:?}", doc.markdown); +``` + ### Scraping with Extract With Extract, you can easily extract structured data from any URL. You need to specify your schema in the JSON Schema format, using the `serde_json::json!` macro.