diff --git a/api-reference/endpoint/parse.mdx b/api-reference/endpoint/parse.mdx
new file mode 100644
index 00000000..3d1c6fec
--- /dev/null
+++ b/api-reference/endpoint/parse.mdx
@@ -0,0 +1,31 @@
+---
+title: 'Parse'
+openapi: '/api-reference/v2-openapi.json POST /parse'
+---
+
+import ParseCurl from '/snippets/v2/parse/base/curl.mdx'
+
+Use `/v2/parse` to upload a local file and run it through the scrape pipeline (PDF/document parsing, markdown conversion, metadata extraction, and transformers).
+
+## Multipart Fields
+
+- `file` (required): The file to parse.
+- `options` (optional): JSON string of parse options.
+- `origin`, `integration`, `zeroDataRetention` (optional): Same semantics as `/v2/scrape`.
+
+## Allowed Options
+
+- `formats`: `markdown`, `html`, `rawHtml`, `links`, `images`, `summary`, `json`, `attributes`
+- `onlyMainContent`, `includeTags`, `excludeTags`, `parsers` (pdf only), `removeBase64Images`, `timeout`
+
+## Supported File Types
+
+- PDF (`application/pdf` or `%PDF` signature)
+- Office documents: `.docx`, `.odt`, `.rtf`, `.xlsx`, `.xls`
+- HTML: `.html`, `.htm`, `text/html`, `application/xhtml+xml`
+- Markdown: `.md`, `text/markdown`
+- Plain text: `text/plain`
+
+## Example
+
+<ParseCurl />
diff --git a/api-reference/v2-openapi.json b/api-reference/v2-openapi.json
index 8c65e021..ba027a34 100644
--- a/api-reference/v2-openapi.json
+++ b/api-reference/v2-openapi.json
@@ -131,6 +131,201 @@
}
}
},
+ "/parse": {
+ "post": {
+ "summary": "Parse a local file upload using the scrape pipeline",
+ "operationId": "parseLocalFile",
+ "tags": ["Scraping"],
+ "security": [
+ {
+ "bearerAuth": []
+ }
+ ],
+ "requestBody": {
+ "required": true,
+ "content": {
+ "multipart/form-data": {
+ "schema": {
+ "type": "object",
+ "properties": {
+ "file": {
+ "type": "string",
+ "format": "binary",
+ "description": "File to parse"
+ },
+ "options": {
+ "allOf": [
+ {
+ "$ref": "#/components/schemas/ParseOptions"
+ }
+ ],
+ "description": "Parse options to apply (send as JSON in the multipart part)."
+ },
+ "origin": {
+ "type": "string",
+ "description": "Request origin identifier"
+ },
+ "integration": {
+ "type": "string",
+ "description": "Integration identifier"
+ },
+ "zeroDataRetention": {
+ "type": "boolean",
+ "description": "If true, enable zero data retention for this parse. Contact support to enable."
+ }
+ },
+ "required": ["file"]
+ },
+ "encoding": {
+ "options": {
+ "contentType": "application/json"
+ }
+ }
+ }
+ }
+ },
+ "responses": {
+ "200": {
+ "description": "Successful response",
+ "content": {
+ "application/json": {
+ "schema": {
+ "$ref": "#/components/schemas/ScrapeResponse"
+ }
+ }
+ }
+ },
+ "400": {
+ "description": "Bad request",
+ "content": {
+ "application/json": {
+ "schema": {
+ "type": "object",
+ "properties": {
+ "success": {
+ "type": "boolean",
+ "example": false
+ },
+ "code": {
+ "type": "string",
+ "example": "BAD_REQUEST"
+ },
+ "error": {
+ "type": "string",
+ "example": "Invalid options JSON"
+ }
+ }
+ }
+ }
+ }
+ },
+ "402": {
+ "description": "Payment required",
+ "content": {
+ "application/json": {
+ "schema": {
+ "type": "object",
+ "properties": {
+ "error": {
+ "type": "string",
+ "example": "Payment required to access this resource."
+ }
+ }
+ }
+ }
+ }
+ },
+ "413": {
+ "description": "Payload too large",
+ "content": {
+ "application/json": {
+ "schema": {
+ "type": "object",
+ "properties": {
+ "success": {
+ "type": "boolean",
+ "example": false
+ },
+ "code": {
+ "type": "string",
+ "example": "PAYLOAD_TOO_LARGE"
+ },
+ "error": {
+ "type": "string",
+ "example": "File size exceeds limit"
+ }
+ }
+ }
+ }
+ }
+ },
+ "415": {
+ "description": "Unsupported media type",
+ "content": {
+ "application/json": {
+ "schema": {
+ "type": "object",
+ "properties": {
+ "success": {
+ "type": "boolean",
+ "example": false
+ },
+ "code": {
+ "type": "string",
+ "example": "SCRAPE_UNSUPPORTED_FILE_ERROR"
+ },
+ "error": {
+ "type": "string",
+ "example": "Unsupported file type"
+ }
+ }
+ }
+ }
+ }
+ },
+ "429": {
+ "description": "Too many requests",
+ "content": {
+ "application/json": {
+ "schema": {
+ "type": "object",
+ "properties": {
+ "error": {
+ "type": "string",
+ "example": "Request rate limit exceeded. Please wait and try again later."
+ }
+ }
+ }
+ }
+ }
+ },
+ "500": {
+ "description": "Server error",
+ "content": {
+ "application/json": {
+ "schema": {
+ "type": "object",
+ "properties": {
+ "success": {
+ "type": "boolean",
+ "example": false
+ },
+ "code": {
+ "type": "string",
+ "example": "UNKNOWN_ERROR"
+ },
+ "error": {
+ "type": "string",
+ "example": "An unexpected error occurred on the server."
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ },
"/batch/scrape": {
"post": {
"summary": "Scrape multiple URLs and optionally extract information using an LLM",
@@ -2782,6 +2977,191 @@
"description": "Output formats to include in the response. You can specify one or more formats, either as strings (e.g., `'markdown'`) or as objects with additional options (e.g., `{ type: 'json', schema: {...} }`). Some formats require specific options to be set. Example: `['markdown', { type: 'json', schema: {...} }]`.",
"default": ["markdown"]
},
+ "ParseFormats": {
+ "type": "array",
+ "items": {
+ "oneOf": [
+ {
+ "type": "object",
+ "title": "Markdown",
+ "properties": {
+ "type": {
+ "type": "string",
+ "enum": ["markdown"]
+ }
+ },
+ "required": ["type"]
+ },
+ {
+ "type": "object",
+ "title": "Summary",
+ "properties": {
+ "type": {
+ "type": "string",
+ "enum": ["summary"]
+ }
+ },
+ "required": ["type"]
+ },
+ {
+ "type": "object",
+ "title": "HTML",
+ "properties": {
+ "type": {
+ "type": "string",
+ "enum": ["html"]
+ }
+ },
+ "required": ["type"]
+ },
+ {
+ "type": "object",
+ "title": "Raw HTML",
+ "properties": {
+ "type": {
+ "type": "string",
+ "enum": ["rawHtml"]
+ }
+ },
+ "required": ["type"]
+ },
+ {
+ "type": "object",
+ "title": "Links",
+ "properties": {
+ "type": {
+ "type": "string",
+ "enum": ["links"]
+ }
+ },
+ "required": ["type"]
+ },
+ {
+ "type": "object",
+ "title": "Images",
+ "properties": {
+ "type": {
+ "type": "string",
+ "enum": ["images"]
+ }
+ },
+ "required": ["type"]
+ },
+ {
+ "type": "object",
+ "title": "JSON",
+ "properties": {
+ "type": {
+ "type": "string",
+ "enum": ["json"]
+ },
+ "schema": {
+ "type": "object",
+ "description": "The schema to use for the JSON output. Must conform to [JSON Schema](https://json-schema.org/)."
+ },
+ "prompt": {
+ "type": "string",
+ "description": "The prompt to use for the JSON output"
+ }
+ },
+ "required": ["type"]
+ },
+ {
+ "type": "object",
+ "title": "Attributes",
+ "properties": {
+ "type": {
+ "type": "string",
+ "enum": ["attributes"]
+ },
+ "selectors": {
+ "type": "array",
+ "description": "Extract specific attributes from elements",
+ "items": {
+ "type": "object",
+ "properties": {
+ "selector": {
+ "type": "string",
+ "description": "CSS selector to find elements"
+ },
+ "attribute": {
+ "type": "string",
+ "description": "Attribute name to extract (e.g., 'data-vehicle-name' or 'id')"
+ }
+ },
+ "required": ["selector", "attribute"]
+ }
+ }
+ },
+ "required": ["type", "selectors"]
+ }
+ ]
+ },
+ "description": "Output formats to include in the response. You can specify one or more formats, either as strings (e.g., `'markdown'`) or as objects with additional options (e.g., `{ type: 'json', schema: {...} }`). Some formats require specific options to be set. Example: `['markdown', { type: 'json', schema: {...} }]`.",
+ "default": ["markdown"]
+ },
+ "ParseOptions": {
+ "type": "object",
+ "properties": {
+ "formats": {
+ "$ref": "#/components/schemas/ParseFormats"
+ },
+ "onlyMainContent": {
+ "type": "boolean",
+ "description": "Only return the main content excluding headers, navs, footers, etc.",
+ "default": true
+ },
+ "includeTags": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ },
+ "description": "Tags to include in the output."
+ },
+ "excludeTags": {
+ "type": "array",
+ "items": {
+ "type": "string"
+ },
+ "description": "Tags to exclude from the output."
+ },
+ "timeout": {
+ "type": "integer",
+ "description": "Timeout in milliseconds for the request."
+ },
+ "parsers": {
+ "type": "array",
+ "description": "Controls how PDFs are processed during parsing. When \"pdf\" is included (default), the PDF content is extracted and converted to markdown format, with billing based on the number of pages (1 credit per page). When an empty array is passed, the PDF file is returned in base64 encoding with a flat rate of 1 credit for the entire PDF.",
+ "items": {
+ "oneOf": [
+ {
+ "type": "object",
+ "properties": {
+ "type": {
+ "type": "string",
+ "enum": ["pdf"]
+ },
+ "maxPages": {
+ "type": "integer",
+ "minimum": 1,
+ "maximum": 10000,
+ "description": "Maximum number of pages to parse from the PDF. Must be a positive integer up to 10000."
+ }
+ },
+ "required": ["type"],
+ "additionalProperties": false
+ }
+ ]
+ },
+ "default": ["pdf"]
+ },
+ "removeBase64Images": {
+ "type": "boolean",
+ "description": "Removes all base 64 images from the output, which may be overwhelmingly long. The image's alt text remains in the output, but the URL is replaced with a placeholder.",
+ "default": true
+ }
+ }
+ },
"ScrapeOptions": {
"type": "object",
"properties": {
diff --git a/docs.json b/docs.json
index 14f32b37..13246b52 100755
--- a/docs.json
+++ b/docs.json
@@ -233,6 +233,7 @@
"group": "Scrape Endpoints",
"pages": [
"api-reference/endpoint/scrape",
+ "api-reference/endpoint/parse",
"api-reference/endpoint/batch-scrape",
"api-reference/endpoint/batch-scrape-get",
"api-reference/endpoint/batch-scrape-delete",
@@ -2809,4 +2810,4 @@
"destination": "/api-reference/endpoint/:slug*"
}
]
-}
\ No newline at end of file
+}
diff --git a/sdks/node.mdx b/sdks/node.mdx
index 6016872d..e3b49bb2 100644
--- a/sdks/node.mdx
+++ b/sdks/node.mdx
@@ -9,6 +9,7 @@ og:description: "Firecrawl Node SDK is a wrapper around the Firecrawl API to hel
import InstallationNode from '/snippets/v2/installation/js.mdx'
import ScrapeAndCrawlExampleNode from '/snippets/v2/scrape-and-crawl/js.mdx'
import ScrapeNodeShort from '/snippets/v2/scrape/short/js.mdx'
+import ParseNodeBase from '/snippets/v2/parse/base/js.mdx'
import CrawlNodeShort from '/snippets/v2/crawl/short/js.mdx'
import CrawlSitemapOnlyNode from '/snippets/v2/crawl/sitemap-only/js.mdx'
import StartCrawlNodeShort from '/snippets/v2/start-crawl/short/js.mdx'
@@ -40,6 +41,12 @@ To scrape a single URL with error handling, use the `scrapeUrl` method. It takes
+### Parsing a Local File
+
+Use `parse` to upload a local file and run it through the scraping pipeline.
+
+<ParseNodeBase />
+
### Crawling a Website
To crawl a website with error handling, use the `crawlUrl` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format. See [Pagination](#pagination) for auto/ manual pagination and limiting.
diff --git a/sdks/python.mdx b/sdks/python.mdx
index 8f3b1440..217e974e 100644
--- a/sdks/python.mdx
+++ b/sdks/python.mdx
@@ -8,6 +8,7 @@ og:description: "Firecrawl Python SDK is a wrapper around the Firecrawl API to h
import InstallationPython from '/snippets/v2/installation/python.mdx'
import ScrapePythonShort from '/snippets/v2/scrape/short/python.mdx'
+import ParsePythonBase from '/snippets/v2/parse/base/python.mdx'
import CrawlPythonShort from '/snippets/v2/crawl/short/python.mdx'
import CrawlSitemapOnlyPython from '/snippets/v2/crawl/sitemap-only/python.mdx'
import CheckCrawlStatusPythonShort from '/snippets/v2/crawl-status/short/python.mdx'
@@ -41,6 +42,12 @@ To scrape a single URL, use the `scrape` method. It takes the URL as a parameter
+### Parsing a Local File
+
+Use `parse` to upload a local file and run it through the scraping pipeline.
+
+<ParsePythonBase />
+
### Crawl a Website
To crawl a website, use the `crawl` method. It takes the starting URL and optional options as arguments. The options allow you to specify additional settings for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format. See [Pagination](#pagination) for auto/manual pagination and limiting.
diff --git a/sdks/rust.mdx b/sdks/rust.mdx
index edf268b2..54f1e796 100644
--- a/sdks/rust.mdx
+++ b/sdks/rust.mdx
@@ -9,6 +9,7 @@ og:description: "Firecrawl Rust SDK is a library to help you easily scrape and c
import InstallationRust from '/snippets/v1/installation/rust.mdx'
import ScrapeAndCrawlRustExample from '/snippets/v1/scrape-and-crawl/rust.mdx'
import ScrapeRustShort from '/snippets/v1/scrape/short/rust.mdx'
+import ParseRustBase from '/snippets/v2/parse/base/rust.mdx'
import CrawlRustShort from '/snippets/v1/crawl/short/rust.mdx'
import CrawlSitemapOnlyRust from '/snippets/v2/crawl/sitemap-only/rust.mdx'
import CrawlAsyncRustShort from '/snippets/v1/crawl-async/short/rust.mdx'
@@ -16,7 +17,7 @@ import MapRustShort from '/snippets/v1/map/short/rust.mdx'
import LLMExtractRust from '/snippets/v1/llm-extract/base/rust.mdx'
-This SDK currently uses the **v1** version of the Firecrawl API, which is not the most recent (v2 is available). Some features and improvements may only be available in v2.
+This SDK currently uses the **v1** version of the Firecrawl API for most endpoints (v2 is available). The `parse_file` method uses the v2 parse endpoint.
## Installation
@@ -39,6 +40,12 @@ To scrape a single URL, use the `scrape_url` method. It takes the URL as a param
+### Parsing a Local File (v2 API)
+
+Use `parse_file` to upload a local document and parse it with the v2 pipeline.
+
+<ParseRustBase />
+
### Scraping with Extract
With Extract, you can easily extract structured data from any URL. You need to specify your schema in the JSON Schema format, using the `serde_json::json!` macro.
diff --git a/snippets/v2/parse/base/curl.mdx b/snippets/v2/parse/base/curl.mdx
new file mode 100644
index 00000000..1a30e80c
--- /dev/null
+++ b/snippets/v2/parse/base/curl.mdx
@@ -0,0 +1,6 @@
+```bash
+curl -X POST "https://api.firecrawl.dev/v2/parse" \
+ -H "Authorization: Bearer YOUR_API_KEY" \
+ -F "file=@./sample.pdf" \
+ -F 'options={"formats":["markdown"]}'
+```
diff --git a/snippets/v2/parse/base/js.mdx b/snippets/v2/parse/base/js.mdx
new file mode 100644
index 00000000..74db31b2
--- /dev/null
+++ b/snippets/v2/parse/base/js.mdx
@@ -0,0 +1,13 @@
+```js Node
+import fs from 'node:fs';
+
+const file = fs.readFileSync('./sample.pdf');
+
+const parseResult = await firecrawl.parse(
+ file,
+ { formats: ['markdown'] },
+ { filename: 'sample.pdf' }
+);
+
+console.log(parseResult);
+```
diff --git a/snippets/v2/parse/base/python.mdx b/snippets/v2/parse/base/python.mdx
new file mode 100644
index 00000000..d14b18a2
--- /dev/null
+++ b/snippets/v2/parse/base/python.mdx
@@ -0,0 +1,9 @@
+```python
+doc = firecrawl.parse(
+ "./sample.pdf",
+ formats=["markdown"],
+ filename="sample.pdf",
+)
+
+print(doc)
+```
diff --git a/snippets/v2/parse/base/rust.mdx b/snippets/v2/parse/base/rust.mdx
new file mode 100644
index 00000000..37b75c7d
--- /dev/null
+++ b/snippets/v2/parse/base/rust.mdx
@@ -0,0 +1,11 @@
+```rust
+use firecrawl::parse::{ParseFormats, ParseOptions};
+
+let options = ParseOptions {
+ formats: Some(vec![ParseFormats::Markdown]),
+ ..Default::default()
+};
+
+let doc = app.parse_file("sample.pdf", Some(options)).await?;
+println!("{:#?}", doc);
+```