From 3200663bd38529f53d037c5b5e2fa9ba13ad776b Mon Sep 17 00:00:00 2001 From: mogery Date: Mon, 4 May 2026 17:55:50 +0200 Subject: [PATCH 01/27] fix stuff --- apps/api/src/__tests__/snips/v2/scrape-query.test.ts | 6 +++++- .../src/__tests__/snips/v2/types-validation.test.ts | 12 ++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/apps/api/src/__tests__/snips/v2/scrape-query.test.ts b/apps/api/src/__tests__/snips/v2/scrape-query.test.ts index c49dd5cc03..660253cb33 100644 --- a/apps/api/src/__tests__/snips/v2/scrape-query.test.ts +++ b/apps/api/src/__tests__/snips/v2/scrape-query.test.ts @@ -33,6 +33,7 @@ describe("Query format", () => { expect(response.answer).toBeDefined(); expect(typeof response.answer).toBe("string"); expect(response.answer!.length).toBeGreaterThan(0); + expect(response.markdown).toBeUndefined(); }, scrapeTimeout, ); @@ -43,7 +44,10 @@ describe("Query format", () => { const response = await scrape( { url: "https://firecrawl.dev", - formats: ["markdown", { type: "query", prompt: "What is Firecrawl?" }], + formats: [ + "markdown", + { type: "query", prompt: "What is Firecrawl?" }, + ], }, identity, ); diff --git a/apps/api/src/__tests__/snips/v2/types-validation.test.ts b/apps/api/src/__tests__/snips/v2/types-validation.test.ts index 4e97a57311..d11c5928a2 100644 --- a/apps/api/src/__tests__/snips/v2/types-validation.test.ts +++ b/apps/api/src/__tests__/snips/v2/types-validation.test.ts @@ -79,6 +79,18 @@ describe("V2 Types Validation", () => { expect(result.timeout).toBe(60000); // Should be transformed from 30000 }); + it("should accept query format without markdown", () => { + const input: ScrapeRequestInput = { + url: "https://example.com", + formats: [{ type: "query", prompt: "What is Firecrawl?" 
}], + }; + + const result = scrapeRequestSchema.parse(input); + expect(result.formats).toEqual([ + { type: "query", prompt: "What is Firecrawl?", directQuote: false }, + ]); + }); + it("should accept valid scrape request with changeTracking format", () => { const input: ScrapeRequestInput = { url: "https://example.com", From e576fb0eb88e5a6107bbfdf10282d6e73b64ac65 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Tue, 5 May 2026 11:00:38 +0000 Subject: [PATCH 02/27] fix(elixir-sdk): add proper error handling and accept strings for enum params - Add Firecrawl.Error exception module for API error responses - Add Req response step that converts HTTP 4xx/5xx to {:error, %Firecrawl.Error{}} for standard functions and raises for bang (!) variants - Update enum params (model, sitemap) to accept both atoms and strings via {:or, [{:in, [atoms...]}, :string]} type - Update generate.exs to produce these patterns on re-generation - Add tests for error handling and string enum acceptance Co-Authored-By: gaurav --- apps/elixir-sdk/generate.exs | 11 +++- apps/elixir-sdk/lib/firecrawl.ex | 15 +++-- apps/elixir-sdk/lib/firecrawl/error.ex | 27 ++++++++ apps/elixir-sdk/test/firecrawl_test.exs | 86 +++++++++++++++++++++++++ 4 files changed, 133 insertions(+), 6 deletions(-) create mode 100644 apps/elixir-sdk/lib/firecrawl/error.ex diff --git a/apps/elixir-sdk/generate.exs b/apps/elixir-sdk/generate.exs index 94e09c5396..7c97f522c7 100644 --- a/apps/elixir-sdk/generate.exs +++ b/apps/elixir-sdk/generate.exs @@ -142,7 +142,7 @@ defmodule Firecrawl.Generator do ) \"\"\" - @type response :: {:ok, Req.Response.t()} | {:error, Exception.t()} + @type response :: {:ok, Req.Response.t()} | {:error, Exception.t() | Firecrawl.Error.t()} @base_url "#{base_url}" @@ -191,8 +191,15 @@ defmodule Firecrawl.Generator do #{auth_header_line} ) |> Req.merge(opts) + |> Req.Request.append_response_steps(firecrawl_error_handler: &handle_api_error/1) 
end + defp handle_api_error({request, %Req.Response{status: status} = response}) when status >= 400 do + {request, Firecrawl.Error.exception(status: status, body: response.body)} + end + + defp handle_api_error({request, response}), do: {request, response} + defp to_body(validated_params, key_mapping) do Map.new(validated_params, fn {k, v} -> json_key = Map.fetch!(key_mapping, k) @@ -486,7 +493,7 @@ defmodule Firecrawl.Generator do defp openapi_to_nimble_type(%{"type" => "string", "enum" => values}) do inspected = values |> Enum.map(&atom_literal/1) |> Enum.join(", ") - "{:in, [#{inspected}]}" + "{:or, [{:in, [#{inspected}]}, :string]}" end defp openapi_to_nimble_type(%{"type" => "string"}), do: ":string" diff --git a/apps/elixir-sdk/lib/firecrawl.ex b/apps/elixir-sdk/lib/firecrawl.ex index 6faaa94b5c..d974d853b5 100644 --- a/apps/elixir-sdk/lib/firecrawl.ex +++ b/apps/elixir-sdk/lib/firecrawl.ex @@ -38,7 +38,7 @@ defmodule Firecrawl do ) """ - @type response :: {:ok, Req.Response.t()} | {:error, Exception.t()} + @type response :: {:ok, Req.Response.t()} | {:error, Exception.t() | Firecrawl.Error.t()} @base_url "https://api.firecrawl.dev/v2" @@ -87,8 +87,15 @@ defmodule Firecrawl do headers: [{"authorization", "Bearer #{api_key}"}] ) |> Req.merge(opts) + |> Req.Request.append_response_steps(firecrawl_error_handler: &handle_api_error/1) end + defp handle_api_error({request, %Req.Response{status: status} = response}) when status >= 400 do + {request, Firecrawl.Error.exception(status: status, body: response.body)} + end + + defp handle_api_error({request, response}), do: {request, response} + defp to_body(validated_params, key_mapping) do Map.new(validated_params, fn {k, v} -> json_key = Map.fetch!(key_mapping, k) @@ -272,7 +279,7 @@ defmodule Firecrawl do prompt: [type: :string, doc: "A prompt to use to generate the crawler options (all the parameters below) from natural language. 
Explicitly set parameters will override the generated equivalents."], regex_on_full_url: [type: :boolean, doc: "When true, includePaths and excludePaths regex patterns are matched against the full URL (including query parameters) instead of just the URL pathname. Useful when you need to filter URLs based on query strings."], scrape_options: [type: :keyword_list], - sitemap: [type: {:in, [:skip, :include, :only]}, doc: "Sitemap mode when crawling. If you set it to 'skip', the crawler will ignore the website sitemap and only crawl the entered URL and discover pages from there onwards. If you set it to 'only', the crawler will only crawl URLs from the sitemap (plus the start URL) and will not discover links from HTML."], + sitemap: [type: {:or, [{:in, [:skip, :include, :only]}, :string]}, doc: "Sitemap mode when crawling. If you set it to 'skip', the crawler will ignore the website sitemap and only crawl the entered URL and discover pages from there onwards. If you set it to 'only', the crawler will only crawl URLs from the sitemap (plus the start URL) and will not discover links from HTML."], url: [type: :string, required: true, doc: "The base URL to start crawling from"], webhook: [type: :keyword_list, doc: "A webhook specification object."], zero_data_retention: [type: :boolean, doc: "If true, this will enable zero data retention for this crawl. To enable this feature, please contact help@firecrawl.dev"] @@ -852,7 +859,7 @@ defmodule Firecrawl do limit: [type: :integer, doc: "Maximum number of links to return"], location: [type: :keyword_list, doc: "Location settings for the request. When specified, this will use an appropriate proxy if available and emulate the corresponding language and timezone settings. Defaults to 'US' if not specified."], search: [type: :string, doc: "Specify a search query to order the results by relevance. 
Example: 'blog' will return URLs that contain the word 'blog' in the URL ordered by relevance."], - sitemap: [type: {:in, [:skip, :include, :only]}, doc: "Sitemap mode when mapping. If you set it to `skip`, the sitemap won't be used to find URLs. If you set it to `only`, only URLs that are in the sitemap will be returned. By default (`include`), the sitemap and other methods will be used together to find URLs."], + sitemap: [type: {:or, [{:in, [:skip, :include, :only]}, :string]}, doc: "Sitemap mode when mapping. If you set it to `skip`, the sitemap won't be used to find URLs. If you set it to `only`, only URLs that are in the sitemap will be returned. By default (`include`), the sitemap and other methods will be used together to find URLs."], timeout: [type: :integer, doc: "Timeout in milliseconds. There is no timeout by default."], url: [type: :string, required: true, doc: "The base URL to start crawling from"] ]) @@ -1076,7 +1083,7 @@ defmodule Firecrawl do @start_agent_schema NimbleOptions.new!([ max_credits: [type: {:or, [:integer, :float]}, doc: "Maximum credits to spend on this agent task. Defaults to 2500 if not set. Values above 2,500 are always billed as paid requests."], - model: [type: {:in, [:"spark-1-mini", :"spark-1-pro"]}, doc: "The model to use for the agent task. spark-1-mini (default) is 60% cheaper, spark-1-pro offers higher accuracy for complex tasks"], + model: [type: {:or, [{:in, [:"spark-1-mini", :"spark-1-pro"]}, :string]}, doc: "The model to use for the agent task. 
spark-1-mini (default) is 60% cheaper, spark-1-pro offers higher accuracy for complex tasks"], prompt: [type: :string, required: true, doc: "The prompt describing what data to extract"], schema: [type: :any, doc: "Optional JSON schema to structure the extracted data"], strict_constrain_to_urls: [type: :boolean, doc: "If true, agent will only visit URLs provided in the urls array"], diff --git a/apps/elixir-sdk/lib/firecrawl/error.ex b/apps/elixir-sdk/lib/firecrawl/error.ex new file mode 100644 index 0000000000..b20d8af680 --- /dev/null +++ b/apps/elixir-sdk/lib/firecrawl/error.ex @@ -0,0 +1,27 @@ +defmodule Firecrawl.Error do + @moduledoc """ + Exception raised when the Firecrawl API returns an error response (HTTP 4xx/5xx). + + ## Fields + + * `:status` - The HTTP status code + * `:body` - The decoded response body (typically a map with `"error"` key) + """ + + defexception [:status, :body] + + @type t :: %__MODULE__{ + status: pos_integer(), + body: term() + } + + @impl true + def message(%__MODULE__{status: status, body: body}) when is_map(body) do + error_msg = body["error"] || body["message"] || inspect(body) + "Firecrawl API error (HTTP #{status}): #{error_msg}" + end + + def message(%__MODULE__{status: status, body: body}) do + "Firecrawl API error (HTTP #{status}): #{inspect(body)}" + end +end diff --git a/apps/elixir-sdk/test/firecrawl_test.exs b/apps/elixir-sdk/test/firecrawl_test.exs index 3bc7cf8781..88009c03a9 100644 --- a/apps/elixir-sdk/test/firecrawl_test.exs +++ b/apps/elixir-sdk/test/firecrawl_test.exs @@ -92,6 +92,92 @@ defmodule FirecrawlTest do assert {:error, _} = result end + test "accepts string values for enum params (model)" do + Application.put_env(:firecrawl, :api_key, "test-key") + on_exit(fn -> Application.delete_env(:firecrawl, :api_key) end) + + result = + Firecrawl.start_agent( + [prompt: "test", model: "spark-1-mini"], + base_url: "http://localhost:1", + retry: false + ) + + assert {:error, _} = result + end + + test "accepts 
string values for enum params (sitemap)" do + Application.put_env(:firecrawl, :api_key, "test-key") + on_exit(fn -> Application.delete_env(:firecrawl, :api_key) end) + + result = + Firecrawl.crawl_urls( + [url: "https://example.com", sitemap: "skip"], + base_url: "http://localhost:1", + retry: false + ) + + assert {:error, _} = result + end + + test "non-bang returns {:error, %Firecrawl.Error{}} for API errors" do + adapter = fn request -> + resp = Req.Response.new( + status: 402, + headers: %{"content-type" => ["application/json"]}, + body: Jason.encode!(%{"success" => false, "error" => "Payment required"}) + ) + {request, resp} + end + + result = + Firecrawl.scrape_and_extract_from_url( + [url: "https://example.com"], + api_key: "test-key", + adapter: adapter + ) + + assert {:error, %Firecrawl.Error{status: 402}} = result + end + + test "bang raises Firecrawl.Error for API errors" do + adapter = fn request -> + resp = Req.Response.new( + status: 401, + headers: %{"content-type" => ["application/json"]}, + body: Jason.encode!(%{"success" => false, "error" => "Unauthorized"}) + ) + {request, resp} + end + + assert_raise Firecrawl.Error, ~r/Unauthorized/, fn -> + Firecrawl.scrape_and_extract_from_url!( + [url: "https://example.com"], + api_key: "test-key", + adapter: adapter + ) + end + end + + test "non-bang returns {:ok, response} for successful API calls" do + adapter = fn request -> + resp = Req.Response.new( + status: 200, + headers: %{"content-type" => ["application/json"]}, + body: Jason.encode!(%{"success" => true, "data" => %{}}) + ) + {request, resp} + end + + result = + Firecrawl.get_credit_usage( + api_key: "test-key", + adapter: adapter + ) + + assert {:ok, %Req.Response{status: 200}} = result + end + test "all expected API functions are defined with bang variants" do functions = Firecrawl.__info__(:functions) From 087f8b5614801b205b2a8f0911f5b78c99aab271 Mon Sep 17 00:00:00 2001 From: Devin AI 
<158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Tue, 5 May 2026 11:03:29 +0000 Subject: [PATCH 03/27] fix(php-sdk): add ScreenshotFormat, missing params, and browser improvements - Add ScreenshotFormat model class for full-page screenshot options (fullPage, quality) alongside existing string format names - Add minAge parameter to ScrapeOptions for cache-only lookups - Add profile parameter to ScrapeOptions for persistent browser storage - Add changeTracking parameter to ScrapeOptions - Add prompt parameter to interact() method for AI-powered browser automation - Add interactiveLiveViewUrl property to BrowserCreateResponse - Add profile parameter to browser() session creation method - Maintain backward compatibility for existing positional argument usage Co-Authored-By: gaurav --- apps/php-sdk/src/Client/FirecrawlClient.php | 10 ++++ .../src/Models/BrowserCreateResponse.php | 7 +++ apps/php-sdk/src/Models/ScrapeOptions.php | 49 +++++++++++++++---- apps/php-sdk/src/Models/ScreenshotFormat.php | 40 +++++++++++++++ 4 files changed, 96 insertions(+), 10 deletions(-) create mode 100644 apps/php-sdk/src/Models/ScreenshotFormat.php diff --git a/apps/php-sdk/src/Client/FirecrawlClient.php b/apps/php-sdk/src/Client/FirecrawlClient.php index e6cef00f69..e237b51c14 100644 --- a/apps/php-sdk/src/Client/FirecrawlClient.php +++ b/apps/php-sdk/src/Client/FirecrawlClient.php @@ -123,6 +123,7 @@ public function interact( string $language = 'node', ?int $timeout = null, ?string $origin = null, + ?string $prompt = null, ): BrowserExecuteResponse { $body = [ 'code' => $code, @@ -134,6 +135,9 @@ public function interact( if ($origin !== null) { $body['origin'] = $origin; } + if ($prompt !== null) { + $body['prompt'] = $prompt; + } return BrowserExecuteResponse::fromArray( $this->http->post("/v2/scrape/{$jobId}/interact", $body), @@ -399,11 +403,14 @@ public function cancelAgent(string $jobId): array /** * Create a new browser session. 
+ * + * @param array|null $profile */ public function browser( ?int $ttl = null, ?int $activityTtl = null, ?bool $streamWebView = null, + ?array $profile = null, ): BrowserCreateResponse { $body = []; if ($ttl !== null) { @@ -415,6 +422,9 @@ public function browser( if ($streamWebView !== null) { $body['streamWebView'] = $streamWebView; } + if ($profile !== null) { + $body['profile'] = $profile; + } return BrowserCreateResponse::fromArray($this->http->post('/v2/browser', $body)); } diff --git a/apps/php-sdk/src/Models/BrowserCreateResponse.php b/apps/php-sdk/src/Models/BrowserCreateResponse.php index 4a69d4ff74..05ad54133e 100644 --- a/apps/php-sdk/src/Models/BrowserCreateResponse.php +++ b/apps/php-sdk/src/Models/BrowserCreateResponse.php @@ -11,6 +11,7 @@ public function __construct( private readonly ?string $id = null, private readonly ?string $cdpUrl = null, private readonly ?string $liveViewUrl = null, + private readonly ?string $interactiveLiveViewUrl = null, private readonly ?string $expiresAt = null, private readonly ?string $error = null, ) {} @@ -23,6 +24,7 @@ public static function fromArray(array $data): self id: $data['id'] ?? null, cdpUrl: $data['cdpUrl'] ?? null, liveViewUrl: $data['liveViewUrl'] ?? null, + interactiveLiveViewUrl: $data['interactiveLiveViewUrl'] ?? null, expiresAt: $data['expiresAt'] ?? null, error: $data['error'] ?? 
null, ); @@ -57,4 +59,9 @@ public function getError(): ?string { return $this->error; } + + public function getInteractiveLiveViewUrl(): ?string + { + return $this->interactiveLiveViewUrl; + } } diff --git a/apps/php-sdk/src/Models/ScrapeOptions.php b/apps/php-sdk/src/Models/ScrapeOptions.php index 28153bf577..15be371b3c 100644 --- a/apps/php-sdk/src/Models/ScrapeOptions.php +++ b/apps/php-sdk/src/Models/ScrapeOptions.php @@ -7,7 +7,7 @@ final class ScrapeOptions { /** - * @param list|null $formats + * @param list|null $formats * @param array|null $headers * @param list|null $includeTags * @param list|null $excludeTags @@ -31,18 +31,23 @@ private function __construct( private readonly ?bool $blockAds = null, private readonly ?string $proxy = null, private readonly ?int $maxAge = null, + private readonly ?int $minAge = null, private readonly ?bool $storeInCache = null, private readonly ?bool $lockdown = null, private readonly ?string $integration = null, + /** @var array|null */ + private readonly ?array $profile = null, + private readonly ?bool $changeTracking = null, ) {} /** - * @param list|null $formats - * @param array|null $headers - * @param list|null $includeTags - * @param list|null $excludeTags - * @param list|null $parsers - * @param list>|null $actions + * @param list|null $formats + * @param array|null $headers + * @param list|null $includeTags + * @param list|null $excludeTags + * @param list|null $parsers + * @param list>|null $actions + * @param array|null $profile */ public static function with( ?array $formats = null, @@ -64,12 +69,16 @@ public static function with( ?bool $storeInCache = null, ?string $integration = null, ?bool $lockdown = null, + ?int $minAge = null, + ?array $profile = null, + ?bool $changeTracking = null, ): self { return new self( $formats, $headers, $includeTags, $excludeTags, $onlyMainContent, $timeout, $waitFor, $mobile, $parsers, $actions, $location, $skipTlsVerification, $removeBase64Images, $blockAds, $proxy, - $maxAge, 
$storeInCache, $lockdown, $integration, + $maxAge, $minAge, $storeInCache, $lockdown, $integration, $profile, + $changeTracking, ); } @@ -80,7 +89,8 @@ public function toArray(): array if ($this->formats !== null) { $data['formats'] = array_map( - fn (string|JsonFormat $f): string|array => $f instanceof JsonFormat ? $f->toArray() : $f, + fn (string|JsonFormat|ScreenshotFormat $f): string|array => + $f instanceof JsonFormat || $f instanceof ScreenshotFormat ? $f->toArray() : $f, $this->formats, ); } @@ -101,9 +111,12 @@ public function toArray(): array 'blockAds' => $this->blockAds, 'proxy' => $this->proxy, 'maxAge' => $this->maxAge, + 'minAge' => $this->minAge, 'storeInCache' => $this->storeInCache, 'lockdown' => $this->lockdown, 'integration' => $this->integration, + 'profile' => $this->profile, + 'changeTracking' => $this->changeTracking, ]; foreach ($fields as $key => $value) { @@ -115,7 +128,7 @@ public function toArray(): array return $data; } - /** @return list|null */ + /** @return list|null */ public function getFormats(): ?array { return $this->formats; @@ -215,4 +228,20 @@ public function getIntegration(): ?string { return $this->integration; } + + public function getMinAge(): ?int + { + return $this->minAge; + } + + /** @return array|null */ + public function getProfile(): ?array + { + return $this->profile; + } + + public function getChangeTracking(): ?bool + { + return $this->changeTracking; + } } diff --git a/apps/php-sdk/src/Models/ScreenshotFormat.php b/apps/php-sdk/src/Models/ScreenshotFormat.php new file mode 100644 index 0000000000..6ffd2e760a --- /dev/null +++ b/apps/php-sdk/src/Models/ScreenshotFormat.php @@ -0,0 +1,40 @@ + */ + public function toArray(): array + { + return array_filter([ + 'type' => 'screenshot', + 'fullPage' => $this->fullPage, + 'quality' => $this->quality, + ], fn (mixed $v): bool => $v !== null); + } + + public function getFullPage(): ?bool + { + return $this->fullPage; + } + + public function getQuality(): ?int + { + 
return $this->quality; + } +} From 69c113a045059ee8d43f3894d50695f22e669180 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Tue, 5 May 2026 11:14:23 +0000 Subject: [PATCH 04/27] fix: address cubic review - safe error interpolation and non-vacuous enum tests - Guard error message interpolation against non-string values - Enum string tests now assert the error is NOT a NimbleOptions.ValidationError, proving validation actually accepts strings Co-Authored-By: gaurav --- apps/elixir-sdk/lib/firecrawl/error.ex | 7 ++++++- apps/elixir-sdk/test/firecrawl_test.exs | 8 ++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/apps/elixir-sdk/lib/firecrawl/error.ex b/apps/elixir-sdk/lib/firecrawl/error.ex index b20d8af680..7fbdb59644 100644 --- a/apps/elixir-sdk/lib/firecrawl/error.ex +++ b/apps/elixir-sdk/lib/firecrawl/error.ex @@ -17,7 +17,12 @@ defmodule Firecrawl.Error do @impl true def message(%__MODULE__{status: status, body: body}) when is_map(body) do - error_msg = body["error"] || body["message"] || inspect(body) + error_msg = + case body["error"] || body["message"] do + msg when is_binary(msg) -> msg + _ -> inspect(body) + end + "Firecrawl API error (HTTP #{status}): #{error_msg}" end diff --git a/apps/elixir-sdk/test/firecrawl_test.exs b/apps/elixir-sdk/test/firecrawl_test.exs index 88009c03a9..72f6a9d149 100644 --- a/apps/elixir-sdk/test/firecrawl_test.exs +++ b/apps/elixir-sdk/test/firecrawl_test.exs @@ -103,7 +103,9 @@ defmodule FirecrawlTest do retry: false ) - assert {:error, _} = result + assert {:error, err} = result + refute match?(%NimbleOptions.ValidationError{}, err), + "Expected connection error, got validation error: #{inspect(err)}" end test "accepts string values for enum params (sitemap)" do @@ -117,7 +119,9 @@ defmodule FirecrawlTest do retry: false ) - assert {:error, _} = result + assert {:error, err} = result + refute match?(%NimbleOptions.ValidationError{}, err), + 
"Expected connection error, got validation error: #{inspect(err)}" end test "non-bang returns {:error, %Firecrawl.Error{}} for API errors" do From 37e1eb599ddaee84847534894f53b2adc2cbdeca Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Tue, 5 May 2026 13:32:43 +0000 Subject: [PATCH 05/27] fix(ci): use env context instead of secrets in workflow if condition The secrets context is not allowed in step-level if conditions. Move FIRECRAWL_API_KEY to job-level env and check env.FIRECRAWL_API_KEY in the E2E test step condition instead. Co-Authored-By: gaurav --- .github/workflows/test-php-sdk.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test-php-sdk.yml b/.github/workflows/test-php-sdk.yml index c716f35fa5..d77625eefa 100644 --- a/.github/workflows/test-php-sdk.yml +++ b/.github/workflows/test-php-sdk.yml @@ -18,6 +18,8 @@ jobs: build-and-test: name: Build and Test runs-on: blacksmith-4vcpu-ubuntu-2404 + env: + FIRECRAWL_API_KEY: ${{ secrets.FIRECRAWL_API_KEY }} if: >- github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request' || @@ -62,8 +64,6 @@ jobs: run: vendor/bin/pest --ci - name: Run E2E tests - if: ${{ secrets.FIRECRAWL_API_KEY != '' }} + if: env.FIRECRAWL_API_KEY != '' working-directory: ./apps/php-sdk - env: - FIRECRAWL_API_KEY: ${{ secrets.FIRECRAWL_API_KEY }} run: vendor/bin/pest --ci --group=e2e From 20b463f48d0372e9587e1abe4ed752ad7a1c8af4 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Tue, 5 May 2026 13:37:24 +0000 Subject: [PATCH 06/27] fix(ci): scope secret to E2E step only, use non-secret flag for if check Use a boolean HAS_API_KEY env var at job level for the condition check, and keep the actual FIRECRAWL_API_KEY secret scoped to just the E2E step. 
Co-Authored-By: gaurav --- .github/workflows/test-php-sdk.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-php-sdk.yml b/.github/workflows/test-php-sdk.yml index d77625eefa..5db9858f4a 100644 --- a/.github/workflows/test-php-sdk.yml +++ b/.github/workflows/test-php-sdk.yml @@ -19,7 +19,7 @@ jobs: name: Build and Test runs-on: blacksmith-4vcpu-ubuntu-2404 env: - FIRECRAWL_API_KEY: ${{ secrets.FIRECRAWL_API_KEY }} + HAS_API_KEY: ${{ secrets.FIRECRAWL_API_KEY != '' }} if: >- github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request' || @@ -64,6 +64,8 @@ jobs: run: vendor/bin/pest --ci - name: Run E2E tests - if: env.FIRECRAWL_API_KEY != '' + if: env.HAS_API_KEY == 'true' working-directory: ./apps/php-sdk + env: + FIRECRAWL_API_KEY: ${{ secrets.FIRECRAWL_API_KEY }} run: vendor/bin/pest --ci --group=e2e From 14d5af3e4cb3c6f5f4abd726fefbeac40c68f00d Mon Sep 17 00:00:00 2001 From: mogery Date: Tue, 5 May 2026 17:44:32 +0200 Subject: [PATCH 07/27] chore(api/query): change api surface --- .../__tests__/snips/v2/scrape-query.test.ts | 24 ++++ .../snips/v2/types-validation.test.ts | 56 ++++++++- apps/api/src/controllers/v2/types.ts | 2 +- .../scraper/scrapeURL/transformers/query.ts | 2 +- .../Firecrawl.Tests/ModelsTests.cs | 15 +++ apps/dot-net-sdk/Firecrawl/Firecrawl.csproj | 2 +- .../Firecrawl/Models/QueryFormat.cs | 22 ++++ apps/elixir-sdk/mix.exs | 2 +- apps/go-sdk/options.go | 71 +++++++++-- apps/go-sdk/options_test.go | 43 +++++++ apps/go-sdk/parse.go | 22 +++- apps/go-sdk/version.go | 2 +- apps/java-sdk/build.gradle.kts | 2 +- .../com/firecrawl/models/QueryFormat.java | 59 +++++++++ .../com/firecrawl/models/ScrapeOptions.java | 4 +- .../com/firecrawl/FirecrawlClientTest.java | 11 +- apps/js-sdk/firecrawl/package.json | 2 +- apps/js-sdk/firecrawl/src/v2/types.ts | 1 + apps/php-sdk/src/Models/ParseOptions.php | 7 +- apps/php-sdk/src/Models/QueryFormat.php | 47 +++++++ 
apps/php-sdk/src/Models/ScrapeOptions.php | 10 +- apps/php-sdk/src/Version.php | 2 +- apps/php-sdk/tests/Unit/ModelsTest.php | 17 +++ apps/python-sdk/firecrawl/__init__.py | 2 +- .../unit/v2/utils/test_validation.py | 43 ++++++- apps/python-sdk/firecrawl/v2/types.py | 1 + .../firecrawl/v2/utils/validation.py | 7 ++ apps/ruby-sdk/lib/firecrawl.rb | 1 + .../lib/firecrawl/models/parse_options.rb | 6 +- .../lib/firecrawl/models/query_format.rb | 34 +++++ .../lib/firecrawl/models/scrape_options.rb | 8 +- apps/ruby-sdk/lib/firecrawl/version.rb | 2 +- apps/ruby-sdk/test/firecrawl/client_test.rb | 19 +++ apps/rust-sdk/Cargo.lock | 2 +- apps/rust-sdk/Cargo.toml | 2 +- apps/rust-sdk/src/scrape.rs | 22 ++++ apps/rust-sdk/src/types.rs | 119 +++++++++++++++++- 37 files changed, 650 insertions(+), 43 deletions(-) create mode 100644 apps/dot-net-sdk/Firecrawl/Models/QueryFormat.cs create mode 100644 apps/go-sdk/options_test.go create mode 100644 apps/java-sdk/src/main/java/com/firecrawl/models/QueryFormat.java create mode 100644 apps/php-sdk/src/Models/QueryFormat.php create mode 100644 apps/ruby-sdk/lib/firecrawl/models/query_format.rb diff --git a/apps/api/src/__tests__/snips/v2/scrape-query.test.ts b/apps/api/src/__tests__/snips/v2/scrape-query.test.ts index 660253cb33..41cbd5b4d2 100644 --- a/apps/api/src/__tests__/snips/v2/scrape-query.test.ts +++ b/apps/api/src/__tests__/snips/v2/scrape-query.test.ts @@ -61,6 +61,30 @@ describe("Query format", () => { scrapeTimeout, ); + concurrentIf(TEST_PRODUCTION || HAS_AI)( + "returns a direct quote answer when query mode is directQuote", + async () => { + const response = await scrape( + { + url: "https://firecrawl.dev", + formats: [ + { + type: "query", + prompt: "What is Firecrawl?", + mode: "directQuote", + }, + ], + }, + identity, + ); + + expect(response.answer).toBeDefined(); + expect(typeof response.answer).toBe("string"); + expect(response.answer!.length).toBeGreaterThan(0); + }, + scrapeTimeout, + ); + 
concurrentIf(TEST_PRODUCTION || HAS_AI)( "does not include answer field when query format is not provided", async () => { diff --git a/apps/api/src/__tests__/snips/v2/types-validation.test.ts b/apps/api/src/__tests__/snips/v2/types-validation.test.ts index d11c5928a2..3ab18a9058 100644 --- a/apps/api/src/__tests__/snips/v2/types-validation.test.ts +++ b/apps/api/src/__tests__/snips/v2/types-validation.test.ts @@ -87,7 +87,25 @@ describe("V2 Types Validation", () => { const result = scrapeRequestSchema.parse(input); expect(result.formats).toEqual([ - { type: "query", prompt: "What is Firecrawl?", directQuote: false }, + { type: "query", prompt: "What is Firecrawl?", mode: "freeform" }, + ]); + }); + + it("should accept query format directQuote mode", () => { + const input: ScrapeRequestInput = { + url: "https://example.com", + formats: [ + { + type: "query", + prompt: "What is Firecrawl?", + mode: "directQuote", + }, + ], + }; + + const result = scrapeRequestSchema.parse(input); + expect(result.formats).toEqual([ + { type: "query", prompt: "What is Firecrawl?", mode: "directQuote" }, ]); }); @@ -1003,10 +1021,44 @@ describe("V2 Types Validation", () => { const result = searchRequestSchema.parse(input); expect(result.scrapeOptions?.formats).toEqual([ - { type: "query", prompt: "What is Firecrawl?", directQuote: false }, + { type: "query", prompt: "What is Firecrawl?", mode: "freeform" }, ]); }); + it("should reject search scrapeOptions query format with invalid mode", () => { + const input: SearchRequestInput = { + query: "test", + scrapeOptions: { + formats: [ + { + type: "query", + prompt: "What is Firecrawl?", + mode: "quoted", + } as any, + ], + }, + }; + + expect(() => searchRequestSchema.parse(input)).toThrow(); + }); + + it("should reject search scrapeOptions query format with directQuote boolean", () => { + const input: SearchRequestInput = { + query: "test", + scrapeOptions: { + formats: [ + { + type: "query", + prompt: "What is Firecrawl?", + directQuote: 
true, + } as any, + ], + }, + }; + + expect(() => searchRequestSchema.parse(input)).toThrow(); + }); + it("should reject search scrapeOptions query prompt over 10000 characters", () => { const input: SearchRequestInput = { query: "test", diff --git a/apps/api/src/controllers/v2/types.ts b/apps/api/src/controllers/v2/types.ts index f11bb094ca..ecfe24d5fc 100644 --- a/apps/api/src/controllers/v2/types.ts +++ b/apps/api/src/controllers/v2/types.ts @@ -403,7 +403,7 @@ type AttributesFormatWithOptions = z.output; const queryFormatWithOptions = z.strictObject({ type: z.literal("query"), prompt: z.string().max(10000), - directQuote: z.boolean().optional().default(false), + mode: z.enum(["freeform", "directQuote"]).optional().default("freeform"), }); type QueryFormatWithOptions = z.output; diff --git a/apps/api/src/scraper/scrapeURL/transformers/query.ts b/apps/api/src/scraper/scrapeURL/transformers/query.ts index 54e162daf2..b6cb0d7352 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/query.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/query.ts @@ -507,7 +507,7 @@ export async function performQuery( let answer: string | null; - if (queryFormat.directQuote) { + if (queryFormat.mode === "directQuote") { answer = await performDirectQuoteQuery( meta, document, diff --git a/apps/dot-net-sdk/Firecrawl.Tests/ModelsTests.cs b/apps/dot-net-sdk/Firecrawl.Tests/ModelsTests.cs index 4d44144b7b..7c1f20ec54 100644 --- a/apps/dot-net-sdk/Firecrawl.Tests/ModelsTests.cs +++ b/apps/dot-net-sdk/Firecrawl.Tests/ModelsTests.cs @@ -244,6 +244,21 @@ public void JsonFormat_HasCorrectType() Assert.Contains("\"schema\"", json); } + [Fact] + public void QueryFormat_HasCorrectMode() + { + var format = new QueryFormat + { + Prompt = "What is Firecrawl?", + Mode = QueryFormat.DirectQuoteMode + }; + + var json = JsonSerializer.Serialize(format, JsonOptions); + Assert.Contains("\"type\":\"query\"", json); + Assert.Contains("\"prompt\":\"What is Firecrawl?\"", json); + 
Assert.Contains("\"mode\":\"directQuote\"", json); + } + [Fact] public void WebhookConfig_SerializesCorrectly() { diff --git a/apps/dot-net-sdk/Firecrawl/Firecrawl.csproj b/apps/dot-net-sdk/Firecrawl/Firecrawl.csproj index 6bd4e455af..e2a0f936ff 100644 --- a/apps/dot-net-sdk/Firecrawl/Firecrawl.csproj +++ b/apps/dot-net-sdk/Firecrawl/Firecrawl.csproj @@ -8,7 +8,7 @@ firecrawl-sdk - 1.2.0 + 1.2.1 Firecrawl Firecrawl .NET SDK for the Firecrawl API - web scraping, crawling, and data extraction diff --git a/apps/dot-net-sdk/Firecrawl/Models/QueryFormat.cs b/apps/dot-net-sdk/Firecrawl/Models/QueryFormat.cs new file mode 100644 index 0000000000..44cd939206 --- /dev/null +++ b/apps/dot-net-sdk/Firecrawl/Models/QueryFormat.cs @@ -0,0 +1,22 @@ +using System.Text.Json.Serialization; + +namespace Firecrawl.Models; + +/// +/// Query format specification for use in ScrapeOptions.Formats. +/// +public class QueryFormat +{ + public const string FreeformMode = "freeform"; + public const string DirectQuoteMode = "directQuote"; + + [JsonPropertyName("type")] + public string Type { get; } = "query"; + + [JsonPropertyName("prompt")] + public required string Prompt { get; set; } + + [JsonPropertyName("mode")] + [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)] + public string? Mode { get; set; } +} diff --git a/apps/elixir-sdk/mix.exs b/apps/elixir-sdk/mix.exs index 12db15dc36..479cf2217b 100644 --- a/apps/elixir-sdk/mix.exs +++ b/apps/elixir-sdk/mix.exs @@ -1,7 +1,7 @@ defmodule Firecrawl.MixProject do use Mix.Project - @version "1.2.0" + @version "1.2.1" @source_url "https://github.com/firecrawl/firecrawl/tree/main/apps/elixir-sdk" def project do diff --git a/apps/go-sdk/options.go b/apps/go-sdk/options.go index 75766bc770..4f610553d4 100644 --- a/apps/go-sdk/options.go +++ b/apps/go-sdk/options.go @@ -1,8 +1,40 @@ package firecrawl +import "encoding/json" + +// QueryFormatMode selects how query answers are generated. 
+type QueryFormatMode string + +const ( + QueryModeFreeform QueryFormatMode = "freeform" + QueryModeDirectQuote QueryFormatMode = "directQuote" +) + +// QueryFormat asks a question about page content. +type QueryFormat struct { + Prompt string `json:"prompt"` + Mode QueryFormatMode `json:"mode,omitempty"` +} + +// MarshalJSON always emits the API-required query format type. +func (q QueryFormat) MarshalJSON() ([]byte, error) { + type queryFormat struct { + Type string `json:"type"` + Prompt string `json:"prompt"` + Mode QueryFormatMode `json:"mode,omitempty"` + } + + return json.Marshal(queryFormat{ + Type: "query", + Prompt: q.Prompt, + Mode: q.Mode, + }) +} + // ScrapeOptions configures a single-page scrape request. type ScrapeOptions struct { - Formats []string `json:"formats,omitempty"` + Formats []string `json:"-"` + FormatOptions []interface{} `json:"-"` Headers map[string]string `json:"headers,omitempty"` IncludeTags []string `json:"includeTags,omitempty"` ExcludeTags []string `json:"excludeTags,omitempty"` @@ -24,6 +56,25 @@ type ScrapeOptions struct { JsonOptions *JsonOptions `json:"jsonOptions,omitempty"` } +// MarshalJSON preserves string formats while allowing object formats such as QueryFormat. +func (o ScrapeOptions) MarshalJSON() ([]byte, error) { + type scrapeOptions ScrapeOptions + payload := struct { + scrapeOptions + Formats interface{} `json:"formats,omitempty"` + }{ + scrapeOptions: scrapeOptions(o), + } + + if len(o.FormatOptions) > 0 { + payload.Formats = o.FormatOptions + } else if len(o.Formats) > 0 { + payload.Formats = o.Formats + } + + return json.Marshal(payload) +} + // CrawlOptions configures a crawl request. 
type CrawlOptions struct { Prompt *string `json:"prompt,omitempty"` @@ -34,7 +85,7 @@ type CrawlOptions struct { IgnoreQueryParameters *bool `json:"ignoreQueryParameters,omitempty"` DeduplicateSimilarURLs *bool `json:"deduplicateSimilarURLs,omitempty"` Limit *int `json:"limit,omitempty"` - CrawlEntireDomain *bool `json:"crawlEntireDomain,omitempty"` + CrawlEntireDomain *bool `json:"crawlEntireDomain,omitempty"` AllowExternalLinks *bool `json:"allowExternalLinks,omitempty"` AllowSubdomains *bool `json:"allowSubdomains,omitempty"` Delay *int `json:"delay,omitempty"` @@ -87,14 +138,14 @@ type SearchOptions struct { // AgentOptions configures an agent request. type AgentOptions struct { - URLs []string `json:"urls,omitempty"` - Prompt string `json:"prompt"` - Schema map[string]interface{} `json:"schema,omitempty"` - Integration *string `json:"integration,omitempty"` - MaxCredits *int `json:"maxCredits,omitempty"` - StrictConstrainToURLs *bool `json:"strictConstrainToURLs,omitempty"` - Model *string `json:"model,omitempty"` - Webhook *WebhookConfig `json:"webhook,omitempty"` + URLs []string `json:"urls,omitempty"` + Prompt string `json:"prompt"` + Schema map[string]interface{} `json:"schema,omitempty"` + Integration *string `json:"integration,omitempty"` + MaxCredits *int `json:"maxCredits,omitempty"` + StrictConstrainToURLs *bool `json:"strictConstrainToURLs,omitempty"` + Model *string `json:"model,omitempty"` + Webhook *WebhookConfig `json:"webhook,omitempty"` } // LocationConfig specifies geolocation for requests. 
diff --git a/apps/go-sdk/options_test.go b/apps/go-sdk/options_test.go new file mode 100644 index 0000000000..dbb4f0b5c0 --- /dev/null +++ b/apps/go-sdk/options_test.go @@ -0,0 +1,43 @@ +package firecrawl + +import ( + "encoding/json" + "strings" + "testing" +) + +func TestScrapeOptionsSerializesQueryFormatMode(t *testing.T) { + payload, err := json.Marshal(ScrapeOptions{ + FormatOptions: []interface{}{ + QueryFormat{ + Prompt: "What is Firecrawl?", + Mode: QueryModeDirectQuote, + }, + }, + }) + if err != nil { + t.Fatalf("Marshal ScrapeOptions: %v", err) + } + + jsonBody := string(payload) + for _, want := range []string{ + `"formats":[{"type":"query","prompt":"What is Firecrawl?","mode":"directQuote"}]`, + } { + if !strings.Contains(jsonBody, want) { + t.Fatalf("serialized query format = %s, want to contain %s", jsonBody, want) + } + } +} + +func TestScrapeOptionsPreservesStringFormats(t *testing.T) { + payload, err := json.Marshal(ScrapeOptions{ + Formats: []string{"markdown"}, + }) + if err != nil { + t.Fatalf("Marshal ScrapeOptions: %v", err) + } + + if !strings.Contains(string(payload), `"formats":["markdown"]`) { + t.Fatalf("serialized string formats = %s", payload) + } +} diff --git a/apps/go-sdk/parse.go b/apps/go-sdk/parse.go index b47501e039..6329f2f7a8 100644 --- a/apps/go-sdk/parse.go +++ b/apps/go-sdk/parse.go @@ -57,7 +57,8 @@ func NewParseFileFromBytes(filename string, content []byte) *ParseFile { // mobile) nor the screenshot, branding, or changeTracking formats. The proxy // field only accepts "auto" or "basic". 
type ParseOptions struct { - Formats []string `json:"formats,omitempty"` + Formats []string `json:"-"` + FormatOptions []interface{} `json:"-"` Headers map[string]string `json:"headers,omitempty"` IncludeTags []string `json:"includeTags,omitempty"` ExcludeTags []string `json:"excludeTags,omitempty"` @@ -72,6 +73,25 @@ type ParseOptions struct { JsonOptions *JsonOptions `json:"jsonOptions,omitempty"` } +// MarshalJSON preserves string formats while allowing object formats such as QueryFormat. +func (o ParseOptions) MarshalJSON() ([]byte, error) { + type parseOptions ParseOptions + payload := struct { + parseOptions + Formats interface{} `json:"formats,omitempty"` + }{ + parseOptions: parseOptions(o), + } + + if len(o.FormatOptions) > 0 { + payload.Formats = o.FormatOptions + } else if len(o.Formats) > 0 { + payload.Formats = o.Formats + } + + return json.Marshal(payload) +} + // Parse uploads a file to the `/v2/parse` endpoint and returns the extracted document. func (c *Client) Parse(ctx context.Context, file *ParseFile, opts *ParseOptions) (*Document, error) { if file == nil { diff --git a/apps/go-sdk/version.go b/apps/go-sdk/version.go index e0d6e24e19..afafa732e6 100644 --- a/apps/go-sdk/version.go +++ b/apps/go-sdk/version.go @@ -9,4 +9,4 @@ package firecrawl // Bump this when preparing a new release. The publish-go-sdk GitHub workflow // reads this value and creates the corresponding monorepo-prefixed tag on // merge to main. 
-const Version = "1.2.0" +const Version = "1.2.1" diff --git a/apps/java-sdk/build.gradle.kts b/apps/java-sdk/build.gradle.kts index 33f7bda2ad..3183e7110b 100644 --- a/apps/java-sdk/build.gradle.kts +++ b/apps/java-sdk/build.gradle.kts @@ -4,7 +4,7 @@ plugins { } group = "com.firecrawl" -version = "1.4.0" +version = "1.4.1" java { sourceCompatibility = JavaVersion.VERSION_11 diff --git a/apps/java-sdk/src/main/java/com/firecrawl/models/QueryFormat.java b/apps/java-sdk/src/main/java/com/firecrawl/models/QueryFormat.java new file mode 100644 index 0000000000..195281897d --- /dev/null +++ b/apps/java-sdk/src/main/java/com/firecrawl/models/QueryFormat.java @@ -0,0 +1,59 @@ +package com.firecrawl.models; + +import com.fasterxml.jackson.annotation.JsonInclude; +import com.fasterxml.jackson.annotation.JsonValue; + +/** + * Query format for asking a question about page content. + */ +@JsonInclude(JsonInclude.Include.NON_NULL) +public class QueryFormat { + + public enum Mode { + FREEFORM("freeform"), + DIRECT_QUOTE("directQuote"); + + private final String value; + + Mode(String value) { + this.value = value; + } + + @JsonValue + public String getValue() { + return value; + } + } + + private final String type = "query"; + private String prompt; + private Mode mode; + + private QueryFormat() {} + + public String getType() { return type; } + public String getPrompt() { return prompt; } + public Mode getMode() { return mode; } + + public static Builder builder() { return new Builder(); } + + public static final class Builder { + private String prompt; + private Mode mode; + + private Builder() {} + + /** Question to answer from the page content. */ + public Builder prompt(String prompt) { this.prompt = prompt; return this; } + + /** Query answer mode: freeform or direct quote. 
*/ + public Builder mode(Mode mode) { this.mode = mode; return this; } + + public QueryFormat build() { + QueryFormat f = new QueryFormat(); + f.prompt = this.prompt; + f.mode = this.mode; + return f; + } + } +} diff --git a/apps/java-sdk/src/main/java/com/firecrawl/models/ScrapeOptions.java b/apps/java-sdk/src/main/java/com/firecrawl/models/ScrapeOptions.java index 2227049008..440af580d4 100644 --- a/apps/java-sdk/src/main/java/com/firecrawl/models/ScrapeOptions.java +++ b/apps/java-sdk/src/main/java/com/firecrawl/models/ScrapeOptions.java @@ -108,8 +108,8 @@ private Builder() {} /** * Output formats to request. Accepts strings like "markdown", "html", "rawHtml", - * "links", "screenshot", "json", "audio", etc., or format configuration maps for - * advanced formats (e.g., JsonFormat, ScreenshotFormat). + * "links", "screenshot", "json", "audio", etc., or format configuration maps/objects for + * advanced formats (e.g., JsonFormat, QueryFormat). */ public Builder formats(List formats) { this.formats = formats; return this; } diff --git a/apps/java-sdk/src/test/java/com/firecrawl/FirecrawlClientTest.java b/apps/java-sdk/src/test/java/com/firecrawl/FirecrawlClientTest.java index 9899a4335c..04a68737ee 100644 --- a/apps/java-sdk/src/test/java/com/firecrawl/FirecrawlClientTest.java +++ b/apps/java-sdk/src/test/java/com/firecrawl/FirecrawlClientTest.java @@ -60,14 +60,21 @@ void testBuilderAcceptsCustomHttpClient() { @Test void testScrapeOptionsBuilder() { + QueryFormat queryFormat = QueryFormat.builder() + .prompt("What is Firecrawl?") + .mode(QueryFormat.Mode.DIRECT_QUOTE) + .build(); + ScrapeOptions options = ScrapeOptions.builder() - .formats(List.of("markdown", "html")) + .formats(List.of("markdown", "html", queryFormat)) .onlyMainContent(true) .timeout(30000) .mobile(false) .build(); - assertEquals(List.of("markdown", "html"), options.getFormats()); + assertEquals(List.of("markdown", "html", queryFormat), options.getFormats()); + assertEquals("query", 
queryFormat.getType()); + assertEquals(QueryFormat.Mode.DIRECT_QUOTE, queryFormat.getMode()); assertTrue(options.getOnlyMainContent()); assertEquals(30000, options.getTimeout()); assertFalse(options.getMobile()); diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index c8b729bdf4..5504ff98de 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "4.21.0", + "version": "4.21.1", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", diff --git a/apps/js-sdk/firecrawl/src/v2/types.ts b/apps/js-sdk/firecrawl/src/v2/types.ts index 8a3fa03348..f72a6b8ead 100644 --- a/apps/js-sdk/firecrawl/src/v2/types.ts +++ b/apps/js-sdk/firecrawl/src/v2/types.ts @@ -55,6 +55,7 @@ export interface AttributesFormat extends Format { export interface QueryFormat { type: 'query'; prompt: string; + mode?: 'freeform' | 'directQuote'; } export type FormatOption = diff --git a/apps/php-sdk/src/Models/ParseOptions.php b/apps/php-sdk/src/Models/ParseOptions.php index a9a0a4a0bd..ec1d458aaf 100644 --- a/apps/php-sdk/src/Models/ParseOptions.php +++ b/apps/php-sdk/src/Models/ParseOptions.php @@ -23,7 +23,7 @@ final class ParseOptions ]; /** - * @param list|null $formats + * @param list|null $formats * @param array|null $headers * @param list|null $includeTags * @param list|null $excludeTags @@ -45,7 +45,7 @@ private function __construct( ) {} /** - * @param list|null $formats + * @param list|null $formats * @param array|null $headers * @param list|null $includeTags * @param list|null $excludeTags @@ -105,7 +105,8 @@ public function toArray(): array if ($this->formats !== null) { $data['formats'] = array_map( - fn (string|JsonFormat $f): string|array => $f instanceof JsonFormat ? $f->toArray() : $f, + fn (string|JsonFormat|QueryFormat $f): string|array => + $f instanceof JsonFormat || $f instanceof QueryFormat ? 
$f->toArray() : $f, $this->formats, ); } diff --git a/apps/php-sdk/src/Models/QueryFormat.php b/apps/php-sdk/src/Models/QueryFormat.php new file mode 100644 index 0000000000..70daa7920f --- /dev/null +++ b/apps/php-sdk/src/Models/QueryFormat.php @@ -0,0 +1,47 @@ + */ + public function toArray(): array + { + return array_filter([ + 'type' => 'query', + 'prompt' => $this->prompt, + 'mode' => $this->mode, + ], fn (mixed $v): bool => $v !== null); + } + + public function getPrompt(): string + { + return $this->prompt; + } + + public function getMode(): ?string + { + return $this->mode; + } +} diff --git a/apps/php-sdk/src/Models/ScrapeOptions.php b/apps/php-sdk/src/Models/ScrapeOptions.php index 15be371b3c..7694d70ca1 100644 --- a/apps/php-sdk/src/Models/ScrapeOptions.php +++ b/apps/php-sdk/src/Models/ScrapeOptions.php @@ -7,7 +7,7 @@ final class ScrapeOptions { /** - * @param list|null $formats + * @param list|null $formats * @param array|null $headers * @param list|null $includeTags * @param list|null $excludeTags @@ -41,7 +41,7 @@ private function __construct( ) {} /** - * @param list|null $formats + * @param list|null $formats * @param array|null $headers * @param list|null $includeTags * @param list|null $excludeTags @@ -89,8 +89,8 @@ public function toArray(): array if ($this->formats !== null) { $data['formats'] = array_map( - fn (string|JsonFormat|ScreenshotFormat $f): string|array => - $f instanceof JsonFormat || $f instanceof ScreenshotFormat ? $f->toArray() : $f, + fn (string|JsonFormat|ScreenshotFormat|QueryFormat $f): string|array => + $f instanceof JsonFormat || $f instanceof ScreenshotFormat || $f instanceof QueryFormat ? 
$f->toArray() : $f, $this->formats, ); } @@ -128,7 +128,7 @@ public function toArray(): array return $data; } - /** @return list|null */ + /** @return list|null */ public function getFormats(): ?array { return $this->formats; diff --git a/apps/php-sdk/src/Version.php b/apps/php-sdk/src/Version.php index 11d7dc95b3..e7da56ddc0 100644 --- a/apps/php-sdk/src/Version.php +++ b/apps/php-sdk/src/Version.php @@ -6,5 +6,5 @@ final class Version { - public const SDK_VERSION = '1.1.0'; + public const SDK_VERSION = '1.1.1'; } diff --git a/apps/php-sdk/tests/Unit/ModelsTest.php b/apps/php-sdk/tests/Unit/ModelsTest.php index d3a6b78deb..59b2592bb4 100644 --- a/apps/php-sdk/tests/Unit/ModelsTest.php +++ b/apps/php-sdk/tests/Unit/ModelsTest.php @@ -6,6 +6,7 @@ use Firecrawl\Models\MapData; use Firecrawl\Models\BatchScrapeJob; use Firecrawl\Models\CrawlJob; +use Firecrawl\Models\QueryFormat; use Firecrawl\Models\ScrapeOptions; it('hydrates CreditUsage from nested data key', function (): void { @@ -133,3 +134,19 @@ 'integration' => 'php-sdk', ]); }); + +it('serializes query format mode in ScrapeOptions', function (): void { + $options = ScrapeOptions::with( + formats: [QueryFormat::with('What is Firecrawl?', QueryFormat::MODE_DIRECT_QUOTE)], + ); + + expect($options->toArray()['formats'][0])->toMatchArray([ + 'type' => 'query', + 'prompt' => 'What is Firecrawl?', + 'mode' => 'directQuote', + ]); +}); + +it('rejects invalid query format mode', function (): void { + QueryFormat::with('What is Firecrawl?', 'quoted'); +})->throws(InvalidArgumentException::class, "query mode must be 'freeform' or 'directQuote'"); diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py index 492fe6f85c..e42f93db3d 100644 --- a/apps/python-sdk/firecrawl/__init__.py +++ b/apps/python-sdk/firecrawl/__init__.py @@ -17,7 +17,7 @@ V1ChangeTrackingOptions, ) -__version__ = "4.24.0" +__version__ = "4.24.1" # Define the logger for the Firecrawl project logger: logging.Logger = 
logging.getLogger("firecrawl") diff --git a/apps/python-sdk/firecrawl/__tests__/unit/v2/utils/test_validation.py b/apps/python-sdk/firecrawl/__tests__/unit/v2/utils/test_validation.py index 615fafb7e7..41b205b1a9 100644 --- a/apps/python-sdk/firecrawl/__tests__/unit/v2/utils/test_validation.py +++ b/apps/python-sdk/firecrawl/__tests__/unit/v2/utils/test_validation.py @@ -1,5 +1,5 @@ import pytest -from firecrawl.v2.types import JsonFormat, ScrapeOptions, PDFParser +from firecrawl.v2.types import JsonFormat, QueryFormat, ScrapeOptions, PDFParser from firecrawl.v2.utils.validation import validate_scrape_options, prepare_scrape_options @@ -189,6 +189,47 @@ def test_prepare_invalid_options(self): with pytest.raises(ValueError, match="Timeout must be positive"): prepare_scrape_options(options) + def test_prepare_query_format_with_mode(self): + """Test query format mode is preserved.""" + options = ScrapeOptions( + formats=[QueryFormat(prompt="What is Firecrawl?", mode="directQuote")] + ) + result = prepare_scrape_options(options) + + assert result["formats"] == [ + {"type": "query", "prompt": "What is Firecrawl?", "mode": "directQuote"} + ] + + def test_prepare_query_format_rejects_direct_quote_boolean(self): + """Test old query directQuote layout is rejected.""" + options = ScrapeOptions( + formats=[ + { + "type": "query", + "prompt": "What is Firecrawl?", + "directQuote": True, + } + ] + ) + + with pytest.raises(ValueError, match="uses 'mode' instead of 'directQuote'"): + prepare_scrape_options(options) + + def test_prepare_query_format_rejects_invalid_mode(self): + """Test query mode validation.""" + options = ScrapeOptions( + formats=[ + { + "type": "query", + "prompt": "What is Firecrawl?", + "mode": "quoted", + } + ] + ) + + with pytest.raises(ValueError, match="mode must be 'freeform' or 'directQuote'"): + prepare_scrape_options(options) + def test_prepare_empty_options(self): """Test preparation with empty options.""" options = ScrapeOptions() # All defaults 
diff --git a/apps/python-sdk/firecrawl/v2/types.py b/apps/python-sdk/firecrawl/v2/types.py index a8e9b6b06b..9b06f0341a 100644 --- a/apps/python-sdk/firecrawl/v2/types.py +++ b/apps/python-sdk/firecrawl/v2/types.py @@ -451,6 +451,7 @@ class QueryFormat(Format): type: Literal["query"] = "query" prompt: str + mode: Optional[Literal["freeform", "directQuote"]] = None FormatOption = Union[ diff --git a/apps/python-sdk/firecrawl/v2/utils/validation.py b/apps/python-sdk/firecrawl/v2/utils/validation.py index 0a13f2dcb9..8568bec503 100644 --- a/apps/python-sdk/firecrawl/v2/utils/validation.py +++ b/apps/python-sdk/firecrawl/v2/utils/validation.py @@ -454,6 +454,13 @@ def _validate_query_format(format_obj: Any) -> Dict[str, Any]: if not isinstance(format_obj.get('prompt'), str) or not format_obj['prompt'].strip(): raise ValueError("query format requires a non-empty 'prompt' string") + if "directQuote" in format_obj: + raise ValueError("query format uses 'mode' instead of 'directQuote'") + + mode = format_obj.get("mode") + if mode is not None and mode not in ("freeform", "directQuote"): + raise ValueError("query format mode must be 'freeform' or 'directQuote'") + return format_obj diff --git a/apps/ruby-sdk/lib/firecrawl.rb b/apps/ruby-sdk/lib/firecrawl.rb index 869a08ba06..79af432682 100644 --- a/apps/ruby-sdk/lib/firecrawl.rb +++ b/apps/ruby-sdk/lib/firecrawl.rb @@ -3,6 +3,7 @@ require_relative "firecrawl/version" require_relative "firecrawl/errors" require_relative "firecrawl/http_client" +require_relative "firecrawl/models/query_format" require_relative "firecrawl/models/document" require_relative "firecrawl/models/scrape_options" require_relative "firecrawl/models/crawl_options" diff --git a/apps/ruby-sdk/lib/firecrawl/models/parse_options.rb b/apps/ruby-sdk/lib/firecrawl/models/parse_options.rb index b3edb06c22..1d67aa3be0 100644 --- a/apps/ruby-sdk/lib/firecrawl/models/parse_options.rb +++ b/apps/ruby-sdk/lib/firecrawl/models/parse_options.rb @@ -26,7 +26,7 @@ def 
initialize(**kwargs) def to_h { - "formats" => formats, + "formats" => formats&.map { |fmt| format_value(fmt) }, "headers" => headers, "includeTags" => include_tags, "excludeTags" => exclude_tags, @@ -69,6 +69,10 @@ def extract_format_type(fmt) fmt.respond_to?(:type) ? fmt.type : nil end end + + def format_value(fmt) + fmt.respond_to?(:to_h) ? fmt.to_h : fmt + end end end end diff --git a/apps/ruby-sdk/lib/firecrawl/models/query_format.rb b/apps/ruby-sdk/lib/firecrawl/models/query_format.rb new file mode 100644 index 0000000000..5706ff36b4 --- /dev/null +++ b/apps/ruby-sdk/lib/firecrawl/models/query_format.rb @@ -0,0 +1,34 @@ +# frozen_string_literal: true + +module Firecrawl + module Models + # Query format for asking a question about page content. + class QueryFormat + MODE_FREEFORM = "freeform" + MODE_DIRECT_QUOTE = "directQuote" + + attr_reader :prompt, :mode + + def initialize(prompt:, mode: nil) + unless mode.nil? || [MODE_FREEFORM, MODE_DIRECT_QUOTE].include?(mode) + raise ArgumentError, "query mode must be 'freeform' or 'directQuote'" + end + + @prompt = prompt + @mode = mode + end + + def to_h + { + "type" => "query", + "prompt" => prompt, + "mode" => mode, + }.compact + end + + def type + "query" + end + end + end +end diff --git a/apps/ruby-sdk/lib/firecrawl/models/scrape_options.rb b/apps/ruby-sdk/lib/firecrawl/models/scrape_options.rb index cb6134ac93..dfd29e4743 100644 --- a/apps/ruby-sdk/lib/firecrawl/models/scrape_options.rb +++ b/apps/ruby-sdk/lib/firecrawl/models/scrape_options.rb @@ -20,7 +20,7 @@ def initialize(**kwargs) def to_h { - "formats" => formats, + "formats" => formats&.map { |fmt| format_value(fmt) }, "headers" => headers, "includeTags" => include_tags, "excludeTags" => exclude_tags, @@ -41,6 +41,12 @@ def to_h "integration" => integration, }.compact end + + private + + def format_value(fmt) + fmt.respond_to?(:to_h) ? 
fmt.to_h : fmt + end end end end diff --git a/apps/ruby-sdk/lib/firecrawl/version.rb b/apps/ruby-sdk/lib/firecrawl/version.rb index 10ea39d74c..33b639d21c 100644 --- a/apps/ruby-sdk/lib/firecrawl/version.rb +++ b/apps/ruby-sdk/lib/firecrawl/version.rb @@ -1,5 +1,5 @@ # frozen_string_literal: true module Firecrawl - VERSION = "1.3.0" + VERSION = "1.3.1" end diff --git a/apps/ruby-sdk/test/firecrawl/client_test.rb b/apps/ruby-sdk/test/firecrawl/client_test.rb index 5b4547f4ae..184d5505b6 100644 --- a/apps/ruby-sdk/test/firecrawl/client_test.rb +++ b/apps/ruby-sdk/test/firecrawl/client_test.rb @@ -458,6 +458,25 @@ def test_scrape_options_to_h refute h.key?("timeout") # nil values should be omitted end + def test_query_format_to_h + format = Firecrawl::Models::QueryFormat.new( + prompt: "What is Firecrawl?", + mode: Firecrawl::Models::QueryFormat::MODE_DIRECT_QUOTE + ) + opts = Firecrawl::Models::ScrapeOptions.new(formats: [format]) + + assert_equal( + [{ "type" => "query", "prompt" => "What is Firecrawl?", "mode" => "directQuote" }], + opts.to_h["formats"] + ) + end + + def test_query_format_rejects_invalid_mode + assert_raises(ArgumentError) do + Firecrawl::Models::QueryFormat.new(prompt: "What is Firecrawl?", mode: "quoted") + end + end + def test_scrape_options_skip_tls_defaults_to_false opts = Firecrawl::Models::ScrapeOptions.new assert_equal false, opts.skip_tls_verification diff --git a/apps/rust-sdk/Cargo.lock b/apps/rust-sdk/Cargo.lock index 7f17f6a568..13078e92db 100644 --- a/apps/rust-sdk/Cargo.lock +++ b/apps/rust-sdk/Cargo.lock @@ -250,7 +250,7 @@ checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" [[package]] name = "firecrawl" -version = "2.3.0" +version = "2.3.1" dependencies = [ "mockito", "reqwest", diff --git a/apps/rust-sdk/Cargo.toml b/apps/rust-sdk/Cargo.toml index 7bb9bf1b68..6de8905e76 100644 --- a/apps/rust-sdk/Cargo.toml +++ b/apps/rust-sdk/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "firecrawl" -version = "2.3.0" 
+version = "2.3.1" edition = "2021" license = "MIT" homepage = "https://www.firecrawl.dev/" diff --git a/apps/rust-sdk/src/scrape.rs b/apps/rust-sdk/src/scrape.rs index 650ed92d81..43ec1b86d5 100644 --- a/apps/rust-sdk/src/scrape.rs +++ b/apps/rust-sdk/src/scrape.rs @@ -436,8 +436,30 @@ impl Client { #[cfg(test)] mod tests { use super::*; + use crate::{QueryFormat, QueryFormatMode}; use serde_json::json; + #[test] + fn test_query_format_serializes_mode() { + let options = ScrapeOptions { + formats: Some(vec![Format::Query(QueryFormat { + prompt: "What is Firecrawl?".to_string(), + mode: Some(QueryFormatMode::DirectQuote), + })]), + ..Default::default() + }; + + let payload = serde_json::to_value(options).unwrap(); + assert_eq!( + payload["formats"][0], + json!({ + "type": "query", + "prompt": "What is Firecrawl?", + "mode": "directQuote" + }) + ); + } + #[tokio::test] async fn test_scrape_with_mock() { let mut server = mockito::Server::new_async().await; diff --git a/apps/rust-sdk/src/types.rs b/apps/rust-sdk/src/types.rs index d1bb976c7b..bcc7879ee6 100644 --- a/apps/rust-sdk/src/types.rs +++ b/apps/rust-sdk/src/types.rs @@ -1,14 +1,13 @@ //! Type definitions for Firecrawl API v2. -use serde::{Deserialize, Serialize}; +use serde::{de, Deserialize, Deserializer, Serialize, Serializer}; use serde_json::Value; use std::collections::HashMap; use crate::serde_helpers::deserialize_string_or_array; /// Available output formats for scraping operations. -#[derive(Deserialize, Serialize, Clone, Copy, Debug, PartialEq, Eq)] -#[serde(rename_all = "camelCase")] +#[derive(Clone, Debug, PartialEq, Eq)] pub enum Format { /// Markdown content of the page. Markdown, @@ -34,6 +33,120 @@ pub enum Format { Branding, /// Audio extraction (MP3) from YouTube videos. Audio, + /// Query answer generated from the page content. 
+ Query(QueryFormat), +} + +impl Serialize for Format { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + match self { + Format::Markdown => serializer.serialize_str("markdown"), + Format::Html => serializer.serialize_str("html"), + Format::RawHtml => serializer.serialize_str("rawHtml"), + Format::Links => serializer.serialize_str("links"), + Format::Images => serializer.serialize_str("images"), + Format::Screenshot => serializer.serialize_str("screenshot"), + Format::Summary => serializer.serialize_str("summary"), + Format::ChangeTracking => serializer.serialize_str("changeTracking"), + Format::Json => serializer.serialize_str("json"), + Format::Attributes => serializer.serialize_str("attributes"), + Format::Branding => serializer.serialize_str("branding"), + Format::Audio => serializer.serialize_str("audio"), + Format::Query(query) => query.serialize(serializer), + } + } +} + +impl<'de> Deserialize<'de> for Format { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + let value = Value::deserialize(deserializer)?; + match value { + Value::String(format) => match format.as_str() { + "markdown" => Ok(Format::Markdown), + "html" => Ok(Format::Html), + "rawHtml" => Ok(Format::RawHtml), + "links" => Ok(Format::Links), + "images" => Ok(Format::Images), + "screenshot" => Ok(Format::Screenshot), + "summary" => Ok(Format::Summary), + "changeTracking" => Ok(Format::ChangeTracking), + "json" => Ok(Format::Json), + "attributes" => Ok(Format::Attributes), + "branding" => Ok(Format::Branding), + "audio" => Ok(Format::Audio), + _ => Err(de::Error::custom(format!("unknown format: {}", format))), + }, + Value::Object(_) => QueryFormat::deserialize(value) + .map(Format::Query) + .map_err(de::Error::custom), + _ => Err(de::Error::custom("format must be a string or object")), + } + } +} + +/// Query format for asking a question about page content. 
+#[derive(Clone, Debug, PartialEq, Eq)] +pub struct QueryFormat { + pub prompt: String, + pub mode: Option, +} + +#[derive(Deserialize, Serialize)] +#[serde(rename_all = "camelCase")] +struct QueryFormatWire { + #[serde(rename = "type")] + format_type: String, + prompt: String, + #[serde(skip_serializing_if = "Option::is_none")] + mode: Option, +} + +impl Serialize for QueryFormat { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + QueryFormatWire { + format_type: "query".to_string(), + prompt: self.prompt.clone(), + mode: self.mode, + } + .serialize(serializer) + } +} + +impl<'de> Deserialize<'de> for QueryFormat { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + let wire = QueryFormatWire::deserialize(deserializer)?; + if wire.format_type != "query" { + return Err(de::Error::custom( + "query format object must have type query", + )); + } + + Ok(Self { + prompt: wire.prompt, + mode: wire.mode, + }) + } +} + +/// Query answer mode. +#[derive(Deserialize, Serialize, Clone, Copy, Debug, PartialEq, Eq)] +pub enum QueryFormatMode { + #[serde(rename = "freeform")] + Freeform, + #[serde(rename = "directQuote")] + DirectQuote, } /// Viewport dimensions for screenshots. From 164cd75db9458c7c9df80136de2f78ec01147b50 Mon Sep 17 00:00:00 2001 From: Abimael Martell Date: Tue, 5 May 2026 08:51:48 -0700 Subject: [PATCH 08/27] fix(deps): bump axios to 1.15.2 to resolve security advisories (#3485) Resolves 13 axios advisories (GHSA-3w6x-2g7m-8v23, GHSA-445q-vr5w-6q77, GHSA-5c9x-8gcm-mpgx, GHSA-62hf-57xw-28j9, GHSA-6chq-wfr3-2hj9, GHSA-m7pr-hjqh-92cm, GHSA-pf86-5x62-jrwf, GHSA-pmwg-cvhr-8vh7, GHSA-q8qp-cvcw-x6jj, GHSA-vf2m-468p-8v99, GHSA-w9j2-pvgh-6h63, GHSA-xhjh-pmcv-23jw, GHSA-xx6v-rp6x-q39c) flagged by audit-ci in apps/api, apps/js-sdk, and apps/js-sdk/firecrawl. 
--- apps/api/package.json | 2 +- apps/api/pnpm-lock.yaml | 21 ++++++++++++--------- apps/js-sdk/firecrawl/package.json | 4 ++-- apps/js-sdk/firecrawl/pnpm-lock.yaml | 14 +++++++------- apps/js-sdk/package.json | 2 +- apps/js-sdk/pnpm-lock.yaml | 14 +++++++------- 6 files changed, 30 insertions(+), 27 deletions(-) diff --git a/apps/api/package.json b/apps/api/package.json index 3f7151b4fb..2b31dbe35d 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -103,7 +103,7 @@ "amqplib": "^0.10.9", "async-mutex": "^0.5.0", "autumn-js": "1.2.13", - "axios": "^1.15.0", + "axios": "^1.15.2", "body-parser": "^1.20.3", "bullmq": "^5.56.7", "cacheable-lookup": "^6.1.0", diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index da37ef3172..56db10cc82 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -118,8 +118,8 @@ importers: specifier: 1.2.13 version: 1.2.13(express@4.22.0)(react@18.3.1) axios: - specifier: ^1.15.0 - version: 1.15.0 + specifier: ^1.15.2 + version: 1.16.0 body-parser: specifier: ^1.20.3 version: 1.20.3 @@ -3145,8 +3145,8 @@ packages: peerDependencies: axios: 0.x || 1.x - axios@1.15.0: - resolution: {integrity: sha512-wWyJDlAatxk30ZJer+GeCWS209sA42X+N5jU2jy6oHTp7ufw8uzUTVFBX9+wTfAlhiJXGS0Bq7X6efruWjuK9Q==} + axios@1.16.0: + resolution: {integrity: sha512-6hp5CwvTPlN2A31g5dxnwAX0orzM7pmCRDLnZSX772mv8WDqICwFjowHuPs04Mc8deIld1+ejhtaMn5vp6b+1w==} babel-jest@30.2.0: resolution: {integrity: sha512-0YiBEOxWqKkSQWL9nNGGEgndoeL0ZpWrbLMNL5u/Kaxrli3Eaxlt3ZtIDktEvXt4L/R9r3ODr2zKwGM/2BjxVw==} @@ -5715,6 +5715,7 @@ packages: uuid@10.0.0: resolution: {integrity: sha512-8XkAphELsDnEGrDxUOHB3RGvXz6TeuYSGEZBOjtTtPm2lwhGBjLgOzLHB63IUWfBpNucQjND6d3AOudO+H3RWQ==} + deprecated: uuid@10 and below is no longer supported. For ESM codebases, update to uuid@latest. For CommonJS codebases, use uuid@11 (but be aware this version will likely be deprecated in 2028). 
hasBin: true uuid@13.0.0: @@ -5723,10 +5724,12 @@ packages: uuid@8.3.2: resolution: {integrity: sha512-+NYs2QeMWy+GWFOEm9xnn6HCDp0l7QBD7ml8zLUmJ+93Q5NF0NocErnwkTkXVFNiX3/fpC6afS8Dhb/gz7R7eg==} + deprecated: uuid@10 and below is no longer supported. For ESM codebases, update to uuid@latest. For CommonJS codebases, use uuid@11 (but be aware this version will likely be deprecated in 2028). hasBin: true uuid@9.0.1: resolution: {integrity: sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==} + deprecated: uuid@10 and below is no longer supported. For ESM codebases, update to uuid@latest. For CommonJS codebases, use uuid@11 (but be aware this version will likely be deprecated in 2028). hasBin: true v8-compile-cache-lib@3.0.1: @@ -6346,8 +6349,8 @@ snapshots: '@solana/spl-token': 0.4.13(@solana/web3.js@1.98.4(bufferutil@4.0.9)(encoding@0.1.13)(typescript@5.8.3)(utf-8-validate@5.0.10))(bufferutil@4.0.9)(encoding@0.1.13)(fastestsmallesttextencoderdecoder@1.0.22)(typescript@5.8.3)(utf-8-validate@5.0.10) '@solana/web3.js': 1.98.4(bufferutil@4.0.9)(encoding@0.1.13)(typescript@5.8.3)(utf-8-validate@5.0.10) abitype: 1.0.6(typescript@5.8.3)(zod@3.25.76) - axios: 1.15.0 - axios-retry: 4.5.0(axios@1.15.0) + axios: 1.16.0 + axios-retry: 4.5.0(axios@1.16.0) jose: 6.0.12 md5: 2.3.0 uncrypto: 0.1.3 @@ -8820,12 +8823,12 @@ snapshots: express: 4.22.0 react: 18.3.1 - axios-retry@4.5.0(axios@1.15.0): + axios-retry@4.5.0(axios@1.16.0): dependencies: - axios: 1.15.0 + axios: 1.16.0 is-retry-allowed: 2.2.0 - axios@1.15.0: + axios@1.16.0: dependencies: follow-redirects: 1.16.0 form-data: 4.0.5 diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 5504ff98de..51f8f4ace1 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -26,7 +26,7 @@ "author": "Mendable.ai", "license": "MIT", "dependencies": { - "axios": "1.15.0", + "axios": "1.15.2", "firecrawl": "4.16.0", 
"typescript-event-target": "^1.1.1", "zod": "^3.23.8", @@ -70,7 +70,7 @@ "picomatch@<4.0.4": ">=4.0.4", "handlebars": ">=4.7.9", "brace-expansion": ">=5.0.5", - "axios@<1.15.0": "1.15.0", + "axios@<1.15.2": "1.15.2", "follow-redirects@<1.16.0": ">=1.16.0 <2.0.0" } } diff --git a/apps/js-sdk/firecrawl/pnpm-lock.yaml b/apps/js-sdk/firecrawl/pnpm-lock.yaml index 24f2e05cd8..a242f81d50 100644 --- a/apps/js-sdk/firecrawl/pnpm-lock.yaml +++ b/apps/js-sdk/firecrawl/pnpm-lock.yaml @@ -11,7 +11,7 @@ overrides: picomatch@<4.0.4: '>=4.0.4' handlebars: '>=4.7.9' brace-expansion: '>=5.0.5' - axios@<1.15.0: 1.15.0 + axios@<1.15.2: 1.15.2 follow-redirects@<1.16.0: '>=1.16.0 <2.0.0' importers: @@ -19,8 +19,8 @@ importers: .: dependencies: axios: - specifier: 1.15.0 - version: 1.15.0 + specifier: 1.15.2 + version: 1.15.2 firecrawl: specifier: 4.16.0 version: 4.16.0 @@ -884,8 +884,8 @@ packages: asynckit@0.4.0: resolution: {integrity: sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==} - axios@1.15.0: - resolution: {integrity: sha512-wWyJDlAatxk30ZJer+GeCWS209sA42X+N5jU2jy6oHTp7ufw8uzUTVFBX9+wTfAlhiJXGS0Bq7X6efruWjuK9Q==} + axios@1.15.2: + resolution: {integrity: sha512-wLrXxPtcrPTsNlJmKjkPnNPK2Ihe0hn0wGSaTEiHRPxwjvJwT3hKmXF4dpqxmPO9SoNb2FsYXj/xEo0gHN+D5A==} babel-jest@30.2.0: resolution: {integrity: sha512-0YiBEOxWqKkSQWL9nNGGEgndoeL0ZpWrbLMNL5u/Kaxrli3Eaxlt3ZtIDktEvXt4L/R9r3ODr2zKwGM/2BjxVw==} @@ -2797,7 +2797,7 @@ snapshots: asynckit@0.4.0: {} - axios@1.15.0: + axios@1.15.2: dependencies: follow-redirects: 1.16.0 form-data: 4.0.5 @@ -3090,7 +3090,7 @@ snapshots: firecrawl@4.16.0: dependencies: - axios: 1.15.0 + axios: 1.15.2 typescript-event-target: 1.1.1 zod: 3.25.76 zod-to-json-schema: 3.24.6(zod@3.25.76) diff --git a/apps/js-sdk/package.json b/apps/js-sdk/package.json index 2855beb6e1..046d2ff13d 100644 --- a/apps/js-sdk/package.json +++ b/apps/js-sdk/package.json @@ -12,7 +12,7 @@ "license": "MIT", "dependencies": { 
"@mendable/firecrawl-js": "^4.3.4", - "axios": "^1.15.0", + "axios": "^1.15.2", "firecrawl": "^4.3.4", "uuid": "^10.0.0", "zod": "^3.23.8" diff --git a/apps/js-sdk/pnpm-lock.yaml b/apps/js-sdk/pnpm-lock.yaml index 6d277b5cee..90e64f30cb 100644 --- a/apps/js-sdk/pnpm-lock.yaml +++ b/apps/js-sdk/pnpm-lock.yaml @@ -16,8 +16,8 @@ importers: specifier: ^4.3.4 version: 4.11.1 axios: - specifier: ^1.15.0 - version: 1.15.0 + specifier: ^1.15.2 + version: 1.16.0 firecrawl: specifier: ^4.3.4 version: 4.11.1 @@ -247,8 +247,8 @@ packages: asynckit@0.4.0: resolution: {integrity: sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==} - axios@1.15.0: - resolution: {integrity: sha512-wWyJDlAatxk30ZJer+GeCWS209sA42X+N5jU2jy6oHTp7ufw8uzUTVFBX9+wTfAlhiJXGS0Bq7X6efruWjuK9Q==} + axios@1.16.0: + resolution: {integrity: sha512-6hp5CwvTPlN2A31g5dxnwAX0orzM7pmCRDLnZSX772mv8WDqICwFjowHuPs04Mc8deIld1+ejhtaMn5vp6b+1w==} call-bind-apply-helpers@1.0.2: resolution: {integrity: sha512-Sp1ablJ0ivDkSzjcaJdxEunN5/XvksFJ2sMBFfq6x0ryhQV/2b/KwFe21cMpmHtPOSij8K99/wSfoEuTObmuMQ==} @@ -512,7 +512,7 @@ snapshots: '@mendable/firecrawl-js@4.11.1': dependencies: - axios: 1.15.0 + axios: 1.16.0 typescript-event-target: 1.1.1 zod: 3.25.76 zod-to-json-schema: 3.24.6(zod@3.25.76) @@ -541,7 +541,7 @@ snapshots: asynckit@0.4.0: {} - axios@1.15.0: + axios@1.16.0: dependencies: follow-redirects: 1.16.0 form-data: 4.0.5 @@ -616,7 +616,7 @@ snapshots: firecrawl@4.11.1: dependencies: - axios: 1.15.0 + axios: 1.16.0 typescript-event-target: 1.1.1 zod: 3.25.76 zod-to-json-schema: 3.24.6(zod@3.25.76) From abcafcf8c6754d136e3bcd963ba240ef4cd2cb37 Mon Sep 17 00:00:00 2001 From: mogery Date: Tue, 5 May 2026 18:57:27 +0200 Subject: [PATCH 09/27] feat: audit autofixer pipeline --- .github/scripts/audit-ci-vuln-scan.mjs | 597 ++++++++++++++++++ .../npm-audit-claude-remediation.yml | 103 +++ 2 files changed, 700 insertions(+) create mode 100644 .github/scripts/audit-ci-vuln-scan.mjs 
create mode 100644 .github/workflows/npm-audit-claude-remediation.yml diff --git a/.github/scripts/audit-ci-vuln-scan.mjs b/.github/scripts/audit-ci-vuln-scan.mjs new file mode 100644 index 0000000000..53a500e71d --- /dev/null +++ b/.github/scripts/audit-ci-vuln-scan.mjs @@ -0,0 +1,597 @@ +#!/usr/bin/env node + +import { spawnSync } from "node:child_process"; +import { appendFileSync, mkdirSync, writeFileSync } from "node:fs"; +import path from "node:path"; +import process from "node:process"; + +const AUDITS = [ + { name: "API", appPath: "apps/api", outputName: "api" }, + { + name: "Playwright Service", + appPath: "apps/playwright-service-ts", + outputName: "playwright-service", + }, + { name: "JavaScript SDK", appPath: "apps/js-sdk", outputName: "js-sdk" }, + { + name: "JavaScript SDK Firecrawl", + appPath: "apps/js-sdk/firecrawl", + outputName: "js-sdk-firecrawl", + }, + { + name: "Test Suite", + appPath: "apps/test-suite", + outputName: "test-suite", + }, + { + name: "Ingestion UI", + appPath: "apps/ui/ingestion-ui", + outputName: "ingestion-ui", + }, + { name: "Test Site", appPath: "apps/test-site", outputName: "test-site" }, +]; + +const GHSA_REGEX = /GHSA-[0-9a-z]{4}-[0-9a-z]{4}-[0-9a-z]{4}/gi; +const MARKER_REGEX = //g; +const OUTPUT_DIR = process.env.AUDIT_REMEDIATION_OUTPUT_DIR || "/tmp/audit-remediation"; +const GITHUB_API_URL = process.env.GITHUB_API_URL || "https://api.github.com"; +const GITHUB_TOKEN = process.env.GH_TOKEN || process.env.GITHUB_TOKEN || ""; +const GITHUB_REPOSITORY = process.env.GITHUB_REPOSITORY || ""; + +function run(command, args) { + return spawnSync(command, args, { + encoding: "utf8", + maxBuffer: 1024 * 1024 * 50, + }); +} + +function auditCommand(appPath, extraArgs = []) { + return [ + "dlx", + "audit-ci@^7", + "--directory", + appPath, + "--config", + `${appPath}/audit-ci.jsonc`, + ...extraArgs, + ]; +} + +function commandForDisplay(appPath) { + return `pnpm dlx audit-ci@^7 --directory ${appPath} --config 
${appPath}/audit-ci.jsonc`; +} + +function combinedOutput(result) { + return `${result.stdout || ""}${result.stderr || ""}`; +} + +function unique(values) { + return [...new Set(values.filter(Boolean))]; +} + +function advisoryIdsFrom(value) { + if (value === undefined || value === null) { + return []; + } + + return unique(String(value).match(GHSA_REGEX) || []).map((id) => id.toUpperCase()); +} + +function advisoryIdsFromObject(value) { + return advisoryIdsFrom(JSON.stringify(value)); +} + +function normalizeKey(appPath, advisoryId, packageName) { + return `${appPath}|${advisoryId.toUpperCase()}|${packageName || "unknown"}`; +} + +function pushFinding(findings, finding) { + if (!finding.advisoryId) { + return; + } + + const packageName = finding.packageName || "unknown"; + const key = normalizeKey(finding.appPath, finding.advisoryId, packageName); + const existing = findings.find((item) => item.key === key); + + if (existing) { + existing.paths = unique([...existing.paths, ...(finding.paths || [])]); + existing.severities = unique([...existing.severities, ...(finding.severities || [])]); + existing.urls = unique([...existing.urls, ...(finding.urls || [])]); + existing.titles = unique([...existing.titles, ...(finding.titles || [])]); + return; + } + + findings.push({ + key, + appName: finding.appName, + appPath: finding.appPath, + advisoryId: finding.advisoryId.toUpperCase(), + packageName, + paths: unique(finding.paths || []), + severities: unique(finding.severities || []), + urls: unique(finding.urls || []), + titles: unique(finding.titles || []), + }); +} + +function extractJson(output) { + const trimmed = output.trim(); + + for (const [startToken, endToken] of [ + ["{", "}"], + ["[", "]"], + ]) { + const start = trimmed.indexOf(startToken); + const end = trimmed.lastIndexOf(endToken); + if (start !== -1 && end > start) { + try { + return JSON.parse(trimmed.slice(start, end + 1)); + } catch { + // Try the next shape before falling back to text parsing. 
+ } + } + } + + return null; +} + +function collectFromVulnerabilities(report, audit) { + const findings = []; + const vulnerabilities = report?.vulnerabilities; + + if (!vulnerabilities || typeof vulnerabilities !== "object") { + return findings; + } + + for (const [packageName, vulnerability] of Object.entries(vulnerabilities)) { + const via = Array.isArray(vulnerability?.via) ? vulnerability.via : []; + const paths = unique([ + ...(Array.isArray(vulnerability?.nodes) ? vulnerability.nodes : []), + ...(Array.isArray(vulnerability?.effects) ? vulnerability.effects : []), + ]); + + for (const viaEntry of via) { + if (typeof viaEntry === "string") { + continue; + } + + const advisoryIds = advisoryIdsFromObject(viaEntry); + for (const advisoryId of advisoryIds) { + pushFinding(findings, { + appName: audit.name, + appPath: audit.appPath, + advisoryId, + packageName: viaEntry.name || packageName, + paths, + severities: [viaEntry.severity || vulnerability.severity], + urls: [viaEntry.url], + titles: [viaEntry.title], + }); + } + } + } + + return findings; +} + +function collectFromAdvisories(report, audit) { + const findings = []; + const advisories = report?.advisories; + + if (!advisories || typeof advisories !== "object") { + return findings; + } + + for (const advisory of Object.values(advisories)) { + const advisoryIds = unique([ + ...advisoryIdsFrom(advisory.github_advisory_id), + ...advisoryIdsFrom(advisory.url), + ...advisoryIdsFrom(advisory.title), + ...advisoryIdsFromObject(advisory), + ]); + const paths = []; + + if (Array.isArray(advisory.findings)) { + for (const finding of advisory.findings) { + paths.push(...(Array.isArray(finding.paths) ? 
finding.paths : [])); + } + } + + for (const advisoryId of advisoryIds) { + pushFinding(findings, { + appName: audit.name, + appPath: audit.appPath, + advisoryId, + packageName: advisory.module_name || advisory.name || "unknown", + paths, + severities: [advisory.severity], + urls: [advisory.url], + titles: [advisory.title], + }); + } + } + + return findings; +} + +function collectFallbackFromText(output, audit) { + const findings = []; + const advisoryIds = advisoryIdsFrom(output); + + for (const advisoryId of advisoryIds) { + pushFinding(findings, { + appName: audit.name, + appPath: audit.appPath, + advisoryId, + packageName: "unknown", + paths: [], + severities: [], + urls: [], + titles: [], + }); + } + + return findings; +} + +function collectFindings(report, output, audit) { + const structuredFindings = [ + ...collectFromVulnerabilities(report, audit), + ...collectFromAdvisories(report, audit), + ]; + const structuredAdvisoryIds = new Set( + structuredFindings.map((finding) => `${finding.appPath}|${finding.advisoryId}`), + ); + const fallbackFindings = collectFallbackFromText(output, audit).filter( + (finding) => !structuredAdvisoryIds.has(`${finding.appPath}|${finding.advisoryId}`), + ); + + return [...structuredFindings, ...fallbackFindings]; +} + +async function fetchJson(url) { + const response = await fetch(url, { + headers: { + Accept: "application/vnd.github+json", + Authorization: `Bearer ${GITHUB_TOKEN}`, + "X-GitHub-Api-Version": "2022-11-28", + "User-Agent": "audit-ci-vuln-scan", + }, + }); + + if (!response.ok) { + throw new Error(`GitHub API request failed: ${response.status} ${response.statusText}`); + } + + return response.json(); +} + +async function listOpenPullRequests() { + if (!GITHUB_TOKEN || !GITHUB_REPOSITORY) { + return []; + } + + const pulls = []; + for (let page = 1; ; page += 1) { + const url = `${GITHUB_API_URL}/repos/${GITHUB_REPOSITORY}/pulls?state=open&per_page=100&page=${page}`; + const batch = await fetchJson(url); + 
pulls.push(...batch); + + if (batch.length < 100) { + break; + } + } + + return pulls; +} + +function extractCoveredKeys(pulls) { + const covered = new Map(); + + for (const pull of pulls) { + const body = pull.body || ""; + const markers = body.matchAll(MARKER_REGEX); + + for (const marker of markers) { + try { + const keys = JSON.parse(marker[1]); + if (!Array.isArray(keys)) { + continue; + } + + for (const key of keys) { + if (typeof key !== "string") { + continue; + } + + const existing = covered.get(key) || []; + existing.push({ + number: pull.number, + title: pull.title, + url: pull.html_url, + }); + covered.set(key, existing); + } + } catch { + // Ignore malformed markers rather than treating them as coverage. + } + } + } + + return covered; +} + +function buildMarker(keys) { + return ``; +} + +function writeGithubOutput(name, value) { + if (!process.env.GITHUB_OUTPUT) { + return; + } + + if (String(value).includes("\n")) { + const delimiter = `EOF_${name}_${Date.now()}_${Math.random().toString(16).slice(2)}`; + appendFileSync(process.env.GITHUB_OUTPUT, `${name}<<${delimiter}\n${value}\n${delimiter}\n`); + } else { + appendFileSync(process.env.GITHUB_OUTPUT, `${name}=${value}\n`); + } +} + +function buildPrompt({ uncoveredFindings, coveredFindings, marker, commands }) { + const uncoveredJson = JSON.stringify(uncoveredFindings, null, 2); + const coveredJson = JSON.stringify( + coveredFindings.map((finding) => ({ + key: finding.key, + appPath: finding.appPath, + advisoryId: finding.advisoryId, + packageName: finding.packageName, + coveredBy: finding.coveredBy, + })), + null, + 2, + ); + + return `PNPM Audit Failures Fix + +You are fixing CI security audit failures in this monorepo and must mirror this repo's workflow exactly. + +Source of truth: +- Workflow file: \`.github/workflows/npm-audit.yml\` +- Job: \`audit\` +- Failure reporter step: \`Report audit failures\` +- Reproduce using the same \`audit-ci\` commands/flags used in that workflow. 
+ +Current uncovered audit failures on main: +- Patch only the vulnerability keys listed in this section. +- Do not spend effort on vulnerabilities listed as already covered by open PRs. +- Add this hidden marker inside the PR body's \`## Summary\` section: + ${marker} + +Uncovered vulnerability records: + +\`\`\`json +${uncoveredJson} +\`\`\` + +Already covered by open PR markers: + +\`\`\`json +${coveredJson} +\`\`\` + +CI-equivalent audit commands from \`.github/workflows/npm-audit.yml\`: +${commands.map((command) => `- \`${command}\``).join("\n")} + +Decision policy (strict order): +1) Upgrade higher-level direct dependencies first (non-breaking only: patch/minor). + - Goal: eliminate vulnerable transitive \`vite\` via parent upgrades (e.g. \`astro\` or app-level deps). + - No major upgrades unless explicitly approved. +2) If still failing, add minimal targeted \`pnpm overrides\` / \`resolutions\` for vulnerable transitives. +3) If neither upgrades nor overrides can reach a non-vulnerable version: + - Mark as \`BLOCKED\` + - Document exact blocking dependency chain and required follow-up (likely major upgrade or upstream fix). +4) If advisory is not practically exploitable for this repo: + - Provide evidence-based impact assessment (reachability, prod vs dev/build-only, exploit preconditions), + - Then propose a temporary ignore with: reason, expiry date, owner, and tracking issue. + +Override safety rules (important): +- Never use unbounded replacement ranges that can cross into a new major. +- For major-constrained selector overrides, the replacement must preserve the same major ceiling. 
+- Example of BAD override: + - \`\"vite@>=6.0.0 <7.0.0\": \">=6.4.2\"\` (can resolve to Vite 7/8) +- Example of GOOD override: + - \`\"vite@>=6.0.0 <7.0.0\": \">=6.4.2 <7.0.0\"\` +- If deterministic pin is preferred, use exact patched version: + - \`\"vite@>=6.0.0 <7.0.0\": \"6.4.2\"\` +- In the final explanation, explicitly state why the chosen override cannot drift to a higher major. + +Mandatory local verification (must match workflow commands): +- Run exactly these commands locally (same tool/flags/targets as CI): + - \`pnpm dlx audit-ci@^7 --directory apps/ui/ingestion-ui --config apps/ui/ingestion-ui/audit-ci.jsonc\` + - \`pnpm dlx audit-ci@^7 --directory apps/test-site --config apps/test-site/audit-ci.jsonc\` +- If broader validation is needed, also run the other audit commands defined in \`.github/workflows/npm-audit.yml\`. +- Do not claim success unless these CI-equivalent local commands pass (or a documented temporary ignore/blocked path is approved). + +PR requirements: +- Create a PR when done. +- PR body must contain only a **Summary** section (no **Test plan** section). +- Do not include any co-author lines/footers. +- Summary must be organized by app in this monorepo. +- Include the hidden \`audit-ci-vuln-keys\` marker exactly once inside the **Summary** section. +- For each app, list each package update and explicitly map it to the advisory/advisories it addresses. +- For each resolved package version used to fix the advisory, include its release date in the PR summary. +- If an override is used, include selector and replacement, and note the major-bound guarantee. +- If you update an SDK package, bump that SDK's package version as part of the same PR; otherwise the publish workflow will not publish the SDK changes. 
+ +Output format (exact): +## Findings +- Advisory: +- Severity: <...> +- Affected path(s): <...> +- Fixed by: +- Rationale: + +## Changes Made +- : +- : + +## Local Verification (CI-equivalent) +- Workflow reference: \`.github/workflows/npm-audit.yml\` +- Commands run locally (exact): + - \`\` +- Results: + - \`\` + +## Decision Log +- Step 1 (direct upgrades): +- Step 2 (overrides): +- Step 3 (blocked?): +- Step 4 (temp ignore needed?): + +## PR Summary Draft (by app) +- \`\` + - \`\` -> \`\` (resolved version: \`\`, release date: \`\`) + - \` => \` -> \`\` (major-bound: \`\`, resolved version: \`\`, release date: \`\`) + +## Risk / Follow-up +- Runtime impact: +- If ignored: expires , tracked in , owner +- Next recommended action: +`; +} + +function writeStepSummary({ findings, coveredFindings, uncoveredFindings, outputDir }) { + if (!process.env.GITHUB_STEP_SUMMARY) { + return; + } + + const lines = [ + "## Audit CI Vulnerability Scan", + "", + `- Current vulnerabilities on default branch: ${findings.length}`, + `- Covered by open PR markers: ${coveredFindings.length}`, + `- Uncovered vulnerabilities for Claude: ${uncoveredFindings.length}`, + `- Output directory: \`${outputDir}\``, + "", + ]; + + if (uncoveredFindings.length > 0) { + lines.push("### Uncovered", ""); + for (const finding of uncoveredFindings) { + lines.push(`- \`${finding.key}\``); + } + } + + appendFileSync(process.env.GITHUB_STEP_SUMMARY, `${lines.join("\n")}\n`); +} + +async function main() { + mkdirSync(OUTPUT_DIR, { recursive: true }); + + const findings = []; + const auditResults = []; + + for (const audit of AUDITS) { + const textResult = run("pnpm", auditCommand(audit.appPath)); + const textOutput = combinedOutput(textResult); + const textOutputPath = path.join(OUTPUT_DIR, `${audit.outputName}.txt`); + + writeFileSync(textOutputPath, textOutput); + process.stdout.write(textOutput); + + const failed = textResult.status !== 0; + auditResults.push({ + ...audit, + failed, + command: 
commandForDisplay(audit.appPath), + textOutputPath, + }); + + if (!failed) { + continue; + } + + const jsonResult = run("pnpm", auditCommand(audit.appPath, ["--output-format", "json"])); + const jsonOutput = combinedOutput(jsonResult); + const jsonOutputPath = path.join(OUTPUT_DIR, `${audit.outputName}.json`); + writeFileSync(jsonOutputPath, jsonOutput); + + const report = extractJson(jsonOutput); + const appFindings = collectFindings(report, `${textOutput}\n${jsonOutput}`, audit); + + if (appFindings.length === 0) { + throw new Error( + `${audit.name} audit failed, but no GHSA advisory IDs could be parsed from audit-ci output.`, + ); + } + + for (const finding of appFindings) { + pushFinding(findings, finding); + } + } + + const pulls = await listOpenPullRequests(); + const coveredKeys = extractCoveredKeys(pulls); + const coveredFindings = []; + const uncoveredFindings = []; + + for (const finding of findings) { + const coveredBy = coveredKeys.get(finding.key); + if (coveredBy) { + coveredFindings.push({ ...finding, coveredBy }); + } else { + uncoveredFindings.push(finding); + } + } + + const uncoveredKeys = uncoveredFindings.map((finding) => finding.key); + const marker = buildMarker(uncoveredKeys); + const prompt = buildPrompt({ + uncoveredFindings, + coveredFindings, + marker, + commands: AUDITS.map((audit) => commandForDisplay(audit.appPath)), + }); + + const findingsPath = path.join(OUTPUT_DIR, "findings.json"); + const uncoveredPath = path.join(OUTPUT_DIR, "uncovered-findings.json"); + const coveredPath = path.join(OUTPUT_DIR, "covered-findings.json"); + const auditResultsPath = path.join(OUTPUT_DIR, "audit-results.json"); + const promptPath = path.join(OUTPUT_DIR, "claude-prompt.md"); + + writeFileSync(findingsPath, `${JSON.stringify(findings, null, 2)}\n`); + writeFileSync(uncoveredPath, `${JSON.stringify(uncoveredFindings, null, 2)}\n`); + writeFileSync(coveredPath, `${JSON.stringify(coveredFindings, null, 2)}\n`); + writeFileSync(auditResultsPath, 
`${JSON.stringify(auditResults, null, 2)}\n`); + writeFileSync(promptPath, prompt); + + writeGithubOutput("has_uncovered", uncoveredFindings.length > 0 ? "true" : "false"); + writeGithubOutput("prompt", prompt); + writeGithubOutput("prompt_file", promptPath); + writeGithubOutput("marker", marker); + writeGithubOutput("vuln_keys_json", JSON.stringify(uncoveredKeys)); + writeGithubOutput("uncovered_findings_json", JSON.stringify(uncoveredFindings)); + writeGithubOutput("uncovered_count", String(uncoveredFindings.length)); + writeGithubOutput("covered_count", String(coveredFindings.length)); + writeGithubOutput("findings_count", String(findings.length)); + + writeStepSummary({ + findings, + coveredFindings, + uncoveredFindings, + outputDir: OUTPUT_DIR, + }); + + if (uncoveredFindings.length === 0) { + console.log("No uncovered audit-ci vulnerabilities found on the default branch."); + } else { + console.log(`Found ${uncoveredFindings.length} uncovered audit-ci vulnerability key(s).`); + } +} + +main().catch((error) => { + console.error(error); + process.exit(1); +}); diff --git a/.github/workflows/npm-audit-claude-remediation.yml b/.github/workflows/npm-audit-claude-remediation.yml new file mode 100644 index 0000000000..f148c5dcf4 --- /dev/null +++ b/.github/workflows/npm-audit-claude-remediation.yml @@ -0,0 +1,103 @@ +name: Audit NPM Claude Remediation + +on: + workflow_run: + workflows: ["Audit NPM Packages"] + types: + - completed + +permissions: + actions: read + contents: write + issues: write + pull-requests: write + +concurrency: + group: audit-npm-claude-remediation-${{ github.event.repository.default_branch }} + cancel-in-progress: false + +jobs: + remediate: + if: ${{ github.event.workflow_run.conclusion == 'failure' }} + runs-on: blacksmith-2vcpu-ubuntu-2404 + timeout-minutes: 60 + + steps: + - name: Checkout default branch + uses: actions/checkout@v5 + with: + ref: ${{ github.event.repository.default_branch }} + + - name: Install pnpm + uses: 
pnpm/action-setup@fc06bc1257f339d1d5d8b3a19a8cae5388b55320 # v4 + with: + version: 10 + + - name: Scan default branch audit failures + id: scan + env: + GH_TOKEN: ${{ github.token }} + GITHUB_TOKEN: ${{ github.token }} + run: node .github/scripts/audit-ci-vuln-scan.mjs + + - name: Skip when all vulnerabilities are covered + if: ${{ steps.scan.outputs.has_uncovered != 'true' }} + run: | + echo "No uncovered audit-ci vulnerabilities on ${GITHUB_REF_NAME}; skipping Claude." + + - name: Run Claude remediation + id: claude + if: ${{ steps.scan.outputs.has_uncovered == 'true' }} + uses: anthropics/claude-code-action@2cc1ac1331eac7a6a96d716dd204dd2888d0fcd2 # v1 + with: + anthropic_api_key: ${{ secrets.NPM_AUDIT_CLAUDE_ANTHROPIC_API_KEY }} + base_branch: ${{ github.event.repository.default_branch }} + branch_prefix: claude/audit-ci/ + prompt: ${{ steps.scan.outputs.prompt }} + claude_args: | + --max-turns 20 + + - name: Ensure remediation PR marker + if: ${{ steps.scan.outputs.has_uncovered == 'true' && steps.claude.outputs.branch_name != '' }} + env: + GH_TOKEN: ${{ github.token }} + BRANCH_NAME: ${{ steps.claude.outputs.branch_name }} + MARKER: ${{ steps.scan.outputs.marker }} + run: | + set -euo pipefail + + PR_NUMBER="$(gh pr list --head "$BRANCH_NAME" --state open --json number --jq '.[0].number // empty')" + if [ -z "$PR_NUMBER" ]; then + echo "::warning::Claude did not leave an open PR for branch ${BRANCH_NAME}; marker could not be enforced." 
+ exit 0 + fi + + gh label create audit-ci-remediation \ + --description "Automated audit-ci vulnerability remediation" \ + --color "B60205" \ + 2>/dev/null || true + gh pr edit "$PR_NUMBER" --add-label audit-ci-remediation + + BODY_FILE="$(mktemp)" + UPDATED_BODY_FILE="$(mktemp)" + gh pr view "$PR_NUMBER" --json body --jq '.body // ""' > "$BODY_FILE" + + node - "$BODY_FILE" "$UPDATED_BODY_FILE" "$MARKER" <<'NODE' + const { readFileSync, writeFileSync } = require("node:fs"); + + const [, , inputPath, outputPath, marker] = process.argv; + const markerRegex = //; + let body = readFileSync(inputPath, "utf8"); + + if (!markerRegex.test(body)) { + if (/^## Summary\b.*$/m.test(body)) { + body = body.replace(/^## Summary\b.*$/m, (heading) => `${heading}\n\n${marker}`); + } else { + body = `## Summary\n\n${marker}\n\n${body.trim()}\n`; + } + } + + writeFileSync(outputPath, body); + NODE + + gh pr edit "$PR_NUMBER" --body-file "$UPDATED_BODY_FILE" From 60adad74b1c53474afa1eabb0e55a7372bbf4fd0 Mon Sep 17 00:00:00 2001 From: mogery Date: Wed, 6 May 2026 02:44:49 +0200 Subject: [PATCH 10/27] fix(audit-autofixer): perms --- .github/workflows/npm-audit-claude-remediation.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/npm-audit-claude-remediation.yml b/.github/workflows/npm-audit-claude-remediation.yml index f148c5dcf4..1fc876cd51 100644 --- a/.github/workflows/npm-audit-claude-remediation.yml +++ b/.github/workflows/npm-audit-claude-remediation.yml @@ -5,10 +5,12 @@ on: workflows: ["Audit NPM Packages"] types: - completed + workflow_dispatch: permissions: actions: read contents: write + id-token: write issues: write pull-requests: write @@ -18,7 +20,7 @@ concurrency: jobs: remediate: - if: ${{ github.event.workflow_run.conclusion == 'failure' }} + if: ${{ github.event_name == 'workflow_dispatch' || github.event.workflow_run.conclusion == 'failure' }} runs-on: blacksmith-2vcpu-ubuntu-2404 timeout-minutes: 60 From 
6abef0eb16357c296eb7f714f896aac8017142be Mon Sep 17 00:00:00 2001 From: mogery Date: Wed, 6 May 2026 13:43:30 +0200 Subject: [PATCH 11/27] fix(audit-autofixer): claude args --- .github/workflows/npm-audit-claude-remediation.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/npm-audit-claude-remediation.yml b/.github/workflows/npm-audit-claude-remediation.yml index 1fc876cd51..1eb13963d3 100644 --- a/.github/workflows/npm-audit-claude-remediation.yml +++ b/.github/workflows/npm-audit-claude-remediation.yml @@ -57,7 +57,8 @@ jobs: branch_prefix: claude/audit-ci/ prompt: ${{ steps.scan.outputs.prompt }} claude_args: | - --max-turns 20 + --max-turns 40 + --allowedTools "Read,Edit,MultiEdit,Write,Glob,Grep,LS,WebFetch,WebSearch,TodoWrite,Bash(pnpm:*),Bash(npm:*),Bash(node:*),Bash(git:*),Bash(gh:*),Bash(jq:*),Bash(rg:*),Bash(curl:*),Bash(date:*)" - name: Ensure remediation PR marker if: ${{ steps.scan.outputs.has_uncovered == 'true' && steps.claude.outputs.branch_name != '' }} From cccbee80d329eecc54efe71ca7da9c2eeaf6db21 Mon Sep 17 00:00:00 2001 From: "claude[bot]" <209825114+claude[bot]@users.noreply.github.com> Date: Wed, 6 May 2026 13:50:15 +0200 Subject: [PATCH 12/27] fix(audit): allowlist GHSA-v2v4-37r5-5v8g (ip-address XSS) in API (#3487) Co-authored-by: claude[bot] <41898282+claude[bot]@users.noreply.github.com> --- apps/api/audit-ci.jsonc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/apps/api/audit-ci.jsonc b/apps/api/audit-ci.jsonc index b2f1c1c3ec..fec8e6004b 100644 --- a/apps/api/audit-ci.jsonc +++ b/apps/api/audit-ci.jsonc @@ -2,6 +2,7 @@ "$schema": "https://github.com/IBM/audit-ci/raw/main/docs/schema.json", "low": true, "allowlist": [ - "GHSA-w5hq-g745-h8pq" + "GHSA-w5hq-g745-h8pq", + "GHSA-v2v4-37r5-5v8g" ] } From 92cbe08c89da651f4de24af2e4283abc9ada8971 Mon Sep 17 00:00:00 2001 From: "firecrawl-spring[bot]" <254786068+firecrawl-spring[bot]@users.noreply.github.com> Date: Wed, 6 May 2026 
12:18:19 +0000 Subject: [PATCH 13/27] feat(elixir-sdk): add parse_file for /parse endpoint Adds multipart/form-data support to the OpenAPI generator and exposes parse_file/parse_file! in the Elixir SDK, bringing it to parity with the other SDKs. --- apps/elixir-sdk/README.md | 6 + apps/elixir-sdk/generate.exs | 253 ++++++++++++++++++++++++ apps/elixir-sdk/lib/firecrawl.ex | 105 ++++++++++ apps/elixir-sdk/mix.exs | 2 +- apps/elixir-sdk/test/firecrawl_test.exs | 39 +++- 5 files changed, 403 insertions(+), 2 deletions(-) diff --git a/apps/elixir-sdk/README.md b/apps/elixir-sdk/README.md index db1b8127ee..ec3efb68cd 100644 --- a/apps/elixir-sdk/README.md +++ b/apps/elixir-sdk/README.md @@ -59,6 +59,12 @@ All params are passed as keyword lists with snake_case keys. Invalid keys, missi # Check crawl status {:ok, response} = Firecrawl.get_crawl_status("job-uuid") +# Parse a file (PDF, DOCX, HTML, etc.) +{:ok, response} = Firecrawl.parse_file( + [filename: "report.pdf", data: File.read!("report.pdf"), content_type: "application/pdf"], + formats: ["markdown"] +) + # Self-hosted instance {:ok, response} = Firecrawl.scrape_and_extract_from_url( [url: "https://example.com"], diff --git a/apps/elixir-sdk/generate.exs b/apps/elixir-sdk/generate.exs index 7c97f522c7..e0327912aa 100644 --- a/apps/elixir-sdk/generate.exs +++ b/apps/elixir-sdk/generate.exs @@ -263,6 +263,38 @@ defmodule Firecrawl.Generator do has_body = Map.has_key?(operation, "requestBody") http_method = String.upcase(method) + if has_body and multipart?(operation) do + generate_multipart_function(method, path, operation, spec, func_name, summary, tag) + else + generate_json_function( + method, + path, + operation, + spec, + func_name, + summary, + tag, + path_params, + query_params, + has_body, + http_method + ) + end + end + + defp generate_json_function( + method, + path, + operation, + spec, + func_name, + summary, + tag, + path_params, + query_params, + has_body, + http_method + ) do # Extract request body 
schema body_properties = if has_body, do: extract_body_properties(operation, spec), else: [] required_keys = if has_body, do: extract_required_keys(operation, spec), else: [] @@ -348,6 +380,227 @@ defmodule Firecrawl.Generator do parts |> Enum.reject(&is_nil/1) |> Enum.join("\n") end + # --------------------------------------------------------------------------- + # Multipart Support + # --------------------------------------------------------------------------- + + defp multipart?(operation) do + content = get_in(operation, ["requestBody", "content"]) || %{} + Map.has_key?(content, "multipart/form-data") + end + + defp extract_multipart_meta(operation, spec) do + schema = get_in(operation, ["requestBody", "content", "multipart/form-data", "schema"]) || %{} + props = resolve_properties(schema, spec) + + {file_props, other_props} = + Enum.split_with(props, fn {_name, ps} -> + Map.get(ps, "format") == "binary" + end) + + file_field = + case file_props do + [{name, _} | _] -> name + _ -> nil + end + + {options_field, options_props, options_required} = + case Enum.find(other_props, fn {_n, ps} -> + Map.has_key?(ps, "properties") or Map.has_key?(ps, "$ref") or + Map.has_key?(ps, "allOf") + end) do + {name, ps} -> + inner_props = resolve_properties(ps, spec) + inner_required = resolve_required(ps, spec) + {name, inner_props, inner_required} + + nil -> + {nil, [], []} + end + + %{ + file_field: file_field, + options_field: options_field, + options_props: options_props, + options_required: options_required + } + end + + defp generate_multipart_function(method, path, operation, spec, func_name, summary, tag) do + meta = extract_multipart_meta(operation, spec) + http_method = String.upcase(method) + has_options? 
= meta.options_props != [] + + body_schema_code = + if has_options?, do: generate_schema(func_name, meta.options_props, meta.options_required), else: nil + + body_key_mapping_code = + if has_options?, do: generate_key_mapping(func_name, meta.options_props), else: nil + + doc = build_multipart_doc(summary, http_method, path, tag, func_name, has_options?, meta) + + {sig, body} = build_multipart_function_body(func_name, method, path, meta, has_options?, false) + {bang_sig, bang_body} = build_multipart_function_body(func_name, method, path, meta, has_options?, true) + + spec_code = build_multipart_typespec(func_name, has_options?, false) + bang_spec_code = build_multipart_typespec(func_name, has_options?, true) + + parts = [ + body_schema_code, + body_key_mapping_code, + doc, + spec_code, + " #{sig}", + body, + "", + doc_bang(func_name), + bang_spec_code, + " #{bang_sig}", + bang_body, + "" + ] + + parts |> Enum.reject(&is_nil/1) |> Enum.join("\n") + end + + defp build_multipart_function_body(func_name, method, path, meta, has_options?, bang?) do + req_method = String.to_atom(method) + fn_name = if bang?, do: "#{func_name}!", else: func_name + req_fn = if bang?, do: "#{req_method}!", else: "#{req_method}" + options_field = meta.options_field + file_field = meta.file_field + + sig = + if has_options? do + "def #{fn_name}(file, params \\\\ [], opts \\\\ []) do" + else + "def #{fn_name}(file, opts \\\\ []) do" + end + + options_part_text = + cond do + has_options? and not is_nil(options_field) -> + "{\"#{options_field}\", Jason.encode!(to_body(params, @#{func_name}_key_mapping))}, " + + true -> + "" + end + + file_part_text = "{\"#{file_field}\", file_part}" + + indent = if not bang? 
and has_options?, do: " ", else: " " + + core_lines = [ + "#{indent}filename = Keyword.fetch!(file, :filename)", + "#{indent}data = Keyword.fetch!(file, :data)", + "#{indent}content_type = Keyword.get(file, :content_type)", + "", + "#{indent}if not is_binary(filename) or filename == \"\" do", + "#{indent} raise ArgumentError, \"filename cannot be empty\"", + "#{indent}end", + "", + "#{indent}if is_nil(data) do", + "#{indent} raise ArgumentError, \"file data cannot be empty\"", + "#{indent}end", + "", + "#{indent}file_part =", + "#{indent} case content_type do", + "#{indent} nil -> {data, filename: filename}", + "#{indent} ct -> {data, filename: filename, content_type: ct}", + "#{indent} end", + "", + "#{indent}multipart = [#{options_part_text}#{file_part_text}]", + "", + "#{indent}Req.#{req_fn}(client(opts), url: \"#{path}\", form_multipart: multipart)" + ] + + core = Enum.join(core_lines, "\n") + + body = + cond do + bang? and has_options? -> + " params = NimbleOptions.validate!(params, @#{func_name}_schema)\n#{core}\n end\n" + + not bang? and has_options? -> + " with {:ok, params} <- NimbleOptions.validate(params, @#{func_name}_schema) do\n#{core}\n end\n end\n" + + true -> + "#{core}\n end\n" + end + + {sig, body} + end + + defp build_multipart_typespec(func_name, has_options?, bang?) do + name = if bang?, do: "#{func_name}!", else: func_name + return_type = if bang?, do: "Req.Response.t()", else: "response()" + + args = + if has_options? do + "keyword(), keyword(), keyword()" + else + "keyword(), keyword()" + end + + " @spec #{name}(#{args}) :: #{return_type}" + end + + defp build_multipart_doc(summary, http_method, path, tag, func_name, has_options?, meta) do + bt = <<96>> + + parts = [ + " @doc \"\"\"", + " #{summary}", + "", + " #{bt}#{http_method} #{path}#{bt}", + "", + " Sends a #{bt}multipart/form-data#{bt} request." 
+ ] + + parts = if tag != "", do: parts ++ ["", " Tag: #{tag}"], else: parts + + parts = + parts ++ + [ + "", + " ## File", + "", + " Pass #{bt}file#{bt} as a keyword list:", + "", + " * #{bt}:filename#{bt} (required) - The filename to send.", + " * #{bt}:data#{bt} (required) - The file contents as a binary.", + " * #{bt}:content_type#{bt} (optional) - The MIME type of the file." + ] + + parts = + if has_options? do + parts ++ + [ + "", + " ## Parameters", + "", + " Validated by #{bt}NimbleOptions#{bt}. Pass options as a keyword list with snake_case keys.", + " These are JSON-encoded and sent as the #{bt}#{meta.options_field}#{bt} multipart field.", + " See #{bt}@#{func_name}_schema#{bt} for the full schema." + ] + else + parts + end + + parts = + parts ++ + [ + "", + " ## Returns", + "", + " * #{bt}{:ok, %Req.Response{}}#{bt} on success", + " * #{bt}{:error, exception}#{bt} on HTTP or validation failure", + " \"\"\"" + ] + + Enum.join(parts, "\n") + end + # --------------------------------------------------------------------------- # Schema Extraction # --------------------------------------------------------------------------- diff --git a/apps/elixir-sdk/lib/firecrawl.ex b/apps/elixir-sdk/lib/firecrawl.ex index d974d853b5..b38318ce70 100644 --- a/apps/elixir-sdk/lib/firecrawl.ex +++ b/apps/elixir-sdk/lib/firecrawl.ex @@ -901,6 +901,111 @@ defmodule Firecrawl do end + @parse_file_schema NimbleOptions.new!([ + block_ads: [type: :boolean, doc: "Enable ad and cookie popup blocking."], + exclude_tags: [type: {:list, :string}, doc: "Tags to exclude from the output."], + formats: [type: {:list, :any}, doc: "Output formats supported for `/parse` uploads. 
Browser-rendering formats and change tracking are not supported."], + headers: [type: :any, doc: "Headers to send when additional network requests are required."], + include_tags: [type: {:list, :string}, doc: "Tags to include in the output."], + integration: [type: :string, doc: "Optional integration identifier."], + only_main_content: [type: :boolean, doc: "Only return the main content of the page excluding headers, navs, footers, etc."], + origin: [type: :string, doc: "Origin identifier for analytics and logging."], + parsers: [type: {:list, :any}, doc: "Controls file parser behavior when relevant (for example PDF parser mode)."], + proxy: [type: {:or, [{:in, [:basic, :auto]}, :string]}, doc: "Proxy mode for parse uploads. `/parse` supports only `basic` and `auto`."], + remove_base64_images: [type: :boolean, doc: "Remove base64-encoded images from output and keep alt text placeholders."], + skip_tls_verification: [type: :boolean, doc: "Skip TLS certificate verification when making requests."], + timeout: [type: :integer, doc: "Timeout in milliseconds for the request. Default is 30000 (30 seconds). Maximum is 300000 (300 seconds)."], + zero_data_retention: [type: :boolean, doc: "If true, this will enable zero data retention for this parse. To enable this feature, please contact help@firecrawl.dev"] + ]) + + @parse_file_key_mapping %{block_ads: "blockAds", exclude_tags: "excludeTags", formats: "formats", headers: "headers", include_tags: "includeTags", integration: "integration", only_main_content: "onlyMainContent", origin: "origin", parsers: "parsers", proxy: "proxy", remove_base64_images: "removeBase64Images", skip_tls_verification: "skipTlsVerification", timeout: "timeout", zero_data_retention: "zeroDataRetention"} + + @doc """ + Upload and parse a file + + `POST /parse` + + Sends a `multipart/form-data` request. + + Tag: Scraping + + ## File + + Pass `file` as a keyword list: + + * `:filename` (required) - The filename to send. 
+ * `:data` (required) - The file contents as a binary. + * `:content_type` (optional) - The MIME type of the file. + + ## Parameters + + Validated by `NimbleOptions`. Pass options as a keyword list with snake_case keys. + These are JSON-encoded and sent as the `options` multipart field. + See `@parse_file_schema` for the full schema. + + ## Returns + + * `{:ok, %Req.Response{}}` on success + * `{:error, exception}` on HTTP or validation failure + """ + @spec parse_file(keyword(), keyword(), keyword()) :: response() + def parse_file(file, params \\ [], opts \\ []) do + with {:ok, params} <- NimbleOptions.validate(params, @parse_file_schema) do + filename = Keyword.fetch!(file, :filename) + data = Keyword.fetch!(file, :data) + content_type = Keyword.get(file, :content_type) + + if not is_binary(filename) or filename == "" do + raise ArgumentError, "filename cannot be empty" + end + + if is_nil(data) do + raise ArgumentError, "file data cannot be empty" + end + + file_part = + case content_type do + nil -> {data, filename: filename} + ct -> {data, filename: filename, content_type: ct} + end + + multipart = [{"options", Jason.encode!(to_body(params, @parse_file_key_mapping))}, {"file", file_part}] + + Req.post(client(opts), url: "/parse", form_multipart: multipart) + end + end + + + @doc """ + Bang variant of `parse_file`. Raises on error. 
+ """ + @spec parse_file!(keyword(), keyword(), keyword()) :: Req.Response.t() + def parse_file!(file, params \\ [], opts \\ []) do + params = NimbleOptions.validate!(params, @parse_file_schema) + filename = Keyword.fetch!(file, :filename) + data = Keyword.fetch!(file, :data) + content_type = Keyword.get(file, :content_type) + + if not is_binary(filename) or filename == "" do + raise ArgumentError, "filename cannot be empty" + end + + if is_nil(data) do + raise ArgumentError, "file data cannot be empty" + end + + file_part = + case content_type do + nil -> {data, filename: filename} + ct -> {data, filename: filename, content_type: ct} + end + + multipart = [{"options", Jason.encode!(to_body(params, @parse_file_key_mapping))}, {"file", file_part}] + + Req.post!(client(opts), url: "/parse", form_multipart: multipart) + end + + @scrape_and_extract_from_url_schema NimbleOptions.new!([ url: [type: :string, required: true, doc: "The URL to scrape"], actions: [type: {:list, :any}, doc: "Actions to perform on the page before grabbing the content"], diff --git a/apps/elixir-sdk/mix.exs b/apps/elixir-sdk/mix.exs index 479cf2217b..e15468af24 100644 --- a/apps/elixir-sdk/mix.exs +++ b/apps/elixir-sdk/mix.exs @@ -1,7 +1,7 @@ defmodule Firecrawl.MixProject do use Mix.Project - @version "1.2.1" + @version "1.3.0" @source_url "https://github.com/firecrawl/firecrawl/tree/main/apps/elixir-sdk" def project do diff --git a/apps/elixir-sdk/test/firecrawl_test.exs b/apps/elixir-sdk/test/firecrawl_test.exs index 72f6a9d149..f6c6b6daa1 100644 --- a/apps/elixir-sdk/test/firecrawl_test.exs +++ b/apps/elixir-sdk/test/firecrawl_test.exs @@ -124,6 +124,37 @@ defmodule FirecrawlTest do "Expected connection error, got validation error: #{inspect(err)}" end + test "parse_file raises ArgumentError when filename is empty" do + Application.put_env(:firecrawl, :api_key, "test-key") + on_exit(fn -> Application.delete_env(:firecrawl, :api_key) end) + + assert_raise ArgumentError, ~r/filename cannot be 
empty/, fn -> + Firecrawl.parse_file([filename: "", data: "x"]) + end + end + + test "parse_file raises ArgumentError when data is nil" do + Application.put_env(:firecrawl, :api_key, "test-key") + on_exit(fn -> Application.delete_env(:firecrawl, :api_key) end) + + assert_raise ArgumentError, ~r/file data cannot be empty/, fn -> + Firecrawl.parse_file([filename: "doc.pdf", data: nil]) + end + end + + test "parse_file rejects unknown options" do + Application.put_env(:firecrawl, :api_key, "test-key") + on_exit(fn -> Application.delete_env(:firecrawl, :api_key) end) + + assert {:error, %NimbleOptions.ValidationError{message: msg}} = + Firecrawl.parse_file( + [filename: "doc.pdf", data: "x"], + typo_option: true + ) + + assert msg =~ "unknown options" + end + test "non-bang returns {:error, %Firecrawl.Error{}} for API errors" do adapter = fn request -> resp = Req.Response.new( @@ -199,7 +230,13 @@ defmodule FirecrawlTest do {:get_queue_status, 0}, {:get_queue_status!, 0}, {:cancel_crawl, 1}, - {:cancel_crawl!, 1} + {:cancel_crawl!, 1}, + {:parse_file, 1}, + {:parse_file, 2}, + {:parse_file, 3}, + {:parse_file!, 1}, + {:parse_file!, 2}, + {:parse_file!, 3} ] for {name, arity} <- expected do From b59ac6f93b08a7aad902548f7702fcbe8e97cdbf Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 6 May 2026 12:37:41 +0000 Subject: [PATCH 14/27] fix(elixir-sdk): return error tuple from parse_file/3 instead of raising The non-bang parse_file/3 now returns {:error, %ArgumentError{}} for invalid file inputs (empty filename, nil data) instead of raising, matching the documented {:error, exception} contract. The bang variant parse_file!/3 continues to raise as expected. 
Co-Authored-By: gaurav --- apps/elixir-sdk/lib/firecrawl.ex | 39 ++++++++++++++++++------- apps/elixir-sdk/test/firecrawl_test.exs | 18 +++++++----- 2 files changed, 38 insertions(+), 19 deletions(-) diff --git a/apps/elixir-sdk/lib/firecrawl.ex b/apps/elixir-sdk/lib/firecrawl.ex index b38318ce70..ba32d0d289 100644 --- a/apps/elixir-sdk/lib/firecrawl.ex +++ b/apps/elixir-sdk/lib/firecrawl.ex @@ -130,6 +130,29 @@ defmodule Firecrawl do Enum.join([first | Enum.map(rest, &String.capitalize/1)]) end + defp fetch_file_field(file, key) do + case Keyword.fetch(file, key) do + {:ok, _value} = ok -> ok + :error -> {:error, %ArgumentError{message: "missing required file field: #{key}"}} + end + end + + defp validate_filename(filename) do + if is_binary(filename) and filename != "" do + :ok + else + {:error, %ArgumentError{message: "filename cannot be empty"}} + end + end + + defp validate_data(data) do + if is_nil(data) do + {:error, %ArgumentError{message: "file data cannot be empty"}} + else + :ok + end + end + @doc """ Cancel an agent job @@ -950,19 +973,13 @@ defmodule Firecrawl do """ @spec parse_file(keyword(), keyword(), keyword()) :: response() def parse_file(file, params \\ [], opts \\ []) do - with {:ok, params} <- NimbleOptions.validate(params, @parse_file_schema) do - filename = Keyword.fetch!(file, :filename) - data = Keyword.fetch!(file, :data) + with {:ok, params} <- NimbleOptions.validate(params, @parse_file_schema), + {:ok, filename} <- fetch_file_field(file, :filename), + :ok <- validate_filename(filename), + {:ok, data} <- fetch_file_field(file, :data), + :ok <- validate_data(data) do content_type = Keyword.get(file, :content_type) - if not is_binary(filename) or filename == "" do - raise ArgumentError, "filename cannot be empty" - end - - if is_nil(data) do - raise ArgumentError, "file data cannot be empty" - end - file_part = case content_type do nil -> {data, filename: filename} diff --git a/apps/elixir-sdk/test/firecrawl_test.exs 
b/apps/elixir-sdk/test/firecrawl_test.exs index f6c6b6daa1..acc9969004 100644 --- a/apps/elixir-sdk/test/firecrawl_test.exs +++ b/apps/elixir-sdk/test/firecrawl_test.exs @@ -124,22 +124,24 @@ defmodule FirecrawlTest do "Expected connection error, got validation error: #{inspect(err)}" end - test "parse_file raises ArgumentError when filename is empty" do + test "parse_file returns error tuple when filename is empty" do Application.put_env(:firecrawl, :api_key, "test-key") on_exit(fn -> Application.delete_env(:firecrawl, :api_key) end) - assert_raise ArgumentError, ~r/filename cannot be empty/, fn -> - Firecrawl.parse_file([filename: "", data: "x"]) - end + assert {:error, %ArgumentError{message: msg}} = + Firecrawl.parse_file([filename: "", data: "x"]) + + assert msg =~ "filename cannot be empty" end - test "parse_file raises ArgumentError when data is nil" do + test "parse_file returns error tuple when data is nil" do Application.put_env(:firecrawl, :api_key, "test-key") on_exit(fn -> Application.delete_env(:firecrawl, :api_key) end) - assert_raise ArgumentError, ~r/file data cannot be empty/, fn -> - Firecrawl.parse_file([filename: "doc.pdf", data: nil]) - end + assert {:error, %ArgumentError{message: msg}} = + Firecrawl.parse_file([filename: "doc.pdf", data: nil]) + + assert msg =~ "file data cannot be empty" end test "parse_file rejects unknown options" do From bc3c57adbb26b523dbfb4e34283708a85481457e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 6 May 2026 14:42:43 +0200 Subject: [PATCH 15/27] Add monitor API orchestration (ENG-4857) (#3470) Co-authored-by: Cursor --- apps/api/src/__tests__/snips/lib.ts | 1 + apps/api/src/__tests__/snips/v2/crawl.test.ts | 2 +- apps/api/src/__tests__/snips/v2/lib.ts | 90 ++ .../src/__tests__/snips/v2/monitor.test.ts | 142 +++ .../__tests__/snips/v2/scrape-query.test.ts | 4 +- .../snips/v2/types-validation.test.ts | 86 ++ apps/api/src/controllers/v2/monitor.ts | 353 ++++++ 
apps/api/src/harness.ts | 11 +- apps/api/src/lib/gcs-monitoring.ts | 64 + apps/api/src/routes/v2.ts | 67 + apps/api/src/services/billing/types.ts | 1 + apps/api/src/services/logging/log_job.ts | 12 + apps/api/src/services/monitoring/cron.ts | 247 ++++ apps/api/src/services/monitoring/diff.ts | 105 ++ apps/api/src/services/monitoring/queue.ts | 133 ++ apps/api/src/services/monitoring/results.ts | 227 ++++ apps/api/src/services/monitoring/runner.ts | 1098 +++++++++++++++++ apps/api/src/services/monitoring/scheduler.ts | 75 ++ apps/api/src/services/monitoring/store.ts | 589 +++++++++ apps/api/src/services/monitoring/types.ts | 275 +++++ .../services/notification/monitoring_email.ts | 216 ++++ apps/api/src/services/queue-worker.ts | 40 +- apps/api/src/services/webhook/types.ts | 37 + apps/api/src/services/worker/crawl-logic.ts | 4 + apps/api/src/services/worker/scrape-worker.ts | 25 + apps/api/src/types.ts | 6 + apps/dot-net-sdk/Firecrawl/Firecrawl.csproj | 2 +- apps/dot-net-sdk/Firecrawl/FirecrawlClient.cs | 169 +++ .../Firecrawl/FirecrawlHttpClient.cs | 19 + .../Firecrawl/Models/MonitorModels.cs | 225 ++++ apps/elixir-sdk/lib/firecrawl.ex | 270 ++++ apps/elixir-sdk/mix.exs | 2 +- apps/go-sdk/firecrawl.go | 213 +++- apps/go-sdk/http_client.go | 6 + apps/go-sdk/models.go | 134 +- apps/java-sdk/build.gradle.kts | 2 +- .../com/firecrawl/client/FirecrawlClient.java | 167 +++ .../firecrawl/client/FirecrawlHttpClient.java | 21 + .../java/com/firecrawl/models/Monitor.java | 40 + .../com/firecrawl/models/MonitorCheck.java | 42 + .../firecrawl/models/MonitorCheckDetail.java | 15 + .../firecrawl/models/MonitorCheckPage.java | 30 + .../com/firecrawl/models/MonitorSchedule.java | 12 + .../com/firecrawl/models/MonitorSummary.java | 25 + apps/js-sdk/firecrawl/package.json | 2 +- .../src/__tests__/unit/v2/pagination.test.ts | 21 + apps/js-sdk/firecrawl/src/v2/client.ts | 85 ++ .../firecrawl/src/v2/methods/monitor.ts | 181 +++ apps/js-sdk/firecrawl/src/v2/types.ts | 148 +++ 
.../firecrawl/src/v2/utils/httpClient.ts | 14 + .../firecrawl/src/v2/utils/pagination.ts | 21 +- apps/php-sdk/src/Client/FirecrawlClient.php | 172 ++- .../src/Client/FirecrawlHttpClient.php | 11 +- apps/php-sdk/src/Models/Monitor.php | 76 ++ apps/php-sdk/src/Models/MonitorCheck.php | 77 ++ .../php-sdk/src/Models/MonitorCheckDetail.php | 86 ++ apps/php-sdk/src/Version.php | 2 +- apps/python-sdk/firecrawl/__init__.py | 2 +- apps/python-sdk/firecrawl/client.py | 32 + apps/python-sdk/firecrawl/v2/client.py | 115 ++ apps/python-sdk/firecrawl/v2/client_async.py | 115 ++ .../firecrawl/v2/methods/aio/monitor.py | 215 ++++ .../firecrawl/v2/methods/monitor.py | 214 ++++ apps/python-sdk/firecrawl/v2/types.py | 130 ++ .../firecrawl/v2/utils/http_client.py | 48 +- .../firecrawl/v2/utils/http_client_async.py | 42 + apps/ruby-sdk/lib/firecrawl.rb | 1 + apps/ruby-sdk/lib/firecrawl/client.rb | 98 ++ apps/ruby-sdk/lib/firecrawl/http_client.rb | 10 + apps/ruby-sdk/lib/firecrawl/models/monitor.rb | 68 + apps/ruby-sdk/lib/firecrawl/version.rb | 2 +- apps/rust-sdk/Cargo.lock | 2 +- apps/rust-sdk/Cargo.toml | 2 +- apps/rust-sdk/src/lib.rs | 2 + apps/rust-sdk/src/monitor.rs | 370 ++++++ 75 files changed, 7632 insertions(+), 36 deletions(-) create mode 100644 apps/api/src/__tests__/snips/v2/monitor.test.ts create mode 100644 apps/api/src/controllers/v2/monitor.ts create mode 100644 apps/api/src/lib/gcs-monitoring.ts create mode 100644 apps/api/src/services/monitoring/cron.ts create mode 100644 apps/api/src/services/monitoring/diff.ts create mode 100644 apps/api/src/services/monitoring/queue.ts create mode 100644 apps/api/src/services/monitoring/results.ts create mode 100644 apps/api/src/services/monitoring/runner.ts create mode 100644 apps/api/src/services/monitoring/scheduler.ts create mode 100644 apps/api/src/services/monitoring/store.ts create mode 100644 apps/api/src/services/monitoring/types.ts create mode 100644 apps/api/src/services/notification/monitoring_email.ts create mode 
100644 apps/dot-net-sdk/Firecrawl/Models/MonitorModels.cs create mode 100644 apps/java-sdk/src/main/java/com/firecrawl/models/Monitor.java create mode 100644 apps/java-sdk/src/main/java/com/firecrawl/models/MonitorCheck.java create mode 100644 apps/java-sdk/src/main/java/com/firecrawl/models/MonitorCheckDetail.java create mode 100644 apps/java-sdk/src/main/java/com/firecrawl/models/MonitorCheckPage.java create mode 100644 apps/java-sdk/src/main/java/com/firecrawl/models/MonitorSchedule.java create mode 100644 apps/java-sdk/src/main/java/com/firecrawl/models/MonitorSummary.java create mode 100644 apps/js-sdk/firecrawl/src/v2/methods/monitor.ts create mode 100644 apps/php-sdk/src/Models/Monitor.php create mode 100644 apps/php-sdk/src/Models/MonitorCheck.php create mode 100644 apps/php-sdk/src/Models/MonitorCheckDetail.php create mode 100644 apps/python-sdk/firecrawl/v2/methods/aio/monitor.py create mode 100644 apps/python-sdk/firecrawl/v2/methods/monitor.py create mode 100644 apps/ruby-sdk/lib/firecrawl/models/monitor.rb create mode 100644 apps/rust-sdk/src/monitor.rs diff --git a/apps/api/src/__tests__/snips/lib.ts b/apps/api/src/__tests__/snips/lib.ts index 4e2306633d..4d0e461d01 100644 --- a/apps/api/src/__tests__/snips/lib.ts +++ b/apps/api/src/__tests__/snips/lib.ts @@ -23,6 +23,7 @@ export const TEST_PRODUCTION = !TEST_SELF_HOST; // TODO: do we want to run AI tests when users run this command locally? 
It may lead to increased spending for them, depending on configuration export const HAS_AI = !!(config.OPENAI_API_KEY || config.OLLAMA_BASE_URL); +export const HAS_FIREWORKS = !!process.env.FIREWORKS_API_KEY; export const HAS_FIRE_ENGINE = !!config.FIRE_ENGINE_BETA_URL; export const HAS_PLAYWRIGHT = !!config.PLAYWRIGHT_MICROSERVICE_URL; export const HAS_PROXY = !!config.PROXY_SERVER; diff --git a/apps/api/src/__tests__/snips/v2/crawl.test.ts b/apps/api/src/__tests__/snips/v2/crawl.test.ts index 88a8228a8a..53c70e4dbc 100644 --- a/apps/api/src/__tests__/snips/v2/crawl.test.ts +++ b/apps/api/src/__tests__/snips/v2/crawl.test.ts @@ -437,7 +437,7 @@ describe("Crawl tests", () => { async () => { const res = await crawl( { - url: base, + url: new URL("/blog", base).href, prompt: "Crawl everything including external links and subdomains", // Explicit options that should override the prompt diff --git a/apps/api/src/__tests__/snips/v2/lib.ts b/apps/api/src/__tests__/snips/v2/lib.ts index 840a2c7c78..1e5583f71c 100644 --- a/apps/api/src/__tests__/snips/v2/lib.ts +++ b/apps/api/src/__tests__/snips/v2/lib.ts @@ -85,6 +85,96 @@ export async function scrapeWithFailure( return raw.body; } +// ========================================= +// Monitor API +// ========================================= + +export type MonitorCreateInput = { + name: string; + schedule: { cron: string; timezone?: string }; + webhook?: { url: string; headers?: Record }; + notification?: { + email?: { + enabled?: boolean; + recipients?: string[]; + includeDiffs?: boolean; + }; + }; + targets: Array< + | { + type: "scrape"; + urls: string[]; + scrapeOptions?: Record; + } + | { + type: "crawl"; + url: string; + crawlOptions?: Record; + scrapeOptions?: Record; + } + >; + retentionDays?: number; +}; + +export async function monitorCreateRaw( + body: MonitorCreateInput, + identity: Identity, +) { + return await request(TEST_API_URL) + .post("/v2/monitor") + .set("Authorization", `Bearer ${identity.apiKey}`) + 
.set("Content-Type", "application/json") + .send(body); +} + +export async function monitorListRaw(identity: Identity) { + return await request(TEST_API_URL) + .get("/v2/monitor") + .set("Authorization", `Bearer ${identity.apiKey}`); +} + +export async function monitorGetRaw(id: string, identity: Identity) { + return await request(TEST_API_URL) + .get(`/v2/monitor/${id}`) + .set("Authorization", `Bearer ${identity.apiKey}`); +} + +export async function monitorPatchRaw( + id: string, + body: Partial & { status?: "active" | "paused" }, + identity: Identity, +) { + return await request(TEST_API_URL) + .patch(`/v2/monitor/${id}`) + .set("Authorization", `Bearer ${identity.apiKey}`) + .set("Content-Type", "application/json") + .send(body); +} + +export async function monitorDeleteRaw(id: string, identity: Identity) { + return await request(TEST_API_URL) + .delete(`/v2/monitor/${id}`) + .set("Authorization", `Bearer ${identity.apiKey}`); +} + +export async function monitorRunRaw(id: string, identity: Identity) { + return await request(TEST_API_URL) + .post(`/v2/monitor/${id}/run`) + .set("Authorization", `Bearer ${identity.apiKey}`); +} + +export async function monitorCheckRaw( + monitorId: string, + checkId: string, + identity: Identity, + query?: Record, +) { + const req = request(TEST_API_URL) + .get(`/v2/monitor/${monitorId}/checks/${checkId}`) + .set("Authorization", `Bearer ${identity.apiKey}`); + return query ? 
req.query(query) : req; +} + export async function parseRaw( body: { options?: Omit; diff --git a/apps/api/src/__tests__/snips/v2/monitor.test.ts b/apps/api/src/__tests__/snips/v2/monitor.test.ts new file mode 100644 index 0000000000..9c677507ad --- /dev/null +++ b/apps/api/src/__tests__/snips/v2/monitor.test.ts @@ -0,0 +1,142 @@ +import { + createTestIdUrl, + describeIf, + ALLOW_TEST_SUITE_WEBSITE, + TEST_SELF_HOST, +} from "../lib"; +import { + idmux, + Identity, + monitorCheckRaw, + monitorCreateRaw, + monitorDeleteRaw, + monitorGetRaw, + monitorListRaw, + monitorPatchRaw, + monitorRunRaw, + scrapeTimeout, +} from "./lib"; + +describeIf(ALLOW_TEST_SUITE_WEBSITE && !TEST_SELF_HOST)("/v2/monitor", () => { + let identity: Identity; + + beforeAll(async () => { + identity = await idmux({ + name: "monitor", + concurrency: 20, + credits: 1000000, + }); + }, 10000); + + it("creates, lists, gets, pauses, and deletes a monitor", async () => { + const create = await monitorCreateRaw( + { + name: "snips monitor", + schedule: { cron: "*/30 * * * *", timezone: "UTC" }, + targets: [ + { + type: "scrape", + urls: [createTestIdUrl(), createTestIdUrl()], + scrapeOptions: { formats: ["markdown"] }, + }, + ], + notification: { email: { enabled: false } }, + }, + identity, + ); + + expect(create.statusCode).toBe(200); + expect(create.body.success).toBe(true); + expect(create.body.data.id).toEqual(expect.any(String)); + expect(create.body.data.targets[0].id).toEqual(expect.any(String)); + + const id = create.body.data.id; + const list = await monitorListRaw(identity); + expect(list.statusCode).toBe(200); + expect(list.body.data.some((x: any) => x.id === id)).toBe(true); + + const get = await monitorGetRaw(id, identity); + expect(get.statusCode).toBe(200); + expect(get.body.data.id).toBe(id); + + const patch = await monitorPatchRaw(id, { status: "paused" }, identity); + expect(patch.statusCode).toBe(200); + expect(patch.body.data.status).toBe("paused"); + + const del = await 
monitorDeleteRaw(id, identity); + expect(del.statusCode).toBe(200); + expect(del.body.success).toBe(true); + }); + + it("rejects cron schedules under 15 minutes", async () => { + const response = await monitorCreateRaw( + { + name: "too frequent", + schedule: { cron: "*/5 * * * *", timezone: "UTC" }, + targets: [ + { + type: "scrape", + urls: [createTestIdUrl()], + }, + ], + }, + identity, + ); + + expect(response.statusCode).toBe(400); + expect(response.body.success).toBe(false); + expect(response.body.error).toContain("15 minutes"); + }); + + it( + "runs a manual scrape monitor check", + async () => { + const create = await monitorCreateRaw( + { + name: "manual monitor", + schedule: { cron: "*/30 * * * *", timezone: "UTC" }, + targets: [ + { + type: "scrape", + urls: [createTestIdUrl(), createTestIdUrl()], + scrapeOptions: { formats: ["markdown"] }, + }, + ], + }, + identity, + ); + expect(create.statusCode).toBe(200); + + const monitorId = create.body.data.id; + const run = await monitorRunRaw(monitorId, identity); + expect(run.statusCode).toBe(200); + const checkId = run.body.id; + + let check: any; + for (let i = 0; i < 90; i++) { + const raw = await monitorCheckRaw(monitorId, checkId, identity); + expect(raw.statusCode).toBe(200); + check = raw.body.data; + if (["completed", "partial", "failed"].includes(check.status)) break; + await new Promise(resolve => setTimeout(resolve, 1000)); + } + + expect(["completed", "partial"]).toContain(check.status); + expect(check.summary.totalPages).toBeGreaterThanOrEqual(2); + expect(check.pages.length).toBeGreaterThanOrEqual(1); + expect(check.next).toBeUndefined(); + + const firstPage = await monitorCheckRaw(monitorId, checkId, identity, { + limit: 1, + }); + expect(firstPage.statusCode).toBe(200); + expect(firstPage.body.next).toContain("skip=1"); + expect(firstPage.body.next).toContain("limit=1"); + expect(firstPage.body.data.next).toBe(firstPage.body.next); + expect(firstPage.body.data.pages).toHaveLength(1); + + await 
monitorDeleteRaw(monitorId, identity); + }, + 2 * scrapeTimeout, + ); +}); diff --git a/apps/api/src/__tests__/snips/v2/scrape-query.test.ts b/apps/api/src/__tests__/snips/v2/scrape-query.test.ts index 41cbd5b4d2..b088edc1e6 100644 --- a/apps/api/src/__tests__/snips/v2/scrape-query.test.ts +++ b/apps/api/src/__tests__/snips/v2/scrape-query.test.ts @@ -1,4 +1,4 @@ -import { concurrentIf, HAS_AI, TEST_PRODUCTION } from "../lib"; +import { concurrentIf, HAS_AI, HAS_FIREWORKS, TEST_PRODUCTION } from "../lib"; import { scrape, scrapeRaw, @@ -61,7 +61,7 @@ describe("Query format", () => { scrapeTimeout, ); - concurrentIf(TEST_PRODUCTION || HAS_AI)( + concurrentIf(TEST_PRODUCTION || HAS_FIREWORKS)( "returns a direct quote answer when query mode is directQuote", async () => { const response = await scrape( diff --git a/apps/api/src/__tests__/snips/v2/types-validation.test.ts b/apps/api/src/__tests__/snips/v2/types-validation.test.ts index 3ab18a9058..7aa29e744b 100644 --- a/apps/api/src/__tests__/snips/v2/types-validation.test.ts +++ b/apps/api/src/__tests__/snips/v2/types-validation.test.ts @@ -21,6 +21,11 @@ import { SearchRequestInput, toV2CrawlerOptions, } from "../../../controllers/v2/types"; +import { + createMonitorSchema, + updateMonitorSchema, +} from "../../../services/monitoring/types"; +import { getNextMonitorRunAt } from "../../../services/monitoring/cron"; describe("V2 Types Validation", () => { describe("scrapeRequestSchema", () => { @@ -1164,6 +1169,87 @@ describe("V2 Types Validation", () => { }); }); + describe("monitor schedules", () => { + it("should accept natural language schedule text", () => { + const result = createMonitorSchema.parse({ + name: "Blog monitor", + schedule: { + text: "every 30 minutes", + }, + targets: [ + { + type: "scrape", + urls: ["https://example.com"], + }, + ], + }); + + expect(result.schedule).toEqual({ + cron: "*/30 * * * *", + timezone: "UTC", + }); + }); + + it("should accept a natural language start minute", () => { + 
const result = updateMonitorSchema.parse({ + schedule: { + text: "every 15 minutes starting at :07", + }, + }); + + expect(result.schedule).toEqual({ + cron: "7-59/15 * * * *", + timezone: "UTC", + }); + }); + + it("should reject ambiguous schedule definitions", () => { + expect(() => + updateMonitorSchema.parse({ + schedule: { + cron: "*/30 * * * *", + text: "every 30 minutes", + }, + }), + ).toThrow("Schedule must include either cron or text, not both"); + }); + + it("should calculate next runs in the configured timezone", () => { + const next = getNextMonitorRunAt( + "0 9 * * *", + new Date("2026-01-01T13:00:00.000Z"), + "America/New_York", + ); + + expect(next.toISOString()).toBe("2026-01-01T14:00:00.000Z"); + }); + + it("should accept monitor webhook event filters", () => { + const result = updateMonitorSchema.parse({ + webhook: { + url: "https://example.com/webhook", + events: ["monitor.page", "monitor.check.completed"], + }, + }); + + expect(result.webhook?.events).toEqual([ + "monitor.page", + "monitor.check.completed", + ]); + }); + + it("should reject non-monitor webhook event filters", () => { + expect(() => + updateMonitorSchema.parse({ + webhook: { + url: "https://example.com/webhook", + events: ["completed"], + }, + }), + ).toThrow(); + }); + }); + describe("Edge cases", () => { it("should handle URL without protocol (should add http://)", () => { const input = { diff --git a/apps/api/src/controllers/v2/monitor.ts b/apps/api/src/controllers/v2/monitor.ts new file mode 100644 index 0000000000..914b43c859 --- /dev/null +++ b/apps/api/src/controllers/v2/monitor.ts @@ -0,0 +1,353 @@ +import { Response } from "express"; +import { z } from "zod"; +import { RequestWithAuth } from "./types"; +import { getScrapeZDR } from "../../lib/zdr-helpers"; +import { getMonitorDiffArtifact } from "../../lib/gcs-monitoring"; +import { + createMonitorSchema, + listMonitorChecksQuerySchema, + listMonitorsQuerySchema, + monitorCheckDetailQuerySchema, + updateMonitorSchema, 
+} from "../../services/monitoring/types"; +import { + createMonitor, + createMonitorCheck, + countMonitorCheckPages, + deleteMonitor, + estimateMonitorCreditsPerRun, + getMonitor, + getMonitorCheck, + getMonitorForUpdate, + listMonitorCheckPages, + listMonitorChecks, + listMonitors, + updateMonitor, +} from "../../services/monitoring/store"; +import { enqueueMonitorCheck } from "../../services/monitoring/scheduler"; +import { + estimateRunsPerMonth, + validateMonitorCron, +} from "../../services/monitoring/cron"; + +const monitorParamsSchema = z.strictObject({ + monitorId: z.uuid(), +}); + +const monitorCheckParamsSchema = monitorParamsSchema.extend({ + checkId: z.uuid(), +}); + +function rejectZdr( + req: RequestWithAuth, + res: Response, +): boolean { + if (getScrapeZDR(req.acuc?.flags) === "forced") { + res.status(400).json({ + success: false, + error: + "Monitoring requires retained snapshots and diffs, and is not supported for zero data retention teams.", + }); + return true; + } + return false; +} + +function serializeMonitor(monitor: any) { + return { + id: monitor.id, + name: monitor.name, + status: monitor.status, + schedule: { + cron: monitor.schedule_cron, + timezone: monitor.schedule_timezone, + }, + nextRunAt: monitor.next_run_at, + lastRunAt: monitor.last_run_at, + currentCheckId: monitor.current_check_id, + targets: monitor.targets, + webhook: monitor.webhook, + notification: monitor.notification, + retentionDays: monitor.retention_days, + estimatedCreditsPerMonth: monitor.estimated_credits_per_month, + lastCheckSummary: monitor.last_check_summary, + createdAt: monitor.created_at, + updatedAt: monitor.updated_at, + }; +} + +function serializeCheck(check: any) { + return { + id: check.id, + monitorId: check.monitor_id, + status: check.status, + trigger: check.trigger, + scheduledFor: check.scheduled_for, + startedAt: check.started_at, + finishedAt: check.finished_at, + estimatedCredits: check.estimated_credits, + reservedCredits: 
check.reserved_credits, + actualCredits: check.actual_credits, + billingStatus: check.billing_status, + summary: { + totalPages: check.total_pages, + same: check.same_count, + changed: check.changed_count, + new: check.new_count, + removed: check.removed_count, + error: check.error_count, + }, + targetResults: check.target_results, + notificationStatus: check.notification_status, + error: check.error, + createdAt: check.created_at, + updatedAt: check.updated_at, + }; +} + +export async function createMonitorController( + req: RequestWithAuth<{}, any, unknown>, + res: Response, +) { + if (rejectZdr(req, res)) return; + + const input = createMonitorSchema.parse(req.body); + let schedule; + try { + schedule = validateMonitorCron( + input.schedule.cron, + input.schedule.timezone, + ); + } catch (error) { + return res.status(400).json({ + success: false, + error: error instanceof Error ? error.message : String(error), + }); + } + const monitor = await createMonitor({ + teamId: req.auth.team_id, + input, + nextRunAt: schedule.nextRunAt, + intervalMs: schedule.intervalMs, + }); + + res.status(200).json({ + success: true, + data: serializeMonitor(monitor), + }); +} + +export async function listMonitorsController( + req: RequestWithAuth<{}, any, unknown>, + res: Response, +) { + const query = listMonitorsQuerySchema.parse(req.query); + const monitors = await listMonitors({ + teamId: req.auth.team_id, + limit: query.limit, + offset: query.offset, + }); + + res.status(200).json({ + success: true, + data: monitors.map(serializeMonitor), + }); +} + +export async function getMonitorController( + req: RequestWithAuth<{ monitorId: string }, any, unknown>, + res: Response, +) { + const { monitorId } = monitorParamsSchema.parse(req.params); + const monitor = await getMonitor(req.auth.team_id, monitorId); + if (!monitor) { + return res.status(404).json({ success: false, error: "Monitor not found" }); + } + + res.status(200).json({ + success: true, + data: serializeMonitor(monitor), + 
}); +} + +export async function updateMonitorController( + req: RequestWithAuth<{ monitorId: string }, any, unknown>, + res: Response, +) { + if (rejectZdr(req, res)) return; + + const { monitorId } = monitorParamsSchema.parse(req.params); + const existing = await getMonitorForUpdate(req.auth.team_id, monitorId); + if (!existing) { + return res.status(404).json({ success: false, error: "Monitor not found" }); + } + + const input = updateMonitorSchema.parse(req.body); + const cron = input.schedule?.cron ?? existing.schedule_cron; + const timezone = input.schedule?.timezone ?? existing.schedule_timezone; + let schedule; + try { + schedule = validateMonitorCron(cron, timezone); + } catch (error) { + return res.status(400).json({ + success: false, + error: error instanceof Error ? error.message : String(error), + }); + } + const monitor = await updateMonitor({ + teamId: req.auth.team_id, + monitorId, + input, + nextRunAt: input.schedule ? schedule.nextRunAt : undefined, + intervalMs: + input.schedule || input.targets ? 
schedule.intervalMs : undefined, + }); + + res.status(200).json({ + success: true, + data: serializeMonitor(monitor), + }); +} + +export async function deleteMonitorController( + req: RequestWithAuth<{ monitorId: string }, any, unknown>, + res: Response, +) { + const { monitorId } = monitorParamsSchema.parse(req.params); + const deleted = await deleteMonitor({ + teamId: req.auth.team_id, + monitorId, + }); + if (!deleted) { + return res.status(404).json({ success: false, error: "Monitor not found" }); + } + + res.status(200).json({ success: true }); +} + +export async function runMonitorController( + req: RequestWithAuth<{ monitorId: string }, any, unknown>, + res: Response, +) { + if (rejectZdr(req, res)) return; + + const { monitorId } = monitorParamsSchema.parse(req.params); + const monitor = await getMonitorForUpdate(req.auth.team_id, monitorId); + if (!monitor) { + return res.status(404).json({ success: false, error: "Monitor not found" }); + } + if (monitor.current_check_id) { + return res.status(409).json({ + success: false, + error: "Monitor check is already running.", + checkId: monitor.current_check_id, + }); + } + + const check = await createMonitorCheck({ + monitor, + trigger: "manual", + }); + await enqueueMonitorCheck({ + monitorId: monitor.id, + checkId: check.id, + teamId: monitor.team_id, + }); + + res.status(200).json({ + success: true, + id: check.id, + data: serializeCheck(check), + }); +} + +export async function listMonitorChecksController( + req: RequestWithAuth<{ monitorId: string }, any, unknown>, + res: Response, +) { + const { monitorId } = monitorParamsSchema.parse(req.params); + const monitor = await getMonitor(req.auth.team_id, monitorId); + if (!monitor) { + return res.status(404).json({ success: false, error: "Monitor not found" }); + } + + const query = listMonitorChecksQuerySchema.parse(req.query); + const checks = await listMonitorChecks({ + teamId: req.auth.team_id, + monitorId, + limit: query.limit, + offset: query.offset, + 
status: query.status, + }); + + res.status(200).json({ + success: true, + data: checks.map(serializeCheck), + }); +} + +export async function getMonitorCheckController( + req: RequestWithAuth<{ monitorId: string; checkId: string }, any, unknown>, + res: Response, +) { + const { monitorId, checkId } = monitorCheckParamsSchema.parse(req.params); + const query = monitorCheckDetailQuerySchema.parse(req.query); + const skip = query.skip; + const check = await getMonitorCheck(req.auth.team_id, monitorId, checkId); + if (!check) { + return res.status(404).json({ success: false, error: "Check not found" }); + } + + const [pages, totalPagesForFilter] = await Promise.all([ + listMonitorCheckPages({ + teamId: req.auth.team_id, + monitorId, + checkId, + limit: query.limit, + skip, + status: query.status, + }), + countMonitorCheckPages({ + checkId, + status: query.status, + }), + ]); + + const pagesWithDiffs = await Promise.all( + pages.map(async page => ({ + id: page.id, + targetId: page.target_id, + url: page.url, + status: page.status, + previousScrapeId: page.previous_scrape_id, + currentScrapeId: page.current_scrape_id, + statusCode: page.status_code, + error: page.error, + metadata: page.metadata, + diff: await getMonitorDiffArtifact(page.diff_gcs_key), + createdAt: page.created_at, + })), + ); + const nextSkip = skip + pagesWithDiffs.length; + const next = (() => { + if (totalPagesForFilter <= nextSkip) return undefined; + const url = new URL( + `/v2/monitor/${monitorId}/checks/${checkId}`, + `${req.protocol}://${req.get("host")}`, + ); + url.searchParams.set("skip", String(nextSkip)); + url.searchParams.set("limit", String(query.limit)); + if (query.status) url.searchParams.set("status", query.status); + return url.toString(); + })(); + + res.status(200).json({ + success: true, + next, + data: { + ...serializeCheck(check), + pages: pagesWithDiffs, + next, + }, + }); +} diff --git a/apps/api/src/harness.ts b/apps/api/src/harness.ts index 1deef110ac..51e5c18223 100644 --- 
a/apps/api/src/harness.ts +++ b/apps/api/src/harness.ts @@ -1,5 +1,6 @@ import { config } from "./config"; import { type ChildProcess, spawn } from "child_process"; +import { existsSync } from "fs"; import * as net from "net"; import { basename, join } from "path"; import { HTML_TO_MARKDOWN_PATH } from "./natives"; @@ -19,9 +20,15 @@ let nuqRabbitMQContainer: { containerRuntime: string; } | null = null; -// Get the monorepo root (apps/api/dist/src -> ../../../..) +// Get the monorepo root for both tsx source execution and compiled dist execution. // __dirname is available in CommonJS (which this compiles to) -const MONOREPO_ROOT = join(__dirname, "..", "..", "..", ".."); +const SOURCE_MONOREPO_ROOT = join(__dirname, "..", "..", ".."); +const DIST_MONOREPO_ROOT = join(__dirname, "..", "..", "..", ".."); +const MONOREPO_ROOT = existsSync( + join(SOURCE_MONOREPO_ROOT, "apps", "nuq-postgres"), +) + ? SOURCE_MONOREPO_ROOT + : DIST_MONOREPO_ROOT; const NUQ_POSTGRES_PATH = join(MONOREPO_ROOT, "apps", "nuq-postgres"); interface ProcessResult { diff --git a/apps/api/src/lib/gcs-monitoring.ts b/apps/api/src/lib/gcs-monitoring.ts new file mode 100644 index 0000000000..36b3e9e12b --- /dev/null +++ b/apps/api/src/lib/gcs-monitoring.ts @@ -0,0 +1,64 @@ +import { config } from "../config"; +import { storage } from "./gcs-jobs"; + +type MonitorDiffArtifact = { + url: string; + previousScrapeId: string | null; + currentScrapeId: string | null; + text: string; + json: unknown; + generatedAt: string; +}; + +const contentType = "application/json"; + +export function monitorDiffGcsKey(params: { + teamId: string; + monitorId: string; + checkId: string; + pageId: string; +}): string { + return `monitors/${params.teamId}/${params.monitorId}/${params.checkId}/${params.pageId}.diff.json`; +} + +export async function saveMonitorDiffArtifact( + key: string, + artifact: MonitorDiffArtifact, +): Promise<{ textBytes: number; jsonBytes: number }> { + const payload = JSON.stringify(artifact); + if 
(!config.GCS_BUCKET_NAME) { + return { + textBytes: Buffer.byteLength(artifact.text), + jsonBytes: Buffer.byteLength(JSON.stringify(artifact.json ?? null)), + }; + } + + const bucket = storage.bucket(config.GCS_BUCKET_NAME); + await bucket.file(key).save(payload, { + contentType, + resumable: false, + }); + + return { + textBytes: Buffer.byteLength(artifact.text), + jsonBytes: Buffer.byteLength(JSON.stringify(artifact.json ?? null)), + }; +} + +export async function getMonitorDiffArtifact( + key: string | null | undefined, +): Promise { + if (!key || !config.GCS_BUCKET_NAME) return null; + + const bucket = storage.bucket(config.GCS_BUCKET_NAME); + try { + const [contents] = await bucket.file(key).download(); + return JSON.parse(contents.toString()) as MonitorDiffArtifact; + } catch (error) { + const maybeGcsError = error as { code?: number; statusCode?: number }; + if (maybeGcsError.code === 404 || maybeGcsError.statusCode === 404) { + return null; + } + throw error; + } +} diff --git a/apps/api/src/routes/v2.ts b/apps/api/src/routes/v2.ts index d84e30221e..407e1f2de1 100644 --- a/apps/api/src/routes/v2.ts +++ b/apps/api/src/routes/v2.ts @@ -65,6 +65,16 @@ import { scrapeInteractController, scrapeStopInteractiveBrowserController, } from "../controllers/v2/scrape-browser"; +import { + createMonitorController, + deleteMonitorController, + getMonitorCheckController, + getMonitorController, + listMonitorChecksController, + listMonitorsController, + runMonitorController, + updateMonitorController, +} from "../controllers/v2/monitor"; expressWs(express()); @@ -448,6 +458,63 @@ v2Router.get( wrap(activityController), ); +v2Router.post( + "/monitor", + authMiddleware(RateLimiterMode.Crawl), + countryCheck, + checkCreditsMiddleware(1), + blocklistMiddleware, + wrap(createMonitorController), +); + +v2Router.get( + "/monitor", + authMiddleware(RateLimiterMode.CrawlStatus), + wrap(listMonitorsController), +); + +v2Router.get( + "/monitor/:monitorId", + 
authMiddleware(RateLimiterMode.CrawlStatus), + wrap(getMonitorController), +); + +v2Router.patch( + "/monitor/:monitorId", + authMiddleware(RateLimiterMode.Crawl), + countryCheck, + checkCreditsMiddleware(1), + blocklistMiddleware, + wrap(updateMonitorController), +); + +v2Router.delete( + "/monitor/:monitorId", + authMiddleware(RateLimiterMode.CrawlStatus), + wrap(deleteMonitorController), +); + +v2Router.post( + "/monitor/:monitorId/run", + authMiddleware(RateLimiterMode.Crawl), + countryCheck, + checkCreditsMiddleware(1), + blocklistMiddleware, + wrap(runMonitorController), +); + +v2Router.get( + "/monitor/:monitorId/checks", + authMiddleware(RateLimiterMode.CrawlStatus), + wrap(listMonitorChecksController), +); + +v2Router.get( + "/monitor/:monitorId/checks/:checkId", + authMiddleware(RateLimiterMode.CrawlStatus), + wrap(getMonitorCheckController), +); + v2Router.post( "/browser", authMiddleware(RateLimiterMode.Browser), diff --git a/apps/api/src/services/billing/types.ts b/apps/api/src/services/billing/types.ts index e39dfb4499..87ae1ec54d 100644 --- a/apps/api/src/services/billing/types.ts +++ b/apps/api/src/services/billing/types.ts @@ -9,6 +9,7 @@ export type BillingEndpoint = | "interact" | "llms_txt" | "map" + | "monitor" | "parse" | "scrape" | "search"; diff --git a/apps/api/src/services/logging/log_job.ts b/apps/api/src/services/logging/log_job.ts index 1a6a11e011..5241a6d37b 100644 --- a/apps/api/src/services/logging/log_job.ts +++ b/apps/api/src/services/logging/log_job.ts @@ -222,6 +222,8 @@ export type LoggedScrape = { skipNuq: boolean; zeroDataRetention: boolean; is_parse?: boolean; + monitor_id?: string | null; + monitor_check_id?: string | null; }; export async function logScrape(scrape: LoggedScrape, force: boolean = false) { @@ -259,6 +261,12 @@ export async function logScrape(scrape: LoggedScrape, force: boolean = false) { ? null : (scrape.pdf_num_pages ?? null), credits_cost: scrape.credits_cost, + ...(scrape.is_parse + ? 
{} + : { + monitor_id: scrape.monitor_id ?? null, + monitor_check_id: scrape.monitor_check_id ?? null, + }), }, force, logger, @@ -322,6 +330,8 @@ type LoggedCrawl = { credits_cost: number; zeroDataRetention: boolean; cancelled: boolean; + monitor_id?: string | null; + monitor_check_id?: string | null; }; export async function logCrawl(crawl: LoggedCrawl, force: boolean = false) { @@ -350,6 +360,8 @@ export async function logCrawl(crawl: LoggedCrawl, force: boolean = false) { num_docs: crawl.num_docs, credits_cost: crawl.credits_cost, cancelled: crawl.cancelled, + monitor_id: crawl.monitor_id ?? null, + monitor_check_id: crawl.monitor_check_id ?? null, }, force, logger, diff --git a/apps/api/src/services/monitoring/cron.ts b/apps/api/src/services/monitoring/cron.ts new file mode 100644 index 0000000000..cc4313a16f --- /dev/null +++ b/apps/api/src/services/monitoring/cron.ts @@ -0,0 +1,247 @@ +const MIN_MONITOR_INTERVAL_MS = 15 * 60 * 1000; +const SEARCH_LIMIT_MINUTES = 366 * 24 * 60; + +type CronField = Set; +type CronSpec = ReturnType; + +const WEEKDAY_TO_NUMBER: Record = { + sun: 0, + mon: 1, + tue: 2, + wed: 3, + thu: 4, + fri: 5, + sat: 6, +}; + +function parseMinuteStart(value: string | undefined): number { + if (value === undefined) return 0; + const minute = Number(value); + if (!Number.isInteger(minute) || minute < 0 || minute > 59) { + throw new Error("Schedule start minute must be between 0 and 59"); + } + return minute; +} + +function assertDivides(value: number, divisor: number, unit: string) { + if (divisor % value !== 0) { + throw new Error(`${unit} interval must divide evenly into ${divisor}`); + } +} + +export function parseMonitorScheduleText(input: string): string { + const text = input + .trim() + .toLowerCase() + .replace(/[.,]/g, "") + .replace(/\s+/g, " "); + + const minuteMatch = text.match( + /^every (\d+) (?:minutes?|mins?)(?: (?:starting|start)(?: at| on)? 
:?(\d{1,2}))?$/, + ); + if (minuteMatch) { + const interval = Number(minuteMatch[1]); + if (!Number.isInteger(interval) || interval <= 0 || interval > 60) { + throw new Error("Minute interval must be between 1 and 60"); + } + assertDivides(interval, 60, "Minute"); + const startMinute = parseMinuteStart(minuteMatch[2]); + return startMinute === 0 + ? `*/${interval} * * * *` + : `${startMinute}-59/${interval} * * * *`; + } + + const hourlyMatch = text.match( + /^(?:hourly|every hour)(?: (?:at|starting at) :?(\d{1,2}))?$/, + ); + if (hourlyMatch) { + return `${parseMinuteStart(hourlyMatch[1])} * * * *`; + } + + const hourMatch = text.match(/^every (\d+) (?:hours?|hrs?)$/); + if (hourMatch) { + const interval = Number(hourMatch[1]); + if (!Number.isInteger(interval) || interval <= 0 || interval > 24) { + throw new Error("Hour interval must be between 1 and 24"); + } + assertDivides(interval, 24, "Hour"); + return `0 */${interval} * * *`; + } + + const dailyMatch = text.match( + /^(?:daily|every day)(?: at (\d{1,2})(?::(\d{2}))?)?$/, + ); + if (dailyMatch) { + const hour = dailyMatch[1] === undefined ? 0 : Number(dailyMatch[1]); + const minute = dailyMatch[2] === undefined ? 0 : Number(dailyMatch[2]); + if (!Number.isInteger(hour) || hour < 0 || hour > 23) { + throw new Error("Daily schedule hour must be between 0 and 23"); + } + if (!Number.isInteger(minute) || minute < 0 || minute > 59) { + throw new Error("Daily schedule minute must be between 0 and 59"); + } + return `${minute} ${hour} * * *`; + } + + if (text === "weekly" || text === "every week") { + return "0 0 * * 0"; + } + + throw new Error( + "Unsupported schedule text. Try phrases like 'every 30 minutes', 'hourly', or 'daily at 9:00'", + ); +} + +function parseField(field: string, min: number, max: number): CronField { + const values = new Set(); + for (const part of field.split(",")) { + const [rangePart, stepPart] = part.split("/"); + const step = stepPart === undefined ? 
1 : Number(stepPart); + if (!Number.isInteger(step) || step <= 0) { + throw new Error("Invalid cron step"); + } + + let start: number; + let end: number; + if (rangePart === "*") { + start = min; + end = max; + } else if (rangePart.includes("-")) { + const [a, b] = rangePart.split("-").map(Number); + start = a; + end = b; + } else { + start = Number(rangePart); + end = start; + } + + if ( + !Number.isInteger(start) || + !Number.isInteger(end) || + start < min || + end > max || + start > end + ) { + throw new Error("Invalid cron field"); + } + + for (let value = start; value <= end; value += step) { + values.add(value); + } + } + return values; +} + +function parseDayOfWeek(field: string): CronField { + const values = parseField(field, 0, 7); + if (values.has(7)) { + values.add(0); + values.delete(7); + } + return values; +} + +function parseCron(cron: string) { + const parts = cron.trim().split(/\s+/); + if (parts.length !== 5) { + throw new Error("Cron expression must contain five fields"); + } + + return { + minutes: parseField(parts[0], 0, 59), + hours: parseField(parts[1], 0, 23), + daysOfMonth: parseField(parts[2], 1, 31), + months: parseField(parts[3], 1, 12), + daysOfWeek: parseDayOfWeek(parts[4]), + }; +} + +function validateTimeZone(timeZone: string): void { + try { + new Intl.DateTimeFormat("en-US", { timeZone }).format(new Date()); + } catch { + throw new Error(`Invalid monitor schedule timezone: ${timeZone}`); + } +} + +function getZonedParts(date: Date, timeZone: string) { + const parts = new Intl.DateTimeFormat("en-US", { + timeZone, + hourCycle: "h23", + minute: "2-digit", + hour: "2-digit", + day: "2-digit", + month: "2-digit", + weekday: "short", + }).formatToParts(date); + + const values = Object.fromEntries( + parts + .filter(part => part.type !== "literal") + .map(part => [part.type, part.value]), + ); + + return { + minutes: Number(values.minute), + hours: Number(values.hour), + daysOfMonth: Number(values.day), + months: Number(values.month), + 
daysOfWeek: WEEKDAY_TO_NUMBER[String(values.weekday).toLowerCase()], + }; +} + +function matches(date: Date, cron: CronSpec, timeZone: string): boolean { + const zoned = getZonedParts(date, timeZone); + return ( + cron.minutes.has(zoned.minutes) && + cron.hours.has(zoned.hours) && + cron.daysOfMonth.has(zoned.daysOfMonth) && + cron.months.has(zoned.months) && + cron.daysOfWeek.has(zoned.daysOfWeek) + ); +} + +export function getNextMonitorRunAt( + cronExpression: string, + from = new Date(), + timeZone = "UTC", +): Date { + validateTimeZone(timeZone); + const cron = parseCron(cronExpression); + const candidate = new Date(from); + candidate.setUTCSeconds(0, 0); + candidate.setUTCMinutes(candidate.getUTCMinutes() + 1); + + for (let i = 0; i < SEARCH_LIMIT_MINUTES; i++) { + if (matches(candidate, cron, timeZone)) { + return new Date(candidate); + } + candidate.setUTCMinutes(candidate.getUTCMinutes() + 1); + } + + throw new Error("Cron expression did not produce a run within one year"); +} + +export function validateMonitorCron( + cronExpression: string, + timeZone = "UTC", +): { + nextRunAt: Date; + intervalMs: number; +} { + const nextRunAt = getNextMonitorRunAt(cronExpression, new Date(), timeZone); + const secondRunAt = getNextMonitorRunAt(cronExpression, nextRunAt, timeZone); + const intervalMs = secondRunAt.getTime() - nextRunAt.getTime(); + if (intervalMs < MIN_MONITOR_INTERVAL_MS) { + throw new Error( + "Monitor schedule must not run more often than every 15 minutes", + ); + } + + return { nextRunAt, intervalMs }; +} + +export function estimateRunsPerMonth(intervalMs: number): number { + const daysPerMonth = 30; + return Math.ceil((daysPerMonth * 24 * 60 * 60 * 1000) / intervalMs); +} diff --git a/apps/api/src/services/monitoring/diff.ts b/apps/api/src/services/monitoring/diff.ts new file mode 100644 index 0000000000..3197367d36 --- /dev/null +++ b/apps/api/src/services/monitoring/diff.ts @@ -0,0 +1,105 @@ +import gitDiff from "git-diff"; +import parseDiff from 
"parse-diff"; + +type MonitoringDiffResult = + | { + status: "same"; + text?: undefined; + json?: undefined; + } + | { + status: "changed"; + text: string; + json: { + files: Array<{ + from: string | null; + to: string | null; + chunks: Array<{ + content: string; + changes: Array<{ + type: string; + normal?: boolean; + add?: boolean; + del?: boolean; + ln?: number; + ln1?: number; + ln2?: number; + content: string; + }>; + }>; + }>; + }; + }; + +function normalizeMarkdownForChangeTracking(markdown: string): string { + return [...markdown.replace(/\s+/g, "").replace(/\[iframe\]\(.+?\)/g, "")] + .sort() + .join(""); +} + +export function diffMonitorMarkdown( + previousMarkdown: string, + currentMarkdown: string, +): MonitoringDiffResult { + if ( + normalizeMarkdownForChangeTracking(previousMarkdown) === + normalizeMarkdownForChangeTracking(currentMarkdown) + ) { + return { status: "same" }; + } + + const text = gitDiff(previousMarkdown, currentMarkdown, { + color: false, + wordDiff: false, + }); + const structured = parseDiff(text); + + return { + status: "changed", + text, + json: { + files: structured.map(file => ({ + from: file.from || null, + to: file.to || null, + chunks: file.chunks.map(chunk => ({ + content: chunk.content, + changes: chunk.changes.map(change => { + const base = { + type: change.type, + content: change.content, + }; + + if ( + change.type === "normal" && + "ln1" in change && + "ln2" in change + ) { + return { + ...base, + normal: true, + ln1: change.ln1, + ln2: change.ln2, + }; + } + if (change.type === "add" && "ln" in change) { + return { + ...base, + add: true, + ln: change.ln, + }; + } + if (change.type === "del" && "ln" in change) { + return { + ...base, + del: true, + ln: change.ln, + }; + } + + return base; + }), + })), + })), + }, + }; +} diff --git a/apps/api/src/services/monitoring/queue.ts b/apps/api/src/services/monitoring/queue.ts new file mode 100644 index 0000000000..77cc8675f2 --- /dev/null +++ 
b/apps/api/src/services/monitoring/queue.ts @@ -0,0 +1,133 @@ +import amqp from "amqplib"; +import { config } from "../../config"; +import { logger as _logger } from "../../lib/logger"; + +const MONITOR_CHECK_QUEUE = "monitor.checks"; +const MONITOR_CHECK_DLX = "monitor.checks.dlx"; +const MONITOR_CHECK_DLQ = "monitor.checks.dlq"; + +const logger = _logger.child({ module: "monitoring-queue" }); + +export type MonitorCheckJobData = { + monitorId: string; + checkId: string; + teamId: string; +}; + +let connection: amqp.ChannelModel | null = null; +let channel: amqp.Channel | null = null; + +async function getChannel(): Promise { + if (channel) return channel; + + const url = config.NUQ_RABBITMQ_URL; + if (!url) { + throw new Error("NUQ_RABBITMQ_URL is not configured"); + } + + connection = await amqp.connect(url); + channel = await connection.createChannel(); + + await channel.assertExchange(MONITOR_CHECK_DLX, "direct", { durable: true }); + await channel.assertQueue(MONITOR_CHECK_DLQ, { + durable: true, + arguments: { + "x-queue-type": "quorum", + }, + }); + await channel.bindQueue( + MONITOR_CHECK_DLQ, + MONITOR_CHECK_DLX, + MONITOR_CHECK_QUEUE, + ); + + await channel.assertQueue(MONITOR_CHECK_QUEUE, { + durable: true, + arguments: { + "x-queue-type": "quorum", + "x-dead-letter-exchange": MONITOR_CHECK_DLX, + "x-dead-letter-routing-key": MONITOR_CHECK_QUEUE, + "x-delivery-limit": 1, + }, + }); + + connection.on("close", () => { + logger.warn("Monitor queue connection closed"); + connection = null; + channel = null; + }); + + connection.on("error", error => { + logger.error("Monitor queue connection error", { error }); + }); + + return channel; +} + +export async function addMonitorCheckJob( + data: MonitorCheckJobData, +): Promise { + const ch = await getChannel(); + const sent = ch.sendToQueue( + MONITOR_CHECK_QUEUE, + Buffer.from(JSON.stringify(data)), + { + persistent: true, + contentType: "application/json", + messageId: data.checkId, + }, + ); + + if (!sent) { 
+ logger.warn("Monitor check message buffer full", { + monitorId: data.monitorId, + checkId: data.checkId, + }); + } + + logger.info("Monitor check job added to queue", { + monitorId: data.monitorId, + checkId: data.checkId, + teamId: data.teamId, + }); +} + +export async function consumeMonitorCheckJobs( + handler: (data: MonitorCheckJobData) => Promise, +): Promise { + const ch = await getChannel(); + await ch.prefetch(1); + + await ch.consume( + MONITOR_CHECK_QUEUE, + async msg => { + if (!msg) return; + + let data: MonitorCheckJobData; + try { + data = JSON.parse(msg.content.toString()) as MonitorCheckJobData; + } catch (error) { + logger.error("Failed to parse monitor check job", { error }); + ch.nack(msg, false, false); + return; + } + + const jobLogger = logger.child({ + monitorId: data.monitorId, + checkId: data.checkId, + teamId: data.teamId, + }); + + try { + await handler(data); + ch.ack(msg); + } catch (error) { + jobLogger.error("Monitor check job failed", { error }); + ch.nack(msg, false, false); + } + }, + { noAck: false }, + ); + + logger.info("Started consuming monitor check jobs"); +} diff --git a/apps/api/src/services/monitoring/results.ts b/apps/api/src/services/monitoring/results.ts new file mode 100644 index 0000000000..2c06706f18 --- /dev/null +++ b/apps/api/src/services/monitoring/results.ts @@ -0,0 +1,227 @@ +import { v7 as uuidv7 } from "uuid"; +import { NuQJob } from "../worker/nuq"; +import { ScrapeJobData } from "../../types"; +import { getJobFromGCS } from "../../lib/gcs-jobs"; +import { + monitorDiffGcsKey, + saveMonitorDiffArtifact, +} from "../../lib/gcs-monitoring"; +import { logger as _logger } from "../../lib/logger"; +import { createWebhookSender, WebhookEvent } from "../webhook"; +import { diffMonitorMarkdown } from "./diff"; +import { + getMonitorForUpdate, + getMonitorPage, + hashMonitorUrl, + insertMonitorCheckPages, + upsertMonitorPage, +} from "./store"; + +const logger = _logger.child({ module: "monitoring-results" }); + 
+function getDocumentUrl(doc: any, fallback: string): string { + return doc?.metadata?.sourceURL ?? doc?.metadata?.url ?? doc?.url ?? fallback; +} + +function getDocumentStatusCode(doc: any): number | null { + return typeof doc?.metadata?.statusCode === "number" + ? doc.metadata.statusCode + : null; +} + +async function sendMonitorPageWebhook(params: { + teamId: string; + monitorId: string; + checkId: string; + url: string; + status: string; + previousScrapeId?: string | null; + currentScrapeId?: string | null; + error?: string | null; +}) { + try { + const monitor = await getMonitorForUpdate(params.teamId, params.monitorId); + if (!monitor?.webhook) return; + + const sender = await createWebhookSender({ + teamId: params.teamId, + jobId: params.checkId, + webhook: monitor.webhook as any, + v0: false, + }); + + await sender?.send(WebhookEvent.MONITOR_PAGE, { + success: params.status !== "error", + data: { + monitorId: params.monitorId, + checkId: params.checkId, + url: params.url, + status: params.status, + previousScrapeId: params.previousScrapeId ?? null, + currentScrapeId: params.currentScrapeId ?? null, + error: params.error ?? null, + }, + error: params.error ?? 
undefined, + }); + } catch (error) { + logger.warn("Failed to send monitor page webhook", { + error, + monitorId: params.monitorId, + checkId: params.checkId, + url: params.url, + status: params.status, + }); + } +} + +export async function recordMonitorScrapeSuccess( + job: NuQJob, + doc: any, +): Promise { + const monitoring = job.data.monitoring; + if (!monitoring || job.data.mode !== "single_urls") return; + + const url = getDocumentUrl(doc, job.data.url); + const previous = await getMonitorPage({ + monitorId: monitoring.monitorId, + targetId: monitoring.targetId, + url, + }); + + let status: "same" | "new" | "changed" = "new"; + let diffGcsKey: string | null = null; + let diffTextBytes: number | null = null; + let diffJsonBytes: number | null = null; + + if (previous?.last_scrape_id && !previous.is_removed) { + const previousDoc = (await getJobFromGCS(previous.last_scrape_id))?.[0]; + const previousMarkdown = previousDoc?.markdown; + const currentMarkdown = doc?.markdown; + + if (previousMarkdown && currentMarkdown) { + const diff = diffMonitorMarkdown(previousMarkdown, currentMarkdown); + status = diff.status; + + if (diff.status === "changed") { + diffGcsKey = monitorDiffGcsKey({ + teamId: job.data.team_id, + monitorId: monitoring.monitorId, + checkId: monitoring.checkId, + pageId: uuidv7(), + }); + const sizes = await saveMonitorDiffArtifact(diffGcsKey, { + url, + previousScrapeId: previous.last_scrape_id, + currentScrapeId: job.id, + text: diff.text, + json: diff.json, + generatedAt: new Date().toISOString(), + }); + diffTextBytes = sizes.textBytes; + diffJsonBytes = sizes.jsonBytes; + } + } else { + status = "changed"; + } + } + + await upsertMonitorPage({ + monitorId: monitoring.monitorId, + teamId: job.data.team_id, + targetId: monitoring.targetId, + url, + source: monitoring.source, + checkId: monitoring.checkId, + scrapeId: job.id, + status, + metadata: { + title: doc?.metadata?.title ?? 
null, + statusCode: getDocumentStatusCode(doc), + creditsUsed: doc?.metadata?.creditsUsed ?? null, + }, + }); + + await insertMonitorCheckPages([ + { + check_id: monitoring.checkId, + monitor_id: monitoring.monitorId, + team_id: job.data.team_id, + target_id: monitoring.targetId, + url, + url_hash: hashMonitorUrl(url), + status, + previous_scrape_id: previous?.last_scrape_id ?? null, + current_scrape_id: job.id, + diff_gcs_key: diffGcsKey, + diff_text_bytes: diffTextBytes, + diff_json_bytes: diffJsonBytes, + status_code: getDocumentStatusCode(doc), + metadata: { + title: doc?.metadata?.title ?? null, + creditsUsed: doc?.metadata?.creditsUsed ?? null, + }, + }, + ]); + + logger.info("Recorded monitor scrape result", { + monitorId: monitoring.monitorId, + checkId: monitoring.checkId, + targetId: monitoring.targetId, + scrapeId: job.id, + url, + status, + previousScrapeId: previous?.last_scrape_id ?? null, + diffGcsKey, + }); + + await sendMonitorPageWebhook({ + teamId: job.data.team_id, + monitorId: monitoring.monitorId, + checkId: monitoring.checkId, + url, + status, + previousScrapeId: previous?.last_scrape_id ?? null, + currentScrapeId: job.id, + }); +} + +export async function recordMonitorScrapeFailure( + job: NuQJob, + error: unknown, +): Promise { + const monitoring = job.data.monitoring; + if (!monitoring || job.data.mode !== "single_urls") return; + + await insertMonitorCheckPages([ + { + check_id: monitoring.checkId, + monitor_id: monitoring.monitorId, + team_id: job.data.team_id, + target_id: monitoring.targetId, + url: job.data.url, + url_hash: hashMonitorUrl(job.data.url), + status: "error", + current_scrape_id: job.id, + error: error instanceof Error ? error.message : String(error), + }, + ]); + + logger.info("Recorded monitor scrape failure", { + monitorId: monitoring.monitorId, + checkId: monitoring.checkId, + targetId: monitoring.targetId, + scrapeId: job.id, + url: job.data.url, + error: error instanceof Error ? 
error.message : String(error), + }); + + await sendMonitorPageWebhook({ + teamId: job.data.team_id, + monitorId: monitoring.monitorId, + checkId: monitoring.checkId, + url: job.data.url, + status: "error", + currentScrapeId: job.id, + error: error instanceof Error ? error.message : String(error), + }); +} diff --git a/apps/api/src/services/monitoring/runner.ts b/apps/api/src/services/monitoring/runner.ts new file mode 100644 index 0000000000..4afa1ce732 --- /dev/null +++ b/apps/api/src/services/monitoring/runner.ts @@ -0,0 +1,1098 @@ +import { v7 as uuidv7 } from "uuid"; +import { config } from "../../config"; +import { logger as _logger } from "../../lib/logger"; +import { logRequest } from "../logging/log_job"; +import { processJobInternal } from "../worker/scrape-worker"; +import { NuQJob, crawlGroup, scrapeQueue } from "../worker/nuq"; +import { ScrapeJobData } from "../../types"; +import { getJobFromGCS } from "../../lib/gcs-jobs"; +import { + monitorDiffGcsKey, + saveMonitorDiffArtifact, +} from "../../lib/gcs-monitoring"; +import { diffMonitorMarkdown } from "./diff"; +import { autumnService } from "../autumn/autumn.service"; +import { getBillingQueue } from "../queue-service"; +import { + crawlToCrawler, + markCrawlActive, + saveCrawl, + StoredCrawl, +} from "../../lib/crawl-redis"; +import { _addScrapeJobToBullMQ, addScrapeJob } from "../queue-jobs"; +import { + CrawlRequest, + type ScrapeOptions, + crawlRequestSchema, + scrapeRequestSchema, + toV0CrawlerOptions, +} from "../../controllers/v2/types"; +import { createWebhookSender, WebhookEvent } from "../webhook"; +import { sendMonitoringEmailSummary } from "../notification/monitoring_email"; +import { + getMonitorForUpdate, + getMonitorPage, + countMonitorCheckPages, + hashMonitorUrl, + insertMonitorCheckPages, + listActiveMonitorPages, + listMonitorCheckPages, + listRunningMonitorChecks, + markMonitorRunning, + updateMonitorCheck, + updateMonitorScheduleAfterRun, + upsertMonitorPage, +} from "./store"; 
+import type { + MonitorCheckPageInsert, + MonitorCheckRow, + MonitorRow, + MonitorTarget, +} from "./types"; +import { withMarkdownFormat } from "./types"; +import { redisEvictConnection } from "../redis"; +import type { MonitorCheckJobData } from "./queue"; + +const logger = _logger.child({ module: "monitoring-runner" }); +const poll = (ms: number) => new Promise(resolve => setTimeout(resolve, ms)); + +type PageResult = MonitorCheckPageInsert & { + emailStatus?: string; +}; + +function withMonitorScrapeDefaults( + options: Record, +): ScrapeOptions { + return { + maxAge: 0, + ...withMarkdownFormat(options), + }; +} + +function getDocumentUrl(doc: any, fallback: string): string { + return doc?.metadata?.sourceURL ?? doc?.metadata?.url ?? doc?.url ?? fallback; +} + +function getDocumentStatusCode(doc: any): number | null { + return typeof doc?.metadata?.statusCode === "number" + ? doc.metadata.statusCode + : null; +} + +function estimateActualCredits(doc: any, options: any): number { + if (typeof doc?.metadata?.creditsUsed === "number") { + return doc.metadata.creditsUsed; + } + const formats = Array.isArray(options?.formats) ? options.formats : []; + const hasJson = formats.some((format: any) => + typeof format === "string" ? format === "json" : format?.type === "json", + ); + return hasJson ? 5 : 1; +} + +async function runSingleScrape(params: { + monitor: MonitorRow; + check: MonitorCheckRow; + target: MonitorTarget; + url: string; + requestId?: string; +}): Promise<{ scrapeId: string; doc: any; credits: number }> { + const scrapeId = uuidv7(); + const scrapeOptions = scrapeRequestSchema.parse({ + url: params.url, + ...withMonitorScrapeDefaults(params.target.scrapeOptions ?? 
{}), + origin: "monitor", + }); + + await logRequest({ + id: scrapeId, + kind: "scrape", + api_version: "v2", + team_id: params.monitor.team_id, + origin: "monitor", + integration: null, + target_hint: params.url, + zeroDataRetention: false, + api_key_id: null, + }); + + const job: NuQJob = { + id: scrapeId, + status: "active", + createdAt: new Date(), + priority: 20, + data: { + mode: "single_urls", + url: params.url, + team_id: params.monitor.team_id, + scrapeOptions, + internalOptions: { + teamId: params.monitor.team_id, + saveScrapeResultToGCS: !!config.GCS_FIRE_ENGINE_BUCKET_NAME, + bypassBilling: true, + zeroDataRetention: false, + }, + skipNuq: true, + origin: "monitor", + integration: null, + billing: { endpoint: "monitor", jobId: params.check.id }, + requestId: params.requestId, + zeroDataRetention: false, + apiKeyId: null, + }, + }; + + const doc = await processJobInternal(job); + return { + scrapeId, + doc, + credits: estimateActualCredits(doc, scrapeOptions), + }; +} + +async function diffAndPersistPage(params: { + monitor: MonitorRow; + check: MonitorCheckRow; + target: MonitorTarget; + url: string; + scrapeId: string; + doc: any; + source: "explicit" | "discovered"; +}): Promise { + const previous = await getMonitorPage({ + monitorId: params.monitor.id, + targetId: params.target.id, + url: params.url, + }); + + let status: PageResult["status"] = "new"; + let diffGcsKey: string | null = null; + let diffTextBytes: number | null = null; + let diffJsonBytes: number | null = null; + + if (previous?.last_scrape_id && !previous.is_removed) { + const previousDoc = (await getJobFromGCS(previous.last_scrape_id))?.[0]; + const previousMarkdown = previousDoc?.markdown; + const currentMarkdown = params.doc?.markdown; + + if (previousMarkdown && currentMarkdown) { + const diff = diffMonitorMarkdown(previousMarkdown, currentMarkdown); + status = diff.status; + + if (diff.status === "changed") { + diffGcsKey = monitorDiffGcsKey({ + teamId: params.monitor.team_id, + 
monitorId: params.monitor.id, + checkId: params.check.id, + pageId: uuidv7(), + }); + const sizes = await saveMonitorDiffArtifact(diffGcsKey, { + url: params.url, + previousScrapeId: previous.last_scrape_id, + currentScrapeId: params.scrapeId, + text: diff.text, + json: diff.json, + generatedAt: new Date().toISOString(), + }); + diffTextBytes = sizes.textBytes; + diffJsonBytes = sizes.jsonBytes; + } + } else { + status = "changed"; + } + } + + await upsertMonitorPage({ + monitorId: params.monitor.id, + teamId: params.monitor.team_id, + targetId: params.target.id, + url: params.url, + source: params.source, + checkId: params.check.id, + scrapeId: params.scrapeId, + status, + metadata: { + title: params.doc?.metadata?.title ?? null, + statusCode: getDocumentStatusCode(params.doc), + }, + }); + + return { + check_id: params.check.id, + monitor_id: params.monitor.id, + team_id: params.monitor.team_id, + target_id: params.target.id, + url: params.url, + url_hash: hashMonitorUrl(params.url), + status, + previous_scrape_id: previous?.last_scrape_id ?? null, + current_scrape_id: params.scrapeId, + diff_gcs_key: diffGcsKey, + diff_text_bytes: diffTextBytes, + diff_json_bytes: diffJsonBytes, + status_code: getDocumentStatusCode(params.doc), + metadata: { + title: params.doc?.metadata?.title ?? 
null, + }, + emailStatus: status, + }; +} + +async function runScrapeTarget(params: { + monitor: MonitorRow; + check: MonitorCheckRow; + target: MonitorTarget; +}): Promise<{ pages: PageResult[]; credits: number; targetResult: any }> { + if (params.target.type !== "scrape") { + return { pages: [], credits: 0, targetResult: null }; + } + + const pages: PageResult[] = []; + let credits = 0; + + for (const url of params.target.urls) { + try { + const result = await runSingleScrape({ + monitor: params.monitor, + check: params.check, + target: params.target, + url, + }); + credits += result.credits; + pages.push( + await diffAndPersistPage({ + monitor: params.monitor, + check: params.check, + target: params.target, + url, + scrapeId: result.scrapeId, + doc: result.doc, + source: "explicit", + }), + ); + } catch (error) { + pages.push({ + check_id: params.check.id, + monitor_id: params.monitor.id, + team_id: params.monitor.team_id, + target_id: params.target.id, + url, + url_hash: hashMonitorUrl(url), + status: "error", + error: error instanceof Error ? error.message : String(error), + emailStatus: "error", + }); + } + } + + return { + pages, + credits, + targetResult: { + targetId: params.target.id, + type: params.target.type, + pages: pages.length, + credits, + }, + }; +} + +async function runCrawlTarget(params: { + monitor: MonitorRow; + check: MonitorCheckRow; + target: MonitorTarget; +}): Promise<{ pages: PageResult[]; credits: number; targetResult: any }> { + if (params.target.type !== "crawl") { + return { pages: [], credits: 0, targetResult: null }; + } + + const crawlId = uuidv7(); + const body = crawlRequestSchema.parse({ + url: params.target.url, + ...(params.target.crawlOptions ?? {}), + scrapeOptions: withMarkdownFormat(params.target.scrapeOptions ?? 
{}), + origin: "monitor", + }) as CrawlRequest; + + await logRequest({ + id: crawlId, + kind: "crawl", + api_version: "v2", + team_id: params.monitor.team_id, + origin: "monitor", + integration: null, + target_hint: body.url, + zeroDataRetention: false, + api_key_id: null, + }); + + const crawlerOptions = { + ...body, + url: undefined, + scrapeOptions: undefined, + prompt: undefined, + }; + + const sc: StoredCrawl = { + originUrl: body.url, + crawlerOptions: toV0CrawlerOptions(crawlerOptions), + scrapeOptions: body.scrapeOptions, + internalOptions: { + disableSmartWaitCache: true, + teamId: params.monitor.team_id, + saveScrapeResultToGCS: !!config.GCS_FIRE_ENGINE_BUCKET_NAME, + zeroDataRetention: false, + bypassBilling: true, + }, + team_id: params.monitor.team_id, + createdAt: Date.now(), + maxConcurrency: body.maxConcurrency, + zeroDataRetention: false, + }; + + const crawler = crawlToCrawler(crawlId, sc, null); + try { + sc.robots = await crawler.getRobotsTxt( + body.scrapeOptions.skipTlsVerification, + ); + } catch { + // Crawls tolerate robots fetch failures in the public controller too. + } + + await crawlGroup.addGroup(crawlId, sc.team_id, 24 * 60 * 60 * 1000); + await saveCrawl(crawlId, sc); + await markCrawlActive(crawlId); + + await _addScrapeJobToBullMQ( + { + url: body.url, + mode: "kickoff", + team_id: params.monitor.team_id, + crawlerOptions, + scrapeOptions: sc.scrapeOptions, + internalOptions: sc.internalOptions, + origin: "monitor", + integration: null, + billing: { endpoint: "monitor", jobId: params.check.id }, + crawl_id: crawlId, + v1: true, + zeroDataRetention: false, + apiKeyId: null, + }, + uuidv7(), + ); + + const started = Date.now(); + let status = "scraping"; + let total = 0; + while (Date.now() - started < 30 * 60 * 1000) { + const group = await crawlGroup.getGroup(crawlId); + const stats = await scrapeQueue.getGroupNumericStats(crawlId, logger); + status = group?.status ?? "scraping"; + total = + (stats.completed ?? 
0) + + (stats.active ?? 0) + + (stats.queued ?? 0) + + (stats.backlog ?? 0); + if (status !== "active" && status !== "scraping") break; + await poll(1000); + } + + const doneJobs = await scrapeQueue.getCrawlJobsForListing( + crawlId, + Math.max(total, 1), + 0, + logger, + ); + + const pages: PageResult[] = []; + const seen = new Set(); + let credits = 0; + + for (const job of doneJobs) { + const doc = job.returnvalue ?? (await getJobFromGCS(job.id))?.[0]; + if (!doc) continue; + const url = getDocumentUrl(doc, (job.data as any)?.url ?? body.url); + seen.add(hashMonitorUrl(url)); + credits += estimateActualCredits(doc, body.scrapeOptions); + pages.push( + await diffAndPersistPage({ + monitor: params.monitor, + check: params.check, + target: params.target, + url, + scrapeId: job.id, + doc, + source: "discovered", + }), + ); + } + + if (status === "completed") { + const previousPages = await listActiveMonitorPages({ + monitorId: params.monitor.id, + targetId: params.target.id, + }); + for (const previous of previousPages) { + if (seen.has(previous.url_hash)) continue; + await upsertMonitorPage({ + monitorId: params.monitor.id, + teamId: params.monitor.team_id, + targetId: params.target.id, + url: previous.url, + source: previous.source, + checkId: params.check.id, + scrapeId: previous.last_scrape_id, + status: "removed", + metadata: previous.metadata, + }); + pages.push({ + check_id: params.check.id, + monitor_id: params.monitor.id, + team_id: params.monitor.team_id, + target_id: params.target.id, + url: previous.url, + url_hash: previous.url_hash, + status: "removed", + previous_scrape_id: previous.last_scrape_id, + current_scrape_id: null, + emailStatus: "removed", + }); + } + } + + return { + pages, + credits, + targetResult: { + targetId: params.target.id, + type: params.target.type, + crawlId, + status, + pages: pages.length, + credits, + }, + }; +} + +function summarize(pages: PageResult[]) { + return { + totalPages: pages.length, + same: pages.filter(page => 
page.status === "same").length, + changed: pages.filter(page => page.status === "changed").length, + new: pages.filter(page => page.status === "new").length, + removed: pages.filter(page => page.status === "removed").length, + error: pages.filter(page => page.status === "error").length, + }; +} + +async function billMonitorCheck(params: { + monitor: MonitorRow; + check: MonitorCheckRow; + actualCredits: number; + lockId: string | null; +}): Promise { + if (params.lockId) { + await autumnService.finalizeCreditsLock({ + lockId: params.lockId, + action: "confirm", + overrideValue: params.actualCredits, + properties: { + source: "monitorCheck", + endpoint: "monitor", + jobId: params.check.id, + }, + }); + } + + if (params.actualCredits <= 0 || !config.USE_DB_AUTHENTICATION) return; + + await getBillingQueue().add( + "bill_team", + { + team_id: params.monitor.team_id, + subscription_id: undefined, + credits: params.actualCredits, + billing: { endpoint: "monitor", jobId: params.check.id }, + is_extract: false, + timestamp: new Date().toISOString(), + originating_job_id: params.check.id, + api_key_id: null, + autumnTrackInRequest: Boolean(params.lockId), + }, + { + jobId: uuidv7(), + priority: 10, + }, + ); +} + +async function sendNotifications(params: { + monitor: MonitorRow; + check: MonitorCheckRow; + pages: PageResult[]; +}): Promise<{ webhook?: unknown; email?: unknown }> { + const payload = { + monitorId: params.monitor.id, + checkId: params.check.id, + status: params.check.status, + summary: toSummaryObject(params.check), + }; + + let webhookStatus: unknown = { attempted: false }; + if (params.monitor.webhook) { + const sender = await createWebhookSender({ + teamId: params.monitor.team_id, + jobId: params.check.id, + webhook: params.monitor.webhook as any, + v0: false, + }); + await sender?.send(WebhookEvent.MONITOR_CHECK_COMPLETED, { + success: params.check.status === "completed", + data: payload, + error: params.check.error ?? 
undefined, + }); + webhookStatus = { attempted: true, success: true }; + } + + const emailStatus = await sendMonitoringEmailSummary({ + monitor: params.monitor, + check: params.check, + pages: params.pages + .filter(page => page.status !== "same") + .slice(0, 25) + .map(page => ({ + url: page.url, + status: page.status, + error: page.error, + })), + }); + + return { + webhook: webhookStatus, + email: emailStatus, + }; +} + +async function enqueueMonitorScrapeTarget(params: { + monitor: MonitorRow; + check: MonitorCheckRow; + target: MonitorTarget; +}): Promise<{ targetId: string; type: "scrape"; expectedJobs: string[] }> { + if (params.target.type !== "scrape") { + throw new Error("Expected scrape target"); + } + + const expectedJobs: string[] = []; + for (const url of params.target.urls) { + const scrapeId = uuidv7(); + const scrapeOptions = scrapeRequestSchema.parse({ + url, + ...withMonitorScrapeDefaults(params.target.scrapeOptions ?? {}), + origin: "monitor", + }); + + await logRequest({ + id: scrapeId, + kind: "scrape", + api_version: "v2", + team_id: params.monitor.team_id, + origin: "monitor", + integration: null, + target_hint: url, + zeroDataRetention: false, + api_key_id: null, + }); + + await addScrapeJob( + { + mode: "single_urls", + url, + team_id: params.monitor.team_id, + scrapeOptions, + internalOptions: { + teamId: params.monitor.team_id, + saveScrapeResultToGCS: !!config.GCS_FIRE_ENGINE_BUCKET_NAME, + bypassBilling: true, + zeroDataRetention: false, + }, + origin: "monitor", + integration: null, + billing: { endpoint: "monitor", jobId: params.check.id }, + zeroDataRetention: false, + apiKeyId: null, + monitoring: { + monitorId: params.monitor.id, + checkId: params.check.id, + targetId: params.target.id, + source: "explicit", + }, + }, + scrapeId, + 20, + ); + expectedJobs.push(scrapeId); + } + + return { targetId: params.target.id, type: "scrape", expectedJobs }; +} + +async function enqueueMonitorCrawlTarget(params: { + monitor: MonitorRow; + 
check: MonitorCheckRow; + target: MonitorTarget; +}): Promise<{ targetId: string; type: "crawl"; crawlId: string }> { + if (params.target.type !== "crawl") { + throw new Error("Expected crawl target"); + } + + const crawlId = uuidv7(); + const body = crawlRequestSchema.parse({ + url: params.target.url, + ...(params.target.crawlOptions ?? {}), + scrapeOptions: withMonitorScrapeDefaults(params.target.scrapeOptions ?? {}), + origin: "monitor", + }) as CrawlRequest; + + await logRequest({ + id: crawlId, + kind: "crawl", + api_version: "v2", + team_id: params.monitor.team_id, + origin: "monitor", + integration: null, + target_hint: body.url, + zeroDataRetention: false, + api_key_id: null, + }); + + const crawlerOptions = { + ...body, + url: undefined, + scrapeOptions: undefined, + prompt: undefined, + }; + + const sc: StoredCrawl = { + originUrl: body.url, + crawlerOptions: toV0CrawlerOptions(crawlerOptions), + scrapeOptions: body.scrapeOptions, + internalOptions: { + disableSmartWaitCache: true, + teamId: params.monitor.team_id, + saveScrapeResultToGCS: !!config.GCS_FIRE_ENGINE_BUCKET_NAME, + zeroDataRetention: false, + bypassBilling: true, + }, + team_id: params.monitor.team_id, + createdAt: Date.now(), + maxConcurrency: body.maxConcurrency, + zeroDataRetention: false, + }; + + const crawler = crawlToCrawler(crawlId, sc, null); + try { + sc.robots = await crawler.getRobotsTxt( + body.scrapeOptions.skipTlsVerification, + ); + } catch { + // Non-fatal, same as the public crawl controller. 
+ } + + await crawlGroup.addGroup(crawlId, sc.team_id, 24 * 60 * 60 * 1000); + await saveCrawl(crawlId, sc); + await markCrawlActive(crawlId); + + await _addScrapeJobToBullMQ( + { + url: body.url, + mode: "kickoff", + team_id: params.monitor.team_id, + crawlerOptions, + scrapeOptions: sc.scrapeOptions, + internalOptions: sc.internalOptions, + origin: "monitor", + integration: null, + billing: { endpoint: "monitor", jobId: params.check.id }, + crawl_id: crawlId, + v1: true, + zeroDataRetention: false, + apiKeyId: null, + monitoring: { + monitorId: params.monitor.id, + checkId: params.check.id, + targetId: params.target.id, + source: "discovered", + }, + }, + uuidv7(), + ); + + return { targetId: params.target.id, type: "crawl", crawlId }; +} + +export async function processMonitorCheckJob( + job: MonitorCheckJobData, +): Promise { + const monitor = await getMonitorForUpdate(job.teamId, job.monitorId); + if (!monitor) { + throw new Error("Monitor not found"); + } + + await markMonitorRunning({ + monitorId: monitor.id, + checkId: job.checkId, + }); + + let check = await updateMonitorCheck(job.checkId, { + status: "running", + started_at: new Date().toISOString(), + }); + + let lockId: string | null = null; + try { + lockId = await autumnService.lockCredits({ + teamId: monitor.team_id, + value: check.estimated_credits ?? 1, + lockId: `monitor_${check.id}`, + expiresAt: Date.now() + 60 * 60 * 1000, + properties: { + source: "monitorCheck", + endpoint: "monitor", + jobId: check.id, + }, + }); + + check = await updateMonitorCheck(check.id, { + autumn_lock_id: lockId, + reserved_credits: lockId ? (check.estimated_credits ?? 1) : null, + billing_status: lockId ? "reserved" : "not_applicable", + }); + + const targetResults: unknown[] = []; + + for (const target of monitor.targets) { + const result = + target.type === "scrape" + ? 
await enqueueMonitorScrapeTarget({ monitor, check, target }) + : await enqueueMonitorCrawlTarget({ monitor, check, target }); + targetResults.push(result); + } + + await updateMonitorCheck(check.id, { + target_results: targetResults, + }); + } catch (error) { + if (lockId) { + await autumnService.finalizeCreditsLock({ + lockId, + action: "release", + properties: { + source: "monitorCheck", + endpoint: "monitor", + jobId: check.id, + }, + }); + } + + check = await updateMonitorCheck(check.id, { + status: "failed", + finished_at: new Date().toISOString(), + billing_status: lockId ? "released" : "failed", + error: error instanceof Error ? error.message : String(error), + }); + + await sendNotifications({ + monitor, + check, + pages: [], + }).catch(err => + logger.warn("Failed to send monitor failure notifications", { + error: err, + }), + ); + + await updateMonitorScheduleAfterRun({ + monitor, + check, + }); + + throw error; + } +} + +async function processRemovedPagesForCompletedCrawls(params: { + monitor: MonitorRow; + check: MonitorCheckRow; + targetResults: any[]; +}): Promise { + for (const target of params.targetResults) { + if (target?.type !== "crawl" || target.removedProcessed) continue; + + const group = await crawlGroup.getGroup(target.crawlId); + if (group?.status !== "completed") continue; + + const checkPages = await listMonitorCheckPages({ + teamId: params.monitor.team_id, + monitorId: params.monitor.id, + checkId: params.check.id, + limit: 100000, + skip: 0, + }); + const seen = new Set( + checkPages + .filter(page => page.target_id === target.targetId) + .map(page => page.url_hash), + ); + const activePages = await listActiveMonitorPages({ + monitorId: params.monitor.id, + targetId: target.targetId, + }); + + const removed: MonitorCheckPageInsert[] = []; + for (const previous of activePages) { + if (seen.has(previous.url_hash)) continue; + await upsertMonitorPage({ + monitorId: params.monitor.id, + teamId: params.monitor.team_id, + targetId: 
target.targetId, + url: previous.url, + source: previous.source, + checkId: params.check.id, + scrapeId: previous.last_scrape_id, + status: "removed", + metadata: previous.metadata, + }); + removed.push({ + check_id: params.check.id, + monitor_id: params.monitor.id, + team_id: params.monitor.team_id, + target_id: target.targetId, + url: previous.url, + url_hash: previous.url_hash, + status: "removed" as const, + previous_scrape_id: previous.last_scrape_id, + current_scrape_id: null, + }); + } + + await insertMonitorCheckPages(removed); + target.removedProcessed = true; + } +} + +async function isMonitorCheckComplete( + check: MonitorCheckRow, +): Promise { + const targetResults = Array.isArray(check.target_results) + ? (check.target_results as any[]) + : []; + + if (targetResults.length === 0) return false; + + for (const target of targetResults) { + if (target?.type === "scrape") { + const expected = Array.isArray(target.expectedJobs) + ? target.expectedJobs.length + : 0; + const recorded = await countMonitorCheckPages({ + checkId: check.id, + targetId: target.targetId, + }); + if (recorded < expected) return false; + } else if (target?.type === "crawl") { + const group = await crawlGroup.getGroup(target.crawlId); + if (!group || group.status === "active") return false; + + const stats = await scrapeQueue.getGroupNumericStats( + target.crawlId, + logger, + ); + const unfinished = + (stats.active ?? 0) + (stats.queued ?? 0) + (stats.backlog ?? 
0); + if (unfinished > 0) return false; + } + } + + return true; +} + +export async function reconcileRunningMonitorChecks( + limit: number = 50, +): Promise { + const checks = await listRunningMonitorChecks(limit); + for (const check of checks) { + const lockKey = `monitor-check-finalize:${check.id}`; + const lock = await redisEvictConnection.set(lockKey, "1", "EX", 60, "NX"); + if (lock !== "OK") continue; + + try { + const monitor = await getMonitorForUpdate( + check.team_id, + check.monitor_id, + ); + if (!monitor) continue; + + const targetResults = Array.isArray(check.target_results) + ? ([...check.target_results] as any[]) + : []; + + await processRemovedPagesForCompletedCrawls({ + monitor, + check, + targetResults, + }); + + if ( + !(await isMonitorCheckComplete({ + ...check, + target_results: targetResults, + })) + ) { + await updateMonitorCheck(check.id, { target_results: targetResults }); + continue; + } + + const [same, changed, newCount, removed, errorCount] = await Promise.all([ + countMonitorCheckPages({ checkId: check.id, status: "same" }), + countMonitorCheckPages({ checkId: check.id, status: "changed" }), + countMonitorCheckPages({ checkId: check.id, status: "new" }), + countMonitorCheckPages({ checkId: check.id, status: "removed" }), + countMonitorCheckPages({ checkId: check.id, status: "error" }), + ]); + const totalPages = same + changed + newCount + removed + errorCount; + const actualCredits = totalPages; + + let finalized = await updateMonitorCheck(check.id, { + status: errorCount > 0 ? "partial" : "completed", + finished_at: new Date().toISOString(), + actual_credits: actualCredits, + billing_status: check.autumn_lock_id ? 
"confirmed" : "not_applicable", + total_pages: totalPages, + same_count: same, + changed_count: changed, + new_count: newCount, + removed_count: removed, + error_count: errorCount, + target_results: targetResults, + }); + + try { + await billMonitorCheck({ + monitor, + check: finalized, + actualCredits, + lockId: check.autumn_lock_id, + }); + } catch (error) { + logger.warn("Failed to bill monitor check during reconciliation", { + monitorId: monitor.id, + checkId: finalized.id, + error, + }); + finalized = await updateMonitorCheck(check.id, { + billing_status: "failed", + }).catch(updateError => { + logger.warn("Failed to record monitor check billing failure", { + monitorId: monitor.id, + checkId: finalized.id, + error: updateError, + }); + return finalized; + }); + } + + let notificationStatus: { webhook?: unknown; email?: unknown } | null = + null; + try { + const pages = (await listMonitorCheckPages({ + teamId: monitor.team_id, + monitorId: monitor.id, + checkId: check.id, + limit: 100, + skip: 0, + })) as PageResult[]; + + notificationStatus = await sendNotifications({ + monitor, + check: finalized, + pages, + }); + + finalized = await updateMonitorCheck(check.id, { + notification_status: notificationStatus, + webhook_payload: notificationStatus.webhook + ? { summary: toSummaryObject(finalized) } + : null, + email_payload: notificationStatus.email + ? { summary: toSummaryObject(finalized) } + : null, + }); + } catch (error) { + logger.warn("Failed to send monitor check notifications", { + monitorId: monitor.id, + checkId: finalized.id, + error, + }); + notificationStatus = { + webhook: { + attempted: !!monitor.webhook, + success: false, + error: error instanceof Error ? error.message : String(error), + }, + email: { + attempted: !!monitor.notification?.email?.enabled, + success: false, + error: error instanceof Error ? 
error.message : String(error), + }, + }; + finalized = await updateMonitorCheck(check.id, { + notification_status: notificationStatus, + }).catch(updateError => { + logger.warn("Failed to record monitor check notification failure", { + monitorId: monitor.id, + checkId: finalized.id, + error: updateError, + }); + return finalized; + }); + } + + await updateMonitorScheduleAfterRun({ + monitor, + check: finalized, + summary: toSummaryObject(finalized), + }); + + logger.info("Reconciled monitor check", { + monitorId: monitor.id, + checkId: finalized.id, + status: finalized.status, + totalPages, + same, + changed, + new: newCount, + removed, + errors: errorCount, + }); + } catch (error) { + logger.warn("Failed to reconcile monitor check", { + error, + checkId: check.id, + }); + } finally { + await redisEvictConnection.del(lockKey); + } + } +} + +function toSummaryObject(check: MonitorCheckRow) { + return { + totalPages: check.total_pages, + same: check.same_count, + changed: check.changed_count, + new: check.new_count, + removed: check.removed_count, + error: check.error_count, + }; +} diff --git a/apps/api/src/services/monitoring/scheduler.ts b/apps/api/src/services/monitoring/scheduler.ts new file mode 100644 index 0000000000..99b817b34e --- /dev/null +++ b/apps/api/src/services/monitoring/scheduler.ts @@ -0,0 +1,75 @@ +import { randomUUID } from "crypto"; +import { logger as _logger } from "../../lib/logger"; +import { addMonitorCheckJob } from "./queue"; +import { + advanceMonitorAfterSkippedCheck, + claimDueMonitors, + createMonitorCheck, + updateMonitorCheck, +} from "./store"; + +const logger = _logger.child({ module: "monitoring-scheduler" }); + +export async function enqueueMonitorCheck(params: { + monitorId: string; + checkId: string; + teamId: string; +}): Promise { + await addMonitorCheckJob(params); +} + +export async function enqueueDueMonitorChecks( + params: { + workerId?: string; + limit?: number; + leaseSeconds?: number; + } = {}, +): Promise { + const 
workerId = params.workerId ?? `monitor-scheduler-${randomUUID()}`; + const monitors = await claimDueMonitors({ + workerId, + limit: params.limit ?? 10, + leaseSeconds: params.leaseSeconds ?? 60, + }); + + let enqueued = 0; + for (const monitor of monitors) { + try { + if (monitor.current_check_id) { + const skipped = await createMonitorCheck({ + monitor, + trigger: "scheduled", + scheduledFor: monitor.next_run_at, + status: "skipped_overlap", + }); + const finished = await updateMonitorCheck(skipped.id, { + status: "skipped_overlap", + finished_at: new Date().toISOString(), + error: "Previous monitor check is still running.", + }); + await advanceMonitorAfterSkippedCheck({ monitor, check: finished }); + continue; + } + + const check = await createMonitorCheck({ + monitor, + trigger: "scheduled", + scheduledFor: monitor.next_run_at, + }); + await enqueueMonitorCheck({ + monitorId: monitor.id, + checkId: check.id, + teamId: monitor.team_id, + }); + enqueued++; + } catch (error) { + logger.error("Failed to enqueue due monitor check", { + error, + monitorId: monitor.id, + teamId: monitor.team_id, + }); + } + } + + return enqueued; +} diff --git a/apps/api/src/services/monitoring/store.ts b/apps/api/src/services/monitoring/store.ts new file mode 100644 index 0000000000..2d4e51dffa --- /dev/null +++ b/apps/api/src/services/monitoring/store.ts @@ -0,0 +1,589 @@ +import { createHash } from "crypto"; +import { v7 as uuidv7 } from "uuid"; +import { supabase_rr_service, supabase_service } from "../supabase"; +import { getNextMonitorRunAt, estimateRunsPerMonth } from "./cron"; +import type { + CreateMonitorRequest, + MonitorCheckPageInsert, + MonitorCheckRow, + MonitorPageRow, + MonitorRow, + MonitorSummary, + MonitorTarget, + UpdateMonitorRequest, +} from "./types"; + +export function hashMonitorUrl(url: string): string { + return `\\x${createHash("sha256").update(url).digest("hex")}`; +} + +function ensureTargetIds(targets: Array>): MonitorTarget[] { + return 
targets.map(target => ({ + ...target, + id: typeof target.id === "string" ? target.id : uuidv7(), + })) as MonitorTarget[]; +} + +function estimateTargetCredits(target: MonitorTarget): number { + if (target.type === "scrape") { + return target.urls.length; + } + + const limit = + typeof target.crawlOptions?.limit === "number" + ? target.crawlOptions.limit + : 10000; + return Math.max(1, limit); +} + +export function estimateMonitorCreditsPerRun(targets: MonitorTarget[]): number { + return targets.reduce( + (sum, target) => sum + estimateTargetCredits(target), + 0, + ); +} + +function toMonitorSummary(check: MonitorCheckRow): MonitorSummary { + return { + totalPages: check.total_pages, + same: check.same_count, + changed: check.changed_count, + new: check.new_count, + removed: check.removed_count, + error: check.error_count, + }; +} + +function throwIfError(error: any, message: string): void { + if (error) { + throw new Error(`${message}: ${error.message ?? JSON.stringify(error)}`); + } +} + +export async function createMonitor(params: { + teamId: string; + input: CreateMonitorRequest; + nextRunAt: Date; + intervalMs: number; +}): Promise { + const targets = ensureTargetIds(params.input.targets); + const estimatedCreditsPerRun = estimateMonitorCreditsPerRun(targets); + const estimatedCreditsPerMonth = + estimatedCreditsPerRun * estimateRunsPerMonth(params.intervalMs); + + const { data, error } = await supabase_service + .from("monitors") + .insert({ + id: uuidv7(), + team_id: params.teamId, + name: params.input.name, + schedule_cron: params.input.schedule.cron, + schedule_timezone: params.input.schedule.timezone, + next_run_at: params.nextRunAt.toISOString(), + retention_days: params.input.retentionDays, + estimated_credits_per_month: estimatedCreditsPerMonth, + targets, + webhook: params.input.webhook ?? null, + notification: params.input.notification ?? 
null, + }) + .select("*") + .single(); + + throwIfError(error, "Failed to create monitor"); + return data as MonitorRow; +} + +export async function listMonitors(params: { + teamId: string; + limit: number; + offset: number; +}): Promise { + const { data, error } = await supabase_rr_service + .from("monitors") + .select("*") + .eq("team_id", params.teamId) + .neq("status", "deleted") + .order("created_at", { ascending: false }) + .range(params.offset, params.offset + params.limit - 1); + + throwIfError(error, "Failed to list monitors"); + return (data ?? []) as MonitorRow[]; +} + +export async function getMonitor( + teamId: string, + monitorId: string, +): Promise { + const { data, error } = await supabase_rr_service + .from("monitors") + .select("*") + .eq("id", monitorId) + .eq("team_id", teamId) + .neq("status", "deleted") + .maybeSingle(); + + throwIfError(error, "Failed to get monitor"); + return data as MonitorRow | null; +} + +export async function getMonitorForUpdate( + teamId: string, + monitorId: string, +): Promise { + const { data, error } = await supabase_service + .from("monitors") + .select("*") + .eq("id", monitorId) + .eq("team_id", teamId) + .neq("status", "deleted") + .maybeSingle(); + + throwIfError(error, "Failed to get monitor"); + return data as MonitorRow | null; +} + +export async function updateMonitor(params: { + teamId: string; + monitorId: string; + input: UpdateMonitorRequest; + nextRunAt?: Date; + intervalMs?: number; +}): Promise { + const patch: Record = { + updated_at: new Date().toISOString(), + }; + + if (params.input.name !== undefined) patch.name = params.input.name; + if (params.input.status !== undefined) patch.status = params.input.status; + if (params.input.webhook !== undefined) + patch.webhook = params.input.webhook ?? null; + if (params.input.notification !== undefined) { + patch.notification = params.input.notification ?? 
null; + } + if (params.input.retentionDays !== undefined) { + patch.retention_days = params.input.retentionDays; + } + if (params.input.targets !== undefined) { + const targets = ensureTargetIds(params.input.targets); + patch.targets = targets; + if (params.intervalMs !== undefined) { + patch.estimated_credits_per_month = + estimateMonitorCreditsPerRun(targets) * + estimateRunsPerMonth(params.intervalMs); + } + } + if (params.input.schedule !== undefined) { + patch.schedule_cron = params.input.schedule.cron; + patch.schedule_timezone = params.input.schedule.timezone; + patch.next_run_at = params.nextRunAt?.toISOString() ?? null; + } + + const { data, error } = await supabase_service + .from("monitors") + .update(patch) + .eq("id", params.monitorId) + .eq("team_id", params.teamId) + .neq("status", "deleted") + .select("*") + .maybeSingle(); + + throwIfError(error, "Failed to update monitor"); + return data as MonitorRow | null; +} + +export async function deleteMonitor(params: { + teamId: string; + monitorId: string; +}): Promise { + const { data, error } = await supabase_service + .from("monitors") + .update({ + status: "deleted", + deleted_at: new Date().toISOString(), + next_run_at: null, + updated_at: new Date().toISOString(), + }) + .eq("id", params.monitorId) + .eq("team_id", params.teamId) + .neq("status", "deleted") + .select("id") + .maybeSingle(); + + throwIfError(error, "Failed to delete monitor"); + return !!data; +} + +export async function createMonitorCheck(params: { + monitor: MonitorRow; + trigger: "scheduled" | "manual"; + scheduledFor?: string | null; + status?: MonitorCheckRow["status"]; +}): Promise { + const estimated = estimateMonitorCreditsPerRun(params.monitor.targets); + const { data, error } = await supabase_service + .from("monitor_checks") + .insert({ + id: uuidv7(), + monitor_id: params.monitor.id, + team_id: params.monitor.team_id, + trigger: params.trigger, + status: params.status ?? "queued", + scheduled_for: params.scheduledFor ?? 
null, + estimated_credits: estimated, + }) + .select("*") + .single(); + + throwIfError(error, "Failed to create monitor check"); + return data as MonitorCheckRow; +} + +export async function markMonitorRunning(params: { + monitorId: string; + checkId: string; +}): Promise { + const { error } = await supabase_service + .from("monitors") + .update({ + current_check_id: params.checkId, + updated_at: new Date().toISOString(), + }) + .eq("id", params.monitorId) + .is("current_check_id", null); + + throwIfError(error, "Failed to mark monitor running"); +} + +export async function updateMonitorScheduleAfterRun(params: { + monitor: MonitorRow; + check: MonitorCheckRow; + summary?: MonitorSummary; +}): Promise { + const nextRunAt = + params.monitor.status === "active" + ? getNextMonitorRunAt( + params.monitor.schedule_cron, + new Date(), + params.monitor.schedule_timezone, + ).toISOString() + : null; + const { error } = await supabase_service + .from("monitors") + .update({ + current_check_id: null, + locked_at: null, + locked_until: null, + last_run_at: params.check.finished_at ?? new Date().toISOString(), + last_check_id: params.check.id, + next_run_at: nextRunAt, + last_check_summary: params.summary ?? toMonitorSummary(params.check), + updated_at: new Date().toISOString(), + }) + .eq("id", params.monitor.id); + + throwIfError(error, "Failed to update monitor after run"); +} + +export async function advanceMonitorAfterSkippedCheck(params: { + monitor: MonitorRow; + check: MonitorCheckRow; +}): Promise { + const nextRunAt = + params.monitor.status === "active" + ? getNextMonitorRunAt( + params.monitor.schedule_cron, + new Date(), + params.monitor.schedule_timezone, + ).toISOString() + : null; + const { error } = await supabase_service + .from("monitors") + .update({ + locked_at: null, + locked_until: null, + last_run_at: params.check.finished_at ?? 
new Date().toISOString(), + last_check_id: params.check.id, + next_run_at: nextRunAt, + last_check_summary: toMonitorSummary(params.check), + updated_at: new Date().toISOString(), + }) + .eq("id", params.monitor.id); + + throwIfError(error, "Failed to advance monitor after skipped check"); +} + +export async function getMonitorCheck( + teamId: string, + monitorId: string, + checkId: string, +): Promise { + const { data, error } = await supabase_rr_service + .from("monitor_checks") + .select("*") + .eq("id", checkId) + .eq("monitor_id", monitorId) + .eq("team_id", teamId) + .maybeSingle(); + + throwIfError(error, "Failed to get monitor check"); + return data as MonitorCheckRow | null; +} + +export async function listRunningMonitorChecks( + limit: number = 100, +): Promise { + const { data, error } = await supabase_service + .from("monitor_checks") + .select("*") + .eq("status", "running") + .order("created_at", { ascending: true }) + .limit(limit); + + throwIfError(error, "Failed to list running monitor checks"); + return (data ?? []) as MonitorCheckRow[]; +} + +export async function listMonitorChecks(params: { + teamId: string; + monitorId: string; + limit: number; + offset: number; + status?: MonitorCheckRow["status"]; +}): Promise { + let query = supabase_rr_service + .from("monitor_checks") + .select("*") + .eq("monitor_id", params.monitorId) + .eq("team_id", params.teamId) + .order("created_at", { ascending: false }); + + if (params.status) { + query = query.eq("status", params.status); + } + + const { data, error } = await query.range( + params.offset, + params.offset + params.limit - 1, + ); + + throwIfError(error, "Failed to list monitor checks"); + return (data ?? 
[]) as MonitorCheckRow[]; +} + +export async function updateMonitorCheck( + checkId: string, + patch: Partial, +): Promise { + const { data, error } = await supabase_service + .from("monitor_checks") + .update({ + ...patch, + updated_at: new Date().toISOString(), + }) + .eq("id", checkId) + .select("*") + .single(); + + throwIfError(error, "Failed to update monitor check"); + return data as MonitorCheckRow; +} + +export async function insertMonitorCheckPages( + pages: MonitorCheckPageInsert[], +): Promise { + if (pages.length === 0) return; + + const { error } = await supabase_service.from("monitor_check_pages").insert( + pages.map(page => ({ + id: uuidv7(), + ...page, + url_hash: page.url_hash ?? hashMonitorUrl(page.url), + })), + ); + + throwIfError(error, "Failed to insert monitor check pages"); +} + +export async function listMonitorCheckPages(params: { + teamId: string; + monitorId: string; + checkId: string; + limit: number; + skip: number; + status?: string; +}): Promise { + let query = supabase_rr_service + .from("monitor_check_pages") + .select("*") + .eq("check_id", params.checkId) + .eq("monitor_id", params.monitorId) + .eq("team_id", params.teamId) + .order("created_at", { ascending: true }); + + if (params.status) { + query = query.eq("status", params.status); + } + + const { data, error } = await query.range( + params.skip, + params.skip + params.limit - 1, + ); + + throwIfError(error, "Failed to list monitor check pages"); + return data ?? 
[]; +} + +export async function countMonitorCheckPages(params: { + checkId: string; + targetId?: string; + status?: string; +}): Promise { + let query = supabase_rr_service + .from("monitor_check_pages") + .select("id", { count: "exact", head: true }) + .eq("check_id", params.checkId); + + if (params.targetId) { + query = query.eq("target_id", params.targetId); + } + if (params.status) { + query = query.eq("status", params.status); + } + + const { count, error } = await query; + throwIfError(error, "Failed to count monitor check pages"); + return count ?? 0; +} + +export async function getMonitorPage(params: { + monitorId: string; + targetId: string; + url: string; +}): Promise { + const { data, error } = await supabase_rr_service + .from("monitor_pages") + .select("*") + .eq("monitor_id", params.monitorId) + .eq("target_id", params.targetId) + .eq("url_hash", hashMonitorUrl(params.url)) + .maybeSingle(); + + throwIfError(error, "Failed to get monitor page"); + return data as MonitorPageRow | null; +} + +export async function upsertMonitorPage(params: { + monitorId: string; + teamId: string; + targetId: string; + url: string; + source: "explicit" | "discovered"; + checkId: string; + scrapeId: string | null; + status: "same" | "new" | "changed" | "removed" | "error"; + metadata?: unknown; +}): Promise { + const now = new Date().toISOString(); + + const existing = await getMonitorPage({ + monitorId: params.monitorId, + targetId: params.targetId, + url: params.url, + }); + + if (!existing) { + const { error } = await supabase_service.from("monitor_pages").insert({ + monitor_id: params.monitorId, + team_id: params.teamId, + target_id: params.targetId, + url: params.url, + url_hash: hashMonitorUrl(params.url), + source: params.source, + first_seen_check_id: params.checkId, + last_seen_check_id: + params.status === "removed" ? undefined : params.checkId, + last_changed_check_id: + params.status === "changed" || params.status === "new" + ? 
params.checkId + : undefined, + last_scrape_id: params.scrapeId, + last_status: params.status, + is_removed: params.status === "removed", + removed_at: params.status === "removed" ? now : null, + metadata: params.metadata ?? null, + created_at: now, + updated_at: now, + }); + + throwIfError(error, "Failed to insert monitor page"); + return; + } + + const patch: Record = { + last_status: params.status, + is_removed: params.status === "removed", + removed_at: params.status === "removed" ? now : null, + metadata: params.metadata ?? existing.metadata ?? null, + updated_at: now, + }; + if (params.status !== "removed") { + patch.last_seen_check_id = params.checkId; + patch.last_scrape_id = params.scrapeId; + } + if (params.status === "changed" || params.status === "new") { + patch.last_changed_check_id = params.checkId; + } + + const { error } = await supabase_service + .from("monitor_pages") + .update(patch) + .eq("id", existing.id); + + throwIfError(error, "Failed to update monitor page"); +} + +export async function listActiveMonitorPages(params: { + monitorId: string; + targetId: string; +}): Promise { + const pageSize = 1000; + const pages: MonitorPageRow[] = []; + let offset = 0; + + while (true) { + const { data, error } = await supabase_rr_service + .from("monitor_pages") + .select("*") + .eq("monitor_id", params.monitorId) + .eq("target_id", params.targetId) + .eq("is_removed", false) + .order("created_at", { ascending: true }) + .range(offset, offset + pageSize - 1); + + throwIfError(error, "Failed to list active monitor pages"); + const batch = (data ?? 
[]) as MonitorPageRow[]; + pages.push(...batch); + if (batch.length < pageSize) break; + offset += pageSize; + } + + return pages; +} + +export async function claimDueMonitors(params: { + workerId: string; + limit: number; + leaseSeconds: number; +}): Promise { + const { data, error } = await supabase_service.rpc( + "monitoring_claim_due_monitors", + { + p_worker_id: params.workerId, + p_limit: params.limit, + p_lease_seconds: params.leaseSeconds, + }, + ); + + throwIfError(error, "Failed to claim due monitors"); + return (data ?? []) as MonitorRow[]; +} diff --git a/apps/api/src/services/monitoring/types.ts b/apps/api/src/services/monitoring/types.ts new file mode 100644 index 0000000000..7a1aab5214 --- /dev/null +++ b/apps/api/src/services/monitoring/types.ts @@ -0,0 +1,275 @@ +import { z } from "zod"; +import { + crawlerOptions, + URL as urlSchema, + type ScrapeOptions, +} from "../../controllers/v2/types"; +import { createWebhookSchema } from "../webhook/schema"; +import { parseMonitorScheduleText } from "./cron"; + +const formatSchema = z.union([z.string(), z.record(z.string(), z.unknown())]); + +const scrapeOptionsSchema = z + .object({ + formats: z.array(formatSchema).optional(), + }) + .catchall(z.unknown()) + .optional() + .default({}); + +const scrapeTargetSchema = z.strictObject({ + id: z.string().uuid().optional(), + type: z.literal("scrape"), + urls: z.array(urlSchema).min(1), + scrapeOptions: scrapeOptionsSchema, +}); + +const crawlTargetSchema = z.strictObject({ + id: z.string().uuid().optional(), + type: z.literal("crawl"), + url: urlSchema, + crawlOptions: crawlerOptions + .partial() + .catchall(z.unknown()) + .optional() + .default({}), + scrapeOptions: scrapeOptionsSchema, +}); + +const monitorTargetSchema = z.union([scrapeTargetSchema, crawlTargetSchema]); + +const monitorWebhookSchema = createWebhookSchema([ + "monitor.page", + "monitor.check.completed", +]); + +const monitorScheduleSchema = z + .strictObject({ + cron: 
z.string().min(1).max(128).optional(), + text: z.string().min(1).max(128).optional(), + timezone: z.string().min(1).max(128).optional().default("UTC"), + }) + .superRefine((schedule, ctx) => { + if (!schedule.cron && !schedule.text) { + ctx.addIssue({ + code: "custom", + message: "Schedule must include either cron or text", + path: ["cron"], + }); + } + if (schedule.cron && schedule.text) { + ctx.addIssue({ + code: "custom", + message: "Schedule must include either cron or text, not both", + path: ["text"], + }); + } + if (schedule.text) { + try { + parseMonitorScheduleText(schedule.text); + } catch (error) { + ctx.addIssue({ + code: "custom", + message: error instanceof Error ? error.message : String(error), + path: ["text"], + }); + } + } + }) + .transform(schedule => ({ + cron: schedule.cron ?? parseMonitorScheduleText(schedule.text!), + timezone: schedule.timezone, + })); + +const monitorNotificationSchema = z + .strictObject({ + email: z + .strictObject({ + enabled: z.boolean().optional().default(false), + recipients: z.array(z.email()).max(25).optional().default([]), + includeDiffs: z.boolean().optional().default(false), + }) + .optional(), + }) + .optional() + .default({}); + +export const createMonitorSchema = z.strictObject({ + name: z.string().min(1).max(256), + schedule: monitorScheduleSchema, + webhook: monitorWebhookSchema.optional(), + notification: monitorNotificationSchema, + targets: z.array(monitorTargetSchema).min(1).max(50), + retentionDays: z.number().int().positive().max(365).optional().default(30), +}); + +export const updateMonitorSchema = createMonitorSchema + .partial() + .extend({ + status: z.enum(["active", "paused"]).optional(), + }) + .refine(x => Object.keys(x).length > 0, "Update body cannot be empty"); + +export const listMonitorsQuerySchema = z.object({ + limit: z.coerce.number().int().positive().max(100).optional().default(25), + offset: z.coerce.number().int().nonnegative().optional().default(0), +}); + +export const 
listMonitorChecksQuerySchema = z.object({ + limit: z.coerce.number().int().positive().max(100).optional().default(25), + offset: z.coerce.number().int().nonnegative().optional().default(0), + status: z + .enum([ + "queued", + "running", + "completed", + "failed", + "partial", + "skipped_overlap", + ]) + .optional(), +}); + +export const monitorCheckDetailQuerySchema = z.object({ + limit: z.coerce.number().int().positive().max(100).optional().default(25), + skip: z.coerce.number().int().nonnegative().optional().default(0), + status: z.enum(["same", "new", "changed", "removed", "error"]).optional(), +}); + +export type MonitorTarget = z.infer & { + id: string; +}; +export type CreateMonitorRequest = z.infer; +export type UpdateMonitorRequest = z.infer; +type MonitorNotification = z.infer; + +export type MonitorRow = { + id: string; + team_id: string; + name: string; + status: "active" | "paused" | "deleted"; + schedule_cron: string; + schedule_timezone: string; + next_run_at: string | null; + last_run_at: string | null; + current_check_id: string | null; + locked_at: string | null; + locked_until: string | null; + retention_days: number; + estimated_credits_per_month: number | null; + targets: MonitorTarget[]; + webhook: unknown | null; + notification: MonitorNotification | null; + last_check_summary: MonitorSummary | null; + created_at: string; + updated_at: string; + deleted_at: string | null; +}; + +export type MonitorCheckRow = { + id: string; + monitor_id: string; + team_id: string; + trigger: "scheduled" | "manual"; + status: + | "queued" + | "running" + | "completed" + | "failed" + | "partial" + | "skipped_overlap"; + scheduled_for: string | null; + started_at: string | null; + finished_at: string | null; + estimated_credits: number | null; + reserved_credits: number | null; + actual_credits: number | null; + autumn_lock_id: string | null; + billing_status: + | "not_applicable" + | "reserved" + | "confirmed" + | "released" + | "failed"; + total_pages: number; 
+ same_count: number; + changed_count: number; + new_count: number; + removed_count: number; + error_count: number; + target_results: unknown | null; + webhook_payload: unknown | null; + email_payload: unknown | null; + notification_status: unknown | null; + error: string | null; + created_at: string; + updated_at: string; +}; + +type MonitorPageStatus = "same" | "new" | "changed" | "removed" | "error"; +type MonitorPageSource = "explicit" | "discovered"; + +export type MonitorPageRow = { + id: string; + monitor_id: string; + team_id: string; + target_id: string; + url: string; + url_hash: string; + source: MonitorPageSource; + first_seen_check_id: string | null; + last_seen_check_id: string | null; + last_changed_check_id: string | null; + last_scrape_id: string | null; + last_status: MonitorPageStatus; + is_removed: boolean; + removed_at: string | null; + metadata: unknown | null; + created_at: string; + updated_at: string; +}; + +export type MonitorSummary = { + totalPages: number; + same: number; + changed: number; + new: number; + removed: number; + error: number; +}; + +export type MonitorCheckPageInsert = { + check_id: string; + monitor_id: string; + team_id: string; + target_id: string; + url: string; + url_hash?: string; + status: MonitorPageStatus; + previous_scrape_id?: string | null; + current_scrape_id?: string | null; + diff_gcs_key?: string | null; + diff_text_bytes?: number | null; + diff_json_bytes?: number | null; + status_code?: number | null; + error?: string | null; + metadata?: unknown | null; +}; + +export function withMarkdownFormat( + options: Record, +): ScrapeOptions { + const formats = Array.isArray(options.formats) ? options.formats : []; + const hasMarkdown = formats.some(format => + typeof format === "string" + ? format === "markdown" + : typeof format === "object" && + format !== null && + (format as any).type === "markdown", + ); + + return { + ...options, + formats: hasMarkdown ? 
formats : ["markdown", ...formats], + } as ScrapeOptions; +} diff --git a/apps/api/src/services/notification/monitoring_email.ts b/apps/api/src/services/notification/monitoring_email.ts new file mode 100644 index 0000000000..06d90ed831 --- /dev/null +++ b/apps/api/src/services/notification/monitoring_email.ts @@ -0,0 +1,216 @@ +import { Resend } from "resend"; +import escapeHtml from "escape-html"; +import { config } from "../../config"; +import { logger as _logger } from "../../lib/logger"; +import { supabase_service } from "../supabase"; +import type { MonitorCheckRow, MonitorRow } from "../monitoring/types"; + +const logger = _logger.child({ module: "monitoring-email" }); + +type MonitoringEmailPage = { + url: string; + status: string; + error?: string | null; +}; + +type MonitoringEmailPayload = { + monitorId: string; + monitorName: string; + checkId: string; + summary: { + changed: number; + new: number; + removed: number; + error: number; + totalPages: number; + }; + pages: MonitoringEmailPage[]; + creditsUsed: number | null; +}; + +async function getTeamEmails(teamId: string): Promise { + const { data, error } = await supabase_service + .from("user_teams") + .select( + "users(email, id, notification_preferences(unsubscribed_all, email_preferences))", + ) + .eq("team_id", teamId); + + if (error) { + logger.warn("Failed to load team emails", { error, teamId }); + return []; + } + + const emails = new Set(); + for (const row of data ?? []) { + const user = (row as any).users; + const email = user?.email; + if (!email) continue; + + const prefs = Array.isArray(user.notification_preferences) + ? 
user.notification_preferences[0] + : user.notification_preferences; + if (prefs?.unsubscribed_all) continue; + if ( + Array.isArray(prefs?.email_preferences) && + !prefs.email_preferences.includes("system_alerts") + ) { + continue; + } + + emails.add(email); + } + + return [...emails]; +} + +function buildHtml(payload: MonitoringEmailPayload): string { + const pageItems = payload.pages + .slice(0, 20) + .map(page => { + const url = escapeHtml(page.url); + return `
• ${escapeHtml(page.status)}: <a href="${url}">${url}</a>${
+        page.error ? ` — ${escapeHtml(page.error)}` : ""
+      }<br/>`;
+    })
+    .join("");
+
+  return `Hey there,<br/>
+<br/>
+Your Firecrawl monitor <b>${escapeHtml(payload.monitorName)}</b> detected activity.<br/>
+<br/>
+• Changed: ${payload.summary.changed}<br/>
+• New: ${payload.summary.new}<br/>
+• Removed: ${payload.summary.removed}<br/>
+• Errors: ${payload.summary.error}<br/>
+• Total pages checked: ${payload.summary.totalPages}<br/>
+${pageItems ? `<br/>
+Top pages:<br/>
+${pageItems}` : ""}<br/>
+Check ID: ${escapeHtml(payload.checkId)}<br/>
+<br/>
+Credits used: ${payload.creditsUsed ?? "unknown"}<br/>
+<br/>
+Thanks,<br/>
+Firecrawl Team
    `; +} + +export async function sendMonitoringEmailSummary(params: { + monitor: MonitorRow; + check: MonitorCheckRow; + pages: MonitoringEmailPage[]; +}): Promise<{ + attempted: boolean; + success: boolean; + recipients: string[]; + error?: string; +}> { + const configEmail = params.monitor.notification?.email; + if (!configEmail?.enabled) { + logger.info( + "Skipping monitoring email summary; email notifications disabled", + { + monitorId: params.monitor.id, + checkId: params.check.id, + }, + ); + return { attempted: false, success: true, recipients: [] }; + } + + if ( + params.check.changed_count + + params.check.new_count + + params.check.removed_count + + params.check.error_count <= + 0 + ) { + logger.info("Skipping monitoring email summary; no changes detected", { + monitorId: params.monitor.id, + checkId: params.check.id, + changed: params.check.changed_count, + new: params.check.new_count, + removed: params.check.removed_count, + errors: params.check.error_count, + }); + return { attempted: false, success: true, recipients: [] }; + } + + const explicitRecipients = configEmail.recipients ?? []; + const teamRecipients = + explicitRecipients.length > 0 + ? 
[] + : await getTeamEmails(params.monitor.team_id); + const recipients = [...new Set([...explicitRecipients, ...teamRecipients])]; + if (recipients.length === 0) { + logger.info("Skipping monitoring email summary; no recipients configured", { + monitorId: params.monitor.id, + checkId: params.check.id, + }); + return { attempted: false, success: true, recipients }; + } + + const resendApiKey = config.RESEND_API_KEY?.trim(); + if (!resendApiKey) { + logger.warn( + "Skipping monitoring email summary; RESEND_API_KEY is not set", + { + monitorId: params.monitor.id, + checkId: params.check.id, + recipients, + }, + ); + return { attempted: false, success: true, recipients }; + } + + const payload: MonitoringEmailPayload = { + monitorId: params.monitor.id, + monitorName: params.monitor.name, + checkId: params.check.id, + summary: { + changed: params.check.changed_count, + new: params.check.new_count, + removed: params.check.removed_count, + error: params.check.error_count, + totalPages: params.check.total_pages, + }, + pages: params.pages, + creditsUsed: params.check.actual_credits, + }; + + const resend = new Resend(resendApiKey); + try { + const { error } = await resend.emails.send({ + from: "Firecrawl ", + to: recipients, + reply_to: "help@firecrawl.com", + subject: `Monitor changes detected: ${params.monitor.name}`, + html: buildHtml(payload), + }); + + if (error) { + logger.warn("Monitoring email summary send failed", { + monitorId: params.monitor.id, + checkId: params.check.id, + recipients, + error, + }); + return { + attempted: true, + success: false, + recipients, + error: typeof error === "string" ? 
error : JSON.stringify(error), + }; + } + + logger.info("Monitoring email summary sent", { + monitorId: params.monitor.id, + checkId: params.check.id, + recipients, + }); + + return { attempted: true, success: true, recipients }; + } catch (error) { + logger.warn("Failed to send monitoring email summary", { error }); + return { + attempted: true, + success: false, + recipients, + error: error instanceof Error ? error.message : String(error), + }; + } +} diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 0afc8fce80..289a544e9c 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -25,6 +25,12 @@ import { crawlFinishedQueue, NuQJob, scrapeQueue } from "./worker/nuq"; import { finishCrawlSuper } from "./worker/crawl-logic"; import { getCrawl } from "../lib/crawl-redis"; import { TransportableError } from "../lib/error"; +import { + processMonitorCheckJob, + reconcileRunningMonitorChecks, +} from "./monitoring/runner"; +import { enqueueDueMonitorChecks } from "./monitoring/scheduler"; +import { consumeMonitorCheckJobs } from "./monitoring/queue"; configDotenv(); @@ -38,6 +44,7 @@ const connectionMonitorInterval = config.CONNECTION_MONITOR_INTERVAL; const gotJobInterval = config.CONNECTION_MONITOR_INTERVAL; const runningJobs: Set = new Set(); +let monitorSchedulerInterval: NodeJS.Timeout | null = null; const processDeepResearchJobInternal = async ( token: string, @@ -407,7 +414,7 @@ let currentLiveness: boolean = true; app.get("/liveness", (req, res) => { _logger.info("Liveness endpoint hit"); - if (config.USE_DB_AUTHENTICATION) { + if (config.USE_DB_AUTHENTICATION && config.NUQ_RABBITMQ_URL) { // networking check for Kubernetes environments const host = config.FIRECRAWL_APP_HOST; const port = config.FIRECRAWL_APP_PORT; @@ -452,12 +459,43 @@ app.listen(workerPort, () => { initializeEngineForcing(); + if (config.USE_DB_AUTHENTICATION) { + monitorSchedulerInterval = setInterval(() => 
{ + enqueueDueMonitorChecks().catch(error => { + _logger.error("Failed to enqueue due monitor checks", { error }); + }); + reconcileRunningMonitorChecks().catch(error => { + _logger.error("Failed to reconcile running monitor checks", { error }); + }); + }, 60_000); + enqueueDueMonitorChecks().catch(error => { + _logger.error("Failed to enqueue due monitor checks", { error }); + }); + reconcileRunningMonitorChecks().catch(error => { + _logger.error("Failed to reconcile running monitor checks", { error }); + }); + + await consumeMonitorCheckJobs(processMonitorCheckJob); + } else if (!config.USE_DB_AUTHENTICATION) { + _logger.info( + "Skipping monitor worker startup because database authentication is disabled", + ); + } else { + _logger.info( + "Skipping monitor worker startup because NUQ_RABBITMQ_URL is not configured", + ); + } + await Promise.all([ workerFun(getDeepResearchQueue(), processDeepResearchJobInternal), workerFun(getGenerateLlmsTxtQueue(), processGenerateLlmsTxtJobInternal), crawlFinishWorker(), ]); + if (monitorSchedulerInterval) { + clearInterval(monitorSchedulerInterval); + } + _logger.info("All workers exited. 
Waiting for all jobs to finish..."); while (runningJobs.size > 0) { diff --git a/apps/api/src/services/webhook/types.ts b/apps/api/src/services/webhook/types.ts index a3fbdabdeb..550c4f430c 100644 --- a/apps/api/src/services/webhook/types.ts +++ b/apps/api/src/services/webhook/types.ts @@ -13,6 +13,8 @@ export enum WebhookEvent { EXTRACT_STARTED = "extract.started", EXTRACT_COMPLETED = "extract.completed", EXTRACT_FAILED = "extract.failed", + MONITOR_PAGE = "monitor.page", + MONITOR_CHECK_COMPLETED = "monitor.check.completed", } export type WebhookEventDataMap = { @@ -25,6 +27,8 @@ export type WebhookEventDataMap = { [WebhookEvent.EXTRACT_STARTED]: ExtractStartedData; [WebhookEvent.EXTRACT_COMPLETED]: ExtractCompletedData; [WebhookEvent.EXTRACT_FAILED]: ExtractFailedData; + [WebhookEvent.MONITOR_PAGE]: MonitorPageData; + [WebhookEvent.MONITOR_CHECK_COMPLETED]: MonitorCheckCompletedData; }; export type WebhookConfig = z.infer; @@ -113,3 +117,36 @@ interface ExtractFailedData extends BaseWebhookData { success: false; error: string; } + +// monitor +interface MonitorPageData extends BaseWebhookData { + success: boolean; + data: { + monitorId: string; + checkId: string; + url: string; + status: string; + previousScrapeId?: string | null; + currentScrapeId?: string | null; + error?: string | null; + }; + error?: string; +} + +interface MonitorCheckCompletedData extends BaseWebhookData { + success: boolean; + data: { + monitorId: string; + checkId: string; + status: string; + summary: { + totalPages: number; + same: number; + changed: number; + new: number; + removed: number; + error: number; + }; + }; + error?: string; +} diff --git a/apps/api/src/services/worker/crawl-logic.ts b/apps/api/src/services/worker/crawl-logic.ts index 35516137f1..d35926142b 100644 --- a/apps/api/src/services/worker/crawl-logic.ts +++ b/apps/api/src/services/worker/crawl-logic.ts @@ -73,6 +73,8 @@ export async function finishCrawlSuper(job: NuQJob) { ), zeroDataRetention: sc.zeroDataRetention 
|| job.data.zeroDataRetention, cancelled: sc.cancelled ?? false, + monitor_id: job.data.monitoring?.monitorId, + monitor_check_id: job.data.monitoring?.checkId, }, false, ); @@ -158,6 +160,8 @@ export async function finishCrawlSuper(job: NuQJob) { credits_cost: credits_billed ?? 0, zeroDataRetention: sc.zeroDataRetention || job.data.zeroDataRetention, cancelled: sc.cancelled ?? false, + monitor_id: job.data.monitoring?.monitorId, + monitor_check_id: job.data.monitoring?.checkId, }, false, ); diff --git a/apps/api/src/services/worker/scrape-worker.ts b/apps/api/src/services/worker/scrape-worker.ts index 2abada584c..e6ff468dac 100644 --- a/apps/api/src/services/worker/scrape-worker.ts +++ b/apps/api/src/services/worker/scrape-worker.ts @@ -83,6 +83,10 @@ import { import { ScrapeUrlResponse } from "../../scraper/scrapeURL"; import { logScrape } from "../logging/log_job"; import { FeatureFlag } from "../../scraper/scrapeURL/engines"; +import { + recordMonitorScrapeFailure, + recordMonitorScrapeSuccess, +} from "../monitoring/results"; configDotenv(); @@ -456,6 +460,9 @@ async function processJob(job: NuQJob) { v1: job.data.v1, zeroDataRetention: job.data.zeroDataRetention, apiKeyId: job.data.apiKeyId, + monitoring: job.data.monitoring + ? { ...job.data.monitoring, source: "discovered" } + : undefined, }, jobId, jobPriority, @@ -529,6 +536,8 @@ async function processJob(job: NuQJob) { zeroDataRetention: job.data.zeroDataRetention, skipNuq: job.data.skipNuq ?? 
false, is_parse: Boolean(job.data.internalOptions?.isParse), + monitor_id: job.data.monitoring?.monitorId, + monitor_check_id: job.data.monitoring?.checkId, }, true, ); @@ -576,6 +585,10 @@ async function processJob(job: NuQJob) { } } + await recordMonitorScrapeSuccess(job, doc).catch(error => + logger.warn("Failed to record monitor scrape result", { error }), + ); + logger.debug("Declaring job as done..."); await addCrawlJobDone(job.data.crawl_id, job.id, true, logger); } else { @@ -613,6 +626,8 @@ async function processJob(job: NuQJob) { zeroDataRetention: job.data.zeroDataRetention, skipNuq: job.data.skipNuq ?? false, is_parse: Boolean(job.data.internalOptions?.isParse), + monitor_id: job.data.monitoring?.monitorId, + monitor_check_id: job.data.monitoring?.checkId, }, false, ); @@ -630,6 +645,10 @@ async function processJob(job: NuQJob) { zeroDataRetention: job.data.zeroDataRetention, }).catch(err => logger.warn("Scrape tracking failed", { error: err })); + await recordMonitorScrapeSuccess(job, doc).catch(error => + logger.warn("Failed to record monitor scrape result", { error }), + ); + if (job.data.skipNuq) { // doesn't use GCS for result retrieval, safe to not await logScrapePromise.catch(err => @@ -789,6 +808,8 @@ async function processJob(job: NuQJob) { zeroDataRetention: job.data.zeroDataRetention, skipNuq: job.data.skipNuq ?? 
false, is_parse: Boolean(job.data.internalOptions?.isParse), + monitor_id: job.data.monitoring?.monitorId, + monitor_check_id: job.data.monitoring?.checkId, }, true, ); @@ -806,6 +827,10 @@ async function processJob(job: NuQJob) { zeroDataRetention: job.data.zeroDataRetention, }).catch(err => logger.warn("Scrape tracking failed", { error: err })); + await recordMonitorScrapeFailure(job, error).catch(err => + logger.warn("Failed to record monitor scrape failure", { error: err }), + ); + return data; } finally { if (abortTimeoutHandle) clearTimeout(abortTimeoutHandle); diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts index b5d4815fb4..8ad6ffbee5 100644 --- a/apps/api/src/types.ts +++ b/apps/api/src/types.ts @@ -21,6 +21,12 @@ type ScrapeJobCommon = { traceContext?: SerializedTraceContext; skipNuq?: boolean; requestId?: string; + monitoring?: { + monitorId: string; + checkId: string; + targetId: string; + source: "explicit" | "discovered"; + }; }; export type ScrapeJobData = ScrapeJobCommon & diff --git a/apps/dot-net-sdk/Firecrawl/Firecrawl.csproj b/apps/dot-net-sdk/Firecrawl/Firecrawl.csproj index e2a0f936ff..4962ae3904 100644 --- a/apps/dot-net-sdk/Firecrawl/Firecrawl.csproj +++ b/apps/dot-net-sdk/Firecrawl/Firecrawl.csproj @@ -8,7 +8,7 @@ firecrawl-sdk - 1.2.1 + 1.3.0 Firecrawl Firecrawl .NET SDK for the Firecrawl API - web scraping, crawling, and data extraction diff --git a/apps/dot-net-sdk/Firecrawl/FirecrawlClient.cs b/apps/dot-net-sdk/Firecrawl/FirecrawlClient.cs index 1f2ae2b781..11c5da6a3f 100644 --- a/apps/dot-net-sdk/Firecrawl/FirecrawlClient.cs +++ b/apps/dot-net-sdk/Firecrawl/FirecrawlClient.cs @@ -1,6 +1,7 @@ using System.Text.Json; using Firecrawl.Exceptions; using Firecrawl.Models; +using MonitorModel = Firecrawl.Models.Monitor; namespace Firecrawl; @@ -321,6 +322,122 @@ public async Task MapAsync( return response.Data ?? 
throw new FirecrawlException("Map response contained no data"); } + // ================================================================ + // MONITOR + // ================================================================ + + public async Task CreateMonitorAsync( + CreateMonitorRequest request, + CancellationToken cancellationToken = default) + { + ArgumentNullException.ThrowIfNull(request); + + var response = await _http.PostAsync>( + "/v2/monitor", request, cancellationToken: cancellationToken); + + return response.Data ?? throw new FirecrawlException("Create monitor response contained no data"); + } + + public async Task> ListMonitorsAsync( + int? limit = null, + int? offset = null, + CancellationToken cancellationToken = default) + { + var response = await _http.GetAsync>>( + $"/v2/monitor{BuildQuery(limit, offset)}", cancellationToken); + + return response.Data ?? new List(); + } + + public async Task GetMonitorAsync( + string monitorId, + CancellationToken cancellationToken = default) + { + ArgumentNullException.ThrowIfNull(monitorId); + + var response = await _http.GetAsync>( + $"/v2/monitor/{monitorId}", cancellationToken); + + return response.Data ?? throw new FirecrawlException("Get monitor response contained no data"); + } + + public async Task UpdateMonitorAsync( + string monitorId, + UpdateMonitorRequest request, + CancellationToken cancellationToken = default) + { + ArgumentNullException.ThrowIfNull(monitorId); + ArgumentNullException.ThrowIfNull(request); + + var response = await _http.PatchAsync>( + $"/v2/monitor/{monitorId}", request, cancellationToken); + + return response.Data ?? 
throw new FirecrawlException("Update monitor response contained no data"); + } + + public async Task DeleteMonitorAsync( + string monitorId, + CancellationToken cancellationToken = default) + { + ArgumentNullException.ThrowIfNull(monitorId); + + var response = await _http.DeleteAsync>( + $"/v2/monitor/{monitorId}", cancellationToken); + + return response.TryGetValue("success", out var success) && success switch + { + bool value => value, + JsonElement element when element.ValueKind == JsonValueKind.True => true, + _ => false + }; + } + + public async Task RunMonitorAsync( + string monitorId, + CancellationToken cancellationToken = default) + { + ArgumentNullException.ThrowIfNull(monitorId); + + var response = await _http.PostAsync>( + $"/v2/monitor/{monitorId}/run", new Dictionary(), cancellationToken: cancellationToken); + + return response.Data ?? throw new FirecrawlException("Run monitor response contained no data"); + } + + public async Task> ListMonitorChecksAsync( + string monitorId, + int? limit = null, + int? offset = null, + CancellationToken cancellationToken = default) + { + ArgumentNullException.ThrowIfNull(monitorId); + + var response = await _http.GetAsync>>( + $"/v2/monitor/{monitorId}/checks{BuildQuery(limit, offset)}", cancellationToken); + + return response.Data ?? new List(); + } + + public async Task GetMonitorCheckAsync( + string monitorId, + string checkId, + int? limit = null, + int? skip = null, + string? status = null, + bool autoPaginate = true, + CancellationToken cancellationToken = default) + { + ArgumentNullException.ThrowIfNull(monitorId); + ArgumentNullException.ThrowIfNull(checkId); + + var response = await _http.GetAsync>( + $"/v2/monitor/{monitorId}/checks/{checkId}{BuildMonitorCheckQuery(limit, skip, status)}", + cancellationToken); + + var check = response.Data ?? throw new FirecrawlException("Get monitor check response contained no data"); + return autoPaginate ? 
await PaginateMonitorCheckAsync(check, cancellationToken) : check; + } + // ================================================================ // SEARCH // ================================================================ @@ -460,6 +577,32 @@ private async Task PaginateBatchScrapeAsync( return job; } + private async Task PaginateMonitorCheckAsync( + MonitorCheckDetail check, + CancellationToken cancellationToken) + { + check.Pages ??= new List(); + var current = check; + + while (!string.IsNullOrEmpty(current.Next)) + { + var response = await _http.GetAbsoluteAsync>( + current.Next, cancellationToken); + if (response.Data == null) + break; + + var nextPage = response.Data; + + if (nextPage.Pages is { Count: > 0 }) + check.Pages.AddRange(nextPage.Pages); + + current = nextPage; + } + + check.Next = null; + return check; + } + // ================================================================ // INTERNAL UTILITIES // ================================================================ @@ -474,6 +617,32 @@ private static Dictionary BuildBody(object? options) ?? new Dictionary(); } + private static string BuildQuery(int? limit = null, int? offset = null, string? status = null) + { + var query = new List(); + if (limit.HasValue) + query.Add($"limit={Uri.EscapeDataString(limit.Value.ToString())}"); + if (offset.HasValue) + query.Add($"offset={Uri.EscapeDataString(offset.Value.ToString())}"); + if (!string.IsNullOrWhiteSpace(status)) + query.Add($"status={Uri.EscapeDataString(status)}"); + + return query.Count == 0 ? string.Empty : "?" + string.Join("&", query); + } + + private static string BuildMonitorCheckQuery(int? limit = null, int? skip = null, string? 
status = null) + { + var query = new List(); + if (limit.HasValue) + query.Add($"limit={Uri.EscapeDataString(limit.Value.ToString())}"); + if (skip.HasValue) + query.Add($"skip={Uri.EscapeDataString(skip.Value.ToString())}"); + if (!string.IsNullOrWhiteSpace(status)) + query.Add($"status={Uri.EscapeDataString(status)}"); + + return query.Count == 0 ? string.Empty : "?" + string.Join("&", query); + } + private static string ResolveApiKey(string? apiKey) { if (!string.IsNullOrWhiteSpace(apiKey)) diff --git a/apps/dot-net-sdk/Firecrawl/FirecrawlHttpClient.cs b/apps/dot-net-sdk/Firecrawl/FirecrawlHttpClient.cs index 2da99b16e5..a7ba80dbfe 100644 --- a/apps/dot-net-sdk/Firecrawl/FirecrawlHttpClient.cs +++ b/apps/dot-net-sdk/Firecrawl/FirecrawlHttpClient.cs @@ -139,6 +139,25 @@ HttpRequestMessage BuildRequest() return await ExecuteWithRetryAsync(BuildRequest, cancellationToken); } + internal async Task PatchAsync( + string path, + object body, + CancellationToken cancellationToken = default) + { + var url = _baseUrl + path; + var json = JsonSerializer.Serialize(body, JsonOptions); + + HttpRequestMessage BuildRequest() + { + var content = new StringContent(json, Encoding.UTF8, "application/json"); + var request = new HttpRequestMessage(HttpMethod.Patch, url) { Content = content }; + ApplyStandardHeaders(request); + return request; + } + + return await ExecuteWithRetryAsync(BuildRequest, cancellationToken); + } + internal async Task GetAbsoluteAsync(string absoluteUrl, CancellationToken cancellationToken = default) { // Validate that the pagination URL belongs to the same host to prevent API key exfiltration diff --git a/apps/dot-net-sdk/Firecrawl/Models/MonitorModels.cs b/apps/dot-net-sdk/Firecrawl/Models/MonitorModels.cs new file mode 100644 index 0000000000..ed5c7ef830 --- /dev/null +++ b/apps/dot-net-sdk/Firecrawl/Models/MonitorModels.cs @@ -0,0 +1,225 @@ +using System.Text.Json.Serialization; + +namespace Firecrawl.Models; + +public class MonitorSchedule +{ + 
[JsonPropertyName("cron")] + public string? Cron { get; set; } + + [JsonPropertyName("timezone")] + public string? Timezone { get; set; } +} + +public class CreateMonitorRequest +{ + [JsonPropertyName("name")] + public string? Name { get; set; } + + [JsonPropertyName("schedule")] + public MonitorSchedule? Schedule { get; set; } + + [JsonPropertyName("targets")] + public List>? Targets { get; set; } + + [JsonPropertyName("webhook")] + public Dictionary? Webhook { get; set; } + + [JsonPropertyName("notification")] + public Dictionary? Notification { get; set; } + + [JsonPropertyName("retentionDays")] + public int? RetentionDays { get; set; } +} + +public class UpdateMonitorRequest +{ + [JsonPropertyName("name")] + public string? Name { get; set; } + + [JsonPropertyName("status")] + public string? Status { get; set; } + + [JsonPropertyName("schedule")] + public MonitorSchedule? Schedule { get; set; } + + [JsonPropertyName("targets")] + public List>? Targets { get; set; } + + [JsonPropertyName("webhook")] + public Dictionary? Webhook { get; set; } + + [JsonPropertyName("notification")] + public Dictionary? Notification { get; set; } + + [JsonPropertyName("retentionDays")] + public int? RetentionDays { get; set; } +} + +public class MonitorSummary +{ + [JsonPropertyName("totalPages")] + public int TotalPages { get; set; } + + [JsonPropertyName("same")] + public int Same { get; set; } + + [JsonPropertyName("changed")] + public int Changed { get; set; } + + [JsonPropertyName("new")] + public int New { get; set; } + + [JsonPropertyName("removed")] + public int Removed { get; set; } + + [JsonPropertyName("error")] + public int Error { get; set; } +} + +public class Monitor +{ + [JsonPropertyName("id")] + public string? Id { get; set; } + + [JsonPropertyName("name")] + public string? Name { get; set; } + + [JsonPropertyName("status")] + public string? Status { get; set; } + + [JsonPropertyName("schedule")] + public MonitorSchedule? 
Schedule { get; set; } + + [JsonPropertyName("nextRunAt")] + public string? NextRunAt { get; set; } + + [JsonPropertyName("lastRunAt")] + public string? LastRunAt { get; set; } + + [JsonPropertyName("currentCheckId")] + public string? CurrentCheckId { get; set; } + + [JsonPropertyName("targets")] + public List>? Targets { get; set; } + + [JsonPropertyName("webhook")] + public Dictionary? Webhook { get; set; } + + [JsonPropertyName("notification")] + public Dictionary? Notification { get; set; } + + [JsonPropertyName("retentionDays")] + public int RetentionDays { get; set; } + + [JsonPropertyName("estimatedCreditsPerMonth")] + public int? EstimatedCreditsPerMonth { get; set; } + + [JsonPropertyName("lastCheckSummary")] + public MonitorSummary? LastCheckSummary { get; set; } + + [JsonPropertyName("createdAt")] + public string? CreatedAt { get; set; } + + [JsonPropertyName("updatedAt")] + public string? UpdatedAt { get; set; } +} + +public class MonitorCheck +{ + [JsonPropertyName("id")] + public string? Id { get; set; } + + [JsonPropertyName("monitorId")] + public string? MonitorId { get; set; } + + [JsonPropertyName("status")] + public string? Status { get; set; } + + [JsonPropertyName("trigger")] + public string? Trigger { get; set; } + + [JsonPropertyName("scheduledFor")] + public string? ScheduledFor { get; set; } + + [JsonPropertyName("startedAt")] + public string? StartedAt { get; set; } + + [JsonPropertyName("finishedAt")] + public string? FinishedAt { get; set; } + + [JsonPropertyName("estimatedCredits")] + public int? EstimatedCredits { get; set; } + + [JsonPropertyName("reservedCredits")] + public int? ReservedCredits { get; set; } + + [JsonPropertyName("actualCredits")] + public int? ActualCredits { get; set; } + + [JsonPropertyName("billingStatus")] + public string? BillingStatus { get; set; } + + [JsonPropertyName("summary")] + public MonitorSummary? Summary { get; set; } + + [JsonPropertyName("targetResults")] + public object? 
TargetResults { get; set; } + + [JsonPropertyName("notificationStatus")] + public object? NotificationStatus { get; set; } + + [JsonPropertyName("error")] + public string? Error { get; set; } + + [JsonPropertyName("createdAt")] + public string? CreatedAt { get; set; } + + [JsonPropertyName("updatedAt")] + public string? UpdatedAt { get; set; } +} + +public class MonitorCheckPage +{ + [JsonPropertyName("id")] + public string? Id { get; set; } + + [JsonPropertyName("targetId")] + public string? TargetId { get; set; } + + [JsonPropertyName("url")] + public string? Url { get; set; } + + [JsonPropertyName("status")] + public string? Status { get; set; } + + [JsonPropertyName("previousScrapeId")] + public string? PreviousScrapeId { get; set; } + + [JsonPropertyName("currentScrapeId")] + public string? CurrentScrapeId { get; set; } + + [JsonPropertyName("statusCode")] + public int? StatusCode { get; set; } + + [JsonPropertyName("error")] + public string? Error { get; set; } + + [JsonPropertyName("metadata")] + public object? Metadata { get; set; } + + [JsonPropertyName("diff")] + public object? Diff { get; set; } + + [JsonPropertyName("createdAt")] + public string? CreatedAt { get; set; } +} + +public class MonitorCheckDetail : MonitorCheck +{ + [JsonPropertyName("pages")] + public List? Pages { get; set; } + + [JsonPropertyName("next")] + public string? 
Next { get; set; } +} diff --git a/apps/elixir-sdk/lib/firecrawl.ex b/apps/elixir-sdk/lib/firecrawl.ex index d974d853b5..3dce580e82 100644 --- a/apps/elixir-sdk/lib/firecrawl.ex +++ b/apps/elixir-sdk/lib/firecrawl.ex @@ -1157,4 +1157,274 @@ defmodule Firecrawl do Req.delete!(client(opts), url: "/scrape/#{job_id}/interact") end + @monitor_key_mapping %{ + name: "name", + schedule: "schedule", + targets: "targets", + webhook: "webhook", + notification: "notification", + retention_days: "retentionDays", + status: "status" + } + + @monitor_list_key_mapping %{limit: "limit", offset: "offset"} + @monitor_check_list_key_mapping %{limit: "limit", offset: "offset", status: "status"} + @monitor_check_key_mapping %{limit: "limit", skip: "skip", status: "status"} + + @monitor_schema NimbleOptions.new!([ + name: [type: :string], + schedule: [type: :any], + targets: [type: {:list, :any}], + webhook: [type: :any], + notification: [type: :any], + retention_days: [type: :integer], + status: [type: :any] + ]) + + @monitor_list_schema NimbleOptions.new!([ + limit: [type: :integer], + offset: [type: :integer] + ]) + + @monitor_check_list_schema NimbleOptions.new!([ + limit: [type: :integer], + offset: [type: :integer], + status: [type: :any] + ]) + + @monitor_check_schema NimbleOptions.new!([ + limit: [type: :integer], + skip: [type: :integer], + status: [type: :any] + ]) + + @doc """ + Create a scheduled monitor. + + `POST /monitor` + """ + @spec create_monitor(keyword(), keyword()) :: response() + def create_monitor(params \\ [], opts \\ []) do + with {:ok, params} <- NimbleOptions.validate(params, @monitor_schema) do + Req.post(client(opts), url: "/monitor", json: to_body(params, @monitor_key_mapping)) + end + end + + @doc """ + Bang variant of `create_monitor`. Raises on error. 
+ """ + @spec create_monitor!(keyword(), keyword()) :: Req.Response.t() + def create_monitor!(params \\ [], opts \\ []) do + params = NimbleOptions.validate!(params, @monitor_schema) + Req.post!(client(opts), url: "/monitor", json: to_body(params, @monitor_key_mapping)) + end + + @doc """ + List monitors for the authenticated team. + + `GET /monitor` + """ + @spec list_monitors(keyword(), keyword()) :: response() + def list_monitors(params \\ [], opts \\ []) do + with {:ok, params} <- NimbleOptions.validate(params, @monitor_list_schema) do + Req.get(client(opts), url: "/monitor", params: to_query(params, @monitor_list_key_mapping)) + end + end + + @doc """ + Bang variant of `list_monitors`. Raises on error. + """ + @spec list_monitors!(keyword(), keyword()) :: Req.Response.t() + def list_monitors!(params \\ [], opts \\ []) do + params = NimbleOptions.validate!(params, @monitor_list_schema) + Req.get!(client(opts), url: "/monitor", params: to_query(params, @monitor_list_key_mapping)) + end + + @doc """ + Get a monitor by ID. + + `GET /monitor/{monitorId}` + """ + @spec get_monitor(String.t(), keyword()) :: response() + def get_monitor(monitor_id, opts \\ []) do + Req.get(client(opts), url: "/monitor/#{monitor_id}") + end + + @doc """ + Bang variant of `get_monitor`. Raises on error. + """ + @spec get_monitor!(String.t(), keyword()) :: Req.Response.t() + def get_monitor!(monitor_id, opts \\ []) do + Req.get!(client(opts), url: "/monitor/#{monitor_id}") + end + + @doc """ + Update a monitor. + + `PATCH /monitor/{monitorId}` + """ + @spec update_monitor(String.t(), keyword(), keyword()) :: response() + def update_monitor(monitor_id, params \\ [], opts \\ []) do + with {:ok, params} <- NimbleOptions.validate(params, @monitor_schema) do + Req.patch(client(opts), url: "/monitor/#{monitor_id}", json: to_body(params, @monitor_key_mapping)) + end + end + + @doc """ + Bang variant of `update_monitor`. Raises on error. 
+ """ + @spec update_monitor!(String.t(), keyword(), keyword()) :: Req.Response.t() + def update_monitor!(monitor_id, params \\ [], opts \\ []) do + params = NimbleOptions.validate!(params, @monitor_schema) + Req.patch!(client(opts), url: "/monitor/#{monitor_id}", json: to_body(params, @monitor_key_mapping)) + end + + @doc """ + Delete a monitor. + + `DELETE /monitor/{monitorId}` + """ + @spec delete_monitor(String.t(), keyword()) :: response() + def delete_monitor(monitor_id, opts \\ []) do + Req.delete(client(opts), url: "/monitor/#{monitor_id}") + end + + @doc """ + Bang variant of `delete_monitor`. Raises on error. + """ + @spec delete_monitor!(String.t(), keyword()) :: Req.Response.t() + def delete_monitor!(monitor_id, opts \\ []) do + Req.delete!(client(opts), url: "/monitor/#{monitor_id}") + end + + @doc """ + Trigger a manual monitor check. + + `POST /monitor/{monitorId}/run` + """ + @spec run_monitor(String.t(), keyword()) :: response() + def run_monitor(monitor_id, opts \\ []) do + Req.post(client(opts), url: "/monitor/#{monitor_id}/run", json: %{}) + end + + @doc """ + Bang variant of `run_monitor`. Raises on error. + """ + @spec run_monitor!(String.t(), keyword()) :: Req.Response.t() + def run_monitor!(monitor_id, opts \\ []) do + Req.post!(client(opts), url: "/monitor/#{monitor_id}/run", json: %{}) + end + + @doc """ + List checks for a monitor. + + `GET /monitor/{monitorId}/checks` + """ + @spec list_monitor_checks(String.t(), keyword(), keyword()) :: response() + def list_monitor_checks(monitor_id, params \\ [], opts \\ []) do + with {:ok, params} <- NimbleOptions.validate(params, @monitor_check_list_schema) do + Req.get(client(opts), + url: "/monitor/#{monitor_id}/checks", + params: to_query(params, @monitor_check_list_key_mapping) + ) + end + end + + @doc """ + Bang variant of `list_monitor_checks`. Raises on error. 
+ """ + @spec list_monitor_checks!(String.t(), keyword(), keyword()) :: Req.Response.t() + def list_monitor_checks!(monitor_id, params \\ [], opts \\ []) do + params = NimbleOptions.validate!(params, @monitor_check_list_schema) + Req.get!(client(opts), + url: "/monitor/#{monitor_id}/checks", + params: to_query(params, @monitor_check_list_key_mapping) + ) + end + + @doc """ + Get a monitor check with paginated page results and inline diffs. + + `GET /monitor/{monitorId}/checks/{checkId}` + """ + @spec get_monitor_check(String.t(), String.t(), keyword(), keyword()) :: response() + def get_monitor_check(monitor_id, check_id, params \\ [], opts \\ []) do + {auto_paginate, opts} = Keyword.pop(opts, :auto_paginate, true) + + with {:ok, params} <- NimbleOptions.validate(params, @monitor_check_schema) do + case Req.get(client(opts), + url: "/monitor/#{monitor_id}/checks/#{check_id}", + params: to_query(params, @monitor_check_key_mapping) + ) do + {:ok, response} when auto_paginate -> {:ok, paginate_monitor_check_response(response, opts)} + result -> result + end + end + end + + @doc """ + Bang variant of `get_monitor_check`. Raises on error. 
+ """ + @spec get_monitor_check!(String.t(), String.t(), keyword(), keyword()) :: Req.Response.t() + def get_monitor_check!(monitor_id, check_id, params \\ [], opts \\ []) do + {auto_paginate, opts} = Keyword.pop(opts, :auto_paginate, true) + params = NimbleOptions.validate!(params, @monitor_check_schema) + response = Req.get!(client(opts), + url: "/monitor/#{monitor_id}/checks/#{check_id}", + params: to_query(params, @monitor_check_key_mapping) + ) + + if auto_paginate, do: paginate_monitor_check_response!(response, opts), else: response + end + + defp paginate_monitor_check_response(%Req.Response{body: body} = response, opts) when is_map(body) do + data = Map.get(body, "data") || %{} + pages = Map.get(data, "pages") || [] + next = Map.get(body, "next") || Map.get(data, "next") + pages = fetch_monitor_check_pages(next, pages, opts) + data = data |> Map.put("pages", pages) |> Map.put("next", nil) + %{response | body: body |> Map.put("data", data) |> Map.put("next", nil)} + end + + defp paginate_monitor_check_response(response, _opts), do: response + + defp paginate_monitor_check_response!(%Req.Response{body: body} = response, opts) when is_map(body) do + data = Map.get(body, "data") || %{} + pages = Map.get(data, "pages") || [] + next = Map.get(body, "next") || Map.get(data, "next") + pages = fetch_monitor_check_pages!(next, pages, opts) + data = data |> Map.put("pages", pages) |> Map.put("next", nil) + %{response | body: body |> Map.put("data", data) |> Map.put("next", nil)} + end + + defp paginate_monitor_check_response!(response, _opts), do: response + + defp fetch_monitor_check_pages(nil, pages, _opts), do: pages + defp fetch_monitor_check_pages("", pages, _opts), do: pages + + defp fetch_monitor_check_pages(next, pages, opts) do + case Req.get(client(opts), url: next) do + {:ok, %Req.Response{body: body}} when is_map(body) -> + data = Map.get(body, "data") || %{} + next_pages = Map.get(data, "pages") || [] + next_url = Map.get(body, "next") || Map.get(data, 
"next") + fetch_monitor_check_pages(next_url, pages ++ next_pages, opts) + + _ -> + pages + end + end + + defp fetch_monitor_check_pages!(nil, pages, _opts), do: pages + defp fetch_monitor_check_pages!("", pages, _opts), do: pages + + defp fetch_monitor_check_pages!(next, pages, opts) do + response = Req.get!(client(opts), url: next) + body = response.body + data = if is_map(body), do: Map.get(body, "data") || %{}, else: %{} + next_pages = Map.get(data, "pages") || [] + next_url = if is_map(body), do: Map.get(body, "next") || Map.get(data, "next"), else: nil + fetch_monitor_check_pages!(next_url, pages ++ next_pages, opts) + end + end diff --git a/apps/elixir-sdk/mix.exs b/apps/elixir-sdk/mix.exs index 479cf2217b..e15468af24 100644 --- a/apps/elixir-sdk/mix.exs +++ b/apps/elixir-sdk/mix.exs @@ -1,7 +1,7 @@ defmodule Firecrawl.MixProject do use Mix.Project - @version "1.2.1" + @version "1.3.0" @source_url "https://github.com/firecrawl/firecrawl/tree/main/apps/elixir-sdk" def project do diff --git a/apps/go-sdk/firecrawl.go b/apps/go-sdk/firecrawl.go index 33f0361f7a..57f608e0cc 100644 --- a/apps/go-sdk/firecrawl.go +++ b/apps/go-sdk/firecrawl.go @@ -16,6 +16,7 @@ import ( "encoding/json" "fmt" "net/http" + "net/url" "os" "strings" "time" @@ -24,7 +25,7 @@ import ( ) const ( - defaultPollInterval = 2 // seconds + defaultPollInterval = 2 // seconds defaultJobTimeout = 300 // seconds ) @@ -383,6 +384,145 @@ func (c *Client) Map(ctx context.Context, url string, opts *MapOptions) (*MapDat return data, nil } +// ================================================================ +// MONITOR +// ================================================================ + +// CreateMonitor creates a scheduled monitor. 
+func (c *Client) CreateMonitor(ctx context.Context, req *MonitorCreateRequest) (*Monitor, error) { + if req == nil { + return nil, &FirecrawlError{Message: "monitor request is required"} + } + raw, err := c.http.post(ctx, "/v2/monitor", req, nil) + if err != nil { + return nil, err + } + return extractDataAs[Monitor](raw) +} + +// ListMonitors lists monitors for the authenticated team. +func (c *Client) ListMonitors(ctx context.Context, opts *ListMonitorsOptions) ([]Monitor, error) { + path := "/v2/monitor" + listQuery(opts) + raw, err := c.http.get(ctx, path) + if err != nil { + return nil, err + } + data, err := extractDataAs[[]Monitor](raw) + if err != nil { + return nil, err + } + return *data, nil +} + +// GetMonitor gets a monitor by ID. +func (c *Client) GetMonitor(ctx context.Context, monitorID string) (*Monitor, error) { + if monitorID == "" { + return nil, &FirecrawlError{Message: "monitor ID is required"} + } + raw, err := c.http.get(ctx, "/v2/monitor/"+monitorID) + if err != nil { + return nil, err + } + return extractDataAs[Monitor](raw) +} + +// UpdateMonitor updates a monitor. +func (c *Client) UpdateMonitor(ctx context.Context, monitorID string, req *MonitorUpdateRequest) (*Monitor, error) { + if monitorID == "" { + return nil, &FirecrawlError{Message: "monitor ID is required"} + } + if req == nil { + return nil, &FirecrawlError{Message: "monitor update request is required"} + } + raw, err := c.http.patch(ctx, "/v2/monitor/"+monitorID, req) + if err != nil { + return nil, err + } + return extractDataAs[Monitor](raw) +} + +// DeleteMonitor deletes a monitor. 
+func (c *Client) DeleteMonitor(ctx context.Context, monitorID string) (bool, error) { + if monitorID == "" { + return false, &FirecrawlError{Message: "monitor ID is required"} + } + raw, err := c.http.delete(ctx, "/v2/monitor/"+monitorID) + if err != nil { + return false, err + } + var resp struct { + Success bool `json:"success"` + } + if err := json.Unmarshal(raw, &resp); err != nil { + return false, &FirecrawlError{Message: fmt.Sprintf("failed to decode response: %v", err)} + } + return resp.Success, nil +} + +// RunMonitor triggers a manual monitor check. +func (c *Client) RunMonitor(ctx context.Context, monitorID string) (*MonitorCheck, error) { + if monitorID == "" { + return nil, &FirecrawlError{Message: "monitor ID is required"} + } + raw, err := c.http.post(ctx, "/v2/monitor/"+monitorID+"/run", map[string]interface{}{}, nil) + if err != nil { + return nil, err + } + return extractDataAs[MonitorCheck](raw) +} + +// ListMonitorChecks lists checks for a monitor. +func (c *Client) ListMonitorChecks(ctx context.Context, monitorID string, opts *ListMonitorChecksOptions) ([]MonitorCheck, error) { + if monitorID == "" { + return nil, &FirecrawlError{Message: "monitor ID is required"} + } + path := "/v2/monitor/" + monitorID + "/checks" + checksQuery(opts) + raw, err := c.http.get(ctx, path) + if err != nil { + return nil, err + } + data, err := extractDataAs[[]MonitorCheck](raw) + if err != nil { + return nil, err + } + return *data, nil +} + +// GetMonitorCheck gets a monitor check with page results, auto-paginated by default. 
+func (c *Client) GetMonitorCheck(ctx context.Context, monitorID, checkID string, opts *GetMonitorCheckOptions) (*MonitorCheckDetail, error) { + if monitorID == "" || checkID == "" { + return nil, &FirecrawlError{Message: "monitor ID and check ID are required"} + } + path := "/v2/monitor/" + monitorID + "/checks/" + checkID + monitorCheckPageQuery(opts) + raw, err := c.http.get(ctx, path) + if err != nil { + return nil, err + } + detail, err := extractMonitorCheckDetail(raw) + if err != nil { + return nil, err + } + + autoPaginate := true + if opts != nil && opts.AutoPaginate != nil { + autoPaginate = *opts.AutoPaginate + } + for autoPaginate && detail.Next != "" { + raw, err := c.http.getAbsolute(ctx, detail.Next) + if err != nil { + return nil, err + } + nextPage, err := extractMonitorCheckDetail(raw) + if err != nil { + return nil, err + } + detail.Pages = append(detail.Pages, nextPage.Pages...) + detail.Next = nextPage.Next + } + + return detail, nil +} + // ================================================================ // SEARCH // ================================================================ @@ -793,6 +933,63 @@ func mergeOptions(body map[string]interface{}, opts interface{}) { } } +func listQuery(opts *ListMonitorsOptions) string { + if opts == nil { + return "" + } + values := url.Values{} + if opts.Limit != nil { + values.Set("limit", fmt.Sprintf("%d", *opts.Limit)) + } + if opts.Offset != nil { + values.Set("offset", fmt.Sprintf("%d", *opts.Offset)) + } + if encoded := values.Encode(); encoded != "" { + return "?" 
+ encoded + } + return "" +} + +func checksQuery(opts *ListMonitorChecksOptions) string { + if opts == nil { + return "" + } + values := url.Values{} + if opts.Limit != nil { + values.Set("limit", fmt.Sprintf("%d", *opts.Limit)) + } + if opts.Offset != nil { + values.Set("offset", fmt.Sprintf("%d", *opts.Offset)) + } + if opts.Status != "" { + values.Set("status", opts.Status) + } + if encoded := values.Encode(); encoded != "" { + return "?" + encoded + } + return "" +} + +func monitorCheckPageQuery(opts *GetMonitorCheckOptions) string { + if opts == nil { + return "" + } + values := url.Values{} + if opts.Limit != nil { + values.Set("limit", fmt.Sprintf("%d", *opts.Limit)) + } + if opts.Skip != nil { + values.Set("skip", fmt.Sprintf("%d", *opts.Skip)) + } + if opts.Status != "" { + values.Set("status", opts.Status) + } + if encoded := values.Encode(); encoded != "" { + return "?" + encoded + } + return "" +} + // extractDataAs extracts the "data" field from a raw API response and deserializes it. 
func extractDataAs[T any](raw json.RawMessage) (*T, error) { var envelope map[string]json.RawMessage @@ -816,3 +1013,17 @@ func extractDataAs[T any](raw json.RawMessage) (*T, error) { } return &result, nil } + +func extractMonitorCheckDetail(raw json.RawMessage) (*MonitorCheckDetail, error) { + var envelope struct { + Data MonitorCheckDetail `json:"data"` + Next string `json:"next"` + } + if err := json.Unmarshal(raw, &envelope); err != nil { + return nil, &FirecrawlError{Message: fmt.Sprintf("failed to decode response: %v", err)} + } + if envelope.Next != "" { + envelope.Data.Next = envelope.Next + } + return &envelope.Data, nil +} diff --git a/apps/go-sdk/http_client.go b/apps/go-sdk/http_client.go index 99156559b5..e17f221d66 100644 --- a/apps/go-sdk/http_client.go +++ b/apps/go-sdk/http_client.go @@ -49,6 +49,12 @@ func (h *httpClient) post(ctx context.Context, path string, body interface{}, ex return h.doJSON(ctx, "POST", url, body, extraHeaders) } +// patch sends a PATCH request. +func (h *httpClient) patch(ctx context.Context, path string, body interface{}) (json.RawMessage, error) { + url := h.baseURL + path + return h.doJSON(ctx, "PATCH", url, body, nil) +} + // get sends a GET request. func (h *httpClient) get(ctx context.Context, path string) (json.RawMessage, error) { url := h.baseURL + path diff --git a/apps/go-sdk/models.go b/apps/go-sdk/models.go index d4335d74e6..5877eadea6 100644 --- a/apps/go-sdk/models.go +++ b/apps/go-sdk/models.go @@ -100,6 +100,126 @@ type MapData struct { Links []LinkResult `json:"links,omitempty"` } +// MonitorSchedule configures when a monitor runs. +type MonitorSchedule struct { + Cron string `json:"cron"` + Timezone string `json:"timezone,omitempty"` +} + +// MonitorCreateRequest creates a scheduled monitor. 
+type MonitorCreateRequest struct { + Name string `json:"name"` + Schedule MonitorSchedule `json:"schedule"` + Targets []map[string]interface{} `json:"targets"` + Webhook map[string]interface{} `json:"webhook,omitempty"` + Notification map[string]interface{} `json:"notification,omitempty"` + RetentionDays *int `json:"retentionDays,omitempty"` +} + +// MonitorUpdateRequest updates a scheduled monitor. +type MonitorUpdateRequest struct { + Name string `json:"name,omitempty"` + Status string `json:"status,omitempty"` + Schedule *MonitorSchedule `json:"schedule,omitempty"` + Targets []map[string]interface{} `json:"targets,omitempty"` + Webhook map[string]interface{} `json:"webhook,omitempty"` + Notification map[string]interface{} `json:"notification,omitempty"` + RetentionDays *int `json:"retentionDays,omitempty"` +} + +// Monitor represents a scheduled monitor. +type Monitor struct { + ID string `json:"id"` + Name string `json:"name"` + Status string `json:"status"` + Schedule MonitorSchedule `json:"schedule"` + NextRunAt string `json:"nextRunAt,omitempty"` + LastRunAt string `json:"lastRunAt,omitempty"` + CurrentCheckID string `json:"currentCheckId,omitempty"` + Targets []map[string]interface{} `json:"targets,omitempty"` + Webhook map[string]interface{} `json:"webhook,omitempty"` + Notification map[string]interface{} `json:"notification,omitempty"` + RetentionDays int `json:"retentionDays"` + EstimatedCreditsPerMonth *int `json:"estimatedCreditsPerMonth,omitempty"` + LastCheckSummary *MonitorSummary `json:"lastCheckSummary,omitempty"` + CreatedAt string `json:"createdAt,omitempty"` + UpdatedAt string `json:"updatedAt,omitempty"` +} + +// MonitorSummary summarizes page statuses in a check. +type MonitorSummary struct { + TotalPages int `json:"totalPages"` + Same int `json:"same"` + Changed int `json:"changed"` + New int `json:"new"` + Removed int `json:"removed"` + Error int `json:"error"` +} + +// MonitorCheck represents a single monitor run. 
+type MonitorCheck struct { + ID string `json:"id"` + MonitorID string `json:"monitorId"` + Status string `json:"status"` + Trigger string `json:"trigger"` + ScheduledFor string `json:"scheduledFor,omitempty"` + StartedAt string `json:"startedAt,omitempty"` + FinishedAt string `json:"finishedAt,omitempty"` + EstimatedCredits *int `json:"estimatedCredits,omitempty"` + ReservedCredits *int `json:"reservedCredits,omitempty"` + ActualCredits *int `json:"actualCredits,omitempty"` + BillingStatus string `json:"billingStatus,omitempty"` + Summary MonitorSummary `json:"summary"` + TargetResults interface{} `json:"targetResults,omitempty"` + NotificationStatus interface{} `json:"notificationStatus,omitempty"` + Error string `json:"error,omitempty"` + CreatedAt string `json:"createdAt,omitempty"` + UpdatedAt string `json:"updatedAt,omitempty"` +} + +// MonitorCheckPage is a single page result in a monitor check. +type MonitorCheckPage struct { + ID string `json:"id"` + TargetID string `json:"targetId"` + URL string `json:"url"` + Status string `json:"status"` + PreviousScrapeID string `json:"previousScrapeId,omitempty"` + CurrentScrapeID string `json:"currentScrapeId,omitempty"` + StatusCode *int `json:"statusCode,omitempty"` + Error string `json:"error,omitempty"` + Metadata interface{} `json:"metadata,omitempty"` + Diff interface{} `json:"diff,omitempty"` + CreatedAt string `json:"createdAt,omitempty"` +} + +// MonitorCheckDetail includes paginated page results and inline diffs. +type MonitorCheckDetail struct { + MonitorCheck + Pages []MonitorCheckPage `json:"pages,omitempty"` + Next string `json:"next,omitempty"` +} + +// ListMonitorsOptions controls monitor list pagination. +type ListMonitorsOptions struct { + Limit *int + Offset *int +} + +// ListMonitorChecksOptions controls monitor check pagination/filtering. 
+type ListMonitorChecksOptions struct { + Limit *int + Offset *int + Status string +} + +// GetMonitorCheckOptions controls monitor check page pagination/filtering. +type GetMonitorCheckOptions struct { + Limit *int + Skip *int + Status string + AutoPaginate *bool +} + // SearchData represents the result of a search request. type SearchData struct { Web []map[string]interface{} `json:"web,omitempty"` @@ -142,13 +262,13 @@ type BrowserCreateResponse struct { // BrowserExecuteResponse is returned when executing code in a browser session. type BrowserExecuteResponse struct { - Success bool `json:"success"` - Stdout string `json:"stdout,omitempty"` - Result string `json:"result,omitempty"` - Stderr string `json:"stderr,omitempty"` - ExitCode *int `json:"exitCode,omitempty"` - Killed *bool `json:"killed,omitempty"` - Error string `json:"error,omitempty"` + Success bool `json:"success"` + Stdout string `json:"stdout,omitempty"` + Result string `json:"result,omitempty"` + Stderr string `json:"stderr,omitempty"` + ExitCode *int `json:"exitCode,omitempty"` + Killed *bool `json:"killed,omitempty"` + Error string `json:"error,omitempty"` } // BrowserDeleteResponse is returned when deleting a browser session. 
diff --git a/apps/java-sdk/build.gradle.kts b/apps/java-sdk/build.gradle.kts index 3183e7110b..3479bb1e70 100644 --- a/apps/java-sdk/build.gradle.kts +++ b/apps/java-sdk/build.gradle.kts @@ -4,7 +4,7 @@ plugins { } group = "com.firecrawl" -version = "1.4.1" +version = "1.5.0" java { sourceCompatibility = JavaVersion.VERSION_11 diff --git a/apps/java-sdk/src/main/java/com/firecrawl/client/FirecrawlClient.java b/apps/java-sdk/src/main/java/com/firecrawl/client/FirecrawlClient.java index 9055be3e7a..bfca8dfc0b 100644 --- a/apps/java-sdk/src/main/java/com/firecrawl/client/FirecrawlClient.java +++ b/apps/java-sdk/src/main/java/com/firecrawl/client/FirecrawlClient.java @@ -446,6 +446,81 @@ public MapData map(String url, MapOptions options) { return extractData(http.post("/v2/map", body, Map.class), MapData.class); } + // ================================================================ + // MONITOR + // ================================================================ + + public Monitor createMonitor(Map request) { + Objects.requireNonNull(request, "Monitor request is required"); + return extractData(http.post("/v2/monitor", request, Map.class), Monitor.class); + } + + public List listMonitors() { + return listMonitors(null, null); + } + + public List listMonitors(Integer limit, Integer offset) { + Map raw = http.get("/v2/monitor" + listQuery(limit, offset), Map.class); + return extractDataList(raw, Monitor.class); + } + + public Monitor getMonitor(String monitorId) { + Objects.requireNonNull(monitorId, "Monitor ID is required"); + return extractData(http.get("/v2/monitor/" + monitorId, Map.class), Monitor.class); + } + + public Monitor updateMonitor(String monitorId, Map request) { + Objects.requireNonNull(monitorId, "Monitor ID is required"); + Objects.requireNonNull(request, "Monitor update request is required"); + return extractData(http.patch("/v2/monitor/" + monitorId, request, Map.class), Monitor.class); + } + + @SuppressWarnings("unchecked") + public boolean 
deleteMonitor(String monitorId) { + Objects.requireNonNull(monitorId, "Monitor ID is required"); + Map response = http.delete("/v2/monitor/" + monitorId, Map.class); + return Boolean.TRUE.equals(response.get("success")); + } + + public MonitorCheck runMonitor(String monitorId) { + Objects.requireNonNull(monitorId, "Monitor ID is required"); + return extractData( + http.post("/v2/monitor/" + monitorId + "/run", Collections.emptyMap(), Map.class), + MonitorCheck.class + ); + } + + public List listMonitorChecks(String monitorId) { + return listMonitorChecks(monitorId, null, null); + } + + public List listMonitorChecks(String monitorId, Integer limit, Integer offset) { + Objects.requireNonNull(monitorId, "Monitor ID is required"); + Map raw = http.get("/v2/monitor/" + monitorId + "/checks" + listQuery(limit, offset), Map.class); + return extractDataList(raw, MonitorCheck.class); + } + + public MonitorCheckDetail getMonitorCheck(String monitorId, String checkId) { + return getMonitorCheck(monitorId, checkId, null, null, null, true); + } + + public MonitorCheckDetail getMonitorCheck( + String monitorId, String checkId, Integer limit, Integer skip, String status) { + return getMonitorCheck(monitorId, checkId, limit, skip, status, true); + } + + public MonitorCheckDetail getMonitorCheck( + String monitorId, String checkId, Integer limit, Integer skip, String status, boolean autoPaginate) { + Objects.requireNonNull(monitorId, "Monitor ID is required"); + Objects.requireNonNull(checkId, "Check ID is required"); + MonitorCheckDetail check = extractData( + http.get("/v2/monitor/" + monitorId + "/checks/" + checkId + + monitorCheckQuery(limit, skip, status), Map.class), + MonitorCheckDetail.class + ); + return autoPaginate ? 
paginateMonitorCheck(check) : check; + } + // ================================================================ // SEARCH // ================================================================ @@ -845,6 +920,50 @@ public CompletableFuture mapAsync(String url, MapOptions options) { return CompletableFuture.supplyAsync(() -> map(url, options), asyncExecutor); } + public CompletableFuture createMonitorAsync(Map request) { + return CompletableFuture.supplyAsync(() -> createMonitor(request), asyncExecutor); + } + + public CompletableFuture> listMonitorsAsync(Integer limit, Integer offset) { + return CompletableFuture.supplyAsync(() -> listMonitors(limit, offset), asyncExecutor); + } + + public CompletableFuture getMonitorAsync(String monitorId) { + return CompletableFuture.supplyAsync(() -> getMonitor(monitorId), asyncExecutor); + } + + public CompletableFuture updateMonitorAsync(String monitorId, Map request) { + return CompletableFuture.supplyAsync(() -> updateMonitor(monitorId, request), asyncExecutor); + } + + public CompletableFuture deleteMonitorAsync(String monitorId) { + return CompletableFuture.supplyAsync(() -> deleteMonitor(monitorId), asyncExecutor); + } + + public CompletableFuture runMonitorAsync(String monitorId) { + return CompletableFuture.supplyAsync(() -> runMonitor(monitorId), asyncExecutor); + } + + public CompletableFuture> listMonitorChecksAsync(String monitorId, Integer limit, Integer offset) { + return CompletableFuture.supplyAsync(() -> listMonitorChecks(monitorId, limit, offset), asyncExecutor); + } + + public CompletableFuture getMonitorCheckAsync( + String monitorId, String checkId, Integer limit, Integer skip, String status) { + return CompletableFuture.supplyAsync( + () -> getMonitorCheck(monitorId, checkId, limit, skip, status), + asyncExecutor + ); + } + + public CompletableFuture getMonitorCheckAsync( + String monitorId, String checkId, Integer limit, Integer skip, String status, boolean autoPaginate) { + return 
CompletableFuture.supplyAsync( + () -> getMonitorCheck(monitorId, checkId, limit, skip, status, autoPaginate), + asyncExecutor + ); + } + /** * Asynchronously runs an agent task and waits for completion. * @@ -1076,6 +1195,25 @@ private BatchScrapeJob paginateBatchScrape(BatchScrapeJob job) { return job; } + private MonitorCheckDetail paginateMonitorCheck(MonitorCheckDetail check) { + if (check.getPages() == null) { + check.setPages(new ArrayList<>()); + } + MonitorCheckDetail current = check; + while (current.getNext() != null && !current.getNext().isEmpty()) { + MonitorCheckDetail nextPage = extractData( + http.getAbsolute(current.getNext(), Map.class), + MonitorCheckDetail.class + ); + if (nextPage.getPages() != null && !nextPage.getPages().isEmpty()) { + check.getPages().addAll(nextPage.getPages()); + } + current = nextPage; + } + check.setNext(null); + return check; + } + // ================================================================ // INTERNAL UTILITIES // ================================================================ @@ -1093,6 +1231,35 @@ private T extractData(Map rawResponse, Class type) { return http.objectMapper.convertValue(data, type); } + private List extractDataList(Map rawResponse, Class type) { + Object data = rawResponse.get("data"); + if (!(data instanceof List)) { + return Collections.emptyList(); + } + List result = new ArrayList<>(); + for (Object item : (List) data) { + result.add(http.objectMapper.convertValue(item, type)); + } + return result; + } + + private String listQuery(Integer limit, Integer offset) { + List parts = new ArrayList<>(); + if (limit != null) parts.add("limit=" + limit); + if (offset != null) parts.add("offset=" + offset); + return parts.isEmpty() ? "" : "?" 
+ String.join("&", parts); + } + + private String monitorCheckQuery(Integer limit, Integer skip, String status) { + List parts = new ArrayList<>(); + if (limit != null) parts.add("limit=" + limit); + if (skip != null) parts.add("skip=" + skip); + if (status != null && !status.isBlank()) { + parts.add("status=" + java.net.URLEncoder.encode(status, java.nio.charset.StandardCharsets.UTF_8)); + } + return parts.isEmpty() ? "" : "?" + String.join("&", parts); + } + /** * Merges a typed options object into a request body map, using Jackson serialization. */ diff --git a/apps/java-sdk/src/main/java/com/firecrawl/client/FirecrawlHttpClient.java b/apps/java-sdk/src/main/java/com/firecrawl/client/FirecrawlHttpClient.java index 33e1fe5d48..695c9e079d 100644 --- a/apps/java-sdk/src/main/java/com/firecrawl/client/FirecrawlHttpClient.java +++ b/apps/java-sdk/src/main/java/com/firecrawl/client/FirecrawlHttpClient.java @@ -86,6 +86,27 @@ T post(String path, Object body, Class responseType, Map return executeWithRetry(request, responseType); } + /** + * Sends a PATCH request with JSON body. + */ + T patch(String path, Object body, Class responseType) { + String url = baseUrl + path; + String json; + try { + json = objectMapper.writeValueAsString(body); + } catch (JsonProcessingException e) { + throw new FirecrawlException("Failed to serialize request body", e); + } + RequestBody requestBody = RequestBody.create(json, JSON); + Request request = new Request.Builder() + .url(url) + .header("Authorization", "Bearer " + apiKey) + .header("Content-Type", "application/json") + .patch(requestBody) + .build(); + return executeWithRetry(request, responseType); + } + /** * Sends a POST multipart/form-data request. 
*/ diff --git a/apps/java-sdk/src/main/java/com/firecrawl/models/Monitor.java b/apps/java-sdk/src/main/java/com/firecrawl/models/Monitor.java new file mode 100644 index 0000000000..c881fb9f9d --- /dev/null +++ b/apps/java-sdk/src/main/java/com/firecrawl/models/Monitor.java @@ -0,0 +1,40 @@ +package com.firecrawl.models; + +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; +import java.util.List; +import java.util.Map; + +@JsonIgnoreProperties(ignoreUnknown = true) +public class Monitor { + private String id; + private String name; + private String status; + private MonitorSchedule schedule; + private String nextRunAt; + private String lastRunAt; + private String currentCheckId; + private List> targets; + private Map webhook; + private Map notification; + private int retentionDays; + private Integer estimatedCreditsPerMonth; + private MonitorSummary lastCheckSummary; + private String createdAt; + private String updatedAt; + + public String getId() { return id; } + public String getName() { return name; } + public String getStatus() { return status; } + public MonitorSchedule getSchedule() { return schedule; } + public String getNextRunAt() { return nextRunAt; } + public String getLastRunAt() { return lastRunAt; } + public String getCurrentCheckId() { return currentCheckId; } + public List> getTargets() { return targets; } + public Map getWebhook() { return webhook; } + public Map getNotification() { return notification; } + public int getRetentionDays() { return retentionDays; } + public Integer getEstimatedCreditsPerMonth() { return estimatedCreditsPerMonth; } + public MonitorSummary getLastCheckSummary() { return lastCheckSummary; } + public String getCreatedAt() { return createdAt; } + public String getUpdatedAt() { return updatedAt; } +} diff --git a/apps/java-sdk/src/main/java/com/firecrawl/models/MonitorCheck.java b/apps/java-sdk/src/main/java/com/firecrawl/models/MonitorCheck.java new file mode 100644 index 0000000000..4b3f8212e2 --- /dev/null +++ 
b/apps/java-sdk/src/main/java/com/firecrawl/models/MonitorCheck.java @@ -0,0 +1,42 @@ +package com.firecrawl.models; + +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; + +@JsonIgnoreProperties(ignoreUnknown = true) +public class MonitorCheck { + private String id; + private String monitorId; + private String status; + private String trigger; + private String scheduledFor; + private String startedAt; + private String finishedAt; + private Integer estimatedCredits; + private Integer reservedCredits; + private Integer actualCredits; + private String billingStatus; + private MonitorSummary summary; + private Object targetResults; + private Object notificationStatus; + private String error; + private String createdAt; + private String updatedAt; + + public String getId() { return id; } + public String getMonitorId() { return monitorId; } + public String getStatus() { return status; } + public String getTrigger() { return trigger; } + public String getScheduledFor() { return scheduledFor; } + public String getStartedAt() { return startedAt; } + public String getFinishedAt() { return finishedAt; } + public Integer getEstimatedCredits() { return estimatedCredits; } + public Integer getReservedCredits() { return reservedCredits; } + public Integer getActualCredits() { return actualCredits; } + public String getBillingStatus() { return billingStatus; } + public MonitorSummary getSummary() { return summary; } + public Object getTargetResults() { return targetResults; } + public Object getNotificationStatus() { return notificationStatus; } + public String getError() { return error; } + public String getCreatedAt() { return createdAt; } + public String getUpdatedAt() { return updatedAt; } +} diff --git a/apps/java-sdk/src/main/java/com/firecrawl/models/MonitorCheckDetail.java b/apps/java-sdk/src/main/java/com/firecrawl/models/MonitorCheckDetail.java new file mode 100644 index 0000000000..cdb3c710d2 --- /dev/null +++ 
b/apps/java-sdk/src/main/java/com/firecrawl/models/MonitorCheckDetail.java @@ -0,0 +1,15 @@ +package com.firecrawl.models; + +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; +import java.util.List; + +@JsonIgnoreProperties(ignoreUnknown = true) +public class MonitorCheckDetail extends MonitorCheck { + private List pages; + private String next; + + public List getPages() { return pages; } + public void setPages(List pages) { this.pages = pages; } + public String getNext() { return next; } + public void setNext(String next) { this.next = next; } +} diff --git a/apps/java-sdk/src/main/java/com/firecrawl/models/MonitorCheckPage.java b/apps/java-sdk/src/main/java/com/firecrawl/models/MonitorCheckPage.java new file mode 100644 index 0000000000..231d3788ec --- /dev/null +++ b/apps/java-sdk/src/main/java/com/firecrawl/models/MonitorCheckPage.java @@ -0,0 +1,30 @@ +package com.firecrawl.models; + +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; + +@JsonIgnoreProperties(ignoreUnknown = true) +public class MonitorCheckPage { + private String id; + private String targetId; + private String url; + private String status; + private String previousScrapeId; + private String currentScrapeId; + private Integer statusCode; + private String error; + private Object metadata; + private Object diff; + private String createdAt; + + public String getId() { return id; } + public String getTargetId() { return targetId; } + public String getUrl() { return url; } + public String getStatus() { return status; } + public String getPreviousScrapeId() { return previousScrapeId; } + public String getCurrentScrapeId() { return currentScrapeId; } + public Integer getStatusCode() { return statusCode; } + public String getError() { return error; } + public Object getMetadata() { return metadata; } + public Object getDiff() { return diff; } + public String getCreatedAt() { return createdAt; } +} diff --git a/apps/java-sdk/src/main/java/com/firecrawl/models/MonitorSchedule.java 
b/apps/java-sdk/src/main/java/com/firecrawl/models/MonitorSchedule.java new file mode 100644 index 0000000000..b8e3ff19fe --- /dev/null +++ b/apps/java-sdk/src/main/java/com/firecrawl/models/MonitorSchedule.java @@ -0,0 +1,12 @@ +package com.firecrawl.models; + +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; + +@JsonIgnoreProperties(ignoreUnknown = true) +public class MonitorSchedule { + private String cron; + private String timezone; + + public String getCron() { return cron; } + public String getTimezone() { return timezone; } +} diff --git a/apps/java-sdk/src/main/java/com/firecrawl/models/MonitorSummary.java b/apps/java-sdk/src/main/java/com/firecrawl/models/MonitorSummary.java new file mode 100644 index 0000000000..3f04548fc0 --- /dev/null +++ b/apps/java-sdk/src/main/java/com/firecrawl/models/MonitorSummary.java @@ -0,0 +1,25 @@ +package com.firecrawl.models; + +import com.fasterxml.jackson.annotation.JsonIgnoreProperties; + +@JsonIgnoreProperties(ignoreUnknown = true) +public class MonitorSummary { + private int totalPages; + private int same; + private int changed; + private int newCount; + private int removed; + private int error; + + public int getTotalPages() { return totalPages; } + public int getSame() { return same; } + public int getChanged() { return changed; } + public int getNew() { return newCount; } + public int getRemoved() { return removed; } + public int getError() { return error; } + + @com.fasterxml.jackson.annotation.JsonProperty("new") + private void setNewCount(int value) { + this.newCount = value; + } +} diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 51f8f4ace1..f88ddd72bb 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "4.21.1", + "version": "4.22.0", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", diff --git 
a/apps/js-sdk/firecrawl/src/__tests__/unit/v2/pagination.test.ts b/apps/js-sdk/firecrawl/src/__tests__/unit/v2/pagination.test.ts index 20e2afe427..32e8231973 100644 --- a/apps/js-sdk/firecrawl/src/__tests__/unit/v2/pagination.test.ts +++ b/apps/js-sdk/firecrawl/src/__tests__/unit/v2/pagination.test.ts @@ -1,6 +1,7 @@ import { describe, test, expect, jest } from "@jest/globals"; import { getCrawlStatus } from "../../../v2/methods/crawl"; import { getBatchScrapeStatus } from "../../../v2/methods/batch"; +import { getMonitorCheck } from "../../../v2/methods/monitor"; describe("JS SDK v2 pagination", () => { function makeHttp(getImpl: (url: string) => any) { @@ -64,6 +65,26 @@ describe("JS SDK v2 pagination", () => { expect(res.next).toBe("https://api/nextBatch"); }); + test("monitor check: default autoPaginate aggregates pages and nulls next", async () => { + const first = { status: 200, data: { success: true, next: "https://api/m1", data: { id: "check1", monitorId: "mon1", status: "completed", trigger: "manual", billingStatus: "confirmed", summary: {}, createdAt: "now", updatedAt: "now", pages: [{ url: "a", status: "changed" }], next: "https://api/m1" } } }; + const second = { status: 200, data: { success: true, next: null, data: { pages: [{ url: "b", status: "same" }], next: null } } }; + const http = makeHttp((url) => { + if (url.includes("/v2/monitor/")) return first; + return second; + }); + const res = await getMonitorCheck(http, "mon1", "check1"); + expect(res.pages.length).toBe(2); + expect(res.next).toBeNull(); + }); + + test("monitor check: autoPaginate=false returns next", async () => { + const first = { status: 200, data: { success: true, next: "https://api/m1", data: { id: "check1", monitorId: "mon1", status: "completed", trigger: "manual", billingStatus: "confirmed", summary: {}, createdAt: "now", updatedAt: "now", pages: [{ url: "a", status: "changed" }], next: "https://api/m1" } } }; + const http = makeHttp(() => first); + const res = await 
getMonitorCheck(http, "mon1", "check1", { autoPaginate: false }); + expect(res.pages.length).toBe(1); + expect(res.next).toBe("https://api/m1"); + }); + test("crawl: maxWaitTime stops pagination after first page", async () => { const first = { status: 200, data: { success: true, status: "completed", completed: 1, total: 5, next: "https://api/n1", data: [{ markdown: "a" }] } }; const p1 = { status: 200, data: { success: true, next: "https://api/n2", data: [{ markdown: "b" }] } }; diff --git a/apps/js-sdk/firecrawl/src/v2/client.ts b/apps/js-sdk/firecrawl/src/v2/client.ts index 613a28f7f8..f375bb457b 100644 --- a/apps/js-sdk/firecrawl/src/v2/client.ts +++ b/apps/js-sdk/firecrawl/src/v2/client.ts @@ -32,6 +32,16 @@ import { listBrowsers, } from "./methods/browser"; import { getConcurrency, getCreditUsage, getQueueStatus, getTokenUsage, getCreditUsageHistorical, getTokenUsageHistorical } from "./methods/usage"; +import { + createMonitor as createMonitorMethod, + deleteMonitor as deleteMonitorMethod, + getMonitor as getMonitorMethod, + getMonitorCheck as getMonitorCheckMethod, + listMonitorChecks as listMonitorChecksMethod, + listMonitors as listMonitorsMethod, + runMonitor as runMonitorMethod, + updateMonitor as updateMonitorMethod, +} from "./methods/monitor"; import type { Document, ParseFile, @@ -60,6 +70,14 @@ import type { ScrapeExecuteRequest, ScrapeExecuteResponse, ScrapeBrowserDeleteResponse, + CreateMonitorRequest, + ListMonitorChecksOptions, + ListMonitorsOptions, + Monitor, + MonitorCheck, + MonitorCheckDetail, + GetMonitorCheckOptions, + UpdateMonitorRequest, } from "./types"; import { Watcher } from "./watcher"; import type { WatcherOptions } from "./watcher"; @@ -276,6 +294,73 @@ export class FirecrawlClient { return crawlParamsPreview(this.http, url, prompt); } + // Monitor + /** + * Create a scheduled monitor. 
+ */ + async createMonitor(request: CreateMonitorRequest): Promise { + return createMonitorMethod(this.http, request); + } + + /** + * List monitors for the authenticated team. + */ + async listMonitors(options?: ListMonitorsOptions): Promise { + return listMonitorsMethod(this.http, options); + } + + /** + * Get a monitor by id. + */ + async getMonitor(monitorId: string): Promise { + return getMonitorMethod(this.http, monitorId); + } + + /** + * Update a monitor. + */ + async updateMonitor( + monitorId: string, + request: UpdateMonitorRequest, + ): Promise { + return updateMonitorMethod(this.http, monitorId, request); + } + + /** + * Delete a monitor. + */ + async deleteMonitor(monitorId: string): Promise { + return deleteMonitorMethod(this.http, monitorId); + } + + /** + * Trigger a manual monitor check. + */ + async runMonitor(monitorId: string): Promise { + return runMonitorMethod(this.http, monitorId); + } + + /** + * List checks for a monitor. + */ + async listMonitorChecks( + monitorId: string, + options?: ListMonitorChecksOptions, + ): Promise { + return listMonitorChecksMethod(this.http, monitorId, options); + } + + /** + * Get a monitor check with paginated page results and inline diffs. + */ + async getMonitorCheck( + monitorId: string, + checkId: string, + options?: GetMonitorCheckOptions, + ): Promise { + return getMonitorCheckMethod(this.http, monitorId, checkId, options); + } + // Batch /** * Start a batch scrape job for multiple URLs (async). 
diff --git a/apps/js-sdk/firecrawl/src/v2/methods/monitor.ts b/apps/js-sdk/firecrawl/src/v2/methods/monitor.ts new file mode 100644 index 0000000000..3e4221da89 --- /dev/null +++ b/apps/js-sdk/firecrawl/src/v2/methods/monitor.ts @@ -0,0 +1,181 @@ +import { + type CreateMonitorRequest, + type ListMonitorsOptions, + type ListMonitorChecksOptions, + type Monitor, + type MonitorCheck, + type MonitorCheckDetail, + type MonitorCheckPage, + type GetMonitorCheckOptions, + type UpdateMonitorRequest, +} from "../types"; +import { HttpClient } from "../utils/httpClient"; +import { + throwForBadResponse, + normalizeAxiosError, +} from "../utils/errorHandler"; +import { fetchAllPages } from "../utils/pagination"; + +type ApiResponse = { + success: boolean; + data?: T; + id?: string; + error?: string; +}; + +function queryString(params?: Record): string { + if (!params) return ""; + const query = new URLSearchParams(); + for (const [key, value] of Object.entries(params)) { + if (value !== undefined && value !== null) query.set(key, String(value)); + } + const str = query.toString(); + return str ? 
`?${str}` : ""; +} + +function dataOrThrow(res: { status: number; data?: ApiResponse }, action: string): T { + if (res.status !== 200 || !res.data?.success || res.data.data == null) { + throwForBadResponse(res as any, action); + } + return res.data.data; +} + +export async function createMonitor( + http: HttpClient, + request: CreateMonitorRequest, +): Promise { + try { + const res = await http.post>("/v2/monitor", request as any); + return dataOrThrow(res, "create monitor"); + } catch (err: any) { + if (err?.isAxiosError) return normalizeAxiosError(err, "create monitor"); + throw err; + } +} + +export async function listMonitors( + http: HttpClient, + options?: ListMonitorsOptions, +): Promise { + try { + const res = await http.get>( + `/v2/monitor${queryString(options as Record)}`, + ); + return dataOrThrow(res, "list monitors"); + } catch (err: any) { + if (err?.isAxiosError) return normalizeAxiosError(err, "list monitors"); + throw err; + } +} + +export async function getMonitor( + http: HttpClient, + monitorId: string, +): Promise { + try { + const res = await http.get>(`/v2/monitor/${monitorId}`); + return dataOrThrow(res, "get monitor"); + } catch (err: any) { + if (err?.isAxiosError) return normalizeAxiosError(err, "get monitor"); + throw err; + } +} + +export async function updateMonitor( + http: HttpClient, + monitorId: string, + request: UpdateMonitorRequest, +): Promise { + try { + const res = await http.patch>( + `/v2/monitor/${monitorId}`, + request as any, + ); + return dataOrThrow(res, "update monitor"); + } catch (err: any) { + if (err?.isAxiosError) return normalizeAxiosError(err, "update monitor"); + throw err; + } +} + +export async function deleteMonitor( + http: HttpClient, + monitorId: string, +): Promise { + try { + const res = await http.delete>(`/v2/monitor/${monitorId}`); + if (res.status !== 200 || !res.data?.success) { + throwForBadResponse(res, "delete monitor"); + } + return true; + } catch (err: any) { + if (err?.isAxiosError) return 
normalizeAxiosError(err, "delete monitor"); + throw err; + } +} + +export async function runMonitor( + http: HttpClient, + monitorId: string, +): Promise { + try { + const res = await http.post>( + `/v2/monitor/${monitorId}/run`, + {}, + ); + return dataOrThrow(res, "run monitor"); + } catch (err: any) { + if (err?.isAxiosError) return normalizeAxiosError(err, "run monitor"); + throw err; + } +} + +export async function listMonitorChecks( + http: HttpClient, + monitorId: string, + options?: ListMonitorChecksOptions, +): Promise { + try { + const res = await http.get>( + `/v2/monitor/${monitorId}/checks${queryString(options as Record)}`, + ); + return dataOrThrow(res, "list monitor checks"); + } catch (err: any) { + if (err?.isAxiosError) return normalizeAxiosError(err, "list monitor checks"); + throw err; + } +} + +export async function getMonitorCheck( + http: HttpClient, + monitorId: string, + checkId: string, + options?: GetMonitorCheckOptions, +): Promise { + try { + const { autoPaginate: _autoPaginate, maxPages: _maxPages, maxResults: _maxResults, maxWaitTime: _maxWaitTime, ...query } = options ?? {}; + const res = await http.get>( + `/v2/monitor/${monitorId}/checks/${checkId}${queryString(query as Record)}`, + ); + const detail = dataOrThrow(res, "get monitor check"); + const next = res.data?.next ?? detail.next ?? null; + const auto = options?.autoPaginate ?? 
true; + if (!auto || !next) { + return { ...detail, next }; + } + + return { + ...detail, + pages: await fetchAllPages( + http, + next, + detail.pages || [], + options, + ), + next: null, + }; + } catch (err: any) { + if (err?.isAxiosError) return normalizeAxiosError(err, "get monitor check"); + throw err; + } +} diff --git a/apps/js-sdk/firecrawl/src/v2/types.ts b/apps/js-sdk/firecrawl/src/v2/types.ts index f72a6b8ead..a07232dc92 100644 --- a/apps/js-sdk/firecrawl/src/v2/types.ts +++ b/apps/js-sdk/firecrawl/src/v2/types.ts @@ -606,6 +606,154 @@ export interface MapOptions { location?: LocationConfig; } +export interface MonitorSchedule { + cron: string; + timezone?: string; +} + +export interface MonitorEmailNotification { + enabled?: boolean; + recipients?: string[]; + includeDiffs?: boolean; +} + +export interface MonitorNotification { + email?: MonitorEmailNotification; +} + +export interface MonitorWebhookConfig { + url: string; + headers?: Record; + metadata?: Record; + events?: string[]; +} + +export interface MonitorScrapeTarget { + id?: string; + type: 'scrape'; + urls: string[]; + scrapeOptions?: ScrapeOptions; +} + +export interface MonitorCrawlTarget { + id?: string; + type: 'crawl'; + url: string; + crawlOptions?: CrawlOptions; + scrapeOptions?: ScrapeOptions; +} + +export type MonitorTarget = MonitorScrapeTarget | MonitorCrawlTarget; + +export interface CreateMonitorRequest { + name: string; + schedule: MonitorSchedule; + webhook?: MonitorWebhookConfig; + notification?: MonitorNotification; + targets: MonitorTarget[]; + retentionDays?: number; +} + +export interface UpdateMonitorRequest { + name?: string; + status?: 'active' | 'paused'; + schedule?: MonitorSchedule; + webhook?: MonitorWebhookConfig | null; + notification?: MonitorNotification | null; + targets?: MonitorTarget[]; + retentionDays?: number; +} + +export interface MonitorSummary { + totalPages: number; + same: number; + changed: number; + new: number; + removed: number; + error: number; 
+} + +export interface Monitor { + id: string; + name: string; + status: 'active' | 'paused' | 'deleted'; + schedule: MonitorSchedule; + nextRunAt?: string | null; + lastRunAt?: string | null; + currentCheckId?: string | null; + targets: MonitorTarget[]; + webhook?: MonitorWebhookConfig | null; + notification?: MonitorNotification | null; + retentionDays: number; + estimatedCreditsPerMonth?: number | null; + lastCheckSummary?: MonitorSummary | null; + createdAt: string; + updatedAt: string; +} + +export interface MonitorCheck { + id: string; + monitorId: string; + status: + | 'queued' + | 'running' + | 'completed' + | 'failed' + | 'partial' + | 'skipped_overlap'; + trigger: 'scheduled' | 'manual'; + scheduledFor?: string | null; + startedAt?: string | null; + finishedAt?: string | null; + estimatedCredits?: number | null; + reservedCredits?: number | null; + actualCredits?: number | null; + billingStatus: + | 'not_applicable' + | 'reserved' + | 'confirmed' + | 'released' + | 'failed'; + summary: MonitorSummary; + targetResults?: unknown; + notificationStatus?: unknown; + error?: string | null; + createdAt: string; + updatedAt: string; +} + +export interface MonitorCheckPage { + id: string; + targetId: string; + url: string; + status: 'same' | 'new' | 'changed' | 'removed' | 'error'; + previousScrapeId?: string | null; + currentScrapeId?: string | null; + statusCode?: number | null; + error?: string | null; + metadata?: unknown; + diff?: unknown; + createdAt: string; +} + +export interface MonitorCheckDetail extends MonitorCheck { + pages: MonitorCheckPage[]; + next?: string | null; +} + +export interface ListMonitorsOptions { + limit?: number; + offset?: number; +} + +export type ListMonitorChecksOptions = ListMonitorsOptions; + +export type GetMonitorCheckOptions = PaginationConfig & { + limit?: number; + skip?: number; + status?: MonitorCheckPage["status"]; +}; + export interface ExtractResponse { success?: boolean; id?: string; diff --git 
a/apps/js-sdk/firecrawl/src/v2/utils/httpClient.ts b/apps/js-sdk/firecrawl/src/v2/utils/httpClient.ts index 5dc2cbbdb4..842a0ea25e 100644 --- a/apps/js-sdk/firecrawl/src/v2/utils/httpClient.ts +++ b/apps/js-sdk/firecrawl/src/v2/utils/httpClient.ts @@ -149,6 +149,20 @@ export class HttpClient { return this.request({ method: "delete", url: endpoint, headers }); } + patch( + endpoint: string, + body: Record, + options?: RequestOptions, + ) { + return this.request({ + method: "patch", + url: endpoint, + data: body, + headers: options?.headers, + timeout: options?.timeoutMs, + }); + } + prepareHeaders(idempotencyKey?: string): Record { const headers: Record = {}; if (idempotencyKey) headers["x-idempotency-key"] = idempotencyKey; diff --git a/apps/js-sdk/firecrawl/src/v2/utils/pagination.ts b/apps/js-sdk/firecrawl/src/v2/utils/pagination.ts index 4c54d12719..cb8273cfcd 100644 --- a/apps/js-sdk/firecrawl/src/v2/utils/pagination.ts +++ b/apps/js-sdk/firecrawl/src/v2/utils/pagination.ts @@ -2,14 +2,14 @@ import type { HttpClient } from "../utils/httpClient"; import type { Document, PaginationConfig } from "../types"; /** - * Shared helper to follow `next` cursors and aggregate documents with limits. + * Shared helper to follow `next` URLs and aggregate paginated result arrays. 
*/ -export async function fetchAllPages( +export async function fetchAllPages( http: HttpClient, nextUrl: string, - initial: Document[], + initial: T[], pagination?: PaginationConfig -): Promise { +): Promise { const docs = initial.slice(); let current: string | null = nextUrl; let pageCount = 0; @@ -22,21 +22,24 @@ export async function fetchAllPages( if (maxPages != null && pageCount >= maxPages) break; if (maxWaitTime != null && (Date.now() - started) / 1000 > maxWaitTime) break; - let payload: { success: boolean; next?: string | null; data?: Document[] } | null = null; + let payload: { success: boolean; next?: string | null; data?: T[] | { pages?: T[]; next?: string | null } } | null = null; try { - const res = await http.get<{ success: boolean; next?: string | null; data?: Document[] }>(current); + const res = await http.get<{ success: boolean; next?: string | null; data?: T[] | { pages?: T[]; next?: string | null } }>(current); payload = res.data; } catch { break; // axios rejects on non-2xx; stop pagination gracefully } if (!payload?.success) break; - for (const d of payload.data || []) { + const pageData = Array.isArray(payload.data) + ? payload.data + : payload.data?.pages || []; + for (const d of pageData) { if (maxResults != null && docs.length >= maxResults) break; - docs.push(d as Document); + docs.push(d as T); } if (maxResults != null && docs.length >= maxResults) break; - current = (payload.next ?? null) as string | null; + current = (payload.next ?? (Array.isArray(payload.data) ? null : payload.data?.next) ?? 
null) as string | null; pageCount += 1; } return docs; diff --git a/apps/php-sdk/src/Client/FirecrawlClient.php b/apps/php-sdk/src/Client/FirecrawlClient.php index e237b51c14..23dbab668f 100644 --- a/apps/php-sdk/src/Client/FirecrawlClient.php +++ b/apps/php-sdk/src/Client/FirecrawlClient.php @@ -24,6 +24,9 @@ use Firecrawl\Models\Document; use Firecrawl\Models\MapData; use Firecrawl\Models\MapOptions; +use Firecrawl\Models\Monitor; +use Firecrawl\Models\MonitorCheck; +use Firecrawl\Models\MonitorCheckDetail; use Firecrawl\Models\ParseFile; use Firecrawl\Models\ParseOptions; use Firecrawl\Models\ScrapeOptions; @@ -316,6 +319,142 @@ public function map(string $url, ?MapOptions $options = null): MapData return MapData::fromArray($response['data'] ?? $response); } + // ================================================================ + // MONITOR + // ================================================================ + + /** + * Create a scheduled monitor. + * + * @param array $schedule + * @param list> $targets + * @param array|null $webhook + * @param array|null $notification + */ + public function createMonitor( + string $name, + array $schedule, + array $targets, + ?array $webhook = null, + ?array $notification = null, + ?int $retentionDays = null, + ): Monitor { + $body = array_filter([ + 'name' => $name, + 'schedule' => $schedule, + 'targets' => $targets, + 'webhook' => $webhook, + 'notification' => $notification, + 'retentionDays' => $retentionDays, + ], static fn ($value) => $value !== null); + + $response = $this->http->post('/v2/monitor', $body); + + return Monitor::fromArray($response['data'] ?? $response); + } + + /** + * @return list + */ + public function listMonitors(?int $limit = null, ?int $offset = null): array + { + $response = $this->http->get('/v2/monitor' . $this->query([ + 'limit' => $limit, + 'offset' => $offset, + ])); + + return array_map( + static fn (array $item): Monitor => Monitor::fromArray($item), + $response['data'] ?? 
[], + ); + } + + public function getMonitor(string $monitorId): Monitor + { + $response = $this->http->get("/v2/monitor/{$monitorId}"); + + return Monitor::fromArray($response['data'] ?? $response); + } + + /** + * @param array $attributes + */ + public function updateMonitor(string $monitorId, array $attributes): Monitor + { + $response = $this->http->patch("/v2/monitor/{$monitorId}", $attributes); + + return Monitor::fromArray($response['data'] ?? $response); + } + + public function deleteMonitor(string $monitorId): bool + { + $response = $this->http->delete("/v2/monitor/{$monitorId}"); + + return ($response['success'] ?? false) === true; + } + + public function runMonitor(string $monitorId): MonitorCheck + { + $response = $this->http->post("/v2/monitor/{$monitorId}/run", []); + + return MonitorCheck::fromArray($response['data'] ?? $response); + } + + /** + * @return list + */ + public function listMonitorChecks(string $monitorId, ?int $limit = null, ?int $offset = null): array + { + $response = $this->http->get("/v2/monitor/{$monitorId}/checks" . $this->query([ + 'limit' => $limit, + 'offset' => $offset, + ])); + + return array_map( + static fn (array $item): MonitorCheck => MonitorCheck::fromArray($item), + $response['data'] ?? [], + ); + } + + public function getMonitorCheck( + string $monitorId, + string $checkId, + ?int $limit = null, + ?int $skip = null, + ?string $status = null, + bool $autoPaginate = true, + ): MonitorCheckDetail { + $response = $this->http->get("/v2/monitor/{$monitorId}/checks/{$checkId}" . $this->query([ + 'limit' => $limit, + 'skip' => $skip, + 'status' => $status, + ])); + + $data = $response['data'] ?? 
$response; + if (isset($response['next'])) { + $data['next'] = $response['next']; + } + + if (!$autoPaginate) { + return MonitorCheckDetail::fromArray($data); + } + + while (isset($data['next']) && is_string($data['next']) && $data['next'] !== '') { + $this->assertSameOrigin($data['next']); + $nextResponse = $this->http->getAbsolute($data['next']); + $nextData = $nextResponse['data'] ?? $nextResponse; + if (isset($nextResponse['next'])) { + $nextData['next'] = $nextResponse['next']; + } + + $data['pages'] = array_merge($data['pages'] ?? [], $nextData['pages'] ?? []); + $data['next'] = $nextData['next'] ?? null; + } + + $data['next'] = null; + return MonitorCheckDetail::fromArray($data); + } + // ================================================================ // SEARCH // ================================================================ @@ -505,6 +644,16 @@ private function ensureValidPollInterval(int $pollIntervalSec): void } } + /** + * @param array $params + */ + private function query(array $params): string + { + $params = array_filter($params, static fn ($value) => $value !== null && $value !== ''); + + return $params === [] ? '' : '?' . http_build_query($params); + } + private function pollCrawl( ?string $jobId, int $pollIntervalSec, @@ -553,10 +702,29 @@ private function pollBatchScrape( private function assertSameOrigin(string $url): void { + $baseScheme = parse_url($this->http->getBaseUrl(), PHP_URL_SCHEME); $baseHost = parse_url($this->http->getBaseUrl(), PHP_URL_HOST); + $basePort = parse_url($this->http->getBaseUrl(), PHP_URL_PORT); + $nextScheme = parse_url($url, PHP_URL_SCHEME); $nextHost = parse_url($url, PHP_URL_HOST); - - if ($baseHost === null || $nextHost === null || strcasecmp($baseHost, $nextHost) !== 0) { + $nextPort = parse_url($url, PHP_URL_PORT); + + $basePort ??= is_string($baseScheme) && strcasecmp($baseScheme, 'https') === 0 + ? 443 + : 80; + $nextPort ??= is_string($nextScheme) && strcasecmp($nextScheme, 'https') === 0 + ? 
443 + : 80; + + if ( + $baseScheme === null || + $nextScheme === null || + $baseHost === null || + $nextHost === null || + strcasecmp($baseScheme, $nextScheme) !== 0 || + strcasecmp($baseHost, $nextHost) !== 0 || + $basePort !== $nextPort + ) { throw new FirecrawlException( 'Pagination URL origin does not match the API base URL. Refusing to follow: ' . $url, ); diff --git a/apps/php-sdk/src/Client/FirecrawlHttpClient.php b/apps/php-sdk/src/Client/FirecrawlHttpClient.php index b878b74e8f..fa1d35f5d8 100644 --- a/apps/php-sdk/src/Client/FirecrawlHttpClient.php +++ b/apps/php-sdk/src/Client/FirecrawlHttpClient.php @@ -67,6 +67,15 @@ public function delete(string $path): array return $this->request('DELETE', $this->baseUrl . $path); } + /** + * @param array $body + * @return array + */ + public function patch(string $path, array $body): array + { + return $this->request('PATCH', $this->baseUrl . $path, $body); + } + /** * Send a POST request with a multipart/form-data body. * @@ -145,7 +154,7 @@ private function request( if ($multipart !== null) { $options[RequestOptions::MULTIPART] = $multipart; - } elseif ($method === 'POST' && $body !== []) { + } elseif (($method === 'POST' || $method === 'PATCH') && $body !== []) { $options[RequestOptions::JSON] = $body; } diff --git a/apps/php-sdk/src/Models/Monitor.php b/apps/php-sdk/src/Models/Monitor.php new file mode 100644 index 0000000000..fe4a94d049 --- /dev/null +++ b/apps/php-sdk/src/Models/Monitor.php @@ -0,0 +1,76 @@ +> $targets + * @param array|null $schedule + * @param array|null $webhook + * @param array|null $notification + * @param array|null $lastCheckSummary + */ + public function __construct( + private readonly ?string $id = null, + private readonly ?string $name = null, + private readonly ?string $status = null, + private readonly ?array $schedule = null, + private readonly ?string $nextRunAt = null, + private readonly ?string $lastRunAt = null, + private readonly ?string $currentCheckId = null, + private 
readonly array $targets = [], + private readonly ?array $webhook = null, + private readonly ?array $notification = null, + private readonly ?int $retentionDays = null, + private readonly ?int $estimatedCreditsPerMonth = null, + private readonly ?array $lastCheckSummary = null, + private readonly ?string $createdAt = null, + private readonly ?string $updatedAt = null, + ) {} + + /** @param array $data */ + public static function fromArray(array $data): self + { + return new self( + id: isset($data['id']) ? (string) $data['id'] : null, + name: isset($data['name']) ? (string) $data['name'] : null, + status: isset($data['status']) ? (string) $data['status'] : null, + schedule: isset($data['schedule']) && is_array($data['schedule']) ? $data['schedule'] : null, + nextRunAt: isset($data['nextRunAt']) ? (string) $data['nextRunAt'] : null, + lastRunAt: isset($data['lastRunAt']) ? (string) $data['lastRunAt'] : null, + currentCheckId: isset($data['currentCheckId']) ? (string) $data['currentCheckId'] : null, + targets: isset($data['targets']) && is_array($data['targets']) ? $data['targets'] : [], + webhook: isset($data['webhook']) && is_array($data['webhook']) ? $data['webhook'] : null, + notification: isset($data['notification']) && is_array($data['notification']) ? $data['notification'] : null, + retentionDays: isset($data['retentionDays']) ? (int) $data['retentionDays'] : null, + estimatedCreditsPerMonth: isset($data['estimatedCreditsPerMonth']) ? (int) $data['estimatedCreditsPerMonth'] : null, + lastCheckSummary: isset($data['lastCheckSummary']) && is_array($data['lastCheckSummary']) ? $data['lastCheckSummary'] : null, + createdAt: isset($data['createdAt']) ? (string) $data['createdAt'] : null, + updatedAt: isset($data['updatedAt']) ? 
(string) $data['updatedAt'] : null, + ); + } + + public function getId(): ?string { return $this->id; } + public function getName(): ?string { return $this->name; } + public function getStatus(): ?string { return $this->status; } + /** @return array|null */ + public function getSchedule(): ?array { return $this->schedule; } + public function getNextRunAt(): ?string { return $this->nextRunAt; } + public function getLastRunAt(): ?string { return $this->lastRunAt; } + public function getCurrentCheckId(): ?string { return $this->currentCheckId; } + /** @return list> */ + public function getTargets(): array { return $this->targets; } + /** @return array|null */ + public function getWebhook(): ?array { return $this->webhook; } + /** @return array|null */ + public function getNotification(): ?array { return $this->notification; } + public function getRetentionDays(): ?int { return $this->retentionDays; } + public function getEstimatedCreditsPerMonth(): ?int { return $this->estimatedCreditsPerMonth; } + /** @return array|null */ + public function getLastCheckSummary(): ?array { return $this->lastCheckSummary; } + public function getCreatedAt(): ?string { return $this->createdAt; } + public function getUpdatedAt(): ?string { return $this->updatedAt; } +} diff --git a/apps/php-sdk/src/Models/MonitorCheck.php b/apps/php-sdk/src/Models/MonitorCheck.php new file mode 100644 index 0000000000..935f20c044 --- /dev/null +++ b/apps/php-sdk/src/Models/MonitorCheck.php @@ -0,0 +1,77 @@ + $summary + * @param mixed $targetResults + * @param mixed $notificationStatus + */ + public function __construct( + private readonly ?string $id = null, + private readonly ?string $monitorId = null, + private readonly ?string $status = null, + private readonly ?string $trigger = null, + private readonly ?string $scheduledFor = null, + private readonly ?string $startedAt = null, + private readonly ?string $finishedAt = null, + private readonly ?int $estimatedCredits = null, + private readonly ?int 
$reservedCredits = null, + private readonly ?int $actualCredits = null, + private readonly ?string $billingStatus = null, + private readonly array $summary = [], + private readonly mixed $targetResults = null, + private readonly mixed $notificationStatus = null, + private readonly ?string $error = null, + private readonly ?string $createdAt = null, + private readonly ?string $updatedAt = null, + ) {} + + /** @param array $data */ + public static function fromArray(array $data): static + { + return new static( + id: isset($data['id']) ? (string) $data['id'] : null, + monitorId: isset($data['monitorId']) ? (string) $data['monitorId'] : null, + status: isset($data['status']) ? (string) $data['status'] : null, + trigger: isset($data['trigger']) ? (string) $data['trigger'] : null, + scheduledFor: isset($data['scheduledFor']) ? (string) $data['scheduledFor'] : null, + startedAt: isset($data['startedAt']) ? (string) $data['startedAt'] : null, + finishedAt: isset($data['finishedAt']) ? (string) $data['finishedAt'] : null, + estimatedCredits: isset($data['estimatedCredits']) ? (int) $data['estimatedCredits'] : null, + reservedCredits: isset($data['reservedCredits']) ? (int) $data['reservedCredits'] : null, + actualCredits: isset($data['actualCredits']) ? (int) $data['actualCredits'] : null, + billingStatus: isset($data['billingStatus']) ? (string) $data['billingStatus'] : null, + summary: isset($data['summary']) && is_array($data['summary']) ? $data['summary'] : [], + targetResults: $data['targetResults'] ?? null, + notificationStatus: $data['notificationStatus'] ?? null, + error: isset($data['error']) ? (string) $data['error'] : null, + createdAt: isset($data['createdAt']) ? (string) $data['createdAt'] : null, + updatedAt: isset($data['updatedAt']) ? 
(string) $data['updatedAt'] : null, + ); + } + + public function getId(): ?string { return $this->id; } + public function getMonitorId(): ?string { return $this->monitorId; } + public function getStatus(): ?string { return $this->status; } + public function getTrigger(): ?string { return $this->trigger; } + public function getScheduledFor(): ?string { return $this->scheduledFor; } + public function getStartedAt(): ?string { return $this->startedAt; } + public function getFinishedAt(): ?string { return $this->finishedAt; } + public function getEstimatedCredits(): ?int { return $this->estimatedCredits; } + public function getReservedCredits(): ?int { return $this->reservedCredits; } + public function getActualCredits(): ?int { return $this->actualCredits; } + public function getBillingStatus(): ?string { return $this->billingStatus; } + /** @return array */ + public function getSummary(): array { return $this->summary; } + public function getTargetResults(): mixed { return $this->targetResults; } + public function getNotificationStatus(): mixed { return $this->notificationStatus; } + public function getError(): ?string { return $this->error; } + public function getCreatedAt(): ?string { return $this->createdAt; } + public function getUpdatedAt(): ?string { return $this->updatedAt; } +} diff --git a/apps/php-sdk/src/Models/MonitorCheckDetail.php b/apps/php-sdk/src/Models/MonitorCheckDetail.php new file mode 100644 index 0000000000..f6709c81ee --- /dev/null +++ b/apps/php-sdk/src/Models/MonitorCheckDetail.php @@ -0,0 +1,86 @@ +> $pages + */ + public function __construct( + ?string $id = null, + ?string $monitorId = null, + ?string $status = null, + ?string $trigger = null, + ?string $scheduledFor = null, + ?string $startedAt = null, + ?string $finishedAt = null, + ?int $estimatedCredits = null, + ?int $reservedCredits = null, + ?int $actualCredits = null, + ?string $billingStatus = null, + array $summary = [], + mixed $targetResults = null, + mixed $notificationStatus 
= null, + ?string $error = null, + ?string $createdAt = null, + ?string $updatedAt = null, + private readonly array $pages = [], + private readonly ?string $next = null, + ) { + parent::__construct( + id: $id, + monitorId: $monitorId, + status: $status, + trigger: $trigger, + scheduledFor: $scheduledFor, + startedAt: $startedAt, + finishedAt: $finishedAt, + estimatedCredits: $estimatedCredits, + reservedCredits: $reservedCredits, + actualCredits: $actualCredits, + billingStatus: $billingStatus, + summary: $summary, + targetResults: $targetResults, + notificationStatus: $notificationStatus, + error: $error, + createdAt: $createdAt, + updatedAt: $updatedAt, + ); + } + + /** @param array $data */ + public static function fromArray(array $data): static + { + /** @var self $check */ + $check = new self( + id: isset($data['id']) ? (string) $data['id'] : null, + monitorId: isset($data['monitorId']) ? (string) $data['monitorId'] : null, + status: isset($data['status']) ? (string) $data['status'] : null, + trigger: isset($data['trigger']) ? (string) $data['trigger'] : null, + scheduledFor: isset($data['scheduledFor']) ? (string) $data['scheduledFor'] : null, + startedAt: isset($data['startedAt']) ? (string) $data['startedAt'] : null, + finishedAt: isset($data['finishedAt']) ? (string) $data['finishedAt'] : null, + estimatedCredits: isset($data['estimatedCredits']) ? (int) $data['estimatedCredits'] : null, + reservedCredits: isset($data['reservedCredits']) ? (int) $data['reservedCredits'] : null, + actualCredits: isset($data['actualCredits']) ? (int) $data['actualCredits'] : null, + billingStatus: isset($data['billingStatus']) ? (string) $data['billingStatus'] : null, + summary: isset($data['summary']) && is_array($data['summary']) ? $data['summary'] : [], + targetResults: $data['targetResults'] ?? null, + notificationStatus: $data['notificationStatus'] ?? null, + error: isset($data['error']) ? (string) $data['error'] : null, + createdAt: isset($data['createdAt']) ? 
(string) $data['createdAt'] : null, + updatedAt: isset($data['updatedAt']) ? (string) $data['updatedAt'] : null, + pages: isset($data['pages']) && is_array($data['pages']) ? $data['pages'] : [], + next: isset($data['next']) ? (string) $data['next'] : null, + ); + + return $check; + } + + /** @return list> */ + public function getPages(): array { return $this->pages; } + public function getNext(): ?string { return $this->next; } +} diff --git a/apps/php-sdk/src/Version.php b/apps/php-sdk/src/Version.php index e7da56ddc0..8fb71f9b46 100644 --- a/apps/php-sdk/src/Version.php +++ b/apps/php-sdk/src/Version.php @@ -6,5 +6,5 @@ final class Version { - public const SDK_VERSION = '1.1.1'; + public const SDK_VERSION = '1.2.0'; } diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py index e42f93db3d..8abe1b7573 100644 --- a/apps/python-sdk/firecrawl/__init__.py +++ b/apps/python-sdk/firecrawl/__init__.py @@ -17,7 +17,7 @@ V1ChangeTrackingOptions, ) -__version__ = "4.24.1" +__version__ = "4.25.0" # Define the logger for the Firecrawl project logger: logging.Logger = logging.getLogger("firecrawl") diff --git a/apps/python-sdk/firecrawl/client.py b/apps/python-sdk/firecrawl/client.py index 1c21a0a684..00fde14e08 100644 --- a/apps/python-sdk/firecrawl/client.py +++ b/apps/python-sdk/firecrawl/client.py @@ -92,6 +92,14 @@ def __init__(self, client_instance: Optional[V2FirecrawlClient]): self.get_batch_scrape_errors = client_instance.get_batch_scrape_errors self.map = client_instance.map + self.create_monitor = client_instance.create_monitor + self.list_monitors = client_instance.list_monitors + self.get_monitor = client_instance.get_monitor + self.update_monitor = client_instance.update_monitor + self.delete_monitor = client_instance.delete_monitor + self.run_monitor = client_instance.run_monitor + self.list_monitor_checks = client_instance.list_monitor_checks + self.get_monitor_check = client_instance.get_monitor_check self.get_concurrency = 
client_instance.get_concurrency self.get_credit_usage = client_instance.get_credit_usage self.get_token_usage = client_instance.get_token_usage @@ -172,6 +180,14 @@ def __init__(self, client_instance: Optional[AsyncFirecrawlClient] = None): self.get_batch_scrape_errors = client_instance.get_batch_scrape_errors self.map = client_instance.map + self.create_monitor = client_instance.create_monitor + self.list_monitors = client_instance.list_monitors + self.get_monitor = client_instance.get_monitor + self.update_monitor = client_instance.update_monitor + self.delete_monitor = client_instance.delete_monitor + self.run_monitor = client_instance.run_monitor + self.list_monitor_checks = client_instance.list_monitor_checks + self.get_monitor_check = client_instance.get_monitor_check self.get_concurrency = client_instance.get_concurrency self.get_credit_usage = client_instance.get_credit_usage self.get_token_usage = client_instance.get_token_usage @@ -242,6 +258,14 @@ def __init__( self.parse = self._v2_client.parse self.search = self._v2_client.search self.map = self._v2_client.map + self.create_monitor = self._v2_client.create_monitor + self.list_monitors = self._v2_client.list_monitors + self.get_monitor = self._v2_client.get_monitor + self.update_monitor = self._v2_client.update_monitor + self.delete_monitor = self._v2_client.delete_monitor + self.run_monitor = self._v2_client.run_monitor + self.list_monitor_checks = self._v2_client.list_monitor_checks + self.get_monitor_check = self._v2_client.get_monitor_check self.crawl = self._v2_client.crawl self.start_crawl = self._v2_client.start_crawl @@ -336,6 +360,14 @@ def __init__( self.parse = self._v2_client.parse self.search = self._v2_client.search self.map = self._v2_client.map + self.create_monitor = self._v2_client.create_monitor + self.list_monitors = self._v2_client.list_monitors + self.get_monitor = self._v2_client.get_monitor + self.update_monitor = self._v2_client.update_monitor + self.delete_monitor = 
self._v2_client.delete_monitor + self.run_monitor = self._v2_client.run_monitor + self.list_monitor_checks = self._v2_client.list_monitor_checks + self.get_monitor_check = self._v2_client.get_monitor_check self.start_crawl = self._v2_client.start_crawl self.get_crawl_status = self._v2_client.get_crawl_status diff --git a/apps/python-sdk/firecrawl/v2/client.py b/apps/python-sdk/firecrawl/v2/client.py index 88539b5425..17084ccefa 100644 --- a/apps/python-sdk/firecrawl/v2/client.py +++ b/apps/python-sdk/firecrawl/v2/client.py @@ -41,6 +41,14 @@ Location, PaginationConfig, AgentOptions, + Monitor, + MonitorCheck, + MonitorCheckDetail, + MonitorCreateRequest, + MonitorNotification, + MonitorSchedule, + MonitorTarget, + MonitorUpdateRequest, ) from .utils.http_client import HttpClient from .utils.error_handler import FirecrawlError @@ -55,6 +63,7 @@ from .methods import extract as extract_module from .methods import agent as agent_module from .methods import browser as browser_module +from .methods import monitor as monitor_module from .watcher import Watcher class FirecrawlClient: @@ -655,6 +664,112 @@ def map( ) if any(v is not None for v in [search, include_subdomains, ignore_query_parameters, limit, sitemap, timeout, integration, location]) else None return map_module.map(self.http_client, url, options) + + def create_monitor( + self, + name: str, + schedule: Union[MonitorSchedule, Dict[str, Any]], + targets: List[Union[MonitorTarget, Dict[str, Any]]], + *, + webhook: Optional[WebhookConfig] = None, + notification: Optional[MonitorNotification] = None, + retention_days: Optional[int] = None, + ) -> Monitor: + """Create a scheduled monitor.""" + if isinstance(schedule, dict): + schedule = MonitorSchedule(**schedule) + request = MonitorCreateRequest( + name=name, + schedule=schedule, + targets=targets, + webhook=webhook, + notification=notification, + retention_days=retention_days, + ) + return monitor_module.create_monitor(self.http_client, request) + + def 
list_monitors( + self, + *, + limit: Optional[int] = None, + offset: Optional[int] = None, + ) -> List[Monitor]: + """List monitors for the authenticated team.""" + return monitor_module.list_monitors(self.http_client, limit=limit, offset=offset) + + def get_monitor(self, monitor_id: str) -> Monitor: + """Get a monitor by ID.""" + return monitor_module.get_monitor(self.http_client, monitor_id) + + def update_monitor( + self, + monitor_id: str, + *, + name: Optional[str] = None, + status: Optional[Literal["active", "paused"]] = None, + schedule: Optional[Union[MonitorSchedule, Dict[str, Any]]] = None, + webhook: Optional[Union[WebhookConfig, Dict[str, Any]]] = None, + notification: Optional[Union[MonitorNotification, Dict[str, Any]]] = None, + targets: Optional[List[Union[MonitorTarget, Dict[str, Any]]]] = None, + retention_days: Optional[int] = None, + ) -> Monitor: + """Update a monitor.""" + if isinstance(schedule, dict): + schedule = MonitorSchedule(**schedule) + request = MonitorUpdateRequest( + name=name, + status=status, + schedule=schedule, + webhook=webhook, + notification=notification, + targets=targets, + retention_days=retention_days, + ) + return monitor_module.update_monitor(self.http_client, monitor_id, request) + + def delete_monitor(self, monitor_id: str) -> bool: + """Delete a monitor.""" + return monitor_module.delete_monitor(self.http_client, monitor_id) + + def run_monitor(self, monitor_id: str) -> MonitorCheck: + """Run a monitor manually.""" + return monitor_module.run_monitor(self.http_client, monitor_id) + + def list_monitor_checks( + self, + monitor_id: str, + *, + limit: Optional[int] = None, + offset: Optional[int] = None, + ) -> List[MonitorCheck]: + """List checks for a monitor.""" + return monitor_module.list_monitor_checks( + self.http_client, + monitor_id, + limit=limit, + offset=offset, + ) + + def get_monitor_check( + self, + monitor_id: str, + check_id: str, + *, + limit: Optional[int] = None, + skip: Optional[int] = None, + 
status: Optional[Literal["same", "new", "changed", "removed", "error"]] = None, + pagination_config: Optional[PaginationConfig] = None, + ) -> MonitorCheckDetail: + """Get a monitor check with page results, auto-paginated by default.""" + return monitor_module.get_monitor_check( + self.http_client, + monitor_id, + check_id, + limit=limit, + skip=skip, + status=status, + pagination_config=pagination_config, + ) def cancel_crawl(self, crawl_id: str) -> bool: """ diff --git a/apps/python-sdk/firecrawl/v2/client_async.py b/apps/python-sdk/firecrawl/v2/client_async.py index b41da8eff3..834ac44071 100644 --- a/apps/python-sdk/firecrawl/v2/client_async.py +++ b/apps/python-sdk/firecrawl/v2/client_async.py @@ -36,6 +36,14 @@ PDFAction, Location, PaginationConfig, + Monitor, + MonitorCheck, + MonitorCheckDetail, + MonitorCreateRequest, + MonitorNotification, + MonitorSchedule, + MonitorTarget, + MonitorUpdateRequest, ) from .utils.http_client import HttpClient from .utils.http_client_async import AsyncHttpClient @@ -50,6 +58,7 @@ from .methods.aio import extract as async_extract # type: ignore[attr-defined] from .methods.aio import agent as async_agent # type: ignore[attr-defined] from .methods.aio import browser as async_browser # type: ignore[attr-defined] +from .methods.aio import monitor as async_monitor # type: ignore[attr-defined] from .watcher_async import AsyncWatcher @@ -332,6 +341,112 @@ async def map( ) if any(v is not None for v in [search, include_subdomains, limit, sitemap, integration, timeout]) else None return await async_map.map(self.async_http_client, url, options) + async def create_monitor( + self, + name: str, + schedule: Union[MonitorSchedule, Dict[str, Any]], + targets: List[Union[MonitorTarget, Dict[str, Any]]], + *, + webhook: Optional[WebhookConfig] = None, + notification: Optional[MonitorNotification] = None, + retention_days: Optional[int] = None, + ) -> Monitor: + if isinstance(schedule, dict): + schedule = MonitorSchedule(**schedule) + request 
= MonitorCreateRequest( + name=name, + schedule=schedule, + targets=targets, + webhook=webhook, + notification=notification, + retention_days=retention_days, + ) + return await async_monitor.create_monitor(self.async_http_client, request) + + async def list_monitors( + self, + *, + limit: Optional[int] = None, + offset: Optional[int] = None, + ) -> List[Monitor]: + return await async_monitor.list_monitors( + self.async_http_client, + limit=limit, + offset=offset, + ) + + async def get_monitor(self, monitor_id: str) -> Monitor: + return await async_monitor.get_monitor(self.async_http_client, monitor_id) + + async def update_monitor( + self, + monitor_id: str, + *, + name: Optional[str] = None, + status: Optional[Literal["active", "paused"]] = None, + schedule: Optional[Union[MonitorSchedule, Dict[str, Any]]] = None, + webhook: Optional[Union[WebhookConfig, Dict[str, Any]]] = None, + notification: Optional[Union[MonitorNotification, Dict[str, Any]]] = None, + targets: Optional[List[Union[MonitorTarget, Dict[str, Any]]]] = None, + retention_days: Optional[int] = None, + ) -> Monitor: + if isinstance(schedule, dict): + schedule = MonitorSchedule(**schedule) + request = MonitorUpdateRequest( + name=name, + status=status, + schedule=schedule, + webhook=webhook, + notification=notification, + targets=targets, + retention_days=retention_days, + ) + return await async_monitor.update_monitor( + self.async_http_client, + monitor_id, + request, + ) + + async def delete_monitor(self, monitor_id: str) -> bool: + return await async_monitor.delete_monitor(self.async_http_client, monitor_id) + + async def run_monitor(self, monitor_id: str) -> MonitorCheck: + return await async_monitor.run_monitor(self.async_http_client, monitor_id) + + async def list_monitor_checks( + self, + monitor_id: str, + *, + limit: Optional[int] = None, + offset: Optional[int] = None, + ) -> List[MonitorCheck]: + return await async_monitor.list_monitor_checks( + self.async_http_client, + monitor_id, + 
limit=limit, + offset=offset, + ) + + async def get_monitor_check( + self, + monitor_id: str, + check_id: str, + *, + limit: Optional[int] = None, + skip: Optional[int] = None, + status: Optional[Literal["same", "new", "changed", "removed", "error"]] = None, + pagination_config: Optional[PaginationConfig] = None, + ) -> MonitorCheckDetail: + return await async_monitor.get_monitor_check( + self.async_http_client, + monitor_id, + check_id, + limit=limit, + skip=skip, + status=status, + pagination_config=pagination_config, + ) + async def start_batch_scrape(self, urls: List[str], **kwargs) -> Any: return await async_batch.start_batch_scrape(self.async_http_client, urls, **kwargs) diff --git a/apps/python-sdk/firecrawl/v2/methods/aio/monitor.py b/apps/python-sdk/firecrawl/v2/methods/aio/monitor.py new file mode 100644 index 0000000000..fe98ffa05f --- /dev/null +++ b/apps/python-sdk/firecrawl/v2/methods/aio/monitor.py @@ -0,0 +1,215 @@ +import time +from typing import Any, Dict, List, Optional +from pydantic import BaseModel + +from ...types import ( + Monitor, + MonitorCheck, + MonitorCheckDetail, + MonitorCheckPage, + MonitorCreateRequest, + PaginationConfig, + MonitorTarget, + MonitorUpdateRequest, + ScrapeOptions, +) +from ...utils.error_handler import handle_response_error +from ...utils.http_client_async import AsyncHttpClient +from ...utils.validation import prepare_scrape_options + + +def _dump(value: Any) -> Any: + if isinstance(value, ScrapeOptions): + return prepare_scrape_options(value) + if isinstance(value, MonitorTarget): + data = value.model_dump(exclude_none=True, by_alias=True) + if isinstance(value.scrape_options, ScrapeOptions): + data["scrapeOptions"] = prepare_scrape_options(value.scrape_options) + return _prepare_target(data) + if isinstance(value, BaseModel): + return value.model_dump(exclude_none=True, by_alias=True) + if isinstance(value, list): + return [_dump(item) for item in value] + if isinstance(value, dict): + return {key: _dump(item) 
for key, item in value.items() if item is not None} + return value + + +def _prepare_target(target: Dict[str, Any]) -> Dict[str, Any]: + prepared = dict(target) + if "scrapeOptions" in prepared and isinstance(prepared["scrapeOptions"], ScrapeOptions): + prepared["scrapeOptions"] = prepare_scrape_options(prepared["scrapeOptions"]) + if "crawlOptions" in prepared: + prepared["crawlOptions"] = _dump(prepared["crawlOptions"]) + return prepared + + +def _prepare_payload(request: Any) -> Dict[str, Any]: + payload = _dump(request) + if not isinstance(payload, dict): + raise ValueError("Monitor request must be an object") + if "targets" in payload: + payload["targets"] = [ + _prepare_target(_dump(target)) + for target in payload.get("targets", []) + ] + return payload + + +async def _data_or_error(response, action: str) -> Any: + if response.status_code >= 400: + handle_response_error(response, action) + body = response.json() + if not body.get("success"): + raise Exception(body.get("error", "Unknown error occurred")) + return body.get("data") + + +async def _monitor_check_data_or_error(response, action: str) -> Dict[str, Any]: + if response.status_code >= 400: + handle_response_error(response, action) + body = response.json() + if not body.get("success"): + raise Exception(body.get("error", "Unknown error occurred")) + data = body.get("data") or {} + if body.get("next") is not None: + data["next"] = body.get("next") + return data + + +async def _fetch_all_monitor_check_pages( + client: AsyncHttpClient, + next_url: str, + initial_pages: List[MonitorCheckPage], + pagination_config: Optional[PaginationConfig] = None, +) -> List[MonitorCheckPage]: + pages = initial_pages.copy() + current_url = next_url + page_count = 0 + max_pages = pagination_config.max_pages if pagination_config else None + max_results = pagination_config.max_results if pagination_config else None + max_wait_time = pagination_config.max_wait_time if pagination_config else None + start_time = 
time.monotonic() + + while current_url: + if max_pages is not None and page_count >= max_pages: + break + if max_wait_time is not None and (time.monotonic() - start_time) > max_wait_time: + break + + response = await client.get(current_url) + if response.status_code >= 400: + break + try: + data = await _monitor_check_data_or_error(response, "get monitor check page") + except Exception: + break + + for page in data.get("pages") or []: + if max_results is not None and len(pages) >= max_results: + break + pages.append(MonitorCheckPage(**page)) + + if max_results is not None and len(pages) >= max_results: + break + + current_url = data.get("next") + page_count += 1 + + return pages + + +async def create_monitor(client: AsyncHttpClient, request: MonitorCreateRequest) -> Monitor: + data = await _data_or_error(await client.post("/v2/monitor", _prepare_payload(request)), "create monitor") + return Monitor(**data) + + +async def list_monitors(client: AsyncHttpClient, *, limit: Optional[int] = None, offset: Optional[int] = None) -> List[Monitor]: + params = [] + if limit is not None: + params.append(f"limit={limit}") + if offset is not None: + params.append(f"offset={offset}") + suffix = f"?{'&'.join(params)}" if params else "" + data = await _data_or_error(await client.get(f"/v2/monitor{suffix}"), "list monitors") + return [Monitor(**item) for item in data or []] + + +async def get_monitor(client: AsyncHttpClient, monitor_id: str) -> Monitor: + data = await _data_or_error(await client.get(f"/v2/monitor/{monitor_id}"), "get monitor") + return Monitor(**data) + + +async def update_monitor(client: AsyncHttpClient, monitor_id: str, request: MonitorUpdateRequest) -> Monitor: + data = await _data_or_error(await client.patch(f"/v2/monitor/{monitor_id}", _prepare_payload(request)), "update monitor") + return Monitor(**data) + + +async def delete_monitor(client: AsyncHttpClient, monitor_id: str) -> bool: + response = await client.delete(f"/v2/monitor/{monitor_id}") + if 
response.status_code >= 400: + handle_response_error(response, "delete monitor") + body = response.json() + if not body.get("success"): + raise Exception(body.get("error", "Unknown error occurred")) + return True + + +async def run_monitor(client: AsyncHttpClient, monitor_id: str) -> MonitorCheck: + data = await _data_or_error(await client.post(f"/v2/monitor/{monitor_id}/run", {}), "run monitor") + return MonitorCheck(**data) + + +async def list_monitor_checks( + client: AsyncHttpClient, + monitor_id: str, + *, + limit: Optional[int] = None, + offset: Optional[int] = None, +) -> List[MonitorCheck]: + params = [] + if limit is not None: + params.append(f"limit={limit}") + if offset is not None: + params.append(f"offset={offset}") + suffix = f"?{'&'.join(params)}" if params else "" + data = await _data_or_error(await client.get(f"/v2/monitor/{monitor_id}/checks{suffix}"), "list monitor checks") + return [MonitorCheck(**item) for item in data or []] + + +async def get_monitor_check( + client: AsyncHttpClient, + monitor_id: str, + check_id: str, + *, + limit: Optional[int] = None, + skip: Optional[int] = None, + status: Optional[str] = None, + pagination_config: Optional[PaginationConfig] = None, +) -> MonitorCheckDetail: + params = [] + if limit is not None: + params.append(f"limit={limit}") + if skip is not None: + params.append(f"skip={skip}") + if status is not None: + params.append(f"status={status}") + suffix = f"?{'&'.join(params)}" if params else "" + data = await _monitor_check_data_or_error(await client.get(f"/v2/monitor/{monitor_id}/checks/{check_id}{suffix}"), "get monitor check") + detail = MonitorCheckDetail(**data) + + auto_paginate = pagination_config.auto_paginate if pagination_config else True + if auto_paginate and detail.next and not ( + pagination_config + and pagination_config.max_results is not None + and len(detail.pages) >= pagination_config.max_results + ): + detail.pages = await _fetch_all_monitor_check_pages( + client, + detail.next, + 
detail.pages, + pagination_config, + ) + detail.next = None + + return detail diff --git a/apps/python-sdk/firecrawl/v2/methods/monitor.py b/apps/python-sdk/firecrawl/v2/methods/monitor.py new file mode 100644 index 0000000000..2164206b01 --- /dev/null +++ b/apps/python-sdk/firecrawl/v2/methods/monitor.py @@ -0,0 +1,214 @@ +import time +from typing import Any, Dict, List, Optional +from pydantic import BaseModel + +from ..types import ( + Monitor, + MonitorCheck, + MonitorCheckDetail, + MonitorCheckPage, + MonitorCreateRequest, + PaginationConfig, + MonitorTarget, + MonitorUpdateRequest, + ScrapeOptions, +) +from ..utils import HttpClient, handle_response_error +from ..utils.validation import prepare_scrape_options + + +def _dump(value: Any) -> Any: + if isinstance(value, ScrapeOptions): + return prepare_scrape_options(value) + if isinstance(value, MonitorTarget): + data = value.model_dump(exclude_none=True, by_alias=True) + if isinstance(value.scrape_options, ScrapeOptions): + data["scrapeOptions"] = prepare_scrape_options(value.scrape_options) + return _prepare_target(data) + if isinstance(value, BaseModel): + return value.model_dump(exclude_none=True, by_alias=True) + if isinstance(value, list): + return [_dump(item) for item in value] + if isinstance(value, dict): + return {key: _dump(item) for key, item in value.items() if item is not None} + return value + + +def _prepare_target(target: Dict[str, Any]) -> Dict[str, Any]: + prepared = dict(target) + if "scrapeOptions" in prepared and isinstance(prepared["scrapeOptions"], ScrapeOptions): + prepared["scrapeOptions"] = prepare_scrape_options(prepared["scrapeOptions"]) + if "crawlOptions" in prepared: + prepared["crawlOptions"] = _dump(prepared["crawlOptions"]) + return prepared + + +def _prepare_payload(request: Any) -> Dict[str, Any]: + payload = _dump(request) + if not isinstance(payload, dict): + raise ValueError("Monitor request must be an object") + if "targets" in payload: + payload["targets"] = [ + 
_prepare_target(_dump(target)) + for target in payload.get("targets", []) + ] + return payload + + +def _data_or_error(response, action: str) -> Any: + if not response.ok: + handle_response_error(response, action) + body = response.json() + if not body.get("success"): + raise Exception(body.get("error", "Unknown error occurred")) + return body.get("data") + + +def _monitor_check_data_or_error(response, action: str) -> Dict[str, Any]: + if not response.ok: + handle_response_error(response, action) + body = response.json() + if not body.get("success"): + raise Exception(body.get("error", "Unknown error occurred")) + data = body.get("data") or {} + if body.get("next") is not None: + data["next"] = body.get("next") + return data + + +def _fetch_all_monitor_check_pages( + client: HttpClient, + next_url: str, + initial_pages: List[MonitorCheckPage], + pagination_config: Optional[PaginationConfig] = None, +) -> List[MonitorCheckPage]: + pages = initial_pages.copy() + current_url = next_url + page_count = 0 + max_pages = pagination_config.max_pages if pagination_config else None + max_results = pagination_config.max_results if pagination_config else None + max_wait_time = pagination_config.max_wait_time if pagination_config else None + start_time = time.monotonic() + + while current_url: + if max_pages is not None and page_count >= max_pages: + break + if max_wait_time is not None and (time.monotonic() - start_time) > max_wait_time: + break + + response = client.get(current_url) + if not response.ok: + break + try: + data = _monitor_check_data_or_error(response, "get monitor check page") + except Exception: + break + + for page in data.get("pages") or []: + if max_results is not None and len(pages) >= max_results: + break + pages.append(MonitorCheckPage(**page)) + + if max_results is not None and len(pages) >= max_results: + break + + current_url = data.get("next") + page_count += 1 + + return pages + + +def create_monitor(client: HttpClient, request: MonitorCreateRequest) 
-> Monitor: + data = _data_or_error(client.post("/v2/monitor", _prepare_payload(request)), "create monitor") + return Monitor(**data) + + +def list_monitors(client: HttpClient, *, limit: Optional[int] = None, offset: Optional[int] = None) -> List[Monitor]: + params = [] + if limit is not None: + params.append(f"limit={limit}") + if offset is not None: + params.append(f"offset={offset}") + suffix = f"?{'&'.join(params)}" if params else "" + data = _data_or_error(client.get(f"/v2/monitor{suffix}"), "list monitors") + return [Monitor(**item) for item in data or []] + + +def get_monitor(client: HttpClient, monitor_id: str) -> Monitor: + data = _data_or_error(client.get(f"/v2/monitor/{monitor_id}"), "get monitor") + return Monitor(**data) + + +def update_monitor(client: HttpClient, monitor_id: str, request: MonitorUpdateRequest) -> Monitor: + data = _data_or_error(client.patch(f"/v2/monitor/{monitor_id}", _prepare_payload(request)), "update monitor") + return Monitor(**data) + + +def delete_monitor(client: HttpClient, monitor_id: str) -> bool: + response = client.delete(f"/v2/monitor/{monitor_id}") + if not response.ok: + handle_response_error(response, "delete monitor") + body = response.json() + if not body.get("success"): + raise Exception(body.get("error", "Unknown error occurred")) + return True + + +def run_monitor(client: HttpClient, monitor_id: str) -> MonitorCheck: + data = _data_or_error(client.post(f"/v2/monitor/{monitor_id}/run", {}), "run monitor") + return MonitorCheck(**data) + + +def list_monitor_checks( + client: HttpClient, + monitor_id: str, + *, + limit: Optional[int] = None, + offset: Optional[int] = None, +) -> List[MonitorCheck]: + params = [] + if limit is not None: + params.append(f"limit={limit}") + if offset is not None: + params.append(f"offset={offset}") + suffix = f"?{'&'.join(params)}" if params else "" + data = _data_or_error(client.get(f"/v2/monitor/{monitor_id}/checks{suffix}"), "list monitor checks") + return [MonitorCheck(**item) for 
item in data or []] + + +def get_monitor_check( + client: HttpClient, + monitor_id: str, + check_id: str, + *, + limit: Optional[int] = None, + skip: Optional[int] = None, + status: Optional[str] = None, + pagination_config: Optional[PaginationConfig] = None, +) -> MonitorCheckDetail: + params = [] + if limit is not None: + params.append(f"limit={limit}") + if skip is not None: + params.append(f"skip={skip}") + if status is not None: + params.append(f"status={status}") + suffix = f"?{'&'.join(params)}" if params else "" + data = _monitor_check_data_or_error(client.get(f"/v2/monitor/{monitor_id}/checks/{check_id}{suffix}"), "get monitor check") + detail = MonitorCheckDetail(**data) + + auto_paginate = pagination_config.auto_paginate if pagination_config else True + if auto_paginate and detail.next and not ( + pagination_config + and pagination_config.max_results is not None + and len(detail.pages) >= pagination_config.max_results + ): + detail.pages = _fetch_all_monitor_check_pages( + client, + detail.next, + detail.pages, + pagination_config, + ) + detail.next = None + + return detail diff --git a/apps/python-sdk/firecrawl/v2/types.py b/apps/python-sdk/firecrawl/v2/types.py index 9b06f0341a..a2fe2368f2 100644 --- a/apps/python-sdk/firecrawl/v2/types.py +++ b/apps/python-sdk/firecrawl/v2/types.py @@ -797,6 +797,136 @@ class MapResponse(BaseResponse[MapData]): pass +# Monitor types +class MonitorSchedule(BaseModel): + """Cron schedule for a monitor.""" + + cron: str + timezone: str = "UTC" + + +class MonitorEmailNotification(BaseModel): + enabled: bool = False + recipients: List[str] = [] + include_diffs: bool = Field(default=False, alias="includeDiffs") + + model_config = {"populate_by_name": True} + + +class MonitorNotification(BaseModel): + email: Optional[MonitorEmailNotification] = None + + +class MonitorTarget(BaseModel): + """A scrape or crawl target stored on a monitor.""" + + model_config = {"extra": "allow", "populate_by_name": True} + + id: Optional[str] = 
None + type: Literal["scrape", "crawl"] + urls: Optional[List[str]] = None + url: Optional[str] = None + scrape_options: Optional[Union[ScrapeOptions, Dict[str, Any]]] = Field(default=None, alias="scrapeOptions") + crawl_options: Optional[Dict[str, Any]] = Field(default=None, alias="crawlOptions") + + +class MonitorCreateRequest(BaseModel): + model_config = {"populate_by_name": True} + + name: str + schedule: MonitorSchedule + webhook: Optional[WebhookConfig] = None + notification: Optional[MonitorNotification] = None + targets: List[Union[MonitorTarget, Dict[str, Any]]] + retention_days: Optional[int] = Field(default=None, alias="retentionDays") + + +class MonitorUpdateRequest(BaseModel): + model_config = {"populate_by_name": True} + + name: Optional[str] = None + status: Optional[Literal["active", "paused"]] = None + schedule: Optional[MonitorSchedule] = None + webhook: Optional[Union[WebhookConfig, Dict[str, Any]]] = None + notification: Optional[Union[MonitorNotification, Dict[str, Any]]] = None + targets: Optional[List[Union[MonitorTarget, Dict[str, Any]]]] = None + retention_days: Optional[int] = Field(default=None, alias="retentionDays") + + +class MonitorSummary(BaseModel): + total_pages: int = Field(default=0, alias="totalPages") + same: int = 0 + changed: int = 0 + new: int = 0 + removed: int = 0 + error: int = 0 + + model_config = {"populate_by_name": True} + + +class Monitor(BaseModel): + model_config = {"populate_by_name": True, "extra": "allow"} + + id: str + name: str + status: Literal["active", "paused", "deleted"] + schedule: MonitorSchedule + next_run_at: Optional[str] = Field(default=None, alias="nextRunAt") + last_run_at: Optional[str] = Field(default=None, alias="lastRunAt") + current_check_id: Optional[str] = Field(default=None, alias="currentCheckId") + targets: List[Dict[str, Any]] + webhook: Optional[Dict[str, Any]] = None + notification: Optional[Dict[str, Any]] = None + retention_days: int = Field(alias="retentionDays") + 
estimated_credits_per_month: Optional[int] = Field(default=None, alias="estimatedCreditsPerMonth") + last_check_summary: Optional[MonitorSummary] = Field(default=None, alias="lastCheckSummary") + created_at: str = Field(alias="createdAt") + updated_at: str = Field(alias="updatedAt") + + +class MonitorCheck(BaseModel): + model_config = {"populate_by_name": True, "extra": "allow"} + + id: str + monitor_id: str = Field(alias="monitorId") + status: Literal["queued", "running", "completed", "failed", "partial", "skipped_overlap"] + trigger: Literal["scheduled", "manual"] + scheduled_for: Optional[str] = Field(default=None, alias="scheduledFor") + started_at: Optional[str] = Field(default=None, alias="startedAt") + finished_at: Optional[str] = Field(default=None, alias="finishedAt") + estimated_credits: Optional[int] = Field(default=None, alias="estimatedCredits") + reserved_credits: Optional[int] = Field(default=None, alias="reservedCredits") + actual_credits: Optional[int] = Field(default=None, alias="actualCredits") + billing_status: Literal["not_applicable", "reserved", "confirmed", "released", "failed"] = Field(alias="billingStatus") + summary: MonitorSummary + target_results: Optional[Any] = Field(default=None, alias="targetResults") + notification_status: Optional[Any] = Field(default=None, alias="notificationStatus") + error: Optional[str] = None + created_at: str = Field(alias="createdAt") + updated_at: str = Field(alias="updatedAt") + + +class MonitorCheckPage(BaseModel): + model_config = {"populate_by_name": True, "extra": "allow"} + + id: str + target_id: str = Field(alias="targetId") + url: str + status: Literal["same", "new", "changed", "removed", "error"] + previous_scrape_id: Optional[str] = Field(default=None, alias="previousScrapeId") + current_scrape_id: Optional[str] = Field(default=None, alias="currentScrapeId") + status_code: Optional[int] = Field(default=None, alias="statusCode") + error: Optional[str] = None + metadata: Optional[Any] = None + 
diff: Optional[Any] = None + created_at: str = Field(alias="createdAt") + + +class MonitorCheckDetail(MonitorCheck): + pages: List[MonitorCheckPage] = [] + next: Optional[str] = None + + # Extract types class ExtractRequest(BaseModel): """Request for extract operations.""" diff --git a/apps/python-sdk/firecrawl/v2/utils/http_client.py b/apps/python-sdk/firecrawl/v2/utils/http_client.py index cc41682465..7a75ddbb3e 100644 --- a/apps/python-sdk/firecrawl/v2/utils/http_client.py +++ b/apps/python-sdk/firecrawl/v2/utils/http_client.py @@ -263,4 +263,50 @@ def delete( time.sleep(backoff_factor * (2 ** attempt)) # This should never be reached due to the exception handling above - raise last_exception or Exception("Unexpected error in DELETE request") \ No newline at end of file + raise last_exception or Exception("Unexpected error in DELETE request") + + def patch( + self, + endpoint: str, + data: Dict[str, Any], + headers: Optional[Dict[str, str]] = None, + timeout: Optional[float] = None, + retries: Optional[int] = None, + backoff_factor: Optional[float] = None, + ) -> requests.Response: + """Make a PATCH request with retry logic.""" + if headers is None: + headers = self._prepare_headers() + if timeout is None: + timeout = self.timeout + if retries is None: + retries = self.max_retries + if backoff_factor is None: + backoff_factor = self.backoff_factor + + payload = dict(data) + payload['origin'] = f'python-sdk@{version}' + url = self._build_url(endpoint) + + last_exception = None + num_attempts = max(1, retries) + + for attempt in range(num_attempts): + try: + response = requests.patch( + url, + json=payload, + headers=headers, + timeout=timeout + ) + if response.status_code == 502 and attempt < num_attempts - 1: + time.sleep(backoff_factor * (2 ** attempt)) + continue + return response + except requests.RequestException as e: + last_exception = e + if attempt == num_attempts - 1: + raise e + time.sleep(backoff_factor * (2 ** attempt)) + + raise last_exception or 
Exception("Unexpected error in PATCH request") \ No newline at end of file diff --git a/apps/python-sdk/firecrawl/v2/utils/http_client_async.py b/apps/python-sdk/firecrawl/v2/utils/http_client_async.py index 9c8e7dec1e..52d14623f8 100644 --- a/apps/python-sdk/firecrawl/v2/utils/http_client_async.py +++ b/apps/python-sdk/firecrawl/v2/utils/http_client_async.py @@ -201,3 +201,45 @@ async def delete( await asyncio.sleep(backoff_factor * (2 ** attempt)) raise last_exception or Exception("Unexpected error in DELETE request") + + async def patch( + self, + endpoint: str, + data: Dict[str, Any], + headers: Optional[Dict[str, str]] = None, + timeout: Optional[float] = None, + retries: Optional[int] = None, + backoff_factor: Optional[float] = None, + ) -> httpx.Response: + if timeout is None: + timeout = self.timeout + if retries is None: + retries = self.max_retries + if backoff_factor is None: + backoff_factor = self.backoff_factor + + payload = dict(data) + payload["origin"] = f"python-sdk@{version}" + + last_exception = None + num_attempts = max(1, retries) + + for attempt in range(num_attempts): + try: + response = await self._client.patch( + endpoint, + json=payload, + headers={**self._headers(), **(headers or {})}, + timeout=timeout, + ) + if response.status_code == 502 and attempt < num_attempts - 1: + await asyncio.sleep(backoff_factor * (2 ** attempt)) + continue + return response + except httpx.HTTPError as e: + last_exception = e + if attempt == num_attempts - 1: + raise e + await asyncio.sleep(backoff_factor * (2 ** attempt)) + + raise last_exception or Exception("Unexpected error in PATCH request") diff --git a/apps/ruby-sdk/lib/firecrawl.rb b/apps/ruby-sdk/lib/firecrawl.rb index 79af432682..64cd6dd5ae 100644 --- a/apps/ruby-sdk/lib/firecrawl.rb +++ b/apps/ruby-sdk/lib/firecrawl.rb @@ -23,4 +23,5 @@ require_relative "firecrawl/models/agent_status_response" require_relative "firecrawl/models/concurrency_check" require_relative "firecrawl/models/credit_usage" 
+require_relative "firecrawl/models/monitor" require_relative "firecrawl/client" diff --git a/apps/ruby-sdk/lib/firecrawl/client.rb b/apps/ruby-sdk/lib/firecrawl/client.rb index 635c1bbbae..7eaff83254 100644 --- a/apps/ruby-sdk/lib/firecrawl/client.rb +++ b/apps/ruby-sdk/lib/firecrawl/client.rb @@ -1,6 +1,7 @@ # frozen_string_literal: true require "json" +require "uri" module Firecrawl # Client for the Firecrawl v2 API. @@ -280,6 +281,83 @@ def map(url, options = nil) Models::MapData.new(data) end + # ================================================================ + # MONITOR + # ================================================================ + + def create_monitor(name:, schedule:, targets:, webhook: nil, notification: nil, retention_days: nil) + body = { + "name" => name, + "schedule" => schedule, + "targets" => targets, + "webhook" => webhook, + "notification" => notification, + "retentionDays" => retention_days, + }.compact + raw = @http.post("/v2/monitor", body) + Models::Monitor.new(raw["data"] || raw) + end + + def list_monitors(limit: nil, offset: nil) + raw = @http.get("/v2/monitor#{query(limit: limit, offset: offset)}") + (raw["data"] || []).map { |item| Models::Monitor.new(item) } + end + + def get_monitor(monitor_id) + raise ArgumentError, "Monitor ID is required" if monitor_id.nil? + + raw = @http.get("/v2/monitor/#{monitor_id}") + Models::Monitor.new(raw["data"] || raw) + end + + def update_monitor(monitor_id, **attrs) + raise ArgumentError, "Monitor ID is required" if monitor_id.nil? + + body = { + "name" => attrs[:name], + "status" => attrs[:status], + "schedule" => attrs[:schedule], + "webhook" => attrs[:webhook], + "notification" => attrs[:notification], + "targets" => attrs[:targets], + "retentionDays" => attrs[:retention_days], + }.compact + raw = @http.patch("/v2/monitor/#{monitor_id}", body) + Models::Monitor.new(raw["data"] || raw) + end + + def delete_monitor(monitor_id) + raise ArgumentError, "Monitor ID is required" if monitor_id.nil? 
+ + @http.delete("/v2/monitor/#{monitor_id}")["success"] == true + end + + def run_monitor(monitor_id) + raise ArgumentError, "Monitor ID is required" if monitor_id.nil? + + raw = @http.post("/v2/monitor/#{monitor_id}/run", {}) + Models::MonitorCheck.new(raw["data"] || raw) + end + + def list_monitor_checks(monitor_id, limit: nil, offset: nil) + raise ArgumentError, "Monitor ID is required" if monitor_id.nil? + + raw = @http.get("/v2/monitor/#{monitor_id}/checks#{query(limit: limit, offset: offset)}") + (raw["data"] || []).map { |item| Models::MonitorCheck.new(item) } + end + + def get_monitor_check(monitor_id, check_id, limit: nil, skip: nil, status: nil, auto_paginate: true) + raise ArgumentError, "Monitor ID is required" if monitor_id.nil? + raise ArgumentError, "Check ID is required" if check_id.nil? + + params = query(limit: limit, skip: skip, status: status) + raw = @http.get("/v2/monitor/#{monitor_id}/checks/#{check_id}#{params}") + data = raw["data"] || raw + data["next"] = raw["next"] if raw["next"] + check = Models::MonitorCheckDetail.new(data) + auto_paginate ? paginate_monitor_check(check) : check + end + # ================================================================ # SEARCH # ================================================================ @@ -378,6 +456,11 @@ def get_credit_usage private + def query(**params) + compact = params.compact + compact.empty? ? "" : "?#{URI.encode_www_form(compact)}" + end + def poll_crawl(job_id, poll_interval, timeout) deadline = Time.now + timeout while Time.now < deadline @@ -423,5 +506,20 @@ def paginate_batch_scrape(job) end job end + + def paginate_monitor_check(check) + check.pages ||= [] + current = check + while current.next_url && !current.next_url.empty? + raw = @http.get_absolute(current.next_url) + data = raw["data"] || raw + data["next"] = raw["next"] if raw["next"] + next_page = Models::MonitorCheckDetail.new(data) + check.pages.concat(next_page.pages) unless next_page.pages.empty? 
+ current = next_page + end + check.next_url = nil + check + end end end diff --git a/apps/ruby-sdk/lib/firecrawl/http_client.rb b/apps/ruby-sdk/lib/firecrawl/http_client.rb index 248619e5d1..4d1a819e09 100644 --- a/apps/ruby-sdk/lib/firecrawl/http_client.rb +++ b/apps/ruby-sdk/lib/firecrawl/http_client.rb @@ -59,6 +59,16 @@ def delete(path) execute_with_retry(uri, request) end + # Sends a PATCH request with JSON body. + def patch(path, body) + uri = URI("#{@base_url}#{path}") + request = Net::HTTP::Patch.new(uri) + request["Authorization"] = "Bearer #{@api_key}" + request["Content-Type"] = "application/json" + request.body = JSON.generate(body) + execute_with_retry(uri, request) + end + # Sends a POST request with a multipart/form-data body. # # @param path [String] API path diff --git a/apps/ruby-sdk/lib/firecrawl/models/monitor.rb b/apps/ruby-sdk/lib/firecrawl/models/monitor.rb new file mode 100644 index 0000000000..8948baf23d --- /dev/null +++ b/apps/ruby-sdk/lib/firecrawl/models/monitor.rb @@ -0,0 +1,68 @@ +# frozen_string_literal: true + +module Firecrawl + module Models + class Monitor + attr_reader :id, :name, :status, :schedule, :next_run_at, :last_run_at, + :current_check_id, :targets, :webhook, :notification, + :retention_days, :estimated_credits_per_month, + :last_check_summary, :created_at, :updated_at + + def initialize(data) + @id = data["id"] + @name = data["name"] + @status = data["status"] + @schedule = data["schedule"] + @next_run_at = data["nextRunAt"] + @last_run_at = data["lastRunAt"] + @current_check_id = data["currentCheckId"] + @targets = data["targets"] || [] + @webhook = data["webhook"] + @notification = data["notification"] + @retention_days = data["retentionDays"] + @estimated_credits_per_month = data["estimatedCreditsPerMonth"] + @last_check_summary = data["lastCheckSummary"] + @created_at = data["createdAt"] + @updated_at = data["updatedAt"] + end + end + + class MonitorCheck + attr_reader :id, :monitor_id, :status, :trigger, 
:scheduled_for, + :started_at, :finished_at, :estimated_credits, + :reserved_credits, :actual_credits, :billing_status, + :summary, :target_results, :notification_status, :error, + :created_at, :updated_at + + def initialize(data) + @id = data["id"] + @monitor_id = data["monitorId"] + @status = data["status"] + @trigger = data["trigger"] + @scheduled_for = data["scheduledFor"] + @started_at = data["startedAt"] + @finished_at = data["finishedAt"] + @estimated_credits = data["estimatedCredits"] + @reserved_credits = data["reservedCredits"] + @actual_credits = data["actualCredits"] + @billing_status = data["billingStatus"] + @summary = data["summary"] || {} + @target_results = data["targetResults"] + @notification_status = data["notificationStatus"] + @error = data["error"] + @created_at = data["createdAt"] + @updated_at = data["updatedAt"] + end + end + + class MonitorCheckDetail < MonitorCheck + attr_accessor :pages, :next_url + + def initialize(data) + super + @pages = data["pages"] || [] + @next_url = data["next"] + end + end + end +end diff --git a/apps/ruby-sdk/lib/firecrawl/version.rb b/apps/ruby-sdk/lib/firecrawl/version.rb index 33b639d21c..02eba3e87a 100644 --- a/apps/ruby-sdk/lib/firecrawl/version.rb +++ b/apps/ruby-sdk/lib/firecrawl/version.rb @@ -1,5 +1,5 @@ # frozen_string_literal: true module Firecrawl - VERSION = "1.3.1" + VERSION = "1.4.0" end diff --git a/apps/rust-sdk/Cargo.lock b/apps/rust-sdk/Cargo.lock index 13078e92db..a0f539d345 100644 --- a/apps/rust-sdk/Cargo.lock +++ b/apps/rust-sdk/Cargo.lock @@ -250,7 +250,7 @@ checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" [[package]] name = "firecrawl" -version = "2.3.1" +version = "2.4.0" dependencies = [ "mockito", "reqwest", diff --git a/apps/rust-sdk/Cargo.toml b/apps/rust-sdk/Cargo.toml index 6de8905e76..509e1dcdc3 100644 --- a/apps/rust-sdk/Cargo.toml +++ b/apps/rust-sdk/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "firecrawl" -version = "2.3.1" +version = "2.4.0" 
edition = "2021" license = "MIT" homepage = "https://www.firecrawl.dev/" diff --git a/apps/rust-sdk/src/lib.rs b/apps/rust-sdk/src/lib.rs index 379ecdbc96..4639f0cdb1 100644 --- a/apps/rust-sdk/src/lib.rs +++ b/apps/rust-sdk/src/lib.rs @@ -25,6 +25,7 @@ mod batch_scrape; mod client; mod crawl; mod map; +mod monitor; mod parse; mod scrape; mod search; @@ -36,6 +37,7 @@ pub use client::Client; pub use crawl::*; pub use error::FirecrawlError; pub use map::*; +pub use monitor::*; pub use parse::*; pub use scrape::*; pub use search::*; diff --git a/apps/rust-sdk/src/monitor.rs b/apps/rust-sdk/src/monitor.rs new file mode 100644 index 0000000000..2b7e1b493c --- /dev/null +++ b/apps/rust-sdk/src/monitor.rs @@ -0,0 +1,370 @@ +//! Monitor endpoint for Firecrawl API v2. + +use serde::{Deserialize, Serialize}; +use serde_json::Value; + +use crate::client::Client; +use crate::FirecrawlError; + +#[derive(Deserialize, Serialize, Debug, Clone)] +#[serde(rename_all = "camelCase")] +pub struct MonitorSchedule { + pub cron: String, + pub timezone: Option, +} + +#[serde_with::skip_serializing_none] +#[derive(Deserialize, Serialize, Debug, Clone)] +#[serde(rename_all = "camelCase")] +pub struct CreateMonitorRequest { + pub name: String, + pub schedule: MonitorSchedule, + pub targets: Vec, + pub webhook: Option, + pub notification: Option, + pub retention_days: Option, +} + +#[serde_with::skip_serializing_none] +#[derive(Deserialize, Serialize, Debug, Default, Clone)] +#[serde(rename_all = "camelCase")] +pub struct UpdateMonitorRequest { + pub name: Option, + pub status: Option, + pub schedule: Option, + pub targets: Option>, + pub webhook: Option, + pub notification: Option, + pub retention_days: Option, +} + +#[derive(Deserialize, Serialize, Debug, Default, Clone)] +#[serde(rename_all = "camelCase")] +pub struct MonitorSummary { + pub total_pages: u32, + pub same: u32, + pub changed: u32, + pub new: u32, + pub removed: u32, + pub error: u32, +} + +#[derive(Deserialize, Serialize, 
Debug, Clone)] +#[serde(rename_all = "camelCase")] +pub struct Monitor { + pub id: String, + pub name: String, + pub status: String, + pub schedule: MonitorSchedule, + pub next_run_at: Option, + pub last_run_at: Option, + pub current_check_id: Option, + pub targets: Vec, + pub webhook: Option, + pub notification: Option, + pub retention_days: u32, + pub estimated_credits_per_month: Option, + pub last_check_summary: Option, + pub created_at: String, + pub updated_at: String, +} + +#[derive(Deserialize, Serialize, Debug, Clone)] +#[serde(rename_all = "camelCase")] +pub struct MonitorCheck { + pub id: String, + pub monitor_id: String, + pub status: String, + pub trigger: String, + pub scheduled_for: Option, + pub started_at: Option, + pub finished_at: Option, + pub estimated_credits: Option, + pub reserved_credits: Option, + pub actual_credits: Option, + pub billing_status: String, + pub summary: MonitorSummary, + pub target_results: Option, + pub notification_status: Option, + pub error: Option, + pub created_at: String, + pub updated_at: String, +} + +#[derive(Deserialize, Serialize, Debug, Clone)] +#[serde(rename_all = "camelCase")] +pub struct MonitorCheckPage { + pub id: String, + pub target_id: String, + pub url: String, + pub status: String, + pub previous_scrape_id: Option, + pub current_scrape_id: Option, + pub status_code: Option, + pub error: Option, + pub metadata: Option, + pub diff: Option, + pub created_at: String, +} + +#[derive(Deserialize, Serialize, Debug, Clone)] +#[serde(rename_all = "camelCase")] +pub struct MonitorCheckDetail { + #[serde(flatten)] + pub check: MonitorCheck, + pub pages: Vec, + pub next: Option, +} + +#[derive(Deserialize, Debug)] +#[serde(rename_all = "camelCase")] +struct DataResponse { + data: T, +} + +#[derive(Deserialize, Debug)] +#[serde(rename_all = "camelCase")] +struct SuccessResponse { + success: bool, +} + +fn query(limit: Option, offset: Option, status: Option<&str>) -> String { + let mut params = Vec::new(); + if let 
Some(limit) = limit { + params.push(format!("limit={}", limit)); + } + if let Some(offset) = offset { + params.push(format!("offset={}", offset)); + } + if let Some(status) = status { + params.push(format!("status={}", status)); + } + if params.is_empty() { + String::new() + } else { + format!("?{}", params.join("&")) + } +} + +fn check_page_query(limit: Option, skip: Option, status: Option<&str>) -> String { + let mut params = Vec::new(); + if let Some(limit) = limit { + params.push(format!("limit={}", limit)); + } + if let Some(skip) = skip { + params.push(format!("skip={}", skip)); + } + if let Some(status) = status { + params.push(format!("status={}", status)); + } + if params.is_empty() { + String::new() + } else { + format!("?{}", params.join("&")) + } +} + +impl Client { + pub async fn create_monitor( + &self, + request: CreateMonitorRequest, + ) -> Result { + let response = self + .client + .post(self.url("/monitor")) + .headers(self.prepare_headers(None)) + .json(&request) + .send() + .await + .map_err(|e| FirecrawlError::HttpError("Creating monitor".to_string(), e))?; + + let response: DataResponse = + self.handle_response(response, "create monitor").await?; + Ok(response.data) + } + + pub async fn list_monitors( + &self, + limit: Option, + offset: Option, + ) -> Result, FirecrawlError> { + let response = self + .client + .get(self.url(&format!("/monitor{}", query(limit, offset, None)))) + .headers(self.prepare_headers(None)) + .send() + .await + .map_err(|e| FirecrawlError::HttpError("Listing monitors".to_string(), e))?; + + let response: DataResponse> = + self.handle_response(response, "list monitors").await?; + Ok(response.data) + } + + pub async fn get_monitor( + &self, + monitor_id: impl AsRef, + ) -> Result { + let response = self + .client + .get(self.url(&format!("/monitor/{}", monitor_id.as_ref()))) + .headers(self.prepare_headers(None)) + .send() + .await + .map_err(|e| FirecrawlError::HttpError("Getting monitor".to_string(), e))?; + + let 
response: DataResponse = self.handle_response(response, "get monitor").await?; + Ok(response.data) + } + + pub async fn update_monitor( + &self, + monitor_id: impl AsRef, + request: UpdateMonitorRequest, + ) -> Result { + let response = self + .client + .patch(self.url(&format!("/monitor/{}", monitor_id.as_ref()))) + .headers(self.prepare_headers(None)) + .json(&request) + .send() + .await + .map_err(|e| FirecrawlError::HttpError("Updating monitor".to_string(), e))?; + + let response: DataResponse = + self.handle_response(response, "update monitor").await?; + Ok(response.data) + } + + pub async fn delete_monitor( + &self, + monitor_id: impl AsRef, + ) -> Result { + let response = self + .client + .delete(self.url(&format!("/monitor/{}", monitor_id.as_ref()))) + .headers(self.prepare_headers(None)) + .send() + .await + .map_err(|e| FirecrawlError::HttpError("Deleting monitor".to_string(), e))?; + + let response: SuccessResponse = self.handle_response(response, "delete monitor").await?; + Ok(response.success) + } + + pub async fn run_monitor( + &self, + monitor_id: impl AsRef, + ) -> Result { + let response = self + .client + .post(self.url(&format!("/monitor/{}/run", monitor_id.as_ref()))) + .headers(self.prepare_headers(None)) + .json(&serde_json::json!({})) + .send() + .await + .map_err(|e| FirecrawlError::HttpError("Running monitor".to_string(), e))?; + + let response: DataResponse = + self.handle_response(response, "run monitor").await?; + Ok(response.data) + } + + pub async fn list_monitor_checks( + &self, + monitor_id: impl AsRef, + limit: Option, + offset: Option, + ) -> Result, FirecrawlError> { + let path = format!( + "/monitor/{}/checks{}", + monitor_id.as_ref(), + query(limit, offset, None) + ); + let response = self + .client + .get(self.url(&path)) + .headers(self.prepare_headers(None)) + .send() + .await + .map_err(|e| FirecrawlError::HttpError("Listing monitor checks".to_string(), e))?; + + let response: DataResponse> = self + 
.handle_response(response, "list monitor checks") + .await?; + Ok(response.data) + } + + pub async fn get_monitor_check( + &self, + monitor_id: impl AsRef, + check_id: impl AsRef, + limit: Option, + skip: Option, + status: Option<&str>, + ) -> Result { + let path = format!( + "/monitor/{}/checks/{}{}", + monitor_id.as_ref(), + check_id.as_ref(), + check_page_query(limit, skip, status) + ); + let response = self + .client + .get(self.url(&path)) + .headers(self.prepare_headers(None)) + .send() + .await + .map_err(|e| FirecrawlError::HttpError("Getting monitor check".to_string(), e))?; + + let response: DataResponse = + self.handle_response(response, "get monitor check").await?; + let mut check = response.data; + + while let Some(next) = check.next.clone() { + let response = self + .client + .get(next) + .headers(self.prepare_headers(None)) + .send() + .await + .map_err(|e| { + FirecrawlError::HttpError("Getting monitor check page".to_string(), e) + })?; + let response: DataResponse = self + .handle_response(response, "get monitor check page") + .await?; + check.pages.extend(response.data.pages); + check.next = response.data.next; + } + + Ok(check) + } + + pub async fn get_monitor_check_page( + &self, + monitor_id: impl AsRef, + check_id: impl AsRef, + limit: Option, + skip: Option, + status: Option<&str>, + ) -> Result { + let path = format!( + "/monitor/{}/checks/{}{}", + monitor_id.as_ref(), + check_id.as_ref(), + check_page_query(limit, skip, status) + ); + let response = self + .client + .get(self.url(&path)) + .headers(self.prepare_headers(None)) + .send() + .await + .map_err(|e| FirecrawlError::HttpError("Getting monitor check".to_string(), e))?; + + let response: DataResponse = + self.handle_response(response, "get monitor check").await?; + Ok(response.data) + } +} From 8af5b784eec26bdcc18c3fb608f71d8ba4d2c060 Mon Sep 17 00:00:00 2001 From: "firecrawl-spring[bot]" <254786068+firecrawl-spring[bot]@users.noreply.github.com> Date: Wed, 6 May 2026 14:54:30 +0200 
Subject: [PATCH 16/27] fix(browser-sessions): retry insertBrowserSession on transient Supabase errors (#3489) Co-authored-by: firecrawl-spring[bot] <254786068+firecrawl-spring[bot]@users.noreply.github.com> Co-authored-by: mogery --- apps/api/src/lib/browser-sessions.ts | 47 ++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 10 deletions(-) diff --git a/apps/api/src/lib/browser-sessions.ts b/apps/api/src/lib/browser-sessions.ts index eaf83fb5ac..d3b27a3ccd 100644 --- a/apps/api/src/lib/browser-sessions.ts +++ b/apps/api/src/lib/browser-sessions.ts @@ -45,18 +45,45 @@ export async function insertBrowserSession( updated_at: now, }; - const { data, error } = await supabase_service - .from(TABLE) - .insert(full) - .select() - .single(); - - if (error) { - logger.error("Failed to insert browser session", { error, id: row.id }); - throw new Error(`Failed to insert browser session: ${error.message}`); + const MAX_ATTEMPTS = 10; + let lastError: any = null; + for (let attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) { + try { + const { data, error } = await supabase_service + .from(TABLE) + .insert(full) + .select() + .single(); + + if (error) { + lastError = error; + logger.error( + "Error inserting browser session due to Supabase error, trying again", + { error, id: row.id, attempt }, + ); + await new Promise(resolve => setTimeout(resolve, 75)); + continue; + } + + return data as BrowserSessionRow; + } catch (error) { + lastError = error; + logger.error( + "Error inserting browser session due to unknown error, trying again", + { error, id: row.id, attempt }, + ); + await new Promise(resolve => setTimeout(resolve, 75)); + } } - return data as BrowserSessionRow; + logger.error("Failed to insert browser session after all retries", { + error: lastError, + id: row.id, + attempts: MAX_ATTEMPTS, + }); + throw new Error( + `Failed to insert browser session: ${lastError?.message ?? 
"unknown error"}`, + ); } export async function getBrowserSession( From 4c7af99c172f0832c03f6106e39bbaf76b17abd4 Mon Sep 17 00:00:00 2001 From: mogery Date: Wed, 6 May 2026 15:05:28 +0200 Subject: [PATCH 17/27] fix(monitoring): advance scheduled checks before enqueue Co-authored-by: Cursor --- .../src/services/monitoring/scheduler.test.ts | 134 ++++++++++++++++++ apps/api/src/services/monitoring/scheduler.ts | 52 ++++++- apps/api/src/services/monitoring/store.ts | 31 ++++ 3 files changed, 216 insertions(+), 1 deletion(-) create mode 100644 apps/api/src/services/monitoring/scheduler.test.ts diff --git a/apps/api/src/services/monitoring/scheduler.test.ts b/apps/api/src/services/monitoring/scheduler.test.ts new file mode 100644 index 0000000000..c5ec32983a --- /dev/null +++ b/apps/api/src/services/monitoring/scheduler.test.ts @@ -0,0 +1,134 @@ +import { addMonitorCheckJob } from "./queue"; +import { enqueueDueMonitorChecks } from "./scheduler"; +import { + advanceMonitorAfterSkippedCheck, + claimDueMonitors, + createMonitorCheck, + dispatchScheduledMonitorCheck, + updateMonitorCheck, + updateMonitorScheduleAfterRun, +} from "./store"; + +jest.mock("./queue", () => ({ + addMonitorCheckJob: jest.fn(), +})); + +jest.mock("./store", () => ({ + advanceMonitorAfterSkippedCheck: jest.fn(), + claimDueMonitors: jest.fn(), + createMonitorCheck: jest.fn(), + dispatchScheduledMonitorCheck: jest.fn(), + updateMonitorCheck: jest.fn(), + updateMonitorScheduleAfterRun: jest.fn(), +})); + +const mockAddMonitorCheckJob = addMonitorCheckJob as jest.MockedFunction< + typeof addMonitorCheckJob +>; +const mockClaimDueMonitors = claimDueMonitors as jest.MockedFunction< + typeof claimDueMonitors +>; +const mockCreateMonitorCheck = createMonitorCheck as jest.MockedFunction< + typeof createMonitorCheck +>; +const mockDispatchScheduledMonitorCheck = + dispatchScheduledMonitorCheck as jest.MockedFunction< + typeof dispatchScheduledMonitorCheck + >; +const mockUpdateMonitorCheck = 
updateMonitorCheck as jest.MockedFunction< + typeof updateMonitorCheck +>; +const mockAdvanceMonitorAfterSkippedCheck = + advanceMonitorAfterSkippedCheck as jest.MockedFunction< + typeof advanceMonitorAfterSkippedCheck + >; +const mockUpdateMonitorScheduleAfterRun = + updateMonitorScheduleAfterRun as jest.MockedFunction< + typeof updateMonitorScheduleAfterRun + >; + +describe("monitoring scheduler", () => { + const monitor = { + id: "monitor-1", + team_id: "team-1", + current_check_id: null, + next_run_at: "2026-05-05T18:45:00.000Z", + } as any; + const check = { id: "check-1" } as any; + + beforeEach(() => { + jest.clearAllMocks(); + mockClaimDueMonitors.mockResolvedValue([monitor]); + mockCreateMonitorCheck.mockResolvedValue(check); + mockDispatchScheduledMonitorCheck.mockResolvedValue(true); + mockAddMonitorCheckJob.mockResolvedValue(undefined); + mockAdvanceMonitorAfterSkippedCheck.mockResolvedValue(undefined); + mockUpdateMonitorScheduleAfterRun.mockResolvedValue(undefined); + }); + + it("dispatches and advances a scheduled monitor before enqueueing its job", async () => { + await expect( + enqueueDueMonitorChecks({ workerId: "worker-1" }), + ).resolves.toBe(1); + + expect(mockCreateMonitorCheck).toHaveBeenCalledWith({ + monitor, + trigger: "scheduled", + scheduledFor: monitor.next_run_at, + }); + expect(mockDispatchScheduledMonitorCheck).toHaveBeenCalledWith({ + monitor, + checkId: check.id, + }); + expect(mockAddMonitorCheckJob).toHaveBeenCalledWith({ + monitorId: monitor.id, + checkId: check.id, + teamId: monitor.team_id, + }); + expect( + mockDispatchScheduledMonitorCheck.mock.invocationCallOrder[0], + ).toBeLessThan(mockAddMonitorCheckJob.mock.invocationCallOrder[0]); + }); + + it("fails and clears a dispatched check when enqueueing fails", async () => { + const error = new Error("queue unavailable"); + const failed = { id: check.id, status: "failed" } as any; + mockAddMonitorCheckJob.mockRejectedValue(error); + 
mockUpdateMonitorCheck.mockResolvedValue(failed); + + await expect( + enqueueDueMonitorChecks({ workerId: "worker-1" }), + ).resolves.toBe(0); + + expect(mockUpdateMonitorCheck).toHaveBeenCalledWith(check.id, { + status: "failed", + finished_at: expect.any(String), + error: error.message, + }); + expect(mockUpdateMonitorScheduleAfterRun).toHaveBeenCalledWith({ + monitor, + check: failed, + }); + }); + + it("records an overlap if dispatch finds another running check", async () => { + const skipped = { id: check.id, status: "skipped_overlap" } as any; + mockDispatchScheduledMonitorCheck.mockResolvedValue(false); + mockUpdateMonitorCheck.mockResolvedValue(skipped); + + await expect( + enqueueDueMonitorChecks({ workerId: "worker-1" }), + ).resolves.toBe(0); + + expect(mockAddMonitorCheckJob).not.toHaveBeenCalled(); + expect(mockUpdateMonitorCheck).toHaveBeenCalledWith(check.id, { + status: "skipped_overlap", + finished_at: expect.any(String), + error: "Previous monitor check is still running.", + }); + expect(mockAdvanceMonitorAfterSkippedCheck).toHaveBeenCalledWith({ + monitor, + check: skipped, + }); + }); +}); diff --git a/apps/api/src/services/monitoring/scheduler.ts b/apps/api/src/services/monitoring/scheduler.ts index 99b817b34e..84a7984e71 100644 --- a/apps/api/src/services/monitoring/scheduler.ts +++ b/apps/api/src/services/monitoring/scheduler.ts @@ -5,7 +5,9 @@ import { advanceMonitorAfterSkippedCheck, claimDueMonitors, createMonitorCheck, + dispatchScheduledMonitorCheck, updateMonitorCheck, + updateMonitorScheduleAfterRun, } from "./store"; const logger = _logger.child({ module: "monitoring-scheduler" }); @@ -34,6 +36,8 @@ export async function enqueueDueMonitorChecks( let enqueued = 0; for (const monitor of monitors) { + let check: Awaited> | null = null; + let dispatched = false; try { if (monitor.current_check_id) { const skipped = await createMonitorCheck({ @@ -51,11 +55,25 @@ export async function enqueueDueMonitorChecks( continue; } - const check = 
await createMonitorCheck({ + check = await createMonitorCheck({ monitor, trigger: "scheduled", scheduledFor: monitor.next_run_at, }); + dispatched = await dispatchScheduledMonitorCheck({ + monitor, + checkId: check.id, + }); + if (!dispatched) { + check = await updateMonitorCheck(check.id, { + status: "skipped_overlap", + finished_at: new Date().toISOString(), + error: "Previous monitor check is still running.", + }); + await advanceMonitorAfterSkippedCheck({ monitor, check }); + continue; + } + await enqueueMonitorCheck({ monitorId: monitor.id, checkId: check.id, @@ -63,6 +81,38 @@ export async function enqueueDueMonitorChecks( }); enqueued++; } catch (error) { + if (check) { + const failed = await updateMonitorCheck(check.id, { + status: "failed", + finished_at: new Date().toISOString(), + error: error instanceof Error ? error.message : String(error), + }).catch(updateError => { + logger.error("Failed to mark monitor check enqueue failure", { + updateError, + error, + monitorId: monitor.id, + checkId: check?.id, + teamId: monitor.team_id, + }); + return null; + }); + + if (failed && dispatched) { + await updateMonitorScheduleAfterRun({ + monitor, + check: failed, + }).catch(updateError => { + logger.error("Failed to clear failed dispatched monitor check", { + updateError, + error, + monitorId: monitor.id, + checkId: failed.id, + teamId: monitor.team_id, + }); + }); + } + } + logger.error("Failed to enqueue due monitor check", { error, monitorId: monitor.id, diff --git a/apps/api/src/services/monitoring/store.ts b/apps/api/src/services/monitoring/store.ts index 2d4e51dffa..d5a2f26978 100644 --- a/apps/api/src/services/monitoring/store.ts +++ b/apps/api/src/services/monitoring/store.ts @@ -254,6 +254,37 @@ export async function markMonitorRunning(params: { throwIfError(error, "Failed to mark monitor running"); } +export async function dispatchScheduledMonitorCheck(params: { + monitor: MonitorRow; + checkId: string; +}): Promise { + const nextRunAt = + 
params.monitor.status === "active" + ? getNextMonitorRunAt( + params.monitor.schedule_cron, + new Date(), + params.monitor.schedule_timezone, + ).toISOString() + : null; + + const { data, error } = await supabase_service + .from("monitors") + .update({ + current_check_id: params.checkId, + locked_at: null, + locked_until: null, + next_run_at: nextRunAt, + updated_at: new Date().toISOString(), + }) + .eq("id", params.monitor.id) + .is("current_check_id", null) + .select("id") + .maybeSingle(); + + throwIfError(error, "Failed to dispatch scheduled monitor check"); + return !!data; +} + export async function updateMonitorScheduleAfterRun(params: { monitor: MonitorRow; check: MonitorCheckRow; From 97dfb18926bcb453d7bace3ac8aeca9ec62df2dc Mon Sep 17 00:00:00 2001 From: mogery Date: Wed, 6 May 2026 15:11:20 +0200 Subject: [PATCH 18/27] fix(monitoring): fail stale running checks Co-authored-by: Cursor --- .../src/services/monitoring/runner.test.ts | 56 +++++++++ apps/api/src/services/monitoring/runner.ts | 108 ++++++++++++++++++ 2 files changed, 164 insertions(+) create mode 100644 apps/api/src/services/monitoring/runner.test.ts diff --git a/apps/api/src/services/monitoring/runner.test.ts b/apps/api/src/services/monitoring/runner.test.ts new file mode 100644 index 0000000000..4b1cc32bae --- /dev/null +++ b/apps/api/src/services/monitoring/runner.test.ts @@ -0,0 +1,56 @@ +jest.mock("uuid", () => ({ + v7: () => "test-uuid-v7", +})); + +import { isMonitorCheckStale, MONITOR_CHECK_STALE_TIMEOUT_MS } from "./runner"; + +describe("monitoring runner", () => { + describe("isMonitorCheckStale", () => { + const now = new Date("2026-05-06T12:00:00.000Z"); + + it("returns true when a running check is at least 24 hours old", () => { + expect( + isMonitorCheckStale( + { + started_at: new Date( + now.getTime() - MONITOR_CHECK_STALE_TIMEOUT_MS, + ).toISOString(), + updated_at: now.toISOString(), + created_at: now.toISOString(), + }, + now, + ), + ).toBe(true); + }); + + it("returns false 
when a running check is not yet stale", () => { + expect( + isMonitorCheckStale( + { + started_at: new Date( + now.getTime() - MONITOR_CHECK_STALE_TIMEOUT_MS + 1, + ).toISOString(), + updated_at: now.toISOString(), + created_at: now.toISOString(), + }, + now, + ), + ).toBe(false); + }); + + it("falls back to updated_at for malformed started_at values", () => { + expect( + isMonitorCheckStale( + { + started_at: null, + updated_at: new Date( + now.getTime() - MONITOR_CHECK_STALE_TIMEOUT_MS, + ).toISOString(), + created_at: now.toISOString(), + }, + now, + ), + ).toBe(true); + }); + }); +}); diff --git a/apps/api/src/services/monitoring/runner.ts b/apps/api/src/services/monitoring/runner.ts index 4afa1ce732..e5371dafe5 100644 --- a/apps/api/src/services/monitoring/runner.ts +++ b/apps/api/src/services/monitoring/runner.ts @@ -55,6 +55,7 @@ import type { MonitorCheckJobData } from "./queue"; const logger = _logger.child({ module: "monitoring-runner" }); const poll = (ms: number) => new Promise(resolve => setTimeout(resolve, ms)); +export const MONITOR_CHECK_STALE_TIMEOUT_MS = 24 * 60 * 60 * 1000; type PageResult = MonitorCheckPageInsert & { emailStatus?: string; @@ -917,6 +918,111 @@ async function isMonitorCheckComplete( return true; } +export function isMonitorCheckStale( + check: Pick, + now: Date = new Date(), +): boolean { + const startedAt = check.started_at ?? check.updated_at ?? 
check.created_at; + const startedAtMs = Date.parse(startedAt); + if (!Number.isFinite(startedAtMs)) return false; + return now.getTime() - startedAtMs >= MONITOR_CHECK_STALE_TIMEOUT_MS; +} + +async function failStaleMonitorCheck(params: { + monitor: MonitorRow; + check: MonitorCheckRow; +}): Promise { + if (!isMonitorCheckStale(params.check)) return false; + + const error = "Monitor check exceeded the 24 hour running timeout."; + if (params.check.autumn_lock_id) { + await autumnService + .finalizeCreditsLock({ + lockId: params.check.autumn_lock_id, + action: "release", + properties: { + source: "monitorCheck", + endpoint: "monitor", + jobId: params.check.id, + }, + }) + .catch(releaseError => { + logger.warn("Failed to release stale monitor check credit lock", { + error: releaseError, + monitorId: params.monitor.id, + checkId: params.check.id, + lockId: params.check.autumn_lock_id, + }); + }); + } + + const finalized = await updateMonitorCheck(params.check.id, { + status: "failed", + finished_at: new Date().toISOString(), + actual_credits: 0, + billing_status: params.check.autumn_lock_id ? "released" : "not_applicable", + error, + }); + + const notificationStatus = await sendNotifications({ + monitor: params.monitor, + check: finalized, + pages: [], + }).catch(notificationError => { + logger.warn("Failed to send stale monitor check notifications", { + error: notificationError, + monitorId: params.monitor.id, + checkId: params.check.id, + }); + return { + webhook: { + attempted: !!params.monitor.webhook, + success: false, + error: + notificationError instanceof Error + ? notificationError.message + : String(notificationError), + }, + email: { + attempted: !!params.monitor.notification?.email?.enabled, + success: false, + error: + notificationError instanceof Error + ? 
notificationError.message + : String(notificationError), + }, + }; + }); + + const withNotifications = await updateMonitorCheck(params.check.id, { + notification_status: notificationStatus, + }).catch(updateError => { + logger.warn("Failed to record stale monitor check notification status", { + error: updateError, + monitorId: params.monitor.id, + checkId: params.check.id, + }); + return finalized; + }); + + if (params.monitor.current_check_id === params.check.id) { + await updateMonitorScheduleAfterRun({ + monitor: params.monitor, + check: withNotifications, + summary: toSummaryObject(withNotifications), + }); + } + + logger.warn("Failed stale monitor check", { + monitorId: params.monitor.id, + checkId: params.check.id, + startedAt: params.check.started_at, + timeoutMs: MONITOR_CHECK_STALE_TIMEOUT_MS, + }); + + return true; +} + export async function reconcileRunningMonitorChecks( limit: number = 50, ): Promise { @@ -933,6 +1039,8 @@ export async function reconcileRunningMonitorChecks( ); if (!monitor) continue; + if (await failStaleMonitorCheck({ monitor, check })) continue; + const targetResults = Array.isArray(check.target_results) ? 
([...check.target_results] as any[]) : []; From d63121c8e33522acd9e00a69dd48896a21a81c68 Mon Sep 17 00:00:00 2001 From: mogery Date: Wed, 6 May 2026 15:42:43 +0200 Subject: [PATCH 19/27] fix(monitoring): avoid exact count for check pages Co-authored-by: Cursor --- apps/api/src/services/monitoring/store.ts | 36 +++++++++++++++-------- 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/apps/api/src/services/monitoring/store.ts b/apps/api/src/services/monitoring/store.ts index d5a2f26978..f8f2ea59da 100644 --- a/apps/api/src/services/monitoring/store.ts +++ b/apps/api/src/services/monitoring/store.ts @@ -469,21 +469,33 @@ export async function countMonitorCheckPages(params: { targetId?: string; status?: string; }): Promise { - let query = supabase_rr_service - .from("monitor_check_pages") - .select("id", { count: "exact", head: true }) - .eq("check_id", params.checkId); + const pageSize = 1000; + let total = 0; + let offset = 0; - if (params.targetId) { - query = query.eq("target_id", params.targetId); - } - if (params.status) { - query = query.eq("status", params.status); + while (true) { + let query = supabase_rr_service + .from("monitor_check_pages") + .select("id") + .eq("check_id", params.checkId); + + if (params.targetId) { + query = query.eq("target_id", params.targetId); + } + if (params.status) { + query = query.eq("status", params.status); + } + + const { data, error } = await query.range(offset, offset + pageSize - 1); + throwIfError(error, "Failed to count monitor check pages"); + + const batch = data ?? []; + total += batch.length; + if (batch.length < pageSize) break; + offset += pageSize; } - const { count, error } = await query; - throwIfError(error, "Failed to count monitor check pages"); - return count ?? 
0; + return total; } export async function getMonitorPage(params: { From 05385ad419628b32fa23246f2b03e36c3cc965fe Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 6 May 2026 16:26:27 +0000 Subject: [PATCH 20/27] fix(tests): gate directQuote test to production only The directQuote query mode uses a Fireworks model that is not available in self-hosted CI environments (only OPENAI_API_KEY is set). This caused all 4 self-hosted test matrix jobs to fail. Gate the test behind TEST_PRODUCTION so it only runs where Fireworks is available. Co-Authored-By: gaurav --- apps/api/src/__tests__/snips/v2/scrape-query.test.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/__tests__/snips/v2/scrape-query.test.ts b/apps/api/src/__tests__/snips/v2/scrape-query.test.ts index 41cbd5b4d2..aff7bfd755 100644 --- a/apps/api/src/__tests__/snips/v2/scrape-query.test.ts +++ b/apps/api/src/__tests__/snips/v2/scrape-query.test.ts @@ -61,7 +61,7 @@ describe("Query format", () => { scrapeTimeout, ); - concurrentIf(TEST_PRODUCTION || HAS_AI)( + concurrentIf(TEST_PRODUCTION)( "returns a direct quote answer when query mode is directQuote", async () => { const response = await scrape( From 05416c7562f5eb6d1ce354f5e8d65e371ca66487 Mon Sep 17 00:00:00 2001 From: mogery Date: Wed, 6 May 2026 19:51:03 +0200 Subject: [PATCH 21/27] feat(scrape): split question and highlights formats Co-authored-by: Cursor --- .../__tests__/snips/v2/scrape-query.test.ts | 72 +++++++++++ .../snips/v2/types-validation.test.ts | 94 ++++++++++++++ apps/api/src/controllers/v2/parse.ts | 2 + apps/api/src/controllers/v2/scrape.ts | 2 + apps/api/src/controllers/v2/types.ts | 22 ++++ apps/api/src/lib/entities.ts | 3 + apps/api/src/lib/scrape-billing.ts | 9 +- apps/api/src/scraper/scrapeURL/index.ts | 10 +- .../scraper/scrapeURL/transformers/index.ts | 33 ++++- .../scraper/scrapeURL/transformers/query.ts | 56 +++++--- 
.../Firecrawl.Tests/ModelsTests.cs | 21 +++ apps/dot-net-sdk/Firecrawl/Firecrawl.csproj | 2 +- apps/dot-net-sdk/Firecrawl/Models/Document.cs | 6 + .../Firecrawl/Models/HighlightsFormat.cs | 15 +++ .../Firecrawl/Models/QueryFormat.cs | 3 +- .../Firecrawl/Models/QuestionFormat.cs | 15 +++ .../Firecrawl/Models/SearchData.cs | 9 ++ apps/elixir-sdk/lib/firecrawl.ex | 4 +- apps/elixir-sdk/mix.exs | 2 +- apps/go-sdk/models.go | 2 + apps/go-sdk/options.go | 42 +++++- apps/go-sdk/options_test.go | 22 ++++ apps/go-sdk/parse.go | 2 +- apps/go-sdk/version.go | 2 +- apps/java-sdk/build.gradle.kts | 2 +- .../java/com/firecrawl/models/Document.java | 4 + .../firecrawl/models/HighlightsFormat.java | 35 +++++ .../com/firecrawl/models/QueryFormat.java | 5 +- .../com/firecrawl/models/QuestionFormat.java | 35 +++++ .../com/firecrawl/models/ScrapeOptions.java | 2 +- .../com/firecrawl/FirecrawlClientTest.java | 20 +++ apps/js-sdk/firecrawl/package.json | 2 +- .../src/__tests__/unit/v2/validation.test.ts | 21 +++ apps/js-sdk/firecrawl/src/v2/types.ts | 16 +++ .../firecrawl/src/v2/utils/validation.ts | 52 ++++++++ apps/php-sdk/src/Models/Document.php | 14 ++ apps/php-sdk/src/Models/HighlightsFormat.php | 31 +++++ apps/php-sdk/src/Models/ParseOptions.php | 24 +++- apps/php-sdk/src/Models/QueryFormat.php | 1 + apps/php-sdk/src/Models/QuestionFormat.php | 31 +++++ apps/php-sdk/src/Models/ScrapeOptions.php | 16 ++- apps/php-sdk/src/Version.php | 2 +- apps/php-sdk/tests/Unit/ModelsTest.php | 22 ++++ apps/python-sdk/firecrawl/__init__.py | 2 +- .../unit/v2/utils/test_validation.py | 36 +++++- apps/python-sdk/firecrawl/types.py | 6 + apps/python-sdk/firecrawl/v2/types.py | 29 ++++- .../firecrawl/v2/utils/validation.py | 46 +++++++ .../ruby-sdk/lib/firecrawl/models/document.rb | 5 +- .../lib/firecrawl/models/query_format.rb | 42 +++++- apps/ruby-sdk/lib/firecrawl/version.rb | 2 +- apps/ruby-sdk/test/firecrawl/client_test.rb | 14 ++ apps/rust-sdk/Cargo.lock | 2 +- apps/rust-sdk/Cargo.toml | 2 +- 
apps/rust-sdk/src/scrape.rs | 33 ++++- apps/rust-sdk/src/types.rs | 121 +++++++++++++++++- 56 files changed, 1056 insertions(+), 69 deletions(-) create mode 100644 apps/dot-net-sdk/Firecrawl/Models/HighlightsFormat.cs create mode 100644 apps/dot-net-sdk/Firecrawl/Models/QuestionFormat.cs create mode 100644 apps/java-sdk/src/main/java/com/firecrawl/models/HighlightsFormat.java create mode 100644 apps/java-sdk/src/main/java/com/firecrawl/models/QuestionFormat.java create mode 100644 apps/php-sdk/src/Models/HighlightsFormat.php create mode 100644 apps/php-sdk/src/Models/QuestionFormat.php diff --git a/apps/api/src/__tests__/snips/v2/scrape-query.test.ts b/apps/api/src/__tests__/snips/v2/scrape-query.test.ts index b088edc1e6..22b0356d78 100644 --- a/apps/api/src/__tests__/snips/v2/scrape-query.test.ts +++ b/apps/api/src/__tests__/snips/v2/scrape-query.test.ts @@ -38,6 +38,25 @@ describe("Query format", () => { scrapeTimeout, ); + concurrentIf(TEST_PRODUCTION || HAS_AI)( + "returns a non-empty answer for a valid question", + async () => { + const response = await scrape( + { + url: "https://firecrawl.dev", + formats: [{ type: "question", question: "What is Firecrawl?" }], + }, + identity, + ); + + expect(response.answer).toBeDefined(); + expect(typeof response.answer).toBe("string"); + expect(response.answer!.length).toBeGreaterThan(0); + expect(response.markdown).toBeUndefined(); + }, + scrapeTimeout, + ); + concurrentIf(TEST_PRODUCTION || HAS_AI)( "returns both answer and markdown when formats include markdown and query", async () => { @@ -61,6 +80,25 @@ describe("Query format", () => { scrapeTimeout, ); + concurrentIf(TEST_PRODUCTION || HAS_FIREWORKS)( + "returns non-empty highlights for a valid highlights query", + async () => { + const response = await scrape( + { + url: "https://firecrawl.dev", + formats: [{ type: "highlights", query: "What is Firecrawl?" 
}], + }, + identity, + ); + + expect(response.highlights).toBeDefined(); + expect(typeof response.highlights).toBe("string"); + expect(response.highlights!.length).toBeGreaterThan(0); + expect(response.answer).toBeUndefined(); + }, + scrapeTimeout, + ); + concurrentIf(TEST_PRODUCTION || HAS_FIREWORKS)( "returns a direct quote answer when query mode is directQuote", async () => { @@ -118,4 +156,38 @@ describe("Query format", () => { }, scrapeTimeout, ); + + it( + "rejects question over 10000 characters", + async () => { + const response = await scrapeWithFailure( + { + url: "https://firecrawl.dev", + formats: [{ type: "question", question: "a".repeat(10001) }], + } as any, + identity, + ); + + expect(response.success).toBe(false); + expect(response.error).toBeDefined(); + }, + scrapeTimeout, + ); + + it( + "rejects highlights query over 10000 characters", + async () => { + const response = await scrapeWithFailure( + { + url: "https://firecrawl.dev", + formats: [{ type: "highlights", query: "a".repeat(10001) }], + } as any, + identity, + ); + + expect(response.success).toBe(false); + expect(response.error).toBeDefined(); + }, + scrapeTimeout, + ); }); diff --git a/apps/api/src/__tests__/snips/v2/types-validation.test.ts b/apps/api/src/__tests__/snips/v2/types-validation.test.ts index 7aa29e744b..aef9c20670 100644 --- a/apps/api/src/__tests__/snips/v2/types-validation.test.ts +++ b/apps/api/src/__tests__/snips/v2/types-validation.test.ts @@ -114,6 +114,62 @@ describe("V2 Types Validation", () => { ]); }); + it("should accept question format", () => { + const input: ScrapeRequestInput = { + url: "https://example.com", + formats: [{ type: "question", question: "What is Firecrawl?" }], + }; + + const result = scrapeRequestSchema.parse(input); + expect(result.formats).toEqual([ + { type: "question", question: "What is Firecrawl?" 
}, + ]); + }); + + it("should accept highlights format", () => { + const input: ScrapeRequestInput = { + url: "https://example.com", + formats: [{ type: "highlights", query: "What is Firecrawl?" }], + }; + + const result = scrapeRequestSchema.parse(input); + expect(result.formats).toEqual([ + { type: "highlights", query: "What is Firecrawl?" }, + ]); + }); + + it("should reject invalid question and highlights fields", () => { + expect(() => + scrapeRequestSchema.parse({ + url: "https://example.com", + formats: [{ type: "question", question: "" }], + } satisfies ScrapeRequestInput), + ).toThrow(); + + expect(() => + scrapeRequestSchema.parse({ + url: "https://example.com", + formats: [{ type: "question", prompt: "What is Firecrawl?" } as any], + }), + ).toThrow(); + + expect(() => + scrapeRequestSchema.parse({ + url: "https://example.com", + formats: [{ type: "highlights", query: "" }], + } satisfies ScrapeRequestInput), + ).toThrow(); + + expect(() => + scrapeRequestSchema.parse({ + url: "https://example.com", + formats: [ + { type: "highlights", prompt: "What is Firecrawl?" } as any, + ], + }), + ).toThrow(); + }); + it("should accept valid scrape request with changeTracking format", () => { const input: ScrapeRequestInput = { url: "https://example.com", @@ -1030,6 +1086,24 @@ describe("V2 Types Validation", () => { ]); }); + it("should accept search scrapeOptions with question and highlights formats", () => { + const input: SearchRequestInput = { + query: "test", + scrapeOptions: { + formats: [ + { type: "question", question: "What is Firecrawl?" }, + { type: "highlights", query: "What is Firecrawl?" }, + ], + }, + }; + + const result = searchRequestSchema.parse(input); + expect(result.scrapeOptions?.formats).toEqual([ + { type: "question", question: "What is Firecrawl?" }, + { type: "highlights", query: "What is Firecrawl?" 
}, + ]); + }); + it("should reject search scrapeOptions query format with invalid mode", () => { const input: SearchRequestInput = { query: "test", @@ -1074,6 +1148,26 @@ describe("V2 Types Validation", () => { expect(() => searchRequestSchema.parse(input)).toThrow(); }); + + it("should reject search scrapeOptions question and highlights values over 10000 characters", () => { + expect(() => + searchRequestSchema.parse({ + query: "test", + scrapeOptions: { + formats: [{ type: "question", question: "a".repeat(10001) }], + }, + } satisfies SearchRequestInput), + ).toThrow(); + + expect(() => + searchRequestSchema.parse({ + query: "test", + scrapeOptions: { + formats: [{ type: "highlights", query: "a".repeat(10001) }], + }, + } satisfies SearchRequestInput), + ).toThrow(); + }); }); describe("Type inference", () => { diff --git a/apps/api/src/controllers/v2/parse.ts b/apps/api/src/controllers/v2/parse.ts index d6c7e5f2c1..a10fdfd103 100644 --- a/apps/api/src/controllers/v2/parse.ts +++ b/apps/api/src/controllers/v2/parse.ts @@ -632,6 +632,8 @@ export async function parseController( let usedLlm = !!hasFormatOfType(req.body.formats, "json") || !!hasFormatOfType(req.body.formats, "summary") || + !!hasFormatOfType(req.body.formats, "question") || + !!hasFormatOfType(req.body.formats, "highlights") || !!hasFormatOfType(req.body.formats, "query"); if (!usedLlm) { diff --git a/apps/api/src/controllers/v2/scrape.ts b/apps/api/src/controllers/v2/scrape.ts index 1aa301e4d0..a961401445 100644 --- a/apps/api/src/controllers/v2/scrape.ts +++ b/apps/api/src/controllers/v2/scrape.ts @@ -422,6 +422,8 @@ export async function scrapeController( !!hasFormatOfType(req.body.formats, "json") || !!hasFormatOfType(req.body.formats, "summary") || !!hasFormatOfType(req.body.formats, "branding") || + !!hasFormatOfType(req.body.formats, "question") || + !!hasFormatOfType(req.body.formats, "highlights") || !!hasFormatOfType(req.body.formats, "query"); if (!usedLlm) { diff --git 
a/apps/api/src/controllers/v2/types.ts b/apps/api/src/controllers/v2/types.ts index ecfe24d5fc..2b838f087e 100644 --- a/apps/api/src/controllers/v2/types.ts +++ b/apps/api/src/controllers/v2/types.ts @@ -400,6 +400,21 @@ const attributesFormatWithOptions = z.strictObject({ type AttributesFormatWithOptions = z.output; +const questionFormatWithOptions = z.strictObject({ + type: z.literal("question"), + question: z.string().min(1).max(10000), +}); + +type QuestionFormatWithOptions = z.output; + +const highlightsFormatWithOptions = z.strictObject({ + type: z.literal("highlights"), + query: z.string().min(1).max(10000), +}); + +type HighlightsFormatWithOptions = z.output; + +/** @deprecated Use `question` or `highlights` format instead. */ const queryFormatWithOptions = z.strictObject({ type: z.literal("query"), prompt: z.string().max(10000), @@ -419,6 +434,8 @@ export type FormatObject = | ChangeTrackingFormatWithOptions | ScreenshotFormatWithOptions | AttributesFormatWithOptions + | QuestionFormatWithOptions + | HighlightsFormatWithOptions | QueryFormatWithOptions | { type: "branding" } | { type: "audio" }; @@ -529,6 +546,8 @@ const baseScrapeOptions = z.strictObject({ screenshotFormatWithOptions, attributesFormatWithOptions, z.strictObject({ type: z.literal("branding") }), + questionFormatWithOptions, + highlightsFormatWithOptions, queryFormatWithOptions, z.strictObject({ type: z.literal("audio") }), ]) @@ -1082,6 +1101,7 @@ export type Document = { json?: any; summary?: string; answer?: string; + highlights?: string; branding?: BrandingProfile; warning?: string; attributes?: { @@ -1784,6 +1804,8 @@ export const searchRequestSchema = z z.strictObject({ type: z.literal("images") }), z.strictObject({ type: z.literal("summary") }), jsonFormatWithOptions, + questionFormatWithOptions, + highlightsFormatWithOptions, queryFormatWithOptions, screenshotFormatWithOptions, ]) diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index 3e2f2515ca..0d5dac8eee 
100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -122,6 +122,7 @@ interface ImageSearchResult { url?: string; position?: number; answer?: string; + highlights?: string; } interface NewsSearchResult { @@ -140,6 +141,7 @@ interface NewsSearchResult { screenshot?: string; metadata?: Record; answer?: string; + highlights?: string; } export interface WebSearchResult { @@ -156,6 +158,7 @@ export interface WebSearchResult { screenshot?: string; metadata?: Record; answer?: string; + highlights?: string; } export type SearchResultType = "web" | "images" | "news"; diff --git a/apps/api/src/lib/scrape-billing.ts b/apps/api/src/lib/scrape-billing.ts index 440e039307..166a91eabd 100644 --- a/apps/api/src/lib/scrape-billing.ts +++ b/apps/api/src/lib/scrape-billing.ts @@ -75,7 +75,14 @@ export async function calculateCreditsToBeBilled( creditsToBeBilled = Math.ceil((costTrackingJSON.totalCost ?? 1) * 1800); } - if (hasFormatOfType(options.formats, "query")) { + const hasQuestionFormat = + hasFormatOfType(options.formats, "question") || + hasFormatOfType(options.formats, "query"); + if (hasQuestionFormat) { + creditsToBeBilled += 4; + } + + if (hasFormatOfType(options.formats, "highlights")) { creditsToBeBilled += 4; } diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts index eac798f5ef..0b3a02f612 100644 --- a/apps/api/src/scraper/scrapeURL/index.ts +++ b/apps/api/src/scraper/scrapeURL/index.ts @@ -520,9 +520,17 @@ async function scrapeURLLoopIter( ); const hasJson = hasFormatOfType(meta.options.formats, "json"); const hasSummary = hasFormatOfType(meta.options.formats, "summary"); + const hasQuestion = hasFormatOfType(meta.options.formats, "question"); + const hasHighlights = hasFormatOfType(meta.options.formats, "highlights"); const hasQuery = hasFormatOfType(meta.options.formats, "query"); const needsMarkdown = - hasMarkdown || hasChangeTracking || hasJson || hasSummary || hasQuery; + hasMarkdown || + 
hasChangeTracking || + hasJson || + hasSummary || + hasQuestion || + hasHighlights || + hasQuery; let checkMarkdown: string; const htmlSize = engineResult.html?.length ?? 0; diff --git a/apps/api/src/scraper/scrapeURL/transformers/index.ts b/apps/api/src/scraper/scrapeURL/transformers/index.ts index 45fd933d7e..883ec5873b 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/index.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/index.ts @@ -83,7 +83,7 @@ async function deriveMarkdownFromHTML( // - changeTracking requires markdown // - json format requires markdown (for LLM extraction) // - summary format requires markdown (for summarization) - // - query format requires markdown (for page-level answers) + // - question/highlights/query formats require markdown (for page-level answers) const hasMarkdown = hasFormatOfType(meta.options.formats, "markdown"); const hasChangeTracking = hasFormatOfType( meta.options.formats, @@ -91,12 +91,16 @@ async function deriveMarkdownFromHTML( ); const hasJson = hasFormatOfType(meta.options.formats, "json"); const hasSummary = hasFormatOfType(meta.options.formats, "summary"); + const hasQuestion = hasFormatOfType(meta.options.formats, "question"); + const hasHighlights = hasFormatOfType(meta.options.formats, "highlights"); const hasQuery = hasFormatOfType(meta.options.formats, "query"); if ( !hasMarkdown && !hasChangeTracking && !hasJson && !hasSummary && + !hasQuestion && + !hasHighlights && !hasQuery && !meta.options.onlyCleanContent ) { @@ -317,7 +321,13 @@ function coerceFieldsToFormats(meta: Meta, document: Document): Document { const hasScreenshot = hasFormatOfType(meta.options.formats, "screenshot"); const hasSummary = hasFormatOfType(meta.options.formats, "summary"); const hasBranding = hasFormatOfType(meta.options.formats, "branding"); - const hasQueryFormat = hasFormatOfType(meta.options.formats, "query"); + const hasQuestionFormat = hasFormatOfType(meta.options.formats, "question"); + const hasHighlightsFormat = 
hasFormatOfType( + meta.options.formats, + "highlights", + ); + const hasLegacyQueryFormat = hasFormatOfType(meta.options.formats, "query"); + const hasAnswerFormat = hasQuestionFormat || hasLegacyQueryFormat; if (!hasMarkdown && document.markdown !== undefined) { delete document.markdown; @@ -432,14 +442,25 @@ function coerceFieldsToFormats(meta: Meta, document: Document): Document { ); } - if (!hasQueryFormat && document.answer !== undefined) { + if (!hasAnswerFormat && document.answer !== undefined) { meta.logger.warn( - "Removed answer from Document because query wasn't in formats -- this is wasteful and indicates a bug.", + "Removed answer from Document because question/query wasn't in formats -- this is wasteful and indicates a bug.", ); delete document.answer; - } else if (hasQueryFormat && document.answer === undefined) { + } else if (hasAnswerFormat && document.answer === undefined) { + meta.logger.warn( + "Request had format question/query, but there was no answer field in the result.", + ); + } + + if (!hasHighlightsFormat && document.highlights !== undefined) { + meta.logger.warn( + "Removed highlights from Document because highlights wasn't in formats -- this is wasteful and indicates a bug.", + ); + delete document.highlights; + } else if (hasHighlightsFormat && document.highlights === undefined) { meta.logger.warn( - "Request had format query, but there was no answer field in the result.", + "Request had format highlights, but there was no highlights field in the result.", ); } diff --git a/apps/api/src/scraper/scrapeURL/transformers/query.ts b/apps/api/src/scraper/scrapeURL/transformers/query.ts index b6cb0d7352..ca09d69814 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/query.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/query.ts @@ -1,7 +1,7 @@ import { generateText } from "ai"; import * as marked from "marked"; import { decode as decodeHtmlEntities } from "he"; -import { Document } from "../../../controllers/v2/types"; +import { 
Document, FormatObject } from "../../../controllers/v2/types"; import { Meta } from ".."; import { getModel } from "../../../lib/generic-ai"; import { hasFormatOfType } from "../../../lib/format-utils"; @@ -475,8 +475,12 @@ export async function performQuery( meta: Meta, document: Document, ): Promise { - const queryFormat = hasFormatOfType(meta.options.formats, "query"); - if (!queryFormat) { + const answerFormat = meta.options.formats?.find( + (format): format is Extract => + format.type === "question" || format.type === "query", + ); + const highlightsFormat = hasFormatOfType(meta.options.formats, "highlights"); + if (!answerFormat && !highlightsFormat) { return document; } @@ -505,30 +509,40 @@ export async function performQuery( const pageUrl = meta.url ?? document.metadata?.sourceURL ?? ""; - let answer: string | null; + if (answerFormat) { + const prompt = + answerFormat.type === "question" + ? answerFormat.question + : answerFormat.prompt; + const answer = + answerFormat.type === "query" && answerFormat.mode === "directQuote" + ? await performDirectQuoteQuery(meta, document, prompt, markdown) + : await performFreeformQuery(meta, prompt, markdown, pageUrl); + + if (answer !== null) { + document.answer = answer; + } else { + document.warning = + "Query generation failed after all models." + + (document.warning ? " " + document.warning : ""); + } + } - if (queryFormat.mode === "directQuote") { - answer = await performDirectQuoteQuery( + if (highlightsFormat) { + const highlights = await performDirectQuoteQuery( meta, document, - queryFormat.prompt, - markdown, - ); - } else { - answer = await performFreeformQuery( - meta, - queryFormat.prompt, + highlightsFormat.query, markdown, - pageUrl, ); - } - if (answer !== null) { - document.answer = answer; - } else { - document.warning = - "Query generation failed after all models." + - (document.warning ? 
" " + document.warning : ""); + if (highlights !== null) { + document.highlights = highlights; + } else { + document.warning = + "Highlights generation failed after all models." + + (document.warning ? " " + document.warning : ""); + } } return document; diff --git a/apps/dot-net-sdk/Firecrawl.Tests/ModelsTests.cs b/apps/dot-net-sdk/Firecrawl.Tests/ModelsTests.cs index 7c1f20ec54..492f4c22d3 100644 --- a/apps/dot-net-sdk/Firecrawl.Tests/ModelsTests.cs +++ b/apps/dot-net-sdk/Firecrawl.Tests/ModelsTests.cs @@ -259,6 +259,27 @@ public void QueryFormat_HasCorrectMode() Assert.Contains("\"mode\":\"directQuote\"", json); } + [Fact] + public void QuestionAndHighlightsFormats_SerializeCorrectly() + { + var question = new QuestionFormat + { + Question = "What is Firecrawl?" + }; + var highlights = new HighlightsFormat + { + Query = "What is Firecrawl?" + }; + + var questionJson = JsonSerializer.Serialize(question, JsonOptions); + Assert.Contains("\"type\":\"question\"", questionJson); + Assert.Contains("\"question\":\"What is Firecrawl?\"", questionJson); + + var highlightsJson = JsonSerializer.Serialize(highlights, JsonOptions); + Assert.Contains("\"type\":\"highlights\"", highlightsJson); + Assert.Contains("\"query\":\"What is Firecrawl?\"", highlightsJson); + } + [Fact] public void WebhookConfig_SerializesCorrectly() { diff --git a/apps/dot-net-sdk/Firecrawl/Firecrawl.csproj b/apps/dot-net-sdk/Firecrawl/Firecrawl.csproj index 4962ae3904..efdeaea3d5 100644 --- a/apps/dot-net-sdk/Firecrawl/Firecrawl.csproj +++ b/apps/dot-net-sdk/Firecrawl/Firecrawl.csproj @@ -8,7 +8,7 @@ firecrawl-sdk - 1.3.0 + 1.3.1 Firecrawl Firecrawl .NET SDK for the Firecrawl API - web scraping, crawling, and data extraction diff --git a/apps/dot-net-sdk/Firecrawl/Models/Document.cs b/apps/dot-net-sdk/Firecrawl/Models/Document.cs index 754ad665a9..5dd68eb448 100644 --- a/apps/dot-net-sdk/Firecrawl/Models/Document.cs +++ b/apps/dot-net-sdk/Firecrawl/Models/Document.cs @@ -40,6 +40,12 @@ public class 
Document [JsonPropertyName("actions")] public object? Actions { get; set; } + [JsonPropertyName("answer")] + public string? Answer { get; set; } + + [JsonPropertyName("highlights")] + public string? Highlights { get; set; } + [JsonPropertyName("warning")] public string? Warning { get; set; } diff --git a/apps/dot-net-sdk/Firecrawl/Models/HighlightsFormat.cs b/apps/dot-net-sdk/Firecrawl/Models/HighlightsFormat.cs new file mode 100644 index 0000000000..ee40ceddc2 --- /dev/null +++ b/apps/dot-net-sdk/Firecrawl/Models/HighlightsFormat.cs @@ -0,0 +1,15 @@ +using System.Text.Json.Serialization; + +namespace Firecrawl.Models; + +/// +/// Highlights format specification for use in ScrapeOptions.Formats. +/// +public class HighlightsFormat +{ + [JsonPropertyName("type")] + public string Type { get; } = "highlights"; + + [JsonPropertyName("query")] + public required string Query { get; set; } +} diff --git a/apps/dot-net-sdk/Firecrawl/Models/QueryFormat.cs b/apps/dot-net-sdk/Firecrawl/Models/QueryFormat.cs index 44cd939206..9aa46e16a9 100644 --- a/apps/dot-net-sdk/Firecrawl/Models/QueryFormat.cs +++ b/apps/dot-net-sdk/Firecrawl/Models/QueryFormat.cs @@ -3,8 +3,9 @@ namespace Firecrawl.Models; /// -/// Query format specification for use in ScrapeOptions.Formats. +/// Deprecated query format specification for use in ScrapeOptions.Formats. /// +[Obsolete("Use QuestionFormat or HighlightsFormat instead.")] public class QueryFormat { public const string FreeformMode = "freeform"; diff --git a/apps/dot-net-sdk/Firecrawl/Models/QuestionFormat.cs b/apps/dot-net-sdk/Firecrawl/Models/QuestionFormat.cs new file mode 100644 index 0000000000..070e9d7f10 --- /dev/null +++ b/apps/dot-net-sdk/Firecrawl/Models/QuestionFormat.cs @@ -0,0 +1,15 @@ +using System.Text.Json.Serialization; + +namespace Firecrawl.Models; + +/// +/// Question format specification for use in ScrapeOptions.Formats. 
+/// +public class QuestionFormat +{ + [JsonPropertyName("type")] + public string Type { get; } = "question"; + + [JsonPropertyName("question")] + public required string Question { get; set; } +} diff --git a/apps/dot-net-sdk/Firecrawl/Models/SearchData.cs b/apps/dot-net-sdk/Firecrawl/Models/SearchData.cs index 7c44b2a38e..d2092220c7 100644 --- a/apps/dot-net-sdk/Firecrawl/Models/SearchData.cs +++ b/apps/dot-net-sdk/Firecrawl/Models/SearchData.cs @@ -42,6 +42,9 @@ public class WebSearchHit [JsonPropertyName("answer")] public string? Answer { get; set; } + + [JsonPropertyName("highlights")] + public string? Highlights { get; set; } } /// @@ -90,6 +93,9 @@ public class NewsSearchHit [JsonPropertyName("answer")] public string? Answer { get; set; } + + [JsonPropertyName("highlights")] + public string? Highlights { get; set; } } /// @@ -117,6 +123,9 @@ public class ImageSearchHit [JsonPropertyName("answer")] public string? Answer { get; set; } + + [JsonPropertyName("highlights")] + public string? Highlights { get; set; } } /// diff --git a/apps/elixir-sdk/lib/firecrawl.ex b/apps/elixir-sdk/lib/firecrawl.ex index a3ee2b40fd..ca35773f29 100644 --- a/apps/elixir-sdk/lib/firecrawl.ex +++ b/apps/elixir-sdk/lib/firecrawl.ex @@ -1028,7 +1028,7 @@ defmodule Firecrawl do actions: [type: {:list, :any}, doc: "Actions to perform on the page before grabbing the content"], block_ads: [type: :boolean, doc: "Enables ad-blocking and cookie popup blocking."], exclude_tags: [type: {:list, :string}, doc: "Tags to exclude from the output."], - formats: [type: {:list, :any}, doc: "Output formats to include in the response. You can specify one or more formats, either as strings (e.g., `'markdown'`) or as objects with additional options (e.g., `{ type: 'json', schema: {...} }`). Some formats require specific options to be set. Example: `['markdown', { type: 'json', schema: {...} }]`."], + formats: [type: {:list, :any}, doc: "Output formats to include in the response. 
You can specify one or more formats, either as strings (e.g., `'markdown'`) or as objects with additional options (e.g., `{ type: 'json', schema: {...} }`, `{ type: 'question', question: '...' }`, `{ type: 'highlights', query: '...' }`). The legacy `{ type: 'query', prompt: '...', mode: 'freeform' | 'directQuote' }` format is deprecated."], headers: [type: :any, doc: "Headers to send with the request. Can be used to send cookies, user-agent, etc."], include_tags: [type: {:list, :string}, doc: "Tags to include in the output."], location: [type: :keyword_list, doc: "Location settings for the request. When specified, this will use an appropriate proxy if available and emulate the corresponding language and timezone settings. Defaults to 'US' if not specified."], @@ -1093,7 +1093,7 @@ defmodule Firecrawl do actions: [type: {:list, :any}, doc: "Actions to perform on the page before grabbing the content"], block_ads: [type: :boolean, doc: "Enables ad-blocking and cookie popup blocking."], exclude_tags: [type: {:list, :string}, doc: "Tags to exclude from the output."], - formats: [type: {:list, :any}, doc: "Output formats to include in the response. You can specify one or more formats, either as strings (e.g., `'markdown'`) or as objects with additional options (e.g., `{ type: 'json', schema: {...} }`). Some formats require specific options to be set. Example: `['markdown', { type: 'json', schema: {...} }]`."], + formats: [type: {:list, :any}, doc: "Output formats to include in the response. You can specify one or more formats, either as strings (e.g., `'markdown'`) or as objects with additional options (e.g., `{ type: 'json', schema: {...} }`, `{ type: 'question', question: '...' }`, `{ type: 'highlights', query: '...' }`). The legacy `{ type: 'query', prompt: '...', mode: 'freeform' | 'directQuote' }` format is deprecated."], headers: [type: :any, doc: "Headers to send with the request. 
Can be used to send cookies, user-agent, etc."], include_tags: [type: {:list, :string}, doc: "Tags to include in the output."], location: [type: :keyword_list, doc: "Location settings for the request. When specified, this will use an appropriate proxy if available and emulate the corresponding language and timezone settings. Defaults to 'US' if not specified."], diff --git a/apps/elixir-sdk/mix.exs b/apps/elixir-sdk/mix.exs index e15468af24..f669387144 100644 --- a/apps/elixir-sdk/mix.exs +++ b/apps/elixir-sdk/mix.exs @@ -1,7 +1,7 @@ defmodule Firecrawl.MixProject do use Mix.Project - @version "1.3.0" + @version "1.3.1" @source_url "https://github.com/firecrawl/firecrawl/tree/main/apps/elixir-sdk" def project do diff --git a/apps/go-sdk/models.go b/apps/go-sdk/models.go index 5877eadea6..a4c14b0ce3 100644 --- a/apps/go-sdk/models.go +++ b/apps/go-sdk/models.go @@ -16,6 +16,8 @@ type Document struct { Audio string `json:"audio,omitempty"` Attributes []map[string]interface{} `json:"attributes,omitempty"` Actions map[string]interface{} `json:"actions,omitempty"` + Answer string `json:"answer,omitempty"` + Highlights string `json:"highlights,omitempty"` Warning string `json:"warning,omitempty"` ChangeTracking map[string]interface{} `json:"changeTracking,omitempty"` Branding map[string]interface{} `json:"branding,omitempty"` diff --git a/apps/go-sdk/options.go b/apps/go-sdk/options.go index 4f610553d4..5f11165358 100644 --- a/apps/go-sdk/options.go +++ b/apps/go-sdk/options.go @@ -2,7 +2,7 @@ package firecrawl import "encoding/json" -// QueryFormatMode selects how query answers are generated. +// QueryFormatMode selects how deprecated query answers are generated. type QueryFormatMode string const ( @@ -10,7 +10,45 @@ const ( QueryModeDirectQuote QueryFormatMode = "directQuote" ) +// QuestionFormat asks a question about page content. +type QuestionFormat struct { + Question string `json:"question"` +} + +// MarshalJSON always emits the API-required question format type. 
+func (q QuestionFormat) MarshalJSON() ([]byte, error) { + type questionFormat struct { + Type string `json:"type"` + Question string `json:"question"` + } + + return json.Marshal(questionFormat{ + Type: "question", + Question: q.Question, + }) +} + +// HighlightsFormat extracts direct highlights from page content. +type HighlightsFormat struct { + Query string `json:"query"` +} + +// MarshalJSON always emits the API-required highlights format type. +func (h HighlightsFormat) MarshalJSON() ([]byte, error) { + type highlightsFormat struct { + Type string `json:"type"` + Query string `json:"query"` + } + + return json.Marshal(highlightsFormat{ + Type: "highlights", + Query: h.Query, + }) +} + // QueryFormat asks a question about page content. +// +// Deprecated: use QuestionFormat or HighlightsFormat instead. type QueryFormat struct { Prompt string `json:"prompt"` Mode QueryFormatMode `json:"mode,omitempty"` @@ -56,7 +94,7 @@ type ScrapeOptions struct { JsonOptions *JsonOptions `json:"jsonOptions,omitempty"` } -// MarshalJSON preserves string formats while allowing object formats such as QueryFormat. +// MarshalJSON preserves string formats while allowing object formats such as QuestionFormat. 
func (o ScrapeOptions) MarshalJSON() ([]byte, error) { type scrapeOptions ScrapeOptions payload := struct { diff --git a/apps/go-sdk/options_test.go b/apps/go-sdk/options_test.go index dbb4f0b5c0..9416ec487a 100644 --- a/apps/go-sdk/options_test.go +++ b/apps/go-sdk/options_test.go @@ -29,6 +29,28 @@ func TestScrapeOptionsSerializesQueryFormatMode(t *testing.T) { } } +func TestScrapeOptionsSerializesQuestionAndHighlightsFormats(t *testing.T) { + payload, err := json.Marshal(ScrapeOptions{ + FormatOptions: []interface{}{ + QuestionFormat{Question: "What is Firecrawl?"}, + HighlightsFormat{Query: "What is Firecrawl?"}, + }, + }) + if err != nil { + t.Fatalf("Marshal ScrapeOptions: %v", err) + } + + jsonBody := string(payload) + for _, want := range []string{ + `{"type":"question","question":"What is Firecrawl?"}`, + `{"type":"highlights","query":"What is Firecrawl?"}`, + } { + if !strings.Contains(jsonBody, want) { + t.Fatalf("serialized formats = %s, want to contain %s", jsonBody, want) + } + } +} + func TestScrapeOptionsPreservesStringFormats(t *testing.T) { payload, err := json.Marshal(ScrapeOptions{ Formats: []string{"markdown"}, diff --git a/apps/go-sdk/parse.go b/apps/go-sdk/parse.go index 6329f2f7a8..8ccd4071a7 100644 --- a/apps/go-sdk/parse.go +++ b/apps/go-sdk/parse.go @@ -73,7 +73,7 @@ type ParseOptions struct { JsonOptions *JsonOptions `json:"jsonOptions,omitempty"` } -// MarshalJSON preserves string formats while allowing object formats such as QueryFormat. +// MarshalJSON preserves string formats while allowing object formats such as QuestionFormat. func (o ParseOptions) MarshalJSON() ([]byte, error) { type parseOptions ParseOptions payload := struct { diff --git a/apps/go-sdk/version.go b/apps/go-sdk/version.go index afafa732e6..702528172e 100644 --- a/apps/go-sdk/version.go +++ b/apps/go-sdk/version.go @@ -9,4 +9,4 @@ package firecrawl // Bump this when preparing a new release. 
The publish-go-sdk GitHub workflow // reads this value and creates the corresponding monorepo-prefixed tag on // merge to main. -const Version = "1.2.1" +const Version = "1.2.2" diff --git a/apps/java-sdk/build.gradle.kts b/apps/java-sdk/build.gradle.kts index 3479bb1e70..8df1994215 100644 --- a/apps/java-sdk/build.gradle.kts +++ b/apps/java-sdk/build.gradle.kts @@ -4,7 +4,7 @@ plugins { } group = "com.firecrawl" -version = "1.5.0" +version = "1.5.1" java { sourceCompatibility = JavaVersion.VERSION_11 diff --git a/apps/java-sdk/src/main/java/com/firecrawl/models/Document.java b/apps/java-sdk/src/main/java/com/firecrawl/models/Document.java index 0fc75145da..b61670d2a8 100644 --- a/apps/java-sdk/src/main/java/com/firecrawl/models/Document.java +++ b/apps/java-sdk/src/main/java/com/firecrawl/models/Document.java @@ -22,6 +22,8 @@ public class Document { private String audio; private List> attributes; private Map actions; + private String answer; + private String highlights; private String warning; private Map changeTracking; private Map branding; @@ -38,6 +40,8 @@ public class Document { public String getAudio() { return audio; } public List> getAttributes() { return attributes; } public Map getActions() { return actions; } + public String getAnswer() { return answer; } + public String getHighlights() { return highlights; } public String getWarning() { return warning; } public Map getChangeTracking() { return changeTracking; } public Map getBranding() { return branding; } diff --git a/apps/java-sdk/src/main/java/com/firecrawl/models/HighlightsFormat.java b/apps/java-sdk/src/main/java/com/firecrawl/models/HighlightsFormat.java new file mode 100644 index 0000000000..5bdc8def19 --- /dev/null +++ b/apps/java-sdk/src/main/java/com/firecrawl/models/HighlightsFormat.java @@ -0,0 +1,35 @@ +package com.firecrawl.models; + +import com.fasterxml.jackson.annotation.JsonInclude; + +/** + * Highlights format for extracting direct highlights from page content. 
+ */ +@JsonInclude(JsonInclude.Include.NON_NULL) +public class HighlightsFormat { + + private final String type = "highlights"; + private String query; + + private HighlightsFormat() {} + + public String getType() { return type; } + public String getQuery() { return query; } + + public static Builder builder() { return new Builder(); } + + public static final class Builder { + private String query; + + private Builder() {} + + /** Query used to select highlights from the page content. */ + public Builder query(String query) { this.query = query; return this; } + + public HighlightsFormat build() { + HighlightsFormat f = new HighlightsFormat(); + f.query = this.query; + return f; + } + } +} diff --git a/apps/java-sdk/src/main/java/com/firecrawl/models/QueryFormat.java b/apps/java-sdk/src/main/java/com/firecrawl/models/QueryFormat.java index 195281897d..6c5773dee3 100644 --- a/apps/java-sdk/src/main/java/com/firecrawl/models/QueryFormat.java +++ b/apps/java-sdk/src/main/java/com/firecrawl/models/QueryFormat.java @@ -4,8 +4,11 @@ import com.fasterxml.jackson.annotation.JsonValue; /** - * Query format for asking a question about page content. + * Deprecated query format for asking a question about page content. + * + * @deprecated Use {@link QuestionFormat} or {@link HighlightsFormat} instead. */ +@Deprecated @JsonInclude(JsonInclude.Include.NON_NULL) public class QueryFormat { diff --git a/apps/java-sdk/src/main/java/com/firecrawl/models/QuestionFormat.java b/apps/java-sdk/src/main/java/com/firecrawl/models/QuestionFormat.java new file mode 100644 index 0000000000..bcf19e3465 --- /dev/null +++ b/apps/java-sdk/src/main/java/com/firecrawl/models/QuestionFormat.java @@ -0,0 +1,35 @@ +package com.firecrawl.models; + +import com.fasterxml.jackson.annotation.JsonInclude; + +/** + * Question format for asking a question about page content. 
+ */ +@JsonInclude(JsonInclude.Include.NON_NULL) +public class QuestionFormat { + + private final String type = "question"; + private String question; + + private QuestionFormat() {} + + public String getType() { return type; } + public String getQuestion() { return question; } + + public static Builder builder() { return new Builder(); } + + public static final class Builder { + private String question; + + private Builder() {} + + /** Question to answer from the page content. */ + public Builder question(String question) { this.question = question; return this; } + + public QuestionFormat build() { + QuestionFormat f = new QuestionFormat(); + f.question = this.question; + return f; + } + } +} diff --git a/apps/java-sdk/src/main/java/com/firecrawl/models/ScrapeOptions.java b/apps/java-sdk/src/main/java/com/firecrawl/models/ScrapeOptions.java index 440af580d4..700e29179f 100644 --- a/apps/java-sdk/src/main/java/com/firecrawl/models/ScrapeOptions.java +++ b/apps/java-sdk/src/main/java/com/firecrawl/models/ScrapeOptions.java @@ -109,7 +109,7 @@ private Builder() {} /** * Output formats to request. Accepts strings like "markdown", "html", "rawHtml", * "links", "screenshot", "json", "audio", etc., or format configuration maps/objects for - * advanced formats (e.g., JsonFormat, QueryFormat). + * advanced formats (e.g., JsonFormat, QuestionFormat, HighlightsFormat). 
*/ public Builder formats(List formats) { this.formats = formats; return this; } diff --git a/apps/java-sdk/src/test/java/com/firecrawl/FirecrawlClientTest.java b/apps/java-sdk/src/test/java/com/firecrawl/FirecrawlClientTest.java index 04a68737ee..1d7cc74af7 100644 --- a/apps/java-sdk/src/test/java/com/firecrawl/FirecrawlClientTest.java +++ b/apps/java-sdk/src/test/java/com/firecrawl/FirecrawlClientTest.java @@ -80,6 +80,26 @@ void testScrapeOptionsBuilder() { assertFalse(options.getMobile()); } + @Test + void testQuestionAndHighlightsFormats() { + QuestionFormat questionFormat = QuestionFormat.builder() + .question("What is Firecrawl?") + .build(); + HighlightsFormat highlightsFormat = HighlightsFormat.builder() + .query("What is Firecrawl?") + .build(); + + ScrapeOptions options = ScrapeOptions.builder() + .formats(List.of(questionFormat, highlightsFormat)) + .build(); + + assertEquals(List.of(questionFormat, highlightsFormat), options.getFormats()); + assertEquals("question", questionFormat.getType()); + assertEquals("What is Firecrawl?", questionFormat.getQuestion()); + assertEquals("highlights", highlightsFormat.getType()); + assertEquals("What is Firecrawl?", highlightsFormat.getQuery()); + } + @Test void testCrawlOptionsBuilder() { CrawlOptions options = CrawlOptions.builder() diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index f88ddd72bb..dc9fd677de 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "4.22.0", + "version": "4.22.1", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", diff --git a/apps/js-sdk/firecrawl/src/__tests__/unit/v2/validation.test.ts b/apps/js-sdk/firecrawl/src/__tests__/unit/v2/validation.test.ts index 83a35546a9..83b8795fbe 100644 --- a/apps/js-sdk/firecrawl/src/__tests__/unit/v2/validation.test.ts +++ 
b/apps/js-sdk/firecrawl/src/__tests__/unit/v2/validation.test.ts @@ -56,6 +56,27 @@ describe("v2 utils: validation", () => { expect((formats[0] as any).viewport).toEqual({ width: 800, height: 600 }); }); + test("ensureValidFormats: accepts question, highlights, and deprecated query formats", () => { + const formats: FormatOption[] = [ + { type: "question", question: "What is Firecrawl?" }, + { type: "highlights", query: "What is Firecrawl?" }, + { type: "query", prompt: "What is Firecrawl?", mode: "directQuote" }, + ]; + expect(() => ensureValidFormats(formats)).not.toThrow(); + }); + + test("ensureValidFormats: validates question, highlights, and deprecated query fields", () => { + expect(() => + ensureValidFormats([{ type: "question", question: "" } as any]), + ).toThrow(/question format requires/i); + expect(() => + ensureValidFormats([{ type: "highlights", query: "" } as any]), + ).toThrow(/highlights format requires/i); + expect(() => + ensureValidFormats([{ type: "query", prompt: "p", mode: "quoted" } as any]), + ).toThrow(/query format mode/i); + }); + test("ensureValidScrapeOptions: leaves parsers untouched", () => { const options = { parsers: ["pdf", "images"] as string[] } as any; const before = [...options.parsers]; diff --git a/apps/js-sdk/firecrawl/src/v2/types.ts b/apps/js-sdk/firecrawl/src/v2/types.ts index a07232dc92..0b38774ad3 100644 --- a/apps/js-sdk/firecrawl/src/v2/types.ts +++ b/apps/js-sdk/firecrawl/src/v2/types.ts @@ -52,6 +52,17 @@ export interface AttributesFormat extends Format { }>; } +export interface QuestionFormat { + type: 'question'; + question: string; +} + +export interface HighlightsFormat { + type: 'highlights'; + query: string; +} + +/** @deprecated Use QuestionFormat or HighlightsFormat instead. 
*/ export interface QueryFormat { type: 'query'; prompt: string; @@ -65,6 +76,8 @@ export type FormatOption = | ChangeTrackingFormat | ScreenshotFormat | AttributesFormat + | QuestionFormat + | HighlightsFormat | QueryFormat; export type ParseFormatString = Exclude< @@ -81,6 +94,8 @@ export type ParseFormatOption = | ParseFormat | JsonFormat | AttributesFormat + | QuestionFormat + | HighlightsFormat | QueryFormat; export interface LocationConfig { @@ -450,6 +465,7 @@ export interface Document { }>; actions?: Record; answer?: string; + highlights?: string; warning?: string; changeTracking?: Record; branding?: BrandingProfile; diff --git a/apps/js-sdk/firecrawl/src/v2/utils/validation.ts b/apps/js-sdk/firecrawl/src/v2/utils/validation.ts index afb91a8cc3..b8082268f7 100644 --- a/apps/js-sdk/firecrawl/src/v2/utils/validation.ts +++ b/apps/js-sdk/firecrawl/src/v2/utils/validation.ts @@ -4,6 +4,9 @@ import { type JsonFormat, type ParseFormatOption, type ParseOptions, + type QuestionFormat, + type HighlightsFormat, + type QueryFormat, type ScrapeOptions, type ScreenshotFormat, } from "../types"; @@ -49,6 +52,30 @@ export function ensureValidFormats(formats?: FormatOption[]): void { } continue; } + if ((fmt as QuestionFormat).type === "question") { + const q = fmt as QuestionFormat; + if (typeof q.question !== "string" || q.question.trim().length === 0) { + throw new Error("question format requires a non-empty 'question' string"); + } + continue; + } + if ((fmt as HighlightsFormat).type === "highlights") { + const h = fmt as HighlightsFormat; + if (typeof h.query !== "string" || h.query.trim().length === 0) { + throw new Error("highlights format requires a non-empty 'query' string"); + } + continue; + } + if ((fmt as QueryFormat).type === "query") { + const q = fmt as QueryFormat; + if (typeof q.prompt !== "string" || q.prompt.trim().length === 0) { + throw new Error("query format requires a non-empty 'prompt' string"); + } + if (q.mode != null && q.mode !== "freeform" && 
q.mode !== "directQuote") { + throw new Error("query format mode must be 'freeform' or 'directQuote'"); + } + continue; + } if ((fmt as ScreenshotFormat).type === "screenshot") { // no-op; already camelCase; validate numeric fields if present const s = fmt as ScreenshotFormat; @@ -116,6 +143,31 @@ export function ensureValidParseFormats(formats?: ParseFormatOption[]): void { "The SDK will automatically convert Zod schemas to JSON Schema format." ); } + continue; + } + + if ((fmt as QuestionFormat).type === "question") { + const q = fmt as QuestionFormat; + if (typeof q.question !== "string" || q.question.trim().length === 0) { + throw new Error("question format requires a non-empty 'question' string"); + } + continue; + } + if ((fmt as HighlightsFormat).type === "highlights") { + const h = fmt as HighlightsFormat; + if (typeof h.query !== "string" || h.query.trim().length === 0) { + throw new Error("highlights format requires a non-empty 'query' string"); + } + continue; + } + if ((fmt as QueryFormat).type === "query") { + const q = fmt as QueryFormat; + if (typeof q.prompt !== "string" || q.prompt.trim().length === 0) { + throw new Error("query format requires a non-empty 'prompt' string"); + } + if (q.mode != null && q.mode !== "freeform" && q.mode !== "directQuote") { + throw new Error("query format mode must be 'freeform' or 'directQuote'"); + } } } } diff --git a/apps/php-sdk/src/Models/Document.php b/apps/php-sdk/src/Models/Document.php index ad9f021b09..994de06614 100644 --- a/apps/php-sdk/src/Models/Document.php +++ b/apps/php-sdk/src/Models/Document.php @@ -28,6 +28,8 @@ public function __construct( private readonly ?string $audio = null, private readonly ?array $attributes = null, private readonly ?array $actions = null, + private readonly ?string $answer = null, + private readonly ?string $highlights = null, private readonly ?string $warning = null, private readonly ?array $changeTracking = null, private readonly ?array $branding = null, @@ -49,6 +51,8 
@@ public static function fromArray(array $data): self audio: $data['audio'] ?? null, attributes: $data['attributes'] ?? null, actions: $data['actions'] ?? null, + answer: $data['answer'] ?? null, + highlights: $data['highlights'] ?? null, warning: $data['warning'] ?? null, changeTracking: $data['changeTracking'] ?? null, branding: $data['branding'] ?? null, @@ -125,6 +129,16 @@ public function getWarning(): ?string return $this->warning; } + public function getAnswer(): ?string + { + return $this->answer; + } + + public function getHighlights(): ?string + { + return $this->highlights; + } + /** @return array|null */ public function getChangeTracking(): ?array { diff --git a/apps/php-sdk/src/Models/HighlightsFormat.php b/apps/php-sdk/src/Models/HighlightsFormat.php new file mode 100644 index 0000000000..fd47f348c2 --- /dev/null +++ b/apps/php-sdk/src/Models/HighlightsFormat.php @@ -0,0 +1,31 @@ + */ + public function toArray(): array + { + return [ + 'type' => 'highlights', + 'query' => $this->query, + ]; + } + + public function getQuery(): string + { + return $this->query; + } +} diff --git a/apps/php-sdk/src/Models/ParseOptions.php b/apps/php-sdk/src/Models/ParseOptions.php index ec1d458aaf..89e03faffc 100644 --- a/apps/php-sdk/src/Models/ParseOptions.php +++ b/apps/php-sdk/src/Models/ParseOptions.php @@ -23,7 +23,7 @@ final class ParseOptions ]; /** - * @param list|null $formats + * @param list|null $formats * @param array|null $headers * @param list|null $includeTags * @param list|null $excludeTags @@ -45,7 +45,7 @@ private function __construct( ) {} /** - * @param list|null $formats + * @param list|null $formats * @param array|null $headers * @param list|null $includeTags * @param list|null $excludeTags @@ -105,8 +105,13 @@ public function toArray(): array if ($this->formats !== null) { $data['formats'] = array_map( - fn (string|JsonFormat|QueryFormat $f): string|array => - $f instanceof JsonFormat || $f instanceof QueryFormat ? 
$f->toArray() : $f, + fn (string|JsonFormat|QuestionFormat|HighlightsFormat|QueryFormat $f): string|array => + ( + $f instanceof JsonFormat + || $f instanceof QuestionFormat + || $f instanceof HighlightsFormat + || $f instanceof QueryFormat + ) ? $f->toArray() : $f, $this->formats, ); } @@ -142,13 +147,22 @@ private static function extractFormatType(mixed $fmt): ?string if ($fmt instanceof JsonFormat) { return 'json'; } + if ($fmt instanceof QuestionFormat) { + return 'question'; + } + if ($fmt instanceof HighlightsFormat) { + return 'highlights'; + } + if ($fmt instanceof QueryFormat) { + return 'query'; + } if (is_array($fmt) && isset($fmt['type']) && is_string($fmt['type'])) { return $fmt['type']; } return null; } - /** @return list|null */ + /** @return list|null */ public function getFormats(): ?array { return $this->formats; diff --git a/apps/php-sdk/src/Models/QueryFormat.php b/apps/php-sdk/src/Models/QueryFormat.php index 70daa7920f..de95f57f05 100644 --- a/apps/php-sdk/src/Models/QueryFormat.php +++ b/apps/php-sdk/src/Models/QueryFormat.php @@ -4,6 +4,7 @@ namespace Firecrawl\Models; +/** @deprecated Use QuestionFormat or HighlightsFormat instead. 
*/ final class QueryFormat { public const MODE_FREEFORM = 'freeform'; diff --git a/apps/php-sdk/src/Models/QuestionFormat.php b/apps/php-sdk/src/Models/QuestionFormat.php new file mode 100644 index 0000000000..6d076e1859 --- /dev/null +++ b/apps/php-sdk/src/Models/QuestionFormat.php @@ -0,0 +1,31 @@ + */ + public function toArray(): array + { + return [ + 'type' => 'question', + 'question' => $this->question, + ]; + } + + public function getQuestion(): string + { + return $this->question; + } +} diff --git a/apps/php-sdk/src/Models/ScrapeOptions.php b/apps/php-sdk/src/Models/ScrapeOptions.php index 7694d70ca1..b0d8d50d74 100644 --- a/apps/php-sdk/src/Models/ScrapeOptions.php +++ b/apps/php-sdk/src/Models/ScrapeOptions.php @@ -7,7 +7,7 @@ final class ScrapeOptions { /** - * @param list|null $formats + * @param list|null $formats * @param array|null $headers * @param list|null $includeTags * @param list|null $excludeTags @@ -41,7 +41,7 @@ private function __construct( ) {} /** - * @param list|null $formats + * @param list|null $formats * @param array|null $headers * @param list|null $includeTags * @param list|null $excludeTags @@ -89,8 +89,14 @@ public function toArray(): array if ($this->formats !== null) { $data['formats'] = array_map( - fn (string|JsonFormat|ScreenshotFormat|QueryFormat $f): string|array => - $f instanceof JsonFormat || $f instanceof ScreenshotFormat || $f instanceof QueryFormat ? $f->toArray() : $f, + fn (string|JsonFormat|ScreenshotFormat|QuestionFormat|HighlightsFormat|QueryFormat $f): string|array => + ( + $f instanceof JsonFormat + || $f instanceof ScreenshotFormat + || $f instanceof QuestionFormat + || $f instanceof HighlightsFormat + || $f instanceof QueryFormat + ) ? 
$f->toArray() : $f, $this->formats, ); } @@ -128,7 +134,7 @@ public function toArray(): array return $data; } - /** @return list|null */ + /** @return list|null */ public function getFormats(): ?array { return $this->formats; diff --git a/apps/php-sdk/src/Version.php b/apps/php-sdk/src/Version.php index 8fb71f9b46..aefc62765e 100644 --- a/apps/php-sdk/src/Version.php +++ b/apps/php-sdk/src/Version.php @@ -6,5 +6,5 @@ final class Version { - public const SDK_VERSION = '1.2.0'; + public const SDK_VERSION = '1.2.1'; } diff --git a/apps/php-sdk/tests/Unit/ModelsTest.php b/apps/php-sdk/tests/Unit/ModelsTest.php index 59b2592bb4..b36a04d185 100644 --- a/apps/php-sdk/tests/Unit/ModelsTest.php +++ b/apps/php-sdk/tests/Unit/ModelsTest.php @@ -6,7 +6,9 @@ use Firecrawl\Models\MapData; use Firecrawl\Models\BatchScrapeJob; use Firecrawl\Models\CrawlJob; +use Firecrawl\Models\HighlightsFormat; use Firecrawl\Models\QueryFormat; +use Firecrawl\Models\QuestionFormat; use Firecrawl\Models\ScrapeOptions; it('hydrates CreditUsage from nested data key', function (): void { @@ -147,6 +149,26 @@ ]); }); +it('serializes question and highlights formats in ScrapeOptions', function (): void { + $options = ScrapeOptions::with( + formats: [ + QuestionFormat::with('What is Firecrawl?'), + HighlightsFormat::with('What is Firecrawl?'), + ], + ); + + expect($options->toArray()['formats'])->toMatchArray([ + [ + 'type' => 'question', + 'question' => 'What is Firecrawl?', + ], + [ + 'type' => 'highlights', + 'query' => 'What is Firecrawl?', + ], + ]); +}); + it('rejects invalid query format mode', function (): void { QueryFormat::with('What is Firecrawl?', 'quoted'); })->throws(InvalidArgumentException::class, "query mode must be 'freeform' or 'directQuote'"); diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py index 8abe1b7573..90f63b1cd1 100644 --- a/apps/python-sdk/firecrawl/__init__.py +++ b/apps/python-sdk/firecrawl/__init__.py @@ -17,7 +17,7 @@ 
V1ChangeTrackingOptions, ) -__version__ = "4.25.0" +__version__ = "4.25.1" # Define the logger for the Firecrawl project logger: logging.Logger = logging.getLogger("firecrawl") diff --git a/apps/python-sdk/firecrawl/__tests__/unit/v2/utils/test_validation.py b/apps/python-sdk/firecrawl/__tests__/unit/v2/utils/test_validation.py index 41b205b1a9..2988d3972e 100644 --- a/apps/python-sdk/firecrawl/__tests__/unit/v2/utils/test_validation.py +++ b/apps/python-sdk/firecrawl/__tests__/unit/v2/utils/test_validation.py @@ -1,5 +1,12 @@ import pytest -from firecrawl.v2.types import JsonFormat, QueryFormat, ScrapeOptions, PDFParser +from firecrawl.v2.types import ( + HighlightsFormat, + JsonFormat, + QueryFormat, + QuestionFormat, + ScrapeOptions, + PDFParser, +) from firecrawl.v2.utils.validation import validate_scrape_options, prepare_scrape_options @@ -200,6 +207,33 @@ def test_prepare_query_format_with_mode(self): {"type": "query", "prompt": "What is Firecrawl?", "mode": "directQuote"} ] + def test_prepare_question_and_highlights_formats(self): + """Test question and highlights formats are preserved.""" + options = ScrapeOptions( + formats=[ + QuestionFormat(question="What is Firecrawl?"), + HighlightsFormat(query="What is Firecrawl?"), + ] + ) + result = prepare_scrape_options(options) + + assert result["formats"] == [ + {"type": "question", "question": "What is Firecrawl?"}, + {"type": "highlights", "query": "What is Firecrawl?"}, + ] + + def test_prepare_question_and_highlights_reject_empty_values(self): + """Test question and highlights validation.""" + with pytest.raises(ValueError, match="question format requires"): + prepare_scrape_options( + ScrapeOptions(formats=[{"type": "question", "question": ""}]) + ) + + with pytest.raises(ValueError, match="highlights format requires"): + prepare_scrape_options( + ScrapeOptions(formats=[{"type": "highlights", "query": ""}]) + ) + def test_prepare_query_format_rejects_direct_quote_boolean(self): """Test old query directQuote 
layout is rejected.""" options = ScrapeOptions( diff --git a/apps/python-sdk/firecrawl/types.py b/apps/python-sdk/firecrawl/types.py index 2f5a43a73c..4bb832409f 100644 --- a/apps/python-sdk/firecrawl/types.py +++ b/apps/python-sdk/firecrawl/types.py @@ -46,6 +46,9 @@ SourceOption, Format, JsonFormat, + QuestionFormat, + HighlightsFormat, + QueryFormat, FormatOption, SearchRequest, SearchResultWeb, @@ -127,6 +130,9 @@ 'SourceOption', 'Format', 'JsonFormat', + 'QuestionFormat', + 'HighlightsFormat', + 'QueryFormat', 'FormatOption', 'SearchRequest', 'SearchResultWeb', diff --git a/apps/python-sdk/firecrawl/v2/types.py b/apps/python-sdk/firecrawl/v2/types.py index a2fe2368f2..2ace0eb629 100644 --- a/apps/python-sdk/firecrawl/v2/types.py +++ b/apps/python-sdk/firecrawl/v2/types.py @@ -277,6 +277,7 @@ class Document(BaseModel): audio: Optional[str] = None actions: Optional[Dict[str, Any]] = None answer: Optional[str] = None + highlights: Optional[str] = None warning: Optional[str] = None change_tracking: Optional[Dict[str, Any]] = None branding: Optional[BrandingProfile] = None @@ -446,8 +447,22 @@ class AttributesFormat(Format): selectors: List[AttributeSelector] +class QuestionFormat(Format): + """Configuration for question format - ask a question about the page content.""" + + type: Literal["question"] = "question" + question: str + + +class HighlightsFormat(Format): + """Configuration for highlights format - extract direct highlights from page content.""" + + type: Literal["highlights"] = "highlights" + query: str + + class QueryFormat(Format): - """Configuration for query format - ask a question about the page content.""" + """Deprecated query format. 
Use QuestionFormat or HighlightsFormat instead.""" type: Literal["query"] = "query" prompt: str @@ -461,6 +476,8 @@ class QueryFormat(Format): ChangeTrackingFormat, ScreenshotFormat, AttributesFormat, + QuestionFormat, + HighlightsFormat, QueryFormat, Format, ] @@ -495,10 +512,16 @@ def validate_formats(cls, v): raise ValueError("query format must be an object with 'type' and 'prompt' fields") normalized_formats.append(Format(type=format_item)) elif isinstance(format_item, dict): - # Reject query dicts missing prompt early + fmt_type = format_item.get('type') prompt = format_item.get('prompt') - if format_item.get('type') == 'query' and (not isinstance(prompt, str) or not prompt.strip()): + question = format_item.get('question') + query = format_item.get('query') + if fmt_type == 'query' and (not isinstance(prompt, str) or not prompt.strip()): raise ValueError("query format requires a non-empty 'prompt' string") + if fmt_type == 'question' and (not isinstance(question, str) or not question.strip()): + raise ValueError("question format requires a non-empty 'question' string") + if fmt_type == 'highlights' and (not isinstance(query, str) or not query.strip()): + raise ValueError("highlights format requires a non-empty 'query' string") # Preserve dicts as-is to avoid dropping custom fields like 'schema' normalized_formats.append(format_item) elif isinstance(format_item, Format): diff --git a/apps/python-sdk/firecrawl/v2/utils/validation.py b/apps/python-sdk/firecrawl/v2/utils/validation.py index 8568bec503..11f427f2c7 100644 --- a/apps/python-sdk/firecrawl/v2/utils/validation.py +++ b/apps/python-sdk/firecrawl/v2/utils/validation.py @@ -464,6 +464,28 @@ def _validate_query_format(format_obj: Any) -> Dict[str, Any]: return format_obj +def _validate_question_format(format_obj: Any) -> Dict[str, Any]: + """Validate and prepare question format object.""" + if not isinstance(format_obj, dict): + raise ValueError("question format must be an object with 'type' and 'question' 
fields") + + if not isinstance(format_obj.get('question'), str) or not format_obj['question'].strip(): + raise ValueError("question format requires a non-empty 'question' string") + + return format_obj + + +def _validate_highlights_format(format_obj: Any) -> Dict[str, Any]: + """Validate and prepare highlights format object.""" + if not isinstance(format_obj, dict): + raise ValueError("highlights format must be an object with 'type' and 'query' fields") + + if not isinstance(format_obj.get('query'), str) or not format_obj['query'].strip(): + raise ValueError("highlights format requires a non-empty 'query' string") + + return format_obj + + def validate_scrape_options(options: Optional[ScrapeOptions]) -> Optional[ScrapeOptions]: """ Validate and normalize scrape options. @@ -573,12 +595,20 @@ def prepare_scrape_options(options: Optional[ScrapeOptions]) -> Optional[Dict[st raise ValueError("json format must be an object with 'type', 'prompt', and 'schema' fields") if fmt == "query": raise ValueError("query format must be an object with 'type' and 'prompt' fields") + if fmt == "question": + raise ValueError("question format must be an object with 'type' and 'question' fields") + if fmt == "highlights": + raise ValueError("highlights format must be an object with 'type' and 'query' fields") converted_formats.append(_convert_format_string(fmt)) elif isinstance(fmt, dict): fmt_type = _convert_format_string(fmt.get('type')) if fmt.get('type') else None if fmt_type == 'json': validated_json = _validate_json_format({**fmt, 'type': 'json'}) converted_formats.append(validated_json) + elif fmt_type == 'question': + converted_formats.append(_validate_question_format(fmt)) + elif fmt_type == 'highlights': + converted_formats.append(_validate_highlights_format(fmt)) elif fmt_type == 'query': converted_formats.append(_validate_query_format(fmt)) elif fmt_type == 'screenshot': @@ -598,6 +628,10 @@ def prepare_scrape_options(options: Optional[ScrapeOptions]) -> Optional[Dict[st elif 
hasattr(fmt, 'type'): if fmt.type == 'json': converted_formats.append(_validate_json_format(fmt.model_dump())) + elif fmt.type == 'question': + converted_formats.append(_validate_question_format(fmt.model_dump(exclude_none=True))) + elif fmt.type == 'highlights': + converted_formats.append(_validate_highlights_format(fmt.model_dump(exclude_none=True))) elif fmt.type == 'query': converted_formats.append(_validate_query_format(fmt.model_dump(exclude_none=True))) else: @@ -630,12 +664,20 @@ def prepare_scrape_options(options: Optional[ScrapeOptions]) -> Optional[Dict[st raise ValueError("json format must be an object with 'type', 'prompt', and 'schema' fields") if fmt == "query": raise ValueError("query format must be an object with 'type' and 'prompt' fields") + if fmt == "question": + raise ValueError("question format must be an object with 'type' and 'question' fields") + if fmt == "highlights": + raise ValueError("highlights format must be an object with 'type' and 'query' fields") converted_formats.append(_convert_format_string(fmt)) elif isinstance(fmt, dict): fmt_type = _convert_format_string(fmt.get('type')) if fmt.get('type') else None if fmt_type == 'json': validated_json = _validate_json_format({**fmt, 'type': 'json'}) converted_formats.append(validated_json) + elif fmt_type == 'question': + converted_formats.append(_validate_question_format(fmt)) + elif fmt_type == 'highlights': + converted_formats.append(_validate_highlights_format(fmt)) elif fmt_type == 'query': converted_formats.append(_validate_query_format(fmt)) elif fmt_type == 'screenshot': @@ -653,6 +695,10 @@ def prepare_scrape_options(options: Optional[ScrapeOptions]) -> Optional[Dict[st elif hasattr(fmt, 'type'): if fmt.type == 'json': converted_formats.append(_validate_json_format(fmt.model_dump())) + elif fmt.type == 'question': + converted_formats.append(_validate_question_format(fmt.model_dump(exclude_none=True))) + elif fmt.type == 'highlights': + 
converted_formats.append(_validate_highlights_format(fmt.model_dump(exclude_none=True))) elif fmt.type == 'screenshot': normalized = {'type': 'screenshot'} if getattr(fmt, 'full_page', None) is not None: diff --git a/apps/ruby-sdk/lib/firecrawl/models/document.rb b/apps/ruby-sdk/lib/firecrawl/models/document.rb index 931911af36..578c095b95 100644 --- a/apps/ruby-sdk/lib/firecrawl/models/document.rb +++ b/apps/ruby-sdk/lib/firecrawl/models/document.rb @@ -6,7 +6,8 @@ module Models class Document attr_reader :markdown, :html, :raw_html, :json, :summary, :metadata, :links, :images, :screenshot, :audio, - :attributes, :actions, :warning, :change_tracking, :branding + :attributes, :actions, :answer, :highlights, :warning, + :change_tracking, :branding def initialize(data) @markdown = data["markdown"] @@ -21,6 +22,8 @@ def initialize(data) @audio = data["audio"] @attributes = data["attributes"] @actions = data["actions"] + @answer = data["answer"] + @highlights = data["highlights"] @warning = data["warning"] @change_tracking = data["changeTracking"] @branding = data["branding"] diff --git a/apps/ruby-sdk/lib/firecrawl/models/query_format.rb b/apps/ruby-sdk/lib/firecrawl/models/query_format.rb index 5706ff36b4..bf9ad3dacb 100644 --- a/apps/ruby-sdk/lib/firecrawl/models/query_format.rb +++ b/apps/ruby-sdk/lib/firecrawl/models/query_format.rb @@ -2,7 +2,47 @@ module Firecrawl module Models - # Query format for asking a question about page content. + # Question format for asking a question about page content. + class QuestionFormat + attr_reader :question + + def initialize(question:) + @question = question + end + + def to_h + { + "type" => "question", + "question" => question, + } + end + + def type + "question" + end + end + + # Highlights format for extracting direct highlights from page content. 
+ class HighlightsFormat + attr_reader :query + + def initialize(query:) + @query = query + end + + def to_h + { + "type" => "highlights", + "query" => query, + } + end + + def type + "highlights" + end + end + + # Deprecated query format for asking a question about page content. class QueryFormat MODE_FREEFORM = "freeform" MODE_DIRECT_QUOTE = "directQuote" diff --git a/apps/ruby-sdk/lib/firecrawl/version.rb b/apps/ruby-sdk/lib/firecrawl/version.rb index 02eba3e87a..7b806711fb 100644 --- a/apps/ruby-sdk/lib/firecrawl/version.rb +++ b/apps/ruby-sdk/lib/firecrawl/version.rb @@ -1,5 +1,5 @@ # frozen_string_literal: true module Firecrawl - VERSION = "1.4.0" + VERSION = "1.4.1" end diff --git a/apps/ruby-sdk/test/firecrawl/client_test.rb b/apps/ruby-sdk/test/firecrawl/client_test.rb index 184d5505b6..7b4997d2c1 100644 --- a/apps/ruby-sdk/test/firecrawl/client_test.rb +++ b/apps/ruby-sdk/test/firecrawl/client_test.rb @@ -471,6 +471,20 @@ def test_query_format_to_h ) end + def test_question_and_highlights_format_to_h + question = Firecrawl::Models::QuestionFormat.new(question: "What is Firecrawl?") + highlights = Firecrawl::Models::HighlightsFormat.new(query: "What is Firecrawl?") + opts = Firecrawl::Models::ScrapeOptions.new(formats: [question, highlights]) + + assert_equal( + [ + { "type" => "question", "question" => "What is Firecrawl?" }, + { "type" => "highlights", "query" => "What is Firecrawl?" 
}, + ], + opts.to_h["formats"] + ) + end + def test_query_format_rejects_invalid_mode assert_raises(ArgumentError) do Firecrawl::Models::QueryFormat.new(prompt: "What is Firecrawl?", mode: "quoted") diff --git a/apps/rust-sdk/Cargo.lock b/apps/rust-sdk/Cargo.lock index a0f539d345..48cda122f9 100644 --- a/apps/rust-sdk/Cargo.lock +++ b/apps/rust-sdk/Cargo.lock @@ -250,7 +250,7 @@ checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" [[package]] name = "firecrawl" -version = "2.4.0" +version = "2.4.1" dependencies = [ "mockito", "reqwest", diff --git a/apps/rust-sdk/Cargo.toml b/apps/rust-sdk/Cargo.toml index 509e1dcdc3..8e6bb84c0a 100644 --- a/apps/rust-sdk/Cargo.toml +++ b/apps/rust-sdk/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "firecrawl" -version = "2.4.0" +version = "2.4.1" edition = "2021" license = "MIT" homepage = "https://www.firecrawl.dev/" diff --git a/apps/rust-sdk/src/scrape.rs b/apps/rust-sdk/src/scrape.rs index 43ec1b86d5..36d213a05b 100644 --- a/apps/rust-sdk/src/scrape.rs +++ b/apps/rust-sdk/src/scrape.rs @@ -436,7 +436,7 @@ impl Client { #[cfg(test)] mod tests { use super::*; - use crate::{QueryFormat, QueryFormatMode}; + use crate::{HighlightsFormat, QueryFormat, QueryFormatMode, QuestionFormat}; use serde_json::json; #[test] @@ -460,6 +460,37 @@ mod tests { ); } + #[test] + fn test_question_and_highlights_formats_serialize() { + let options = ScrapeOptions { + formats: Some(vec![ + Format::Question(QuestionFormat { + question: "What is Firecrawl?".to_string(), + }), + Format::Highlights(HighlightsFormat { + query: "What is Firecrawl?".to_string(), + }), + ]), + ..Default::default() + }; + + let payload = serde_json::to_value(options).unwrap(); + assert_eq!( + payload["formats"][0], + json!({ + "type": "question", + "question": "What is Firecrawl?" + }) + ); + assert_eq!( + payload["formats"][1], + json!({ + "type": "highlights", + "query": "What is Firecrawl?" 
+ }) + ); + } + #[tokio::test] async fn test_scrape_with_mock() { let mut server = mockito::Server::new_async().await; diff --git a/apps/rust-sdk/src/types.rs b/apps/rust-sdk/src/types.rs index bcc7879ee6..5cb1cdea02 100644 --- a/apps/rust-sdk/src/types.rs +++ b/apps/rust-sdk/src/types.rs @@ -33,7 +33,11 @@ pub enum Format { Branding, /// Audio extraction (MP3) from YouTube videos. Audio, - /// Query answer generated from the page content. + /// Question answer generated from the page content. + Question(QuestionFormat), + /// Direct highlights selected from the page content. + Highlights(HighlightsFormat), + /// Deprecated query answer generated from the page content. Query(QueryFormat), } @@ -55,6 +59,8 @@ impl Serialize for Format { Format::Attributes => serializer.serialize_str("attributes"), Format::Branding => serializer.serialize_str("branding"), Format::Audio => serializer.serialize_str("audio"), + Format::Question(question) => question.serialize(serializer), + Format::Highlights(highlights) => highlights.serialize(serializer), Format::Query(query) => query.serialize(serializer), } } @@ -82,15 +88,116 @@ impl<'de> Deserialize<'de> for Format { "audio" => Ok(Format::Audio), _ => Err(de::Error::custom(format!("unknown format: {}", format))), }, - Value::Object(_) => QueryFormat::deserialize(value) - .map(Format::Query) - .map_err(de::Error::custom), + Value::Object(_) => match value.get("type").and_then(Value::as_str) { + Some("question") => QuestionFormat::deserialize(value) + .map(Format::Question) + .map_err(de::Error::custom), + Some("highlights") => HighlightsFormat::deserialize(value) + .map(Format::Highlights) + .map_err(de::Error::custom), + Some("query") => QueryFormat::deserialize(value) + .map(Format::Query) + .map_err(de::Error::custom), + Some(format_type) => Err(de::Error::custom(format!( + "unknown object format: {}", + format_type + ))), + None => Err(de::Error::custom("object format must have a type")), + }, _ => Err(de::Error::custom("format 
must be a string or object")), } } } -/// Query format for asking a question about page content. +/// Question format for asking a question about page content. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct QuestionFormat { + pub question: String, +} + +#[derive(Deserialize, Serialize)] +#[serde(rename_all = "camelCase")] +struct QuestionFormatWire { + #[serde(rename = "type")] + format_type: String, + question: String, +} + +impl Serialize for QuestionFormat { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + QuestionFormatWire { + format_type: "question".to_string(), + question: self.question.clone(), + } + .serialize(serializer) + } +} + +impl<'de> Deserialize<'de> for QuestionFormat { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + let wire = QuestionFormatWire::deserialize(deserializer)?; + if wire.format_type != "question" { + return Err(de::Error::custom( + "question format object must have type question", + )); + } + + Ok(Self { + question: wire.question, + }) + } +} + +/// Highlights format for selecting direct highlights from page content. 
+#[derive(Clone, Debug, PartialEq, Eq)] +pub struct HighlightsFormat { + pub query: String, +} + +#[derive(Deserialize, Serialize)] +#[serde(rename_all = "camelCase")] +struct HighlightsFormatWire { + #[serde(rename = "type")] + format_type: String, + query: String, +} + +impl Serialize for HighlightsFormat { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + HighlightsFormatWire { + format_type: "highlights".to_string(), + query: self.query.clone(), + } + .serialize(serializer) + } +} + +impl<'de> Deserialize<'de> for HighlightsFormat { + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + let wire = HighlightsFormatWire::deserialize(deserializer)?; + if wire.format_type != "highlights" { + return Err(de::Error::custom( + "highlights format object must have type highlights", + )); + } + + Ok(Self { query: wire.query }) + } +} + +/// Deprecated query format for asking a question about page content. #[derive(Clone, Debug, PartialEq, Eq)] pub struct QueryFormat { pub prompt: String, @@ -555,6 +662,10 @@ pub struct Document { pub attributes: Option>, /// Action results. pub actions: Option>, + /// Answer generated by the question or deprecated query format. + pub answer: Option, + /// Highlights generated by the highlights format. + pub highlights: Option, /// Warning message. pub warning: Option, /// Change tracking data. 
From 181c341a0e8005167aa580bb250dd036a04e5d74 Mon Sep 17 00:00:00 2001 From: mogery Date: Wed, 6 May 2026 19:57:37 +0200 Subject: [PATCH 22/27] fix(scrape): support YouTube live postprocessing Co-authored-by: Cursor --- .../postprocessors/__tests__/youtube.test.ts | 48 +++++++++++++++++++ .../scrapeURL/postprocessors/youtube.ts | 11 ++++- 2 files changed, 58 insertions(+), 1 deletion(-) create mode 100644 apps/api/src/scraper/scrapeURL/postprocessors/__tests__/youtube.test.ts diff --git a/apps/api/src/scraper/scrapeURL/postprocessors/__tests__/youtube.test.ts b/apps/api/src/scraper/scrapeURL/postprocessors/__tests__/youtube.test.ts new file mode 100644 index 0000000000..9e70415b48 --- /dev/null +++ b/apps/api/src/scraper/scrapeURL/postprocessors/__tests__/youtube.test.ts @@ -0,0 +1,48 @@ +import { youtubePostprocessor } from "../youtube"; + +describe("youtubePostprocessor.shouldRun", () => { + const meta = {} as any; + + it("runs for YouTube live video URLs", () => { + expect( + youtubePostprocessor.shouldRun( + meta, + new URL("https://www.youtube.com/live/H4fUJQCIV5E"), + ), + ).toBe(true); + }); + + it("keeps existing YouTube video URL support", () => { + expect( + youtubePostprocessor.shouldRun( + meta, + new URL("https://www.youtube.com/watch?v=H4fUJQCIV5E"), + ), + ).toBe(true); + expect( + youtubePostprocessor.shouldRun( + meta, + new URL("https://youtu.be/H4fUJQCIV5E"), + ), + ).toBe(true); + }); + + it("does not run for non-video YouTube paths or already processed URLs", () => { + expect( + youtubePostprocessor.shouldRun(meta, new URL("https://www.youtube.com/")), + ).toBe(false); + expect( + youtubePostprocessor.shouldRun( + meta, + new URL("https://www.youtube.com/live/"), + ), + ).toBe(false); + expect( + youtubePostprocessor.shouldRun( + meta, + new URL("https://www.youtube.com/live/H4fUJQCIV5E"), + ["youtube"], + ), + ).toBe(false); + }); +}); diff --git a/apps/api/src/scraper/scrapeURL/postprocessors/youtube.ts 
b/apps/api/src/scraper/scrapeURL/postprocessors/youtube.ts index 3a9ead8103..e05ffe4424 100644 --- a/apps/api/src/scraper/scrapeURL/postprocessors/youtube.ts +++ b/apps/api/src/scraper/scrapeURL/postprocessors/youtube.ts @@ -51,6 +51,15 @@ function formatUploadedBy(metadata: YouTubeMetadataResponse): string { return name || url || ""; } +function isYouTubeVideoPath(url: URL): boolean { + if (url.pathname === "/watch" && !!url.searchParams.get("v")) { + return true; + } + + const pathParts = url.pathname.split("/").filter(Boolean); + return pathParts.length === 2 && pathParts[0] === "live"; +} + function buildMarkdown( metadata: YouTubeMetadataResponse, sourceUrl: string, @@ -139,7 +148,7 @@ export const youtubePostprocessor: Postprocessor = { url.hostname.endsWith(".youtube.com") || url.hostname === "youtube.com" ) { - return url.pathname === "/watch" && !!url.searchParams.get("v"); + return isYouTubeVideoPath(url); } else if (url.hostname === "youtu.be") { return url.pathname !== "/"; } else { From c672f4b38080cb5979d0474987024dbbd2d72cd8 Mon Sep 17 00:00:00 2001 From: Himanshu Gupta Date: Thu, 7 May 2026 02:09:24 +0530 Subject: [PATCH 23/27] feat(api): add proxy routes for support-agent ask endpoints (#3490) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat(api): add proxy routes for support-agent ask endpoints Forwards /v2/support/ask and /v2/support/docs-search to the support-agent service (SUPPORT_AGENT_URL env var). No auth middleware on the proxy — the support-agent validates the bearer itself. This lets callers use a single api.firecrawl.dev base URL for all Firecrawl endpoints instead of needing to know about ask.firecrawl.dev. Co-Authored-By: Claude Opus 4.6 * docs(env): add SUPPORT_AGENT_URL to .env.example Co-Authored-By: Claude Opus 4.6 * fix(proxy): forward content-type from upstream response Without this, Express defaults to text/html for string bodies, breaking JSON parsing in client libraries. 
Co-Authored-By: Claude Opus 4.6 * fix(proxy): always set content-type to application/json on upstream request Body is always JSON.stringify'd, so forwarding the client's original content-type (e.g. application/x-www-form-urlencoded) would cause a mismatch that breaks upstream parsing. Co-Authored-By: Claude Opus 4.6 * feat(proxy): add auth middleware to /support/ask route Adds rate-limited auth check for consistency with other v2 endpoints. docs-search remains public (no auth) as designed. Co-Authored-By: Claude Opus 4.6 * feat(proxy): add dedicated Support rate limit (5 req/hour) Adds RateLimiterMode.Support with a 3600s window and 5-request cap. Applied to both /support/ask and /support/docs-search routes. Co-Authored-By: Claude Opus 4.6 * fix(proxy): change support rate limit to 3 req/min Simpler and consistent with other modes using the default 60s window. Co-Authored-By: Claude Opus 4.6 * feat(proxy): split support rate limits into independent buckets Each endpoint gets its own 3 req/min limit so they don't share a bucket. Co-Authored-By: Claude Opus 4.6 * refactor(rate-limiter): remove unnecessary duration override Reverts createRateLimiter back to its original signature. Support endpoints use the same 60s window as everything else. Co-Authored-By: Claude Opus 4.6 * feat(proxy): add Vercel deployment protection bypass header Adds SUPPORT_AGENT_VERCEL_BYPASS_SECRET env var. When set, the proxy includes x-vercel-protection-bypass header on upstream requests to ash.firecrawl.dev. 
Co-Authored-By: Claude Opus 4.6 --------- Co-authored-by: Claude Opus 4.6 --- apps/api/.env.example | 5 ++ apps/api/src/config.ts | 2 + apps/api/src/controllers/v1/types.ts | 2 + apps/api/src/controllers/v2/support-proxy.ts | 65 ++++++++++++++++++++ apps/api/src/routes/v2.ts | 13 ++++ apps/api/src/services/rate-limiter.ts | 2 + apps/api/src/types.ts | 2 + 7 files changed, 91 insertions(+) create mode 100644 apps/api/src/controllers/v2/support-proxy.ts diff --git a/apps/api/.env.example b/apps/api/.env.example index 232226bf85..249be61fcd 100644 --- a/apps/api/.env.example +++ b/apps/api/.env.example @@ -84,6 +84,11 @@ SELF_HOSTED_WEBHOOK_URL= # Set this to the HMAC secret of your webhook when using the self-hosted version of FireCrawl SELF_HOSTED_WEBHOOK_HMAC_SECRET= +# Support Agent service URL for /v2/support/* proxy +SUPPORT_AGENT_URL= +# Vercel Deployment Protection bypass secret for the support agent +SUPPORT_AGENT_VERCEL_BYPASS_SECRET= + # Resend API Key for transactional emails RESEND_API_KEY= diff --git a/apps/api/src/config.ts b/apps/api/src/config.ts index 7d328d3d20..b5e042f685 100644 --- a/apps/api/src/config.ts +++ b/apps/api/src/config.ts @@ -36,6 +36,8 @@ const configSchema = z.object({ FIRECRAWL_APP_PORT: z.string().default("3002"), FIRECRAWL_APP_SCHEME: z.string().default("http"), LOGGING_LEVEL: z.string().optional(), + SUPPORT_AGENT_URL: z.string().url().optional(), + SUPPORT_AGENT_VERCEL_BYPASS_SECRET: z.string().optional(), // Express EXPRESS_TRUST_PROXY: z.coerce.number().optional(), diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 792bf3d106..478a1b5417 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -1265,6 +1265,8 @@ export type AuthCreditUsageChunk = { browser?: number; browserExecute?: number; account?: number; + supportAsk?: number; + supportDocsSearch?: number; }; concurrency: number; flags: TeamFlags; diff --git 
a/apps/api/src/controllers/v2/support-proxy.ts b/apps/api/src/controllers/v2/support-proxy.ts new file mode 100644 index 0000000000..3a698ddce9 --- /dev/null +++ b/apps/api/src/controllers/v2/support-proxy.ts @@ -0,0 +1,65 @@ +import { Request, Response } from "express"; +import { config } from "../../config"; +import { logger } from "../../lib/logger"; + +const SUPPORT_AGENT_BASE = config.SUPPORT_AGENT_URL; +const SUPPORT_AGENT_BYPASS = config.SUPPORT_AGENT_VERCEL_BYPASS_SECRET; +const PROXY_TIMEOUT_MS = 65_000; + +const FORWARDED_HEADERS = [ + "authorization", + "idempotency-key", + "x-request-id", +]; + +export async function supportProxyController( + req: Request, + res: Response, +): Promise { + if (!SUPPORT_AGENT_BASE) { + res.status(503).json({ error: "support_agent_unavailable" }); + return; + } + + const target = `${SUPPORT_AGENT_BASE}/api/v2${req.path}`; + + const headers: Record = {}; + for (const name of FORWARDED_HEADERS) { + const value = req.headers[name]; + if (typeof value === "string") { + headers[name] = value; + } + } + + try { + const upstream = await fetch(target, { + method: "POST", + headers: { + ...headers, + "content-type": "application/json", + ...(SUPPORT_AGENT_BYPASS && { + "x-vercel-protection-bypass": SUPPORT_AGENT_BYPASS, + }), + }, + body: JSON.stringify(req.body), + signal: AbortSignal.timeout(PROXY_TIMEOUT_MS), + }); + + for (const name of ["content-type", "x-request-id", "x-idempotency-cached"]) { + const value = upstream.headers.get(name); + if (value) res.setHeader(name, value); + } + + res.status(upstream.status); + const body = await upstream.text(); + res.send(body); + } catch (err: unknown) { + if (err instanceof DOMException && err.name === "TimeoutError") { + logger.error("Support agent proxy timeout"); + res.status(504).json({ error: "support_agent_timeout" }); + return; + } + logger.error("Support agent proxy error", { error: err }); + res.status(502).json({ error: "support_agent_unreachable" }); + } +} diff --git 
a/apps/api/src/routes/v2.ts b/apps/api/src/routes/v2.ts index 407e1f2de1..5ebf901e2a 100644 --- a/apps/api/src/routes/v2.ts +++ b/apps/api/src/routes/v2.ts @@ -56,6 +56,7 @@ import { browserWebhookDestroyedController, } from "../controllers/v2/browser"; import { activityController } from "../controllers/v1/activity"; +import { supportProxyController } from "../controllers/v2/support-proxy"; import { agentSignupController } from "../controllers/v2/agent-signup"; import { agentSignupConfirmController, @@ -546,6 +547,18 @@ v2Router.post( wrap(browserWebhookDestroyedController), ); +// Support agent proxy — forwards to the support-agent service. +v2Router.post( + "/support/ask", + authMiddleware(RateLimiterMode.SupportAsk), + wrap(supportProxyController), +); +v2Router.post( + "/support/docs-search", + authMiddleware(RateLimiterMode.SupportDocsSearch), + wrap(supportProxyController), +); + // Agent signup routes (public, no auth required — rate limiting is handled inside the controller) // v2Router.post("/agent-signup", wrap(agentSignupController)); v2Router.post("/agent-signup/confirm", wrap(agentSignupConfirmController)); diff --git a/apps/api/src/services/rate-limiter.ts b/apps/api/src/services/rate-limiter.ts index 79a49803cb..2083b8c83a 100644 --- a/apps/api/src/services/rate-limiter.ts +++ b/apps/api/src/services/rate-limiter.ts @@ -30,6 +30,8 @@ const fallbackRateLimits: AuthCreditUsageChunk["rate_limits"] = { browser: 2, browserExecute: 10, account: 1000, + supportAsk: 3, + supportDocsSearch: 3, }; export function getRateLimiter( diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts index 8ad6ffbee5..ed4168477d 100644 --- a/apps/api/src/types.ts +++ b/apps/api/src/types.ts @@ -152,6 +152,8 @@ export enum RateLimiterMode { Browser = "browser", BrowserExecute = "browserExecute", Account = "account", + SupportAsk = "supportAsk", + SupportDocsSearch = "supportDocsSearch", } export type AuthResponse = From 4d5c1dc9636a2ddc6934474eff66214ad8aab62e Mon Sep 17 
00:00:00 2001 From: Abimael Martell Date: Wed, 6 May 2026 14:21:18 -0700 Subject: [PATCH 24/27] feat(api): deprecation warnings on legacy endpoints (#3469) * feat(api): deprecation warnings on legacy endpoints Add a deprecation middleware driven by a typed registry that emits RFC 9745 Deprecation/Sunset headers and injects warning + replacement fields into JSON responses. Wired into v1 extract, deep-research, llmstxt and v2 extract. * chore(api): drop HTTP method prefix from deprecation messages Path is unambiguous on its own. * test(api): use crypto.randomUUID in deprecation test uuid@13 is ESM-only and Jest cannot parse its export syntax under the current ts-jest config, causing the test suite to fail to load. Replace the direct uuid import with the global crypto.randomUUID(), matching the convention used by other snips tests. * feat(api): emit deprecation warnings via warnings[] and standard headers Switch the deprecation middleware response shape from a single warning string to a warnings array so multiple notices can coexist with controller-emitted warnings. Surface the same notice in standard HTTP headers: Warning: 299 (RFC 7234) for the human-readable text and Link rel="successor-version" (RFC 5829 / RFC 8288) for the replacement endpoint, alongside the existing Deprecation (RFC 9745) and Sunset (RFC 8594) headers. * feat(api): deprecation warnings on legacy v0 endpoints Wire deprecationMiddleware to /v0/scrape, /v0/crawl, /v0/crawl/status/:jobId, /v0/crawl/cancel/:jobId, and /v0/search. Each points clients at its v2 successor via the standard Deprecation, Warning, and Link rel="successor-version" headers and the warnings[] body field. Health probes and /v0/keyAuth are left untouched. 
--- .../__tests__/snips/v1/deprecation.test.ts | 93 +++++++++++++++ apps/api/src/lib/deprecations.ts | 109 ++++++++++++++++++ apps/api/src/routes/v0.ts | 27 ++++- apps/api/src/routes/v1.ts | 7 ++ apps/api/src/routes/v2.ts | 3 + 5 files changed, 234 insertions(+), 5 deletions(-) create mode 100644 apps/api/src/__tests__/snips/v1/deprecation.test.ts create mode 100644 apps/api/src/lib/deprecations.ts diff --git a/apps/api/src/__tests__/snips/v1/deprecation.test.ts b/apps/api/src/__tests__/snips/v1/deprecation.test.ts new file mode 100644 index 0000000000..e585567818 --- /dev/null +++ b/apps/api/src/__tests__/snips/v1/deprecation.test.ts @@ -0,0 +1,93 @@ +import { describe, it, expect, beforeAll } from "@jest/globals"; +import request from "supertest"; +import { TEST_API_URL } from "../lib"; +import { idmux, Identity } from "./lib"; + +let identity: Identity; + +beforeAll(async () => { + identity = await idmux({ + name: "deprecation", + concurrency: 10, + credits: 1000, + }); +}, 10000); + +describe("Deprecation warnings on legacy endpoints", () => { + it("POST /v1/llmstxt enqueues with Deprecation header and warnings in body", async () => { + const res = await request(TEST_API_URL) + .post("/v1/llmstxt") + .set("Authorization", `Bearer ${identity.apiKey}`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + + expect(res.statusCode).toBe(200); + expect(res.body.success).toBe(true); + expect(res.headers["deprecation"]).toBe("true"); + expect(res.headers["warning"]).toMatch(/^299 - "/); + expect(res.headers["warning"]).toMatch(/llmstxt/i); + expect(Array.isArray(res.body.warnings)).toBe(true); + expect(res.body.warnings.some((w: string) => /llmstxt/i.test(w))).toBe( + true, + ); + expect(res.body.warnings.some((w: string) => /deprecated/i.test(w))).toBe( + true, + ); + expect(res.body.replacement).toBeUndefined(); + expect(res.headers["link"]).toBeUndefined(); + }, 30000); + + it("GET /v1/llmstxt/:jobId still emits warnings on 404", 
async () => { + const res = await request(TEST_API_URL) + .get(`/v1/llmstxt/${crypto.randomUUID()}`) + .set("Authorization", `Bearer ${identity.apiKey}`); + + expect(res.statusCode).toBe(404); + expect(res.headers["deprecation"]).toBe("true"); + expect(res.headers["warning"]).toMatch(/deprecated/i); + expect(Array.isArray(res.body.warnings)).toBe(true); + expect(res.body.warnings.some((w: string) => /deprecated/i.test(w))).toBe( + true, + ); + }, 30000); + + it("POST /v1/deep-research returns warnings and successor-version Link", async () => { + const res = await request(TEST_API_URL) + .post("/v1/deep-research") + .set("Authorization", `Bearer ${identity.apiKey}`) + .set("Content-Type", "application/json") + .send({ + query: "what is firecrawl", + maxDepth: 1, + maxUrls: 1, + timeLimit: 60, + }); + + expect(res.statusCode).toBe(200); + expect(res.headers["deprecation"]).toBe("true"); + expect(res.headers["warning"]).toMatch(/deep-research/i); + expect(res.headers["link"]).toContain( + '; rel="successor-version"', + ); + expect(Array.isArray(res.body.warnings)).toBe(true); + expect( + res.body.warnings.some((w: string) => /deep-research/i.test(w)), + ).toBe(true); + expect(res.body.replacement).toBe("/v2/search"); + }, 30000); + + it("non-deprecated endpoints do not emit Deprecation header or warnings", async () => { + const res = await request(TEST_API_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer ${identity.apiKey}`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + + expect(res.headers["deprecation"]).toBeUndefined(); + expect(res.headers["warning"]).toBeUndefined(); + if (res.body && typeof res.body === "object") { + expect(res.body.warnings).toBeUndefined(); + expect(res.body.replacement).toBeUndefined(); + } + }, 60000); +}); diff --git a/apps/api/src/lib/deprecations.ts b/apps/api/src/lib/deprecations.ts new file mode 100644 index 0000000000..02ede0a01e --- /dev/null +++ b/apps/api/src/lib/deprecations.ts @@ 
-0,0 +1,109 @@ +import { NextFunction, Request, Response } from "express"; + +interface Deprecation { + message: string; + replacement?: string; + sunset?: string; + docs?: string; +} + +const DEPRECATIONS = { + v1_extract: { + message: + "/v1/extract is deprecated. Use /v2/scrape with formats including a 'json' format object.", + replacement: "/v2/scrape", + }, + v1_extract_status: { + message: + "/v1/extract/:jobId is deprecated. Use /v2/scrape with formats including a 'json' format object.", + replacement: "/v2/scrape", + }, + v2_extract: { + message: + "/v2/extract is deprecated. Use /v2/scrape with formats including a 'json' format object.", + replacement: "/v2/scrape", + }, + v2_extract_status: { + message: + "/v2/extract/:jobId is deprecated. Use /v2/scrape with formats including a 'json' format object.", + replacement: "/v2/scrape", + }, + v1_deep_research: { + message: "/v1/deep-research is deprecated. Use /v2/search instead.", + replacement: "/v2/search", + }, + v1_deep_research_status: { + message: "/v1/deep-research/:jobId is deprecated. Use /v2/search instead.", + replacement: "/v2/search", + }, + v1_llmstxt: { + message: "/v1/llmstxt is deprecated and will not be replaced.", + }, + v1_llmstxt_status: { + message: "/v1/llmstxt/:jobId is deprecated and will not be replaced.", + }, + v0_scrape: { + message: "/v0/scrape is deprecated. Use /v2/scrape instead.", + replacement: "/v2/scrape", + }, + v0_crawl: { + message: "/v0/crawl is deprecated. Use /v2/crawl instead.", + replacement: "/v2/crawl", + }, + v0_crawl_status: { + message: + "/v0/crawl/status/:jobId is deprecated. Use /v2/crawl/:jobId instead.", + replacement: "/v2/crawl/:jobId", + }, + v0_crawl_cancel: { + message: + "/v0/crawl/cancel/:jobId is deprecated. Use DELETE /v2/crawl/:jobId instead.", + replacement: "/v2/crawl/:jobId", + }, + v0_search: { + message: "/v0/search is deprecated. 
Use /v2/search instead.", + replacement: "/v2/search", + }, +} as const satisfies Record; + +type DeprecationKey = keyof typeof DEPRECATIONS; + +// RFC 7234 quoted-string: escape backslash and double quote. +function quoteWarningText(s: string): string { + return `"${s.replace(/\\/g, "\\\\").replace(/"/g, '\\"')}"`; +} + +export function deprecationMiddleware(key: DeprecationKey) { + const dep: Deprecation = DEPRECATIONS[key]; + return (req: Request, res: Response, next: NextFunction) => { + // RFC 9745 Deprecation header. + res.setHeader("Deprecation", "true"); + // RFC 8594 Sunset header. + if (dep.sunset) res.setHeader("Sunset", dep.sunset); + + // RFC 8288 Link relations: "deprecation" (RFC 9745) for docs, and + // "successor-version" (RFC 5829) for the replacement endpoint. + const links: string[] = []; + if (dep.docs) links.push(`<${dep.docs}>; rel="deprecation"`); + if (dep.replacement) { + links.push(`<${dep.replacement}>; rel="successor-version"`); + } + if (links.length > 0) res.setHeader("Link", links.join(", ")); + + // RFC 7234 Warning header, code 299 = "Miscellaneous Persistent Warning". + res.setHeader("Warning", `299 - ${quoteWarningText(dep.message)}`); + + const originalJson = res.json.bind(res); + res.json = (body: any) => { + if (body && typeof body === "object" && !Array.isArray(body)) { + const existing = Array.isArray(body.warnings) ? 
body.warnings : []; + body.warnings = [...existing, dep.message]; + if (dep.replacement && body.replacement === undefined) { + body.replacement = dep.replacement; + } + } + return originalJson(body); + }; + next(); + }; +} diff --git a/apps/api/src/routes/v0.ts b/apps/api/src/routes/v0.ts index 772cab5810..550a7b8b67 100644 --- a/apps/api/src/routes/v0.ts +++ b/apps/api/src/routes/v0.ts @@ -7,19 +7,36 @@ import { crawlCancelController } from "../../src/controllers/v0/crawl-cancel"; import { keyAuthController } from "../../src/controllers/v0/keyAuth"; import { livenessController } from "../controllers/v0/liveness"; import { readinessController } from "../controllers/v0/readiness"; +import { deprecationMiddleware } from "../lib/deprecations"; export const v0Router = express.Router(); -v0Router.post("/v0/scrape", scrapeController); -v0Router.post("/v0/crawl", crawlController); -v0Router.get("/v0/crawl/status/:jobId", crawlStatusController); -v0Router.delete("/v0/crawl/cancel/:jobId", crawlCancelController); +v0Router.post( + "/v0/scrape", + deprecationMiddleware("v0_scrape"), + scrapeController, +); +v0Router.post("/v0/crawl", deprecationMiddleware("v0_crawl"), crawlController); +v0Router.get( + "/v0/crawl/status/:jobId", + deprecationMiddleware("v0_crawl_status"), + crawlStatusController, +); +v0Router.delete( + "/v0/crawl/cancel/:jobId", + deprecationMiddleware("v0_crawl_cancel"), + crawlCancelController, +); // Auth route for key based authentication v0Router.get("/v0/keyAuth", keyAuthController); // Search routes -v0Router.post("/v0/search", searchController); +v0Router.post( + "/v0/search", + deprecationMiddleware("v0_search"), + searchController, +); // Health/Probe routes v0Router.get("/v0/health/liveness", livenessController); diff --git a/apps/api/src/routes/v1.ts b/apps/api/src/routes/v1.ts index 5b8f0b67d4..c698eb9b01 100644 --- a/apps/api/src/routes/v1.ts +++ b/apps/api/src/routes/v1.ts @@ -46,6 +46,7 @@ import { createX402RouteConfig, isX402Enabled, } 
from "../lib/x402"; +import { deprecationMiddleware } from "../lib/deprecations"; expressWs(express()); @@ -222,6 +223,7 @@ v1Router.ws("/crawl/:jobId", crawlStatusWSController); v1Router.post( "/extract", authMiddleware(RateLimiterMode.Extract), + deprecationMiddleware("v1_extract"), countryCheck, checkCreditsMiddleware(20), wrap(extractController), @@ -230,12 +232,14 @@ v1Router.post( v1Router.get( "/extract/:jobId", authMiddleware(RateLimiterMode.ExtractStatus), + deprecationMiddleware("v1_extract_status"), wrap(extractStatusController), ); v1Router.post( "/llmstxt", authMiddleware(RateLimiterMode.Scrape), + deprecationMiddleware("v1_llmstxt"), countryCheck, blocklistMiddleware, wrap(generateLLMsTextController), @@ -244,12 +248,14 @@ v1Router.post( v1Router.get( "/llmstxt/:jobId", authMiddleware(RateLimiterMode.CrawlStatus), + deprecationMiddleware("v1_llmstxt_status"), wrap(generateLLMsTextStatusController), ); v1Router.post( "/deep-research", authMiddleware(RateLimiterMode.Crawl), + deprecationMiddleware("v1_deep_research"), countryCheck, checkCreditsMiddleware(1), wrap(deepResearchController), @@ -258,6 +264,7 @@ v1Router.post( v1Router.get( "/deep-research/:jobId", authMiddleware(RateLimiterMode.CrawlStatus), + deprecationMiddleware("v1_deep_research_status"), wrap(deepResearchStatusController), ); diff --git a/apps/api/src/routes/v2.ts b/apps/api/src/routes/v2.ts index 5ebf901e2a..0c738f92ce 100644 --- a/apps/api/src/routes/v2.ts +++ b/apps/api/src/routes/v2.ts @@ -45,6 +45,7 @@ import { createX402RouteConfig, isX402Enabled, } from "../lib/x402"; +import { deprecationMiddleware } from "../lib/deprecations"; import { agentController } from "../controllers/v2/agent"; import { agentStatusController } from "../controllers/v2/agent-status"; import { agentCancelController } from "../controllers/v2/agent-cancel"; @@ -381,6 +382,7 @@ v2Router.get( v2Router.post( "/extract", authMiddleware(RateLimiterMode.Extract), + deprecationMiddleware("v2_extract"), 
countryCheck, checkCreditsMiddleware(20), blocklistMiddleware, @@ -390,6 +392,7 @@ v2Router.post( v2Router.get( "/extract/:jobId", authMiddleware(RateLimiterMode.ExtractStatus), + deprecationMiddleware("v2_extract_status"), validateJobIdParam, wrap(extractStatusController), ); From e577e7b237f3fb2b9504ceb82cc7847fbc3fcdfc Mon Sep 17 00:00:00 2001 From: Abimael Martell Date: Wed, 6 May 2026 15:28:54 -0700 Subject: [PATCH 25/27] feat(sdk): surface deprecation warnings[] and replacement fields (#3491) * feat(sdk): surface deprecation warnings[] and replacement fields Mirror the API deprecation contract from #3469 in the JS and Python SDKs. Add optional warnings[] and replacement fields to the response types for the v1 extract, deep-research, and llmstxt endpoints, plus v2 extract, so callers can read the structured deprecation notice the API now returns. Mark the SDK methods that hit those endpoints as deprecated using each language's idiomatic doc comment, and add runtime DeprecationWarning calls on the Python deep-research and llmstxt methods (sync + async) for parity with the existing extract methods. 
* chore(sdk): bump js-sdk to 4.22.2 and python-sdk to 4.25.2 --- apps/js-sdk/firecrawl/package.json | 2 +- apps/js-sdk/firecrawl/src/v1/index.ts | 16 +++ apps/js-sdk/firecrawl/src/v2/types.ts | 2 + apps/python-sdk/firecrawl/__init__.py | 2 +- apps/python-sdk/firecrawl/v1/client.py | 152 ++++++++++++++++++++++++- apps/python-sdk/firecrawl/v2/types.py | 2 + 6 files changed, 170 insertions(+), 6 deletions(-) diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index dc9fd677de..2ec28d6e4d 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "4.22.1", + "version": "4.22.2", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", diff --git a/apps/js-sdk/firecrawl/src/v1/index.ts b/apps/js-sdk/firecrawl/src/v1/index.ts index 1874a9f9c1..b293d10e40 100644 --- a/apps/js-sdk/firecrawl/src/v1/index.ts +++ b/apps/js-sdk/firecrawl/src/v1/index.ts @@ -363,6 +363,8 @@ export interface ExtractResponse { data: LLMSchema; error?: string; warning?: string; + warnings?: string[]; + replacement?: string; sources?: string[]; creditsUsed?: number; } @@ -490,6 +492,8 @@ export interface DeepResearchParams { export interface DeepResearchResponse { success: boolean; id: string; + warnings?: string[]; + replacement?: string; } /** @@ -530,6 +534,8 @@ export interface DeepResearchStatusResponse { description: string; }>; summaries: string[]; + warnings?: string[]; + replacement?: string; } /** @@ -563,6 +569,8 @@ export interface GenerateLLMsTextParams { export interface GenerateLLMsTextResponse { success: boolean; id: string; + warnings?: string[]; + replacement?: string; } /** @@ -577,6 +585,8 @@ export interface GenerateLLMsTextStatusResponse { status: "processing" | "completed" | "failed"; error?: string; expiresAt: string; + warnings?: string[]; + replacement?: string; } /** @@ -1626,6 +1636,7 @@ export 
default class FirecrawlApp { * @param onActivity - Optional callback to receive activity updates in real-time. * @param onSource - Optional callback to receive source updates in real-time. * @returns The final research results. + * @deprecated /v1/deep-research is deprecated. Use /v2/search instead. */ async deepResearch( query: string, @@ -1713,6 +1724,7 @@ export default class FirecrawlApp { * Initiates a deep research operation on a given query without polling. * @param params - Parameters for the deep research operation. * @returns The response containing the research job ID. + * @deprecated /v1/deep-research is deprecated. Use /v2/search instead. */ async asyncDeepResearch(query: string, params: DeepResearchParams): Promise { const headers = this.prepareHeaders(); @@ -1754,6 +1766,7 @@ export default class FirecrawlApp { * Checks the status of a deep research operation. * @param id - The ID of the deep research operation. * @returns The current status and results of the research operation. + * @deprecated /v1/deep-research is deprecated. Use /v2/search instead. */ async checkDeepResearchStatus(id: string): Promise { const headers = this.prepareHeaders(); @@ -1921,6 +1934,7 @@ export default class FirecrawlApp { * @param url - The URL to generate LLMs.txt from. * @param params - Parameters for the LLMs.txt generation operation. * @returns The final generation results. + * @deprecated /v1/llmstxt is deprecated and will not be replaced. */ async generateLLMsText(url: string, params?: GenerateLLMsTextParams): Promise { try { @@ -1973,6 +1987,7 @@ export default class FirecrawlApp { * @param url - The URL to generate LLMs.txt from. * @param params - Parameters for the LLMs.txt generation operation. * @returns The response containing the generation job ID. + * @deprecated /v1/llmstxt is deprecated and will not be replaced. 
*/ async asyncGenerateLLMsText(url: string, params?: GenerateLLMsTextParams): Promise { const headers = this.prepareHeaders(); @@ -2003,6 +2018,7 @@ export default class FirecrawlApp { * Checks the status of a LLMs.txt generation operation. * @param id - The ID of the LLMs.txt generation operation. * @returns The current status and results of the generation operation. + * @deprecated /v1/llmstxt is deprecated and will not be replaced. */ async checkGenerateLLMsTextStatus(id: string): Promise { const headers = this.prepareHeaders(); diff --git a/apps/js-sdk/firecrawl/src/v2/types.ts b/apps/js-sdk/firecrawl/src/v2/types.ts index 0b38774ad3..14f5cc3223 100644 --- a/apps/js-sdk/firecrawl/src/v2/types.ts +++ b/apps/js-sdk/firecrawl/src/v2/types.ts @@ -777,6 +777,8 @@ export interface ExtractResponse { data?: unknown; error?: string; warning?: string; + warnings?: string[]; + replacement?: string; sources?: Record; expiresAt?: string; creditsUsed?: number; diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py index 90f63b1cd1..8e097fa9c1 100644 --- a/apps/python-sdk/firecrawl/__init__.py +++ b/apps/python-sdk/firecrawl/__init__.py @@ -17,7 +17,7 @@ V1ChangeTrackingOptions, ) -__version__ = "4.25.1" +__version__ = "4.25.2" # Define the logger for the Firecrawl project logger: logging.Logger = logging.getLogger("firecrawl") diff --git a/apps/python-sdk/firecrawl/v1/client.py b/apps/python-sdk/firecrawl/v1/client.py index ee67d63ffc..857e61567b 100644 --- a/apps/python-sdk/firecrawl/v1/client.py +++ b/apps/python-sdk/firecrawl/v1/client.py @@ -338,6 +338,8 @@ class V1ExtractResponse(pydantic.BaseModel, Generic[T]): data: Optional[T] = None error: Optional[str] = None warning: Optional[str] = None + warnings: Optional[List[str]] = None + replacement: Optional[str] = None sources: Optional[Dict[Any, Any]] = None creditsUsed: Optional[int] = None @@ -427,6 +429,8 @@ class V1DeepResearchResponse(pydantic.BaseModel): success: bool id: str 
error: Optional[str] = None + warnings: Optional[List[str]] = None + replacement: Optional[str] = None class V1DeepResearchStatusResponse(pydantic.BaseModel): """ @@ -442,12 +446,16 @@ class V1DeepResearchStatusResponse(pydantic.BaseModel): activities: List[Dict[str, Any]] sources: List[Dict[str, Any]] summaries: List[str] + warnings: Optional[List[str]] = None + replacement: Optional[str] = None class V1GenerateLLMsTextResponse(pydantic.BaseModel): """Response from LLMs.txt generation operations.""" success: bool = True id: str error: Optional[str] = None + warnings: Optional[List[str]] = None + replacement: Optional[str] = None class V1GenerateLLMsTextStatusResponseData(pydantic.BaseModel): llmstxt: str @@ -460,6 +468,8 @@ class V1GenerateLLMsTextStatusResponse(pydantic.BaseModel): status: Literal["processing", "completed", "failed"] error: Optional[str] = None expiresAt: str + warnings: Optional[List[str]] = None + replacement: Optional[str] = None class V1SearchResponse(pydantic.BaseModel): """ @@ -2196,6 +2206,9 @@ def generate_llms_text( """ Generate LLMs.txt for a given URL and poll until completion. + .. deprecated:: + /v1/llmstxt is deprecated and will not be replaced. + Args: url (str): Target URL to generate LLMs.txt from max_urls (Optional[int]): Maximum URLs to process (default: 10) @@ -2213,6 +2226,12 @@ def generate_llms_text( Raises: Exception: If generation fails """ + import warnings + warnings.warn( + "/v1/llmstxt is deprecated and will not be replaced.", + DeprecationWarning, + stacklevel=2, + ) params = V1GenerateLLMsTextParams( maxUrls=max_urls, showFullText=show_full_text, @@ -2265,6 +2284,9 @@ def async_generate_llms_text( """ Initiate an asynchronous LLMs.txt generation operation. + .. deprecated:: + /v1/llmstxt is deprecated and will not be replaced. + Args: url (str): The target URL to generate LLMs.txt from. Must be a valid HTTP/HTTPS URL. 
max_urls (Optional[int]): Maximum URLs to process (default: 10) @@ -2281,6 +2303,12 @@ def async_generate_llms_text( Raises: Exception: If the generation job initiation fails. """ + import warnings + warnings.warn( + "/v1/llmstxt is deprecated and will not be replaced.", + DeprecationWarning, + stacklevel=2, + ) params = V1GenerateLLMsTextParams( maxUrls=max_urls, showFullText=show_full_text, @@ -2316,6 +2344,9 @@ def check_generate_llms_text_status(self, id: str) -> V1GenerateLLMsTextStatusRe """ Check the status of a LLMs.txt generation operation. + .. deprecated:: + /v1/llmstxt is deprecated and will not be replaced. + Args: id (str): The unique identifier of the LLMs.txt generation job to check status for. @@ -2332,6 +2363,12 @@ def check_generate_llms_text_status(self, id: str) -> V1GenerateLLMsTextStatusRe Raises: Exception: If the status check fails. """ + import warnings + warnings.warn( + "/v1/llmstxt is deprecated and will not be replaced.", + DeprecationWarning, + stacklevel=2, + ) headers = self._prepare_headers() try: response = self._get_request(f'{self.api_url}/v1/llmstxt/{id}', headers) @@ -2593,6 +2630,9 @@ def deep_research( """ Initiates a deep research operation on a given query and polls until completion. + .. deprecated:: + /v1/deep-research is deprecated. Use /v2/search instead. + Args: query (str): Research query or topic to investigate max_depth (Optional[int]): Maximum depth of research exploration @@ -2618,6 +2658,12 @@ def deep_research( Raises: Exception: If research fails """ + import warnings + warnings.warn( + "/v1/deep-research is deprecated. Use /v2/search instead.", + DeprecationWarning, + stacklevel=2, + ) research_params = {} if max_depth is not None: research_params['maxDepth'] = max_depth @@ -2687,6 +2733,9 @@ def async_deep_research( """ Initiates an asynchronous deep research operation. + .. deprecated:: + /v1/deep-research is deprecated. Use /v2/search instead. 
+ Args: query (str): Research query or topic to investigate max_depth (Optional[int]): Maximum depth of research exploration @@ -2705,6 +2754,12 @@ def async_deep_research( Raises: Exception: If the research initiation fails. """ + import warnings + warnings.warn( + "/v1/deep-research is deprecated. Use /v2/search instead.", + DeprecationWarning, + stacklevel=2, + ) research_params = {} if max_depth is not None: research_params['maxDepth'] = max_depth @@ -2721,7 +2776,7 @@ def async_deep_research( research_params = V1DeepResearchParams(**research_params) headers = self._prepare_headers() - + json_data = {'query': query, **research_params.dict(by_alias=True, exclude_none=True)} json_data['origin'] = f"python-sdk@{version}" @@ -2749,6 +2804,9 @@ def check_deep_research_status(self, id: str) -> V1DeepResearchStatusResponse: """ Check the status of a deep research operation. + .. deprecated:: + /v1/deep-research is deprecated. Use /v2/search instead. + Args: id (str): The ID of the deep research operation. @@ -2759,7 +2817,7 @@ def check_deep_research_status(self, id: str) -> V1DeepResearchStatusResponse: * success - Whether research completed successfully * status - Current state (processing/completed/failed) * error - Error message if failed - + Results: * id - Unique identifier for the research job * data - Research findings and analysis @@ -2770,6 +2828,12 @@ def check_deep_research_status(self, id: str) -> V1DeepResearchStatusResponse: Raises: Exception: If the status check fails. """ + import warnings + warnings.warn( + "/v1/deep-research is deprecated. Use /v2/search instead.", + DeprecationWarning, + stacklevel=2, + ) headers = self._prepare_headers() try: response = self._get_request(f'{self.api_url}/v1/deep-research/{id}', headers) @@ -4626,6 +4690,11 @@ async def get_extract_status(self, job_id: str) -> V1ExtractResponse[Any]: """ Check the status of an asynchronous extraction job. + .. 
deprecated:: + The extract endpoint is in maintenance mode and its use is discouraged. + Review https://docs.firecrawl.dev/developer-guides/usage-guides/choosing-the-data-extractor + to find a replacement. + Args: job_id (str): The ID of the extraction job @@ -4640,6 +4709,14 @@ async def get_extract_status(self, job_id: str) -> V1ExtractResponse[Any]: Raises: ValueError: If status check fails """ + import warnings + warnings.warn( + "The extract endpoint is in maintenance mode and its use is discouraged. " + "Review https://docs.firecrawl.dev/developer-guides/usage-guides/choosing-the-data-extractor " + "to find a replacement.", + DeprecationWarning, + stacklevel=2, + ) headers = self._prepare_headers() try: return await self._async_get_request( @@ -4663,6 +4740,11 @@ async def async_extract( """ Initiate an asynchronous extraction job without waiting for completion. + .. deprecated:: + The extract endpoint is in maintenance mode and its use is discouraged. + Review https://docs.firecrawl.dev/developer-guides/usage-guides/choosing-the-data-extractor + to find a replacement. + Args: urls (Optional[List[str]]): URLs to extract from prompt (Optional[str]): Custom extraction prompt @@ -4683,6 +4765,14 @@ async def async_extract( Raises: ValueError: If job initiation fails """ + import warnings + warnings.warn( + "The extract endpoint is in maintenance mode and its use is discouraged. " + "Review https://docs.firecrawl.dev/developer-guides/usage-guides/choosing-the-data-extractor " + "to find a replacement.", + DeprecationWarning, + stacklevel=2, + ) headers = self._prepare_headers() if not prompt and not schema: @@ -4729,6 +4819,9 @@ async def generate_llms_text( """ Generate LLMs.txt for a given URL and monitor until completion. + .. deprecated:: + /v1/llmstxt is deprecated and will not be replaced. 
+ Args: url (str): Target URL to generate LLMs.txt from max_urls (Optional[int]): Maximum URLs to process (default: 10) @@ -4748,6 +4841,12 @@ async def generate_llms_text( Raises: Exception: If generation fails """ + import warnings + warnings.warn( + "/v1/llmstxt is deprecated and will not be replaced.", + DeprecationWarning, + stacklevel=2, + ) params = {} if max_urls is not None: params['maxUrls'] = max_urls @@ -4791,6 +4890,9 @@ async def async_generate_llms_text( """ Initiate an asynchronous LLMs.txt generation job without waiting for completion. + .. deprecated:: + /v1/llmstxt is deprecated and will not be replaced. + Args: url (str): Target URL to generate LLMs.txt from max_urls (Optional[int]): Maximum URLs to process (default: 10) @@ -4807,6 +4909,12 @@ async def async_generate_llms_text( Raises: ValueError: If job initiation fails """ + import warnings + warnings.warn( + "/v1/llmstxt is deprecated and will not be replaced.", + DeprecationWarning, + stacklevel=2, + ) params = {} if max_urls is not None: params['maxUrls'] = max_urls @@ -4839,6 +4947,9 @@ async def check_generate_llms_text_status(self, id: str) -> V1GenerateLLMsTextSt """ Check the status of an asynchronous LLMs.txt generation job. + .. deprecated:: + /v1/llmstxt is deprecated and will not be replaced. + Args: id (str): The ID of the generation job @@ -4855,6 +4966,12 @@ async def check_generate_llms_text_status(self, id: str) -> V1GenerateLLMsTextSt Raises: ValueError: If status check fails """ + import warnings + warnings.warn( + "/v1/llmstxt is deprecated and will not be replaced.", + DeprecationWarning, + stacklevel=2, + ) headers = self._prepare_headers() try: return await self._async_get_request( @@ -4879,6 +4996,9 @@ async def deep_research( """ Initiates a deep research operation on a given query and polls until completion. + .. deprecated:: + /v1/deep-research is deprecated. Use /v2/search instead. 
+ Args: query (str): Research query or topic to investigate max_depth (Optional[int]): Maximum depth of research exploration @@ -4904,6 +5024,12 @@ async def deep_research( Raises: Exception: If research fails """ + import warnings + warnings.warn( + "/v1/deep-research is deprecated. Use /v2/search instead.", + DeprecationWarning, + stacklevel=2, + ) research_params = {} if max_depth is not None: research_params['maxDepth'] = max_depth @@ -4973,6 +5099,9 @@ async def async_deep_research( """ Initiates an asynchronous deep research operation. + .. deprecated:: + /v1/deep-research is deprecated. Use /v2/search instead. + Args: query (str): Research query or topic to investigate max_depth (Optional[int]): Maximum depth of research exploration @@ -4991,6 +5120,12 @@ async def async_deep_research( Raises: Exception: If the research initiation fails. """ + import warnings + warnings.warn( + "/v1/deep-research is deprecated. Use /v2/search instead.", + DeprecationWarning, + stacklevel=2, + ) research_params = {} if max_depth is not None: research_params['maxDepth'] = max_depth @@ -5007,7 +5142,7 @@ async def async_deep_research( research_params = V1DeepResearchParams(**research_params) headers = self._prepare_headers() - + json_data = {'query': query, **research_params.dict(by_alias=True, exclude_none=True)} json_data['origin'] = f"python-sdk@{version}" @@ -5024,6 +5159,9 @@ async def check_deep_research_status(self, id: str) -> V1DeepResearchStatusRespo """ Check the status of a deep research operation. + .. deprecated:: + /v1/deep-research is deprecated. Use /v2/search instead. + Args: id (str): The ID of the deep research operation. 
@@ -5034,7 +5172,7 @@ async def check_deep_research_status(self, id: str) -> V1DeepResearchStatusRespo * success - Whether research completed successfully * status - Current state (processing/completed/failed) * error - Error message if failed - + Results: * id - Unique identifier for the research job * data - Research findings and analysis @@ -5045,6 +5183,12 @@ async def check_deep_research_status(self, id: str) -> V1DeepResearchStatusRespo Raises: Exception: If the status check fails. """ + import warnings + warnings.warn( + "/v1/deep-research is deprecated. Use /v2/search instead.", + DeprecationWarning, + stacklevel=2, + ) headers = self._prepare_headers() try: return await self._async_get_request( diff --git a/apps/python-sdk/firecrawl/v2/types.py b/apps/python-sdk/firecrawl/v2/types.py index 2ace0eb629..00fde19a7d 100644 --- a/apps/python-sdk/firecrawl/v2/types.py +++ b/apps/python-sdk/firecrawl/v2/types.py @@ -976,6 +976,8 @@ class ExtractResponse(BaseModel): data: Optional[Any] = None error: Optional[str] = None warning: Optional[str] = None + warnings: Optional[List[str]] = None + replacement: Optional[str] = None sources: Optional[Dict[str, Any]] = None expires_at: Optional[datetime] = None credits_used: Optional[int] = None From f1f4e88fa6a0e363878e977fd342f328731d4195 Mon Sep 17 00:00:00 2001 From: tomsideguide Date: Thu, 7 May 2026 16:52:06 +0100 Subject: [PATCH 26/27] fix: proxy billing for cached scrapes (#3496) * fix: proxy billing for cached scrapes * fix: add proxyUsed field to document metadata --- apps/api/src/scraper/scrapeURL/engines/index/index.ts | 5 ++++- apps/api/src/services/index.ts | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/apps/api/src/scraper/scrapeURL/engines/index/index.ts b/apps/api/src/scraper/scrapeURL/engines/index/index.ts index bc1e687910..02577f8bd0 100644 --- a/apps/api/src/scraper/scrapeURL/engines/index/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/index/index.ts @@ -102,6 +102,7 @@ export 
async function sendDocumentToIndex(meta: Meta, document: Document) { : undefined, contentType: document.metadata.contentType, postprocessorsUsed: document.metadata.postprocessorsUsed, + proxyUsed: document.metadata.proxyUsed, }); } catch (error) { meta.logger.error("Failed to save document to index", { @@ -404,7 +405,9 @@ export async function scrapeURLWithIndex( postprocessorsUsed: doc.postprocessorsUsed, - proxyUsed: doc.proxyUsed ?? "basic", + proxyUsed: + doc.proxyUsed ?? + (meta.featureFlags.has("stealthProxy") ? "stealth" : "basic"), // this can be dropped after june 2026, it's here to backfill proxyUsed for older index entries that don't have it }; } diff --git a/apps/api/src/services/index.ts b/apps/api/src/services/index.ts index ae71536b79..2711afbc4b 100644 --- a/apps/api/src/services/index.ts +++ b/apps/api/src/services/index.ts @@ -165,6 +165,7 @@ export async function saveIndexToGCS( pdfMetadata?: PdfMetadata; contentType?: string; postprocessorsUsed?: string[]; + proxyUsed?: "basic" | "stealth"; }, ): Promise { return await withSpan("firecrawl-index-save-to-gcs", async span => { From 3afe6df1f48f4485b8a018069ad8bbf54ae99cb2 Mon Sep 17 00:00:00 2001 From: mogery Date: Fri, 8 May 2026 15:59:23 +0200 Subject: [PATCH 27/27] fix(monitoring): various fixes --- .../src/services/monitoring/runner.test.ts | 2 +- apps/api/src/services/monitoring/runner.ts | 156 +++++++++++++----- .../src/services/monitoring/scheduler.test.ts | 68 ++++++++ apps/api/src/services/monitoring/scheduler.ts | 70 +++++++- apps/api/src/services/monitoring/stale.ts | 15 ++ .../api/src/services/webhook/delivery.test.ts | 30 ++++ apps/api/src/services/webhook/delivery.ts | 63 +++++-- apps/api/src/services/webhook/types.ts | 2 +- apps/api/src/services/worker/scrape-worker.ts | 25 ++- 9 files changed, 363 insertions(+), 68 deletions(-) create mode 100644 apps/api/src/services/monitoring/stale.ts create mode 100644 apps/api/src/services/webhook/delivery.test.ts diff --git 
a/apps/api/src/services/monitoring/runner.test.ts b/apps/api/src/services/monitoring/runner.test.ts index 4b1cc32bae..217925c92a 100644 --- a/apps/api/src/services/monitoring/runner.test.ts +++ b/apps/api/src/services/monitoring/runner.test.ts @@ -8,7 +8,7 @@ describe("monitoring runner", () => { describe("isMonitorCheckStale", () => { const now = new Date("2026-05-06T12:00:00.000Z"); - it("returns true when a running check is at least 24 hours old", () => { + it("returns true when a running check is at least 1 hour old", () => { expect( isMonitorCheckStale( { diff --git a/apps/api/src/services/monitoring/runner.ts b/apps/api/src/services/monitoring/runner.ts index e5371dafe5..42a74a4799 100644 --- a/apps/api/src/services/monitoring/runner.ts +++ b/apps/api/src/services/monitoring/runner.ts @@ -52,15 +52,65 @@ import type { import { withMarkdownFormat } from "./types"; import { redisEvictConnection } from "../redis"; import type { MonitorCheckJobData } from "./queue"; +import { + MONITOR_CHECK_STALE_ERROR, + isMonitorCheckStale, + MONITOR_CHECK_STALE_TIMEOUT_MS, +} from "./stale"; const logger = _logger.child({ module: "monitoring-runner" }); const poll = (ms: number) => new Promise(resolve => setTimeout(resolve, ms)); -export const MONITOR_CHECK_STALE_TIMEOUT_MS = 24 * 60 * 60 * 1000; +export { isMonitorCheckStale, MONITOR_CHECK_STALE_TIMEOUT_MS }; type PageResult = MonitorCheckPageInsert & { emailStatus?: string; }; +type MonitorTargetRun = + | { + targetId: string; + type: "scrape"; + expectedJobs: string[]; + } + | { + targetId: string; + type: "crawl"; + crawlId: string; + }; + +function createMonitorTargetRun(target: MonitorTarget): MonitorTargetRun { + if (target.type === "scrape") { + return { + targetId: target.id, + type: "scrape", + expectedJobs: target.urls.map(() => uuidv7()), + }; + } + + return { + targetId: target.id, + type: "crawl", + crawlId: uuidv7(), + }; +} + +function recoverScrapeTargetRunsFromMonitor( + monitor: MonitorRow, +): 
MonitorTargetRun[] | null { + if (!monitor.targets.every(target => target.type === "scrape")) { + return null; + } + + return monitor.targets.map(target => ({ + targetId: target.id, + type: "scrape" as const, + expectedJobs: + target.type === "scrape" + ? target.urls.map((_, index) => `recovered:${target.id}:${index}`) + : [], + })); +} + function withMonitorScrapeDefaults( options: Record, ): ScrapeOptions { @@ -548,12 +598,28 @@ async function sendNotifications(params: { webhook: params.monitor.webhook as any, v0: false, }); - await sender?.send(WebhookEvent.MONITOR_CHECK_COMPLETED, { - success: params.check.status === "completed", - data: payload, - error: params.check.error ?? undefined, - }); - webhookStatus = { attempted: true, success: true }; + try { + const result = await sender?.send(WebhookEvent.MONITOR_CHECK_COMPLETED, { + success: params.check.status === "completed", + data: payload, + error: params.check.error ?? undefined, + awaitWebhook: true, + }); + webhookStatus = { + attempted: result?.attempted ?? false, + success: result?.delivered === true, + delivered: result?.delivered === true, + queued: result?.queued === true, + skipped: result?.skipped === true, + }; + } catch (error) { + webhookStatus = { + attempted: true, + success: false, + delivered: false, + error: error instanceof Error ? 
error.message : String(error), + }; + } } const emailStatus = await sendMonitoringEmailSummary({ @@ -579,14 +645,14 @@ async function enqueueMonitorScrapeTarget(params: { monitor: MonitorRow; check: MonitorCheckRow; target: MonitorTarget; -}): Promise<{ targetId: string; type: "scrape"; expectedJobs: string[] }> { + targetRun: Extract; +}): Promise> { if (params.target.type !== "scrape") { throw new Error("Expected scrape target"); } - const expectedJobs: string[] = []; - for (const url of params.target.urls) { - const scrapeId = uuidv7(); + for (const [index, url] of params.target.urls.entries()) { + const scrapeId = params.targetRun.expectedJobs[index]; const scrapeOptions = scrapeRequestSchema.parse({ url, ...withMonitorScrapeDefaults(params.target.scrapeOptions ?? {}), @@ -632,22 +698,22 @@ async function enqueueMonitorScrapeTarget(params: { scrapeId, 20, ); - expectedJobs.push(scrapeId); } - return { targetId: params.target.id, type: "scrape", expectedJobs }; + return params.targetRun; } async function enqueueMonitorCrawlTarget(params: { monitor: MonitorRow; check: MonitorCheckRow; target: MonitorTarget; -}): Promise<{ targetId: string; type: "crawl"; crawlId: string }> { + targetRun: Extract; +}): Promise> { if (params.target.type !== "crawl") { throw new Error("Expected crawl target"); } - const crawlId = uuidv7(); + const crawlId = params.targetRun.crawlId; const body = crawlRequestSchema.parse({ url: params.target.url, ...(params.target.crawlOptions ?? {}), @@ -729,7 +795,7 @@ async function enqueueMonitorCrawlTarget(params: { uuidv7(), ); - return { targetId: params.target.id, type: "crawl", crawlId }; + return params.targetRun; } export async function processMonitorCheckJob( @@ -770,19 +836,19 @@ export async function processMonitorCheckJob( billing_status: lockId ? "reserved" : "not_applicable", }); - const targetResults: unknown[] = []; - - for (const target of monitor.targets) { - const result = - target.type === "scrape" - ? 
await enqueueMonitorScrapeTarget({ monitor, check, target }) - : await enqueueMonitorCrawlTarget({ monitor, check, target }); - targetResults.push(result); - } - + const targetResults = monitor.targets.map(createMonitorTargetRun); await updateMonitorCheck(check.id, { target_results: targetResults, }); + + for (const [index, target] of monitor.targets.entries()) { + const targetRun = targetResults[index]; + if (target.type === "scrape" && targetRun.type === "scrape") { + await enqueueMonitorScrapeTarget({ monitor, check, target, targetRun }); + } else if (target.type === "crawl" && targetRun.type === "crawl") { + await enqueueMonitorCrawlTarget({ monitor, check, target, targetRun }); + } + } } catch (error) { if (lockId) { await autumnService.finalizeCreditsLock({ @@ -884,12 +950,18 @@ async function processRemovedPagesForCompletedCrawls(params: { async function isMonitorCheckComplete( check: MonitorCheckRow, + monitor?: MonitorRow, ): Promise { - const targetResults = Array.isArray(check.target_results) + let targetResults = Array.isArray(check.target_results) ? (check.target_results as any[]) : []; - if (targetResults.length === 0) return false; + if (targetResults.length === 0) { + if (!monitor) return false; + + targetResults = recoverScrapeTargetRunsFromMonitor(monitor) ?? []; + if (targetResults.length === 0) return false; + } for (const target of targetResults) { if (target?.type === "scrape") { @@ -918,23 +990,13 @@ async function isMonitorCheckComplete( return true; } -export function isMonitorCheckStale( - check: Pick, - now: Date = new Date(), -): boolean { - const startedAt = check.started_at ?? check.updated_at ?? 
check.created_at; - const startedAtMs = Date.parse(startedAt); - if (!Number.isFinite(startedAtMs)) return false; - return now.getTime() - startedAtMs >= MONITOR_CHECK_STALE_TIMEOUT_MS; -} - async function failStaleMonitorCheck(params: { monitor: MonitorRow; check: MonitorCheckRow; }): Promise { if (!isMonitorCheckStale(params.check)) return false; - const error = "Monitor check exceeded the 24 hour running timeout."; + const error = MONITOR_CHECK_STALE_ERROR; if (params.check.autumn_lock_id) { await autumnService .finalizeCreditsLock({ @@ -1041,9 +1103,12 @@ export async function reconcileRunningMonitorChecks( if (await failStaleMonitorCheck({ monitor, check })) continue; - const targetResults = Array.isArray(check.target_results) + let targetResults = Array.isArray(check.target_results) ? ([...check.target_results] as any[]) : []; + if (targetResults.length === 0) { + targetResults = recoverScrapeTargetRunsFromMonitor(monitor) ?? []; + } await processRemovedPagesForCompletedCrawls({ monitor, @@ -1052,10 +1117,13 @@ export async function reconcileRunningMonitorChecks( }); if ( - !(await isMonitorCheckComplete({ - ...check, - target_results: targetResults, - })) + !(await isMonitorCheckComplete( + { + ...check, + target_results: targetResults, + }, + monitor, + )) ) { await updateMonitorCheck(check.id, { target_results: targetResults }); continue; diff --git a/apps/api/src/services/monitoring/scheduler.test.ts b/apps/api/src/services/monitoring/scheduler.test.ts index c5ec32983a..74a65709bb 100644 --- a/apps/api/src/services/monitoring/scheduler.test.ts +++ b/apps/api/src/services/monitoring/scheduler.test.ts @@ -1,13 +1,16 @@ import { addMonitorCheckJob } from "./queue"; import { enqueueDueMonitorChecks } from "./scheduler"; +import { isMonitorCheckStale } from "./stale"; import { advanceMonitorAfterSkippedCheck, claimDueMonitors, createMonitorCheck, dispatchScheduledMonitorCheck, + getMonitorCheck, updateMonitorCheck, updateMonitorScheduleAfterRun, } from 
"./store"; +import { autumnService } from "../autumn/autumn.service"; jest.mock("./queue", () => ({ addMonitorCheckJob: jest.fn(), @@ -18,10 +21,23 @@ jest.mock("./store", () => ({ claimDueMonitors: jest.fn(), createMonitorCheck: jest.fn(), dispatchScheduledMonitorCheck: jest.fn(), + getMonitorCheck: jest.fn(), updateMonitorCheck: jest.fn(), updateMonitorScheduleAfterRun: jest.fn(), })); +jest.mock("./stale", () => ({ + isMonitorCheckStale: jest.fn(), + MONITOR_CHECK_STALE_ERROR: + "Monitor check exceeded the 1 hour running timeout.", +})); + +jest.mock("../autumn/autumn.service", () => ({ + autumnService: { + finalizeCreditsLock: jest.fn(), + }, +})); + const mockAddMonitorCheckJob = addMonitorCheckJob as jest.MockedFunction< typeof addMonitorCheckJob >; @@ -35,6 +51,16 @@ const mockDispatchScheduledMonitorCheck = dispatchScheduledMonitorCheck as jest.MockedFunction< typeof dispatchScheduledMonitorCheck >; +const mockGetMonitorCheck = getMonitorCheck as jest.MockedFunction< + typeof getMonitorCheck +>; +const mockIsMonitorCheckStale = isMonitorCheckStale as jest.MockedFunction< + typeof isMonitorCheckStale +>; +const mockFinalizeCreditsLock = + autumnService.finalizeCreditsLock as jest.MockedFunction< + typeof autumnService.finalizeCreditsLock + >; const mockUpdateMonitorCheck = updateMonitorCheck as jest.MockedFunction< typeof updateMonitorCheck >; @@ -64,6 +90,9 @@ describe("monitoring scheduler", () => { mockAddMonitorCheckJob.mockResolvedValue(undefined); mockAdvanceMonitorAfterSkippedCheck.mockResolvedValue(undefined); mockUpdateMonitorScheduleAfterRun.mockResolvedValue(undefined); + mockGetMonitorCheck.mockResolvedValue(null); + mockIsMonitorCheckStale.mockReturnValue(false); + mockFinalizeCreditsLock.mockResolvedValue(undefined as any); }); it("dispatches and advances a scheduled monitor before enqueueing its job", async () => { @@ -131,4 +160,43 @@ describe("monitoring scheduler", () => { check: skipped, }); }); + + it("clears a stale current check before 
enqueueing a scheduled run", async () => { + const monitorWithCurrentCheck = { + ...monitor, + current_check_id: "stale-check", + } as any; + const staleCheck = { id: "stale-check", status: "running" } as any; + const failedStaleCheck = { ...staleCheck, status: "failed" } as any; + mockClaimDueMonitors.mockResolvedValue([monitorWithCurrentCheck]); + mockGetMonitorCheck.mockResolvedValue(staleCheck); + mockIsMonitorCheckStale.mockReturnValue(true); + mockUpdateMonitorCheck.mockResolvedValue(failedStaleCheck); + + await expect( + enqueueDueMonitorChecks({ workerId: "worker-1" }), + ).resolves.toBe(1); + + expect(mockUpdateMonitorCheck).toHaveBeenCalledWith(staleCheck.id, { + status: "failed", + finished_at: expect.any(String), + actual_credits: 0, + billing_status: "not_applicable", + error: "Monitor check exceeded the 1 hour running timeout.", + }); + expect(mockUpdateMonitorScheduleAfterRun).toHaveBeenCalledWith({ + monitor: monitorWithCurrentCheck, + check: failedStaleCheck, + }); + expect(mockCreateMonitorCheck).toHaveBeenCalledWith({ + monitor: { ...monitorWithCurrentCheck, current_check_id: null }, + trigger: "scheduled", + scheduledFor: monitorWithCurrentCheck.next_run_at, + }); + expect(mockAddMonitorCheckJob).toHaveBeenCalledWith({ + monitorId: monitorWithCurrentCheck.id, + checkId: check.id, + teamId: monitorWithCurrentCheck.team_id, + }); + }); }); diff --git a/apps/api/src/services/monitoring/scheduler.ts b/apps/api/src/services/monitoring/scheduler.ts index 84a7984e71..61492e3b6e 100644 --- a/apps/api/src/services/monitoring/scheduler.ts +++ b/apps/api/src/services/monitoring/scheduler.ts @@ -6,9 +6,13 @@ import { claimDueMonitors, createMonitorCheck, dispatchScheduledMonitorCheck, + getMonitorCheck, updateMonitorCheck, updateMonitorScheduleAfterRun, } from "./store"; +import { autumnService } from "../autumn/autumn.service"; +import { isMonitorCheckStale, MONITOR_CHECK_STALE_ERROR } from "./stale"; +import type { MonitorRow } from "./types"; const 
logger = _logger.child({ module: "monitoring-scheduler" }); @@ -35,10 +39,17 @@ export async function enqueueDueMonitorChecks( }); let enqueued = 0; - for (const monitor of monitors) { + for (let monitor of monitors) { let check: Awaited> | null = null; let dispatched = false; try { + if (monitor.current_check_id) { + const cleared = await clearFinishedOrStaleCurrentCheck(monitor); + if (cleared) { + monitor = { ...monitor, current_check_id: null }; + } + } + if (monitor.current_check_id) { const skipped = await createMonitorCheck({ monitor, @@ -123,3 +134,60 @@ export async function enqueueDueMonitorChecks( return enqueued; } + +async function clearFinishedOrStaleCurrentCheck( + monitor: MonitorRow, +): Promise { + if (!monitor.current_check_id) return true; + + const current = await getMonitorCheck( + monitor.team_id, + monitor.id, + monitor.current_check_id, + ); + if (!current) return false; + + if (current.status === "running" || current.status === "queued") { + if (!isMonitorCheckStale(current)) return false; + + if (current.autumn_lock_id) { + await autumnService + .finalizeCreditsLock({ + lockId: current.autumn_lock_id, + action: "release", + properties: { + source: "monitorCheck", + endpoint: "monitor", + jobId: current.id, + }, + }) + .catch(error => { + logger.warn("Failed to release stale monitor check credit lock", { + error, + monitorId: monitor.id, + checkId: current.id, + lockId: current.autumn_lock_id, + }); + }); + } + + const failed = await updateMonitorCheck(current.id, { + status: "failed", + finished_at: new Date().toISOString(), + actual_credits: 0, + billing_status: current.autumn_lock_id ? 
"released" : "not_applicable", + error: MONITOR_CHECK_STALE_ERROR, + }); + await updateMonitorScheduleAfterRun({ + monitor, + check: failed, + }); + return true; + } + + await updateMonitorScheduleAfterRun({ + monitor, + check: current, + }); + return true; +} diff --git a/apps/api/src/services/monitoring/stale.ts b/apps/api/src/services/monitoring/stale.ts new file mode 100644 index 0000000000..669cf22dd8 --- /dev/null +++ b/apps/api/src/services/monitoring/stale.ts @@ -0,0 +1,15 @@ +import type { MonitorCheckRow } from "./types"; + +export const MONITOR_CHECK_STALE_TIMEOUT_MS = 60 * 60 * 1000; +export const MONITOR_CHECK_STALE_ERROR = + "Monitor check exceeded the 1 hour running timeout."; + +export function isMonitorCheckStale( + check: Pick, + now: Date = new Date(), +): boolean { + const startedAt = check.started_at ?? check.updated_at ?? check.created_at; + const startedAtMs = Date.parse(startedAt); + if (!Number.isFinite(startedAtMs)) return false; + return now.getTime() - startedAtMs >= MONITOR_CHECK_STALE_TIMEOUT_MS; +} diff --git a/apps/api/src/services/webhook/delivery.test.ts b/apps/api/src/services/webhook/delivery.test.ts new file mode 100644 index 0000000000..845f7f9308 --- /dev/null +++ b/apps/api/src/services/webhook/delivery.test.ts @@ -0,0 +1,30 @@ +import { WebhookEvent } from "./types"; +import { webhookEventMatchesFilter } from "./delivery"; + +describe("webhook delivery", () => { + describe("webhookEventMatchesFilter", () => { + it("matches full monitor event names", () => { + expect( + webhookEventMatchesFilter( + ["monitor.page", "monitor.check.completed"], + WebhookEvent.MONITOR_PAGE, + ), + ).toBe(true); + expect( + webhookEventMatchesFilter( + ["monitor.page", "monitor.check.completed"], + WebhookEvent.MONITOR_CHECK_COMPLETED, + ), + ).toBe(true); + }); + + it("keeps legacy subtype filters for non-monitor webhooks", () => { + expect(webhookEventMatchesFilter(["page"], WebhookEvent.CRAWL_PAGE)).toBe( + true, + ); + expect( + 
webhookEventMatchesFilter(["completed"], WebhookEvent.CRAWL_COMPLETED), + ).toBe(true); + }); + }); +}); diff --git a/apps/api/src/services/webhook/delivery.ts b/apps/api/src/services/webhook/delivery.ts index fea1611845..93c06e3ca5 100644 --- a/apps/api/src/services/webhook/delivery.ts +++ b/apps/api/src/services/webhook/delivery.ts @@ -20,6 +20,31 @@ import { randomUUID } from "crypto"; const WEBHOOK_INSERT_QUEUE_KEY = "webhook-insert-queue"; const WEBHOOK_INSERT_BATCH_SIZE = 1000; +type WebhookSendResult = { + attempted: boolean; + delivered?: boolean; + queued?: boolean; + skipped?: boolean; + statusCode?: number; +}; + +export function webhookEventMatchesFilter( + configuredEvents: string[] | undefined, + event: WebhookEvent, +): boolean { + if (!configuredEvents?.length) { + return true; + } + + const legacySubType = event.split(".")[1]; + const namespaceSuffix = event.split(".").slice(1).join("."); + return ( + configuredEvents.includes(event) || + configuredEvents.includes(legacySubType) || + configuredEvents.includes(namespaceSuffix) + ); +} + export class WebhookSender { private config: WebhookConfig; private secret?: string; @@ -45,8 +70,10 @@ export class WebhookSender { async send( event: T, data: WebhookEventDataMap[T], - ): Promise { - if (!this.shouldSendEvent(event)) return; + ): Promise { + if (!this.shouldSendEvent(event)) { + return { attempted: false, skipped: true }; + } const payload = { success: data.success, @@ -64,10 +91,15 @@ export class WebhookSender { ); if (data.awaitWebhook) { - await delivery; - } else { - delivery.catch(() => {}); + return { attempted: true, ...(await delivery) }; } + + delivery.catch(() => {}); + return { + attempted: true, + delivered: false, + queued: this.usesWebhookQueue(), + }; } private shouldSendEvent(event: WebhookEvent): boolean { @@ -75,24 +107,26 @@ export class WebhookSender { return false; } - if (!this.config.events?.length) { - return true; - } + return webhookEventMatchesFilter(this.config.events, 
event); + } - const subType = event.split(".")[1]; - return this.config.events.includes(subType as any); + private usesWebhookQueue(): boolean { + return Boolean(config.WEBHOOK_USE_RABBITMQ && config.NUQ_RABBITMQ_URL); } - private async deliver(payload: any, scrapeId?: string): Promise { + private async deliver( + payload: any, + scrapeId?: string, + ): Promise> { const webhookHost = new URL(this.config.url).hostname; if (isIPPrivate(webhookHost) && config.ALLOW_LOCAL_WEBHOOKS !== true) { this.logger.warn("Aborting webhook call to private IP address", { webhookUrl: this.config.url, }); - return; + return { delivered: false, skipped: true }; } - if (config.WEBHOOK_USE_RABBITMQ && config.NUQ_RABBITMQ_URL) { + if (this.usesWebhookQueue()) { const queueMessage: WebhookQueueMessage = { webhook_url: this.config.url, payload, @@ -118,7 +152,7 @@ export class WebhookSender { throw error; } - return; + return { delivered: false, queued: true }; } const payloadString = JSON.stringify(payload); @@ -165,6 +199,7 @@ export class WebhookSender { event: payload.type, statusCode: res.status, }); + return { delivered: true, queued: false, statusCode: res.status }; } catch (error) { this.logger.error("Failed to send webhook", { error, diff --git a/apps/api/src/services/webhook/types.ts b/apps/api/src/services/webhook/types.ts index 550c4f430c..0ac175510f 100644 --- a/apps/api/src/services/webhook/types.ts +++ b/apps/api/src/services/webhook/types.ts @@ -41,7 +41,7 @@ export type WebhookQueueMessage = { webhookId: string; id?: string; jobId?: string; - data: any[]; + data: any; error?: string; metadata?: Record; }; diff --git a/apps/api/src/services/worker/scrape-worker.ts b/apps/api/src/services/worker/scrape-worker.ts index e6ff468dac..22707d59ac 100644 --- a/apps/api/src/services/worker/scrape-worker.ts +++ b/apps/api/src/services/worker/scrape-worker.ts @@ -461,7 +461,10 @@ async function processJob(job: NuQJob) { zeroDataRetention: job.data.zeroDataRetention, apiKeyId: 
job.data.apiKeyId, monitoring: job.data.monitoring - ? { ...job.data.monitoring, source: "discovered" } + ? { + ...job.data.monitoring, + source: "discovered" as const, + } : undefined, }, jobId, @@ -585,9 +588,7 @@ async function processJob(job: NuQJob) { } } - await recordMonitorScrapeSuccess(job, doc).catch(error => - logger.warn("Failed to record monitor scrape result", { error }), - ); + await recordMonitorScrapeSuccess(job, doc); logger.debug("Declaring job as done..."); await addCrawlJobDone(job.data.crawl_id, job.id, true, logger); @@ -827,9 +828,7 @@ async function processJob(job: NuQJob) { zeroDataRetention: job.data.zeroDataRetention, }).catch(err => logger.warn("Scrape tracking failed", { error: err })); - await recordMonitorScrapeFailure(job, error).catch(err => - logger.warn("Failed to record monitor scrape failure", { error: err }), - ); + await recordMonitorScrapeFailure(job, error); return data; } finally { @@ -913,6 +912,9 @@ async function addKickoffSitemapJob( webhook: sourceJob.data.webhook, v1: sourceJob.data.v1, apiKeyId: sourceJob.data.apiKeyId, + monitoring: sourceJob.data.monitoring + ? { ...sourceJob.data.monitoring, source: "discovered" as const } + : undefined, } satisfies ScrapeJobKickoffSitemap, jobId, 21, @@ -968,6 +970,9 @@ async function processKickoffJob(job: NuQJob) { isCrawlSourceScrape: true, zeroDataRetention: job.data.zeroDataRetention, apiKeyId: job.data.apiKeyId, + monitoring: job.data.monitoring + ? { ...job.data.monitoring, source: "discovered" as const } + : undefined, }, jobId, await getJobPriority({ team_id: job.data.team_id, basePriority: 15 }), @@ -1056,6 +1061,9 @@ async function processKickoffJob(job: NuQJob) { v1: job.data.v1, zeroDataRetention: job.data.zeroDataRetention, apiKeyId: job.data.apiKeyId, + monitoring: job.data.monitoring + ? 
{ ...job.data.monitoring, source: "discovered" as const } + : undefined, }, priority: jobPriority, }; @@ -1170,6 +1178,9 @@ async function processKickoffSitemapJob(job: NuQJob) { zeroDataRetention: job.data.zeroDataRetention || (sc.zeroDataRetention ?? false), apiKeyId: job.data.apiKeyId, + monitoring: job.data.monitoring + ? { ...job.data.monitoring, source: "discovered" as const } + : undefined, } satisfies ScrapeJobSingleUrls, jobId: uuidv7(), priority: jobPriority,