diff --git a/package.json b/package.json index b8baa4087..fd89658ed 100644 --- a/package.json +++ b/package.json @@ -72,7 +72,7 @@ }, "dependencies": { "@inquirer/prompts": "^8.2.1", - "@mendable/firecrawl-js": "4.17.0", + "@mendable/firecrawl-js": "4.22.2", "commander": "^14.0.2" } } diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 8658d6fad..881b636ff 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -12,8 +12,8 @@ importers: specifier: ^8.2.1 version: 8.2.1(@types/node@20.19.27) '@mendable/firecrawl-js': - specifier: 4.17.0 - version: 4.17.0 + specifier: 4.22.2 + version: 4.22.2 commander: specifier: ^14.0.2 version: 14.0.2 @@ -332,8 +332,8 @@ packages: '@jridgewell/sourcemap-codec@1.5.5': resolution: {integrity: sha512-cYQ9310grqxueWbl+WuIUIaiUaDcj7WOq5fVhEljNVgRfOUhY9fy2zTvfoqWsnebh8Sl70VScFbICvJnLKB0Og==} - '@mendable/firecrawl-js@4.17.0': - resolution: {integrity: sha512-4Dz2y8QLJMlf45qQIyCgvfjbz+cn9T5jRf0aTxFptBe+123373Vsker9vKYHriWIl2oO/SwRSILkJV6AsGlCMA==} + '@mendable/firecrawl-js@4.22.2': + resolution: {integrity: sha512-HRxafhBsioKvCnhkLPIzIO8qsiyLLyqPe8Oaz5vMR3olb4V1wJLe2oYIzboAppOQbv0++DUeE5K6l03wR+CeHA==} engines: {node: '>=22.0.0'} '@rollup/rollup-android-arm-eabi@4.55.1': @@ -524,8 +524,8 @@ packages: asynckit@0.4.0: resolution: {integrity: sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==} - axios@1.13.6: - resolution: {integrity: sha512-ChTCHMouEe2kn713WHbQGcuYrr6fXTBiu460OTwWrWob16g1bXn4vtz07Ope7ewMozJAnEquLk5lWQWtBig9DQ==} + axios@1.15.2: + resolution: {integrity: sha512-wLrXxPtcrPTsNlJmKjkPnNPK2Ihe0hn0wGSaTEiHRPxwjvJwT3hKmXF4dpqxmPO9SoNb2FsYXj/xEo0gHN+D5A==} braces@3.0.3: resolution: {integrity: sha512-yQbXgO/OSZVD2IsiLlro+7Hf6Q18EJrKSEsdoMzKePKXct3gvD8oLcOQdIzGupr5Fj+EDe8gO/lxc1BzfMpxvA==} @@ -860,8 +860,9 @@ packages: engines: {node: '>=14'} hasBin: true - proxy-from-env@1.1.0: - resolution: {integrity: 
sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==} + proxy-from-env@2.1.0: + resolution: {integrity: sha512-cJ+oHTW1VAEa8cJslgmUZrc+sjRKgAKl3Zyse6+PV38hZe/V6Z14TbCuXcan9F9ghlz4QrFr2c92TNF82UkYHA==} + engines: {node: '>=10'} restore-cursor@5.1.0: resolution: {integrity: sha512-oMA2dcrw6u0YfxJQXm342bFKX/E4sG9rbTzO9ptUcR/e8A33cHuvStiYOwH7fszkZlZ1z/ta9AAoPk2F4qIOHA==} @@ -1259,9 +1260,9 @@ snapshots: '@jridgewell/sourcemap-codec@1.5.5': {} - '@mendable/firecrawl-js@4.17.0': + '@mendable/firecrawl-js@4.22.2': dependencies: - axios: 1.13.6 + axios: 1.15.2 firecrawl: 4.16.0 typescript-event-target: 1.1.2 zod: 3.25.76 @@ -1410,11 +1411,11 @@ snapshots: asynckit@0.4.0: {} - axios@1.13.6: + axios@1.15.2: dependencies: follow-redirects: 1.15.11 form-data: 4.0.5 - proxy-from-env: 1.1.0 + proxy-from-env: 2.1.0 transitivePeerDependencies: - debug @@ -1562,7 +1563,7 @@ snapshots: firecrawl@4.16.0: dependencies: - axios: 1.13.6 + axios: 1.15.2 typescript-event-target: 1.1.2 zod: 3.25.76 zod-to-json-schema: 3.25.1(zod@3.25.76) @@ -1737,7 +1738,7 @@ snapshots: prettier@3.7.4: {} - proxy-from-env@1.1.0: {} + proxy-from-env@2.1.0: {} restore-cursor@5.1.0: dependencies: diff --git a/skills/firecrawl-cli/SKILL.md b/skills/firecrawl-cli/SKILL.md index 8d7e6a7c7..efa8601c5 100644 --- a/skills/firecrawl-cli/SKILL.md +++ b/skills/firecrawl-cli/SKILL.md @@ -63,6 +63,7 @@ Follow this escalation pattern: | Interact with a page | `scrape` + `interact` | Content requires clicks, form fills, pagination, or login | | Download a site to files | `download` | Save an entire site as local files | | Parse a local file | `parse` | File on disk (PDF, DOCX, XLSX, etc.) — not a URL | +| Watch pages for changes | `monitor` | Schedule recurring scrapes/crawls, diff against snapshots | For detailed command reference, run `firecrawl --help`. @@ -72,6 +73,29 @@ For detailed command reference, run `firecrawl --help`. 
 - Use `scrape` + `interact` when you need to interact with a page, such as clicking buttons, filling out forms, navigating through a complex site, infinite scroll, or when scrape fails to grab all the content you need.
 - Never use interact for web searches - use `search` instead.
 
+**Monitor:** Schedule recurring scrapes or crawls and diff each result against the last retained snapshot. Use for product pages, docs, blogs, changelogs, competitor sites — any page where changes matter. Each check labels pages as `same`, `new`, `changed`, `removed`, or `error`, with webhook and email notification options.
+
+Subcommands: `create | list | get | update | delete | run | checks | check`.
+
+```bash
+# create from flags
+firecrawl monitor create --name "Blog" --schedule "every 30 minutes" \
+  --scrape-urls https://example.com/blog --email alerts@example.com
+
+# or from JSON (positional file, or piped stdin)
+firecrawl monitor create monitor.json
+cat monitor.json | firecrawl monitor create
+
+firecrawl monitor list --limit 20
+firecrawl monitor run <id>                     # trigger a check now
+firecrawl monitor checks <id>                  # list checks
+firecrawl monitor check <id> <checkId> --page-status changed
+firecrawl monitor update <id> --state paused
+firecrawl monitor delete <id>
+```
+
+Schedules accept cron (`--cron "*/30 * * * *"`) or natural language (`--schedule "every 30 minutes"`). Minimum interval is 15 minutes. Targets are either `--scrape-urls a,b,c` (scrape) or `--crawl-url <url>` (crawl whole site each check). Note: `--state` (not `--status`) sets active/paused; `--page-status` (not `--status`) filters page results on `check` — avoids collision with the global `--status` flag. Monitoring is not available for zero-data-retention teams.
+
 **Avoid redundant fetches:**
 
 - `search --scrape` already fetches full page content. Don't re-scrape those URLs.
diff --git a/src/commands/monitor.ts b/src/commands/monitor.ts
new file mode 100644
index 000000000..f042ec922
--- /dev/null
+++ b/src/commands/monitor.ts
@@ -0,0 +1,476 @@
+/**
+ * `firecrawl monitor` — manage Firecrawl monitors.
+ *
+ * Monitors run recurring scrapes/crawls and diff each result against the last
+ * retained snapshot. See features/monitoring in the docs.
+ *
+ * @mendable/firecrawl-js@4.22.2 exposes monitor methods (createMonitor,
+ * listMonitors, getMonitor, updateMonitor, deleteMonitor, runMonitor,
+ * listMonitorChecks, getMonitorCheck), but its HttpClient injects a top-level
+ * `origin: js-sdk@<version>` field into every POST/PATCH body and the
+ * /v2/monitor endpoint rejects that with "Unrecognized key in body". Until the
+ * SDK strips `origin` for monitor requests (or the API accepts it), we hit
+ * /v2/monitor directly via fetch — same pattern parse.ts uses.
+ *
+ * Subcommands:
+ *   create | list | get | update | delete | run | checks | check
+ */
+
+import * as fs from 'fs';
+import { Command } from 'commander';
+import { getConfig, validateConfig } from '../utils/config';
+import { writeOutput } from '../utils/output';
+
+const DEFAULT_API_URL = 'https://api.firecrawl.dev';
+
+/** Per-subcommand overrides shared by every monitor subcommand. */
+interface CommonOptions {
+  apiKey?: string;
+  apiUrl?: string;
+  output?: string;
+  pretty?: boolean;
+}
+
+/** Shape of a request to the /v2/monitor endpoints. */
+interface MonitorRequestInit {
+  method?: string;
+  body?: unknown;
+  query?: Record<string, unknown>;
+}
+
+/**
+ * Perform an authenticated request against /v2/monitor.
+ *
+ * Talks to the API with raw fetch instead of the SDK (see file header).
+ * Query values that are undefined/null/'' are omitted. Throws an Error
+ * carrying the API-provided message on a non-OK response or a
+ * `success: false` payload.
+ */
+async function monitorRequest(
+  path: string,
+  options: CommonOptions,
+  init: MonitorRequestInit = {}
+): Promise<any> {
+  const config = getConfig();
+  const apiKey = options.apiKey || config.apiKey;
+  validateConfig(apiKey);
+
+  const baseUrl = (options.apiUrl || config.apiUrl || DEFAULT_API_URL).replace(
+    /\/$/,
+    ''
+  );
+
+  let url = `${baseUrl}/v2${path}`;
+  if (init.query) {
+    const qs = new URLSearchParams();
+    for (const [k, v] of Object.entries(init.query)) {
+      if (v !== undefined && v !== null && v !== '') qs.set(k, String(v));
+    }
+    const s = qs.toString();
+    if (s) url += `?${s}`;
+  }
+
+  const headers: Record<string, string> = {
+    Authorization: `Bearer ${apiKey}`,
+    'X-Origin': 'cli',
+  };
+  if (init.body !== undefined) headers['Content-Type'] = 'application/json';
+
+  const response = await fetch(url, {
+    method: init.method ?? 'GET',
+    headers,
+    body: init.body !== undefined ? JSON.stringify(init.body) : undefined,
+  });
+
+  // Tolerate empty/non-JSON error bodies; fall back to the HTTP status line.
+  const payload = (await response.json().catch(() => ({}))) as any;
+
+  if (!response.ok || payload?.success === false) {
+    const message =
+      payload?.error ||
+      `HTTP ${response.status}: ${response.statusText || 'Request failed'}`;
+    throw new Error(message);
+  }
+
+  return payload;
+}
+
+/** Serialize `payload` as JSON and write it to stdout or `--output`. */
+function emit(
+  payload: unknown,
+  options: CommonOptions & { json?: boolean }
+): void {
+  const text = JSON.stringify(payload, null, options.pretty ? 2 : 0);
+  writeOutput(text, options.output, !!options.output);
+}
+
+/**
+ * Read a JSON payload from a positional arg or piped stdin.
+ *
+ * - `file` is a path to a .json file, or `-` to read stdin explicitly.
+ * - If `file` is omitted and stdin is a pipe, stdin is used.
+ * - Returns `undefined` when no source is provided — caller falls back to flags.
+ */
+async function readJsonPayload(file?: string): Promise<unknown> {
+  if (file === '-' || (!file && !process.stdin.isTTY)) {
+    const chunks: Buffer[] = [];
+    for await (const chunk of process.stdin) chunks.push(chunk as Buffer);
+    const raw = Buffer.concat(chunks).toString('utf-8').trim();
+    if (!raw) return undefined;
+    return JSON.parse(raw);
+  }
+  if (file) {
+    const raw = fs.readFileSync(file, 'utf-8');
+    return JSON.parse(raw);
+  }
+  return undefined;
+}
+
+/** Split a comma-separated flag value into trimmed, non-empty entries. */
+function parseCommaList(value: string): string[] {
+  return value
+    .split(',')
+    .map((s) => s.trim())
+    .filter(Boolean);
+}
+
+/**
+ * Commander coercion for integer flags. Commander passes the previously
+ * accumulated value as the second argument, so bare `parseInt` would receive
+ * it as the radix on repeated flags — always parse base 10 explicitly.
+ */
+const toInt = (value: string): number => parseInt(value, 10);
+
+/** Print an error message and exit non-zero. */
+function fail(error: unknown): never {
+  console.error('Error:', error instanceof Error ? error.message : error);
+  process.exit(1);
+}
+
+/**
+ * Build the request body for `monitor create` from CLI flags.
+ *
+ * For full control, callers can pass a JSON file path positionally or pipe
+ * JSON on stdin instead. The flags cover the common scrape-target shape.
+ */
+function buildCreateBody(opts: {
+  name?: string;
+  cron?: string;
+  scheduleText?: string;
+  timezone?: string;
+  urls?: string[];
+  crawlUrl?: string;
+  webhookUrl?: string;
+  webhookEvents?: string[];
+  emailRecipients?: string[];
+  retentionDays?: number;
+}): unknown {
+  if (!opts.name) {
+    throw new Error('--name is required (or pass a JSON file / stdin payload)');
+  }
+  if (!opts.cron && !opts.scheduleText) {
+    throw new Error('--cron or --schedule is required');
+  }
+  const hasScrape = opts.urls && opts.urls.length > 0;
+  const hasCrawl = !!opts.crawlUrl;
+  if (!hasScrape && !hasCrawl) {
+    throw new Error('Provide --scrape-urls or --crawl-url');
+  }
+
+  const schedule: Record<string, unknown> = {};
+  if (opts.cron) schedule.cron = opts.cron;
+  if (opts.scheduleText) schedule.text = opts.scheduleText;
+  if (opts.timezone) schedule.timezone = opts.timezone;
+
+  // A monitor may carry both a scrape target and a crawl target.
+  const targets: unknown[] = [];
+  if (hasScrape) targets.push({ type: 'scrape', urls: opts.urls });
+  if (hasCrawl) targets.push({ type: 'crawl', url: opts.crawlUrl });
+
+  const body: Record<string, unknown> = {
+    name: opts.name,
+    schedule,
+    targets,
+  };
+
+  if (opts.webhookUrl) {
+    body.webhook = {
+      url: opts.webhookUrl,
+      ...(opts.webhookEvents && opts.webhookEvents.length > 0
+        ? { events: opts.webhookEvents }
+        : {}),
+    };
+  }
+
+  if (opts.emailRecipients && opts.emailRecipients.length > 0) {
+    body.notification = {
+      email: {
+        enabled: true,
+        recipients: opts.emailRecipients,
+      },
+    };
+  }
+
+  if (opts.retentionDays !== undefined) body.retentionDays = opts.retentionDays;
+
+  return body;
+}
+
+/** Attach the flags shared by every monitor subcommand. */
+function commonOptions(cmd: Command): Command {
+  return cmd
+    .option(
+      '-k, --api-key <key>',
+      'Firecrawl API key (overrides global --api-key)'
+    )
+    .option('--api-url <url>', 'API URL (overrides global --api-url)')
+    .option('-o, --output <path>', 'Output file path (default: stdout)')
+    .option('--pretty', 'Pretty print JSON output', false);
+}
+
+/**
+ * Build the `firecrawl monitor` command tree.
+ */
+export function createMonitorCommand(): Command {
+  const monitor = new Command('monitor').description(
+    'Schedule recurring scrapes/crawls and track content changes'
+  );
+
+  // create
+  commonOptions(
+    monitor
+      .command('create')
+      .description('Create a monitor (flags, or JSON from file/stdin)')
+      .argument(
+        '[file]',
+        'Path to JSON payload (use "-" or pipe stdin to read from stdin)'
+      )
+      .option('--name <name>', 'Monitor name')
+      .option('--cron <expr>', 'Cron schedule (e.g. "*/30 * * * *")')
+      .option(
+        '--schedule <text>',
+        'Natural-language schedule (e.g. "every 30 minutes")'
+      )
+      .option('--timezone <tz>', 'Schedule timezone', 'UTC')
+      .option(
+        '--scrape-urls <urls>',
+        'Comma-separated URLs to scrape on each check',
+        parseCommaList
+      )
+      .option('--crawl-url <url>', 'Root URL for a crawl target')
+      .option('--webhook-url <url>', 'Webhook destination')
+      .option(
+        '--webhook-events <events>',
+        'Comma-separated events (monitor.page, monitor.check.completed)',
+        parseCommaList
+      )
+      .option(
+        '--email <recipients>',
+        'Comma-separated email recipients for change notifications',
+        parseCommaList
+      )
+      .option('--retention-days <days>', 'Snapshot retention window', toInt)
+  ).action(async (file: string | undefined, options) => {
+    try {
+      // JSON payload (file/stdin) wins; otherwise assemble from flags.
+      const fromJson = await readJsonPayload(file);
+      const body =
+        fromJson ??
+        buildCreateBody({
+          name: options.name,
+          cron: options.cron,
+          scheduleText: options.schedule,
+          timezone: options.timezone,
+          urls: options.scrapeUrls,
+          crawlUrl: options.crawlUrl,
+          webhookUrl: options.webhookUrl,
+          webhookEvents: options.webhookEvents,
+          emailRecipients: options.email,
+          retentionDays: options.retentionDays,
+        });
+      const payload = await monitorRequest('/monitor', options, {
+        method: 'POST',
+        body,
+      });
+      emit(payload, options);
+    } catch (err) {
+      fail(err);
+    }
+  });
+
+  // list
+  commonOptions(
+    monitor
+      .command('list')
+      .description('List monitors')
+      .option('--limit <n>', 'Maximum results', toInt)
+      .option('--offset <n>', 'Result offset', toInt)
+  ).action(async (options) => {
+    try {
+      const payload = await monitorRequest('/monitor', options, {
+        query: { limit: options.limit, offset: options.offset },
+      });
+      emit(payload, options);
+    } catch (err) {
+      fail(err);
+    }
+  });
+
+  // get
+  commonOptions(
+    monitor
+      .command('get')
+      .description('Get a monitor by ID')
+      .argument('<monitorId>', 'Monitor ID')
+  ).action(async (monitorId, options) => {
+    try {
+      const payload = await monitorRequest(
+        `/monitor/${encodeURIComponent(monitorId)}`,
+        options
+      );
+      emit(payload, options);
+    } catch (err) {
+      fail(err);
+    }
+  });
+
+  // update
+  commonOptions(
+    monitor
+      .command('update')
+      .description('Update a monitor (flags, or JSON from file/stdin)')
+      .argument('<monitorId>', 'Monitor ID')
+      .argument(
+        '[file]',
+        'Path to JSON payload (use "-" or pipe stdin to read from stdin)'
+      )
+      .option('--name <name>', 'New name')
+      .option('--cron <expr>', 'New cron schedule')
+      .option('--schedule <text>', 'New natural-language schedule')
+      .option('--timezone <tz>', 'Schedule timezone')
+      .option('--state <state>', 'active | paused')
+      .option('--retention-days <days>', 'Snapshot retention window', toInt)
+  ).action(async (monitorId: string, file: string | undefined, options) => {
+    try {
+      const fromJson = await readJsonPayload(file);
+      let body: Record<string, unknown>;
+      if (fromJson) {
+        body = fromJson as Record<string, unknown>;
+      } else {
+        body = {};
+        if (options.name) body.name = options.name;
+        // CLI flag is `--state` to dodge the global `--status` flag; the API
+        // field itself is named `status`.
+        if (options.state) body.status = options.state;
+        if (options.retentionDays !== undefined)
+          body.retentionDays = options.retentionDays;
+        if (options.cron || options.schedule || options.timezone) {
+          const schedule: Record<string, unknown> = {};
+          if (options.cron) schedule.cron = options.cron;
+          if (options.schedule) schedule.text = options.schedule;
+          if (options.timezone) schedule.timezone = options.timezone;
+          body.schedule = schedule;
+        }
+        if (Object.keys(body).length === 0) {
+          throw new Error(
+            'Provide at least one field to update (or a JSON file / stdin payload)'
+          );
+        }
+      }
+      const payload = await monitorRequest(
+        `/monitor/${encodeURIComponent(monitorId)}`,
+        options,
+        { method: 'PATCH', body }
+      );
+      emit(payload, options);
+    } catch (err) {
+      fail(err);
+    }
+  });
+
+  // delete
+  commonOptions(
+    monitor
+      .command('delete')
+      .description('Delete a monitor')
+      .argument('<monitorId>', 'Monitor ID')
+  ).action(async (monitorId, options) => {
+    try {
+      const payload = await monitorRequest(
+        `/monitor/${encodeURIComponent(monitorId)}`,
+        options,
+        { method: 'DELETE' }
+      );
+      emit(payload, options);
+    } catch (err) {
+      fail(err);
+    }
+  });
+
+  // run
+  commonOptions(
+    monitor
+      .command('run')
+      .description('Trigger a check immediately')
+      .argument('<monitorId>', 'Monitor ID')
+  ).action(async (monitorId, options) => {
+    try {
+      const payload = await monitorRequest(
+        `/monitor/${encodeURIComponent(monitorId)}/run`,
+        options,
+        { method: 'POST' }
+      );
+      emit(payload, options);
+    } catch (err) {
+      fail(err);
+    }
+  });
+
+  // checks (list)
+  commonOptions(
+    monitor
+      .command('checks')
+      .description('List checks for a monitor')
+      .argument('<monitorId>', 'Monitor ID')
+      .option('--limit <n>', 'Maximum results', toInt)
+      .option('--offset <n>', 'Result offset', toInt)
+  ).action(async (monitorId, options) => {
+    try {
+      const payload = await monitorRequest(
+        `/monitor/${encodeURIComponent(monitorId)}/checks`,
+        options,
+        { query: { limit: options.limit, offset: options.offset } }
+      );
+      emit(payload, options);
+    } catch (err) {
+      fail(err);
+    }
+  });
+
+  // check (get one)
+  commonOptions(
+    monitor
+      .command('check')
+      .description('Get a specific check, with page-level results')
+      .argument('<monitorId>', 'Monitor ID')
+      .argument('<checkId>', 'Check ID')
+      .option('--limit <n>', 'Max page results', toInt)
+      .option('--skip <n>', 'Skip page results', toInt)
+      .option(
+        '--page-status <status>',
+        'Filter page results: same, new, changed, removed, error'
+      )
+  ).action(async (monitorId, checkId, options) => {
+    try {
+      const payload = await monitorRequest(
+        `/monitor/${encodeURIComponent(monitorId)}/checks/${encodeURIComponent(checkId)}`,
+        options,
+        {
+          query: {
+            limit: options.limit,
+            skip: options.skip,
+            // `--page-status` maps to the API's `status` query parameter.
+            status: options.pageStatus,
+          },
+        }
+      );
+      emit(payload, options);
+    } catch (err) {
+      fail(err);
+    }
+  });
+
+  monitor.addHelpText(
+    'after',
+    `
+Examples:
+  $ firecrawl monitor create --name "Blog" \\
+      --schedule "every 30 minutes" \\
+      --scrape-urls https://example.com/blog \\
+      --email alerts@example.com
+  $ firecrawl monitor create monitor.json
+  $ cat monitor.json | firecrawl monitor create
+  $ firecrawl monitor list --limit 20
+  $ firecrawl monitor get mon_abc123
+  $ firecrawl monitor update mon_abc123 --state paused
+  $ firecrawl monitor run mon_abc123
+  $ firecrawl monitor checks mon_abc123 --limit 10
+  $ firecrawl monitor check mon_abc123 chk_xyz --page-status changed
+`
+  );
+
+  return monitor;
+}
diff --git a/src/index.ts b/src/index.ts
index 4ea6442bf..4a1a45979 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -18,6 +18,7 @@
 import { handleCreditUsageCommand } from './commands/credit-usage';
 import { handleCrawlCommand } from './commands/crawl';
 import { handleMapCommand } from './commands/map';
 import { handleParseCommand } from './commands/parse';
+import { createMonitorCommand } from './commands/monitor';
 import { handleSearchCommand } from './commands/search';
 import { handleAgentCommand } from './commands/agent';
 import {
@@ -70,6 +71,7 @@ const AUTH_REQUIRED_COMMANDS = [
   'browser',
   'interact',
   'credit-usage',
+  'monitor',
 ];
 
 const commandSet = new Set([]);
@@ -1574,6 +1576,7 @@ Examples:
 program.addCommand(createCrawlCommand());
 program.addCommand(createMapCommand());
 program.addCommand(createParseCommand());
+program.addCommand(createMonitorCommand());
 program.addCommand(createSearchCommand());
 program.addCommand(createAgentCommand());
 program.addCommand(createInteractCommand());