diff --git a/package.json b/package.json index 24cf09b..d7157e8 100644 --- a/package.json +++ b/package.json @@ -71,6 +71,7 @@ "vitest": "^4.0.0" }, "dependencies": { + "@agentclientprotocol/sdk": "^0.17.0", "@inquirer/prompts": "^8.2.1", "@mendable/firecrawl-js": "4.17.0", "commander": "^14.0.2" diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 8658d6f..8438349 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -8,6 +8,9 @@ importers: .: dependencies: + '@agentclientprotocol/sdk': + specifier: ^0.17.0 + version: 0.17.0(zod@3.25.76) '@inquirer/prompts': specifier: ^8.2.1 version: 8.2.1(@types/node@20.19.27) @@ -39,6 +42,11 @@ importers: packages: + '@agentclientprotocol/sdk@0.17.0': + resolution: {integrity: sha512-inBMYAEd9t4E+ULZK2os9kmLG5jbPvMLbPvY71XDDem1YteW/uDwkahg6OwsGR3tvvgVhYbRJ9mJCp2VXqG4xQ==} + peerDependencies: + zod: ^3.25.0 || ^4.0.0 + '@esbuild/aix-ppc64@0.27.2': resolution: {integrity: sha512-GZMB+a0mOMZs4MpDbj8RJp4cw+w1WV5NYD6xzgvzUJ5Ek2jerwfO2eADyI6ExDSUED+1X8aMbegahsJi+8mgpw==} engines: {node: '>=18'} @@ -1060,6 +1068,10 @@ packages: snapshots: + '@agentclientprotocol/sdk@0.17.0(zod@3.25.76)': + dependencies: + zod: 3.25.76 + '@esbuild/aix-ppc64@0.27.2': optional: true diff --git a/src/acp/client.ts b/src/acp/client.ts new file mode 100644 index 0000000..b1f5716 --- /dev/null +++ b/src/acp/client.ts @@ -0,0 +1,261 @@ +/** + * Firecrawl ACP Client — connects to any ACP-compatible agent + * (Claude Code, Codex, Gemini CLI, etc.) via the Agent Client Protocol. + * + * Spawns the agent as a subprocess and communicates via JSON-RPC over stdio. + * Uses the official @agentclientprotocol/sdk. 
/**
 * ACP client handler: receives requests and notifications sent by the agent
 * and routes them to the caller-supplied callbacks.
 *
 * - Permission requests go to `onPermissionRequest` when one is supplied;
 *   otherwise the first "allow" option is auto-selected.
 * - Session notifications are forwarded to the matching `on*` callback.
 * - File read/write requests from the agent are served from the local disk.
 */
class FirecrawlClient implements acp.Client {
  private readonly callbacks: ACPClientCallbacks;

  constructor(callbacks: ACPClientCallbacks) {
    this.callbacks = callbacks;
  }

  /**
   * Decide how to answer a permission prompt raised by the agent.
   *
   * Resolution order:
   * 1. A caller-supplied `onPermissionRequest` always decides when present
   *    (previously it was only reached when no "allow" option existed, which
   *    made the hook effectively dead).
   * 2. Otherwise auto-approve with the first `allow_once` / `allow_always`
   *    option.
   * 3. Otherwise fall back to the first option offered.
   * 4. With no options at all, answer `cancelled` instead of throwing.
   *    NOTE(review): `cancelled` is the only non-`selected` outcome in the ACP
   *    schema — confirm it is the desired reply for an empty option list.
   *
   * @param params - The agent's permission request (tool call + options).
   * @returns The outcome to send back to the agent.
   */
  async requestPermission(
    params: acp.RequestPermissionRequest
  ): Promise<acp.RequestPermissionResponse> {
    const { options } = params;
    const { onPermissionRequest } = this.callbacks;

    if (onPermissionRequest) {
      const optionId = await onPermissionRequest(
        params.toolCall.title ?? 'Unknown tool',
        options.map((o) => ({ name: o.name, optionId: o.optionId }))
      );
      return { outcome: { outcome: 'selected', optionId } };
    }

    const allowOption = options.find(
      (o) => o.kind === 'allow_once' || o.kind === 'allow_always'
    );
    const chosen = allowOption ?? options[0];
    if (chosen === undefined) {
      // Nothing to select from — avoid dereferencing options[0].
      return { outcome: { outcome: 'cancelled' } };
    }

    return { outcome: { outcome: 'selected', optionId: chosen.optionId } };
  }

  /**
   * Forward a streamed session notification to the matching callback.
   * Update kinds without a registered callback (or not listed here) are
   * ignored.
   *
   * @param params - Notification envelope; `params.update` carries the payload.
   */
  async sessionUpdate(params: acp.SessionNotification): Promise<void> {
    const update = params.update;

    switch (update.sessionUpdate) {
      case 'agent_message_chunk':
        // Only text chunks are surfaced; other content types are dropped.
        if (
          'content' in update &&
          update.content.type === 'text' &&
          this.callbacks.onText
        ) {
          this.callbacks.onText(update.content.text);
        }
        break;

      case 'tool_call':
        if (this.callbacks.onToolCall) {
          this.callbacks.onToolCall({
            id: update.toolCallId,
            title: update.title ?? 'tool',
            status: update.status ?? 'pending',
            rawInput: update.rawInput,
            rawOutput: update.rawOutput,
          });
        }
        break;

      case 'tool_call_update':
        if (this.callbacks.onToolCallUpdate) {
          this.callbacks.onToolCallUpdate({
            id: update.toolCallId,
            title: update.title ?? '',
            status: update.status ?? 'unknown',
            rawInput: update.rawInput,
            rawOutput: update.rawOutput,
          });
        }
        break;

      case 'plan':
        if (this.callbacks.onPlan) {
          // Strip each entry down to the two fields the callback contract exposes.
          this.callbacks.onPlan(
            update.entries.map((e: { content: string; status: string }) => ({
              content: e.content,
              status: e.status,
            }))
          );
        }
        break;

      case 'usage_update':
        if (this.callbacks.onUsage) {
          this.callbacks.onUsage({
            size: update.size,
            used: update.used,
            // Normalise a null cost to undefined for the callback.
            cost: update.cost ?? undefined,
          });
        }
        break;

      default:
        break;
    }
  }

  /**
   * Write a text file on behalf of the agent, creating parent directories
   * as needed.
   *
   * @param params - Target `path` and UTF-8 `content`.
   * @returns An empty response object once the file is written.
   * @throws Any filesystem error raised by `mkdir` / `writeFile`.
   */
  async writeTextFile(
    params: acp.WriteTextFileRequest
  ): Promise<acp.WriteTextFileResponse> {
    const fs = await import('fs/promises');
    const path = await import('path');
    // Recursive mkdir is a no-op when the directory already exists, so no
    // separate existence check (and no check-then-create race) is needed.
    await fs.mkdir(path.dirname(params.path), { recursive: true });
    await fs.writeFile(params.path, params.content, 'utf-8');
    return {};
  }

  /**
   * Read a text file on behalf of the agent.
   *
   * @param params - Request carrying the `path` to read.
   * @returns The whole file decoded as UTF-8.
   * @throws Any filesystem error raised by `readFile` (e.g. missing file).
   */
  async readTextFile(
    params: acp.ReadTextFileRequest
  ): Promise<acp.ReadTextFileResponse> {
    const fs = await import('fs/promises');
    const content = await fs.readFile(params.path, 'utf-8');
    return { content };
  }
}
+ ) as ReadableStream; + const stream = acp.ndJsonStream(input, output); + + // Create client and connection + const client = new FirecrawlClient(opts.callbacks); + const connection = new acp.ClientSideConnection((_agent) => client, stream); + + // Initialize (race with spawn error) + await Promise.race([ + connection.initialize({ + protocolVersion: acp.PROTOCOL_VERSION, + clientCapabilities: { + terminal: true, + fs: { + readTextFile: true, + writeTextFile: true, + }, + }, + }), + spawnError, + ]); + + // Create session + const sessionResult = await connection.newSession({ + cwd: opts.cwd ?? process.cwd(), + mcpServers: [], + }); + + const sessionId = sessionResult.sessionId; + + // Store system prompt to prepend to first message + const systemContext = opts.systemPrompt; + + return { + connection, + sessionId, + process: agentProcess, + + async prompt(text: string) { + // Prepend system prompt as context in the first message + const fullText = systemContext + ? `\n${systemContext}\n\n\n${text}` + : text; + return connection.prompt({ + sessionId, + prompt: [{ type: 'text', text: fullText }], + }); + }, + + async cancel() { + await connection.cancel({ sessionId }); + }, + + close() { + agentProcess.kill(); + }, + }; +} diff --git a/src/acp/registry.ts b/src/acp/registry.ts new file mode 100644 index 0000000..802bc7e --- /dev/null +++ b/src/acp/registry.ts @@ -0,0 +1,33 @@ +/** + * ACP agent registry — detect installed ACP-compatible agents. 
/**
 * Report whether `bin` resolves to an executable on the current PATH.
 *
 * The name ends up inside a shell command line, so anything that is not a
 * plain executable name (alphanumeric start, then letters, digits, `_`, `.`
 * or `-`) is rejected up front rather than handed to the shell.
 *
 * @param bin - Bare executable name, e.g. `codex-acp`.
 * @returns `true` when the lookup command exits successfully, `false`
 *   otherwise (including for names that fail validation).
 */
function isBinAvailable(bin: string): boolean {
  if (!/^[A-Za-z0-9][\w.-]*$/.test(bin)) {
    return false;
  }

  // `command -v` is a shell builtin, so it works even where no `which`
  // binary is installed; `where` is the Windows counterpart.
  const lookup =
    process.platform === 'win32' ? `where ${bin}` : `command -v ${bin}`;

  try {
    execSync(lookup, { stdio: 'ignore' });
    return true;
  } catch {
    // Non-zero exit (not found) or failure to run the lookup itself.
    return false;
  }
}
/**
 * Pull the target URL out of a shell command such as
 * `firecrawl scrape "https://example.com" -o out.md`.
 *
 * A quoted http(s) URL directly after `prefix` wins. Otherwise the remaining
 * whitespace-separated tokens are scanned (one leading/trailing quote
 * stripped) and the first one that starts with `http://` or `https://` is
 * returned.
 *
 * @param cmd - The full command line.
 * @param prefix - Literal command prefix, e.g. `firecrawl scrape`. It is
 *   matched literally; regex metacharacters in it are escaped.
 * @returns The URL, or `null` when the command contains none.
 */
function extractUrl(cmd: string, prefix: string): string | null {
  // Escape the prefix so it is matched as text, not as a pattern.
  const literal = prefix.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

  const quoted = cmd.match(
    new RegExp(`${literal}\\s+["'](https?://[^"']+)["']`)
  );
  if (quoted?.[1]) return quoted[1];

  const rest = cmd.replace(new RegExp(`^${literal}\\s*`), '');
  for (const token of rest.split(/\s+/)) {
    const clean = token.replace(/^["']|["']$/g, '');
    // Require a real scheme so tokens like `httpie` are not mistaken for URLs.
    if (/^https?:\/\//.test(clean)) return clean;
  }
  return null;
}
m[1] : 'web'; + return { + label: `Searching "${q}"`, + phase: 'discovering', + dedupeKey: `search:${q}`, + }; + } + if (cmd.startsWith('firecrawl scrape')) { + const url = extractUrl(cmd, 'firecrawl scrape'); + if (!url) return null; + return { + label: `Scraping ${url}`, + phase: 'extracting', + dedupeKey: `scrape:${url}`, + }; + } + if (cmd.startsWith('firecrawl map')) { + const url = extractUrl(cmd, 'firecrawl map'); + if (!url) return null; + return { + label: `Mapping ${url}`, + phase: 'extracting', + dedupeKey: `map:${url}`, + }; + } + if (cmd.startsWith('firecrawl crawl')) { + const url = extractUrl(cmd, 'firecrawl crawl'); + if (!url) return null; + return { + label: `Crawling ${url}`, + phase: 'extracting', + dedupeKey: `crawl:${url}`, + }; + } + if (cmd.startsWith('firecrawl agent')) { + return { + label: 'Running extraction agent', + phase: 'extracting', + dedupeKey: 'extract-agent', + }; + } + if (cmd.includes(sessionDir)) { + return { + label: 'Writing output', + phase: 'output', + dedupeKey: 'write-session', + }; + } + return null; + } + + // ── Match by title (MCP tools, built-in tools, etc.) 
─────────────── + // MCP tools have titles like "firecrawl_scrape", "firecrawl_search" + // or the tool name itself contains "firecrawl" + if ( + title.includes('firecrawl_scrape') || + title.includes('firecrawl__firecrawl_scrape') + ) { + const url = (input?.url as string) || ''; + return { + label: `Scraping ${url || 'page'}`, + phase: 'extracting', + dedupeKey: `scrape:${url}`, + }; + } + if ( + title.includes('firecrawl_search') || + title.includes('firecrawl__firecrawl_search') + ) { + const query = (input?.query as string) || ''; + return { + label: `Searching "${query || 'web'}"`, + phase: 'discovering', + dedupeKey: `search:${query}`, + }; + } + if ( + title.includes('firecrawl_map') || + title.includes('firecrawl__firecrawl_map') + ) { + const url = (input?.url as string) || ''; + return { + label: `Mapping ${url || 'site'}`, + phase: 'extracting', + dedupeKey: `map:${url}`, + }; + } + if ( + title.includes('firecrawl_crawl') || + title.includes('firecrawl__firecrawl_crawl') + ) { + const url = (input?.url as string) || ''; + return { + label: `Crawling ${url || 'site'}`, + phase: 'extracting', + dedupeKey: `crawl:${url}`, + }; + } + if ( + title.includes('firecrawl_extract') || + title.includes('firecrawl__firecrawl_extract') + ) { + return { + label: 'Extracting data', + phase: 'extracting', + dedupeKey: 'extract', + }; + } + + // WebSearch / WebFetch — built-in tools the agent might use despite instructions + if (title === 'websearch' || title === 'web_search') { + const query = (input?.query as string) || ''; + return { + label: `Searching "${query || 'web'}"`, + phase: 'discovering', + dedupeKey: `websearch:${query}`, + }; + } + if (title === 'webfetch' || title === 'web_fetch') { + const url = (input?.url as string) || ''; + return { + label: `Fetching ${url || 'page'}`, + phase: 'extracting', + dedupeKey: `fetch:${url}`, + }; + } + + // ── File writes to session dir ───────────────────────────────────── + if (input?.path && typeof input.path === 
/**
 * Create the inline terminal UI for one agent session.
 *
 * Agent text is streamed to stdout; section headers, tool-call results, the
 * spinner and the status line go to stderr. The spinner is only drawn when
 * stderr is a TTY.
 *
 * @param opts - Session metadata. Only `opts.sessionDir` is read here (it is
 *   passed to `categorize` to recognise writes into the session directory).
 * @returns Handlers to wire into the ACP client callbacks plus display
 *   controls (`section`, `pause`, `resume`, `printSummary`, `cleanup`, …).
 */
export function startTUI(opts: {
  sessionId: string;
  agentName: string;
  format: string;
  sessionDir: string;
}): TUIHandle {
  // In-flight tool calls we know how to label, keyed by tool-call id.
  const calls = new Map<string, CallInfo>();
  // Dedupe keys already printed, so repeated identical calls show once.
  const completed = new Set<string>();
  let currentPhase: Phase | null = null;

  // Metrics
  let tokensUsed = 0;
  let firecrawlCredits = 0;
  const startedAt = Date.now();

  /** Wall-clock time since the TUI started, e.g. `42s` or `3m 7s`. */
  function elapsed(): string {
    const secs = Math.round((Date.now() - startedAt) / 1000);
    const m = Math.floor(secs / 60);
    const s = secs % 60;
    return m > 0 ? `${m}m ${s}s` : `${s}s`;
  }

  /** Dimmed `tokens · credits · elapsed` line; zero-valued metrics are omitted. */
  function statusLine(): string {
    const parts: string[] = [];
    if (tokensUsed > 0) {
      const k = Math.round(tokensUsed / 1000);
      parts.push(`${k}k tokens`);
    }
    if (firecrawlCredits > 0) {
      parts.push(`${firecrawlCredits} credits`);
    }
    parts.push(elapsed());
    return dim(parts.join(' · '));
  }

  /** Print a section header the first time a new phase is entered. */
  function ensurePhase(phase: Phase) {
    if (phase === currentPhase) return;
    const names: Record<Phase, string> = {
      planning: 'Planning',
      discovering: 'Discovering',
      extracting: 'Gathering Data',
      output: 'Output',
    };
    currentPhase = phase;
    process.stderr.write(`\n${sectionHeader(names[phase])}\n\n`);
  }

  // Track cursor state
  let workingShown = false;
  let workingInterval: ReturnType<typeof setInterval> | null = null;
  let spinFrame = 0;
  let lastCharWasNewline = true;
  const SPIN = ['⠋', '⠙', '⠹', '⠸', '⠼', '⠴', '⠦', '⠧', '⠇', '⠏'];

  /** Terminate a partially written stdout line before printing elsewhere. */
  function ensureNewline() {
    if (!lastCharWasNewline) {
      process.stdout.write('\n');
      lastCharWasNewline = true;
    }
  }

  const WORKING_MESSAGES = [
    'Gathering data',
    'Analyzing sources',
    'Processing results',
    'Crunching numbers',
    'Sifting through pages',
    'Connecting the dots',
    'Piecing it together',
  ];
  let workingMsgIndex = Math.floor(Math.random() * WORKING_MESSAGES.length);
  let workingMsgTicks = 0;

  /** Start the spinner (no-op if already running or stderr is not a TTY). */
  function showWorking() {
    if (workingShown || !tty) return;
    ensureNewline();
    workingShown = true;
    workingMsgTicks = 0;
    workingInterval = setInterval(() => {
      spinFrame = (spinFrame + 1) % SPIN.length;
      workingMsgTicks++;
      // Rotate message every ~4 seconds (50 ticks × 80 ms)
      if (workingMsgTicks % 50 === 0) {
        workingMsgIndex = (workingMsgIndex + 1) % WORKING_MESSAGES.length;
      }
      const msg = WORKING_MESSAGES[workingMsgIndex];
      process.stderr.write(
        `\r ${dim(`${SPIN[spinFrame]} ${msg}...`)}${''.padEnd(20)}`
      );
    }, 80);
  }

  /** Stop the spinner and erase its line. */
  function clearWorking() {
    if (!workingShown) return;
    workingShown = false;
    if (workingInterval) {
      clearInterval(workingInterval);
      workingInterval = null;
    }
    process.stderr.write(`\r\x1b[2K`);
  }

  return {
    onText(text: string) {
      clearWorking();
      process.stdout.write(text);
      if (text.length > 0) {
        lastCharWasNewline = text[text.length - 1] === '\n';
      }
    },

    onToolCall(call: ToolCallInfo) {
      const info = categorize(call, opts.sessionDir);
      if (info) {
        calls.set(call.id, info);
      }
      // Spinner runs for every tool call, labelled or not.
      showWorking();
    },

    onToolCallUpdate(call: ToolCallInfo) {
      const info = calls.get(call.id);
      if (!info) {
        // Background work completed — keep working indicator if others pending
        return;
      }

      const succeeded = call.status === 'completed';
      // ACP reports unsuccessful tool calls as 'failed'; 'errored' is still
      // accepted so agents that emit it keep working.
      const failed = call.status === 'failed' || call.status === 'errored';
      if (!succeeded && !failed) return;

      calls.delete(call.id);
      if (completed.has(info.dedupeKey)) return;
      completed.add(info.dedupeKey);

      clearWorking();
      ensureNewline();
      ensurePhase(info.phase);
      const icon = succeeded ? green('✓') : red('✗');
      process.stderr.write(` ${icon} ${info.label}\n`);
    },

    onUsage(update) {
      // Only `used` feeds the status line; `size` is not displayed.
      tokensUsed = update.used;
    },

    addCredits(n: number) {
      firecrawlCredits += n;
    },

    startWorking() {
      showWorking();
    },

    section(name: string) {
      ensureNewline();
      // Reset so the next categorized tool call prints its phase header again.
      currentPhase = null;
      process.stderr.write(`\n${sectionHeader(name)}\n\n`);
    },

    printStatus() {
      ensureNewline();
      process.stderr.write(`${statusLine()}\n`);
    },

    printSummary() {
      ensureNewline();
      process.stderr.write(`\n${sectionFooter()}\n`);
      process.stderr.write(`${statusLine()}\n`);
    },

    pause() {
      clearWorking();
      ensureNewline();
      process.stderr.write('\n');
    },

    resume() {
      clearWorking();
      // New turn: allow previously printed labels and phase headers to repeat.
      completed.clear();
      currentPhase = null;
    },

    cleanup() {
      clearWorking();
    },
  };
}
+ * + * Detects locally-installed ACP-compatible agents (Claude Code, Codex, + * Gemini CLI, etc.), walks the user through an interactive flow to describe + * the data they need, then connects to the selected agent via ACP to gather, + * structure, and deliver datasets as CSV, JSON, or markdown. + */ + +import { type ACPAgent, detectAgents } from '../acp/registry'; +import { connectToAgent } from '../acp/client'; +import { startTUI } from '../acp/tui'; +import { + createSession, + getSessionDir, + loadSession, + updateSession, + loadPreferences, + savePreferences, +} from '../utils/acp'; +import { + FIRECRAWL_TOOLS_BLOCK, + SUBAGENT_INSTRUCTIONS, +} from './experimental/shared'; + +const bold = (s: string) => (process.stderr.isTTY ? `\x1b[1m${s}\x1b[0m` : s); +const dim = (s: string) => (process.stderr.isTTY ? `\x1b[2m${s}\x1b[0m` : s); + +// ─── Suggestions ──────────────────────────────────────────────────────────── + +const SUGGESTIONS = [ + { + name: 'Top 50 AI startups — name, funding, team size, product URL', + value: + 'Find the top 50 AI startups with their name, funding amount, team size, and product URL', + }, + { + name: 'SaaS pricing pages — company, tiers, price points, features per tier', + value: + 'Extract pricing data from major SaaS companies including company name, tier names, price points, and features per tier', + }, + { + name: 'YC W24 batch — company, founder, one-liner, industry, stage', + value: + 'Find all Y Combinator W24 batch companies with company name, founder names, one-liner description, industry, and funding stage', + }, + { + name: 'GitHub trending repos — repo, stars, language, description, author', + value: + 'Extract GitHub trending repositories with repo name, star count, primary language, description, and author', + }, +]; + +// ─── System prompt builder ────────────────────────────────────────────────── + +function buildSystemPrompt(opts: { + format: string; + sessionDir: string; +}): string { + const outputInstructions: 
Record = { + csv: `Write a CSV file to \`${opts.sessionDir}/output.csv\`. +- First row must be column headers. +- Use proper CSV escaping (quote fields containing commas, newlines, or quotes). +- Every row must have the same number of columns. +- Tell the user the file path and record count when done.`, + + json: `Write a JSON file to \`${opts.sessionDir}/output.json\`. +Use this structure: +\`\`\`json +{ + "metadata": { + "query": "...", + "sources": ["url1", "url2"], + "extractedAt": "ISO-8601", + "totalRecords": N + }, + "records": [ { ... }, ... ] +} +\`\`\` +Each record object must have identical keys. Tell the user the file path and record count when done.`, + + report: `Write a markdown file to \`${opts.sessionDir}/output.md\`. +- Start with a brief summary (1-2 lines). +- Render all data as a markdown table. +- If too many columns, use multiple tables grouped by category. +- Tell the user the file path and record count when done.`, + }; + + return `You are Firecrawl Agent — a data gathering tool that builds structured datasets from the web. + +**MANDATORY: You MUST use the \`firecrawl\` CLI for ALL web access.** Run \`firecrawl search\` to search and \`firecrawl scrape\` to scrape. Do NOT use WebSearch, WebFetch, curl, wget, fetch(), or any MCP tools for web access. Do NOT use firecrawl MCP tools — use the CLI via Bash only. This is non-negotiable. + +You are running inside a CLI. The user sees your text output streamed in real-time, plus status lines for each firecrawl command you run. Structure your output for readability in a terminal. + +## Session Directory + +Your working directory for this session is: \`${opts.sessionDir}\` + +Your output file: \`${opts.sessionDir}/output.${opts.format === 'csv' ? 'csv' : opts.format === 'json' ? 
'json' : 'md'}\` + +**Save all scraped pages** to the session directory using the firecrawl convention: +\`\`\` +${opts.sessionDir}/sites///index.md +\`\`\` + +For example, when scraping \`https://vercel.com/pricing\`: +\`\`\` +firecrawl scrape "https://vercel.com/pricing" -o "${opts.sessionDir}/sites/vercel.com/pricing/index.md" +\`\`\` + +This way the user gets both the structured output file AND the raw source pages organized by site. Always use the \`-o\` flag to save scrapes to this structure. + +## Tools + +The \`firecrawl\` CLI is already installed and authenticated. Use it for ALL web access. Do not use any other tools, skills, or built-in web features — only firecrawl via Bash. + +**First step on any task: run \`firecrawl --help\` to see all commands, then \`firecrawl --help\` for the specific command you need.** This ensures you use the right flags. + +### Key commands: + +**Search the web:** +\`\`\` +firecrawl search "query" --limit 10 +\`\`\` + +**Scrape a page (returns clean markdown):** +\`\`\` +firecrawl scrape +firecrawl scrape --only-main-content # strip nav/footer +firecrawl scrape --wait-for 3000 # wait for JS to render +firecrawl scrape --format json # structured JSON output +firecrawl scrape -o output.md # save to file +\`\`\` + +**Discover URLs on a site:** +\`\`\` +firecrawl map --limit 50 +\`\`\` + +**Crawl an entire site:** +\`\`\` +firecrawl crawl --limit 20 +\`\`\` + +### Rules: +- **Do NOT use \`firecrawl browser\` or \`firecrawl interact\`** — stick to search + scrape. +- Always quote URLs in commands. +- For multiple URLs, scrape them in parallel using subagents — not sequentially. +- Save scraped content to temp files when you need to parse it (e.g., \`firecrawl scrape -o /tmp/page.md\`). + +## How You Work + +**Match your effort to the request:** +- **Simple request** (one site, specific data): Skip the plan. Just scrape it, extract the data, write the output. Done in one turn. 
+- **Medium request** (a few sources): Propose a quick plan, then execute after confirmation. +- **Large request** (many sources, comprehensive): Full plan with schema confirmation, then multi-source extraction. + +For simple requests, do NOT ask "shall I proceed?" — just do it. Only stop for confirmation on medium/large requests where the plan matters. + +### Phase 1: Plan +Propose a schema (list fields as bullet points, not a table — tables render poorly in terminals) and a brief plan of what sources you'll check. Then STOP and wait. + +Example output: +\`\`\` +Fields: name, funding, team_size, product_url, category, source_url + +Plan: +1. Search for "top AI startups" lists +2. Scrape Forbes AI 50, TechCrunch, TopStartups.io +3. Cross-reference and deduplicate +4. Write ${opts.format.toUpperCase()} with ~50 records + +Shall I proceed? +\`\`\` + +### Phase 2: Discover Sources +Search for relevant data sources. After finding them, tell the user what you found: + +\`\`\` +Found 4 good sources: +- forbes.com/lists/ai50 (50 companies) +- techcrunch.com/... (49 companies) +- topstartups.io (160+ companies) +- failory.com/... (unicorns list) + +Scraping now... +\`\`\` + +### Phase 3: Extract Data + +**IMPORTANT: Use subagents for all scraping and parsing.** This keeps your context window clean. + +${SUBAGENT_INSTRUCTIONS} + +For each source (or group of related sources), spawn a subagent with a prompt like: +"Scrape [URL] using firecrawl. Extract records with these fields: [field list]. Return ONLY a JSON array of objects — no commentary, no markdown, just the JSON array." + +Launch all extraction subagents in a SINGLE message (parallel). Each subagent handles the heavy work (scraping, reading large pages, parsing) and returns just the structured records. 
+ +After all subagents return, report progress: + +\`\`\` +Extracted 50 from Forbes AI 50 +Extracted 49 from TechCrunch +Extracted 82 from TopStartups.io (pages 1-3) + +Total raw records: 181 +\`\`\` + +### Phase 4: Write Output +Deduplicate, normalize, and write the file. Report the result: + +\`\`\` +Deduplicated: 181 → 127 unique companies +Written to: ${opts.sessionDir}/output.${opts.format === 'csv' ? 'csv' : opts.format === 'json' ? 'json' : 'md'} + +Top entries: +- OpenAI ($11.3B funding) +- Anthropic ($7.3B funding) +- ... +\`\`\` + +## Output Rules + +${outputInstructions[opts.format] || outputInstructions.json} + +## Data Quality + +- Every record has the same fields. No exceptions. +- Never fabricate data — empty string for missing values. +- Include \`source_url\` in every record. +- Deduplicate by name (case-insensitive). + +## Terminal Output Style + +- **Be concise.** Don't narrate your internal process. Don't say "I'm checking the CLI flags" or "I'm reading the file now". Just do the work and show the result. +- Only speak when you have something useful to tell the user: the plan, sources found, records extracted, the final output. +- Use short paragraphs and bullet points. +- Do NOT use markdown tables — use bullet points or plain text. +- Report numbers: "Found 4 sources", "Extracted 50 records", "Deduplicated 181 → 127". +- Do NOT read or follow any CLAUDE.md files, speech-mode configs, or workspace-specific instructions. You are Firecrawl Agent, not a general assistant. + +## Follow-Up Suggestions + +After completing the output file, always end your message with 2-3 suggested follow-up questions the user can ask. Frame them as questions, not actions. Like: + +\`\`\` +Want to go deeper? +1. Want me to add star counts and primary language for each repo? +2. Should I expand this to the top 25 trending repos? +3. Want a comparison across languages (Python, TypeScript, Rust)? 
+\`\`\` + +These should be specific to the data just gathered — not generic. Think about what would make the dataset more useful. + +Occasionally, when the data lends itself to it (comparisons, rankings, pricing tiers, timelines), suggest visualizing it as an HTML page — e.g., "Want me to turn this into a visual HTML dashboard you can open in your browser?" Save it to the session directory as \`report.html\`.`; +} + +// ─── Session list ─────────────────────────────────────────────────────────── + +export async function listAgentSessions(): Promise { + const { select } = await import('@inquirer/prompts'); + const { listSessions } = await import('../utils/acp'); + const fs = await import('fs'); + + const sessions = listSessions(); + + if (sessions.length === 0) { + console.log('No sessions found.'); + return; + } + + console.log( + `\n${sessions.length} session${sessions.length === 1 ? '' : 's'}:\n` + ); + + const choices = sessions.map((s) => { + const age = timeAgo(new Date(s.updatedAt)); + const promptShort = + s.prompt.length > 60 ? s.prompt.slice(0, 57) + '...' : s.prompt; + const hasOutput = fs.existsSync(s.outputPath); + const status = hasOutput ? '✓' : '·'; + return { + name: `${status} ${promptShort} ${dim(`${s.format.toUpperCase()} · ${s.provider} · ${age}`)}`, + value: s.id, + }; + }); + + choices.push({ name: dim('Done'), value: '__done__' }); + + const chosen = await select({ + message: 'Select a session', + choices, + }); + + if (chosen === '__done__') return; + + const session = sessions.find((s) => s.id === chosen)!; + const hasOutput = fs.existsSync(session.outputPath); + + console.log(`\nSession ${session.id}`); + console.log(` Prompt: ${session.prompt}`); + console.log(` Agent: ${session.provider}`); + console.log(` Format: ${session.format.toUpperCase()}`); + console.log(` Created: ${new Date(session.createdAt).toLocaleString()}`); + console.log( + ` Output: ${hasOutput ? 
session.outputPath : '(not yet written)'}` + ); + + const actions = [ + { name: 'Resume this session', value: 'resume' }, + ...(hasOutput + ? [{ name: `Open ${session.outputPath.split('/').pop()}`, value: 'open' }] + : []), + { name: 'Open session folder', value: 'folder' }, + { name: 'Back', value: 'back' }, + ]; + + const action = await select({ message: 'Action', choices: actions }); + + if (action === 'resume') { + await runInteractiveAgent({ session: session.id }); + } else if (action === 'open') { + const { execSync } = await import('child_process'); + try { + execSync(`open "${session.outputPath}"`); + } catch { + console.log(session.outputPath); + } + } else if (action === 'folder') { + const { execSync } = await import('child_process'); + const { getSessionDir } = await import('../utils/acp'); + try { + execSync(`open "${getSessionDir(session.id)}"`); + } catch { + console.log(getSessionDir(session.id)); + } + } +} + +function timeAgo(date: Date): string { + const secs = Math.round((Date.now() - date.getTime()) / 1000); + if (secs < 60) return 'just now'; + const mins = Math.floor(secs / 60); + if (mins < 60) return `${mins}m ago`; + const hrs = Math.floor(mins / 60); + if (hrs < 24) return `${hrs}h ago`; + const days = Math.floor(hrs / 24); + return `${days}d ago`; +} + +// ─── Session end ──────────────────────────────────────────────────────────── + +async function showSessionEnd( + sessionId: string, + outputPath: string, + sessionDir: string +): Promise { + const { select } = await import('@inquirer/prompts'); + const fs = await import('fs'); + const { execSync } = await import('child_process'); + + console.log(`\nSession ${sessionId} saved.`); + + // Build choices — only show output file + session folder, not internals + const choices: Array<{ name: string; value: string }> = []; + + if (fs.existsSync(outputPath)) { + const stat = fs.statSync(outputPath); + const size = + stat.size > 1024 ? 
/**
 * Post-session menu: offers to open the output file (only if it exists) or
 * the session folder with the platform's default handler.
 *
 * @param sessionId - Id shown in the "saved" message.
 * @param outputPath - Path of the session's output file.
 * @param sessionDir - Path of the session's directory.
 */
async function showSessionEnd(
  sessionId: string,
  outputPath: string,
  sessionDir: string
): Promise<void> {
  const { select } = await import('@inquirer/prompts');
  const fs = await import('fs');
  const nodePath = await import('path');
  const { execFileSync } = await import('child_process');

  console.log(`\nSession ${sessionId} saved.`);

  // Build choices — only show output file + session folder, not internals
  const choices: Array<{ name: string; value: string }> = [];

  if (fs.existsSync(outputPath)) {
    const { size } = fs.statSync(outputPath);
    const label = size >= 1024 ? `${Math.round(size / 1024)}KB` : `${size}B`;
    const basename = nodePath.basename(outputPath) || 'output';
    choices.push({ name: `Open ${basename} (${label})`, value: 'file' });
  }
  choices.push({ name: 'Open session folder', value: 'folder' });
  choices.push({ name: 'Done', value: 'done' });

  const action = await select({
    message: 'What next?',
    choices,
  });

  if (action === 'done') return;

  const target = action === 'file' ? outputPath : sessionDir;

  // Launch without a shell so unusual characters in the path are harmless.
  try {
    if (process.platform === 'darwin') {
      execFileSync('open', [target], { stdio: 'ignore' });
    } else if (process.platform === 'linux') {
      execFileSync('xdg-open', [target], { stdio: 'ignore' });
    } else if (process.platform === 'win32') {
      // `start` is a cmd builtin; the empty string is its window-title slot.
      execFileSync('cmd', ['/c', 'start', '', target], { stdio: 'ignore' });
    } else {
      console.log(`Path: ${target}`);
    }
  } catch {
    console.log(`Path: ${target}`);
  }
}

/**
 * Run one conversation with an ACP agent: connect, send `firstMessage`, then
 * keep prompting the user for follow-ups until they enter nothing / "done" /
 * "exit" / "quit", or the agent stops for a reason other than `end_turn`.
 *
 * Shared by the new-session and resume flows so both get the same spinner,
 * Ctrl+C handling and teardown. On error the TUI is torn down and the agent
 * closed before the process exits with status 1.
 */
async function runAgentConversation(params: {
  agentBin: string;
  systemPrompt: string;
  firstMessage: string;
  tui: ReturnType<typeof startTUI>;
  session: { id: string; outputPath: string };
}): Promise<void> {
  const { input } = await import('@inquirer/prompts');
  const { agentBin, systemPrompt, firstMessage, tui, session } = params;

  let agent: Awaited<ReturnType<typeof connectToAgent>> | null = null;
  let failed = false;
  let failure: unknown;

  // Handle Ctrl+C gracefully
  const handleInterrupt = (): void => {
    tui.cleanup();
    process.stderr.write('\nInterrupted.\n');
    if (agent) {
      agent.cancel().catch(() => {});
      agent.close();
    }
    process.exit(0);
  };
  process.on('SIGINT', handleInterrupt);

  try {
    agent = await connectToAgent({
      bin: agentBin,
      systemPrompt,
      callbacks: {
        onText: (text) => tui.onText(text),
        onToolCall: (call) => tui.onToolCall(call),
        onToolCallUpdate: (call) => tui.onToolCallUpdate(call),
        onUsage: (update) => tui.onUsage(update),
      },
    });

    tui.startWorking();
    let currentMessage = firstMessage;
    while (true) {
      const result = await agent.prompt(currentMessage);

      // Unmount TUI for user input
      tui.pause();
      process.stdout.write('\n');

      // If the agent stopped for a reason other than end_turn, stop here
      if (result.stopReason !== 'end_turn') {
        tui.printSummary();
        break;
      }

      const followUp = await input({
        message: '→',
        default: '',
      });

      const trimmed = followUp.trim().toLowerCase();
      if (!trimmed || ['done', 'exit', 'quit'].includes(trimmed)) {
        tui.printSummary();
        await showSessionEnd(
          session.id,
          session.outputPath,
          getSessionDir(session.id)
        );
        break;
      }

      // Remount TUI for next turn — show spinner immediately
      tui.resume();
      tui.startWorking();
      currentMessage = followUp;
    }
  } catch (error) {
    // Defer reporting until teardown has run; exiting here would skip `finally`.
    failed = true;
    failure = error;
  } finally {
    tui.cleanup();
    process.removeListener('SIGINT', handleInterrupt);
    if (agent) agent.close();
  }

  if (failed) {
    console.error(
      '\nError:',
      failure instanceof Error ? failure.message : failure
    );
    process.exit(1);
  }
}

/**
 * Interactive agent entry point.
 *
 * With `options.session` set, resumes that session: asks for a refinement and
 * continues the conversation with the session's original provider. Otherwise
 * shows the banner, lets the user pick an installed ACP agent, gathers the
 * prompt / seed URLs / output format, creates a session and starts the
 * conversation.
 *
 * Exits the process with status 1 when the session or agent cannot be found.
 *
 * NOTE(review): `options.yes` is accepted but not used anywhere in this
 * function — confirm whether it should be forwarded to `connectToAgent`.
 */
export async function runInteractiveAgent(options: {
  provider?: string;
  session?: string;
  format?: string;
  yes?: boolean;
}): Promise<void> {
  const { input, select } = await import('@inquirer/prompts');

  // ── Resume session ──────────────────────────────────────────────────────
  if (options.session) {
    const session = loadSession(options.session);
    if (!session) {
      console.error(`Session not found: ${options.session}`);
      process.exit(1);
    }

    // Check the agent first, so we fail before asking the user to type a
    // refinement and before the iteration counter is bumped.
    const resumeAgent = detectAgents().find(
      (a) => a.name === session.provider && a.available
    );
    if (!resumeAgent) {
      console.error(
        `Agent "${session.provider}" is not available. Install it first.`
      );
      process.exit(1);
    }

    console.log(`\nResuming session ${session.id}`);
    console.log(` Provider: ${session.provider}`);
    console.log(` Prompt: ${session.prompt}`);
    console.log(` Format: ${session.format}`);
    console.log(` Iterations: ${session.iterations}\n`);

    const refinement = await input({
      message: 'What would you like to refine or add?',
    });

    updateSession(session.id, {
      iterations: session.iterations + 1,
    });

    const resumeDir = getSessionDir(session.id);
    const resumePrompt = buildSystemPrompt({
      format: session.format,
      sessionDir: resumeDir,
    });

    const resumeMessage = `Continue from previous session. Original request: "${session.prompt}". Schema fields: ${session.schema.join(', ')}. Output already at: ${session.outputPath}. New instruction: ${refinement}`;

    console.log(`\n🔥 ${bold('Firecrawl Agent')} — Resuming session`);
    console.log(dim(' Press Ctrl+C to cancel\n'));

    const resumeTui = startTUI({
      sessionId: session.id,
      agentName: resumeAgent.displayName,
      format: session.format,
      sessionDir: resumeDir,
    });

    await runAgentConversation({
      agentBin: resumeAgent.bin,
      systemPrompt: resumePrompt,
      firstMessage: resumeMessage,
      tui: resumeTui,
      session,
    });
    return;
  }

  // ── Banner ──────────────────────────────────────────────────────────────
  // The banner goes to stdout via console.log, so gate colour on stdout.
  // NOTE(review): the reviewed text had runs of whitespace collapsed; confirm
  // the art's inner spacing against the committed file.
  const orange = (s: string) =>
    process.stdout.isTTY ? `\x1b[38;5;208m${s}\x1b[0m` : s;
  console.log(
    orange(`
  ███████╗██╗██████╗ ███████╗ ██████╗██████╗ █████╗ ██╗ ██╗██╗
  ██╔════╝██║██╔══██╗██╔════╝██╔════╝██╔══██╗██╔══██╗██║ ██║██║
  █████╗ ██║██████╔╝█████╗ ██║ ██████╔╝███████║██║ █╗ ██║██║
  ██╔══╝ ██║██╔══██╗██╔══╝ ██║ ██╔══██╗██╔══██║██║███╗██║██║
  ██║ ██║██║ ██║███████╗╚██████╗██║ ██║██║ ██║╚███╔███╔╝███████╗
  ╚═╝ ╚═╝╚═╝ ╚═╝╚══════╝ ╚═════╝╚═╝ ╚═╝╚═╝ ╚═╝ ╚══╝╚══╝ ╚══════╝
  █████╗ ██████╗ ███████╗███╗ ██╗████████╗
  ██╔══██╗██╔════╝ ██╔════╝████╗ ██║╚══██╔══╝
  ███████║██║ ███╗█████╗ ██╔██╗ ██║ ██║
  ██╔══██║██║ ██║██╔══╝ ██║╚██╗██║ ██║
  ██║ ██║╚██████╔╝███████╗██║ ╚████║ ██║
  ╚═╝ ╚═╝ ╚═════╝ ╚══════╝╚═╝ ╚═══╝ ╚═╝`)
  );
  console.log('');

  // ── Detect agents ───────────────────────────────────────────────────────
  const agents = detectAgents();
  const available = agents.filter((a) => a.available);

  if (available.length === 0) {
    // Check if raw CLIs are installed (but ACP adapters aren't)
    const { execFileSync } = await import('child_process');
    const lookup = process.platform === 'win32' ? 'where' : 'which';
    const isOnPath = (command: string): boolean => {
      try {
        execFileSync(lookup, [command], { stdio: 'ignore' });
        return true;
      } catch {
        return false;
      }
    };
    const hasClaude = isOnPath('claude');
    const hasCodex = isOnPath('codex');

    if (hasClaude || hasCodex) {
      console.error('\nNo ACP adapters found, but you have:');
      if (hasClaude)
        console.error(
          ' ✓ Claude Code (claude) — install adapter: npm install -g @zed-industries/claude-agent-acp'
        );
      if (hasCodex)
        console.error(
          ' ✓ Codex (codex) — install adapter: npm install -g @zed-industries/codex-acp'
        );
      console.error('');
    } else {
      console.error(
        '\nNo ACP-compatible agents found. Install one of:\n' +
          ' npm install -g @zed-industries/claude-agent-acp (Claude Code)\n' +
          ' npm install -g @zed-industries/codex-acp (Codex)\n' +
          ' See https://agentclientprotocol.com/get-started/agents\n'
      );
    }
    process.exit(1);
  }

  // ── Select agent ────────────────────────────────────────────────────────
  let selectedAgent: ACPAgent;
  const prefs = loadPreferences();

  if (options.provider) {
    const match = available.find((a) => a.name === options.provider);
    if (!match) {
      console.error(
        `Agent "${options.provider}" is not installed. Available: ${available.map((a) => a.name).join(', ')}`
      );
      process.exit(1);
    }
    selectedAgent = match;
  } else {
    // Always show picker — default to the last-used agent, but only if it is
    // still installed; otherwise fall back to the first available one.
    const remembered = available.find((a) => a.name === prefs.defaultAgent);
    const defaultAgent = (remembered ?? available[0]).name;
    const installedChoices = available.map((a) => ({
      name: a.displayName,
      value: a.name,
      disabled: false as const,
    }));
    const notInstalled = agents.filter((a) => !a.available);
    const agentChoices = [
      ...installedChoices,
      ...(notInstalled.length > 0
        ? [
            {
              name: '─── Not installed ───',
              value: '_sep',
              disabled: 'separator' as const,
            },
            ...notInstalled.map((a) => ({
              name: a.displayName,
              value: a.name,
              disabled: 'not installed' as const,
            })),
          ]
        : []),
    ];

    const chosen = await select({
      message: 'Harness',
      choices: agentChoices,
      default: defaultAgent,
    });

    const picked = available.find((a) => a.name === chosen);
    if (!picked) {
      console.error(`Agent "${chosen}" is not installed.`);
      process.exit(1);
    }
    selectedAgent = picked;
    savePreferences({ defaultAgent: selectedAgent.name });
  }

  // ── Gather prompt ───────────────────────────────────────────────────────
  let prompt = await input({
    message: 'What data do you want to gather?',
    default: '',
  });

  // If empty, show suggestions to pick from
  if (!prompt.trim()) {
    prompt = await select({
      message: 'Pick an example:',
      choices: SUGGESTIONS.map((s) => ({ name: s.name, value: s.value })),
    });
  }

  // ── Seed URLs ───────────────────────────────────────────────────────────
  const urls = await input({
    message:
      'Any URLs to start from? (comma-separated, leave blank to auto-discover)',
    default: '',
  });

  // ── Output format ───────────────────────────────────────────────────────
  const format =
    options.format ||
    (await select({
      message: 'Output format?',
      choices: [
        { name: 'CSV (spreadsheet-ready)', value: 'csv' },
        { name: 'JSON (structured, API-ready)', value: 'json' },
        { name: 'Markdown table (human-readable)', value: 'report' },
      ],
    }));

  // ── Create session ──────────────────────────────────────────────────────
  const session = createSession({
    provider: selectedAgent.name,
    prompt,
    schema: [],
    format,
  });
  const sessionDir = getSessionDir(session.id);

  // ── Build message ───────────────────────────────────────────────────────
  const systemPrompt = buildSystemPrompt({ format, sessionDir });

  const parts = [`Gather data: ${prompt}`];
  if (urls.trim()) parts.push(`Start from these URLs: ${urls}`);
  const userMessage = parts.join('. ') + '.';

  console.log(
    `\n ${selectedAgent.displayName} · ${format.toUpperCase()} · Session ${session.id}`
  );
  console.log(dim(` Press Ctrl+C to cancel · type "done" to finish\n`));

  // Start TUI
  const tui = startTUI({
    sessionId: session.id,
    agentName: selectedAgent.displayName,
    format,
    sessionDir,
  });

  await runAgentConversation({
    agentBin: selectedAgent.bin,
    systemPrompt,
    firstMessage: userMessage,
    tui,
    session,
  });
}
/**
 * Run an ACP agent headlessly with a prompt.
 *
 * Creates a session, writes the worker arguments to
 * `<sessionDir>/worker-args.json`, spawns a detached copy of this file in
 * worker mode, and prints the output and log locations so callers (other
 * agents, scripts) know where to look. Does not wait for the agent to finish.
 *
 * Agent choice: `opts.provider` if given (an error if it is not installed),
 * otherwise the saved default agent when installed, otherwise the first
 * available agent. Exits with status 1 when no suitable agent exists.
 */
export async function runHeadlessAgent(opts: {
  prompt: string;
  format?: string;
  provider?: string;
}): Promise<void> {
  const available = detectAgents().filter((a) => a.available);

  if (available.length === 0) {
    console.error('No ACP agents found.');
    process.exit(1);
  }

  // Pick agent: flag > preference > first available. An explicit flag is never
  // silently replaced by a different agent.
  let selectedAgent = available[0];
  if (opts.provider) {
    const requested = available.find((a) => a.name === opts.provider);
    if (!requested) {
      console.error(
        `Agent "${opts.provider}" is not installed. Available: ${available.map((a) => a.name).join(', ')}`
      );
      process.exit(1);
    }
    selectedAgent = requested;
  } else {
    const preferred = loadPreferences().defaultAgent;
    selectedAgent =
      available.find((a) => a.name === preferred) ?? available[0];
  }
  const format = opts.format || 'json';

  const session = createSession({
    provider: selectedAgent.name,
    prompt: opts.prompt,
    schema: [],
    format,
  });

  const sessionDir = getSessionDir(session.id);
  const systemPrompt = buildSystemPrompt({ format, sessionDir });
  const userMessage = `Gather data: ${opts.prompt}.`;

  // Run in background — spawn detached process
  const { spawn } = await import('child_process');
  const fs = await import('fs');
  const nodePath = await import('path');

  // Write args to a file since system prompts can be huge
  const argsPath = nodePath.join(sessionDir, 'worker-args.json');
  const logPath = nodePath.join(sessionDir, 'agent.log');
  fs.writeFileSync(
    argsPath,
    JSON.stringify({
      sessionId: session.id,
      agentBin: selectedAgent.bin,
      systemPrompt,
      userMessage,
    })
  );

  // Spawn a detached node process that runs the worker
  const child = spawn(
    process.execPath, // node or tsx
    [
      ...process.execArgv, // preserve tsx loader flags
      __filename,
      '__headless_worker__',
      argsPath,
    ],
    {
      detached: true,
      stdio: 'ignore',
      env: { ...process.env },
    }
  );
  // Without a listener, a spawn failure would surface as an uncaught 'error'.
  child.on('error', (error) => {
    console.error(`Failed to start background worker: ${error.message}`);
  });
  child.unref();

  console.log(
    `Running with ${selectedAgent.displayName} (session ${session.id})`
  );
  console.log(`Output: ${session.outputPath}`);
  console.log(`Log: ${logPath}`);
  console.log(`\nTail the log: tail -f ${logPath}`);
}

// ─── Background worker (called by forked process) ──────────────────────────

/**
 * Worker body: connect to the agent, send one prompt, and record progress in
 * `<sessionDir>/agent.log`.
 *
 * Rejects with the original error after logging it, so the caller can turn a
 * failure into a non-zero exit code. The agent connection is always closed.
 */
async function _runHeadlessWorker(
  sessionId: string,
  agentBin: string,
  systemPrompt: string,
  userMessage: string
): Promise<void> {
  const fs = await import('fs');
  const nodePath = await import('path');
  const logPath = nodePath.join(getSessionDir(sessionId), 'agent.log');

  // Append a line to the agent log
  const log = (line: string): void => {
    fs.appendFileSync(logPath, line + '\n');
  };
  const stamp = (): string => `[${new Date().toISOString()}]`;

  log(`${stamp()} Session started`);
  log(`[agent] ${agentBin}`);
  log(`[prompt] ${userMessage}`);
  log('');

  let agent: Awaited<ReturnType<typeof connectToAgent>> | null = null;
  try {
    agent = await connectToAgent({
      bin: agentBin,
      systemPrompt,
      callbacks: {
        onText: (text) => {
          // Agent text is streamed to the log verbatim (no added newline).
          fs.appendFileSync(logPath, text);
        },
        onToolCall: (call) => {
          const input = call.rawInput as Record<string, unknown> | undefined;
          const cmd = input?.command;
          if (typeof cmd === 'string' && cmd) {
            log(`\n[tool] ${call.title}: ${cmd.slice(0, 200)}`);
          } else {
            log(`\n[tool] ${call.title}`);
          }
        },
        onToolCallUpdate: (call) => {
          if (call.status === 'completed') {
            log(`[done] ${call.title || call.id}`);
          } else if (call.status === 'failed' || call.status === 'errored') {
            // ACP reports failures as 'failed'; 'errored' is kept for safety.
            log(`[fail] ${call.title || call.id}`);
          }
        },
      },
    });

    await agent.prompt(userMessage);
    updateSession(sessionId, { iterations: 1 });
    log(`\n${stamp()} Completed`);
  } catch (error) {
    log(
      `\n${stamp()} Failed: ${error instanceof Error ? error.message : String(error)}`
    );
    throw error;
  } finally {
    if (agent) agent.close();
  }
}

// If this file is run as a background worker: read the args file named in
// argv[3], delete it, run the worker, and exit 0 on success / 1 on any failure
// (including a missing or malformed args file).
if (process.argv[2] === '__headless_worker__') {
  void (async () => {
    try {
      const argsPath = process.argv[3];
      if (!argsPath) throw new Error('Missing worker args path');

      const fs = await import('fs');
      const parsed: unknown = JSON.parse(fs.readFileSync(argsPath, 'utf-8'));
      // Clean up args file (best-effort; the worker can run without this)
      try {
        fs.unlinkSync(argsPath);
      } catch {
        // ignore
      }

      const { sessionId, agentBin, systemPrompt, userMessage } =
        parsed as Record<string, unknown>;
      if (
        typeof sessionId !== 'string' ||
        typeof agentBin !== 'string' ||
        typeof systemPrompt !== 'string' ||
        typeof userMessage !== 'string'
      ) {
        throw new Error('Malformed worker args');
      }

      await _runHeadlessWorker(sessionId, agentBin, systemPrompt, userMessage);
      process.exit(0);
    } catch {
      // stdio is detached ('ignore'), so the exit code is the only signal.
      process.exit(1);
    }
  })();
}
print JSON output', false) + .option('-y, --yes', 'Auto-approve all tool permissions', false) + .option( + '--api', + 'Use Firecrawl API agent instead of local ACP agent', + false + ) .action(async (promptOrJobId, options) => { + // List past sessions + if (options.list) { + const { listAgentSessions } = + await import('./commands/agent-interactive'); + await listAgentSessions(); + return; + } + + // Interactive mode: no prompt, or -i flag, or --session + const isInteractive = + !promptOrJobId || options.interactive || options.session; + + if (isInteractive) { + await runInteractiveAgent({ + provider: options.provider, + session: options.session, + format: options.format, + yes: options.yes, + }); + return; + } + // Auto-detect if it's a job ID (UUID format) const isStatusCheck = options.status || isJobId(promptOrJobId); + // Headless ACP mode: prompt provided, not a job ID, not --api + if (!isStatusCheck && !options.api) { + await runHeadlessAgent({ + prompt: promptOrJobId, + format: options.format || 'json', + provider: options.provider, + }); + return; + } + // Parse URLs let urls: string[] | undefined; if (options.urls) { diff --git a/src/utils/acp.ts b/src/utils/acp.ts new file mode 100644 index 0000000..9dbbfbf --- /dev/null +++ b/src/utils/acp.ts @@ -0,0 +1,153 @@ +/** + * Session management for Firecrawl Agent. + * + * Manages persistent sessions in ~/.firecrawl/sessions/. 
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';

// ─── Types ──────────────────────────────────────────────────────────────────

/** Metadata persisted as `session.json` inside each session directory. */
export interface Session {
  id: string;
  provider: string;
  prompt: string;
  schema: string[];
  format: string;
  outputPath: string;
  createdAt: string;
  updatedAt: string;
  iterations: number;
}

// ─── Sessions directory ─────────────────────────────────────────────────────

/** `~/.firecrawl` — root for preferences and sessions. */
function getFirecrawlDir(): string {
  return path.join(os.homedir(), '.firecrawl');
}

/** `~/.firecrawl/sessions` — one sub-directory per session id. */
function getSessionsDir(): string {
  return path.join(getFirecrawlDir(), 'sessions');
}

/** Create the sessions directory (and parents) if it does not exist yet. */
function ensureSessionsDir(): void {
  fs.mkdirSync(getSessionsDir(), { recursive: true });
}

/** True for non-null, non-array objects. */
function isPlainObject(value: unknown): value is Record<string, unknown> {
  return typeof value === 'object' && value !== null && !Array.isArray(value);
}

// ─── Preferences ────────────────────────────────────────────────────────────

interface Preferences {
  defaultAgent?: string;
  defaultFormat?: string;
}

function getPrefsPath(): string {
  return path.join(getFirecrawlDir(), 'preferences.json');
}

/**
 * Read `~/.firecrawl/preferences.json`.
 *
 * @returns The stored preferences, or `{}` when the file is missing,
 *   unreadable, not valid JSON, or not a JSON object.
 */
export function loadPreferences(): Preferences {
  try {
    const parsed: unknown = JSON.parse(fs.readFileSync(getPrefsPath(), 'utf-8'));
    return isPlainObject(parsed) ? (parsed as Preferences) : {};
  } catch {
    return {};
  }
}

/** Merge `patch` into the stored preferences and write them back. */
export function savePreferences(patch: Partial<Preferences>): void {
  fs.mkdirSync(getFirecrawlDir(), { recursive: true });
  const updated = { ...loadPreferences(), ...patch };
  fs.writeFileSync(getPrefsPath(), JSON.stringify(updated, null, 2));
}

// ─── Session ID ─────────────────────────────────────────────────────────────

const ID_ALPHABET = 'abcdefghijklmnopqrstuvwxyz0123456789';
const ID_LENGTH = 8;

/** Random 8-character lowercase alphanumeric id (not cryptographic). */
function generateId(): string {
  let id = '';
  for (let i = 0; i < ID_LENGTH; i++) {
    id += ID_ALPHABET[Math.floor(Math.random() * ID_ALPHABET.length)];
  }
  return id;
}

/**
 * Session ids come from the command line (`--session <id>`) and are joined
 * into filesystem paths, so reject anything that could leave the sessions
 * directory (`..`, path separators, empty string).
 */
function isValidSessionId(id: string): boolean {
  return /^[A-Za-z0-9_-]+$/.test(id);
}

/** Structural check for a parsed `session.json`. */
function isSession(value: unknown): value is Session {
  if (!isPlainObject(value)) return false;
  const stringFields = [
    'id',
    'provider',
    'prompt',
    'format',
    'outputPath',
    'createdAt',
    'updatedAt',
  ];
  return (
    stringFields.every((key) => typeof value[key] === 'string') &&
    Array.isArray(value.schema) &&
    typeof value.iterations === 'number'
  );
}

/**
 * Reserve a fresh session directory. `mkdirSync` without `recursive` fails
 * with EEXIST when the id is already taken, so a collision triggers a retry
 * instead of silently overwriting an existing session.
 */
function allocateSessionDir(): { id: string; dir: string } {
  for (let attempt = 0; attempt < 10; attempt++) {
    const id = generateId();
    const dir = path.join(getSessionsDir(), id);
    try {
      fs.mkdirSync(dir);
      return { id, dir };
    } catch (error) {
      if ((error as { code?: string }).code !== 'EEXIST') throw error;
    }
  }
  throw new Error('Could not allocate a unique session id');
}

// ─── Session CRUD ───────────────────────────────────────────────────────────

/**
 * Create a new session directory with its `session.json`.
 *
 * The output file is `output.csv` for format `csv`, `output.json` for `json`,
 * and `output.md` for anything else. The output file itself is not created.
 *
 * @returns The persisted session record.
 */
export function createSession(opts: {
  provider: string;
  prompt: string;
  schema: string[];
  format: string;
}): Session {
  ensureSessionsDir();

  const { id, dir: sessionDir } = allocateSessionDir();

  const ext =
    opts.format === 'csv' ? 'csv' : opts.format === 'json' ? 'json' : 'md';
  const outputPath = path.join(sessionDir, `output.${ext}`);

  // One clock read so createdAt and updatedAt start out identical.
  const now = new Date().toISOString();
  const session: Session = {
    id,
    provider: opts.provider,
    prompt: opts.prompt,
    schema: opts.schema,
    format: opts.format,
    outputPath,
    createdAt: now,
    updatedAt: now,
    iterations: 0,
  };

  fs.writeFileSync(
    path.join(sessionDir, 'session.json'),
    JSON.stringify(session, null, 2)
  );

  return session;
}

/**
 * Load a session by id.
 *
 * @returns The session, or `null` when the id is not a valid session id, the
 *   file is missing or unreadable, or its contents are not a session record.
 */
export function loadSession(id: string): Session | null {
  if (!isValidSessionId(id)) return null;

  const sessionFile = path.join(getSessionsDir(), id, 'session.json');
  try {
    const parsed: unknown = JSON.parse(fs.readFileSync(sessionFile, 'utf-8'));
    return isSession(parsed) ? parsed : null;
  } catch {
    return null;
  }
}

/** All loadable sessions, most recently updated first. */
export function listSessions(): Session[] {
  const dir = getSessionsDir();
  if (!fs.existsSync(dir)) return [];

  return fs
    .readdirSync(dir)
    .map((name) => loadSession(name))
    .filter((s): s is Session => s !== null)
    .sort((a, b) => b.updatedAt.localeCompare(a.updatedAt));
}

/**
 * Merge `patch` into a stored session and refresh `updatedAt`.
 * Does nothing when the session cannot be loaded. The session's `id` is never
 * changed, even if `patch` contains one.
 */
export function updateSession(id: string, patch: Partial<Session>): void {
  const session = loadSession(id);
  if (!session) return;

  const updated: Session = {
    ...session,
    ...patch,
    id: session.id,
    updatedAt: new Date().toISOString(),
  };
  const sessionFile = path.join(getSessionsDir(), id, 'session.json');
  fs.writeFileSync(sessionFile, JSON.stringify(updated, null, 2));
}

/** Path of the directory for session `id` (not checked for existence). */
export function getSessionDir(id: string): string {
  return path.join(getSessionsDir(), id);
}