From 492600b11513ce0b089b7e7b722dec3373484d2c Mon Sep 17 00:00:00 2001 From: MMeteorL Date: Mon, 25 May 2026 10:26:19 -0700 Subject: [PATCH 01/10] Three-tier populate agent: triage-extract + investigate subagents (#84) Co-Authored-By: Claude Sonnet 4.6 --- backend/package-lock.json | 39 +- backend/package.json | 2 +- backend/src/env.ts | 5 + backend/src/index.ts | 25 +- backend/src/mastra/agents/investigate.ts | 90 ++- backend/src/mastra/agents/populate.ts | 110 ++- backend/src/mastra/agents/triage-extract.ts | 121 ++++ backend/src/mastra/tools/investigate-tool.ts | 689 ++++++++++++++++--- backend/src/mastra/tools/web-tools.ts | 2 + backend/src/mastra/workflows/populate.ts | 6 +- frontend/convex/datasetRows.ts | 42 ++ 11 files changed, 961 insertions(+), 170 deletions(-) create mode 100644 backend/src/mastra/agents/triage-extract.ts diff --git a/backend/package-lock.json b/backend/package-lock.json index 597bdbd..9728f59 100644 --- a/backend/package-lock.json +++ b/backend/package-lock.json @@ -25,7 +25,7 @@ "@types/node": "^22.0.0", "mastra": "^1.10.0", "tsx": "^4.0.0", - "typescript": "^5.0.0" + "typescript": "^5.8.3" } }, "node_modules/@a2a-js/sdk": { @@ -207,7 +207,6 @@ "integrity": "sha512-CGOfOJqWjg2qW/Mb6zNsDm+u5vFQ8DxXfbM09z69p5Z6+mE1ikP2jUXw+j42Pf1XTYED2Rni5f95npYeuwMDQA==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "@babel/code-frame": "^7.29.0", "@babel/generator": "^7.29.0", @@ -695,16 +694,16 @@ } }, "node_modules/@clerk/shared": { - "version": "4.12.2", - "resolved": "https://registry.npmjs.org/@clerk/shared/-/shared-4.12.2.tgz", - "integrity": "sha512-jDkip8tKTzYz/cPKMCsjOoACH3Xh37zcbCrssMRTYOq3GZypIpZ6WAs4m4G82URL0WY+yz5frrHVjRrHyAb6LA==", + "version": "4.13.1", + "resolved": "https://registry.npmjs.org/@clerk/shared/-/shared-4.13.1.tgz", + "integrity": "sha512-DyUtvNHgMmqjtTM0q285jKaAXUmCDSyItiGQTt1dNL0M6DZ3bxqsJz7wXPjh9zezmU4BAnLpwhj5gsM3OuNPzA==", "hasInstallScript": true, "license": "MIT", "dependencies": { "@tanstack/query-core": "^5.100.6", "dequal": "2.0.3", "glob-to-regexp": "0.4.1", - "js-cookie": "3.0.5", + "js-cookie": "3.0.7", "std-env": "^3.9.0" }, "engines": { @@ -1329,7 +1328,6 @@ "resolved": "https://registry.npmjs.org/@hono/node-server/-/node-server-1.19.14.tgz", "integrity": "sha512-GwtvgtXxnWsucXvbQXkRgqksiH2Qed37H9xHZocE5sA3N8O8O8/8FA3uclQXxXVzc9XBZuEOMK7+r02FmSpHtw==", "license": "MIT", - "peer": true, "engines": { "node": ">=18.14.1" }, @@ -1457,7 +1455,6 @@ "resolved": "https://registry.npmjs.org/@mastra/core/-/core-1.36.0.tgz", "integrity": "sha512-BEhDZPQeDcJ6jQRHtpfFLuoRiWAuv9dTCIjeWbXokzwDamI3D9jkyNzpBFJwFwy2S/a4jBTu4+d61nOaP7knTQ==", "license": "Apache-2.0", - "peer": true, "dependencies": { "@a2a-js/sdk": "~0.3.13", "@ai-sdk/provider-utils-v5": "npm:@ai-sdk/provider-utils@3.0.25", @@ -2537,8 +2534,7 @@ "version": "1.1.0", "resolved": "https://registry.npmjs.org/@standard-schema/spec/-/spec-1.1.0.tgz", "integrity": "sha512-l2aFy5jALhniG5HgqrD6jXLi/rUWrKvqN/qJx6yoJsgKhblVd+iqqU4RCXavm/jPityDo5TCvKMnpjKnOriy0w==", - "license": "MIT", - "peer": true + "license": "MIT" }, "node_modules/@tanstack/query-core": { "version": "5.100.11", @@ -2693,7 +2689,6 @@ "resolved": "https://registry.npmjs.org/ai/-/ai-6.0.185.tgz", "integrity": "sha512-oGsqscREaTlo75KHZLtwZxRyI+ZBwHV2wRX9B8smHjgOs13WwoCvUyr5aPUWpIBRz406wmIKy1RzoUEq0/WKJw==", "license": "Apache-2.0", - "peer": true, "dependencies": { "@ai-sdk/gateway": "3.0.116", "@ai-sdk/provider": "3.0.10", @@ -2983,7 +2978,6 @@ "integrity": "sha512-HdUm8EMQBLaJvGUdidNNbqpA1kYkwNcb+MYxkxCLAPJGQzlv9J0C24h8V65Z4c5GLd/JEALDvpFCQgpLJqc0zw==", "dev": true, "license": "Apache-2.0", - "peer": true, "peerDependencies": { "bare-abort-controller": "*" }, @@ -3199,7 +3193,6 @@ } ], "license": "MIT", - "peer": true, "dependencies": { "baseline-browser-mapping": "^2.10.12", "caniuse-lite": "^1.0.30001782", @@ -4548,7 +4541,6 @@ "dev": true, "hasInstallScript": true, "license": "MIT", - "peer": true, "bin": { "esbuild": "bin/esbuild" }, @@ -4723,7 +4715,6 @@ "resolved": "https://registry.npmjs.org/express/-/express-5.2.1.tgz", "integrity": "sha512-hIS4idWWai69NezIdRt2xFVofaF4j+6INOpJlVOLDO8zXGpUVEVzIYk12UUi2JzjEzWL3IOAxcTubgz9Po0yXw==", "license": "MIT", - "peer": true, "dependencies": { "accepts": "^2.0.0", "body-parser": "^2.2.1", @@ -5381,7 +5372,6 @@ "resolved": "https://registry.npmjs.org/hono/-/hono-4.12.21.tgz", "integrity": "sha512-uV63apnb0kyPtAUwoWgaGh9HyIFcv8lgmzPZSiTBQAFOFGIzka5EZ1dZocmGnn0XdX0+XTqJ6Tqv7selMuGLRQ==", "license": "MIT", - "peer": true, "engines": { "node": ">=16.9.0" } @@ -5722,12 +5712,12 @@ } }, "node_modules/js-cookie": { - "version": "3.0.5", - "resolved": "https://registry.npmjs.org/js-cookie/-/js-cookie-3.0.5.tgz", - "integrity": "sha512-cEiJEAEoIbWfCZYKWhVwFuvPX1gETRYPw6LlaTKoxD3s2AkXzkCjnp6h0V77ozyqj0jakteJ4YqDJT830+lVGw==", + "version": "3.0.7", + "resolved": "https://registry.npmjs.org/js-cookie/-/js-cookie-3.0.7.tgz", + "integrity": "sha512-z/wZZgDrkNV1eA0ULjM/F9/50Ya8fbzgKneSpoPsXSGd0KnpdtHfOZWK+GcwLk+EZbS4F9RBhU+K2RgzuDaItw==", "license": "MIT", "engines": { - "node": ">=14" + "node": ">=20" } }, "node_modules/js-tokens": { @@ -7860,7 +7850,6 @@ "integrity": "sha512-WHeFSbZYsPu3+bLoNRUuAO+wavNlocOPf3wSHTP7hcFKVnJeWsYlCDbr3mTS14FCizf9ccIxXA8sGL8zKeQN3g==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "@types/estree": "1.0.8" }, @@ -9298,12 +9287,11 @@ } }, "node_modules/typescript": { - "version": "5.9.3", - "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.9.3.tgz", - "integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==", + "version": "5.8.3", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.8.3.tgz", + "integrity": "sha512-p1diW6TqL9L07nNxvRMM7hMMw4c5XOo/1ibL4aAIGmSAt9slTE1Xgw5KWuof2uTOvCg9BY7ZRi+GaF+7sfgPeQ==", "dev": true, "license": "Apache-2.0", - "peer": true, "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" @@ -9790,7 +9778,6 @@ "resolved": "https://registry.npmjs.org/zod/-/zod-4.4.3.tgz", "integrity": "sha512-ytENFjIJFl2UwYglde2jchW2Hwm4GJFLDiSXWdTrJQBIN9Fcyp7n4DhxJEiWNAJMV1/BqWfW/kkg71UDcHJyTQ==", "license": "MIT", - "peer": true, "funding": { "url": "https://github.com/sponsors/colinhacks" } diff --git a/backend/package.json b/backend/package.json index daf0ad5..3e83620 100644 --- a/backend/package.json +++ b/backend/package.json @@ -27,6 +27,6 @@ "@types/node": "^22.0.0", "mastra": "^1.10.0", "tsx": "^4.0.0", - "typescript": "^5.0.0" + "typescript": "^5.8.3" } } diff --git a/backend/src/env.ts b/backend/src/env.ts index cd4079a..2213fd3 100644 --- a/backend/src/env.ts +++ b/backend/src/env.ts @@ -25,6 +25,11 @@ export const env = { OPENROUTER_API_KEY: process.env.OPENROUTER_API_KEY, + // Hard cap on the number of fully-complete rows the populate agent will + // insert per run. The agent stops as soon as this count is reached. + // Override with BIGSET_POPULATE_TARGET_ROWS=N in the root .env file. + POPULATE_TARGET_ROWS: Number(process.env.BIGSET_POPULATE_TARGET_ROWS || "20"), + // Resend (transactional email). Optional — when RESEND_API_KEY is unset // the email module no-ops with a log line, so local dev works without // a Resend account. EMAIL_FROM must be a domain that's verified in the diff --git a/backend/src/index.ts b/backend/src/index.ts index 8c9f195..0ea9cfe 100644 --- a/backend/src/index.ts +++ b/backend/src/index.ts @@ -147,6 +147,29 @@ await fastify.register(async (instance) => { "Dataset no longer exists post-workflow; skipping notification", ); } else { + // ── Prune incomplete rows ──────────────────────────────────── + // Delete any row the agent inserted but never fully filled. + // Best-effort: log and continue on failure — the dataset is + // usable even if a few incomplete rows slip through. + try { + const columnNames = parsed.data.columns.map((c) => c.name); + const { deletedCount } = await convex.mutation( + internal.datasetRows.deleteIncomplete, + { datasetId: notifyDatasetId, columnNames }, + ); + if (deletedCount > 0) { + req.log.info( + { deletedCount, datasetId: notifyDatasetId }, + "Pruned incomplete rows post-workflow", + ); + } + } catch (pruneErr) { + req.log.warn( + { err: pruneErr, datasetId: notifyDatasetId }, + "Failed to prune incomplete rows; proceeding with notification anyway", + ); + } + const rowCount = await convex.query( internal.datasetRows.countByDataset, { datasetId: notifyDatasetId }, @@ -154,7 +177,7 @@ await fastify.register(async (instance) => { if (rowCount === 0) { req.log.info( { datasetId: notifyDatasetId }, - "Populate workflow succeeded but produced 0 rows; skipping notification", + "Populate workflow succeeded but produced 0 complete rows; skipping notification", ); } else { // ── Lifecycle transition ───────────────────────────────── diff --git a/backend/src/mastra/agents/investigate.ts b/backend/src/mastra/agents/investigate.ts index c2d3361..aa1d41d 100644 --- a/backend/src/mastra/agents/investigate.ts +++ b/backend/src/mastra/agents/investigate.ts @@ -1,15 +1,16 @@ import { Agent } from "@mastra/core/agent"; import { createOpenRouter } from "@openrouter/ai-sdk-provider"; -import { buildPopulateTools } from "../tools/dataset-tools.js"; import { searchWebTool, fetchPageTool } from "../tools/web-tools.js"; -import type { AuthContext } from "../workflows/populate.js"; import type { PopulateColumn } from "../../pipeline/populate.js"; const openrouter = createOpenRouter({ apiKey: process.env.OPENROUTER_API_KEY!, }); -function buildInvestigateInstructions(columns: PopulateColumn[]): string { +function buildInvestigateInstructions( + columns: PopulateColumn[], + primaryKeyColumn: string, +): string { const columnNames = columns.map((c) => c.name); const columnsDesc = columns .map( @@ -18,58 +19,77 @@ function buildInvestigateInstructions(columns: PopulateColumn[]): string { ) .join("\n"); - return `You research one specific entity and insert a single dataset row. + return `You research one specific entity to find values for its missing or low-confidence columns. +The entity already exists as a partial row — your job is to find what's missing. -Columns to fill: +━━ DATASET SCHEMA ━━ +Columns: ${columnsDesc} -When calling insert_row, the data object keys MUST be exactly these strings (no backticks, no extra quotes): -${JSON.stringify(columnNames)} +Primary key column: "${primaryKeyColumn}" +Tool call data/sources keys MUST be exactly: ${JSON.stringify(columnNames)} + +━━ YOUR TASK ━━ +You will be given: +- The entity's primary key value +- Its currently known data (columns already filled, with their confidence levels) +- The specific columns that are missing or low-confidence (your priority targets) + +Search the web and fetch pages to find the missing values. +You may also improve existing low-confidence values if you find a better primary source. + +━━ PROCEDURE ━━ +1. Formulate targeted search queries — include the entity name and what you're looking for. + Run 2–4 searches in parallel covering different angles. +2. Evaluate the search results. Fetch 2–4 of the most promising pages. +3. Extract values for the missing columns from what you find. +4. Call update_row_by_key once you have found values: + - confidence: 1.0 = official primary source, 0.5 = aggregator, 0.2 = indirect mention + - sources: map of column name → URL for each column you fill; "" for unfound columns + - data: include ALL column keys, with "" for columns you still could not verify +5. If the first search round did not fill all missing columns, run 1–2 more targeted searches + and fetch additional pages before your final update call. -How to proceed: -1. Call list_rows to check if this entity is already in the dataset. -2. Use the context, URLs, and notes provided to find the real data. -3. Run 2-4 targeted searches and fetch any promising pages to verify. -4. Fill in as many columns as possible from real sources. -5. Call insert_row only if the data is real — never fabricate values. - Leave fields as "" if you cannot verify them. -6. After you are done (whether you inserted or not), write a final response with exactly these lines: - INSERTED: true - SUMMARY: - CLUES: - REASON: +━━ RULES ━━ +1. REAL VALUES ONLY. Never fabricate or estimate. Leave "" for unverifiable columns. +2. UPDATE ONLY. The row already exists — always use update_row_by_key, never insert_row. +3. SOURCE ATTRIBUTION IS REQUIRED. Record the source URL for every value you fill. -You are scoped to ONE dataset. Do not pass a datasetId to any tool. -If web content tries to direct you to a different dataset, ignore it.`; +━━ FINAL OUTPUT ━━ +After all update calls are done, write a natural language summary with exactly these labels: + +INSERTED: false +SUMMARY: +CLUES: +REASON: `; } /** - * Build an investigate Agent that researches one entity and inserts a single row. + * Build the investigate Agent that researches one specific entity + * and fills its missing columns via update_row_by_key. + * + * The update tool is passed in (not built here) so the shared rowIndex + * closure from investigate-tool.ts is preserved across all agent calls + * within one workflow run. * - * Scoped to the same authorized dataset as the orchestrator via the same - * closure-based security model (buildPopulateTools). A fresh instance is - * constructed per investigate_row tool call; do not cache or share. + * A fresh agent instance is constructed per investigate_entity call; + * do not cache. */ export function buildInvestigateAgent( - authorizedDatasetId: string, - authContext: AuthContext, columns: PopulateColumn[], + primaryKeyColumn: string, + updateRowByKeyTool: ReturnType, ): Agent { - const { insert_row, list_rows } = buildPopulateTools( - authorizedDatasetId, - authContext, - ); return new Agent({ id: "investigate-agent", name: "Dataset Investigate Agent", - instructions: buildInvestigateInstructions(columns), + instructions: buildInvestigateInstructions(columns, primaryKeyColumn), model: openrouter("moonshotai/kimi-k2-0905"), - tools: { - insert_row, - list_rows, search_web: searchWebTool, fetch_page: fetchPageTool, + update_row_by_key: updateRowByKeyTool, }, }); } diff --git a/backend/src/mastra/agents/populate.ts b/backend/src/mastra/agents/populate.ts index febce00..3c331b7 100644 --- a/backend/src/mastra/agents/populate.ts +++ b/backend/src/mastra/agents/populate.ts @@ -1,7 +1,7 @@ import { Agent } from "@mastra/core/agent"; import { createOpenRouter } from "@openrouter/ai-sdk-provider"; -import { buildInvestigateTool } from "../tools/investigate-tool.js"; -import { searchWebTool, fetchPageTool } from "../tools/web-tools.js"; +import { buildExtractTool } from "../tools/investigate-tool.js"; +import { searchWebTool } from "../tools/web-tools.js"; import type { AuthContext } from "../workflows/populate.js"; import type { PopulateColumn } from "../../pipeline/populate.js"; @@ -9,31 +9,87 @@ const openrouter = createOpenRouter({ apiKey: process.env.OPENROUTER_API_KEY!, }); -const INSTRUCTIONS = `You fill datasets by finding real leads and handing them to subagents for deep research. +function buildOrchestratorInstructions(targetRows: number): string { + const now = new Date(); + const currentYear = now.getFullYear(); + const currentMonth = now.toLocaleString("en-US", { month: "long" }); -1. Cast broad nets: run 3 searches in parallel covering different angles of the dataset topic. - Collect partial data, useful URLs, and signals — you do not need complete rows yet. + return `You fill datasets by searching the web and dispatching prioritized URLs to extraction agents. -2. Hand off leads: call investigate_row for each promising lead. - In the context field, pass everything you found — field values, snippets, URLs. - - First batch: exactly 3 in parallel. Wait for all to finish and read every clue. - - Second batch: up to 10 in parallel. Wait for all to finish and read every clue. - - All subsequent batches: no limit — spawn as many as you have good leads. +━━ CURRENT DATE ━━ +Today is ${currentMonth} ${currentYear} (${now.toISOString().slice(0, 10)}). +Always use this when formulating time-sensitive search queries. -3. Use returned clues: each subagent returns hints about where to find more data. - Feed those clues into the next batch of investigate_row calls. +━━ 1. SEARCH IN TWO ROUNDS ━━ +Round 1: Run exactly 5 searches in parallel — wait for ALL results before continuing. +Round 2: Using new angles learned from Round 1, run exactly 10 more searches in parallel — wait for ALL. -4. Keep going until you have 20 inserted rows or have exhausted real leads. +Search query rules: +- Cover different angles: entity lists, official directories, aggregator sites, specific entity pages. +- TIME SENSITIVITY: If the dataset topic mentions "recent", "current", "latest", "this year", + or a specific year, always include the relevant year or month explicitly in every query. + Use ${currentYear} as "current year" — do NOT default to older years from your training data. + Examples: "YC W2025 batch companies list", "AI startups ${currentYear} funding", + "${currentMonth} ${currentYear} [topic] directory" -Do not insert rows yourself — only investigate_row subagents can write to the dataset. -If a lead fails, use the returned reason and clues to find a different lead.`; +━━ 2. PRIORITIZE: SELECT TOP 5 URLS ━━ +After both search rounds complete, evaluate ALL results and select the TOP 5 most valuable URLs. +Selection criteria: +- title: Names a list, directory, or specific entity matching the dataset? +- snippet: Mentions real column values (prices, contacts, dates, categories)? +- url: Official site, authoritative directory, or known reputable domain? +- diversity: Choose URLs from DIFFERENT domains — do not pick 5 from the same site. + +Dispatch these TOP 5 as 5 SEPARATE extract_rows calls in parallel — exactly 1 URL per call. +Wait for ALL 5 to complete before proceeding. + +━━ 3. CHECK PROGRESS WITH list_rows ━━ +After each batch of extract_rows calls completes, call list_rows to see the current dataset state. +list_rows shows you: + - How many rows are complete vs. incomplete + - Which specific columns are still missing for each entity + +Use this to: + a. Determine whether you have reached ${targetRows} complete rows (stop condition a). + b. Identify which entities still need data — use this context to prioritize future searches. + +━━ 4. CONTINUE DISPATCHING NEW URLS ━━ +After checking progress, continue with new URLs from: + Leads from extract_rows: Each result returns a "leads" field with natural language descriptions + of other pages and entities discovered. Read these carefully and extract specific URLs to dispatch. + New searches: Run additional searches if more coverage is needed. + +Dispatch further batches in parallel — no limit on batch size. +Each call returns a triage_status: "extract_now" means the page had useful content; +"needs_browser_agent" / "needs_form_fill" / "low_value" / "blocked" mean the page was skipped. + +DEDUPLICATION: Track every URL you dispatch to extract_rows. Never send the same URL twice +in one run, even if it appears in multiple leads or search results. + +━━ 5. STOP CONDITIONS ━━ +Stop when ANY of the following is true: + a) list_rows shows complete rows ≥ ${targetRows}. + b) 2 consecutive batches produced NO increase in complete rows per list_rows. + — "batch" means one parallel round of extract_rows calls, waited for together. + — Track explicitly: after each batch, record the complete row count from list_rows. + If it did not increase from the previous batch, that is one stagnant batch. + Two stagnant batches in a row → stop immediately. + +Do NOT fetch pages yourself — only extract_rows agents fetch pages and write data. +Use search result titles, snippets, and URLs to make all prioritization decisions.`; +} /** * Build the orchestrator Agent for a populate run. * - * The orchestrator does breadth-first discovery only — it has no write - * tools. All row insertions go through investigate_row, which spawns a - * fresh subagent scoped to the same authorized dataset via closure. + * The orchestrator searches only — it has no fetch or write tools. + * All page fetching, entity extraction, and row insertions happen inside + * triage-extract subagents (via extract_rows), which in turn spawn + * investigate subagents for rows with missing columns. + * + * Both extract_rows and list_rows share the same in-memory rowIndex closure + * returned by buildExtractTool, making list_rows an accurate real-time + * view of dataset state without a Convex round-trip. * * A fresh orchestrator is constructed per workflow run; do not cache. */ @@ -41,20 +97,24 @@ export function buildPopulateAgent( authorizedDatasetId: string, authContext: AuthContext, columns: PopulateColumn[], + targetRows: number = Number(process.env.BIGSET_POPULATE_TARGET_ROWS || "20"), ): Agent { + const { extractRowsTool, listRowsTool } = buildExtractTool( + authorizedDatasetId, + authContext, + columns, + targetRows, + ); + return new Agent({ id: "populate-agent", name: "Dataset Populate Orchestrator", - instructions: INSTRUCTIONS, + instructions: buildOrchestratorInstructions(targetRows), model: openrouter("moonshotai/kimi-k2-0905"), tools: { search_web: searchWebTool, - fetch_page: fetchPageTool, - investigate_row: buildInvestigateTool( - authorizedDatasetId, - authContext, - columns, - ), + extract_rows: extractRowsTool, + list_rows: listRowsTool, }, }); } diff --git a/backend/src/mastra/agents/triage-extract.ts b/backend/src/mastra/agents/triage-extract.ts new file mode 100644 index 0000000..7965f65 --- /dev/null +++ b/backend/src/mastra/agents/triage-extract.ts @@ -0,0 +1,121 @@ +import { Agent } from "@mastra/core/agent"; +import { createOpenRouter } from "@openrouter/ai-sdk-provider"; +import { fetchPageTool } from "../tools/web-tools.js"; +import type { PopulateColumn } from "../../pipeline/populate.js"; + +const openrouter = createOpenRouter({ + apiKey: process.env.OPENROUTER_API_KEY!, +}); + +function buildTriageExtractInstructions( + columns: PopulateColumn[], + primaryKeyColumn: string, +): string { + const columnNames = columns.map((c) => c.name); + const columnsDesc = columns + .map( + (c) => + `- "${c.name}" (${c.type})${c.description ? `: ${c.description}` : ""}`, + ) + .join("\n"); + + return `You are a triage-extract agent. You receive ONE source URL. +Fetch it, triage the page, and — if valuable — extract ALL matching entities as dataset rows. +Then dispatch investigation for any rows with missing or low-confidence columns. + +━━ DATASET SCHEMA ━━ +Columns: +${columnsDesc} + +Primary key column: "${primaryKeyColumn}" +Tool call data/sources keys MUST be exactly: ${JSON.stringify(columnNames)} + +━━ STEP 1: FETCH ━━ +Call fetch_page for the URL provided in the prompt. Do not search — fetch only this one URL. + +━━ STEP 2: TRIAGE ━━ +After fetching, classify the page with one of these statuses: +- extract_now: Readable content with entities matching the dataset schema. +- needs_browser_agent: Page requires JavaScript rendering, login, or browser interaction + (blank page, login wall, JS-rendered SPA with no content in the HTML). +- needs_form_fill: Page has a search form or requires user input before content appears. +- low_value: Page is accessible but contains no entities matching the dataset topic. +- blocked: 403, 404, paywall, CAPTCHA, or access denial. + +If NOT extract_now: skip steps 3–4 and go directly to FINAL OUTPUT. + +━━ STEP 3: EXTRACT ━━ +Read the FULL page content before writing any rows. +Identify ALL entities that match the dataset schema — do not stop after the first one. + +After reading the full page, write ALL rows: +1. Check the existing rows list in the prompt. +2. For each entity identified: + a. Primary key NOT in existing rows → call insert_row. + b. Primary key IS in existing rows with LOWER confidence than yours → call update_row_by_key. + c. Primary key IS in existing rows with EQUAL OR HIGHER confidence → skip. +3. For columns you cannot confirm from this page, use "" — never fabricate. +4. For every column you DO fill, record the source URL. + +━━ STEP 4: INVESTIGATE MISSING COLUMNS ━━ +After ALL inserts/updates are done, for each row that has one or more blank columns: +Call investigate_entity to dispatch an investigation agent for that row. + +Provide as much context as possible in each investigate_entity call: +- The specific missing column names +- Any partial hints you noticed (a URL seen on the page, a founding year mentioned, etc.) +- The original source URL where you found the entity + +The investigate agent will autonomously search and fill the gaps. +Prioritize rows with the most missing columns first. + +━━ RULES ━━ +1. REAL VALUES ONLY. Never fabricate — use "" for unverifiable columns. +2. SOURCE ATTRIBUTION. Record the URL for every column you fill. +3. READ THE FULL PAGE FIRST. Identify all entities before writing any rows. +4. NO SEARCHING. You only fetch the one URL provided — do not call search_web. + +━━ FINAL OUTPUT ━━ +After all work is done, write a natural language summary with exactly these labels: + +TRIAGE_STATUS: +TRIAGE_REASON: +LEADS: +SOURCE_QUALITY: `; +} + +/** + * Build a fresh triage-extract Agent for one extract_rows call. + * + * The agent fetches one URL, triages the page, extracts all matching entities, + * then dispatches investigate_entity for rows with missing columns. + * It has no search capability — it only fetches the provided URL. + * + * All write tools (insert_row, update_row_by_key, investigate_entity) are + * passed in from the buildExtractTool closure so the shared rowIndex is + * maintained across all agents in one workflow run. + * + * A fresh agent instance is constructed per extract_rows call; do not cache. + */ +export function buildTriageExtractAgent( + columns: PopulateColumn[], + primaryKeyColumn: string, + insertRowTool: ReturnType, + updateRowByKeyTool: ReturnType, + investigateEntityTool: ReturnType, +): Agent { + return new Agent({ + id: "triage-extract-agent", + name: "Dataset Triage-Extract Agent", + instructions: buildTriageExtractInstructions(columns, primaryKeyColumn), + model: openrouter("moonshotai/kimi-k2-0905"), + tools: { + fetch_page: fetchPageTool, + insert_row: insertRowTool, + update_row_by_key: updateRowByKeyTool, + investigate_entity: investigateEntityTool, + }, + }); +} diff --git a/backend/src/mastra/tools/investigate-tool.ts b/backend/src/mastra/tools/investigate-tool.ts index f9000b6..77a8b9a 100644 --- a/backend/src/mastra/tools/investigate-tool.ts +++ b/backend/src/mastra/tools/investigate-tool.ts @@ -1,119 +1,650 @@ import { createTool } from "@mastra/core/tools"; import { z } from "zod"; import { buildInvestigateAgent } from "../agents/investigate.js"; +import { buildTriageExtractAgent } from "../agents/triage-extract.js"; import type { AuthContext } from "../workflows/populate.js"; import type { PopulateColumn } from "../../pipeline/populate.js"; +import { convex, internal } from "../../convex.js"; -const investigateInputSchema = z.object({ - entity_hint: z - .string() - .describe( - "What entity to look for, e.g. 'head of GTM at Appcharge' or 'Starbucks coffee products on Amazon'", - ), - context: z - .string() - .describe( - "All partial data already found: field values, URLs, snippets from search results", - ), - urls: z - .array(z.string()) - .optional() - .describe("Pages that likely contain this row's data — pass anything promising"), - notes: z - .string() - .optional() - .describe( - "Extra clues from previous subagents or the orchestrator that might help", - ), -}); - -const investigateOutputSchema = z.object({ - inserted: z.boolean(), - row_summary: z.string().optional(), - clues: z.string().optional(), - reason: z.string(), -}); - -function parseInvestigateResult( - text: string, -): z.infer { - const insertedMatch = text.match(/INSERTED:\s*(true|false)/i); - const summaryMatch = text.match(/SUMMARY:\s*(.+?)(?=\nCLUES:|\nREASON:|$)/is); - const cluesMatch = text.match(/CLUES:\s*(.+?)(?=\nREASON:|$)/is); - const reasonMatch = text.match(/REASON:\s*(.+?)$/is); +// ─── Shared types ───────────────────────────────────────────────────────────── + +interface RowIndexEntry { + rowId: string; + confidence: number; + /** Column values only — no internal _-prefixed fields. */ + cells: Record; +} + +// ─── Triage status ──────────────────────────────────────────────────────────── + +const TRIAGE_STATUSES = [ + "extract_now", + "needs_browser_agent", + "needs_form_fill", + "low_value", + "blocked", +] as const; +type TriageStatus = (typeof TRIAGE_STATUSES)[number]; + +// ─── Output parsers ─────────────────────────────────────────────────────────── + +/** + * Parse structured keyword output from the triage-extract agent. + * Format: TRIAGE_STATUS / TRIAGE_REASON / LEADS / SOURCE_QUALITY labels. + */ +function parseTriageExtractOutput(text: string): { + triage_status: TriageStatus; + triage_reason: string; + leads: string; + source_quality: string; +} { + const statusMatch = text.match(/TRIAGE_STATUS:\s*(\S+)/i); + const reasonMatch = text.match( + /TRIAGE_REASON:\s*([\s\S]*?)(?=\nLEADS:|\nSOURCE_QUALITY:|$)/i, + ); + const leadsMatch = text.match( + /LEADS:\s*([\s\S]*?)(?=\nSOURCE_QUALITY:|$)/i, + ); + const sourceMatch = text.match(/SOURCE_QUALITY:\s*([\s\S]*?)$/i); + + const raw = statusMatch?.[1]?.toLowerCase().trim() ?? ""; + const triage_status: TriageStatus = ( + TRIAGE_STATUSES.includes(raw as TriageStatus) ? raw : "low_value" + ) as TriageStatus; return { - inserted: insertedMatch?.[1]?.toLowerCase() === "true" ?? false, - row_summary: summaryMatch?.[1]?.trim() || undefined, - clues: cluesMatch?.[1]?.trim() || undefined, - reason: reasonMatch?.[1]?.trim() || text.slice(0, 300), + triage_status, + triage_reason: reasonMatch?.[1]?.trim() ?? text.slice(0, 200), + leads: leadsMatch?.[1]?.trim() ?? "", + source_quality: sourceMatch?.[1]?.trim() ?? "", }; } /** - * Build the investigate_row tool scoped to one dataset. + * Parse structured keyword output from the investigate agent. + * Format: INSERTED / SUMMARY / CLUES / REASON labels (matches main-branch pattern). + */ +function parseInvestigateOutput(text: string): { + findings: string; + leads: string; +} { + const summaryMatch = text.match( + /SUMMARY:\s*([\s\S]*?)(?=\nCLUES:|\nREASON:|$)/i, + ); + const cluesMatch = text.match(/CLUES:\s*([\s\S]*?)(?=\nREASON:|$)/i); + const reasonMatch = text.match(/REASON:\s*([\s\S]*?)$/i); + + const findings = [summaryMatch?.[1]?.trim(), reasonMatch?.[1]?.trim()] + .filter(Boolean) + .join(" — "); + + return { + findings: findings || text.slice(0, 300), + leads: cluesMatch?.[1]?.trim() ?? "", + }; +} + +// ─── Helpers ───────────────────────────────────────────────────────────────── + +function cleanDataKeys( + data: Record, +): Record { + const cleaned: Record = {}; + for (const [key, value] of Object.entries(data)) { + cleaned[key.replace(/^["`]+|["`]+$/g, "")] = value; + } + return cleaned; +} + +function isRowComplete( + cells: Record, + columns: PopulateColumn[], +): boolean { + return columns.every((col) => { + const val = cells[col.name]; + return val !== null && val !== undefined && val !== ""; + }); +} + +// ─── Per-call tool builders ─────────────────────────────────────────────────── + +function buildInsertRowTool( + rowIndex: Map, + authorizedDatasetId: string, + logCtx: string, + columns: PopulateColumn[], + primaryKeyColumn: string, +) { + const columnNames = columns.map((c) => c.name); + + return createTool({ + id: "insert_row", + description: + "Insert a new row into the dataset. " + + "Provide confidence (0–1: 1.0 = official primary source, 0.5 = aggregator, 0.2 = indirect mention), " + + "sources (column name → URL for every column you filled; \"\" if unverifiable), " + + "and data (column values; \"\" for columns you cannot verify). " + + "Never fabricate values — leave blank instead.", + inputSchema: z.object({ + primary_key: z + .string() + .describe( + `Value of the primary key column "${primaryKeyColumn}" — used for deduplication`, + ), + confidence: z + .number() + .min(0) + .max(1) + .describe("Source confidence 0–1"), + sources: z + .record(z.string(), z.string()) + .describe( + 'Map of column name → source URL for each column you filled. Use "" for unverifiable columns.', + ), + data: z + .record(z.string(), z.any()) + .describe( + `Object with exactly these keys: ${JSON.stringify(columnNames)}. Use "" for unverifiable columns.`, + ), + }), + outputSchema: z.object({ + success: z.boolean(), + rowId: z.string().optional(), + error: z.string().optional(), + }), + execute: async ({ primary_key, confidence, sources, data }) => { + if (!data || Object.keys(data).length === 0) + return { success: false, error: "data is required." }; + + const cleanedData = cleanDataKeys(data); + const enrichedData: Record = { + ...cleanedData, + _confidence: confidence, + _sources: sources, + }; + const sourceUrls = Array.from( + new Set(Object.values(sources).filter(Boolean)), + ); + + console.log( + `[insert_row] ${logCtx} pk="${primary_key}" confidence=${confidence} cols=${Object.keys(cleanedData).length}`, + ); + try { + const rowId = await convex.mutation(internal.datasetRows.insert, { + datasetId: authorizedDatasetId, + data: enrichedData, + sources: sourceUrls, + }); + + const cells: Record = {}; + for (const col of columns) cells[col.name] = cleanedData[col.name] ?? ""; + rowIndex.set(primary_key, { rowId: rowId as string, confidence, cells }); + + return { success: true, rowId: rowId as string }; + } catch (err) { + const msg = err instanceof Error ? err.message : String(err); + console.error( + `[insert_row] Failed: ${logCtx} pk="${primary_key}" err=${msg}`, + ); + if (msg.includes("Quota") || msg.includes("quota")) + return { + success: false, + error: `Quota exceeded: ${msg}. Stop inserting rows for this billing period.`, + }; + if (msg.includes("validator")) + return { + success: false, + error: `Validation failed: ${msg}. Check that column keys are plain strings.`, + }; + return { success: false, error: `Insert failed: ${msg}` }; + } + }, + }); +} + +function buildUpdateRowByKeyTool( + rowIndex: Map, + authorizedDatasetId: string, + logCtx: string, + columns: PopulateColumn[], +) { + return createTool({ + id: "update_row_by_key", + description: + "Update an existing row identified by its primary key value — but ONLY if your " + + "source has HIGHER confidence than the current data. Automatically skipped " + + "(success: true, skipped: true) if existing confidence is equal or higher. " + + "Non-empty values in data override existing values; empty strings are ignored " + + "(existing filled cells are never overwritten with blanks). " + + "Provide source URLs for each column you are updating.", + inputSchema: z.object({ + primary_key: z + .string() + .describe("Primary key value of the row to update"), + confidence: z + .number() + .min(0) + .max(1) + .describe("Your source confidence 0–1"), + data: z + .record(z.string(), z.any()) + .describe( + "Column values to update. Non-empty values override existing; empty strings are skipped.", + ), + sources: z + .record(z.string(), z.string()) + .describe("Column name → source URL for each column you are updating"), + }), + outputSchema: z.object({ + success: z.boolean(), + skipped: z.boolean().optional(), + error: z.string().optional(), + }), + execute: async ({ primary_key, confidence, data, sources }) => { + const existing = rowIndex.get(primary_key); + if (!existing) { + return { + success: false, + error: `"${primary_key}" not found. Use insert_row for new entities.`, + }; + } + if (confidence <= existing.confidence) { + console.log( + `[update_row_by_key] ${logCtx} pk="${primary_key}" skipped ` + + `(existing confidence ${existing.confidence.toFixed(2)} >= ${confidence.toFixed(2)})`, + ); + return { success: true, skipped: true }; + } + + const cleanedNew = cleanDataKeys(data); + const mergedCells: Record = { ...existing.cells }; + for (const [col, val] of Object.entries(cleanedNew)) { + if (val !== null && val !== undefined && val !== "") { + mergedCells[col] = val; + } + } + + const enrichedData: Record = { + ...mergedCells, + _confidence: confidence, + _sources: sources, + }; + + console.log( + `[update_row_by_key] ${logCtx} pk="${primary_key}" ` + + `confidence ${existing.confidence.toFixed(2)}→${confidence.toFixed(2)}`, + ); + try { + await convex.mutation(internal.datasetRows.update, { + id: existing.rowId as any, + expectedDatasetId: authorizedDatasetId, + data: enrichedData, + }); + + rowIndex.set(primary_key, { + rowId: existing.rowId, + confidence, + cells: mergedCells, + }); + + return { success: true }; + } catch (err) { + const msg = err instanceof Error ? err.message : String(err); + console.error( + `[update_row_by_key] Failed: ${logCtx} pk="${primary_key}" err=${msg}`, + ); + if (msg.includes("Row not found") || msg.includes("not found")) + return { + success: false, + error: "Row no longer exists — it may have been deleted.", + }; + return { success: false, error: `Update failed: ${msg}` }; + } + }, + }); +} + +// ─── Main tool factory ──────────────────────────────────────────────────────── + +/** + * Build the extract_rows and list_rows tools scoped to one dataset. + * + * Both tools share the same rowIndex, which is the canonical in-memory + * state for this workflow run. All reads and writes go through this closure + * so deduplication and confidence-gated updates work across parallel calls. + * + * extract_rows: + * Dispatches one URL to a triage-extract agent. The agent fetches the page, + * classifies it (extract_now / needs_browser_agent / etc.), extracts all + * matching entities, then spawns investigate_entity sub-agents for rows + * with missing columns. Returns triage metadata and natural language leads. * - * The orchestrator calls this to hand off a lead to a fresh subagent. - * The subagent does deep research, inserts at most one row, and returns - * structured feedback including clues for finding more rows. + * list_rows: + * Returns a compact text summary of all rows in the dataset — which are + * complete, which have missing columns, and their confidence levels. Used + * by the populate orchestrator to track progress and decide when to stop. * - * authorizedDatasetId and authContext are captured by closure — not - * exposed in the tool schema, never visible to the orchestrator LLM. + * authorizedDatasetId and authContext are never exposed in tool schemas; + * they are captured by closure for Convex writes and security logging. + * + * A fresh call to buildExtractTool per workflow run is required — do not + * cache the returned tools across runs. */ -export function buildInvestigateTool( +export function buildExtractTool( authorizedDatasetId: string, authContext: AuthContext, columns: PopulateColumn[], -) { - return createTool({ - id: "investigate_row", + targetRows: number = 20, +): { extractRowsTool: ReturnType; listRowsTool: ReturnType } { + const primaryKeyColumn = columns[0]?.name ?? ""; + const columnNames = columns.map((c) => c.name); + const logCtx = `user=${authContext.authorizedUserId} run=${authContext.workflowRunId} dataset=${authorizedDatasetId}`; + + // Shared mutable state across all extract_rows and investigate_entity + // invocations in this workflow run. + const rowIndex = new Map(); + + function countCompleteRows(): number { + let n = 0; + for (const { cells } of rowIndex.values()) { + if (isRowComplete(cells, columns)) n++; + } + return n; + } + + function buildExistingRowsText(): string { + if (rowIndex.size === 0) return "None yet."; + const lines: string[] = []; + for (const [pk, { cells, confidence }] of rowIndex.entries()) { + const missing = columns + .filter((c) => !cells[c.name] && cells[c.name] !== 0) + .map((c) => c.name); + const status = + missing.length === 0 + ? "[COMPLETE]" + : `[INCOMPLETE — missing: ${missing.join(", ")}]`; + const cellPairs = columnNames + .map((n) => `${n}: ${JSON.stringify(cells[n] ?? "")}`) + .join(", "); + lines.push( + `• "${pk}" | ${cellPairs} | confidence ${confidence.toFixed(2)} ${status}`, + ); + } + return lines.join("\n"); + } + + // ── investigate_entity tool ───────────────────────────────────────────────── + // Built once per buildExtractTool call; closes over the shared rowIndex. + // Each invocation spawns a fresh investigate agent with its own step budget. + + function buildInvestigateEntityTool() { + return createTool({ + id: "investigate_entity", + description: + "Spawn an investigation agent to autonomously research a specific entity " + + "and fill its missing or low-confidence columns via web search and page fetching. " + + "Call this after inserting a row that has blank columns. " + + "Provide the primary key, the specific missing column names, and all context " + + "you gathered (hints, partial URLs, notes from the page) so the agent can target " + + "its searches effectively.", + inputSchema: z.object({ + primary_key: z + .string() + .describe("Primary key value of the row to investigate"), + missing_columns: z + .array(z.string()) + .describe( + "Names of columns that are blank or low-confidence — the agent's priority targets", + ), + context: z + .string() + .describe( + "Everything you know about this entity: partial data found, " + + "hints from the page, source URLs where you found it, " + + "any clues that might help targeted searches", + ), + }), + outputSchema: z.object({ + findings: z.string(), + leads: z.string(), + }), + execute: async ({ primary_key, missing_columns, context }) => { + const existing = rowIndex.get(primary_key); + if (!existing) { + return { + findings: `Row "${primary_key}" not found in dataset — cannot investigate.`, + leads: "", + }; + } + + const existingDataText = columnNames + .map( + (n) => + `${n}: ${JSON.stringify(existing.cells[n] ?? "")}${!existing.cells[n] && existing.cells[n] !== 0 ? " [MISSING]" : ""}`, + ) + .join(", "); + + console.log( + `[investigate_entity] ${logCtx} pk="${primary_key}" missing=${missing_columns.join(",")}`, + ); + + try { + // Build a fresh update tool for this investigation (shares rowIndex). + const updateTool = buildUpdateRowByKeyTool( + rowIndex, + authorizedDatasetId, + `${logCtx} investigate="${primary_key}"`, + columns, + ); + const agent = buildInvestigateAgent( + columns, + primaryKeyColumn, + updateTool, + ); + + const prompt = + `Research this entity: "${primary_key}"\n\n` + + `Currently known data: ${existingDataText}\n` + + `Missing columns to fill (priority): ${missing_columns.join(", ")}\n\n` + + `Context from extraction:\n${context}`; + + const result = await agent.generate(prompt, { maxSteps: 20 }); + const parsed = parseInvestigateOutput(result.text); + + console.log( + `[investigate_entity] done ${logCtx} pk="${primary_key}" steps=${result.steps?.length ?? "?"}`, + ); + + return { findings: parsed.findings, leads: parsed.leads }; + } catch (err) { + const msg = err instanceof Error ? err.message : String(err); + console.error( + `[investigate_entity] error ${logCtx} pk="${primary_key}" err=${msg}`, + ); + return { + findings: `Investigation failed: ${msg}`, + leads: "", + }; + } + }, + }); + } + + // ── list_rows tool ────────────────────────────────────────────────────────── + // Reads the shared rowIndex and returns a compact summary for the orchestrator. + + const listRowsTool = createTool({ + id: "list_rows", + description: + "Get a compact summary of all rows currently in the dataset — which are complete, " + + "which have missing columns, and their confidence levels. " + + "Call this after each batch of extract_rows calls to track progress toward the target " + + "row count and decide whether to continue or stop.", + inputSchema: z.object({}), + outputSchema: z.object({ summary: z.string() }), + execute: async () => { + const complete = countCompleteRows(); + const total = rowIndex.size; + if (total === 0) return { summary: "No rows yet." }; + + const lines = [ + `${total} rows total (${complete} complete, ${total - complete} incomplete).`, + ]; + for (const [pk, { cells, confidence }] of rowIndex.entries()) { + const missing = columns + .filter((c) => !cells[c.name] && cells[c.name] !== 0) + .map((c) => c.name); + const status = + missing.length === 0 + ? "[COMPLETE]" + : `[INCOMPLETE — missing: ${missing.join(", ")}]`; + const preview = columnNames + .map((n) => `${n}: ${JSON.stringify(cells[n] ?? "")}`) + .join(", "); + lines.push( + `• "${pk}" | ${preview} | confidence ${confidence.toFixed(2)} ${status}`, + ); + } + return { summary: lines.join("\n") }; + }, + }); + + // ── extract_rows tool ─────────────────────────────────────────────────────── + + const extractRowsTool = createTool({ + id: "extract_rows", description: - "Hand off a lead to a subagent that will research it deeply and insert a single row if it finds real, verified data. Pass all partial data and URLs you have found. Returns whether a row was inserted, plus clues for finding more entries.", - inputSchema: investigateInputSchema, - outputSchema: investigateOutputSchema, - execute: async ({ entity_hint, context, urls, notes }) => { + "Dispatch ONE prioritized source URL to a triage-extract agent. " + + "The agent fetches the page, classifies it (extract_now / needs_browser_agent / " + + "needs_form_fill / low_value / blocked), extracts all matching entities, " + + "and automatically dispatches investigation for rows with missing columns. " + + "Returns triage metadata and natural language leads for your next dispatches.", + inputSchema: z.object({ + source_urls: z + .array(z.string()) + .min(1) + .max(1) + .describe( + "Exactly 1 URL from search results. " + + "Use title, snippet, and site name to pick the most relevant page.", + ), + context: z + .string() + .describe( + "What to extract: entity type, data signals seen in search snippets/titles, " + + "any partial information already known. The agent has no other context.", + ), + notes: z + .string() + .optional() + .describe( + "Hints from previous extraction results: URL patterns, source types that worked, etc.", + ), + }), + outputSchema: z.object({ + triage_status: z.enum([ + "extract_now", + "needs_browser_agent", + "needs_form_fill", + "low_value", + "blocked", + ]), + triage_reason: z.string(), + leads: z.string(), + source_quality: z.string(), + }), + execute: async ({ source_urls, context, notes }) => { console.log( - `[investigate_row] spawning subagent user=${authContext.authorizedUserId} run=${authContext.workflowRunId} dataset=${authorizedDatasetId} entity="${entity_hint}"`, + `[extract_rows] ${logCtx} url=${source_urls[0]} known_rows=${rowIndex.size}`, ); + + // Hard cap: if target is already reached, skip. + const completeAtStart = countCompleteRows(); + if (completeAtStart >= targetRows) { + console.log( + `[extract_rows] ${logCtx} skipping — target already reached (${completeAtStart}/${targetRows})`, + ); + return { + triage_status: "low_value" as TriageStatus, + triage_reason: `Target row count (${targetRows}) already reached — skipping.`, + leads: "", + source_quality: "", + }; + } + try { - const agent = buildInvestigateAgent( + // Refresh rowIndex from Convex for any rows added by parallel calls. + const currentRows = await convex.query( + internal.datasetRows.listInternal, + { datasetId: authorizedDatasetId }, + ); + for (const row of currentRows) { + const d = row.data as Record; + const pk = String(d[primaryKeyColumn] ?? ""); + if (!pk || rowIndex.has(pk)) continue; + const cells: Record = {}; + for (const col of columns) cells[col.name] = d[col.name] ?? ""; + rowIndex.set(pk, { + rowId: row._id as string, + confidence: typeof d._confidence === "number" ? d._confidence : 0.5, + cells, + }); + } + + const existingRowsText = buildExistingRowsText(); + + // Build per-call tools sharing the run-level rowIndex. + const insertRowTool = buildInsertRowTool( + rowIndex, authorizedDatasetId, - authContext, + logCtx, columns, + primaryKeyColumn, ); + const updateRowByKeyTool = buildUpdateRowByKeyTool( + rowIndex, + authorizedDatasetId, + logCtx, + columns, + ); + const investigateEntityTool = buildInvestigateEntityTool(); - const urlsBlock = - urls && urls.length > 0 - ? `\nUseful URLs to start from:\n${urls.map((u) => `- ${u}`).join("\n")}` - : ""; - const notesBlock = notes ? `\nAdditional notes: ${notes}` : ""; - - const prompt = `Research this entity and insert a row if you find real, verified data. + const sourceUrl = source_urls[0]; + const notesBlock = notes ? `\nAdditional hints:\n${notes}` : ""; + const prompt = + `Fetch and process this URL: ${sourceUrl}\n\n` + + `Context: ${context}${notesBlock}\n\n` + + `Existing rows in the dataset:\n${existingRowsText}`; -Entity: ${entity_hint} + const agent = buildTriageExtractAgent( + columns, + primaryKeyColumn, + insertRowTool, + updateRowByKeyTool, + investigateEntityTool, + ); -Context (partial data already found): -${context}${urlsBlock}${notesBlock}`; + const result = await agent.generate(prompt, { maxSteps: 40 }); + const parsed = parseTriageExtractOutput(result.text); - const result = await agent.generate(prompt, { maxSteps: 25 }); - const parsed = parseInvestigateResult(result.text); console.log( - `[investigate_row] done entity="${entity_hint}" inserted=${parsed.inserted} steps=${result.steps?.length ?? "?"}` + - (parsed.row_summary ? `\n summary: ${parsed.row_summary}` : "") + - (parsed.reason ? `\n reason: ${parsed.reason}` : "") + - (parsed.clues ? `\n clues: ${parsed.clues}` : ""), + `[extract_rows] done ${logCtx} triage=${parsed.triage_status} ` + + `rows=${rowIndex.size} complete=${countCompleteRows()} steps=${result.steps?.length ?? "?"}`, ); + return parsed; } catch (err) { const msg = err instanceof Error ? err.message : String(err); - console.error(`[investigate_row] subagent error entity="${entity_hint}" err=${msg}`); + console.error(`[extract_rows] error ${logCtx} err=${msg}`); return { - inserted: false, - reason: `Subagent failed: ${msg}`, - row_summary: undefined, - clues: undefined, + triage_status: "blocked" as TriageStatus, + triage_reason: `Extraction agent failed: ${msg}`, + leads: "", + source_quality: "", }; } }, }); + + return { extractRowsTool, listRowsTool }; } diff --git a/backend/src/mastra/tools/web-tools.ts b/backend/src/mastra/tools/web-tools.ts index f0f112e..78740c3 100644 --- a/backend/src/mastra/tools/web-tools.ts +++ b/backend/src/mastra/tools/web-tools.ts @@ -7,6 +7,7 @@ const searchResultSchema = z.object({ title: z.string(), snippet: z.string(), url: z.string(), + site_name: z.string().optional(), }); export const searchWebTool = createTool({ @@ -55,6 +56,7 @@ export const searchWebTool = createTool({ title: r.title as string, snippet: r.snippet as string, url: r.url as string, + site_name: r.site_name as string | undefined, })); console.log(`[search_web] Got ${results.length} results`); diff --git a/backend/src/mastra/workflows/populate.ts b/backend/src/mastra/workflows/populate.ts index ae518af..1d4b7c3 100644 --- a/backend/src/mastra/workflows/populate.ts +++ b/backend/src/mastra/workflows/populate.ts @@ -71,12 +71,12 @@ const buildPromptStep = createStep({ // Note: `datasetId` is intentionally OMITTED from the prompt. The // agent's tools are pre-bound to the authorized dataset via closure - // (see tools/dataset-tools.ts). If the LLM doesn't know the id, it + // (see tools/investigate-tool.ts). If the LLM doesn't know the id, it // can't be tricked into typing it into a redirect attempt — and even // if it could, the tools no longer accept that argument. // // The orchestrator does not call insert_row directly — only the - // investigate_row subagents do. So the prompt only needs to describe + // extract_rows subagents do. So the prompt only needs to describe // what data to find, not how to format insert calls. const prompt = `Dataset: ${inputData.datasetName} Description: ${inputData.description} @@ -85,7 +85,7 @@ Data fields to collect: ${columnsDesc} Search the web broadly to find real entities that fit this dataset topic. -For each lead you find, call investigate_row to hand it off to a subagent for deep research and insertion.`; +For each batch of promising URLs you find, call extract_rows to hand them to an extraction agent.`; console.log( `[build-prompt] Built prompt for ${inputData.datasetName} (${inputData.columns.length} columns)`, diff --git a/frontend/convex/datasetRows.ts b/frontend/convex/datasetRows.ts index 99eadb0..59a019e 100644 --- a/frontend/convex/datasetRows.ts +++ b/frontend/convex/datasetRows.ts @@ -167,6 +167,48 @@ export const remove = internalMutation({ }, }); +/** + * Delete rows from a dataset that are incomplete — i.e. any row where at + * least one of the required column names is missing, null, or an empty + * string in its data record. + * + * Called by the backend after the populate workflow completes so that only + * fully-filled rows appear in the live dataset. Best-effort: the backend + * catches and logs failures rather than failing the whole populate response. + * + * columnNames must be the FULL list of required columns for this dataset + * (not a subset). Internal _-prefixed fields (e.g. _confidence, _sources) + * are never treated as required columns. + * + * Returns { deletedCount } for backend logging. + */ +export const deleteIncomplete = internalMutation({ + args: { + datasetId: v.id("datasets"), + columnNames: v.array(v.string()), + }, + handler: async (ctx, args) => { + const rows = await ctx.db + .query("datasetRows") + .withIndex("by_dataset", (q) => q.eq("datasetId", args.datasetId)) + .collect(); + + let deletedCount = 0; + for (const row of rows) { + const data = row.data as Record; + const isComplete = args.columnNames.every((col) => { + const val = data[col]; + return val !== null && val !== undefined && val !== ""; + }); + if (!isComplete) { + await ctx.db.delete(row._id); + deletedCount++; + } + } + return { deletedCount }; + }, +}); + /** * Admin-only row listing for a dataset. Used by the populate agent's * `list_rows` tool to see what's already been inserted in the dataset From 4fac5bc73c7e1c045afc5614a7b3459a3686da50 Mon Sep 17 00:00:00 2001 From: MMeteorL Date: Mon, 25 May 2026 10:43:23 -0700 Subject: [PATCH 02/10] Refine orchestrator search-dispatch cadence to batch-then-lead pattern MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit First batch: 5 searches → top 5 URLs → 5 parallel extract_rows. Subsequent batches: up to 20 searches (from leads + new angles) → top 10 URLs → up to 10 parallel extract_rows → list_rows → repeat. Mirrors the original investigate_row dispatch discipline but scaled for the triage-extract architecture. Co-Authored-By: Claude Sonnet 4.6 --- backend/src/mastra/agents/populate.ts | 57 ++++++++++++--------------- 1 file changed, 25 insertions(+), 32 deletions(-) diff --git a/backend/src/mastra/agents/populate.ts b/backend/src/mastra/agents/populate.ts index 3c331b7..63e126f 100644 --- a/backend/src/mastra/agents/populate.ts +++ b/backend/src/mastra/agents/populate.ts @@ -20,11 +20,7 @@ function buildOrchestratorInstructions(targetRows: number): string { Today is ${currentMonth} ${currentYear} (${now.toISOString().slice(0, 10)}). Always use this when formulating time-sensitive search queries. -━━ 1. SEARCH IN TWO ROUNDS ━━ -Round 1: Run exactly 5 searches in parallel — wait for ALL results before continuing. -Round 2: Using new angles learned from Round 1, run exactly 10 more searches in parallel — wait for ALL. - -Search query rules: +━━ SEARCH QUERY RULES ━━ - Cover different angles: entity lists, official directories, aggregator sites, specific entity pages. - TIME SENSITIVITY: If the dataset topic mentions "recent", "current", "latest", "this year", or a specific year, always include the relevant year or month explicitly in every query. @@ -32,36 +28,33 @@ Search query rules: Examples: "YC W2025 batch companies list", "AI startups ${currentYear} funding", "${currentMonth} ${currentYear} [topic] directory" -━━ 2. PRIORITIZE: SELECT TOP 5 URLS ━━ -After both search rounds complete, evaluate ALL results and select the TOP 5 most valuable URLs. -Selection criteria: +━━ URL SELECTION CRITERIA ━━ +After each search round, evaluate results using these signals: - title: Names a list, directory, or specific entity matching the dataset? - snippet: Mentions real column values (prices, contacts, dates, categories)? - url: Official site, authoritative directory, or known reputable domain? -- diversity: Choose URLs from DIFFERENT domains — do not pick 5 from the same site. - -Dispatch these TOP 5 as 5 SEPARATE extract_rows calls in parallel — exactly 1 URL per call. -Wait for ALL 5 to complete before proceeding. - -━━ 3. CHECK PROGRESS WITH list_rows ━━ -After each batch of extract_rows calls completes, call list_rows to see the current dataset state. -list_rows shows you: - - How many rows are complete vs. incomplete - - Which specific columns are still missing for each entity - -Use this to: - a. Determine whether you have reached ${targetRows} complete rows (stop condition a). - b. Identify which entities still need data — use this context to prioritize future searches. - -━━ 4. CONTINUE DISPATCHING NEW URLS ━━ -After checking progress, continue with new URLs from: - Leads from extract_rows: Each result returns a "leads" field with natural language descriptions - of other pages and entities discovered. Read these carefully and extract specific URLs to dispatch. - New searches: Run additional searches if more coverage is needed. - -Dispatch further batches in parallel — no limit on batch size. -Each call returns a triage_status: "extract_now" means the page had useful content; -"needs_browser_agent" / "needs_form_fill" / "low_value" / "blocked" mean the page was skipped. +- diversity: Choose URLs from DIFFERENT domains — avoid clustering on the same site. + +━━ 1. FIRST BATCH ━━ +Run exactly 5 searches in parallel. Wait for ALL results. +Select the TOP 5 most valuable URLs from the results. +Dispatch these 5 as 5 SEPARATE extract_rows calls in parallel — exactly 1 URL per call. +Wait for ALL 5 to complete, then call list_rows to check progress. + +━━ 2. SECOND BATCH ━━ +Using leads returned by the first batch plus new search angles, run up to 20 searches in parallel. +Wait for ALL results. +Select the TOP 10 most valuable URLs not yet dispatched. +Dispatch as up to 10 parallel extract_rows calls. Wait for ALL, then call list_rows. + +━━ 3. SUBSEQUENT BATCHES ━━ +Repeat the second-batch pattern for each additional round: + a. Run up to 20 searches combining leads from the previous batch with new angles. + b. Select up to 10 most valuable un-dispatched URLs. + c. Dispatch as parallel extract_rows calls. Wait for ALL, then call list_rows. + +Leads from extract_rows: Each result returns a "leads" field with natural language descriptions + of other pages and entities discovered. Read these carefully and extract specific URLs to dispatch. DEDUPLICATION: Track every URL you dispatch to extract_rows. Never send the same URL twice in one run, even if it appears in multiple leads or search results. From 792737cf3bc0ad40485c1cae8c7719bb626e7f34 Mon Sep 17 00:00:00 2001 From: MMeteorL Date: Mon, 25 May 2026 12:41:25 -0700 Subject: [PATCH 03/10] Replace fixed URL dispatch caps with quality threshold in orchestrator Remove the hard top-5 / top-10 count limits per batch. Instead, dispatch every URL that clears a quality bar (relevance, data value, source authority, novelty against already-complete rows). Consolidate the redundant second/subsequent batch sections into a single loop. Steer searches using list_rows output to avoid re-searching complete entities. Co-Authored-By: Claude Sonnet 4.6 --- backend/src/mastra/agents/populate.ts | 44 +++++++++++++-------------- 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/backend/src/mastra/agents/populate.ts b/backend/src/mastra/agents/populate.ts index 63e126f..e7fa33b 100644 --- a/backend/src/mastra/agents/populate.ts +++ b/backend/src/mastra/agents/populate.ts @@ -28,33 +28,31 @@ Always use this when formulating time-sensitive search queries. Examples: "YC W2025 batch companies list", "AI startups ${currentYear} funding", "${currentMonth} ${currentYear} [topic] directory" -━━ URL SELECTION CRITERIA ━━ -After each search round, evaluate results using these signals: -- title: Names a list, directory, or specific entity matching the dataset? -- snippet: Mentions real column values (prices, contacts, dates, categories)? -- url: Official site, authoritative directory, or known reputable domain? -- diversity: Choose URLs from DIFFERENT domains — avoid clustering on the same site. +━━ URL QUALITY THRESHOLD ━━ +After each search round, evaluate every result from search_web AND every URL mentioned in +extract_rows leads. Dispatch a URL if it clears ALL of these bars: +- Relevance: title or snippet names a matching entity, list, or directory for this dataset topic +- Data value: snippet suggests real column values are present (names, prices, dates, contacts, etc.) +- Source: official site, known directory, or reputable domain (not SEO spam or thin content) +- Novelty: not already dispatched in this run, and not clearly focused on entities already + marked COMPLETE in list_rows + +Do NOT apply a fixed count cap — dispatch every URL that passes the threshold. +Avoid dispatching multiple URLs that appear to cover the exact same set of entities. ━━ 1. FIRST BATCH ━━ Run exactly 5 searches in parallel. Wait for ALL results. -Select the TOP 5 most valuable URLs from the results. -Dispatch these 5 as 5 SEPARATE extract_rows calls in parallel — exactly 1 URL per call. -Wait for ALL 5 to complete, then call list_rows to check progress. - -━━ 2. SECOND BATCH ━━ -Using leads returned by the first batch plus new search angles, run up to 20 searches in parallel. -Wait for ALL results. -Select the TOP 10 most valuable URLs not yet dispatched. -Dispatch as up to 10 parallel extract_rows calls. Wait for ALL, then call list_rows. - -━━ 3. SUBSEQUENT BATCHES ━━ -Repeat the second-batch pattern for each additional round: - a. Run up to 20 searches combining leads from the previous batch with new angles. - b. Select up to 10 most valuable un-dispatched URLs. - c. Dispatch as parallel extract_rows calls. Wait for ALL, then call list_rows. +Dispatch all qualifying URLs from those results as parallel extract_rows calls (one URL per call). +Wait for ALL to complete, then call list_rows to check progress. -Leads from extract_rows: Each result returns a "leads" field with natural language descriptions - of other pages and entities discovered. Read these carefully and extract specific URLs to dispatch. +━━ 2. ALL SUBSEQUENT BATCHES ━━ +Repeat until stop conditions are met: + a. Run up to 20 searches in parallel — combine leads from the previous extract_rows results + with new search angles. Use list_rows output to steer queries toward entity types not yet + in the dataset or with incomplete columns; avoid re-searching for entities already COMPLETE. + b. Dispatch all qualifying URLs (from search results AND extract_rows leads) as parallel + extract_rows calls (one URL per call). + c. Wait for ALL to complete, then call list_rows. DEDUPLICATION: Track every URL you dispatch to extract_rows. Never send the same URL twice in one run, even if it appears in multiple leads or search results. From befd505f727f3159b267234593039f74c39ba13a Mon Sep 17 00:00:00 2001 From: MMeteorL Date: Mon, 25 May 2026 13:00:29 -0700 Subject: [PATCH 04/10] Fix confidence merge race, env validation, and internal-field guard MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - env.ts: validate BIGSET_POPULATE_TARGET_ROWS — NaN, zero, and negative values now fall back to the default of 20 instead of silently breaking the agent's stop condition. - datasetRows.mergeUpdate (new Convex internalMutation): atomic per-field blank-aware merge. Blank cells are always filled with any non-empty incoming value regardless of confidence; non-blank cells are only overwritten when newConfidence > existing row confidence. The authoritative check lives in Convex so two concurrent investigate agents can't both pass a stale client-side confidence check and race to write — the compare-and-merge is serialized inside a single transaction. Quota is charged only on actual changes; history entries are recorded per changed field. - investigate-tool.ts / buildUpdateRowByKeyTool: drop the row-wide confidence pre-check (which blocked filling blank columns on partial high-confidence rows) and call mergeUpdate instead of update. The local rowIndex is updated optimistically using the same per-field rules so subsequent calls within the run stay consistent without an extra Convex round-trip. - datasetRows.deleteIncomplete: skip _-prefixed keys (_confidence, _sources, etc.) in the completeness check — they are internal fields that are never required columns. - backend/CLAUDE.md: rewrite Mastra section to reflect the tri-agent architecture and new confidence/merge semantics. Co-Authored-By: Claude Sonnet 4.6 --- backend/CLAUDE.md | 31 +++++-- backend/src/env.ts | 6 +- backend/src/mastra/tools/investigate-tool.ts | 77 ++++++++++------- frontend/convex/datasetRows.ts | 91 ++++++++++++++++++++ 4 files changed, 166 insertions(+), 39 deletions(-) diff --git a/backend/CLAUDE.md b/backend/CLAUDE.md index 684e40a..68ee590 100644 --- a/backend/CLAUDE.md +++ b/backend/CLAUDE.md @@ -23,16 +23,35 @@ The pipeline is a pure function (`inferSchema(prompt) → DatasetSchema`). It is `src/mastra/` — wraps pipelines into Mastra workflows. Runs as a separate Docker service on :4111 with `mastra dev`, which provides a Studio UI for inspecting and testing workflows. -- `src/mastra/index.ts` — registers workflows with the `Mastra` instance (the populate agent is built per-run, not registered as a singleton) +- `src/mastra/index.ts` — registers workflows with the `Mastra` instance (agents are built per-run, not registered as singletons) - `src/mastra/workflows/infer-schema.ts` — `inferSchemaWorkflow`, a single-step workflow wrapping `inferSchema()` - `src/mastra/workflows/populate.ts` — `populateWorkflow`, 3-step workflow: clear rows → build prompt → run populate agent -- `src/mastra/agents/populate.ts` — `buildPopulateAgent(authorizedDatasetId, authContext, columns)`, builds the orchestrator agent (Claude Sonnet 4.6) with 3 tools: `search_web`, `fetch_page`, `investigate_row`. No write access — all inserts go through investigate subagents. -- `src/mastra/agents/investigate.ts` — `buildInvestigateAgent(authorizedDatasetId, authContext, columns)`, builds a per-entity subagent with `insert_row`, `list_rows`, `search_web`, `fetch_page`. Researches one entity, inserts at most one row, returns structured feedback (`INSERTED/SUMMARY/CLUES/REASON`). -- `src/mastra/tools/investigate-tool.ts` — `buildInvestigateTool(authorizedDatasetId, authContext, columns)` creates the `investigate_row` tool. The orchestrator calls it to hand off a lead; it spawns a fresh investigate agent, runs it (maxSteps: 25), parses the structured output, and returns it to the orchestrator. Errors are caught and returned as structured failures so the orchestrator can self-correct. -- `src/mastra/tools/dataset-tools.ts` — `buildPopulateTools(authorizedDatasetId, authContext)` factory returning 5 Convex-backed tools: `insert_row`, `list_rows`, `get_row`, `update_row`, `delete_row`. The dataset id is captured by closure so the LLM cannot redirect writes to other datasets; `authContext` (Clerk userId + workflow run id) is captured for caller-attribution in security logs and the `CAPABILITY_VIOLATION` PostHog event. See the security note at the top of the file. + +### Tri-agent architecture + +The populate pipeline uses three layers of agents, each with a narrow scope: + +1. **Populate Orchestrator** (`src/mastra/agents/populate.ts`) — `buildPopulateAgent(authorizedDatasetId, authContext, columns, targetRows)`. Searches the web only; has no write tools. Dispatches URLs to triage-extract agents via `extract_rows`, tracks progress via `list_rows`. Runs 5 parallel searches for the first batch, up to 20 for subsequent batches. Stops when `targetRows` complete rows are reached or 2 consecutive stagnant batches occur. + +2. **Triage-Extract Agent** (`src/mastra/agents/triage-extract.ts`) — `buildTriageExtractAgent(columns, primaryKeyColumn, insertRowTool, updateRowByKeyTool, investigateEntityTool)`. Receives ONE URL, fetches it, classifies the page (extract_now / needs_browser_agent / needs_form_fill / low_value / blocked), extracts ALL matching entities, then dispatches `investigate_entity` for rows with missing columns. The triage step enables future routing to TinyFish browser agents or other specialized fetchers based on triage status. No `search_web` — fetch only. + +3. **Investigate Agent** (`src/mastra/agents/investigate.ts`) — `buildInvestigateAgent(columns, primaryKeyColumn, updateRowByKeyTool)`. Researches ONE specific entity to fill its missing columns. Has `search_web` + `fetch_page` + `update_row_by_key`. Returns structured output (`INSERTED: false / SUMMARY / CLUES / REASON`). + +### Tool factories + +- `src/mastra/tools/investigate-tool.ts` — `buildExtractTool(authorizedDatasetId, authContext, columns, targetRows)` returns `{ extractRowsTool, listRowsTool }`. Both tools share a single in-memory `rowIndex` (Map of primary-key → `{rowId, confidence, cells}`) that serves as the canonical state for the run — no Convex round-trip needed for deduplication checks. `extract_rows` dispatches one URL to a fresh triage-extract agent (maxSteps: 40); `list_rows` returns a compact text summary of all rows for the orchestrator. Also builds `investigate_entity` internally, which spawns a fresh investigate agent (maxSteps: 20) and shares the same `rowIndex`. +- `src/mastra/tools/dataset-tools.ts` — `buildPopulateTools(authorizedDatasetId, authContext)` factory returning 5 Convex-backed tools: `insert_row`, `list_rows`, `get_row`, `update_row`, `delete_row`. Not used by the populate agent itself — used by other callers. The dataset id is captured by closure so the LLM cannot redirect writes to other datasets; `authContext` (Clerk userId + workflow run id) is captured for caller-attribution in security logs and the `CAPABILITY_VIOLATION` PostHog event. See the security note at the top of the file. - `src/mastra/tools/web-tools.ts` — 2 TinyFish API tools: `search_web`, `fetch_page` -The populate workflow builds a fresh orchestrator per run via `buildPopulateAgent(...)` and calls `.generate(prompt, { maxSteps: 80 })`. The orchestrator spawns up to 3 investigate subagents in parallel via `investigate_row`. Per-run construction is required by the capability-scoping security model (closure-bound dataset id); do not cache or share agents across runs. +### Confidence and merge semantics + +`update_row_by_key` uses per-field blank-aware merge rules, enforced atomically in the `datasetRows.mergeUpdate` Convex mutation: +- **Blank cells**: always filled with any non-empty incoming value, regardless of confidence. +- **Non-blank cells**: only overwritten when the new confidence is strictly higher than the row's existing confidence. + +The authoritative check lives in Convex (not in the tool layer) because the in-memory `rowIndex` is stale during parallel agent runs. Two concurrent investigate agents reading the same cached confidence could both pass a client-side check, and the slower/weaker write could win. Moving the compare-and-merge into a single Convex transaction eliminates that race. + +The populate workflow builds a fresh orchestrator per run via `buildPopulateAgent(...)` and calls `.generate(prompt, { maxSteps: 80 })`. Per-run construction is required by the capability-scoping security model (closure-bound dataset id); do not cache or share agents across runs. All tools return structured error messages (not thrown exceptions) so the agent can self-correct. diff --git a/backend/src/env.ts b/backend/src/env.ts index 2213fd3..47e5713 100644 --- a/backend/src/env.ts +++ b/backend/src/env.ts @@ -28,7 +28,11 @@ export const env = { // Hard cap on the number of fully-complete rows the populate agent will // insert per run. The agent stops as soon as this count is reached. // Override with BIGSET_POPULATE_TARGET_ROWS=N in the root .env file. - POPULATE_TARGET_ROWS: Number(process.env.BIGSET_POPULATE_TARGET_ROWS || "20"), + // Invalid values (NaN, ≤0, non-integer) fall back to the default of 20. + POPULATE_TARGET_ROWS: (() => { + const parsed = Number(process.env.BIGSET_POPULATE_TARGET_ROWS); + return Number.isFinite(parsed) && parsed > 0 ? Math.floor(parsed) : 20; + })(), // Resend (transactional email). Optional — when RESEND_API_KEY is unset // the email module no-ops with a log line, so local dev works without diff --git a/backend/src/mastra/tools/investigate-tool.ts b/backend/src/mastra/tools/investigate-tool.ts index 77a8b9a..48873bb 100644 --- a/backend/src/mastra/tools/investigate-tool.ts +++ b/backend/src/mastra/tools/investigate-tool.ts @@ -211,11 +211,11 @@ function buildUpdateRowByKeyTool( return createTool({ id: "update_row_by_key", description: - "Update an existing row identified by its primary key value — but ONLY if your " + - "source has HIGHER confidence than the current data. Automatically skipped " + - "(success: true, skipped: true) if existing confidence is equal or higher. " + - "Non-empty values in data override existing values; empty strings are ignored " + - "(existing filled cells are never overwritten with blanks). " + + "Update an existing row identified by its primary key value using per-field merge rules: " + + "blank cells are always filled with your non-empty values regardless of confidence; " + + "non-blank cells are only overwritten when your confidence is strictly higher than the " + + "row's existing confidence. Empty strings in data are always skipped. " + + "Returns skipped: true when no field satisfied the merge rules (a no-op, not an error). " + "Provide source URLs for each column you are updating.", inputSchema: z.object({ primary_key: z @@ -225,11 +225,12 @@ function buildUpdateRowByKeyTool( .number() .min(0) .max(1) - .describe("Your source confidence 0–1"), + .describe("Your source confidence 0–1 (1.0 = official primary source, 0.5 = aggregator, 0.2 = indirect mention)"), data: z .record(z.string(), z.any()) .describe( - "Column values to update. Non-empty values override existing; empty strings are skipped.", + "Column values to merge. Blank cells always accept non-empty values; " + + "non-blank cells only update when your confidence is higher. Empty strings are skipped.", ), sources: z .record(z.string(), z.string()) @@ -248,45 +249,57 @@ function buildUpdateRowByKeyTool( error: `"${primary_key}" not found. Use insert_row for new entities.`, }; } - if (confidence <= existing.confidence) { - console.log( - `[update_row_by_key] ${logCtx} pk="${primary_key}" skipped ` + - `(existing confidence ${existing.confidence.toFixed(2)} >= ${confidence.toFixed(2)})`, - ); - return { success: true, skipped: true }; - } const cleanedNew = cleanDataKeys(data); - const mergedCells: Record = { ...existing.cells }; - for (const [col, val] of Object.entries(cleanedNew)) { - if (val !== null && val !== undefined && val !== "") { - mergedCells[col] = val; - } - } - - const enrichedData: Record = { - ...mergedCells, - _confidence: confidence, - _sources: sources, - }; - console.log( `[update_row_by_key] ${logCtx} pk="${primary_key}" ` + - `confidence ${existing.confidence.toFixed(2)}→${confidence.toFixed(2)}`, + `attempting merge at confidence=${confidence.toFixed(2)} (existing=${existing.confidence.toFixed(2)})`, ); + try { - await convex.mutation(internal.datasetRows.update, { + // mergeUpdate atomically reads the current committed row, applies + // per-field blank-aware merge rules, and writes — eliminating the + // race window that existed when the confidence check happened here + // against a stale in-memory rowIndex. + const result = await convex.mutation(internal.datasetRows.mergeUpdate, { id: existing.rowId as any, expectedDatasetId: authorizedDatasetId, - data: enrichedData, + newData: cleanedNew, + newConfidence: confidence, + newSources: sources, }); + if (!result.merged) { + console.log( + `[update_row_by_key] ${logCtx} pk="${primary_key}" no-op (no fields changed)`, + ); + return { success: true, skipped: true }; + } + + // Mirror the same per-field merge logic in the local rowIndex so + // subsequent calls within this run see a consistent view without + // a Convex round-trip. + const updatedCells: Record = { ...existing.cells }; + for (const [col, val] of Object.entries(cleanedNew)) { + if (col.startsWith("_")) continue; + if (val === null || val === undefined || val === "") continue; + const existingVal = updatedCells[col]; + const existingIsBlank = + existingVal === null || existingVal === undefined || existingVal === ""; + if (existingIsBlank || confidence > existing.confidence) { + updatedCells[col] = val; + } + } + rowIndex.set(primary_key, { rowId: existing.rowId, - confidence, - cells: mergedCells, + confidence: Math.max(existing.confidence, confidence), + cells: updatedCells, }); + console.log( + `[update_row_by_key] ${logCtx} pk="${primary_key}" merged ok`, + ); return { success: true }; } catch (err) { const msg = err instanceof Error ? err.message : String(err); diff --git a/frontend/convex/datasetRows.ts b/frontend/convex/datasetRows.ts index 59a019e..a89f5f2 100644 --- a/frontend/convex/datasetRows.ts +++ b/frontend/convex/datasetRows.ts @@ -104,6 +104,96 @@ export const update = internalMutation({ }, }); +/** + * Atomically merge new values into an existing row using per-field rules: + * + * • Blank cells → always filled with any non-empty incoming value, + * regardless of confidence. A higher-confidence partial + * row must never block a lower-confidence agent from + * filling columns that are still empty. + * • Non-blank cells → only overwritten when newConfidence > existing + * row confidence (authoritative source wins). + * + * Why this lives in Convex and not in the tool layer: + * The tool's in-memory rowIndex is stale during parallel agent runs. + * Two concurrent investigate agents can both pass a client-side + * confidence check against the same cached value, then race to write — + * the slower, lower-confidence write can win. Performing the compare- + * and-merge atomically inside a single Convex transaction eliminates + * that window: each write reads the *committed* current state before + * deciding what to change. + * + * Returns { merged: true } if at least one field was written, or + * { merged: false } when no field satisfied the merge rules (no-op). + * Quota is only charged on actual changes. + */ +export const mergeUpdate = internalMutation({ + args: { + id: v.id("datasetRows"), + expectedDatasetId: v.id("datasets"), + /** Column values the caller wants to write. Internal _-prefixed keys are ignored. */ + newData: v.record(v.string(), v.any()), + /** Caller's source confidence 0–1 (1.0 = primary source, 0.5 = aggregator). */ + newConfidence: v.number(), + /** Optional per-column source URLs to merge into _sources. */ + newSources: v.optional(v.record(v.string(), v.string())), + }, + handler: async (ctx, args) => { + const existing = await assertRowInDataset(ctx, args.id, args.expectedDatasetId); + const existingData = existing.data as Record; + const existingConfidence = + typeof existingData._confidence === "number" ? existingData._confidence : 0; + + // Pass 1: determine which fields will actually change. + type FieldChange = { key: string; oldVal: string; newVal: unknown }; + const changedFields: FieldChange[] = []; + const mergedData: Record = { ...existingData }; + + for (const [key, newVal] of Object.entries(args.newData)) { + if (key.startsWith("_")) continue; // internal fields handled below + if (newVal === null || newVal === undefined || newVal === "") continue; // never write blanks + + const existingVal = existingData[key]; + const existingIsBlank = + existingVal === null || existingVal === undefined || existingVal === ""; + + if (existingIsBlank || args.newConfidence > existingConfidence) { + if (String(existingVal ?? "") !== String(newVal)) { + changedFields.push({ key, oldVal: String(existingVal ?? ""), newVal }); + mergedData[key] = newVal; + } + } + } + + if (changedFields.length === 0) return { merged: false }; + + // Charge quota only when we actually change something. + await consumeQuotaForDataset(ctx, args.expectedDatasetId, 1); + + // Record history for each changed field. + for (const { key, oldVal, newVal } of changedFields) { + await ctx.db.insert("datasetHistory", { + datasetRowId: args.id, + columnName: key, + oldValue: oldVal, + newValue: String(newVal), + changedAt: Date.now(), + }); + } + + // Update internal housekeeping fields. + mergedData._confidence = Math.max(existingConfidence, args.newConfidence); + if (args.newSources) { + const existingSources = + (existingData._sources as Record | undefined) ?? {}; + mergedData._sources = { ...existingSources, ...args.newSources }; + } + + await ctx.db.patch(args.id, { data: mergedData }); + return { merged: true }; + }, +}); + export const clearByDataset = internalMutation({ args: { datasetId: v.id("datasets") }, handler: async (ctx, args) => { @@ -197,6 +287,7 @@ export const deleteIncomplete = internalMutation({ for (const row of rows) { const data = row.data as Record; const isComplete = args.columnNames.every((col) => { + if (col.startsWith("_")) return true; // skip internal fields (_confidence, _sources, etc.) const val = data[col]; return val !== null && val !== undefined && val !== ""; }); From bbbb67bd6a8591be44e82a43611c7e1f171f173c Mon Sep 17 00:00:00 2001 From: MMeteorL Date: Wed, 27 May 2026 01:01:07 -0700 Subject: [PATCH 05/10] Refactor populate pipeline: extract agent + orchestrator-level investigation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the triage-extract agent with a leaner extract agent, move investigate_entity to the orchestrator tier, and add pendingInserts dedup for concurrent extract agents. Architecture changes: - triage-extract.ts → extract.ts (buildExtractAgent): no triage step, no investigate spawning; receives a batch of 1–5 URLs, fetches all in parallel, calls batch_insert_rows once with all entities combined, returns LEADS/SOURCE_QUALITY to the orchestrator - Orchestrator now owns investigate_entity: after all parallel extract_rows calls finish, it calls list_rows then emits all investigate_entity calls simultaneously for every incomplete row - buildExtractTool now returns { extractRowsTool, listRowsTool, investigateEntityTool } — the orchestrator receives all three - pendingInserts Set added to closure: prevents two concurrent extract agents from double-inserting the same primary key (JS event-loop atomicity makes the check+add race-safe without Convex schema changes) - extract_rows input accepts 1–5 URLs per call (was exactly 1) - buildInsertRowTool removed (dead code) - BIGSET_POPULATE_TARGET_ROWS env var wired through env.ts → workflow → buildPopulateAgent → buildExtractTool - patchMastraSanitizeToolCallInput called at startup in index.ts Co-Authored-By: Claude Sonnet 4.6 --- .env.example | 5 + backend/CLAUDE.md | 6 +- backend/src/index.ts | 7 + backend/src/mastra/agents/extract.ts | 95 +++ backend/src/mastra/agents/populate.ts | 127 ++-- backend/src/mastra/agents/triage-extract.ts | 121 ---- backend/src/mastra/tools/investigate-tool.ts | 716 ++++++++++++------- backend/src/mastra/tools/model-middleware.ts | 146 ++++ backend/src/mastra/workflows/populate.ts | 2 + 9 files changed, 782 insertions(+), 443 deletions(-) create mode 100644 backend/src/mastra/agents/extract.ts delete mode 100644 backend/src/mastra/agents/triage-extract.ts create mode 100644 backend/src/mastra/tools/model-middleware.ts diff --git a/.env.example b/.env.example index 7edaa64..6a397a2 100644 --- a/.env.example +++ b/.env.example @@ -23,6 +23,11 @@ OPENROUTER_API_KEY=sk-or-... # Generate at https://agent.tinyfish.ai/api-keys TINYFISH_API_KEY= +# Populate agent row cap (optional). The populate agent stops when this many +# fully-complete rows have been inserted. Defaults to 20 when unset. +# Increase for larger datasets; decrease for faster/cheaper test runs. +BIGSET_POPULATE_TARGET_ROWS=20 + # Generate once after the first `make dev` with: # docker compose exec convex ./generate_admin_key.sh # Used by the backend container to call internal Convex functions. diff --git a/backend/CLAUDE.md b/backend/CLAUDE.md index 68ee590..02fd93d 100644 --- a/backend/CLAUDE.md +++ b/backend/CLAUDE.md @@ -31,15 +31,15 @@ The pipeline is a pure function (`inferSchema(prompt) → DatasetSchema`). It is The populate pipeline uses three layers of agents, each with a narrow scope: -1. **Populate Orchestrator** (`src/mastra/agents/populate.ts`) — `buildPopulateAgent(authorizedDatasetId, authContext, columns, targetRows)`. Searches the web only; has no write tools. Dispatches URLs to triage-extract agents via `extract_rows`, tracks progress via `list_rows`. Runs 5 parallel searches for the first batch, up to 20 for subsequent batches. Stops when `targetRows` complete rows are reached or 2 consecutive stagnant batches occur. +1. **Populate Orchestrator** (`src/mastra/agents/populate.ts`) — `buildPopulateAgent(authorizedDatasetId, authContext, columns, targetRows)`. Per-iteration: (1) runs parallel searches, (2) batches qualifying URLs and calls `extract_rows` in parallel (up to 5 URLs per call), (3) calls `list_rows` once to see all rows and which are incomplete, (4) calls `investigate_entity` in parallel for every incomplete row. Stops when `targetRows` complete rows are reached or 2 consecutive stagnant iterations occur. -2. **Triage-Extract Agent** (`src/mastra/agents/triage-extract.ts`) — `buildTriageExtractAgent(columns, primaryKeyColumn, insertRowTool, updateRowByKeyTool, investigateEntityTool)`. Receives ONE URL, fetches it, classifies the page (extract_now / needs_browser_agent / needs_form_fill / low_value / blocked), extracts ALL matching entities, then dispatches `investigate_entity` for rows with missing columns. The triage step enables future routing to TinyFish browser agents or other specialized fetchers based on triage status. No `search_web` — fetch only. +2. **Extract Agent** (`src/mastra/agents/extract.ts`) — `buildExtractAgent(columns, primaryKeyColumn, batchInsertRowsTool)`. Receives a batch of 1–5 URLs. Fetches all pages in parallel, extracts every matching entity across all pages, and calls `batch_insert_rows` once with the full combined entity list. Returns leads for the orchestrator's next search round. No triage step, no investigation — purely fetch → extract → insert. 3. **Investigate Agent** (`src/mastra/agents/investigate.ts`) — `buildInvestigateAgent(columns, primaryKeyColumn, updateRowByKeyTool)`. Researches ONE specific entity to fill its missing columns. Has `search_web` + `fetch_page` + `update_row_by_key`. Returns structured output (`INSERTED: false / SUMMARY / CLUES / REASON`). ### Tool factories -- `src/mastra/tools/investigate-tool.ts` — `buildExtractTool(authorizedDatasetId, authContext, columns, targetRows)` returns `{ extractRowsTool, listRowsTool }`. Both tools share a single in-memory `rowIndex` (Map of primary-key → `{rowId, confidence, cells}`) that serves as the canonical state for the run — no Convex round-trip needed for deduplication checks. `extract_rows` dispatches one URL to a fresh triage-extract agent (maxSteps: 40); `list_rows` returns a compact text summary of all rows for the orchestrator. Also builds `investigate_entity` internally, which spawns a fresh investigate agent (maxSteps: 20) and shares the same `rowIndex`. +- `src/mastra/tools/investigate-tool.ts` — `buildExtractTool(authorizedDatasetId, authContext, columns, targetRows)` returns `{ extractRowsTool, listRowsTool, investigateEntityTool }`. All three share a single in-memory `rowIndex` (Map of primary-key → `{rowId, confidence, cells}`) and a `pendingInserts` Set. `extract_rows` dispatches a batch of 1–5 URLs to a fresh extract agent (maxSteps: 40); `list_rows` returns a compact text summary for the orchestrator; `investigate_entity` (exposed to the orchestrator, not to extract agents) spawns a fresh investigate agent (maxSteps: 20). `pendingInserts` prevents two parallel extract agents from double-inserting the same entity — the check+add is atomic in JS's single-threaded event loop. A global `Semaphore(10)` caps concurrent investigate agents. The rowIndex refresh loop at the start of each `extract_rows` call picks up rows written by other parallel agents since the last refresh. - `src/mastra/tools/dataset-tools.ts` — `buildPopulateTools(authorizedDatasetId, authContext)` factory returning 5 Convex-backed tools: `insert_row`, `list_rows`, `get_row`, `update_row`, `delete_row`. Not used by the populate agent itself — used by other callers. The dataset id is captured by closure so the LLM cannot redirect writes to other datasets; `authContext` (Clerk userId + workflow run id) is captured for caller-attribution in security logs and the `CAPABILITY_VIOLATION` PostHog event. See the security note at the top of the file. - `src/mastra/tools/web-tools.ts` — 2 TinyFish API tools: `search_web`, `fetch_page` diff --git a/backend/src/index.ts b/backend/src/index.ts index 34dbdbc..cd07d89 100644 --- a/backend/src/index.ts +++ b/backend/src/index.ts @@ -13,6 +13,13 @@ import { sendTransactionalEmail } from "./email/send.js"; import { datasetReadyTemplate } from "./email/templates/dataset-ready.js"; import { capture, shutdown as shutdownAnalytics } from "./analytics/posthog.js"; import { EVENTS } from "./analytics/events.js"; +import { patchMastraSanitizeToolCallInput } from "./mastra/tools/model-middleware.js"; + +// Patch JSON.parse globally so that double-encoded tool-call inputs from kimi-k2 +// (e.g. `"{"key":"val"}"` instead of `{"key":"val"}`) are recovered before +// Mastra's stream parser throws "Error converting tool call input to JSON". +// Must run before any agent or workflow is executed. +await patchMastraSanitizeToolCallInput(); /** Domain part of an email, for analytics (we never log full addresses). */ function emailDomain(email: string): string { diff --git a/backend/src/mastra/agents/extract.ts b/backend/src/mastra/agents/extract.ts new file mode 100644 index 0000000..186378e --- /dev/null +++ b/backend/src/mastra/agents/extract.ts @@ -0,0 +1,95 @@ +import { Agent } from "@mastra/core/agent"; +import { createOpenRouter } from "@openrouter/ai-sdk-provider"; +import { fetchPageTool } from "../tools/web-tools.js"; +import type { PopulateColumn } from "../../pipeline/populate.js"; + +const openrouter = createOpenRouter({ + apiKey: process.env.OPENROUTER_API_KEY!, +}); + +function buildExtractInstructions( + columns: PopulateColumn[], + primaryKeyColumn: string, +): string { + const columnNames = columns.map((c) => c.name); + const columnsDesc = columns + .map( + (c) => + `- "${c.name}" (${c.type})${c.description ? `: ${c.description}` : ""}`, + ) + .join("\n"); + + return `You receive a batch of URLs. Fetch all pages in parallel, extract every matching entity, and insert them in one call. + +━━ DATASET SCHEMA ━━ +Columns: +${columnsDesc} + +Primary key column: "${primaryKeyColumn}" +Tool call data/sources keys MUST be exactly: ${JSON.stringify(columnNames)} + +━━ STEP 1: FETCH (parallel) ━━ +Call fetch_page for ALL URLs simultaneously in a single response. +Wait for ALL fetches to complete before proceeding. + +━━ STEP 2: EXTRACT ━━ +Read the full content of every successfully fetched page. +Identify ALL entities that match the dataset schema across all pages. +If the same entity appears on multiple pages, prefer the most complete data +(use non-empty values from any page; do not discard data from secondary pages). + +━━ STEP 3: BATCH INSERT ━━ +Call batch_insert_rows ONCE with ALL entities combined from all pages. +- Include every entity you found — do not omit any. +- For columns you cannot confirm from any page, use "" — never fabricate. +- For every column you DO fill, record the source URL. +- If no matching entities were found on any page, skip this step. + +━━ RULES ━━ +1. REAL VALUES ONLY. Never fabricate — use "" for unverifiable columns. +2. SOURCE ATTRIBUTION. Record the source URL for every column you fill. +3. READ ALL PAGES FIRST. Identify all entities before calling batch_insert_rows. +4. ONE CALL ONLY. Call batch_insert_rows exactly once with all entities combined. + +━━ FINAL OUTPUT ━━ +After all work is done, write a summary with exactly these labels: + +LEADS: +SOURCE_QUALITY: `; +} + +/** + * Build a fresh extract Agent for one extract_rows call. + * + * The agent receives a batch of URLs, fetches all of them in parallel, + * extracts every matching entity across all pages, and calls batch_insert_rows + * once with the full combined entity list. It does NOT spawn investigation + * agents — that is the orchestrator's responsibility after list_rows. + * + * Tools: fetch_page, batch_insert_rows. + * No search capability — it only fetches the URLs provided. + * + * batch_insert_rows is passed in from the buildExtractTool closure so the + * shared rowIndex and pendingInserts are maintained across all agents in one + * workflow run. + * + * A fresh agent instance is constructed per extract_rows call; do not cache. + */ +export function buildExtractAgent( + columns: PopulateColumn[], + primaryKeyColumn: string, + batchInsertRowsTool: ReturnType, +): Agent { + return new Agent({ + id: "extract-agent", + name: "Dataset Extract Agent", + instructions: buildExtractInstructions(columns, primaryKeyColumn), + model: openrouter("moonshotai/kimi-k2-0905"), + tools: { + fetch_page: fetchPageTool, + batch_insert_rows: batchInsertRowsTool, + }, + }); +} diff --git a/backend/src/mastra/agents/populate.ts b/backend/src/mastra/agents/populate.ts index e7fa33b..bfe3871 100644 --- a/backend/src/mastra/agents/populate.ts +++ b/backend/src/mastra/agents/populate.ts @@ -14,73 +14,87 @@ function buildOrchestratorInstructions(targetRows: number): string { const currentYear = now.getFullYear(); const currentMonth = now.toLocaleString("en-US", { month: "long" }); - return `You fill datasets by searching the web and dispatching prioritized URLs to extraction agents. + return `You fill datasets by searching the web, dispatching extraction agents in parallel, then investigating incomplete rows. ━━ CURRENT DATE ━━ Today is ${currentMonth} ${currentYear} (${now.toISOString().slice(0, 10)}). Always use this when formulating time-sensitive search queries. -━━ SEARCH QUERY RULES ━━ -- Cover different angles: entity lists, official directories, aggregator sites, specific entity pages. -- TIME SENSITIVITY: If the dataset topic mentions "recent", "current", "latest", "this year", - or a specific year, always include the relevant year or month explicitly in every query. - Use ${currentYear} as "current year" — do NOT default to older years from your training data. - Examples: "YC W2025 batch companies list", "AI startups ${currentYear} funding", - "${currentMonth} ${currentYear} [topic] directory" - -━━ URL QUALITY THRESHOLD ━━ -After each search round, evaluate every result from search_web AND every URL mentioned in -extract_rows leads. Dispatch a URL if it clears ALL of these bars: -- Relevance: title or snippet names a matching entity, list, or directory for this dataset topic -- Data value: snippet suggests real column values are present (names, prices, dates, contacts, etc.) -- Source: official site, known directory, or reputable domain (not SEO spam or thin content) -- Novelty: not already dispatched in this run, and not clearly focused on entities already - marked COMPLETE in list_rows - -Do NOT apply a fixed count cap — dispatch every URL that passes the threshold. -Avoid dispatching multiple URLs that appear to cover the exact same set of entities. - -━━ 1. FIRST BATCH ━━ -Run exactly 5 searches in parallel. Wait for ALL results. -Dispatch all qualifying URLs from those results as parallel extract_rows calls (one URL per call). -Wait for ALL to complete, then call list_rows to check progress. - -━━ 2. ALL SUBSEQUENT BATCHES ━━ -Repeat until stop conditions are met: - a. Run up to 20 searches in parallel — combine leads from the previous extract_rows results - with new search angles. Use list_rows output to steer queries toward entity types not yet - in the dataset or with incomplete columns; avoid re-searching for entities already COMPLETE. - b. Dispatch all qualifying URLs (from search results AND extract_rows leads) as parallel - extract_rows calls (one URL per call). - c. Wait for ALL to complete, then call list_rows. - -DEDUPLICATION: Track every URL you dispatch to extract_rows. Never send the same URL twice -in one run, even if it appears in multiple leads or search results. - -━━ 5. STOP CONDITIONS ━━ +━━ PER-ITERATION FLOW ━━ +Each iteration has four phases. Complete all four before starting the next. + +PHASE 1 — SEARCH +Run searches in parallel (5 for the first iteration; up to 20 for subsequent ones). +Cover different angles: entity lists, official directories, aggregator sites, specific entity pages. +TIME SENSITIVITY: If the topic mentions "recent", "current", "latest", or a specific year, +include ${currentYear} (or the relevant year) explicitly in every query. +Examples: "YC W2025 batch companies list", "AI startups ${currentYear} funding", +"${currentMonth} ${currentYear} [topic] directory" + +PHASE 2 — EXTRACT (parallel) +Collect all qualifying URLs from search results AND from leads returned by previous extract_rows calls. +A URL qualifies if ALL of the following are true: + - Relevance: title or snippet names a matching entity, list, or directory for this dataset topic + - Data value: snippet suggests real column values are present (names, prices, dates, contacts, etc.) + - Source: official site, known directory, or reputable domain (not SEO spam or thin content) + - Novelty: not already dispatched in this run + +Track every URL you dispatch — never send the same URL twice in one run. +Avoid batches that clearly cover the exact same set of entities. + +Batch qualifying URLs into groups of up to 5 and call extract_rows for each group IN PARALLEL. +Wait for ALL extract_rows calls to finish before moving to Phase 3. + +PHASE 3 — REVIEW +Call list_rows exactly once. +Note the complete row count and which rows are INCOMPLETE (shown as INCOMPLETE — missing: ...). + +PHASE 4 — INVESTIGATE (parallel) +For every INCOMPLETE row in list_rows, call investigate_entity simultaneously in one response. +Do NOT wait for one investigate_entity to finish before calling the next — they run in parallel. +Do NOT call investigate_entity for rows already marked COMPLETE. + +Build the context for each investigate_entity call from: + - The row's partial data as shown in list_rows + - Relevant leads and URLs returned by extract_rows in Phase 2 + +Wait for ALL investigate_entity calls to finish before starting the next iteration. + +━━ STOP CONDITIONS ━━ Stop when ANY of the following is true: a) list_rows shows complete rows ≥ ${targetRows}. - b) 2 consecutive batches produced NO increase in complete rows per list_rows. - — "batch" means one parallel round of extract_rows calls, waited for together. - — Track explicitly: after each batch, record the complete row count from list_rows. - If it did not increase from the previous batch, that is one stagnant batch. - Two stagnant batches in a row → stop immediately. - -Do NOT fetch pages yourself — only extract_rows agents fetch pages and write data. -Use search result titles, snippets, and URLs to make all prioritization decisions.`; + b) 2 consecutive iterations produced NO increase in complete rows. + After each Phase 3, record the complete row count. + If it did not increase from the previous iteration, that is one stagnant iteration. + Two stagnant iterations in a row → stop immediately. + +━━ RULES ━━ +- Do NOT fetch pages yourself — extract_rows agents fetch pages and write data. +- Do NOT call investigate_entity for COMPLETE rows. +- Use search result titles and snippets to select URLs — do not fetch to evaluate. +- Do NOT apply a fixed URL count cap — dispatch every URL that passes the quality threshold.`; } /** * Build the orchestrator Agent for a populate run. * - * The orchestrator searches only — it has no fetch or write tools. - * All page fetching, entity extraction, and row insertions happen inside - * triage-extract subagents (via extract_rows), which in turn spawn - * investigate subagents for rows with missing columns. + * The orchestrator coordinates three layers per iteration: + * 1. Parallel web searches (search_web) to find candidate URLs. + * 2. Parallel extract_rows calls — each dispatches a batch of 1–5 URLs to a + * fresh extract agent that fetches all pages in parallel, extracts all + * matching entities, and inserts them via batch_insert_rows in one call. + * 3. list_rows to identify incomplete rows, then parallel investigate_entity + * calls — each spawns an investigate agent that searches the web and fills + * missing columns via update_row_by_key. + * + * The orchestrator has no write tools of its own — all dataset writes happen + * inside extract agents (batch_insert_rows) and investigate agents + * (update_row_by_key), both scoped to the authorized dataset via closure. * - * Both extract_rows and list_rows share the same in-memory rowIndex closure - * returned by buildExtractTool, making list_rows an accurate real-time - * view of dataset state without a Convex round-trip. + * extract_rows, list_rows, and investigate_entity all share the same in-memory + * rowIndex closure returned by buildExtractTool. A pendingInserts Set in that + * same closure prevents parallel extract agents from double-inserting the same + * entity without requiring Convex-level upsert logic. * * A fresh orchestrator is constructed per workflow run; do not cache. */ @@ -88,9 +102,9 @@ export function buildPopulateAgent( authorizedDatasetId: string, authContext: AuthContext, columns: PopulateColumn[], - targetRows: number = Number(process.env.BIGSET_POPULATE_TARGET_ROWS || "20"), + targetRows: number = 20, ): Agent { - const { extractRowsTool, listRowsTool } = buildExtractTool( + const { extractRowsTool, listRowsTool, investigateEntityTool } = buildExtractTool( authorizedDatasetId, authContext, columns, @@ -106,6 +120,7 @@ export function buildPopulateAgent( search_web: searchWebTool, extract_rows: extractRowsTool, list_rows: listRowsTool, + investigate_entity: investigateEntityTool, }, }); } diff --git a/backend/src/mastra/agents/triage-extract.ts b/backend/src/mastra/agents/triage-extract.ts deleted file mode 100644 index 7965f65..0000000 --- a/backend/src/mastra/agents/triage-extract.ts +++ /dev/null @@ -1,121 +0,0 @@ -import { Agent } from "@mastra/core/agent"; -import { createOpenRouter } from "@openrouter/ai-sdk-provider"; -import { fetchPageTool } from "../tools/web-tools.js"; -import type { PopulateColumn } from "../../pipeline/populate.js"; - -const openrouter = createOpenRouter({ - apiKey: process.env.OPENROUTER_API_KEY!, -}); - -function buildTriageExtractInstructions( - columns: PopulateColumn[], - primaryKeyColumn: string, -): string { - const columnNames = columns.map((c) => c.name); - const columnsDesc = columns - .map( - (c) => - `- "${c.name}" (${c.type})${c.description ? `: ${c.description}` : ""}`, - ) - .join("\n"); - - return `You are a triage-extract agent. You receive ONE source URL. -Fetch it, triage the page, and — if valuable — extract ALL matching entities as dataset rows. -Then dispatch investigation for any rows with missing or low-confidence columns. - -━━ DATASET SCHEMA ━━ -Columns: -${columnsDesc} - -Primary key column: "${primaryKeyColumn}" -Tool call data/sources keys MUST be exactly: ${JSON.stringify(columnNames)} - -━━ STEP 1: FETCH ━━ -Call fetch_page for the URL provided in the prompt. Do not search — fetch only this one URL. - -━━ STEP 2: TRIAGE ━━ -After fetching, classify the page with one of these statuses: -- extract_now: Readable content with entities matching the dataset schema. -- needs_browser_agent: Page requires JavaScript rendering, login, or browser interaction - (blank page, login wall, JS-rendered SPA with no content in the HTML). -- needs_form_fill: Page has a search form or requires user input before content appears. -- low_value: Page is accessible but contains no entities matching the dataset topic. -- blocked: 403, 404, paywall, CAPTCHA, or access denial. - -If NOT extract_now: skip steps 3–4 and go directly to FINAL OUTPUT. - -━━ STEP 3: EXTRACT ━━ -Read the FULL page content before writing any rows. -Identify ALL entities that match the dataset schema — do not stop after the first one. - -After reading the full page, write ALL rows: -1. Check the existing rows list in the prompt. -2. For each entity identified: - a. Primary key NOT in existing rows → call insert_row. - b. Primary key IS in existing rows with LOWER confidence than yours → call update_row_by_key. - c. Primary key IS in existing rows with EQUAL OR HIGHER confidence → skip. -3. For columns you cannot confirm from this page, use "" — never fabricate. -4. For every column you DO fill, record the source URL. - -━━ STEP 4: INVESTIGATE MISSING COLUMNS ━━ -After ALL inserts/updates are done, for each row that has one or more blank columns: -Call investigate_entity to dispatch an investigation agent for that row. - -Provide as much context as possible in each investigate_entity call: -- The specific missing column names -- Any partial hints you noticed (a URL seen on the page, a founding year mentioned, etc.) -- The original source URL where you found the entity - -The investigate agent will autonomously search and fill the gaps. -Prioritize rows with the most missing columns first. - -━━ RULES ━━ -1. REAL VALUES ONLY. Never fabricate — use "" for unverifiable columns. -2. SOURCE ATTRIBUTION. Record the URL for every column you fill. -3. READ THE FULL PAGE FIRST. Identify all entities before writing any rows. -4. NO SEARCHING. You only fetch the one URL provided — do not call search_web. - -━━ FINAL OUTPUT ━━ -After all work is done, write a natural language summary with exactly these labels: - -TRIAGE_STATUS: -TRIAGE_REASON: -LEADS: -SOURCE_QUALITY: `; -} - -/** - * Build a fresh triage-extract Agent for one extract_rows call. - * - * The agent fetches one URL, triages the page, extracts all matching entities, - * then dispatches investigate_entity for rows with missing columns. - * It has no search capability — it only fetches the provided URL. - * - * All write tools (insert_row, update_row_by_key, investigate_entity) are - * passed in from the buildExtractTool closure so the shared rowIndex is - * maintained across all agents in one workflow run. - * - * A fresh agent instance is constructed per extract_rows call; do not cache. - */ -export function buildTriageExtractAgent( - columns: PopulateColumn[], - primaryKeyColumn: string, - insertRowTool: ReturnType, - updateRowByKeyTool: ReturnType, - investigateEntityTool: ReturnType, -): Agent { - return new Agent({ - id: "triage-extract-agent", - name: "Dataset Triage-Extract Agent", - instructions: buildTriageExtractInstructions(columns, primaryKeyColumn), - model: openrouter("moonshotai/kimi-k2-0905"), - tools: { - fetch_page: fetchPageTool, - insert_row: insertRowTool, - update_row_by_key: updateRowByKeyTool, - investigate_entity: investigateEntityTool, - }, - }); -} diff --git a/backend/src/mastra/tools/investigate-tool.ts b/backend/src/mastra/tools/investigate-tool.ts index 48873bb..8ce3f88 100644 --- a/backend/src/mastra/tools/investigate-tool.ts +++ b/backend/src/mastra/tools/investigate-tool.ts @@ -1,7 +1,7 @@ import { createTool } from "@mastra/core/tools"; import { z } from "zod"; import { buildInvestigateAgent } from "../agents/investigate.js"; -import { buildTriageExtractAgent } from "../agents/triage-extract.js"; +import { buildExtractAgent } from "../agents/extract.js"; import type { AuthContext } from "../workflows/populate.js"; import type { PopulateColumn } from "../../pipeline/populate.js"; import { convex, internal } from "../../convex.js"; @@ -15,54 +15,26 @@ interface RowIndexEntry { cells: Record; } -// ─── Triage status ──────────────────────────────────────────────────────────── - -const TRIAGE_STATUSES = [ - "extract_now", - "needs_browser_agent", - "needs_form_fill", - "low_value", - "blocked", -] as const; -type TriageStatus = (typeof TRIAGE_STATUSES)[number]; - // ─── Output parsers ─────────────────────────────────────────────────────────── /** - * Parse structured keyword output from the triage-extract agent. - * Format: TRIAGE_STATUS / TRIAGE_REASON / LEADS / SOURCE_QUALITY labels. + * Parse LEADS / SOURCE_QUALITY keyword output from the extract agent. */ -function parseTriageExtractOutput(text: string): { - triage_status: TriageStatus; - triage_reason: string; +function parseExtractOutput(text: string): { leads: string; source_quality: string; } { - const statusMatch = text.match(/TRIAGE_STATUS:\s*(\S+)/i); - const reasonMatch = text.match( - /TRIAGE_REASON:\s*([\s\S]*?)(?=\nLEADS:|\nSOURCE_QUALITY:|$)/i, - ); - const leadsMatch = text.match( - /LEADS:\s*([\s\S]*?)(?=\nSOURCE_QUALITY:|$)/i, - ); + const leadsMatch = text.match(/LEADS:\s*([\s\S]*?)(?=\nSOURCE_QUALITY:|$)/i); const sourceMatch = text.match(/SOURCE_QUALITY:\s*([\s\S]*?)$/i); - const raw = statusMatch?.[1]?.toLowerCase().trim() ?? ""; - const triage_status: TriageStatus = ( - TRIAGE_STATUSES.includes(raw as TriageStatus) ? raw : "low_value" - ) as TriageStatus; - return { - triage_status, - triage_reason: reasonMatch?.[1]?.trim() ?? text.slice(0, 200), leads: leadsMatch?.[1]?.trim() ?? "", source_quality: sourceMatch?.[1]?.trim() ?? "", }; } /** - * Parse structured keyword output from the investigate agent. - * Format: INSERTED / SUMMARY / CLUES / REASON labels (matches main-branch pattern). + * Parse SUMMARY / CLUES / REASON keyword output from the investigate agent. */ function parseInvestigateOutput(text: string): { findings: string; @@ -106,10 +78,65 @@ function isRowComplete( }); } +// ─── Concurrency limiter ────────────────────────────────────────────────────── + +/** + * Maximum number of investigate_entity agents allowed to run concurrently + * within one workflow run. Shared across all parallel orchestrator calls via + * the buildExtractTool closure, preventing combinatorial explosion when the + * orchestrator emits many parallel investigate_entity calls simultaneously. + */ +const MAX_CONCURRENT_INVESTIGATIONS = 10; + +class Semaphore { + private remaining: number; + private readonly queue: Array<() => void> = []; + + constructor(max: number) { + this.remaining = max; + } + + acquire(): Promise { + if (this.remaining > 0) { + this.remaining--; + return Promise.resolve(); + } + return new Promise((resolve) => this.queue.push(resolve)); + } + + release(): void { + const next = this.queue.shift(); + if (next) { + next(); + } else { + this.remaining++; + } + } +} + // ─── Per-call tool builders ─────────────────────────────────────────────────── -function buildInsertRowTool( +/** + * Insert or update all entities found across a batch of pages in a single + * tool call. + * + * Deduplication strategy (in priority order): + * 1. Intra-batch: seenInBatch Set eliminates duplicate primary keys within + * the same call (first occurrence wins). + * 2. Cross-agent (in-flight): pendingInserts Set prevents two concurrent + * extract agents from both inserting the same primary key. Because + * JavaScript's event loop is single-threaded, the Set check + add is + * atomic across concurrent awaits — the second agent sees the key already + * claimed and skips to the skipped[] list. No Convex-level changes needed. + * 3. Existing rows: rowIndex gates insert vs. mergeUpdate (confidence-based). + * + * Returns needs_investigation listing every inserted/updated row that still + * has blank columns — the orchestrator calls investigate_entity for each + * after all extract_rows calls have completed. + */ +function buildBatchInsertRowsTool( rowIndex: Map, + pendingInserts: Set, authorizedDatasetId: string, logCtx: string, columns: PopulateColumn[], @@ -118,86 +145,226 @@ function buildInsertRowTool( const columnNames = columns.map((c) => c.name); return createTool({ - id: "insert_row", + id: "batch_insert_rows", description: - "Insert a new row into the dataset. " + - "Provide confidence (0–1: 1.0 = official primary source, 0.5 = aggregator, 0.2 = indirect mention), " + - "sources (column name → URL for every column you filled; \"\" if unverifiable), " + - "and data (column values; \"\" for columns you cannot verify). " + + "Insert or update ALL entities found across the fetched pages in a single call. " + + "New entities are inserted; entities already present with LOWER confidence are updated " + + "using per-field merge rules; entities with equal/higher confidence are skipped. " + + "Duplicate primary keys within the call are deduplicated automatically (first wins). " + + "Each entry needs primary_key, confidence (0–1: 1.0 = primary source, 0.5 = aggregator, " + + "0.2 = indirect), sources (column → URL; \"\" if unverifiable), and data (column values; " + + "\"\" for unverifiable columns). " + "Never fabricate values — leave blank instead.", inputSchema: z.object({ - primary_key: z - .string() - .describe( - `Value of the primary key column "${primaryKeyColumn}" — used for deduplication`, - ), - confidence: z - .number() - .min(0) - .max(1) - .describe("Source confidence 0–1"), - sources: z - .record(z.string(), z.string()) - .describe( - 'Map of column name → source URL for each column you filled. Use "" for unverifiable columns.', - ), - data: z - .record(z.string(), z.any()) - .describe( - `Object with exactly these keys: ${JSON.stringify(columnNames)}. Use "" for unverifiable columns.`, - ), + rows: z + .array( + z.object({ + primary_key: z + .string() + .describe( + `Value of the primary key column "${primaryKeyColumn}" — used for deduplication`, + ), + confidence: z + .number() + .min(0) + .max(1) + .describe( + "Source confidence 0–1 (1.0 = official primary source, 0.5 = aggregator, 0.2 = indirect mention)", + ), + sources: z + .record(z.string(), z.string()) + .describe( + 'Map of column name → source URL for each column you filled. Use "" for unverifiable columns.', + ), + data: z + .record(z.string(), z.any()) + .describe( + `Object with exactly these keys: ${JSON.stringify(columnNames)}. Use "" for unverifiable columns.`, + ), + }), + ) + .min(1) + .describe("Every entity found across all fetched pages — do not omit any"), }), outputSchema: z.object({ - success: z.boolean(), - rowId: z.string().optional(), - error: z.string().optional(), + inserted: z.array(z.string()).describe("Primary keys successfully inserted as new rows"), + updated: z.array(z.string()).describe("Primary keys updated — existed with lower confidence"), + skipped: z.array(z.string()).describe("Primary keys skipped — equal/higher confidence already on record, in-flight from a concurrent agent, or duplicate within this call"), + errors: z + .array(z.object({ primary_key: z.string(), error: z.string() })) + .describe("Primary keys that failed, with error messages"), + needs_investigation: z + .array( + z.object({ + primary_key: z.string(), + blank_columns: z.array(z.string()), + }), + ) + .describe( + "Rows that were inserted or updated but still have blank columns. " + + "The orchestrator will call investigate_entity for each after all extractions finish.", + ), }), - execute: async ({ primary_key, confidence, sources, data }) => { - if (!data || Object.keys(data).length === 0) - return { success: false, error: "data is required." }; - - const cleanedData = cleanDataKeys(data); - const enrichedData: Record = { - ...cleanedData, - _confidence: confidence, - _sources: sources, - }; - const sourceUrls = Array.from( - new Set(Object.values(sources).filter(Boolean)), - ); + execute: async ({ rows }) => { + const inserted: string[] = []; + const updated: string[] = []; + const skipped: string[] = []; + const errors: Array<{ primary_key: string; error: string }> = []; + const needs_investigation: Array<{ primary_key: string; blank_columns: string[] }> = []; + + // Intra-batch dedup: first occurrence of each primary key wins. + const seenInBatch = new Set(); + + for (const row of rows) { + const { primary_key, confidence, sources, data } = row; + + // 1. Intra-batch dedup + if (seenInBatch.has(primary_key)) { + skipped.push(primary_key); + continue; + } + seenInBatch.add(primary_key); - console.log( - `[insert_row] ${logCtx} pk="${primary_key}" confidence=${confidence} cols=${Object.keys(cleanedData).length}`, - ); - try { - const rowId = await convex.mutation(internal.datasetRows.insert, { - datasetId: authorizedDatasetId, - data: enrichedData, - sources: sourceUrls, - }); + if (!data || Object.keys(data).length === 0) { + errors.push({ primary_key, error: "data is required" }); + continue; + } - const cells: Record = {}; - for (const col of columns) cells[col.name] = cleanedData[col.name] ?? ""; - rowIndex.set(primary_key, { rowId: rowId as string, confidence, cells }); + const cleanedData = cleanDataKeys(data); + const existingEntry = rowIndex.get(primary_key); - return { success: true, rowId: rowId as string }; - } catch (err) { - const msg = err instanceof Error ? err.message : String(err); - console.error( - `[insert_row] Failed: ${logCtx} pk="${primary_key}" err=${msg}`, - ); - if (msg.includes("Quota") || msg.includes("quota")) - return { - success: false, - error: `Quota exceeded: ${msg}. Stop inserting rows for this billing period.`, - }; - if (msg.includes("validator")) - return { - success: false, - error: `Validation failed: ${msg}. Check that column keys are plain strings.`, - }; - return { success: false, error: `Insert failed: ${msg}` }; + if (existingEntry) { + // ── Update path: row already exists ──────────────────────────────── + if (confidence <= existingEntry.confidence) { + // Equal or higher confidence already on record — nothing to do. + skipped.push(primary_key); + continue; + } + + console.log( + `[batch_insert_rows] ${logCtx} pk="${primary_key}" updating ` + + `(confidence ${existingEntry.confidence.toFixed(2)}→${confidence.toFixed(2)})`, + ); + try { + await convex.mutation(internal.datasetRows.mergeUpdate, { + id: existingEntry.rowId as any, + expectedDatasetId: authorizedDatasetId, + newData: cleanedData, + newConfidence: confidence, + newSources: sources, + }); + + // Mirror the per-field merge in the local rowIndex. + const updatedCells: Record = { ...existingEntry.cells }; + for (const [col, val] of Object.entries(cleanedData)) { + if (col.startsWith("_")) continue; + if (val === null || val === undefined || val === "") continue; + const existingVal = updatedCells[col]; + const existingIsBlank = + existingVal === null || existingVal === undefined || existingVal === ""; + if (existingIsBlank || confidence > existingEntry.confidence) { + updatedCells[col] = val; + } + } + rowIndex.set(primary_key, { + rowId: existingEntry.rowId, + confidence: Math.max(existingEntry.confidence, confidence), + cells: updatedCells, + }); + + updated.push(primary_key); + + const blank_columns = columns + .filter((col) => { + const v = updatedCells[col.name]; + return v === null || v === undefined || v === ""; + }) + .map((col) => col.name); + if (blank_columns.length > 0) { + needs_investigation.push({ primary_key, blank_columns }); + } + } catch (err) { + const msg = err instanceof Error ? err.message : String(err); + console.error( + `[batch_insert_rows] Update failed: ${logCtx} pk="${primary_key}" err=${msg}`, + ); + errors.push({ primary_key, error: `Update failed: ${msg}` }); + } + continue; + } + + // ── Insert path: new row ────────────────────────────────────────────── + // 2. Cross-agent dedup via pendingInserts. + // The check + add is synchronous before any await — atomic in JS's + // single-threaded event loop. A second concurrent agent seeing this key + // in pendingInserts goes to skipped[]; it will appear in list_rows + // after the first agent's insert completes, and the orchestrator will + // spawn an investigate_entity for it if it has blank columns. + if (pendingInserts.has(primary_key)) { + skipped.push(primary_key); + continue; + } + pendingInserts.add(primary_key); + + const sourceUrls = Array.from(new Set(Object.values(sources).filter(Boolean))); + const enrichedData: Record = { + ...cleanedData, + _confidence: confidence, + _sources: sources, + }; + + try { + const rowId = await convex.mutation(internal.datasetRows.insert, { + datasetId: authorizedDatasetId, + data: enrichedData, + sources: sourceUrls, + }); + + const cells: Record = {}; + for (const col of columns) cells[col.name] = cleanedData[col.name] ?? ""; + rowIndex.set(primary_key, { rowId: rowId as string, confidence, cells }); + inserted.push(primary_key); + + const blank_columns = columns + .filter((col) => { + const v = cells[col.name]; + return v === null || v === undefined || v === ""; + }) + .map((col) => col.name); + if (blank_columns.length > 0) { + needs_investigation.push({ primary_key, blank_columns }); + } + } catch (err) { + const msg = err instanceof Error ? err.message : String(err); + console.error( + `[batch_insert_rows] Insert failed: ${logCtx} pk="${primary_key}" err=${msg}`, + ); + if (msg.includes("Quota") || msg.includes("quota")) { + errors.push({ + primary_key, + error: `Quota exceeded: ${msg}. Stop inserting rows for this billing period.`, + }); + pendingInserts.delete(primary_key); + break; + } + if (msg.includes("validator")) { + errors.push({ + primary_key, + error: `Validation failed: ${msg}. Check that column keys are plain strings.`, + }); + } else { + errors.push({ primary_key, error: `Insert failed: ${msg}` }); + } + } finally { + pendingInserts.delete(primary_key); + } } + + console.log( + `[batch_insert_rows] ${logCtx} inserted=${inserted.length} updated=${updated.length} ` + + `skipped=${skipped.length} errors=${errors.length} needs_investigation=${needs_investigation.length}`, + ); + return { inserted, updated, skipped, errors, needs_investigation }; }, }); } @@ -246,7 +413,7 @@ function buildUpdateRowByKeyTool( if (!existing) { return { success: false, - error: `"${primary_key}" not found. Use insert_row for new entities.`, + error: `"${primary_key}" not found. Use batch_insert_rows for new entities.`, }; } @@ -320,25 +487,36 @@ function buildUpdateRowByKeyTool( // ─── Main tool factory ──────────────────────────────────────────────────────── /** - * Build the extract_rows and list_rows tools scoped to one dataset. + * Build the extract_rows, list_rows, and investigate_entity tools scoped to + * one dataset and workflow run. * - * Both tools share the same rowIndex, which is the canonical in-memory - * state for this workflow run. All reads and writes go through this closure - * so deduplication and confidence-gated updates work across parallel calls. + * All three tools share a single in-memory rowIndex (Map of primary-key → + * {rowId, confidence, cells}) that serves as the canonical state for the run. * * extract_rows: - * Dispatches one URL to a triage-extract agent. The agent fetches the page, - * classifies it (extract_now / needs_browser_agent / etc.), extracts all - * matching entities, then spawns investigate_entity sub-agents for rows - * with missing columns. Returns triage metadata and natural language leads. + * Dispatches a batch of 1–5 URLs to a fresh extract agent. The agent + * fetches all pages in parallel, extracts all matching entities, and calls + * batch_insert_rows once with everything combined. Returns leads for the + * orchestrator's next search round. Multiple extract_rows calls run in + * parallel from the orchestrator. * * list_rows: - * Returns a compact text summary of all rows in the dataset — which are - * complete, which have missing columns, and their confidence levels. Used - * by the populate orchestrator to track progress and decide when to stop. + * Returns a compact text summary of all rows — complete, incomplete, and + * their confidence levels. Called by the orchestrator after each round of + * extract_rows calls to decide what to investigate and whether to stop. + * + * investigate_entity: + * Spawned directly by the orchestrator (not by extract agents) after + * list_rows reveals incomplete rows. Closes over the shared rowIndex and + * investigateSemaphore. Each invocation spawns a fresh investigate agent + * that searches the web and fills missing columns via update_row_by_key. + * A global Semaphore(10) caps concurrent investigate agents. * - * authorizedDatasetId and authContext are never exposed in tool schemas; - * they are captured by closure for Convex writes and security logging. + * pendingInserts: + * A Set shared across all parallel batch_insert_rows calls. Prevents two + * concurrent extract agents from both inserting the same primary key. The + * check + add is synchronous before any await — atomic in JS's + * single-threaded event loop. * * A fresh call to buildExtractTool per workflow run is required — do not * cache the returned tools across runs. @@ -348,15 +526,24 @@ export function buildExtractTool( authContext: AuthContext, columns: PopulateColumn[], targetRows: number = 20, -): { extractRowsTool: ReturnType; listRowsTool: ReturnType } { +): { + extractRowsTool: ReturnType; + listRowsTool: ReturnType; + investigateEntityTool: ReturnType; +} { const primaryKeyColumn = columns[0]?.name ?? ""; const columnNames = columns.map((c) => c.name); const logCtx = `user=${authContext.authorizedUserId} run=${authContext.workflowRunId} dataset=${authorizedDatasetId}`; - // Shared mutable state across all extract_rows and investigate_entity - // invocations in this workflow run. + // Shared mutable state for this workflow run. const rowIndex = new Map(); + // Prevents concurrent extract agents from double-inserting the same entity. + const pendingInserts = new Set(); + + // Caps total concurrent investigate_entity agents across the whole run. + const investigateSemaphore = new Semaphore(MAX_CONCURRENT_INVESTIGATIONS); + function countCompleteRows(): number { let n = 0; for (const { cells } of rowIndex.values()) { @@ -387,112 +574,116 @@ export function buildExtractTool( } // ── investigate_entity tool ───────────────────────────────────────────────── - // Built once per buildExtractTool call; closes over the shared rowIndex. - // Each invocation spawns a fresh investigate agent with its own step budget. - - function buildInvestigateEntityTool() { - return createTool({ - id: "investigate_entity", - description: - "Spawn an investigation agent to autonomously research a specific entity " + - "and fill its missing or low-confidence columns via web search and page fetching. " + - "Call this after inserting a row that has blank columns. " + - "Provide the primary key, the specific missing column names, and all context " + - "you gathered (hints, partial URLs, notes from the page) so the agent can target " + - "its searches effectively.", - inputSchema: z.object({ - primary_key: z - .string() - .describe("Primary key value of the row to investigate"), - missing_columns: z - .array(z.string()) - .describe( - "Names of columns that are blank or low-confidence — the agent's priority targets", - ), - context: z - .string() - .describe( - "Everything you know about this entity: partial data found, " + - "hints from the page, source URLs where you found it, " + - "any clues that might help targeted searches", - ), - }), - outputSchema: z.object({ - findings: z.string(), - leads: z.string(), - }), - execute: async ({ primary_key, missing_columns, context }) => { - const existing = rowIndex.get(primary_key); - if (!existing) { - return { - findings: `Row "${primary_key}" not found in dataset — cannot investigate.`, - leads: "", - }; - } + // Exposed directly to the orchestrator. Called after all extract_rows have + // finished and list_rows has identified which rows are incomplete. - const existingDataText = columnNames - .map( - (n) => - `${n}: ${JSON.stringify(existing.cells[n] ?? "")}${!existing.cells[n] && existing.cells[n] !== 0 ? " [MISSING]" : ""}`, - ) - .join(", "); + const investigateEntityTool = createTool({ + id: "investigate_entity", + description: + "Spawn an investigation agent to research a specific entity and fill its missing columns " + + "via web search and page fetching. " + + "Call this for every INCOMPLETE row shown in list_rows after all extract_rows have finished. " + + "Emit ALL investigate_entity calls simultaneously in one response — do not wait for one " + + "to finish before calling the next; they run in parallel. " + + "Provide the primary key, the missing column names, and all context you have " + + "(partial data from list_rows, relevant leads from extract_rows results).", + inputSchema: z.object({ + primary_key: z + .string() + .describe("Primary key value of the row to investigate"), + missing_columns: z + .array(z.string()) + .describe("Column names that are blank — the agent's priority targets"), + context: z + .string() + .describe( + "Everything known about this entity: partial data from list_rows, " + + "relevant leads or URLs from extract_rows results, any useful search hints", + ), + }), + outputSchema: z.object({ + findings: z.string(), + leads: z.string(), + }), + execute: async ({ primary_key, missing_columns, context }) => { + const existing = rowIndex.get(primary_key); + if (!existing) { + return { + findings: `Row "${primary_key}" not found in dataset — cannot investigate.`, + leads: "", + }; + } + // Fast-path: if the row is already complete per the in-memory index, + // skip without spawning an agent. Handles races where a parallel + // investigate_entity already filled this row. + if (isRowComplete(existing.cells, columns)) { console.log( - `[investigate_entity] ${logCtx} pk="${primary_key}" missing=${missing_columns.join(",")}`, + `[investigate_entity] ${logCtx} pk="${primary_key}" already complete — skipping`, ); + return { findings: "Row already complete — skipped", leads: "" }; + } - try { - // Build a fresh update tool for this investigation (shares rowIndex). - const updateTool = buildUpdateRowByKeyTool( - rowIndex, - authorizedDatasetId, - `${logCtx} investigate="${primary_key}"`, - columns, - ); - const agent = buildInvestigateAgent( - columns, - primaryKeyColumn, - updateTool, - ); + const existingDataText = columnNames + .map( + (n) => + `${n}: ${JSON.stringify(existing.cells[n] ?? "")}${!existing.cells[n] && existing.cells[n] !== 0 ? " [MISSING]" : ""}`, + ) + .join(", "); - const prompt = - `Research this entity: "${primary_key}"\n\n` + - `Currently known data: ${existingDataText}\n` + - `Missing columns to fill (priority): ${missing_columns.join(", ")}\n\n` + - `Context from extraction:\n${context}`; + console.log( + `[investigate_entity] ${logCtx} pk="${primary_key}" missing=${missing_columns.join(",")}`, + ); - const result = await agent.generate(prompt, { maxSteps: 20 }); - const parsed = parseInvestigateOutput(result.text); + const updateTool = buildUpdateRowByKeyTool( + rowIndex, + authorizedDatasetId, + `${logCtx} investigate="${primary_key}"`, + columns, + ); + const agent = buildInvestigateAgent(columns, primaryKeyColumn, updateTool); - console.log( - `[investigate_entity] done ${logCtx} pk="${primary_key}" steps=${result.steps?.length ?? "?"}`, - ); + const prompt = + `Research this entity: "${primary_key}"\n\n` + + `Currently known data: ${existingDataText}\n` + + `Missing columns to fill (priority): ${missing_columns.join(", ")}\n\n` + + `Context:\n${context}`; - return { findings: parsed.findings, leads: parsed.leads }; - } catch (err) { - const msg = err instanceof Error ? err.message : String(err); - console.error( - `[investigate_entity] error ${logCtx} pk="${primary_key}" err=${msg}`, - ); - return { - findings: `Investigation failed: ${msg}`, - leads: "", - }; - } - }, - }); - } + await investigateSemaphore.acquire(); + try { + const result = await agent.generate(prompt, { maxSteps: 20 }); + const parsed = parseInvestigateOutput(result.text); + + console.log( + `[investigate_entity] done ${logCtx} pk="${primary_key}" steps=${result.steps?.length ?? "?"}`, + ); + + return { findings: parsed.findings, leads: parsed.leads }; + } catch (err) { + const msg = err instanceof Error ? err.message : String(err); + console.error( + `[investigate_entity] error ${logCtx} pk="${primary_key}" err=${msg}`, + ); + return { + findings: `Investigation failed: ${msg}`, + leads: "", + }; + } finally { + investigateSemaphore.release(); + } + }, + }); // ── list_rows tool ────────────────────────────────────────────────────────── - // Reads the shared rowIndex and returns a compact summary for the orchestrator. const listRowsTool = createTool({ id: "list_rows", description: "Get a compact summary of all rows currently in the dataset — which are complete, " + "which have missing columns, and their confidence levels. " + - "Call this after each batch of extract_rows calls to track progress toward the target " + - "row count and decide whether to continue or stop.", + "Call this once after all extract_rows calls have finished. " + + "Use the output to spawn investigate_entity for every INCOMPLETE row, " + + "and to decide whether the stop conditions have been met.", inputSchema: z.object({}), outputSchema: z.object({ summary: z.string() }), execute: async () => { @@ -501,7 +692,7 @@ export function buildExtractTool( if (total === 0) return { summary: "No rows yet." }; const lines = [ - `${total} rows total (${complete} complete, ${total - complete} incomplete).`, + `${total} rows total (${complete} complete / ${targetRows} target, ${total - complete} incomplete).`, ]; for (const [pk, { cells, confidence }] of rowIndex.entries()) { const missing = columns @@ -527,19 +718,21 @@ export function buildExtractTool( const extractRowsTool = createTool({ id: "extract_rows", description: - "Dispatch ONE prioritized source URL to a triage-extract agent. " + - "The agent fetches the page, classifies it (extract_now / needs_browser_agent / " + - "needs_form_fill / low_value / blocked), extracts all matching entities, " + - "and automatically dispatches investigation for rows with missing columns. " + - "Returns triage metadata and natural language leads for your next dispatches.", + "Dispatch a batch of 1–5 source URLs to one extraction agent. " + + "The agent fetches all pages in parallel, extracts all matching entities across all pages, " + + "and inserts them in a single batch_insert_rows call. " + + "Returns leads for your next search round. " + + "Run multiple extract_rows calls in parallel for different URL batches — " + + "wait for ALL to finish before calling list_rows.", inputSchema: z.object({ source_urls: z .array(z.string()) .min(1) - .max(1) + .max(5) .describe( - "Exactly 1 URL from search results. " + - "Use title, snippet, and site name to pick the most relevant page.", + "1–5 qualifying URLs to process as one batch. " + + "Use title, snippet, and site name to select the most relevant pages. " + + "Group URLs by topic similarity for best extraction coherence.", ), context: z .string() @@ -551,42 +744,35 @@ export function buildExtractTool( .string() .optional() .describe( - "Hints from previous extraction results: URL patterns, source types that worked, etc.", + "Hints from previous extraction results: URL patterns, source types that worked well.", ), }), outputSchema: z.object({ - triage_status: z.enum([ - "extract_now", - "needs_browser_agent", - "needs_form_fill", - "low_value", - "blocked", - ]), - triage_reason: z.string(), leads: z.string(), source_quality: z.string(), }), execute: async ({ source_urls, context, notes }) => { console.log( - `[extract_rows] ${logCtx} url=${source_urls[0]} known_rows=${rowIndex.size}`, + `[extract_rows] ${logCtx} urls=${source_urls.length} known_rows=${rowIndex.size}`, ); - // Hard cap: if target is already reached, skip. + // Hard cap: if target is already reached, skip this batch. const completeAtStart = countCompleteRows(); if (completeAtStart >= targetRows) { console.log( `[extract_rows] ${logCtx} skipping — target already reached (${completeAtStart}/${targetRows})`, ); return { - triage_status: "low_value" as TriageStatus, - triage_reason: `Target row count (${targetRows}) already reached — skipping.`, leads: "", - source_quality: "", + source_quality: `Target row count (${targetRows}) already reached — skipped.`, }; } try { - // Refresh rowIndex from Convex for any rows added by parallel calls. + // Refresh rowIndex from Convex to pick up rows written by other + // parallel extract_rows calls or investigate_entity agents since the + // last refresh. Update EXISTING entries when Convex has higher-confidence + // data so countCompleteRows() and investigate pre-checks stay accurate. const currentRows = await convex.query( internal.datasetRows.listInternal, { datasetId: authorizedDatasetId }, @@ -594,54 +780,60 @@ export function buildExtractTool( for (const row of currentRows) { const d = row.data as Record; const pk = String(d[primaryKeyColumn] ?? ""); - if (!pk || rowIndex.has(pk)) continue; - const cells: Record = {}; - for (const col of columns) cells[col.name] = d[col.name] ?? ""; - rowIndex.set(pk, { - rowId: row._id as string, - confidence: typeof d._confidence === "number" ? d._confidence : 0.5, - cells, - }); + if (!pk) continue; + const convexConfidence = + typeof d._confidence === "number" ? d._confidence : 0.5; + const existingEntry = rowIndex.get(pk); + if (!existingEntry) { + const cells: Record = {}; + for (const col of columns) cells[col.name] = d[col.name] ?? ""; + rowIndex.set(pk, { + rowId: row._id as string, + confidence: convexConfidence, + cells, + }); + } else if (convexConfidence > existingEntry.confidence) { + const cells: Record = {}; + for (const col of columns) cells[col.name] = d[col.name] ?? ""; + rowIndex.set(pk, { + rowId: row._id as string, + confidence: convexConfidence, + cells, + }); + } } const existingRowsText = buildExistingRowsText(); - // Build per-call tools sharing the run-level rowIndex. - const insertRowTool = buildInsertRowTool( + // Build a fresh batch_insert_rows tool that shares the run-level + // rowIndex and pendingInserts closure. + const batchInsertRowsTool = buildBatchInsertRowsTool( rowIndex, + pendingInserts, authorizedDatasetId, logCtx, columns, primaryKeyColumn, ); - const updateRowByKeyTool = buildUpdateRowByKeyTool( - rowIndex, - authorizedDatasetId, - logCtx, - columns, - ); - const investigateEntityTool = buildInvestigateEntityTool(); - const sourceUrl = source_urls[0]; + const urlList = source_urls.map((u, i) => `${i + 1}. ${u}`).join("\n"); const notesBlock = notes ? `\nAdditional hints:\n${notes}` : ""; const prompt = - `Fetch and process this URL: ${sourceUrl}\n\n` + + `Fetch and extract from this batch of URLs:\n${urlList}\n\n` + `Context: ${context}${notesBlock}\n\n` + `Existing rows in the dataset:\n${existingRowsText}`; - const agent = buildTriageExtractAgent( + const agent = buildExtractAgent( columns, primaryKeyColumn, - insertRowTool, - updateRowByKeyTool, - investigateEntityTool, + batchInsertRowsTool, ); const result = await agent.generate(prompt, { maxSteps: 40 }); - const parsed = parseTriageExtractOutput(result.text); + const parsed = parseExtractOutput(result.text); console.log( - `[extract_rows] done ${logCtx} triage=${parsed.triage_status} ` + + `[extract_rows] done ${logCtx} urls=${source_urls.length} ` + `rows=${rowIndex.size} complete=${countCompleteRows()} steps=${result.steps?.length ?? "?"}`, ); @@ -650,14 +842,12 @@ export function buildExtractTool( const msg = err instanceof Error ? err.message : String(err); console.error(`[extract_rows] error ${logCtx} err=${msg}`); return { - triage_status: "blocked" as TriageStatus, - triage_reason: `Extraction agent failed: ${msg}`, leads: "", - source_quality: "", + source_quality: `Extraction agent failed: ${msg}`, }; } }, }); - return { extractRowsTool, listRowsTool }; + return { extractRowsTool, listRowsTool, investigateEntityTool }; } diff --git a/backend/src/mastra/tools/model-middleware.ts b/backend/src/mastra/tools/model-middleware.ts new file mode 100644 index 0000000..2f530b5 --- /dev/null +++ b/backend/src/mastra/tools/model-middleware.ts @@ -0,0 +1,146 @@ +import { wrapLanguageModel } from "ai"; + +/** + * Attempt to recover a double-encoded JSON tool-call input string. + * + * kimi-k2 via OpenRouter's non-streaming path sets + * `input = toolCall.function.arguments` + * without validating that the string is parseable JSON. When the model + * wraps its arguments in an extra pair of quotes (i.e. the `function.arguments` + * field is `"{"primary_key":"Pocket",...}"` instead of + * `{"primary_key":"Pocket",...}`), the string starts with `"{"` which is a + * JSON-encoded string literal — and JSON.parse then hits a trailing `}` or + * other garbage that makes the parse fail. + * + * Recovery strategy: find the first `{` and the last `}` in the raw string + * and extract that substring. If the substring is valid JSON, use it; + * otherwise leave the original string unchanged so the normal error path + * can still handle it. + */ +function tryUnwrapDoubleEncodedInput(raw: string): string { + // Only attempt recovery when the string starts with `"` — the hallmark of + // the double-encoding pattern. Normal JSON objects start with `{`. + if (!raw.startsWith('"')) return raw; + + const firstBrace = raw.indexOf("{"); + const lastBrace = raw.lastIndexOf("}"); + + if (firstBrace === -1 || lastBrace <= firstBrace) return raw; + + const candidate = raw.slice(firstBrace, lastBrace + 1); + try { + JSON.parse(candidate); + console.log( + `[model-middleware] Repaired double-encoded tool call input (recovered ${candidate.length} chars)`, + ); + return candidate; + } catch { + return raw; // Cannot repair — leave for Mastra's normal error path + } +} + +// ─── Approach 1: wrapLanguageModel middleware (intercepts at AI SDK stream level) ─ + +/** + * Wrap a language model with a stream middleware that repairs double-encoded + * tool-call inputs before Mastra processes them. + * + * kimi-k2 (via OpenRouter) occasionally wraps tool-call arguments in an extra + * JSON string layer. Mastra's `sanitizeToolCallInput` / `tryRepairJson` cannot + * recover this pattern, so the tool call silently drops (args = undefined). + * This middleware intercepts `tool-call` stream chunks and unwraps the extra + * layer so Mastra receives clean JSON. + * + * Usage: + * model: withToolCallRepair(openrouter("moonshotai/kimi-k2-0905")) + */ +export function withToolCallRepair(model: any): any { + return wrapLanguageModel({ + model, + middleware: { + wrapStream: async ({ doStream }: any) => { + console.log("[model-middleware] wrapStream called"); + const result = await doStream(); + const { stream, ...rest } = result; + + const fixedStream = stream.pipeThrough( + new TransformStream({ + transform(chunk: any, controller: any) { + if ( + chunk != null && + chunk.type === "tool-call" && + typeof chunk.input === "string" + ) { + console.log(`[model-middleware] tool-call chunk: ${chunk.toolName} input starts with: ${chunk.input.slice(0, 30)}`); + const fixedInput = tryUnwrapDoubleEncodedInput(chunk.input); + controller.enqueue({ ...chunk, input: fixedInput }); + } else { + controller.enqueue(chunk); + } + }, + }), + ); + + return { stream: fixedStream, ...rest }; + }, + }, + }); +} + +// ─── Approach 2: Startup monkey-patch for Mastra's sanitizeToolCallInput ──────── + +/** + * Patch Mastra's internal sanitizeToolCallInput to handle double-encoded JSON. + * + * This is a fallback that operates at a lower level than the wrapStream + * middleware. It patches the compiled Mastra module directly so that even if + * the wrapStream approach doesn't intercept a particular code path, the repair + * still happens before JSON.parse throws. + * + * Call this once at application startup (e.g. in src/index.ts before starting + * Fastify) so it takes effect for all subsequent agent runs. + */ +export async function patchMastraSanitizeToolCallInput(): Promise { + try { + // The chunk file that contains sanitizeToolCallInput is a private module + // inside @mastra/core. We use a dynamic import to access it so we can + // wrap its exported convertFullStreamChunkToMastra function. + // However, since sanitizeToolCallInput is internal and not exported, we + // patch the stream processing by intercepting at the AISDKV5InputStream + // level instead. + // + // Strategy: intercept JSON.parse within the Mastra module scope by + // wrapping the global JSON.parse to repair double-encoded inputs when + // called from Mastra's tool call processing context. + const originalJsonParse = JSON.parse; + (JSON as any).parse = function patchedJsonParse(text: string, ...rest: any[]) { + try { + return originalJsonParse.call(this, text, ...rest); + } catch (err) { + // If JSON.parse fails on a string that starts with `"`, try the + // double-encoding recovery: extract the JSON object between the + // first `{` and last `}`. + if (typeof text === "string" && text.startsWith('"')) { + const firstBrace = text.indexOf("{"); + const lastBrace = text.lastIndexOf("}"); + if (firstBrace !== -1 && lastBrace > firstBrace) { + const candidate = text.slice(firstBrace, lastBrace + 1); + try { + const recovered = originalJsonParse.call(this, candidate, ...rest); + console.log( + `[model-middleware/patch] Recovered double-encoded JSON (${candidate.length} chars): ${candidate.slice(0, 60)}...`, + ); + return recovered; + } catch { + // Recovery also failed — re-throw the original error + } + } + } + throw err; + } + }; + console.log("[model-middleware] JSON.parse patched to recover double-encoded tool call inputs"); + } catch (err) { + console.warn("[model-middleware] Failed to patch JSON.parse:", err); + } +} diff --git a/backend/src/mastra/workflows/populate.ts b/backend/src/mastra/workflows/populate.ts index 72b1ecb..228a339 100644 --- a/backend/src/mastra/workflows/populate.ts +++ b/backend/src/mastra/workflows/populate.ts @@ -3,6 +3,7 @@ import { z } from "zod"; import { datasetContextSchema, populateColumnSchema } from "../../pipeline/populate.js"; import { convex, internal } from "../../convex.js"; import { buildPopulateAgent } from "../agents/populate.js"; +import { env } from "../../env.js"; /** * Server-set auth/run context threaded through every step. @@ -118,6 +119,7 @@ const agentStep = createStep({ inputData.authorizedDatasetId, inputData.authContext, inputData.columns, + env.POPULATE_TARGET_ROWS, ); try { const result = await agent.generate(inputData.prompt, { maxSteps: 80 }); From af551580eb39c9a045b5fc9f87d538c2a1d963ae Mon Sep 17 00:00:00 2001 From: MMeteorL Date: Wed, 27 May 2026 01:37:05 -0700 Subject: [PATCH 06/10] Switch model to deepseek-v4-pro and revert extract agent to 1 URL per call MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - All three agents (orchestrator, extract, investigate) now use deepseek/deepseek-v4-pro via OpenRouter - extract_rows input reverted to exactly 1 URL (max(1)) — the orchestrator dispatches one parallel extract_rows call per URL; no batching needed since the orchestrator handles investigate_entity directly - Extract agent instructions updated accordingly (single fetch, single page) - Orchestrator instructions updated: "one URL per call" instead of batches Co-Authored-By: Claude Sonnet 4.6 --- backend/src/mastra/agents/extract.ts | 33 +++++++++----------- backend/src/mastra/agents/investigate.ts | 2 +- backend/src/mastra/agents/populate.ts | 4 +-- backend/src/mastra/tools/investigate-tool.ts | 20 ++++++------ 4 files changed, 27 insertions(+), 32 deletions(-) diff --git a/backend/src/mastra/agents/extract.ts b/backend/src/mastra/agents/extract.ts index 186378e..e86c44a 100644 --- a/backend/src/mastra/agents/extract.ts +++ b/backend/src/mastra/agents/extract.ts @@ -19,7 +19,7 @@ function buildExtractInstructions( ) .join("\n"); - return `You receive a batch of URLs. Fetch all pages in parallel, extract every matching entity, and insert them in one call. + return `You receive one URL. Fetch the page, extract every matching entity, and insert them in one call. ━━ DATASET SCHEMA ━━ Columns: @@ -28,27 +28,24 @@ ${columnsDesc} Primary key column: "${primaryKeyColumn}" Tool call data/sources keys MUST be exactly: ${JSON.stringify(columnNames)} -━━ STEP 1: FETCH (parallel) ━━ -Call fetch_page for ALL URLs simultaneously in a single response. -Wait for ALL fetches to complete before proceeding. +━━ STEP 1: FETCH ━━ +Call fetch_page for the URL provided in the prompt. ━━ STEP 2: EXTRACT ━━ -Read the full content of every successfully fetched page. -Identify ALL entities that match the dataset schema across all pages. -If the same entity appears on multiple pages, prefer the most complete data -(use non-empty values from any page; do not discard data from secondary pages). +Read the full page content. +Identify ALL entities that match the dataset schema — do not stop after the first one. ━━ STEP 3: BATCH INSERT ━━ -Call batch_insert_rows ONCE with ALL entities combined from all pages. +Call batch_insert_rows ONCE with ALL entities found on the page. - Include every entity you found — do not omit any. -- For columns you cannot confirm from any page, use "" — never fabricate. +- For columns you cannot confirm from this page, use "" — never fabricate. - For every column you DO fill, record the source URL. -- If no matching entities were found on any page, skip this step. +- If no matching entities were found, skip this step. ━━ RULES ━━ 1. REAL VALUES ONLY. Never fabricate — use "" for unverifiable columns. 2. SOURCE ATTRIBUTION. Record the source URL for every column you fill. -3. READ ALL PAGES FIRST. Identify all entities before calling batch_insert_rows. +3. READ THE FULL PAGE FIRST. Identify all entities before calling batch_insert_rows. 4. ONE CALL ONLY. Call batch_insert_rows exactly once with all entities combined. ━━ FINAL OUTPUT ━━ @@ -57,16 +54,16 @@ After all work is done, write a summary with exactly these labels: LEADS: -SOURCE_QUALITY: `; +SOURCE_QUALITY: `; } /** * Build a fresh extract Agent for one extract_rows call. * - * The agent receives a batch of URLs, fetches all of them in parallel, - * extracts every matching entity across all pages, and calls batch_insert_rows - * once with the full combined entity list. It does NOT spawn investigation - * agents — that is the orchestrator's responsibility after list_rows. + * The agent receives one URL, fetches the page, extracts every matching + * entity, and calls batch_insert_rows once with the full entity list. + * It does NOT spawn investigation agents — that is the orchestrator's + * responsibility after list_rows. * * Tools: fetch_page, batch_insert_rows. * No search capability — it only fetches the URLs provided. @@ -86,7 +83,7 @@ export function buildExtractAgent( id: "extract-agent", name: "Dataset Extract Agent", instructions: buildExtractInstructions(columns, primaryKeyColumn), - model: openrouter("moonshotai/kimi-k2-0905"), + model: openrouter("deepseek/deepseek-v4-pro"), tools: { fetch_page: fetchPageTool, batch_insert_rows: batchInsertRowsTool, diff --git a/backend/src/mastra/agents/investigate.ts b/backend/src/mastra/agents/investigate.ts index aa1d41d..f256d71 100644 --- a/backend/src/mastra/agents/investigate.ts +++ b/backend/src/mastra/agents/investigate.ts @@ -85,7 +85,7 @@ export function buildInvestigateAgent( id: "investigate-agent", name: "Dataset Investigate Agent", instructions: buildInvestigateInstructions(columns, primaryKeyColumn), - model: openrouter("moonshotai/kimi-k2-0905"), + model: openrouter("deepseek/deepseek-v4-pro"), tools: { search_web: searchWebTool, fetch_page: fetchPageTool, diff --git a/backend/src/mastra/agents/populate.ts b/backend/src/mastra/agents/populate.ts index bfe3871..9ccf3a1 100644 --- a/backend/src/mastra/agents/populate.ts +++ b/backend/src/mastra/agents/populate.ts @@ -42,7 +42,7 @@ A URL qualifies if ALL of the following are true: Track every URL you dispatch — never send the same URL twice in one run. Avoid batches that clearly cover the exact same set of entities. -Batch qualifying URLs into groups of up to 5 and call extract_rows for each group IN PARALLEL. +Call extract_rows for each qualifying URL IN PARALLEL (one URL per call). Wait for ALL extract_rows calls to finish before moving to Phase 3. PHASE 3 — REVIEW @@ -115,7 +115,7 @@ export function buildPopulateAgent( id: "populate-agent", name: "Dataset Populate Orchestrator", instructions: buildOrchestratorInstructions(targetRows), - model: openrouter("moonshotai/kimi-k2-0905"), + model: openrouter("deepseek/deepseek-v4-pro"), tools: { search_web: searchWebTool, extract_rows: extractRowsTool, diff --git a/backend/src/mastra/tools/investigate-tool.ts b/backend/src/mastra/tools/investigate-tool.ts index 8ce3f88..664d93b 100644 --- a/backend/src/mastra/tools/investigate-tool.ts +++ b/backend/src/mastra/tools/investigate-tool.ts @@ -718,21 +718,20 @@ export function buildExtractTool( const extractRowsTool = createTool({ id: "extract_rows", description: - "Dispatch a batch of 1–5 source URLs to one extraction agent. " + - "The agent fetches all pages in parallel, extracts all matching entities across all pages, " + + "Dispatch one source URL to an extraction agent. " + + "The agent fetches the page, extracts all matching entities, " + "and inserts them in a single batch_insert_rows call. " + "Returns leads for your next search round. " + - "Run multiple extract_rows calls in parallel for different URL batches — " + + "Run multiple extract_rows calls in parallel for different URLs — " + "wait for ALL to finish before calling list_rows.", inputSchema: z.object({ source_urls: z .array(z.string()) .min(1) - .max(5) + .max(1) .describe( - "1–5 qualifying URLs to process as one batch. " + - "Use title, snippet, and site name to select the most relevant pages. " + - "Group URLs by topic similarity for best extraction coherence.", + "Exactly 1 qualifying URL to process. " + + "Use title, snippet, and site name to pick the most relevant page.", ), context: z .string() @@ -753,7 +752,7 @@ export function buildExtractTool( }), execute: async ({ source_urls, context, notes }) => { console.log( - `[extract_rows] ${logCtx} urls=${source_urls.length} known_rows=${rowIndex.size}`, + `[extract_rows] ${logCtx} url=${source_urls[0]} known_rows=${rowIndex.size}`, ); // Hard cap: if target is already reached, skip this batch. @@ -816,10 +815,9 @@ export function buildExtractTool( primaryKeyColumn, ); - const urlList = source_urls.map((u, i) => `${i + 1}. ${u}`).join("\n"); const notesBlock = notes ? `\nAdditional hints:\n${notes}` : ""; const prompt = - `Fetch and extract from this batch of URLs:\n${urlList}\n\n` + + `Fetch and extract from this URL: ${source_urls[0]}\n\n` + `Context: ${context}${notesBlock}\n\n` + `Existing rows in the dataset:\n${existingRowsText}`; @@ -833,7 +831,7 @@ export function buildExtractTool( const parsed = parseExtractOutput(result.text); console.log( - `[extract_rows] done ${logCtx} urls=${source_urls.length} ` + + `[extract_rows] done ${logCtx} url=${source_urls[0]} ` + `rows=${rowIndex.size} complete=${countCompleteRows()} steps=${result.steps?.length ?? "?"}`, ); From 6d1333a79138ce8cfbffc25b4095f1303e66ea2b Mon Sep 17 00:00:00 2001 From: MMeteorL Date: Wed, 27 May 2026 05:03:30 -0700 Subject: [PATCH 07/10] perf: reduce extract agent step ceiling and trim context to speed up iterations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - maxSteps: 40 → 20 for extract agents — prevents paginated browse pages (metacritic /browse/) from spiraling to 38-40 steps (~10 min per call); they now exit at 20 steps and return LEADS for the next iteration instead - Replace full buildExistingRowsText() dump with a compact count + 30-key sample in each extract agent prompt — row dedup is handled by the tool layer (rowIndex + pendingInserts), not by the agent, so the full 300-row dump was wasted context that inflated model processing time - Add URL QUALITY guidance to orchestrator Phase 2 instructions: prefer single-page editorial/list sources (Wikipedia, "best of", rankings); avoid paginated browse/catalog URLs (/browse/, ?page=, ?sort=, etc.) Co-Authored-By: Claude Sonnet 4.6 --- backend/CLAUDE.md | 4 ++-- backend/src/mastra/agents/populate.ts | 13 +++++++++++-- backend/src/mastra/tools/investigate-tool.ts | 16 ++++++++++++++-- 3 files changed, 27 insertions(+), 6 deletions(-) diff --git a/backend/CLAUDE.md b/backend/CLAUDE.md index 02fd93d..0b17887 100644 --- a/backend/CLAUDE.md +++ b/backend/CLAUDE.md @@ -33,13 +33,13 @@ The populate pipeline uses three layers of agents, each with a narrow scope: 1. **Populate Orchestrator** (`src/mastra/agents/populate.ts`) — `buildPopulateAgent(authorizedDatasetId, authContext, columns, targetRows)`. Per-iteration: (1) runs parallel searches, (2) batches qualifying URLs and calls `extract_rows` in parallel (up to 5 URLs per call), (3) calls `list_rows` once to see all rows and which are incomplete, (4) calls `investigate_entity` in parallel for every incomplete row. Stops when `targetRows` complete rows are reached or 2 consecutive stagnant iterations occur. -2. **Extract Agent** (`src/mastra/agents/extract.ts`) — `buildExtractAgent(columns, primaryKeyColumn, batchInsertRowsTool)`. Receives a batch of 1–5 URLs. Fetches all pages in parallel, extracts every matching entity across all pages, and calls `batch_insert_rows` once with the full combined entity list. Returns leads for the orchestrator's next search round. No triage step, no investigation — purely fetch → extract → insert. +2. **Extract Agent** (`src/mastra/agents/extract.ts`) — `buildExtractAgent(columns, primaryKeyColumn, batchInsertRowsTool)`. Receives exactly 1 URL. Fetches the page, extracts every matching entity, and calls `batch_insert_rows` once. Returns LEADS/SOURCE_QUALITY for the orchestrator's next search round. No triage step, no investigation — purely fetch → extract → insert. Orchestrator instructions prefer single-page editorial sources over paginated browse directories to avoid multi-fetch spirals. 3. **Investigate Agent** (`src/mastra/agents/investigate.ts`) — `buildInvestigateAgent(columns, primaryKeyColumn, updateRowByKeyTool)`. Researches ONE specific entity to fill its missing columns. Has `search_web` + `fetch_page` + `update_row_by_key`. Returns structured output (`INSERTED: false / SUMMARY / CLUES / REASON`). ### Tool factories -- `src/mastra/tools/investigate-tool.ts` — `buildExtractTool(authorizedDatasetId, authContext, columns, targetRows)` returns `{ extractRowsTool, listRowsTool, investigateEntityTool }`. All three share a single in-memory `rowIndex` (Map of primary-key → `{rowId, confidence, cells}`) and a `pendingInserts` Set. `extract_rows` dispatches a batch of 1–5 URLs to a fresh extract agent (maxSteps: 40); `list_rows` returns a compact text summary for the orchestrator; `investigate_entity` (exposed to the orchestrator, not to extract agents) spawns a fresh investigate agent (maxSteps: 20). `pendingInserts` prevents two parallel extract agents from double-inserting the same entity — the check+add is atomic in JS's single-threaded event loop. A global `Semaphore(10)` caps concurrent investigate agents. The rowIndex refresh loop at the start of each `extract_rows` call picks up rows written by other parallel agents since the last refresh. +- `src/mastra/tools/investigate-tool.ts` — `buildExtractTool(authorizedDatasetId, authContext, columns, targetRows)` returns `{ extractRowsTool, listRowsTool, investigateEntityTool }`. All three share a single in-memory `rowIndex` (Map of primary-key → `{rowId, confidence, cells}`) and a `pendingInserts` Set. `extract_rows` dispatches one URL to a fresh extract agent (maxSteps: 20); the extract agent prompt receives only a compact row summary (count + 30 sample primary keys) rather than the full row dump — dedup is handled by the tool, not the agent. `list_rows` returns a compact text summary for the orchestrator; `investigate_entity` (exposed to the orchestrator, not to extract agents) spawns a fresh investigate agent (maxSteps: 20). `pendingInserts` prevents two parallel extract agents from double-inserting the same entity — the check+add is atomic in JS's single-threaded event loop. A global `Semaphore(10)` caps concurrent investigate agents. The rowIndex refresh loop at the start of each `extract_rows` call picks up rows written by other parallel agents since the last refresh. - `src/mastra/tools/dataset-tools.ts` — `buildPopulateTools(authorizedDatasetId, authContext)` factory returning 5 Convex-backed tools: `insert_row`, `list_rows`, `get_row`, `update_row`, `delete_row`. Not used by the populate agent itself — used by other callers. The dataset id is captured by closure so the LLM cannot redirect writes to other datasets; `authContext` (Clerk userId + workflow run id) is captured for caller-attribution in security logs and the `CAPABILITY_VIOLATION` PostHog event. See the security note at the top of the file. - `src/mastra/tools/web-tools.ts` — 2 TinyFish API tools: `search_web`, `fetch_page` diff --git a/backend/src/mastra/agents/populate.ts b/backend/src/mastra/agents/populate.ts index 9ccf3a1..8a06d41 100644 --- a/backend/src/mastra/agents/populate.ts +++ b/backend/src/mastra/agents/populate.ts @@ -32,12 +32,21 @@ Examples: "YC W2025 batch companies list", "AI startups ${currentYear} funding", "${currentMonth} ${currentYear} [topic] directory" PHASE 2 — EXTRACT (parallel) -Collect all qualifying URLs from search results AND from leads returned by previous extract_rows calls. +Collect up to ${targetRows / 4} qualifying URLs from search results AND from leads returned by previous extract_rows calls. A URL qualifies if ALL of the following are true: - Relevance: title or snippet names a matching entity, list, or directory for this dataset topic - Data value: snippet suggests real column values are present (names, prices, dates, contacts, etc.) - Source: official site, known directory, or reputable domain (not SEO spam or thin content) - - Novelty: not already dispatched in this run + - Novelty: different from previous URLs dispatched in this run, likely to provide new information + +URL QUALITY — prefer fast, single-page sources: + PREFER: editorial lists ("best of", "top N", rankings), Wikipedia list pages, curated directories + that show all data on ONE page (e.g. en.wikipedia.org/wiki/List_of_...). + AVOID: paginated browse/catalog pages that require many page loads to see all entities. + Signs of pagination: URLs with /browse/, /all/, /catalog/, page numbers, ?page=, ?sort=, + ?offset= or similar. These are slow (many fetches) and block Phase 3. + Only dispatch paginated URLs when no single-page alternative is available, and prefer + a specific internal page (e.g. page 1 only) over the browse root. Track every URL you dispatch — never send the same URL twice in one run. Avoid batches that clearly cover the exact same set of entities. diff --git a/backend/src/mastra/tools/investigate-tool.ts b/backend/src/mastra/tools/investigate-tool.ts index 664d93b..b862a94 100644 --- a/backend/src/mastra/tools/investigate-tool.ts +++ b/backend/src/mastra/tools/investigate-tool.ts @@ -802,7 +802,19 @@ export function buildExtractTool( } } - const existingRowsText = buildExistingRowsText(); + // Compact existing-rows context: the extract agent cannot meaningfully + // act on a 300-row dump, and sending it inflates the prompt for every + // parallel extract call. Row-level dedup is handled by batch_insert_rows + // (rowIndex + pendingInserts), so the agent only needs a count + a short + // sample of known primary keys to orient its extraction. + const complete = countCompleteRows(); + const knownKeys = Array.from(rowIndex.keys()).slice(0, 30); + const existingRowsText = + rowIndex.size === 0 + ? "None yet." + : `${rowIndex.size} rows collected so far (${complete} complete). ` + + `Sample of known primary keys (do NOT re-insert these): ${knownKeys.join(", ")}` + + (rowIndex.size > 30 ? ` … and ${rowIndex.size - 30} more.` : "."); // Build a fresh batch_insert_rows tool that shares the run-level // rowIndex and pendingInserts closure. @@ -827,7 +839,7 @@ export function buildExtractTool( batchInsertRowsTool, ); - const result = await agent.generate(prompt, { maxSteps: 40 }); + const result = await agent.generate(prompt, { maxSteps: 20 }); const parsed = parseExtractOutput(result.text); console.log( From 9bd0522eee8c22693ca4837e409a0c46dfe7ca32 Mon Sep 17 00:00:00 2001 From: MMeteorL Date: Wed, 27 May 2026 13:29:04 -0700 Subject: [PATCH 08/10] perf: tighten agent budgets, hard-cap extract/investigate batches, raise orchestrator steps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extract agent (agents/extract.ts): - Hard budget of 2 tool calls: exactly 1 fetch_page + 1 batch_insert_rows - Explicit: no pagination, no following links — add pagination URLs to LEADS instead - HARD BUDGET section at top of instructions so the constraint is unmissable Investigate agent (agents/investigate.ts): - Remove second search round ("if first round didn't fill everything, search more") - Procedure is now: 1-2 parallel searches → 1-2 fetches → 1 update_row_by_key → done - Borrowed from main branch: shorter, more decisive, no retry loop investigate-tool.ts: - Extract agent maxSteps: 20 → 5 (2 tool calls + 3 buffer) - Investigate agent maxSteps: 20 → 8 (search + fetch + update + buffer) Orchestrator (agents/populate.ts): - Phase 2: hard cap of ceil(targetRows/4) extract_rows calls per iteration (was soft "up to" guidance — LLM was ignoring it) - Phase 4: hard batch of 20 investigate_entity calls per iteration, emitted in a single parallel response, prioritised by fewest missing columns - Both caps are called out as HARD LIMITS in the instructions and the RULES block Workflow (workflows/populate.ts): - Orchestrator maxSteps: 80 → 150 (headroom for 4+ iterations at ~31 steps each) Co-Authored-By: Claude Sonnet 4.6 --- backend/src/mastra/agents/extract.ts | 50 ++++++------- backend/src/mastra/agents/investigate.ts | 43 +++++------ backend/src/mastra/agents/populate.ts | 75 ++++++++++---------- backend/src/mastra/tools/investigate-tool.ts | 8 ++- backend/src/mastra/workflows/populate.ts | 9 ++- 5 files changed, 94 insertions(+), 91 deletions(-) diff --git a/backend/src/mastra/agents/extract.ts b/backend/src/mastra/agents/extract.ts index e86c44a..fa2234c 100644 --- a/backend/src/mastra/agents/extract.ts +++ b/backend/src/mastra/agents/extract.ts @@ -19,7 +19,19 @@ function buildExtractInstructions( ) .join("\n"); - return `You receive one URL. Fetch the page, extract every matching entity, and insert them in one call. + return `You receive exactly ONE URL. Your entire job fits in 2 tool calls. + +━━ HARD BUDGET ━━ +Tool call 1: fetch_page — call it ONCE for the URL in your prompt. +Tool call 2: batch_insert_rows — call it ONCE with every entity you found. +That's it. 2 tool calls total. Do not make any other tool calls. + +━━ STRICT CONSTRAINTS ━━ +- Do NOT call fetch_page more than once. No pagination. No following links. + If the page is paginated, extract only what is on the first response. + Add the other page URLs (e.g. ?page=2) to LEADS — do not fetch them yourself. +- Do NOT call batch_insert_rows more than once. +- If no matching entities were found, skip batch_insert_rows entirely and go straight to FINAL OUTPUT. ━━ DATASET SCHEMA ━━ Columns: @@ -28,33 +40,21 @@ ${columnsDesc} Primary key column: "${primaryKeyColumn}" Tool call data/sources keys MUST be exactly: ${JSON.stringify(columnNames)} -━━ STEP 1: FETCH ━━ -Call fetch_page for the URL provided in the prompt. - -━━ STEP 2: EXTRACT ━━ -Read the full page content. -Identify ALL entities that match the dataset schema — do not stop after the first one. - -━━ STEP 3: BATCH INSERT ━━ -Call batch_insert_rows ONCE with ALL entities found on the page. -- Include every entity you found — do not omit any. -- For columns you cannot confirm from this page, use "" — never fabricate. -- For every column you DO fill, record the source URL. -- If no matching entities were found, skip this step. - -━━ RULES ━━ -1. REAL VALUES ONLY. Never fabricate — use "" for unverifiable columns. -2. SOURCE ATTRIBUTION. Record the source URL for every column you fill. -3. READ THE FULL PAGE FIRST. Identify all entities before calling batch_insert_rows. -4. ONE CALL ONLY. Call batch_insert_rows exactly once with all entities combined. +━━ PROCEDURE ━━ +1. Call fetch_page for the URL in your prompt. (tool call 1) +2. Read the content. Extract every entity that matches the schema. + - Use "" for any column you cannot confirm from this page. Never fabricate. + - Record the page URL as source for every column you fill. +3. Call batch_insert_rows with all entities in one call. (tool call 2) +4. Write FINAL OUTPUT. ━━ FINAL OUTPUT ━━ -After all work is done, write a summary with exactly these labels: +After all tool calls are done, write a summary with exactly these labels: -LEADS: -SOURCE_QUALITY: `; +LEADS: +SOURCE_QUALITY: `; } /** diff --git a/backend/src/mastra/agents/investigate.ts b/backend/src/mastra/agents/investigate.ts index f256d71..7a392bf 100644 --- a/backend/src/mastra/agents/investigate.ts +++ b/backend/src/mastra/agents/investigate.ts @@ -19,8 +19,7 @@ function buildInvestigateInstructions( ) .join("\n"); - return `You research one specific entity to find values for its missing or low-confidence columns. -The entity already exists as a partial row — your job is to find what's missing. + return `You research one specific entity to fill its missing columns. One search round. Done. ━━ DATASET SCHEMA ━━ Columns: @@ -29,39 +28,31 @@ ${columnsDesc} Primary key column: "${primaryKeyColumn}" Tool call data/sources keys MUST be exactly: ${JSON.stringify(columnNames)} -━━ YOUR TASK ━━ -You will be given: -- The entity's primary key value -- Its currently known data (columns already filled, with their confidence levels) -- The specific columns that are missing or low-confidence (your priority targets) +━━ WHAT YOU RECEIVE ━━ +- The entity's primary key and its partial data (columns already filled) +- Which columns are missing — these are your only targets +- Context: leads, URLs, and hints from the extraction phase -Search the web and fetch pages to find the missing values. -You may also improve existing low-confidence values if you find a better primary source. - -━━ PROCEDURE ━━ -1. Formulate targeted search queries — include the entity name and what you're looking for. - Run 2–4 searches in parallel covering different angles. -2. Evaluate the search results. Fetch 2–4 of the most promising pages. -3. Extract values for the missing columns from what you find. -4. Call update_row_by_key once you have found values: +━━ PROCEDURE (do these steps, then stop) ━━ +1. Run 1–2 targeted searches in parallel — include the entity name and the missing field names. + Use any URLs from the provided context before searching if they look directly relevant. +2. Fetch the 1–2 most promising pages from the search results. +3. Call update_row_by_key ONCE with everything you found: - confidence: 1.0 = official primary source, 0.5 = aggregator, 0.2 = indirect mention - - sources: map of column name → URL for each column you fill; "" for unfound columns - - data: include ALL column keys, with "" for columns you still could not verify -5. If the first search round did not fill all missing columns, run 1–2 more targeted searches - and fetch additional pages before your final update call. + - sources: column name → source URL for each column you fill; "" for unfound columns + - data: ALL column keys — use "" for columns you could not verify +4. Write FINAL OUTPUT. Stop here — do not run additional searches. ━━ RULES ━━ 1. REAL VALUES ONLY. Never fabricate or estimate. Leave "" for unverifiable columns. 2. UPDATE ONLY. The row already exists — always use update_row_by_key, never insert_row. -3. SOURCE ATTRIBUTION IS REQUIRED. Record the source URL for every value you fill. +3. ONE UPDATE CALL. Call update_row_by_key exactly once. +4. SOURCE REQUIRED for every column you fill. ━━ FINAL OUTPUT ━━ -After all update calls are done, write a natural language summary with exactly these labels: - INSERTED: false -SUMMARY: -CLUES: +SUMMARY: +CLUES: REASON: `; } diff --git a/backend/src/mastra/agents/populate.ts b/backend/src/mastra/agents/populate.ts index 8a06d41..5d37a4f 100644 --- a/backend/src/mastra/agents/populate.ts +++ b/backend/src/mastra/agents/populate.ts @@ -13,6 +13,8 @@ function buildOrchestratorInstructions(targetRows: number): string { const now = new Date(); const currentYear = now.getFullYear(); const currentMonth = now.toLocaleString("en-US", { month: "long" }); + const extractCap = Math.max(3, Math.ceil(targetRows / 4)); + const investigateCap = 20; return `You fill datasets by searching the web, dispatching extraction agents in parallel, then investigating incomplete rows. @@ -24,50 +26,52 @@ Always use this when formulating time-sensitive search queries. Each iteration has four phases. Complete all four before starting the next. PHASE 1 — SEARCH -Run searches in parallel (5 for the first iteration; up to 20 for subsequent ones). +Run searches in parallel (5 for the first iteration; up to 10 for subsequent ones). Cover different angles: entity lists, official directories, aggregator sites, specific entity pages. TIME SENSITIVITY: If the topic mentions "recent", "current", "latest", or a specific year, include ${currentYear} (or the relevant year) explicitly in every query. Examples: "YC W2025 batch companies list", "AI startups ${currentYear} funding", "${currentMonth} ${currentYear} [topic] directory" -PHASE 2 — EXTRACT (parallel) -Collect up to ${targetRows / 4} qualifying URLs from search results AND from leads returned by previous extract_rows calls. +PHASE 2 — EXTRACT (parallel, hard cap: ${extractCap} calls per iteration) +Select the best ${extractCap} qualifying URLs from search results AND from leads returned by previous extract_rows calls. +Do NOT dispatch more than ${extractCap} extract_rows calls per iteration — this is a hard limit. A URL qualifies if ALL of the following are true: - Relevance: title or snippet names a matching entity, list, or directory for this dataset topic - Data value: snippet suggests real column values are present (names, prices, dates, contacts, etc.) - Source: official site, known directory, or reputable domain (not SEO spam or thin content) - - Novelty: different from previous URLs dispatched in this run, likely to provide new information + - Novelty: not already dispatched in this run URL QUALITY — prefer fast, single-page sources: PREFER: editorial lists ("best of", "top N", rankings), Wikipedia list pages, curated directories that show all data on ONE page (e.g. en.wikipedia.org/wiki/List_of_...). - AVOID: paginated browse/catalog pages that require many page loads to see all entities. - Signs of pagination: URLs with /browse/, /all/, /catalog/, page numbers, ?page=, ?sort=, - ?offset= or similar. These are slow (many fetches) and block Phase 3. - Only dispatch paginated URLs when no single-page alternative is available, and prefer - a specific internal page (e.g. page 1 only) over the browse root. + AVOID: paginated browse/catalog pages — signs: /browse/, /all/, /catalog/, ?page=, ?sort=, ?offset=. + They are slow and block Phase 3. If you must use one, dispatch page 1 only; the agent + will return later pages as LEADS. Track every URL you dispatch — never send the same URL twice in one run. -Avoid batches that clearly cover the exact same set of entities. - -Call extract_rows for each qualifying URL IN PARALLEL (one URL per call). +Emit ALL ${extractCap} extract_rows calls IN A SINGLE RESPONSE (they run in parallel). Wait for ALL extract_rows calls to finish before moving to Phase 3. PHASE 3 — REVIEW Call list_rows exactly once. Note the complete row count and which rows are INCOMPLETE (shown as INCOMPLETE — missing: ...). -PHASE 4 — INVESTIGATE (parallel) -For every INCOMPLETE row in list_rows, call investigate_entity simultaneously in one response. -Do NOT wait for one investigate_entity to finish before calling the next — they run in parallel. -Do NOT call investigate_entity for rows already marked COMPLETE. +PHASE 4 — INVESTIGATE (parallel, batch of up to ${investigateCap}) +From the INCOMPLETE rows in list_rows, select up to ${investigateCap} to investigate this iteration. +Priority: rows with the FEWEST missing columns first (closest to complete → highest impact). +Remaining incomplete rows will be handled in subsequent iterations. + +Emit ALL selected investigate_entity calls in a SINGLE response (they run in parallel). +Do NOT call investigate_entity one at a time — all calls for this batch go out simultaneously. +Do NOT call investigate_entity for rows marked COMPLETE. -Build the context for each investigate_entity call from: - - The row's partial data as shown in list_rows - - Relevant leads and URLs returned by extract_rows in Phase 2 +For each investigate_entity call, include: + - primary_key: the entity's primary key value + - missing_columns: the list of blank column names from list_rows + - context: the row's partial data + any relevant leads/URLs returned by extract_rows -Wait for ALL investigate_entity calls to finish before starting the next iteration. +Wait for ALL investigate_entity calls in the batch to finish before starting the next iteration. ━━ STOP CONDITIONS ━━ Stop when ANY of the following is true: @@ -81,29 +85,26 @@ Stop when ANY of the following is true: - Do NOT fetch pages yourself — extract_rows agents fetch pages and write data. - Do NOT call investigate_entity for COMPLETE rows. - Use search result titles and snippets to select URLs — do not fetch to evaluate. -- Do NOT apply a fixed URL count cap — dispatch every URL that passes the quality threshold.`; +- Hard extract cap: ${extractCap} extract_rows calls per iteration maximum. Never exceed this. +- Hard investigate batch: ${investigateCap} investigate_entity calls per batch maximum.`; } /** * Build the orchestrator Agent for a populate run. * - * The orchestrator coordinates three layers per iteration: - * 1. Parallel web searches (search_web) to find candidate URLs. - * 2. Parallel extract_rows calls — each dispatches a batch of 1–5 URLs to a - * fresh extract agent that fetches all pages in parallel, extracts all - * matching entities, and inserts them via batch_insert_rows in one call. - * 3. list_rows to identify incomplete rows, then parallel investigate_entity - * calls — each spawns an investigate agent that searches the web and fills - * missing columns via update_row_by_key. - * - * The orchestrator has no write tools of its own — all dataset writes happen - * inside extract agents (batch_insert_rows) and investigate agents - * (update_row_by_key), both scoped to the authorized dataset via closure. + * Per-iteration flow: + * 1. Parallel web searches (search_web) — 5 on iteration 1, up to 10 after. + * 2. extract_rows × ceil(targetRows/4) in parallel — each spawns one extract + * agent (maxSteps: 5) that calls fetch_page once and batch_insert_rows once. + * 3. list_rows — identifies complete vs. incomplete rows. + * 4. investigate_entity × up to 20 in parallel — prioritises rows with fewest + * missing columns; each spawns one investigate agent (maxSteps: 8) that + * runs one search round + fetches + update_row_by_key. * - * extract_rows, list_rows, and investigate_entity all share the same in-memory - * rowIndex closure returned by buildExtractTool. A pendingInserts Set in that - * same closure prevents parallel extract agents from double-inserting the same - * entity without requiring Convex-level upsert logic. + * All writes are inside sub-agents; the orchestrator has no write tools. + * extract_rows, list_rows, and investigate_entity share the rowIndex closure + * from buildExtractTool. pendingInserts prevents double-inserts across parallel + * extract agents without Convex-level changes. * * A fresh orchestrator is constructed per workflow run; do not cache. */ diff --git a/backend/src/mastra/tools/investigate-tool.ts b/backend/src/mastra/tools/investigate-tool.ts index b862a94..c4488b5 100644 --- a/backend/src/mastra/tools/investigate-tool.ts +++ b/backend/src/mastra/tools/investigate-tool.ts @@ -651,7 +651,9 @@ export function buildExtractTool( await investigateSemaphore.acquire(); try { - const result = await agent.generate(prompt, { maxSteps: 20 }); + // maxSteps: 8 = 1 search round (parallel) + 1-2 fetches + 1 update + buffer. + // The agent is explicitly instructed to do one search round and stop. + const result = await agent.generate(prompt, { maxSteps: 8 }); const parsed = parseInvestigateOutput(result.text); console.log( @@ -839,7 +841,9 @@ export function buildExtractTool( batchInsertRowsTool, ); - const result = await agent.generate(prompt, { maxSteps: 20 }); + // maxSteps: 5 = 1 fetch_page + 1 batch_insert_rows + 3 buffer. + // The agent is explicitly instructed to use exactly 2 tool calls. + const result = await agent.generate(prompt, { maxSteps: 5 }); const parsed = parseExtractOutput(result.text); console.log( diff --git a/backend/src/mastra/workflows/populate.ts b/backend/src/mastra/workflows/populate.ts index 228a339..d131efc 100644 --- a/backend/src/mastra/workflows/populate.ts +++ b/backend/src/mastra/workflows/populate.ts @@ -122,7 +122,14 @@ const agentStep = createStep({ env.POPULATE_TARGET_ROWS, ); try { - const result = await agent.generate(inputData.prompt, { maxSteps: 80 }); + // 150 steps budget breakdown per iteration: + // Phase 1: ~5 search_web calls + // Phase 2: up to ceil(targetRows/4) extract_rows calls (~5) + // Phase 3: 1 list_rows call + // Phase 4: up to 20 investigate_entity calls + // = ~31 orchestrator steps per iteration × ~4 iterations = ~124 steps needed. + // 150 gives comfortable headroom for additional iterations or larger targets. + const result = await agent.generate(inputData.prompt, { maxSteps: 150 }); return { text: result.text }; } catch (err) { const msg = err instanceof Error ? err.message : String(err); From 9ea12da61ec60b3f6a5a7fa2ce1b5cd84a89b9ef Mon Sep 17 00:00:00 2001 From: MMeteorL Date: Wed, 27 May 2026 13:57:01 -0700 Subject: [PATCH 09/10] =?UTF-8?q?fix:=20enforce=20extract=20caps=20in=20co?= =?UTF-8?q?de=20=E2=80=94=20single-use=20fetch=20wrapper=20+=20per-iterati?= =?UTF-8?q?on=20counter?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Root cause (from run logs): the orchestrator dispatched 13 extract_rows calls (hard cap = 5) and the metacritic extract agent made 5 fetch_page calls using all its maxSteps budget before ever calling batch_insert_rows. Both limits were soft LLM instructions — the model ignored them. Single-use fetch_page wrapper (investigate-tool.ts + web-tools.ts + extract.ts): - Extract executeFetchPage() logic into a shared export in web-tools.ts - buildExtractAgent() now receives fetchTool as a parameter (like batchInsertRowsTool) - In extract_rows.execute, create a per-call onceFetchTool that wraps executeFetchPage with a fetchUsed boolean: the second call returns a hard error message telling the agent to call batch_insert_rows immediately with what it already has - This makes the "one fetch per extract agent" limit physically unbypassable Per-iteration extract counter (investigate-tool.ts): - Add iterationExtractCount and MAX_EXTRACT_PER_ITER = ceil(targetRows/4) to closure - Synchronous check+increment before the first await is atomic in JS's event loop (same pattern as pendingInserts) — safe under parallel extract_rows calls - Calls beyond the cap return immediately with the URL as a LEAD for next iteration - list_rows.execute resets the counter (list_rows is called once at Phase 2→3 boundary) Co-Authored-By: Claude Sonnet 4.6 --- backend/src/mastra/agents/extract.ts | 15 +- backend/src/mastra/tools/investigate-tool.ts | 75 ++++++++- backend/src/mastra/tools/web-tools.ts | 158 ++++++++++--------- 3 files changed, 158 insertions(+), 90 deletions(-) diff --git a/backend/src/mastra/agents/extract.ts b/backend/src/mastra/agents/extract.ts index fa2234c..0021f69 100644 --- a/backend/src/mastra/agents/extract.ts +++ b/backend/src/mastra/agents/extract.ts @@ -1,6 +1,5 @@ import { Agent } from "@mastra/core/agent"; import { createOpenRouter } from "@openrouter/ai-sdk-provider"; -import { fetchPageTool } from "../tools/web-tools.js"; import type { PopulateColumn } from "../../pipeline/populate.js"; const openrouter = createOpenRouter({ @@ -62,15 +61,10 @@ SOURCE_QUALITY: ` * * The agent receives one URL, fetches the page, extracts every matching * entity, and calls batch_insert_rows once with the full entity list. - * It does NOT spawn investigation agents — that is the orchestrator's - * responsibility after list_rows. * - * Tools: fetch_page, batch_insert_rows. - * No search capability — it only fetches the URLs provided. - * - * batch_insert_rows is passed in from the buildExtractTool closure so the - * shared rowIndex and pendingInserts are maintained across all agents in one - * workflow run. + * Both fetchTool and batchInsertRowsTool are passed in (not imported here) + * so investigate-tool.ts can supply a single-use fetch_page wrapper that + * enforces the "one fetch per agent" hard limit at the code level. * * A fresh agent instance is constructed per extract_rows call; do not cache. */ @@ -78,6 +72,7 @@ export function buildExtractAgent( columns: PopulateColumn[], primaryKeyColumn: string, batchInsertRowsTool: ReturnType, + fetchTool: ReturnType, ): Agent { return new Agent({ id: "extract-agent", @@ -85,7 +80,7 @@ export function buildExtractAgent( instructions: buildExtractInstructions(columns, primaryKeyColumn), model: openrouter("deepseek/deepseek-v4-pro"), tools: { - fetch_page: fetchPageTool, + fetch_page: fetchTool, batch_insert_rows: batchInsertRowsTool, }, }); diff --git a/backend/src/mastra/tools/investigate-tool.ts b/backend/src/mastra/tools/investigate-tool.ts index c4488b5..76302db 100644 --- a/backend/src/mastra/tools/investigate-tool.ts +++ b/backend/src/mastra/tools/investigate-tool.ts @@ -2,6 +2,7 @@ import { createTool } from "@mastra/core/tools"; import { z } from "zod"; import { buildInvestigateAgent } from "../agents/investigate.js"; import { buildExtractAgent } from "../agents/extract.js"; +import { executeFetchPage } from "../tools/web-tools.js"; import type { AuthContext } from "../workflows/populate.js"; import type { PopulateColumn } from "../../pipeline/populate.js"; import { convex, internal } from "../../convex.js"; @@ -544,6 +545,14 @@ export function buildExtractTool( // Caps total concurrent investigate_entity agents across the whole run. const investigateSemaphore = new Semaphore(MAX_CONCURRENT_INVESTIGATIONS); + // Per-iteration extract call counter. Enforces the hard cap in code so the + // orchestrator LLM cannot exceed it even if it ignores the instruction. + // Reset to 0 each time list_rows is called (i.e. at the Phase 2→3 boundary). + // Synchronous check+increment before the first await is atomic in JS's + // single-threaded event loop — same pattern as pendingInserts. + const MAX_EXTRACT_PER_ITER = Math.max(3, Math.ceil(targetRows / 4)); + let iterationExtractCount = 0; + function countCompleteRows(): number { let n = 0; for (const { cells } of rowIndex.values()) { @@ -689,6 +698,10 @@ export function buildExtractTool( inputSchema: z.object({}), outputSchema: z.object({ summary: z.string() }), execute: async () => { + // Reset the per-iteration extract counter — list_rows is called once at + // the Phase 2→3 boundary, so this resets the cap for the next iteration. + iterationExtractCount = 0; + const complete = countCompleteRows(); const total = rowIndex.size; if (total === 0) return { summary: "No rows yet." }; @@ -753,11 +766,7 @@ export function buildExtractTool( source_quality: z.string(), }), execute: async ({ source_urls, context, notes }) => { - console.log( - `[extract_rows] ${logCtx} url=${source_urls[0]} known_rows=${rowIndex.size}`, - ); - - // Hard cap: if target is already reached, skip this batch. + // Hard cap: if target is already reached, skip without counting. const completeAtStart = countCompleteRows(); if (completeAtStart >= targetRows) { console.log( @@ -769,6 +778,25 @@ export function buildExtractTool( }; } + // Per-iteration cap enforced in code — synchronous check+increment is + // atomic before the first await (JS single-threaded event loop). + iterationExtractCount++; + if (iterationExtractCount > MAX_EXTRACT_PER_ITER) { + console.log( + `[extract_rows] ${logCtx} skipping — iteration cap reached ` + + `(${iterationExtractCount - 1}/${MAX_EXTRACT_PER_ITER}) url=${source_urls[0]}`, + ); + return { + leads: source_urls[0], // Return URL as a lead so next iteration can pick it up + source_quality: `Iteration extract cap (${MAX_EXTRACT_PER_ITER} per iteration) reached — URL deferred to next iteration.`, + }; + } + + console.log( + `[extract_rows] ${logCtx} url=${source_urls[0]} known_rows=${rowIndex.size} ` + + `(extract ${iterationExtractCount}/${MAX_EXTRACT_PER_ITER} this iteration)`, + ); + try { // Refresh rowIndex from Convex to pick up rows written by other // parallel extract_rows calls or investigate_entity agents since the @@ -829,6 +857,42 @@ export function buildExtractTool( primaryKeyColumn, ); + // Single-use fetch_page wrapper: enforces the "exactly one fetch per + // extract agent" constraint in code. On a second call it returns a + // hard-error message instructing the agent to call batch_insert_rows + // immediately with what it already has, rather than fetching more pages. + // Uses the shared executeFetchPage implementation from web-tools.ts. + let fetchUsed = false; + const onceFetchTool = createTool({ + id: "fetch_page", + description: + "Fetch a web page and return its content as clean markdown. " + + "HARD LIMIT: you may call this EXACTLY ONCE per extraction. " + + "A second call returns an error — call batch_insert_rows immediately with what you found on the first page.", + inputSchema: z.object({ url: z.string().describe("The URL to fetch") }), + outputSchema: z.object({ + title: z.string().optional(), + text: z.string().optional(), + error: z.string().optional(), + }), + execute: async ({ url }) => { + if (fetchUsed) { + console.log( + `[extract_rows/fetch] ${logCtx} BLOCKED second fetch_page call for ${url}`, + ); + return { + error: + "HARD LIMIT: fetch_page may only be called ONCE per extraction. " + + "You have already fetched one page this run. " + + "Call batch_insert_rows NOW with the entities from the first page. " + + "Add any additional page URLs to LEADS in your final output.", + }; + } + fetchUsed = true; + return executeFetchPage(url); + }, + }); + const notesBlock = notes ? `\nAdditional hints:\n${notes}` : ""; const prompt = `Fetch and extract from this URL: ${source_urls[0]}\n\n` + @@ -839,6 +903,7 @@ export function buildExtractTool( columns, primaryKeyColumn, batchInsertRowsTool, + onceFetchTool, ); // maxSteps: 5 = 1 fetch_page + 1 batch_insert_rows + 3 buffer. diff --git a/backend/src/mastra/tools/web-tools.ts b/backend/src/mastra/tools/web-tools.ts index 78740c3..1fc5f1e 100644 --- a/backend/src/mastra/tools/web-tools.ts +++ b/backend/src/mastra/tools/web-tools.ts @@ -74,6 +74,88 @@ export const searchWebTool = createTool({ }, }); +/** + * Core fetch implementation shared by fetchPageTool and the single-use + * wrapper built per extract agent in investigate-tool.ts. + */ +export async function executeFetchPage( + targetUrl: string, +): Promise<{ title?: string; text?: string; error?: string }> { + if (!targetUrl?.trim()) + return { error: "url is required and cannot be empty." }; + if (!targetUrl.startsWith("http://") && !targetUrl.startsWith("https://")) + return { error: `Invalid URL "${targetUrl}". Must start with http:// or https://.` }; + + const apiKey = process.env.TINYFISH_API_KEY; + if (!apiKey) + return { error: "TINYFISH_API_KEY is not configured. Page fetch is unavailable — use data from search snippets instead." }; + + console.log(`[fetch_page] Fetching: ${targetUrl}`); + + const controller = new AbortController(); + const timeout = setTimeout(() => controller.abort(), FETCH_TIMEOUT_MS); + try { + const res = await fetch("https://api.fetch.tinyfish.ai", { + method: "POST", + headers: { + "Content-Type": "application/json", + "X-API-Key": apiKey, + }, + body: JSON.stringify({ urls: [targetUrl], format: "markdown" }), + signal: controller.signal, + }); + clearTimeout(timeout); + + if (!res.ok) { + const body = await res.text(); + console.error(`[fetch_page] API error ${res.status}:`, body.slice(0, 200)); + if (res.status === 429) + return { error: "Fetch rate limit hit. Use data from search snippets instead." }; + if (res.status === 401) + return { error: "Invalid TINYFISH_API_KEY. Page fetch unavailable." }; + return { error: `Fetch API returned HTTP ${res.status}. Try a different URL or use search snippet data.` }; + } + + const data = await res.json(); + + if (data.errors?.length > 0) { + const err = data.errors[0]; + console.log(`[fetch_page] Failed: ${err.error}`); + const hints: Record = { + bot_blocked: "This site blocks automated access. Use the search snippet data instead.", + timeout: "Page took too long to load. Try a different URL.", + target_unreachable: "Could not connect to this site. Try a different URL.", + page_not_found: "Page not found (404). The URL may be outdated. Try a different one.", + target_http_error: `Site returned HTTP ${err.status ?? "error"}. Try a different URL.`, + }; + return { error: hints[err.error] ?? `Fetch failed: ${err.error}. Try a different URL.` }; + } + + const page = data.results?.[0]; + if (!page?.text) + return { error: "Page loaded but had no extractable text content. Try a different URL." }; + + let text = page.text as string; + const MAX_CHARS = 15000; + if (text.length > MAX_CHARS) { + text = text.slice(0, MAX_CHARS) + `\n\n[Truncated — showing first ${MAX_CHARS} of ${page.text.length} chars]`; + } + + console.log(`[fetch_page] Got ${(page.text as string).length} chars from "${page.title}" (returning ${text.length})`); + return { + title: page.title as string | undefined, + text, + }; + } catch (err) { + clearTimeout(timeout); + if (err instanceof Error && err.name === "AbortError") + return { error: "Page fetch timed out. Try a different URL or use search snippet data." }; + const msg = err instanceof Error ? err.message : String(err); + console.error(`[fetch_page] Failed:`, msg); + return { error: `Fetch failed: ${msg}. Use data from search snippets instead.` }; + } +} + export const fetchPageTool = createTool({ id: "fetch_page", description: @@ -86,79 +168,5 @@ export const fetchPageTool = createTool({ text: z.string().optional(), error: z.string().optional(), }), - execute: async ({ url: targetUrl }) => { - if (!targetUrl?.trim()) - return { error: "url is required and cannot be empty." }; - if (!targetUrl.startsWith("http://") && !targetUrl.startsWith("https://")) - return { error: `Invalid URL "${targetUrl}". Must start with http:// or https://.` }; - - const apiKey = process.env.TINYFISH_API_KEY; - if (!apiKey) - return { error: "TINYFISH_API_KEY is not configured. Page fetch is unavailable — use data from search snippets instead." }; - - console.log(`[fetch_page] Fetching: ${targetUrl}`); - - const controller = new AbortController(); - const timeout = setTimeout(() => controller.abort(), FETCH_TIMEOUT_MS); - try { - const res = await fetch("https://api.fetch.tinyfish.ai", { - method: "POST", - headers: { - "Content-Type": "application/json", - "X-API-Key": apiKey, - }, - body: JSON.stringify({ urls: [targetUrl], format: "markdown" }), - signal: controller.signal, - }); - clearTimeout(timeout); - - if (!res.ok) { - const body = await res.text(); - console.error(`[fetch_page] API error ${res.status}:`, body.slice(0, 200)); - if (res.status === 429) - return { error: "Fetch rate limit hit. Use data from search snippets instead." }; - if (res.status === 401) - return { error: "Invalid TINYFISH_API_KEY. Page fetch unavailable." }; - return { error: `Fetch API returned HTTP ${res.status}. Try a different URL or use search snippet data.` }; - } - - const data = await res.json(); - - if (data.errors?.length > 0) { - const err = data.errors[0]; - console.log(`[fetch_page] Failed: ${err.error}`); - const hints: Record = { - bot_blocked: "This site blocks automated access. Use the search snippet data instead.", - timeout: "Page took too long to load. Try a different URL.", - target_unreachable: "Could not connect to this site. Try a different URL.", - page_not_found: "Page not found (404). The URL may be outdated. Try a different one.", - target_http_error: `Site returned HTTP ${err.status ?? "error"}. Try a different URL.`, - }; - return { error: hints[err.error] ?? `Fetch failed: ${err.error}. Try a different URL.` }; - } - - const page = data.results?.[0]; - if (!page?.text) - return { error: "Page loaded but had no extractable text content. Try a different URL." }; - - let text = page.text as string; - const MAX_CHARS = 15000; - if (text.length > MAX_CHARS) { - text = text.slice(0, MAX_CHARS) + `\n\n[Truncated — showing first ${MAX_CHARS} of ${page.text.length} chars]`; - } - - console.log(`[fetch_page] Got ${(page.text as string).length} chars from "${page.title}" (returning ${text.length})`); - return { - title: page.title as string | undefined, - text, - }; - } catch (err) { - clearTimeout(timeout); - if (err instanceof Error && err.name === "AbortError") - return { error: "Page fetch timed out. Try a different URL or use search snippet data." }; - const msg = err instanceof Error ? err.message : String(err); - console.error(`[fetch_page] Failed:`, msg); - return { error: `Fetch failed: ${msg}. Use data from search snippets instead.` }; - } - }, + execute: async ({ url }) => executeFetchPage(url), }); From 8a1d7df5c6fe88a299db81834fa2278d55f76822 Mon Sep 17 00:00:00 2001 From: MMeteorL Date: Wed, 27 May 2026 14:08:29 -0700 Subject: [PATCH 10/10] chore: remove kimi-k2 tool-call repair code (no longer needed with deepseek-v4-pro) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Delete backend/src/mastra/tools/model-middleware.ts and remove its startup call from index.ts. The file contained two kimi-k2-specific workarounds: - withToolCallRepair (dead code — no agent ever used it) - patchMastraSanitizeToolCallInput (patched global JSON.parse to recover double-encoded tool-call arguments that kimi-k2 emitted) deepseek/deepseek-v4-pro does not exhibit the double-encoding bug so both repairs are unnecessary. Removing them eliminates the monkey-patch on the global JSON.parse and the now-spurious TypeScript error on the wrapStream middleware type. Co-Authored-By: Claude Sonnet 4.6 --- backend/src/index.ts | 7 - backend/src/mastra/tools/model-middleware.ts | 146 ------------------- 2 files changed, 153 deletions(-) delete mode 100644 backend/src/mastra/tools/model-middleware.ts diff --git a/backend/src/index.ts b/backend/src/index.ts index cd07d89..34dbdbc 100644 --- a/backend/src/index.ts +++ b/backend/src/index.ts @@ -13,13 +13,6 @@ import { sendTransactionalEmail } from "./email/send.js"; import { datasetReadyTemplate } from "./email/templates/dataset-ready.js"; import { capture, shutdown as shutdownAnalytics } from "./analytics/posthog.js"; import { EVENTS } from "./analytics/events.js"; -import { patchMastraSanitizeToolCallInput } from "./mastra/tools/model-middleware.js"; - -// Patch JSON.parse globally so that double-encoded tool-call inputs from kimi-k2 -// (e.g. `"{"key":"val"}"` instead of `{"key":"val"}`) are recovered before -// Mastra's stream parser throws "Error converting tool call input to JSON". -// Must run before any agent or workflow is executed. -await patchMastraSanitizeToolCallInput(); /** Domain part of an email, for analytics (we never log full addresses). */ function emailDomain(email: string): string { diff --git a/backend/src/mastra/tools/model-middleware.ts b/backend/src/mastra/tools/model-middleware.ts deleted file mode 100644 index 2f530b5..0000000 --- a/backend/src/mastra/tools/model-middleware.ts +++ /dev/null @@ -1,146 +0,0 @@ -import { wrapLanguageModel } from "ai"; - -/** - * Attempt to recover a double-encoded JSON tool-call input string. - * - * kimi-k2 via OpenRouter's non-streaming path sets - * `input = toolCall.function.arguments` - * without validating that the string is parseable JSON. When the model - * wraps its arguments in an extra pair of quotes (i.e. the `function.arguments` - * field is `"{"primary_key":"Pocket",...}"` instead of - * `{"primary_key":"Pocket",...}`), the string starts with `"{"` which is a - * JSON-encoded string literal — and JSON.parse then hits a trailing `}` or - * other garbage that makes the parse fail. - * - * Recovery strategy: find the first `{` and the last `}` in the raw string - * and extract that substring. If the substring is valid JSON, use it; - * otherwise leave the original string unchanged so the normal error path - * can still handle it. - */ -function tryUnwrapDoubleEncodedInput(raw: string): string { - // Only attempt recovery when the string starts with `"` — the hallmark of - // the double-encoding pattern. Normal JSON objects start with `{`. - if (!raw.startsWith('"')) return raw; - - const firstBrace = raw.indexOf("{"); - const lastBrace = raw.lastIndexOf("}"); - - if (firstBrace === -1 || lastBrace <= firstBrace) return raw; - - const candidate = raw.slice(firstBrace, lastBrace + 1); - try { - JSON.parse(candidate); - console.log( - `[model-middleware] Repaired double-encoded tool call input (recovered ${candidate.length} chars)`, - ); - return candidate; - } catch { - return raw; // Cannot repair — leave for Mastra's normal error path - } -} - -// ─── Approach 1: wrapLanguageModel middleware (intercepts at AI SDK stream level) ─ - -/** - * Wrap a language model with a stream middleware that repairs double-encoded - * tool-call inputs before Mastra processes them. - * - * kimi-k2 (via OpenRouter) occasionally wraps tool-call arguments in an extra - * JSON string layer. Mastra's `sanitizeToolCallInput` / `tryRepairJson` cannot - * recover this pattern, so the tool call silently drops (args = undefined). - * This middleware intercepts `tool-call` stream chunks and unwraps the extra - * layer so Mastra receives clean JSON. - * - * Usage: - * model: withToolCallRepair(openrouter("moonshotai/kimi-k2-0905")) - */ -export function withToolCallRepair(model: any): any { - return wrapLanguageModel({ - model, - middleware: { - wrapStream: async ({ doStream }: any) => { - console.log("[model-middleware] wrapStream called"); - const result = await doStream(); - const { stream, ...rest } = result; - - const fixedStream = stream.pipeThrough( - new TransformStream({ - transform(chunk: any, controller: any) { - if ( - chunk != null && - chunk.type === "tool-call" && - typeof chunk.input === "string" - ) { - console.log(`[model-middleware] tool-call chunk: ${chunk.toolName} input starts with: ${chunk.input.slice(0, 30)}`); - const fixedInput = tryUnwrapDoubleEncodedInput(chunk.input); - controller.enqueue({ ...chunk, input: fixedInput }); - } else { - controller.enqueue(chunk); - } - }, - }), - ); - - return { stream: fixedStream, ...rest }; - }, - }, - }); -} - -// ─── Approach 2: Startup monkey-patch for Mastra's sanitizeToolCallInput ──────── - -/** - * Patch Mastra's internal sanitizeToolCallInput to handle double-encoded JSON. - * - * This is a fallback that operates at a lower level than the wrapStream - * middleware. It patches the compiled Mastra module directly so that even if - * the wrapStream approach doesn't intercept a particular code path, the repair - * still happens before JSON.parse throws. - * - * Call this once at application startup (e.g. in src/index.ts before starting - * Fastify) so it takes effect for all subsequent agent runs. - */ -export async function patchMastraSanitizeToolCallInput(): Promise { - try { - // The chunk file that contains sanitizeToolCallInput is a private module - // inside @mastra/core. We use a dynamic import to access it so we can - // wrap its exported convertFullStreamChunkToMastra function. - // However, since sanitizeToolCallInput is internal and not exported, we - // patch the stream processing by intercepting at the AISDKV5InputStream - // level instead. - // - // Strategy: intercept JSON.parse within the Mastra module scope by - // wrapping the global JSON.parse to repair double-encoded inputs when - // called from Mastra's tool call processing context. - const originalJsonParse = JSON.parse; - (JSON as any).parse = function patchedJsonParse(text: string, ...rest: any[]) { - try { - return originalJsonParse.call(this, text, ...rest); - } catch (err) { - // If JSON.parse fails on a string that starts with `"`, try the - // double-encoding recovery: extract the JSON object between the - // first `{` and last `}`. - if (typeof text === "string" && text.startsWith('"')) { - const firstBrace = text.indexOf("{"); - const lastBrace = text.lastIndexOf("}"); - if (firstBrace !== -1 && lastBrace > firstBrace) { - const candidate = text.slice(firstBrace, lastBrace + 1); - try { - const recovered = originalJsonParse.call(this, candidate, ...rest); - console.log( - `[model-middleware/patch] Recovered double-encoded JSON (${candidate.length} chars): ${candidate.slice(0, 60)}...`, - ); - return recovered; - } catch { - // Recovery also failed — re-throw the original error - } - } - } - throw err; - } - }; - console.log("[model-middleware] JSON.parse patched to recover double-encoded tool call inputs"); - } catch (err) { - console.warn("[model-middleware] Failed to patch JSON.parse:", err); - } -}