From 4e467ec8305c327513aad0f3fa40b559d713104e Mon Sep 17 00:00:00 2001 From: pranavjanakiraman Date: Mon, 1 Jun 2026 15:48:33 -0700 Subject: [PATCH 1/3] Cap dataset population at 100 rows --- backend/src/mastra/agents/populate.ts | 4 ++-- backend/src/mastra/tools/investigate-tool.ts | 15 +++++++++++++++ backend/src/mastra/workflows/populate.ts | 3 ++- frontend/convex/datasetRows.ts | 18 ++++++++++++------ 4 files changed, 31 insertions(+), 9 deletions(-) diff --git a/backend/src/mastra/agents/populate.ts b/backend/src/mastra/agents/populate.ts index fa0837f..85edf53 100644 --- a/backend/src/mastra/agents/populate.ts +++ b/backend/src/mastra/agents/populate.ts @@ -12,7 +12,7 @@ const openrouter = createOpenRouter({ const INSTRUCTIONS = `You are an expert dataset builder. You conduct research using your web tools. You do broad research to see which rows to add, and then you spin up sub-agents that can do the deep research and fill in each row for you. -Your job is to make sure you dispatch and manage your army of sub agents to build up a dataset with 100 rows in it. +Your job is to make sure you dispatch and manage your army of sub agents to build up a dataset with 100 rows in it. Stop as soon as the dataset reaches 100 rows. WORKFLOW: 1. Understand the data that is is needed and do some research to find places on the web where this data may be obvious and easy to find, collect these links to see what the task of scraping the web is going to look like. @@ -22,7 +22,7 @@ If the dataset is to look at YC Companies, collect links for the YC Startup regi 3. See what the subagent reports back with, if all good and it gives you some information, use that to give better instuctions to subsequent sub agents. -Keep going till you have 100 rows. +Keep going until you have 100 rows, then finish immediately. If run_subagent reports ROW_LIMIT_REACHED, stop calling tools and finish the run. This process should become faster overtime as you just find new rows to go and build, and you keep invoking sub agents in parallel to fill them in. diff --git a/backend/src/mastra/tools/investigate-tool.ts b/backend/src/mastra/tools/investigate-tool.ts index d00d43c..62610af 100644 --- a/backend/src/mastra/tools/investigate-tool.ts +++ b/backend/src/mastra/tools/investigate-tool.ts @@ -1,10 +1,13 @@ import { createTool } from "@mastra/core/tools"; import { z } from "zod"; +import { convex, internal } from "../../convex.js"; import { buildInvestigateAgent } from "../agents/investigate.js"; import type { AuthContext } from "../workflows/populate.js"; import type { PopulateColumn } from "../../pipeline/populate.js"; import type { RunMetrics } from "../run-metrics.js"; +const MAX_DATASET_ROWS = 100; + const investigateInputSchema = z.object({ entity_hint: z .string() @@ -82,6 +85,18 @@ export function buildSubagentTool( inputSchema: investigateInputSchema, outputSchema: investigateOutputSchema, execute: async ({ entity_hint, primary_keys, context, urls, notes }) => { + const rowCount = await convex.query(internal.datasetRows.countByDataset, { + datasetId: authorizedDatasetId, + }); + if (rowCount >= MAX_DATASET_ROWS) { + return { + inserted: false, + reason: `ROW_LIMIT_REACHED: BigSet datasets are capped at ${MAX_DATASET_ROWS} rows. Stop calling run_subagent and finish the run.`, + row_summary: undefined, + clues: undefined, + }; + } + if (metrics) metrics.investigateCalls++; console.log( `[run_subagent] spawning subagent user=${authContext.authorizedUserId} run=${authContext.workflowRunId} dataset=${authorizedDatasetId} entity="${entity_hint}" pk=${JSON.stringify(primary_keys)}`, diff --git a/backend/src/mastra/workflows/populate.ts b/backend/src/mastra/workflows/populate.ts index a2a8246..65320a5 100644 --- a/backend/src/mastra/workflows/populate.ts +++ b/backend/src/mastra/workflows/populate.ts @@ -196,7 +196,8 @@ Data fields to collect: ${columnsDesc}${pkNote}${manifestNote}${strategyNote} Search the web broadly to find real entities that fit this dataset topic. -For each lead you find, call run_subagent with the primary key values and any context/URLs you have found.`; +For each lead you find, call run_subagent with the primary key values and any context/URLs you have found. +Stop the populate run as soon as the dataset reaches 100 rows.`; console.log( `[build-prompt] Built prompt for ${inputData.datasetName} (${inputData.columns.length} columns, strategy=${inputData.enumerationStrategy})`, diff --git a/frontend/convex/datasetRows.ts b/frontend/convex/datasetRows.ts index f69316d..db609f7 100644 --- a/frontend/convex/datasetRows.ts +++ b/frontend/convex/datasetRows.ts @@ -5,6 +5,8 @@ import type { Id } from "./_generated/dataModel.js"; import { assertRowInDataset, loadReadableDataset } from "./lib/authz.js"; import { consumeQuotaForDataset } from "./lib/quota.js"; +const MAX_DATASET_ROWS = 100; + /** * Authoritative row count for a dataset. O(N), so use only on the slow * paths: self-heal in `insert` / `remove` when the dataset doc predates @@ -71,6 +73,16 @@ export const insert = internalMutation({ const dataset = await ctx.db.get(args.datasetId); if (!dataset) throw new Error("Dataset not found"); + const previousCount = + typeof dataset.rowCount === "number" + ? dataset.rowCount + : await actualRowCount(ctx, args.datasetId); + if (previousCount >= MAX_DATASET_ROWS) { + throw new Error( + `Row limit reached: BigSet datasets are capped at ${MAX_DATASET_ROWS} rows. Stop inserting rows and finish the run.`, + ); + } + // Dedup: reject inserts that collide on primary key columns. // Runs BEFORE quota so rejected dupes don't burn quota. const pkColumns = (dataset.columns ?? []).filter( @@ -110,11 +122,6 @@ export const insert = internalMutation({ // Quota consumption only happens for genuine new rows. await consumeQuotaForDataset(ctx, args.datasetId, 1); - const previousCount = - typeof dataset.rowCount === "number" - ? dataset.rowCount - : await actualRowCount(ctx, args.datasetId); - const rowId = await ctx.db.insert("datasetRows", args); await ctx.db.patch(args.datasetId, { rowCount: previousCount + 1 }); @@ -327,4 +334,3 @@ export const listInternal = internalQuery({ .collect(); }, }); - From efb4510bf02335a6f178a2c1463bbcc5c4961288 Mon Sep 17 00:00:00 2001 From: pranavjanakiraman Date: Mon, 1 Jun 2026 16:28:43 -0700 Subject: [PATCH 2/3] Handle row cap count failures in subagent tool --- backend/src/mastra/tools/investigate-tool.ts | 33 ++++++++++---------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/backend/src/mastra/tools/investigate-tool.ts b/backend/src/mastra/tools/investigate-tool.ts index 62610af..2be1016 100644 --- a/backend/src/mastra/tools/investigate-tool.ts +++ b/backend/src/mastra/tools/investigate-tool.ts @@ -85,23 +85,24 @@ export function buildSubagentTool( inputSchema: investigateInputSchema, outputSchema: investigateOutputSchema, execute: async ({ entity_hint, primary_keys, context, urls, notes }) => { - const rowCount = await convex.query(internal.datasetRows.countByDataset, { - datasetId: authorizedDatasetId, - }); - if (rowCount >= MAX_DATASET_ROWS) { - return { - inserted: false, - reason: `ROW_LIMIT_REACHED: BigSet datasets are capped at ${MAX_DATASET_ROWS} rows. Stop calling run_subagent and finish the run.`, - row_summary: undefined, - clues: undefined, - }; - } - - if (metrics) metrics.investigateCalls++; - console.log( - `[run_subagent] spawning subagent user=${authContext.authorizedUserId} run=${authContext.workflowRunId} dataset=${authorizedDatasetId} entity="${entity_hint}" pk=${JSON.stringify(primary_keys)}`, - ); try { + const rowCount = await convex.query(internal.datasetRows.countByDataset, { + datasetId: authorizedDatasetId, + }); + if (rowCount >= MAX_DATASET_ROWS) { + return { + inserted: false, + reason: `ROW_LIMIT_REACHED: BigSet datasets are capped at ${MAX_DATASET_ROWS} rows. Stop calling run_subagent and finish the run.`, + row_summary: undefined, + clues: undefined, + }; + } + + if (metrics) metrics.investigateCalls++; + console.log( + `[run_subagent] spawning subagent user=${authContext.authorizedUserId} run=${authContext.workflowRunId} dataset=${authorizedDatasetId} entity="${entity_hint}" pk=${JSON.stringify(primary_keys)}`, + ); + const agent = buildInvestigateAgent( authorizedDatasetId, authContext, From 4def5e8c61645461ead28e4b90a7ed03ea5d134d Mon Sep 17 00:00:00 2001 From: pranavjanakiraman Date: Mon, 1 Jun 2026 16:33:34 -0700 Subject: [PATCH 3/3] Mention row limit sentinel in populate prompt --- backend/src/mastra/workflows/populate.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/src/mastra/workflows/populate.ts b/backend/src/mastra/workflows/populate.ts index 65320a5..ea06e0e 100644 --- a/backend/src/mastra/workflows/populate.ts +++ b/backend/src/mastra/workflows/populate.ts @@ -197,6 +197,7 @@ ${columnsDesc}${pkNote}${manifestNote}${strategyNote} Search the web broadly to find real entities that fit this dataset topic. For each lead you find, call run_subagent with the primary key values and any context/URLs you have found. +If run_subagent returns ROW_LIMIT_REACHED, stop immediately and do not make any more tool calls. Stop the populate run as soon as the dataset reaches 100 rows.`; console.log(