From 4e467ec8305c327513aad0f3fa40b559d713104e Mon Sep 17 00:00:00 2001 From: pranavjanakiraman Date: Mon, 1 Jun 2026 15:48:33 -0700 Subject: [PATCH 1/6] Cap dataset population at 100 rows --- backend/src/mastra/agents/populate.ts | 4 ++-- backend/src/mastra/tools/investigate-tool.ts | 15 +++++++++++++++ backend/src/mastra/workflows/populate.ts | 3 ++- frontend/convex/datasetRows.ts | 18 ++++++++++++------ 4 files changed, 31 insertions(+), 9 deletions(-) diff --git a/backend/src/mastra/agents/populate.ts b/backend/src/mastra/agents/populate.ts index fa0837f..85edf53 100644 --- a/backend/src/mastra/agents/populate.ts +++ b/backend/src/mastra/agents/populate.ts @@ -12,7 +12,7 @@ const openrouter = createOpenRouter({ const INSTRUCTIONS = `You are an expert dataset builder. You conduct research using your web tools. You do broad research to see which rows to add, and then you spin up sub-agents that can do the deep research and fill in each row for you. -Your job is to make sure you dispatch and manage your army of sub agents to build up a dataset with 100 rows in it. +Your job is to make sure you dispatch and manage your army of sub agents to build up a dataset with 100 rows in it. Stop as soon as the dataset reaches 100 rows. WORKFLOW: 1. Understand the data that is is needed and do some research to find places on the web where this data may be obvious and easy to find, collect these links to see what the task of scraping the web is going to look like. @@ -22,7 +22,7 @@ If the dataset is to look at YC Companies, collect links for the YC Startup regi 3. See what the subagent reports back with, if all good and it gives you some information, use that to give better instuctions to subsequent sub agents. -Keep going till you have 100 rows. +Keep going until you have 100 rows, then finish immediately. If run_subagent reports ROW_LIMIT_REACHED, stop calling tools and finish the run. This process should become faster overtime as you just find new rows to go and build, and you keep invoking sub agents in parallel to fill them in. diff --git a/backend/src/mastra/tools/investigate-tool.ts b/backend/src/mastra/tools/investigate-tool.ts index d00d43c..62610af 100644 --- a/backend/src/mastra/tools/investigate-tool.ts +++ b/backend/src/mastra/tools/investigate-tool.ts @@ -1,10 +1,13 @@ import { createTool } from "@mastra/core/tools"; import { z } from "zod"; +import { convex, internal } from "../../convex.js"; import { buildInvestigateAgent } from "../agents/investigate.js"; import type { AuthContext } from "../workflows/populate.js"; import type { PopulateColumn } from "../../pipeline/populate.js"; import type { RunMetrics } from "../run-metrics.js"; +const MAX_DATASET_ROWS = 100; + const investigateInputSchema = z.object({ entity_hint: z .string() @@ -82,6 +85,18 @@ export function buildSubagentTool( inputSchema: investigateInputSchema, outputSchema: investigateOutputSchema, execute: async ({ entity_hint, primary_keys, context, urls, notes }) => { + const rowCount = await convex.query(internal.datasetRows.countByDataset, { + datasetId: authorizedDatasetId, + }); + if (rowCount >= MAX_DATASET_ROWS) { + return { + inserted: false, + reason: `ROW_LIMIT_REACHED: BigSet datasets are capped at ${MAX_DATASET_ROWS} rows. Stop calling run_subagent and finish the run.`, + row_summary: undefined, + clues: undefined, + }; + } + if (metrics) metrics.investigateCalls++; console.log( `[run_subagent] spawning subagent user=${authContext.authorizedUserId} run=${authContext.workflowRunId} dataset=${authorizedDatasetId} entity="${entity_hint}" pk=${JSON.stringify(primary_keys)}`, diff --git a/backend/src/mastra/workflows/populate.ts b/backend/src/mastra/workflows/populate.ts index a2a8246..65320a5 100644 --- a/backend/src/mastra/workflows/populate.ts +++ b/backend/src/mastra/workflows/populate.ts @@ -196,7 +196,8 @@ Data fields to collect: ${columnsDesc}${pkNote}${manifestNote}${strategyNote} Search the web broadly to find real entities that fit this dataset topic. -For each lead you find, call run_subagent with the primary key values and any context/URLs you have found.`; +For each lead you find, call run_subagent with the primary key values and any context/URLs you have found. +Stop the populate run as soon as the dataset reaches 100 rows.`; console.log( `[build-prompt] Built prompt for ${inputData.datasetName} (${inputData.columns.length} columns, strategy=${inputData.enumerationStrategy})`, diff --git a/frontend/convex/datasetRows.ts b/frontend/convex/datasetRows.ts index f69316d..db609f7 100644 --- a/frontend/convex/datasetRows.ts +++ b/frontend/convex/datasetRows.ts @@ -5,6 +5,8 @@ import type { Id } from "./_generated/dataModel.js"; import { assertRowInDataset, loadReadableDataset } from "./lib/authz.js"; import { consumeQuotaForDataset } from "./lib/quota.js"; +const MAX_DATASET_ROWS = 100; + /** * Authoritative row count for a dataset. O(N), so use only on the slow * paths: self-heal in `insert` / `remove` when the dataset doc predates @@ -71,6 +73,16 @@ export const insert = internalMutation({ const dataset = await ctx.db.get(args.datasetId); if (!dataset) throw new Error("Dataset not found"); + const previousCount = + typeof dataset.rowCount === "number" + ? dataset.rowCount + : await actualRowCount(ctx, args.datasetId); + if (previousCount >= MAX_DATASET_ROWS) { + throw new Error( + `Row limit reached: BigSet datasets are capped at ${MAX_DATASET_ROWS} rows. Stop inserting rows and finish the run.`, + ); + } + // Dedup: reject inserts that collide on primary key columns. // Runs BEFORE quota so rejected dupes don't burn quota. const pkColumns = (dataset.columns ?? []).filter( @@ -110,11 +122,6 @@ export const insert = internalMutation({ // Quota consumption only happens for genuine new rows. await consumeQuotaForDataset(ctx, args.datasetId, 1); - const previousCount = - typeof dataset.rowCount === "number" - ? dataset.rowCount - : await actualRowCount(ctx, args.datasetId); - const rowId = await ctx.db.insert("datasetRows", args); await ctx.db.patch(args.datasetId, { rowCount: previousCount + 1 }); @@ -327,4 +334,3 @@ export const listInternal = internalQuery({ .collect(); }, }); - From efb4510bf02335a6f178a2c1463bbcc5c4961288 Mon Sep 17 00:00:00 2001 From: pranavjanakiraman Date: Mon, 1 Jun 2026 16:28:43 -0700 Subject: [PATCH 2/6] Handle row cap count failures in subagent tool --- backend/src/mastra/tools/investigate-tool.ts | 33 ++++++++++---------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/backend/src/mastra/tools/investigate-tool.ts b/backend/src/mastra/tools/investigate-tool.ts index 62610af..2be1016 100644 --- a/backend/src/mastra/tools/investigate-tool.ts +++ b/backend/src/mastra/tools/investigate-tool.ts @@ -85,23 +85,24 @@ export function buildSubagentTool( inputSchema: investigateInputSchema, outputSchema: investigateOutputSchema, execute: async ({ entity_hint, primary_keys, context, urls, notes }) => { - const rowCount = await convex.query(internal.datasetRows.countByDataset, { - datasetId: authorizedDatasetId, - }); - if (rowCount >= MAX_DATASET_ROWS) { - return { - inserted: false, - reason: `ROW_LIMIT_REACHED: BigSet datasets are capped at ${MAX_DATASET_ROWS} rows. Stop calling run_subagent and finish the run.`, - row_summary: undefined, - clues: undefined, - }; - } - - if (metrics) metrics.investigateCalls++; - console.log( - `[run_subagent] spawning subagent user=${authContext.authorizedUserId} run=${authContext.workflowRunId} dataset=${authorizedDatasetId} entity="${entity_hint}" pk=${JSON.stringify(primary_keys)}`, - ); try { + const rowCount = await convex.query(internal.datasetRows.countByDataset, { + datasetId: authorizedDatasetId, + }); + if (rowCount >= MAX_DATASET_ROWS) { + return { + inserted: false, + reason: `ROW_LIMIT_REACHED: BigSet datasets are capped at ${MAX_DATASET_ROWS} rows. Stop calling run_subagent and finish the run.`, + row_summary: undefined, + clues: undefined, + }; + } + + if (metrics) metrics.investigateCalls++; + console.log( + `[run_subagent] spawning subagent user=${authContext.authorizedUserId} run=${authContext.workflowRunId} dataset=${authorizedDatasetId} entity="${entity_hint}" pk=${JSON.stringify(primary_keys)}`, + ); + const agent = buildInvestigateAgent( authorizedDatasetId, authContext, From 4def5e8c61645461ead28e4b90a7ed03ea5d134d Mon Sep 17 00:00:00 2001 From: pranavjanakiraman Date: Mon, 1 Jun 2026 16:33:34 -0700 Subject: [PATCH 3/6] Mention row limit sentinel in populate prompt --- backend/src/mastra/workflows/populate.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/backend/src/mastra/workflows/populate.ts b/backend/src/mastra/workflows/populate.ts index 65320a5..ea06e0e 100644 --- a/backend/src/mastra/workflows/populate.ts +++ b/backend/src/mastra/workflows/populate.ts @@ -197,6 +197,7 @@ ${columnsDesc}${pkNote}${manifestNote}${strategyNote} Search the web broadly to find real entities that fit this dataset topic. For each lead you find, call run_subagent with the primary key values and any context/URLs you have found. +If run_subagent returns ROW_LIMIT_REACHED, stop immediately and do not make any more tool calls. Stop the populate run as soon as the dataset reaches 100 rows.`; console.log( From af35c5b4c602c3809774950937bee605fd1a1776 Mon Sep 17 00:00:00 2001 From: pranavjanakiraman Date: Thu, 4 Jun 2026 20:26:52 -0700 Subject: [PATCH 4/6] Allow custom dataset max rows --- backend/src/index.ts | 14 +++++- backend/src/mastra/agents/populate.ts | 12 +++-- backend/src/mastra/tools/investigate-tool.ts | 7 ++- backend/src/mastra/workflows/populate.ts | 5 +- backend/src/pipeline/populate.ts | 1 + frontend/app/dataset/[id]/page.tsx | 1 + frontend/app/dataset/new/page.tsx | 52 +++++++++++++++++++- frontend/convex/datasetRows.ts | 7 +-- frontend/convex/datasets.ts | 20 +++++++- frontend/convex/schema.ts | 3 ++ frontend/lib/backend.ts | 3 +- 11 files changed, 108 insertions(+), 17 deletions(-) diff --git a/backend/src/index.ts b/backend/src/index.ts index 57c75e9..22c5d3d 100644 --- a/backend/src/index.ts +++ b/backend/src/index.ts @@ -492,6 +492,7 @@ function startLocalRefreshScheduler( datasetId: dataset.datasetId, datasetName: dataset.datasetName, description: dataset.description, + maxRowCount: dataset.maxRowCount ?? 100, columns: dataset.columns, }, run, @@ -692,6 +693,14 @@ await fastify.register(async (instance) => { throw new Error(`Unexpected populate claim outcome: ${populateOutcome}`); } + const dataset = await convex.query(internal.datasets.getInternal, { + id: parsed.data.datasetId, + }); + if (!dataset) { + await setDatasetPopulateStatus(parsed.data.datasetId, "failed", "Dataset not found"); + return reply.code(404).send({ error: "Dataset not found" }); + } + const { getModelConfig } = await import("./config/models.js"); const modelConfig = await getModelConfig(auth.userId); @@ -705,7 +714,10 @@ await fastify.register(async (instance) => { } void runPopulateWorkflowInBackground({ - input: parsed.data, + input: { + ...parsed.data, + maxRowCount: dataset.maxRowCount ?? parsed.data.maxRowCount, + }, run, authorizedUserId: auth.userId, logger: req.log, diff --git a/backend/src/mastra/agents/populate.ts b/backend/src/mastra/agents/populate.ts index 85edf53..155492a 100644 --- a/backend/src/mastra/agents/populate.ts +++ b/backend/src/mastra/agents/populate.ts @@ -10,9 +10,10 @@ const openrouter = createOpenRouter({ apiKey: process.env.OPENROUTER_API_KEY!, }); -const INSTRUCTIONS = `You are an expert dataset builder. You conduct research using your web tools. +function buildInstructions(maxRowCount: number): string { + return `You are an expert dataset builder. You conduct research using your web tools. You do broad research to see which rows to add, and then you spin up sub-agents that can do the deep research and fill in each row for you. -Your job is to make sure you dispatch and manage your army of sub agents to build up a dataset with 100 rows in it. Stop as soon as the dataset reaches 100 rows. +Your job is to make sure you dispatch and manage your army of sub agents to build up a dataset with ${maxRowCount} rows in it. Stop as soon as the dataset reaches ${maxRowCount} rows. WORKFLOW: 1. Understand the data that is is needed and do some research to find places on the web where this data may be obvious and easy to find, collect these links to see what the task of scraping the web is going to look like. @@ -22,12 +23,13 @@ If the dataset is to look at YC Companies, collect links for the YC Startup regi 3. See what the subagent reports back with, if all good and it gives you some information, use that to give better instuctions to subsequent sub agents. -Keep going until you have 100 rows, then finish immediately. If run_subagent reports ROW_LIMIT_REACHED, stop calling tools and finish the run. +Keep going until you have ${maxRowCount} rows, then finish immediately. If run_subagent reports ROW_LIMIT_REACHED, stop calling tools and finish the run. This process should become faster overtime as you just find new rows to go and build, and you keep invoking sub agents in parallel to fill them in. Duplicates are rejected automatically based on primary key columns. If a subagent reports a duplicate, don't re-investigate the same entity — move on to a new one. `; +} /** * Build the orchestrator Agent for a populate run. @@ -42,6 +44,7 @@ export function buildPopulateAgent( authorizedDatasetId: string, authContext: AuthContext, columns: PopulateColumn[], + maxRowCount: number, metrics?: RunMetrics, ): Agent { const modelSlug = authContext.modelConfig!.populateOrchestrator; @@ -49,7 +52,7 @@ export function buildPopulateAgent( return new Agent({ id: "populate-agent", name: "Dataset Populate Orchestrator", - instructions: INSTRUCTIONS, + instructions: buildInstructions(maxRowCount), model: openrouter(modelSlug), tools: { search_web: searchWebTool, @@ -58,6 +61,7 @@ export function buildPopulateAgent( authorizedDatasetId, authContext, columns, + maxRowCount, metrics, ), }, diff --git a/backend/src/mastra/tools/investigate-tool.ts b/backend/src/mastra/tools/investigate-tool.ts index 2be1016..80573d0 100644 --- a/backend/src/mastra/tools/investigate-tool.ts +++ b/backend/src/mastra/tools/investigate-tool.ts @@ -6,8 +6,6 @@ import type { AuthContext } from "../workflows/populate.js"; import type { PopulateColumn } from "../../pipeline/populate.js"; import type { RunMetrics } from "../run-metrics.js"; -const MAX_DATASET_ROWS = 100; - const investigateInputSchema = z.object({ entity_hint: z .string() @@ -76,6 +74,7 @@ export function buildSubagentTool( authorizedDatasetId: string, authContext: AuthContext, columns: PopulateColumn[], + maxRowCount: number, metrics?: RunMetrics, ) { return createTool({ @@ -89,10 +88,10 @@ export function buildSubagentTool( const rowCount = await convex.query(internal.datasetRows.countByDataset, { datasetId: authorizedDatasetId, }); - if (rowCount >= MAX_DATASET_ROWS) { + if (rowCount >= maxRowCount) { return { inserted: false, - reason: `ROW_LIMIT_REACHED: BigSet datasets are capped at ${MAX_DATASET_ROWS} rows. Stop calling run_subagent and finish the run.`, + reason: `ROW_LIMIT_REACHED: this BigSet dataset is capped at ${maxRowCount} rows. Stop calling run_subagent and finish the run.`, row_summary: undefined, clues: undefined, }; diff --git a/backend/src/mastra/workflows/populate.ts b/backend/src/mastra/workflows/populate.ts index ea06e0e..868d1ff 100644 --- a/backend/src/mastra/workflows/populate.ts +++ b/backend/src/mastra/workflows/populate.ts @@ -152,6 +152,7 @@ const buildPromptOutputSchema = z.object({ authorizedDatasetId: z.string(), authContext: authContextSchema, columns: z.array(populateColumnSchema), + maxRowCount: z.number().int().min(1), }); const buildPromptStep = createStep({ @@ -198,7 +199,7 @@ ${columnsDesc}${pkNote}${manifestNote}${strategyNote} Search the web broadly to find real entities that fit this dataset topic. For each lead you find, call run_subagent with the primary key values and any context/URLs you have found. If run_subagent returns ROW_LIMIT_REACHED, stop immediately and do not make any more tool calls. -Stop the populate run as soon as the dataset reaches 100 rows.`; +Stop the populate run as soon as the dataset reaches ${inputData.maxRowCount} rows.`; console.log( `[build-prompt] Built prompt for ${inputData.datasetName} (${inputData.columns.length} columns, strategy=${inputData.enumerationStrategy})`, @@ -208,6 +209,7 @@ Stop the populate run as soon as the dataset reaches 100 rows.`; authorizedDatasetId: inputData.datasetId, authContext: inputData.authContext, columns: inputData.columns, + maxRowCount: inputData.maxRowCount, }; }, }); @@ -241,6 +243,7 @@ const agentStep = createStep({ inputData.authorizedDatasetId, inputData.authContext, inputData.columns, + inputData.maxRowCount, metrics, ); const result = await agent.generate(inputData.prompt, { maxSteps: 80 }); diff --git a/backend/src/pipeline/populate.ts b/backend/src/pipeline/populate.ts index 55d37aa..84be087 100644 --- a/backend/src/pipeline/populate.ts +++ b/backend/src/pipeline/populate.ts @@ -12,6 +12,7 @@ export const datasetContextSchema = z.object({ datasetId: z.string().min(1), datasetName: z.string(), description: z.string(), + maxRowCount: z.number().int().min(1).default(100), columns: z.array(populateColumnSchema).min(1), rowIds: z.array(z.string()).min(1).optional(), }); diff --git a/frontend/app/dataset/[id]/page.tsx b/frontend/app/dataset/[id]/page.tsx index adba9a0..ce3d420 100644 --- a/frontend/app/dataset/[id]/page.tsx +++ b/frontend/app/dataset/[id]/page.tsx @@ -61,6 +61,7 @@ export default function DatasetPage() { dataset._id, dataset.name, dataset.description, + dataset.maxRowCount ?? 100, dataset.columns, token, ); diff --git a/frontend/app/dataset/new/page.tsx b/frontend/app/dataset/new/page.tsx index 285cbde..bb4dded 100644 --- a/frontend/app/dataset/new/page.tsx +++ b/frontend/app/dataset/new/page.tsx @@ -4,7 +4,7 @@ import { useEffect, useState, useRef } from "react"; import { useRouter } from "next/navigation"; import Link from "next/link"; import { useAuth } from "@clerk/nextjs"; -import { useMutation, useConvexAuth } from "convex/react"; +import { useMutation, useQuery, useConvexAuth } from "convex/react"; import { api } from "@/convex/_generated/api"; import { EVENTS, track } from "@/lib/analytics"; import { inferSchema, type InferredColumn } from "@/lib/backend"; @@ -43,6 +43,8 @@ const BACKEND_TYPE_MAP: Record = { boolean: "boolean", }; +const DEFAULT_MAX_ROW_COUNT = 100; + function mapBackendColumn(col: InferredColumn, index: number): ProposedColumn { return { id: String(index + 1), @@ -81,6 +83,9 @@ export default function NewDatasetPage() { const [step, setStep] = useState("describe"); const [prompt, setPrompt] = useState(""); const [refreshCadence, setRefreshCadence] = useState("daily"); + const [maxRowCountInput, setMaxRowCountInput] = useState( + String(DEFAULT_MAX_ROW_COUNT), + ); const [columns, setColumns] = useState([]); const [datasetName, setDatasetName] = useState(""); const [isCreating, setIsCreating] = useState(false); @@ -92,6 +97,10 @@ export default function NewDatasetPage() { const { getToken } = useAuth(); const createDataset = useMutation(api.datasets.create); + const usage = useQuery( + api.quota.getMy, + isAuthenticated ? {} : "skip", + ); // Page-view event: fires once when the wizard becomes visible (after // auth resolves and the user is authenticated; we don't want to fire @@ -163,6 +172,17 @@ export default function NewDatasetPage() { async function handleConfirm() { if (isCreating) return; + const maxRowCount = Number(maxRowCountInput); + if (!Number.isInteger(maxRowCount) || maxRowCount < 1) { + setError("Max rows must be a whole number greater than 0."); + return; + } + if (usage && maxRowCount > usage.remaining) { + setError( + `Max rows cannot exceed your remaining monthly quota of ${usage.remaining.toLocaleString()} row operations.`, + ); + return; + } setIsCreating(true); setError(null); let datasetId: string; @@ -171,6 +191,7 @@ export default function NewDatasetPage() { name: datasetName, description: prompt, refreshCadence, + maxRowCount, columns: columns.map((c) => ({ name: c.name, type: c.type, @@ -195,6 +216,7 @@ export default function NewDatasetPage() { datasetId, column_count: columns.length, refreshCadence, + maxRowCount, }); } catch {} router.push(`/dataset/${datasetId}`); @@ -320,6 +342,34 @@ export default function NewDatasetPage() { ))} + +
+ + setMaxRowCountInput(e.currentTarget.value)} + onBlur={() => { + if (!maxRowCountInput.trim()) return; + const value = Number(maxRowCountInput); + if (Number.isInteger(value) && value >= 1) { + setMaxRowCountInput(String(value)); + } + }} + className="w-36 rounded-lg border border-border bg-surface px-4 py-2.5 text-sm font-medium outline-none focus:border-foreground/30 transition-colors" + /> + {usage && ( +

+ Up to {usage.remaining.toLocaleString()} row operations available this month. +

+ )} +
diff --git a/frontend/convex/datasetRows.ts b/frontend/convex/datasetRows.ts index db609f7..0ce4316 100644 --- a/frontend/convex/datasetRows.ts +++ b/frontend/convex/datasetRows.ts @@ -5,7 +5,7 @@ import type { Id } from "./_generated/dataModel.js"; import { assertRowInDataset, loadReadableDataset } from "./lib/authz.js"; import { consumeQuotaForDataset } from "./lib/quota.js"; -const MAX_DATASET_ROWS = 100; +const DEFAULT_MAX_DATASET_ROWS = 100; /** * Authoritative row count for a dataset. O(N), so use only on the slow @@ -77,9 +77,10 @@ export const insert = internalMutation({ typeof dataset.rowCount === "number" ? dataset.rowCount : await actualRowCount(ctx, args.datasetId); - if (previousCount >= MAX_DATASET_ROWS) { + const maxRowCount = dataset.maxRowCount ?? DEFAULT_MAX_DATASET_ROWS; + if (previousCount >= maxRowCount) { throw new Error( - `Row limit reached: BigSet datasets are capped at ${MAX_DATASET_ROWS} rows. Stop inserting rows and finish the run.`, + `Row limit reached: this BigSet dataset is capped at ${maxRowCount} rows. Stop inserting rows and finish the run.`, ); } diff --git a/frontend/convex/datasets.ts b/frontend/convex/datasets.ts index 4f34619..e84d1cf 100644 --- a/frontend/convex/datasets.ts +++ b/frontend/convex/datasets.ts @@ -13,7 +13,7 @@ import { loadReadableDataset, requireIdentity, } from "./lib/authz.js"; -import { requireQuotaRemaining } from "./lib/quota.js"; +import { FREE_TIER_MONTHLY_QUOTA, requireQuotaRemaining } from "./lib/quota.js"; import { nextRefreshAtFor, refreshCadenceValidator, @@ -62,6 +62,19 @@ function refreshCadenceFromLegacyLabel( } const PREVIEW_ROW_COUNT = 5; +const DEFAULT_MAX_ROW_COUNT = 100; + +function validateMaxRowCount(maxRowCount: number): void { + if ( + !Number.isInteger(maxRowCount) || + maxRowCount < 1 || + maxRowCount > FREE_TIER_MONTHLY_QUOTA + ) { + throw new Error( + `Max row count must be a whole number between 1 and ${FREE_TIER_MONTHLY_QUOTA}.`, + ); + } +} async function attachPreview(ctx: QueryCtx, dataset: Doc<"datasets">) { // Mini-table preview: just the first N rows. `.take` keeps the @@ -270,6 +283,7 @@ export const claimScheduledRefreshInternal = internalMutation({ description: dataset.description, columns: dataset.columns, ownerId: dataset.ownerId, + maxRowCount: dataset.maxRowCount ?? DEFAULT_MAX_ROW_COUNT, }, }; }, @@ -360,6 +374,7 @@ export const create = mutation({ name: v.string(), description: v.string(), refreshCadence: refreshCadenceValidator, + maxRowCount: v.number(), columns: v.array(columnValidator), retrievalStrategy: v.optional( v.union( @@ -373,10 +388,11 @@ export const create = mutation({ handler: async (ctx, args) => { const identity = await requireIdentity(ctx); assertNotReservedOwner(identity.subject); + validateMaxRowCount(args.maxRowCount); // Block dataset creation at full exhaustion — a dataset you can't // populate is just clutter. Row generation later will re-check, so // this is a UX safeguard, not the only line of defense. - await requireQuotaRemaining(ctx, identity.subject, 1); + await requireQuotaRemaining(ctx, identity.subject, args.maxRowCount); return await ctx.db.insert("datasets", { ...args, diff --git a/frontend/convex/schema.ts b/frontend/convex/schema.ts index 7907d6c..54cea5f 100644 --- a/frontend/convex/schema.ts +++ b/frontend/convex/schema.ts @@ -50,6 +50,9 @@ export default defineSchema({ // with rows created before this field existed — write paths self-heal // on first hit, and `datasets.backfillRowCounts` migrates all at once. rowCount: v.optional(v.number()), + // User-selected target/limit for populate runs. Optional so existing + // datasets keep the legacy 100-row behavior until touched. + maxRowCount: v.optional(v.number()), columns: v.array( v.object({ name: v.string(), diff --git a/frontend/lib/backend.ts b/frontend/lib/backend.ts index 9406a18..2969edc 100644 --- a/frontend/lib/backend.ts +++ b/frontend/lib/backend.ts @@ -211,6 +211,7 @@ export async function populate( datasetId: string, datasetName: string, description: string, + maxRowCount: number, columns: PopulateColumn[], token: string, ): Promise { @@ -220,7 +221,7 @@ export async function populate( "Content-Type": "application/json", Authorization: `Bearer ${token}`, }, - body: JSON.stringify({ datasetId, datasetName: datasetName, description, columns }), + body: JSON.stringify({ datasetId, datasetName, description, maxRowCount, columns }), }); if (!res.ok) { From ef6db16edc9b50290ca02a9f626ed3f913aa4161 Mon Sep 17 00:00:00 2001 From: pranavjanakiraman Date: Thu, 4 Jun 2026 20:30:42 -0700 Subject: [PATCH 5/6] Allow editing dataset max rows --- frontend/app/dataset/[id]/page.tsx | 108 ++++++++++++++++++++++++++++- frontend/convex/datasets.ts | 15 ++++ 2 files changed, 121 insertions(+), 2 deletions(-) diff --git a/frontend/app/dataset/[id]/page.tsx b/frontend/app/dataset/[id]/page.tsx index ce3d420..27108fd 100644 --- a/frontend/app/dataset/[id]/page.tsx +++ b/frontend/app/dataset/[id]/page.tsx @@ -23,7 +23,7 @@ import type { ProfileUser } from "@/lib/profile-user"; export default function DatasetPage() { const params = useParams(); - const { isLoading: authLoading } = useConvexAuth(); + const { isLoading: authLoading, isAuthenticated } = useConvexAuth(); const { userId, getToken } = useAuth(); const { user } = useUser(); const { signOut } = useClerk(); @@ -34,6 +34,7 @@ export default function DatasetPage() { const [settingsOpen, setSettingsOpen] = useState(false); const [confirmPopulate, setConfirmPopulate] = useState(false); const [savingRefreshCadence, setSavingRefreshCadence] = useState(false); + const [savingMaxRowCount, setSavingMaxRowCount] = useState(false); const datasetId = params.id as Id<"datasets">; const dataset = useQuery( @@ -45,6 +46,11 @@ export default function DatasetPage() { authLoading ? "skip" : { datasetId }, ); const updateRefreshSettings = useMutation(api.datasets.updateRefreshSettings); + const updateMaxRowCount = useMutation(api.datasets.updateMaxRowCount); + const usage = useQuery( + api.quota.getMy, + isAuthenticated ? {} : "skip", + ); const rowIds = useMemo(() => (rows ?? []).map((r) => r._id), [rows]); const selection = useSelection(rowIds); @@ -196,6 +202,34 @@ export default function DatasetPage() { } } + async function handleMaxRowCountChange(maxRowCount: number) { + if (!dataset || savingMaxRowCount || userId !== dataset.ownerId) return; + if (!Number.isInteger(maxRowCount) || maxRowCount < 1) { + captureException(new Error("Invalid max row count"), { + operation: "dataset_max_row_count_update", + datasetId: dataset._id, + }); + return; + } + if (usage && maxRowCount > usage.remaining) return; + + setSavingMaxRowCount(true); + try { + await updateMaxRowCount({ + id: dataset._id, + maxRowCount, + }); + } catch (err) { + console.error("[max rows] failed", err); + captureException(err, { + operation: "dataset_max_row_count_update", + datasetId: dataset._id, + }); + } finally { + setSavingMaxRowCount(false); + } + } + if (authLoading || dataset === undefined || rows === undefined) { return (
@@ -215,6 +249,7 @@ export default function DatasetPage() { ...dataset, refreshCadence: dataset.refreshCadence ?? "daily", refreshEnabled: dataset.refreshEnabled ?? true, + maxRowCount: dataset.maxRowCount ?? 100, }; const updateDisabled = updating || isDatasetBusy; const populateDisabled = populating || isDatasetBusy; @@ -274,11 +309,15 @@ export default function DatasetPage() { onClose={() => setSettingsOpen(false)} refreshCadence={displayDataset.refreshCadence} refreshCadenceDisabled={!isOwner || savingRefreshCadence} + maxRowCount={displayDataset.maxRowCount} + maxRowCountRemaining={usage?.remaining} + maxRowCountDisabled={!isOwner || savingMaxRowCount} updateLabel={updateLabel} updateDisabled={updateDisabled} populateLabel={populateLabel} populateDisabled={populateDisabled} onRefreshCadenceChange={handleRefreshCadenceChange} + onMaxRowCountChange={handleMaxRowCountChange} onUpdate={() => { setSettingsOpen(false); handleUpdate(); }} onPopulate={() => { setSettingsOpen(false); @@ -432,11 +471,15 @@ function SettingsDropdown({ onClose, refreshCadence, refreshCadenceDisabled, + maxRowCount, + maxRowCountRemaining, + maxRowCountDisabled, updateLabel, updateDisabled, populateLabel, populateDisabled, onRefreshCadenceChange, + onMaxRowCountChange, onUpdate, onPopulate, }: { @@ -445,15 +488,31 @@ function SettingsDropdown({ onClose: () => void; refreshCadence: RefreshCadence; refreshCadenceDisabled: boolean; + maxRowCount: number; + maxRowCountRemaining?: number; + maxRowCountDisabled: boolean; updateLabel: string; updateDisabled: boolean; populateLabel: string; populateDisabled: boolean; onRefreshCadenceChange: (refreshCadence: RefreshCadence) => void; + onMaxRowCountChange: (maxRowCount: number) => void; onUpdate: () => void; onPopulate: () => void; }) { const ref = useRef(null); + const [maxRowCountInput, setMaxRowCountInput] = useState(String(maxRowCount)); + const parsedMaxRowCount = Number(maxRowCountInput); + const maxRowCountError = + !maxRowCountInput.trim() + ? "Required" + : !Number.isInteger(parsedMaxRowCount) || parsedMaxRowCount < 1 + ? "Use a whole number" + : maxRowCountRemaining !== undefined && parsedMaxRowCount > maxRowCountRemaining + ? `Max ${maxRowCountRemaining.toLocaleString()}` + : null; + const maxRowCountChanged = + Number.isInteger(parsedMaxRowCount) && parsedMaxRowCount !== maxRowCount; useEffect(() => { if (!open) return; @@ -464,6 +523,10 @@ function SettingsDropdown({ return () => document.removeEventListener("mousedown", handleClick); }, [open, onClose]); + useEffect(() => { + if (!open) setMaxRowCountInput(String(maxRowCount)); + }, [maxRowCount, open]); + return (
+
+
+ Max rows +
+
+ setMaxRowCountInput(e.currentTarget.value)} + onBlur={() => { + if (!maxRowCountInput.trim()) return; + const value = Number(maxRowCountInput); + if (Number.isInteger(value) && value >= 1) { + setMaxRowCountInput(String(value)); + } + }} + className="min-w-0 flex-1 rounded-lg border border-border bg-background px-2 py-1.5 text-xs text-foreground outline-none transition-colors focus:border-foreground/30 disabled:opacity-50" + /> + +
+

+ {maxRowCountError ?? + (maxRowCountRemaining !== undefined + ? `${maxRowCountRemaining.toLocaleString()} row operations available` + : "Applies to the next populate run")} +

+
Refresh cadence diff --git a/frontend/convex/datasets.ts b/frontend/convex/datasets.ts index e84d1cf..fa4f657 100644 --- a/frontend/convex/datasets.ts +++ b/frontend/convex/datasets.ts @@ -424,6 +424,21 @@ export const updateRefreshSettings = mutation({ }, }); +export const updateMaxRowCount = mutation({ + args: { + id: v.id("datasets"), + maxRowCount: v.number(), + }, + handler: async (ctx, args) => { + const dataset = await loadOwnedDataset(ctx, args.id); + validateMaxRowCount(args.maxRowCount); + await requireQuotaRemaining(ctx, dataset.ownerId, args.maxRowCount); + await ctx.db.patch(dataset._id, { + maxRowCount: args.maxRowCount, + }); + }, +}); + export const backfillRefreshSettings = internalMutation({ args: { defaultCadence: v.optional(refreshCadenceValidator), From d6779aaae79f9a72a6bd6c55dbc1c5457be85b50 Mon Sep 17 00:00:00 2001 From: pranavjanakiraman Date: Thu, 4 Jun 2026 20:55:26 -0700 Subject: [PATCH 6/6] Address max rows review feedback --- backend/src/index.ts | 1 - backend/src/mastra/tools/investigate-tool.ts | 2 +- backend/src/pipeline/populate.ts | 4 ++- frontend/app/dataset/[id]/page.tsx | 26 ++++++++++++++++---- frontend/convex/datasets.ts | 4 ++- 5 files changed, 28 insertions(+), 9 deletions(-) diff --git a/backend/src/index.ts b/backend/src/index.ts index 739a0f5..26c72ac 100644 --- a/backend/src/index.ts +++ b/backend/src/index.ts @@ -773,7 +773,6 @@ await fastify.register(async (instance) => { id: parsed.data.datasetId, }); if (!dataset) { - await setDatasetPopulateStatus(parsed.data.datasetId, "failed", "Dataset not found"); return reply.code(404).send({ error: "Dataset not found" }); } diff --git a/backend/src/mastra/tools/investigate-tool.ts b/backend/src/mastra/tools/investigate-tool.ts index 9bee682..c1d6b18 100644 --- a/backend/src/mastra/tools/investigate-tool.ts +++ b/backend/src/mastra/tools/investigate-tool.ts @@ -129,7 +129,7 @@ Context (partial data already found): ${context}${urlsBlock}${notesBlock}`; const abortSignal = getSignal(authorizedDatasetId); - const result = await agent.generate(prompt, { abortSignal, maxSteps: 10 }); + const result = await agent.generate(prompt, { abortSignal, maxSteps: 25 }); if (metrics) { // Use result.toolCalls (the flat accumulated list across all steps) rather // than iterating result.steps[n].toolCalls. The per-step arrays are snapshots diff --git a/backend/src/pipeline/populate.ts b/backend/src/pipeline/populate.ts index 84be087..589db1e 100644 --- a/backend/src/pipeline/populate.ts +++ b/backend/src/pipeline/populate.ts @@ -1,5 +1,7 @@ import { z } from "zod"; +const FREE_TIER_MONTHLY_QUOTA = 2500; + export const populateColumnSchema = z.object({ name: z.string(), type: z.enum(["text", "number", "boolean", "url", "date"]), @@ -12,7 +14,7 @@ export const datasetContextSchema = z.object({ datasetId: z.string().min(1), datasetName: z.string(), description: z.string(), - maxRowCount: z.number().int().min(1).default(100), + maxRowCount: z.number().int().min(1).max(FREE_TIER_MONTHLY_QUOTA).default(100), columns: z.array(populateColumnSchema).min(1), rowIds: z.array(z.string()).min(1).optional(), }); diff --git a/frontend/app/dataset/[id]/page.tsx b/frontend/app/dataset/[id]/page.tsx index 1bd3304..d28abda 100644 --- a/frontend/app/dataset/[id]/page.tsx +++ b/frontend/app/dataset/[id]/page.tsx @@ -38,6 +38,7 @@ export default function DatasetPage() { const [confirmPopulate, setConfirmPopulate] = useState(false); const [savingRefreshCadence, setSavingRefreshCadence] = useState(false); const [savingMaxRowCount, setSavingMaxRowCount] = useState(false); + const [maxRowCountSaveError, setMaxRowCountSaveError] = useState(null); const [cellDetail, setCellDetail] = useState<{ column: DatasetColumn; value: unknown; @@ -225,22 +226,33 @@ export default function DatasetPage() { async function handleMaxRowCountChange(maxRowCount: number) { if (!dataset || savingMaxRowCount || userId !== dataset.ownerId) return; if (!Number.isInteger(maxRowCount) || maxRowCount < 1) { + setMaxRowCountSaveError("Max rows must be a whole number greater than 0."); captureException(new Error("Invalid max row count"), { operation: "dataset_max_row_count_update", datasetId: dataset._id, }); return; } - if (usage && maxRowCount > usage.remaining) return; + if (usage && maxRowCount > usage.remaining) { + setMaxRowCountSaveError( + `Max rows cannot exceed your remaining monthly quota of ${usage.remaining.toLocaleString()} row operations.`, + ); + return; + } + setMaxRowCountSaveError(null); setSavingMaxRowCount(true); try { await updateMaxRowCount({ id: dataset._id, maxRowCount, }); + setMaxRowCountSaveError(null); } catch (err) { console.error("[max rows] failed", err); + setMaxRowCountSaveError( + err instanceof Error ? err.message : "Failed to update max rows.", + ); captureException(err, { operation: "dataset_max_row_count_update", datasetId: dataset._id, @@ -385,6 +397,7 @@ export default function DatasetPage() { refreshCadenceDisabled={!isOwner || savingRefreshCadence} maxRowCount={displayDataset.maxRowCount} maxRowCountRemaining={usage?.remaining} + maxRowCountSaveError={maxRowCountSaveError} maxRowCountDisabled={!isOwner || savingMaxRowCount} updateLabel={updateLabel} updateDisabled={updateDisabled} @@ -558,6 +571,7 @@ function SettingsDropdown({ refreshCadenceDisabled, maxRowCount, maxRowCountRemaining, + maxRowCountSaveError, maxRowCountDisabled, updateLabel, updateDisabled, @@ -575,6 +589,7 @@ function SettingsDropdown({ refreshCadenceDisabled: boolean; maxRowCount: number; maxRowCountRemaining?: number; + maxRowCountSaveError: string | null; maxRowCountDisabled: boolean; updateLabel: string; updateDisabled: boolean; @@ -588,7 +603,7 @@ function SettingsDropdown({ const ref = useRef(null); const [maxRowCountInput, setMaxRowCountInput] = useState(String(maxRowCount)); const parsedMaxRowCount = Number(maxRowCountInput); - const maxRowCountError = + const maxRowCountValidationError = !maxRowCountInput.trim() ? "Required" : !Number.isInteger(parsedMaxRowCount) || parsedMaxRowCount < 1 @@ -669,7 +684,7 @@ function SettingsDropdown({ disabled={ maxRowCountDisabled || !maxRowCountChanged || - maxRowCountError !== null + maxRowCountValidationError !== null } onClick={() => onMaxRowCountChange(parsedMaxRowCount)} className="rounded-lg border border-border px-2 py-1.5 text-xs font-medium text-foreground transition-colors hover:bg-foreground/[0.05] disabled:cursor-not-allowed disabled:opacity-40" @@ -677,8 +692,9 @@ function SettingsDropdown({ Save
-

- {maxRowCountError ?? +

+ {maxRowCountValidationError ?? + maxRowCountSaveError ?? (maxRowCountRemaining !== undefined ? `${maxRowCountRemaining.toLocaleString()} row operations available` : "Applies to the next populate run")} diff --git a/frontend/convex/datasets.ts b/frontend/convex/datasets.ts index fa4f657..d295327 100644 --- a/frontend/convex/datasets.ts +++ b/frontend/convex/datasets.ts @@ -432,7 +432,9 @@ export const updateMaxRowCount = mutation({ handler: async (ctx, args) => { const dataset = await loadOwnedDataset(ctx, args.id); validateMaxRowCount(args.maxRowCount); - await requireQuotaRemaining(ctx, dataset.ownerId, args.maxRowCount); + const currentRowCount = dataset.rowCount ?? 0; + const additionalRowsNeeded = Math.max(0, args.maxRowCount - currentRowCount); + await requireQuotaRemaining(ctx, dataset.ownerId, additionalRowsNeeded); await ctx.db.patch(dataset._id, { maxRowCount: args.maxRowCount, });