tinyfish-io · simantak-dabhade · Jun 5, 2026 · Jun 1, 2026 · Jun 1, 2026 · Jun 1, 2026
diff --git a/backend/src/index.ts b/backend/src/index.ts
@@ -568,6 +568,7 @@ function startLocalRefreshScheduler(
             datasetId: dataset.datasetId,
             datasetName: dataset.datasetName,
             description: dataset.description,
+            maxRowCount: dataset.maxRowCount ?? 100,
             columns: dataset.columns,
           },
           run,
@@ -768,6 +769,13 @@ await fastify.register(async (instance) => {
         throw new Error(`Unexpected populate claim outcome: ${populateOutcome}`);
       }
 
+      const dataset = await convex.query(internal.datasets.getInternal, {
+        id: parsed.data.datasetId,
+      });
+      if (!dataset) {
+        return reply.code(404).send({ error: "Dataset not found" });
+      }
+
       const { getModelConfig } = await import("./config/models.js");
       const modelConfig = await getModelConfig(auth.userId);
 
@@ -787,7 +795,10 @@ await fastify.register(async (instance) => {
       const controller = registerDataset(parsed.data.datasetId);
 
       void runPopulateWorkflowInBackground({
-        input: parsed.data,
+        input: {
+          ...parsed.data,
+          maxRowCount: dataset.maxRowCount ?? parsed.data.maxRowCount,
+        },
         run,
         controller,
         authorizedUserId: auth.userId,

diff --git a/backend/src/mastra/agents/populate.ts b/backend/src/mastra/agents/populate.ts
@@ -10,9 +10,10 @@ const openrouter = createOpenRouter({
   apiKey: process.env.OPENROUTER_API_KEY!,
 });
 
-const INSTRUCTIONS = `You are an expert dataset builder. You conduct research using your web tools.
+function buildInstructions(maxRowCount: number): string {
+  return `You are an expert dataset builder. You conduct research using your web tools.
 You do broad research to see which rows to add, and then you spin up sub-agents that can do the deep research and fill in each row for you.
-Your job is to make sure you dispatch and manage your army of sub agents to build up a dataset with 100 rows in it. Stop as soon as the dataset reaches 100 rows.
+Your job is to make sure you dispatch and manage your army of sub agents to build up a dataset with ${maxRowCount} rows in it. Stop as soon as the dataset reaches ${maxRowCount} rows.
 
 WORKFLOW:
 1. Understand the data that is is needed and do some research to find places on the web where this data may be obvious and easy to find, collect these links to see what the task of scraping the web is going to look like.
@@ -22,12 +23,13 @@ If the dataset is to look at YC Companies, collect links for the YC Startup regi
 
 3. See what the subagent reports back with, if all good and it gives you some information, use that to give better instuctions to subsequent sub agents.
 
-Keep going until you have 100 rows, then finish immediately. If run_subagent reports ROW_LIMIT_REACHED, stop calling tools and finish the run.
+Keep going until you have ${maxRowCount} rows, then finish immediately. If run_subagent reports ROW_LIMIT_REACHED, stop calling tools and finish the run.
 
 This process should become faster overtime as you just find new rows to go and build, and you keep invoking sub agents in parallel to fill them in.
 
 Duplicates are rejected automatically based on primary key columns. If a subagent reports a duplicate, don't re-investigate the same entity — move on to a new one.
 `;
+}
 
 /**
  * Build the orchestrator Agent for a populate run.
@@ -42,14 +44,15 @@ export function buildPopulateAgent(
   authorizedDatasetId: string,
   authContext: AuthContext,
   columns: PopulateColumn[],
+  maxRowCount: number,
   metrics?: RunMetrics,
 ): Agent {
   const modelSlug = authContext.modelConfig!.populateOrchestrator;
 
   return new Agent({
     id: "populate-agent",
     name: "Dataset Populate Orchestrator",
-    instructions: INSTRUCTIONS,
+    instructions: buildInstructions(maxRowCount),
     model: openrouter(modelSlug),
     tools: {
       search_web: searchWebTool,
@@ -58,6 +61,7 @@ export function buildPopulateAgent(
         authorizedDatasetId,
         authContext,
         columns,
+        maxRowCount,
         metrics,
       ),
     },

diff --git a/backend/src/mastra/tools/investigate-tool.ts b/backend/src/mastra/tools/investigate-tool.ts
@@ -7,8 +7,6 @@ import type { PopulateColumn } from "../../pipeline/populate.js";
 import type { RunMetrics } from "../run-metrics.js";
 import { getSignal } from "../../abort-registry.js";
 
-const MAX_DATASET_ROWS = 100;
-
 const investigateInputSchema = z.object({
   entity_hint: z
     .string()
@@ -77,6 +75,7 @@ export function buildSubagentTool(
   authorizedDatasetId: string,
   authContext: AuthContext,
   columns: PopulateColumn[],
+  maxRowCount: number,
   metrics?: RunMetrics,
 ) {
   return createTool({
@@ -90,10 +89,10 @@ export function buildSubagentTool(
         const rowCount = await convex.query(internal.datasetRows.countByDataset, {
           datasetId: authorizedDatasetId,
         });
-        if (rowCount >= MAX_DATASET_ROWS) {
+        if (rowCount >= maxRowCount) {
           return {
             inserted: false,
-            reason: `ROW_LIMIT_REACHED: BigSet datasets are capped at ${MAX_DATASET_ROWS} rows. Stop calling run_subagent and finish the run.`,
+            reason: `ROW_LIMIT_REACHED: this BigSet dataset is capped at ${maxRowCount} rows. Stop calling run_subagent and finish the run.`,
             row_summary: undefined,
             clues: undefined,
           };
@@ -130,7 +129,7 @@ Context (partial data already found):
 ${context}${urlsBlock}${notesBlock}`;
 
         const abortSignal = getSignal(authorizedDatasetId);
-        const result = await agent.generate(prompt, { abortSignal, maxSteps: 10 });
+        const result = await agent.generate(prompt, { abortSignal, maxSteps: 25 });
         if (metrics) {
           // Use result.toolCalls (the flat accumulated list across all steps) rather
           // than iterating result.steps[n].toolCalls. The per-step arrays are snapshots

diff --git a/backend/src/mastra/workflows/populate.ts b/backend/src/mastra/workflows/populate.ts
@@ -157,6 +157,7 @@ const buildPromptOutputSchema = z.object({
   authorizedDatasetId: z.string(),
   authContext: authContextSchema,
   columns: z.array(populateColumnSchema),
+  maxRowCount: z.number().int().min(1),
 });
 
 const buildPromptStep = createStep({
@@ -203,7 +204,7 @@ ${columnsDesc}${pkNote}${manifestNote}${strategyNote}
 Search the web broadly to find real entities that fit this dataset topic.
 For each lead you find, call run_subagent with the primary key values and any context/URLs you have found.
 If run_subagent returns ROW_LIMIT_REACHED, stop immediately and do not make any more tool calls.
-Stop the populate run as soon as the dataset reaches 100 rows.`;
+Stop the populate run as soon as the dataset reaches ${inputData.maxRowCount} rows.`;
 
     console.log(
       `[build-prompt] Built prompt for ${inputData.datasetName} (${inputData.columns.length} columns, strategy=${inputData.enumerationStrategy})`,
@@ -213,6 +214,7 @@ Stop the populate run as soon as the dataset reaches 100 rows.`;
       authorizedDatasetId: inputData.datasetId,
       authContext: inputData.authContext,
       columns: inputData.columns,
+      maxRowCount: inputData.maxRowCount,
     };
   },
 });
@@ -246,6 +248,7 @@ const agentStep = createStep({
         inputData.authorizedDatasetId,
         inputData.authContext,
         inputData.columns,
+        inputData.maxRowCount,
         metrics,
       );
       const abortSignal = getSignal(inputData.authorizedDatasetId);

diff --git a/backend/src/pipeline/populate.ts b/backend/src/pipeline/populate.ts
@@ -1,5 +1,7 @@
 import { z } from "zod";
 
+const FREE_TIER_MONTHLY_QUOTA = 2500;
+
 export const populateColumnSchema = z.object({
   name: z.string(),
   type: z.enum(["text", "number", "boolean", "url", "date"]),
@@ -12,6 +14,7 @@ export const datasetContextSchema = z.object({
   datasetId: z.string().min(1),
   datasetName: z.string(),
   description: z.string(),
+  maxRowCount: z.number().int().min(1).max(FREE_TIER_MONTHLY_QUOTA).default(100),
   columns: z.array(populateColumnSchema).min(1),
   rowIds: z.array(z.string()).min(1).optional(),
 });