Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions backend/src/mastra/agents/populate.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ const openrouter = createOpenRouter({

const INSTRUCTIONS = `You are an expert dataset builder. You conduct research using your web tools.
You do broad research to see which rows to add, and then you spin up sub-agents that can do the deep research and fill in each row for you.
Your job is to make sure you dispatch and manage your army of sub agents to build up a dataset with 100 rows in it.
Your job is to make sure you dispatch and manage your army of sub agents to build up a dataset with 100 rows in it. Stop as soon as the dataset reaches 100 rows.

WORKFLOW:
1. Understand the data that is needed and do some research to find places on the web where this data may be obvious and easy to find, collect these links to see what the task of scraping the web is going to look like.
Expand All @@ -22,7 +22,7 @@ If the dataset is to look at YC Companies, collect links for the YC Startup regi

3. See what the subagent reports back with, if all good and it gives you some information, use that to give better instructions to subsequent sub agents.

Keep going till you have 100 rows.
Keep going until you have 100 rows, then finish immediately. If run_subagent reports ROW_LIMIT_REACHED, stop calling tools and finish the run.

This process should become faster over time as you just find new rows to go and build, and you keep invoking sub agents in parallel to fill them in.

Expand Down
24 changes: 20 additions & 4 deletions backend/src/mastra/tools/investigate-tool.ts
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
import { createTool } from "@mastra/core/tools";
import { z } from "zod";
import { convex, internal } from "../../convex.js";
import { buildInvestigateAgent } from "../agents/investigate.js";
import type { AuthContext } from "../workflows/populate.js";
import type { PopulateColumn } from "../../pipeline/populate.js";
import type { RunMetrics } from "../run-metrics.js";

/**
 * Maximum number of rows a dataset may hold. The run_subagent tool compares
 * the dataset's current row count against this value before spawning a
 * subagent and, once the count has reached it, returns a non-inserting
 * ROW_LIMIT_REACHED result instead of doing any work.
 */
const MAX_DATASET_ROWS = 100;

const investigateInputSchema = z.object({
entity_hint: z
.string()
Expand Down Expand Up @@ -82,11 +85,24 @@ export function buildSubagentTool(
inputSchema: investigateInputSchema,
outputSchema: investigateOutputSchema,
execute: async ({ entity_hint, primary_keys, context, urls, notes }) => {
if (metrics) metrics.investigateCalls++;
console.log(
`[run_subagent] spawning subagent user=${authContext.authorizedUserId} run=${authContext.workflowRunId} dataset=${authorizedDatasetId} entity="${entity_hint}" pk=${JSON.stringify(primary_keys)}`,
);
try {
const rowCount = await convex.query(internal.datasetRows.countByDataset, {
datasetId: authorizedDatasetId,
});
if (rowCount >= MAX_DATASET_ROWS) {
return {
inserted: false,
reason: `ROW_LIMIT_REACHED: BigSet datasets are capped at ${MAX_DATASET_ROWS} rows. Stop calling run_subagent and finish the run.`,
row_summary: undefined,
clues: undefined,
};
}

if (metrics) metrics.investigateCalls++;
console.log(
`[run_subagent] spawning subagent user=${authContext.authorizedUserId} run=${authContext.workflowRunId} dataset=${authorizedDatasetId} entity="${entity_hint}" pk=${JSON.stringify(primary_keys)}`,
);

const agent = buildInvestigateAgent(
authorizedDatasetId,
authContext,
Expand Down
4 changes: 3 additions & 1 deletion backend/src/mastra/workflows/populate.ts
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,9 @@ Data fields to collect:
${columnsDesc}${pkNote}${manifestNote}${strategyNote}

Search the web broadly to find real entities that fit this dataset topic.
For each lead you find, call run_subagent with the primary key values and any context/URLs you have found.`;
For each lead you find, call run_subagent with the primary key values and any context/URLs you have found.
If run_subagent returns ROW_LIMIT_REACHED, stop immediately and do not make any more tool calls.
Stop the populate run as soon as the dataset reaches 100 rows.`;
Comment thread
coderabbitai[bot] marked this conversation as resolved.

console.log(
`[build-prompt] Built prompt for ${inputData.datasetName} (${inputData.columns.length} columns, strategy=${inputData.enumerationStrategy})`,
Expand Down
18 changes: 12 additions & 6 deletions frontend/convex/datasetRows.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ import type { Id } from "./_generated/dataModel.js";
import { assertRowInDataset, loadReadableDataset } from "./lib/authz.js";
import { consumeQuotaForDataset } from "./lib/quota.js";

/**
 * Maximum number of rows a dataset may hold. `insert` throws a
 * "Row limit reached" error when the dataset's current row count is already
 * at or above this value, before any dedup or quota work runs.
 */
const MAX_DATASET_ROWS = 100;

/**
* Authoritative row count for a dataset. O(N), so use only on the slow
* paths: self-heal in `insert` / `remove` when the dataset doc predates
Expand Down Expand Up @@ -71,6 +73,16 @@ export const insert = internalMutation({
const dataset = await ctx.db.get(args.datasetId);
if (!dataset) throw new Error("Dataset not found");

const previousCount =
typeof dataset.rowCount === "number"
? dataset.rowCount
: await actualRowCount(ctx, args.datasetId);
if (previousCount >= MAX_DATASET_ROWS) {
throw new Error(
`Row limit reached: BigSet datasets are capped at ${MAX_DATASET_ROWS} rows. Stop inserting rows and finish the run.`,
);
}

// Dedup: reject inserts that collide on primary key columns.
// Runs BEFORE quota so rejected dupes don't burn quota.
const pkColumns = (dataset.columns ?? []).filter(
Expand Down Expand Up @@ -110,11 +122,6 @@ export const insert = internalMutation({
// Quota consumption only happens for genuine new rows.
await consumeQuotaForDataset(ctx, args.datasetId, 1);

const previousCount =
typeof dataset.rowCount === "number"
? dataset.rowCount
: await actualRowCount(ctx, args.datasetId);

const rowId = await ctx.db.insert("datasetRows", args);

await ctx.db.patch(args.datasetId, { rowCount: previousCount + 1 });
Expand Down Expand Up @@ -327,4 +334,3 @@ export const listInternal = internalQuery({
.collect();
},
});