Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion backend/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -568,6 +568,7 @@ function startLocalRefreshScheduler(
datasetId: dataset.datasetId,
datasetName: dataset.datasetName,
description: dataset.description,
maxRowCount: dataset.maxRowCount ?? 100,
columns: dataset.columns,
},
run,
Expand Down Expand Up @@ -768,6 +769,13 @@ await fastify.register(async (instance) => {
throw new Error(`Unexpected populate claim outcome: ${populateOutcome}`);
}

const dataset = await convex.query(internal.datasets.getInternal, {
id: parsed.data.datasetId,
});
if (!dataset) {
return reply.code(404).send({ error: "Dataset not found" });
}
Comment thread
coderabbitai[bot] marked this conversation as resolved.

const { getModelConfig } = await import("./config/models.js");
const modelConfig = await getModelConfig(auth.userId);

Expand All @@ -787,7 +795,10 @@ await fastify.register(async (instance) => {
const controller = registerDataset(parsed.data.datasetId);

void runPopulateWorkflowInBackground({
input: parsed.data,
input: {
...parsed.data,
maxRowCount: dataset.maxRowCount ?? parsed.data.maxRowCount,
},
run,
controller,
authorizedUserId: auth.userId,
Expand Down
12 changes: 8 additions & 4 deletions backend/src/mastra/agents/populate.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,10 @@ const openrouter = createOpenRouter({
apiKey: process.env.OPENROUTER_API_KEY!,
});

const INSTRUCTIONS = `You are an expert dataset builder. You conduct research using your web tools.
function buildInstructions(maxRowCount: number): string {
return `You are an expert dataset builder. You conduct research using your web tools.
You do broad research to see which rows to add, and then you spin up sub-agents that can do the deep research and fill in each row for you.
Your job is to make sure you dispatch and manage your army of sub agents to build up a dataset with 100 rows in it. Stop as soon as the dataset reaches 100 rows.
Your job is to make sure you dispatch and manage your army of sub agents to build up a dataset with ${maxRowCount} rows in it. Stop as soon as the dataset reaches ${maxRowCount} rows.

WORKFLOW:
1. Understand the data that is is needed and do some research to find places on the web where this data may be obvious and easy to find, collect these links to see what the task of scraping the web is going to look like.
Expand All @@ -22,12 +23,13 @@ If the dataset is to look at YC Companies, collect links for the YC Startup regi

3. See what the subagent reports back with, if all good and it gives you some information, use that to give better instuctions to subsequent sub agents.

Keep going until you have 100 rows, then finish immediately. If run_subagent reports ROW_LIMIT_REACHED, stop calling tools and finish the run.
Keep going until you have ${maxRowCount} rows, then finish immediately. If run_subagent reports ROW_LIMIT_REACHED, stop calling tools and finish the run.

This process should become faster overtime as you just find new rows to go and build, and you keep invoking sub agents in parallel to fill them in.

Duplicates are rejected automatically based on primary key columns. If a subagent reports a duplicate, don't re-investigate the same entity — move on to a new one.
`;
}

/**
* Build the orchestrator Agent for a populate run.
Expand All @@ -42,14 +44,15 @@ export function buildPopulateAgent(
authorizedDatasetId: string,
authContext: AuthContext,
columns: PopulateColumn[],
maxRowCount: number,
metrics?: RunMetrics,
): Agent {
const modelSlug = authContext.modelConfig!.populateOrchestrator;

return new Agent({
id: "populate-agent",
name: "Dataset Populate Orchestrator",
instructions: INSTRUCTIONS,
instructions: buildInstructions(maxRowCount),
model: openrouter(modelSlug),
tools: {
search_web: searchWebTool,
Expand All @@ -58,6 +61,7 @@ export function buildPopulateAgent(
authorizedDatasetId,
authContext,
columns,
maxRowCount,
metrics,
),
},
Expand Down
9 changes: 4 additions & 5 deletions backend/src/mastra/tools/investigate-tool.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@ import type { PopulateColumn } from "../../pipeline/populate.js";
import type { RunMetrics } from "../run-metrics.js";
import { getSignal } from "../../abort-registry.js";

const MAX_DATASET_ROWS = 100;

const investigateInputSchema = z.object({
entity_hint: z
.string()
Expand Down Expand Up @@ -77,6 +75,7 @@ export function buildSubagentTool(
authorizedDatasetId: string,
authContext: AuthContext,
columns: PopulateColumn[],
maxRowCount: number,
metrics?: RunMetrics,
) {
return createTool({
Expand All @@ -90,10 +89,10 @@ export function buildSubagentTool(
const rowCount = await convex.query(internal.datasetRows.countByDataset, {
datasetId: authorizedDatasetId,
});
if (rowCount >= MAX_DATASET_ROWS) {
if (rowCount >= maxRowCount) {
return {
inserted: false,
reason: `ROW_LIMIT_REACHED: BigSet datasets are capped at ${MAX_DATASET_ROWS} rows. Stop calling run_subagent and finish the run.`,
reason: `ROW_LIMIT_REACHED: this BigSet dataset is capped at ${maxRowCount} rows. Stop calling run_subagent and finish the run.`,
row_summary: undefined,
clues: undefined,
};
Expand Down Expand Up @@ -130,7 +129,7 @@ Context (partial data already found):
${context}${urlsBlock}${notesBlock}`;

const abortSignal = getSignal(authorizedDatasetId);
const result = await agent.generate(prompt, { abortSignal, maxSteps: 10 });
const result = await agent.generate(prompt, { abortSignal, maxSteps: 25 });
if (metrics) {
// Use result.toolCalls (the flat accumulated list across all steps) rather
// than iterating result.steps[n].toolCalls. The per-step arrays are snapshots
Expand Down
5 changes: 4 additions & 1 deletion backend/src/mastra/workflows/populate.ts
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,7 @@ const buildPromptOutputSchema = z.object({
authorizedDatasetId: z.string(),
authContext: authContextSchema,
columns: z.array(populateColumnSchema),
maxRowCount: z.number().int().min(1),
});

const buildPromptStep = createStep({
Expand Down Expand Up @@ -203,7 +204,7 @@ ${columnsDesc}${pkNote}${manifestNote}${strategyNote}
Search the web broadly to find real entities that fit this dataset topic.
For each lead you find, call run_subagent with the primary key values and any context/URLs you have found.
If run_subagent returns ROW_LIMIT_REACHED, stop immediately and do not make any more tool calls.
Stop the populate run as soon as the dataset reaches 100 rows.`;
Stop the populate run as soon as the dataset reaches ${inputData.maxRowCount} rows.`;

console.log(
`[build-prompt] Built prompt for ${inputData.datasetName} (${inputData.columns.length} columns, strategy=${inputData.enumerationStrategy})`,
Expand All @@ -213,6 +214,7 @@ Stop the populate run as soon as the dataset reaches 100 rows.`;
authorizedDatasetId: inputData.datasetId,
authContext: inputData.authContext,
columns: inputData.columns,
maxRowCount: inputData.maxRowCount,
};
},
});
Expand Down Expand Up @@ -246,6 +248,7 @@ const agentStep = createStep({
inputData.authorizedDatasetId,
inputData.authContext,
inputData.columns,
inputData.maxRowCount,
metrics,
);
const abortSignal = getSignal(inputData.authorizedDatasetId);
Expand Down
3 changes: 3 additions & 0 deletions backend/src/pipeline/populate.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import { z } from "zod";

const FREE_TIER_MONTHLY_QUOTA = 2500;

export const populateColumnSchema = z.object({
name: z.string(),
type: z.enum(["text", "number", "boolean", "url", "date"]),
Expand All @@ -12,6 +14,7 @@ export const datasetContextSchema = z.object({
datasetId: z.string().min(1),
datasetName: z.string(),
description: z.string(),
maxRowCount: z.number().int().min(1).max(FREE_TIER_MONTHLY_QUOTA).default(100),
columns: z.array(populateColumnSchema).min(1),
rowIds: z.array(z.string()).min(1).optional(),
});
Expand Down
Loading