diff --git a/backend/src/index.ts b/backend/src/index.ts index 61c7314..26c72ac 100644 --- a/backend/src/index.ts +++ b/backend/src/index.ts @@ -568,6 +568,7 @@ function startLocalRefreshScheduler( datasetId: dataset.datasetId, datasetName: dataset.datasetName, description: dataset.description, + maxRowCount: dataset.maxRowCount ?? 100, columns: dataset.columns, }, run, @@ -768,6 +769,13 @@ await fastify.register(async (instance) => { throw new Error(`Unexpected populate claim outcome: ${populateOutcome}`); } + const dataset = await convex.query(internal.datasets.getInternal, { + id: parsed.data.datasetId, + }); + if (!dataset) { + return reply.code(404).send({ error: "Dataset not found" }); + } + const { getModelConfig } = await import("./config/models.js"); const modelConfig = await getModelConfig(auth.userId); @@ -787,7 +795,10 @@ await fastify.register(async (instance) => { const controller = registerDataset(parsed.data.datasetId); void runPopulateWorkflowInBackground({ - input: parsed.data, + input: { + ...parsed.data, + maxRowCount: dataset.maxRowCount ?? parsed.data.maxRowCount, + }, run, controller, authorizedUserId: auth.userId, diff --git a/backend/src/mastra/agents/populate.ts b/backend/src/mastra/agents/populate.ts index 85edf53..155492a 100644 --- a/backend/src/mastra/agents/populate.ts +++ b/backend/src/mastra/agents/populate.ts @@ -10,9 +10,10 @@ const openrouter = createOpenRouter({ apiKey: process.env.OPENROUTER_API_KEY!, }); -const INSTRUCTIONS = `You are an expert dataset builder. You conduct research using your web tools. +function buildInstructions(maxRowCount: number): string { + return `You are an expert dataset builder. You conduct research using your web tools. You do broad research to see which rows to add, and then you spin up sub-agents that can do the deep research and fill in each row for you. -Your job is to make sure you dispatch and manage your army of sub agents to build up a dataset with 100 rows in it. Stop as soon as the dataset reaches 100 rows. +Your job is to make sure you dispatch and manage your army of sub agents to build up a dataset with ${maxRowCount} rows in it. Stop as soon as the dataset reaches ${maxRowCount} rows. WORKFLOW: 1. Understand the data that is is needed and do some research to find places on the web where this data may be obvious and easy to find, collect these links to see what the task of scraping the web is going to look like. @@ -22,12 +23,13 @@ If the dataset is to look at YC Companies, collect links for the YC Startup regi 3. See what the subagent reports back with, if all good and it gives you some information, use that to give better instuctions to subsequent sub agents. -Keep going until you have 100 rows, then finish immediately. If run_subagent reports ROW_LIMIT_REACHED, stop calling tools and finish the run. +Keep going until you have ${maxRowCount} rows, then finish immediately. If run_subagent reports ROW_LIMIT_REACHED, stop calling tools and finish the run. This process should become faster overtime as you just find new rows to go and build, and you keep invoking sub agents in parallel to fill them in. Duplicates are rejected automatically based on primary key columns. If a subagent reports a duplicate, don't re-investigate the same entity — move on to a new one. `; +} /** * Build the orchestrator Agent for a populate run. @@ -42,6 +44,7 @@ export function buildPopulateAgent( authorizedDatasetId: string, authContext: AuthContext, columns: PopulateColumn[], + maxRowCount: number, metrics?: RunMetrics, ): Agent { const modelSlug = authContext.modelConfig!.populateOrchestrator; @@ -49,7 +52,7 @@ export function buildPopulateAgent( return new Agent({ id: "populate-agent", name: "Dataset Populate Orchestrator", - instructions: INSTRUCTIONS, + instructions: buildInstructions(maxRowCount), model: openrouter(modelSlug), tools: { search_web: searchWebTool, @@ -58,6 +61,7 @@ export function buildPopulateAgent( authorizedDatasetId, authContext, columns, + maxRowCount, metrics, ), }, diff --git a/backend/src/mastra/tools/investigate-tool.ts b/backend/src/mastra/tools/investigate-tool.ts index 534ed7f..c1d6b18 100644 --- a/backend/src/mastra/tools/investigate-tool.ts +++ b/backend/src/mastra/tools/investigate-tool.ts @@ -7,8 +7,6 @@ import type { PopulateColumn } from "../../pipeline/populate.js"; import type { RunMetrics } from "../run-metrics.js"; import { getSignal } from "../../abort-registry.js"; -const MAX_DATASET_ROWS = 100; - const investigateInputSchema = z.object({ entity_hint: z .string() @@ -77,6 +75,7 @@ export function buildSubagentTool( authorizedDatasetId: string, authContext: AuthContext, columns: PopulateColumn[], + maxRowCount: number, metrics?: RunMetrics, ) { return createTool({ @@ -90,10 +89,10 @@ export function buildSubagentTool( const rowCount = await convex.query(internal.datasetRows.countByDataset, { datasetId: authorizedDatasetId, }); - if (rowCount >= MAX_DATASET_ROWS) { + if (rowCount >= maxRowCount) { return { inserted: false, - reason: `ROW_LIMIT_REACHED: BigSet datasets are capped at ${MAX_DATASET_ROWS} rows. Stop calling run_subagent and finish the run.`, + reason: `ROW_LIMIT_REACHED: this BigSet dataset is capped at ${maxRowCount} rows. Stop calling run_subagent and finish the run.`, row_summary: undefined, clues: undefined, }; @@ -130,7 +129,7 @@ Context (partial data already found): ${context}${urlsBlock}${notesBlock}`; const abortSignal = getSignal(authorizedDatasetId); - const result = await agent.generate(prompt, { abortSignal, maxSteps: 10 }); + const result = await agent.generate(prompt, { abortSignal, maxSteps: 25 }); if (metrics) { // Use result.toolCalls (the flat accumulated list across all steps) rather // than iterating result.steps[n].toolCalls. The per-step arrays are snapshots diff --git a/backend/src/mastra/workflows/populate.ts b/backend/src/mastra/workflows/populate.ts index a831616..35db3b1 100644 --- a/backend/src/mastra/workflows/populate.ts +++ b/backend/src/mastra/workflows/populate.ts @@ -157,6 +157,7 @@ const buildPromptOutputSchema = z.object({ authorizedDatasetId: z.string(), authContext: authContextSchema, columns: z.array(populateColumnSchema), + maxRowCount: z.number().int().min(1), }); const buildPromptStep = createStep({ @@ -203,7 +204,7 @@ ${columnsDesc}${pkNote}${manifestNote}${strategyNote} Search the web broadly to find real entities that fit this dataset topic. For each lead you find, call run_subagent with the primary key values and any context/URLs you have found. If run_subagent returns ROW_LIMIT_REACHED, stop immediately and do not make any more tool calls. -Stop the populate run as soon as the dataset reaches 100 rows.`; +Stop the populate run as soon as the dataset reaches ${inputData.maxRowCount} rows.`; console.log( `[build-prompt] Built prompt for ${inputData.datasetName} (${inputData.columns.length} columns, strategy=${inputData.enumerationStrategy})`, @@ -213,6 +214,7 @@ Stop the populate run as soon as the dataset reaches 100 rows.`; authorizedDatasetId: inputData.datasetId, authContext: inputData.authContext, columns: inputData.columns, + maxRowCount: inputData.maxRowCount, }; }, }); @@ -246,6 +248,7 @@ const agentStep = createStep({ inputData.authorizedDatasetId, inputData.authContext, inputData.columns, + inputData.maxRowCount, metrics, ); const abortSignal = getSignal(inputData.authorizedDatasetId); diff --git a/backend/src/pipeline/populate.ts b/backend/src/pipeline/populate.ts index 55d37aa..589db1e 100644 --- a/backend/src/pipeline/populate.ts +++ b/backend/src/pipeline/populate.ts @@ -1,5 +1,7 @@ import { z } from "zod"; +const FREE_TIER_MONTHLY_QUOTA = 2500; + export const populateColumnSchema = z.object({ name: z.string(), type: z.enum(["text", "number", "boolean", "url", "date"]), @@ -12,6 +14,7 @@ export const datasetContextSchema = z.object({ datasetId: z.string().min(1), datasetName: z.string(), description: z.string(), + maxRowCount: z.number().int().min(1).max(FREE_TIER_MONTHLY_QUOTA).default(100), columns: z.array(populateColumnSchema).min(1), rowIds: z.array(z.string()).min(1).optional(), }); diff --git a/frontend/app/dataset/[id]/page.tsx b/frontend/app/dataset/[id]/page.tsx index d9ee1b6..d28abda 100644 --- a/frontend/app/dataset/[id]/page.tsx +++ b/frontend/app/dataset/[id]/page.tsx @@ -25,7 +25,7 @@ import type { ProfileUser } from "@/lib/profile-user"; export default function DatasetPage() { const params = useParams(); - const { isLoading: authLoading } = useConvexAuth(); + const { isLoading: authLoading, isAuthenticated } = useConvexAuth(); const { userId, getToken } = useAuth(); const { user } = useUser(); const { signOut } = useClerk(); @@ -37,6 +37,8 @@ export default function DatasetPage() { const [settingsOpen, setSettingsOpen] = useState(false); const [confirmPopulate, setConfirmPopulate] = useState(false); const [savingRefreshCadence, setSavingRefreshCadence] = useState(false); + const [savingMaxRowCount, setSavingMaxRowCount] = useState(false); + const [maxRowCountSaveError, setMaxRowCountSaveError] = useState(null); const [cellDetail, setCellDetail] = useState<{ column: DatasetColumn; value: unknown; @@ -53,6 +55,11 @@ export default function DatasetPage() { authLoading ? "skip" : { datasetId }, ); const updateRefreshSettings = useMutation(api.datasets.updateRefreshSettings); + const updateMaxRowCount = useMutation(api.datasets.updateMaxRowCount); + const usage = useQuery( + api.quota.getMy, + isAuthenticated ? {} : "skip", + ); const rowIds = useMemo(() => (rows ?? []).map((r) => r._id), [rows]); const selection = useSelection(rowIds); @@ -71,6 +78,7 @@ export default function DatasetPage() { dataset._id, dataset.name, dataset.description, + dataset.maxRowCount ?? 100, dataset.columns, token, ); @@ -215,6 +223,45 @@ export default function DatasetPage() { } } + async function handleMaxRowCountChange(maxRowCount: number) { + if (!dataset || savingMaxRowCount || userId !== dataset.ownerId) return; + if (!Number.isInteger(maxRowCount) || maxRowCount < 1) { + setMaxRowCountSaveError("Max rows must be a whole number greater than 0."); + captureException(new Error("Invalid max row count"), { + operation: "dataset_max_row_count_update", + datasetId: dataset._id, + }); + return; + } + if (usage && maxRowCount > usage.remaining) { + setMaxRowCountSaveError( + `Max rows cannot exceed your remaining monthly quota of ${usage.remaining.toLocaleString()} row operations.`, + ); + return; + } + + setMaxRowCountSaveError(null); + setSavingMaxRowCount(true); + try { + await updateMaxRowCount({ + id: dataset._id, + maxRowCount, + }); + setMaxRowCountSaveError(null); + } catch (err) { + console.error("[max rows] failed", err); + setMaxRowCountSaveError( + err instanceof Error ? err.message : "Failed to update max rows.", + ); + captureException(err, { + operation: "dataset_max_row_count_update", + datasetId: dataset._id, + }); + } finally { + setSavingMaxRowCount(false); + } + } + async function handleStop() { if (!dataset || stopping) return; if (dataset.status !== "building" && dataset.status !== "updating") return; @@ -274,6 +321,7 @@ export default function DatasetPage() { ...dataset, refreshCadence: dataset.refreshCadence ?? "daily", refreshEnabled: dataset.refreshEnabled ?? true, + maxRowCount: dataset.maxRowCount ?? 100, }; const updateDisabled = updating || isDatasetBusy; const populateDisabled = populating || isDatasetBusy; @@ -347,11 +395,16 @@ export default function DatasetPage() { onClose={() => setSettingsOpen(false)} refreshCadence={displayDataset.refreshCadence} refreshCadenceDisabled={!isOwner || savingRefreshCadence} + maxRowCount={displayDataset.maxRowCount} + maxRowCountRemaining={usage?.remaining} + maxRowCountSaveError={maxRowCountSaveError} + maxRowCountDisabled={!isOwner || savingMaxRowCount} updateLabel={updateLabel} updateDisabled={updateDisabled} populateLabel={populateLabel} populateDisabled={populateDisabled} onRefreshCadenceChange={handleRefreshCadenceChange} + onMaxRowCountChange={handleMaxRowCountChange} onUpdate={() => { setSettingsOpen(false); handleUpdate(); }} onPopulate={() => { setSettingsOpen(false); @@ -516,11 +569,16 @@ function SettingsDropdown({ onClose, refreshCadence, refreshCadenceDisabled, + maxRowCount, + maxRowCountRemaining, + maxRowCountSaveError, + maxRowCountDisabled, updateLabel, updateDisabled, populateLabel, populateDisabled, onRefreshCadenceChange, + onMaxRowCountChange, onUpdate, onPopulate, }: { @@ -529,15 +587,32 @@ function SettingsDropdown({ onClose: () => void; refreshCadence: RefreshCadence; refreshCadenceDisabled: boolean; + maxRowCount: number; + maxRowCountRemaining?: number; + maxRowCountSaveError: string | null; + maxRowCountDisabled: boolean; updateLabel: string; updateDisabled: boolean; populateLabel: string; populateDisabled: boolean; onRefreshCadenceChange: (refreshCadence: RefreshCadence) => void; + onMaxRowCountChange: (maxRowCount: number) => void; onUpdate: () => void; onPopulate: () => void; }) { const ref = useRef(null); + const [maxRowCountInput, setMaxRowCountInput] = useState(String(maxRowCount)); + const parsedMaxRowCount = Number(maxRowCountInput); + const maxRowCountValidationError = + !maxRowCountInput.trim() + ? "Required" + : !Number.isInteger(parsedMaxRowCount) || parsedMaxRowCount < 1 + ? "Use a whole number" + : maxRowCountRemaining !== undefined && parsedMaxRowCount > maxRowCountRemaining + ? `Max ${maxRowCountRemaining.toLocaleString()}` + : null; + const maxRowCountChanged = + Number.isInteger(parsedMaxRowCount) && parsedMaxRowCount !== maxRowCount; useEffect(() => { if (!open) return; @@ -548,6 +623,10 @@ function SettingsDropdown({ return () => document.removeEventListener("mousedown", handleClick); }, [open, onClose]); + useEffect(() => { + if (!open) setMaxRowCountInput(String(maxRowCount)); + }, [maxRowCount, open]); + return (
+
+
+ Max rows +
+
+ setMaxRowCountInput(e.currentTarget.value)} + onBlur={() => { + if (!maxRowCountInput.trim()) return; + const value = Number(maxRowCountInput); + if (Number.isInteger(value) && value >= 1) { + setMaxRowCountInput(String(value)); + } + }} + className="min-w-0 flex-1 rounded-lg border border-border bg-background px-2 py-1.5 text-xs text-foreground outline-none transition-colors focus:border-foreground/30 disabled:opacity-50" + /> + +
+

+ {maxRowCountValidationError ?? + maxRowCountSaveError ?? + (maxRowCountRemaining !== undefined + ? `${maxRowCountRemaining.toLocaleString()} row operations available` + : "Applies to the next populate run")} +

+
Refresh cadence diff --git a/frontend/app/dataset/new/page.tsx b/frontend/app/dataset/new/page.tsx index 285cbde..bb4dded 100644 --- a/frontend/app/dataset/new/page.tsx +++ b/frontend/app/dataset/new/page.tsx @@ -4,7 +4,7 @@ import { useEffect, useState, useRef } from "react"; import { useRouter } from "next/navigation"; import Link from "next/link"; import { useAuth } from "@clerk/nextjs"; -import { useMutation, useConvexAuth } from "convex/react"; +import { useMutation, useQuery, useConvexAuth } from "convex/react"; import { api } from "@/convex/_generated/api"; import { EVENTS, track } from "@/lib/analytics"; import { inferSchema, type InferredColumn } from "@/lib/backend"; @@ -43,6 +43,8 @@ const BACKEND_TYPE_MAP: Record = { boolean: "boolean", }; +const DEFAULT_MAX_ROW_COUNT = 100; + function mapBackendColumn(col: InferredColumn, index: number): ProposedColumn { return { id: String(index + 1), @@ -81,6 +83,9 @@ export default function NewDatasetPage() { const [step, setStep] = useState("describe"); const [prompt, setPrompt] = useState(""); const [refreshCadence, setRefreshCadence] = useState("daily"); + const [maxRowCountInput, setMaxRowCountInput] = useState( + String(DEFAULT_MAX_ROW_COUNT), + ); const [columns, setColumns] = useState([]); const [datasetName, setDatasetName] = useState(""); const [isCreating, setIsCreating] = useState(false); @@ -92,6 +97,10 @@ export default function NewDatasetPage() { const { getToken } = useAuth(); const createDataset = useMutation(api.datasets.create); + const usage = useQuery( + api.quota.getMy, + isAuthenticated ? {} : "skip", + ); // Page-view event: fires once when the wizard becomes visible (after // auth resolves and the user is authenticated; we don't want to fire @@ -163,6 +172,17 @@ export default function NewDatasetPage() { async function handleConfirm() { if (isCreating) return; + const maxRowCount = Number(maxRowCountInput); + if (!Number.isInteger(maxRowCount) || maxRowCount < 1) { + setError("Max rows must be a whole number greater than 0."); + return; + } + if (usage && maxRowCount > usage.remaining) { + setError( + `Max rows cannot exceed your remaining monthly quota of ${usage.remaining.toLocaleString()} row operations.`, + ); + return; + } setIsCreating(true); setError(null); let datasetId: string; @@ -171,6 +191,7 @@ export default function NewDatasetPage() { name: datasetName, description: prompt, refreshCadence, + maxRowCount, columns: columns.map((c) => ({ name: c.name, type: c.type, @@ -195,6 +216,7 @@ export default function NewDatasetPage() { datasetId, column_count: columns.length, refreshCadence, + maxRowCount, }); } catch {} router.push(`/dataset/${datasetId}`); @@ -320,6 +342,34 @@ export default function NewDatasetPage() { ))}
+ +
+ + setMaxRowCountInput(e.currentTarget.value)} + onBlur={() => { + if (!maxRowCountInput.trim()) return; + const value = Number(maxRowCountInput); + if (Number.isInteger(value) && value >= 1) { + setMaxRowCountInput(String(value)); + } + }} + className="w-36 rounded-lg border border-border bg-surface px-4 py-2.5 text-sm font-medium outline-none focus:border-foreground/30 transition-colors" + /> + {usage && ( +

+ Up to {usage.remaining.toLocaleString()} row operations available this month. +

+ )} +
diff --git a/frontend/convex/datasetRows.ts b/frontend/convex/datasetRows.ts index 0536f3b..eac39ea 100644 --- a/frontend/convex/datasetRows.ts +++ b/frontend/convex/datasetRows.ts @@ -5,7 +5,7 @@ import type { Id } from "./_generated/dataModel.js"; import { assertRowInDataset, loadReadableDataset } from "./lib/authz.js"; import { consumeQuotaForDataset } from "./lib/quota.js"; -const MAX_DATASET_ROWS = 100; +const DEFAULT_MAX_DATASET_ROWS = 100; /** * Authoritative row count for a dataset. O(N), so use only on the slow @@ -77,9 +77,10 @@ export const insert = internalMutation({ typeof dataset.rowCount === "number" ? dataset.rowCount : await actualRowCount(ctx, args.datasetId); - if (previousCount >= MAX_DATASET_ROWS) { + const maxRowCount = dataset.maxRowCount ?? DEFAULT_MAX_DATASET_ROWS; + if (previousCount >= maxRowCount) { throw new Error( - `Row limit reached: BigSet datasets are capped at ${MAX_DATASET_ROWS} rows. Stop inserting rows and finish the run.`, + `Row limit reached: this BigSet dataset is capped at ${maxRowCount} rows. Stop inserting rows and finish the run.`, ); } diff --git a/frontend/convex/datasets.ts b/frontend/convex/datasets.ts index 4f34619..d295327 100644 --- a/frontend/convex/datasets.ts +++ b/frontend/convex/datasets.ts @@ -13,7 +13,7 @@ import { loadReadableDataset, requireIdentity, } from "./lib/authz.js"; -import { requireQuotaRemaining } from "./lib/quota.js"; +import { FREE_TIER_MONTHLY_QUOTA, requireQuotaRemaining } from "./lib/quota.js"; import { nextRefreshAtFor, refreshCadenceValidator, @@ -62,6 +62,19 @@ function refreshCadenceFromLegacyLabel( } const PREVIEW_ROW_COUNT = 5; +const DEFAULT_MAX_ROW_COUNT = 100; + +function validateMaxRowCount(maxRowCount: number): void { + if ( + !Number.isInteger(maxRowCount) || + maxRowCount < 1 || + maxRowCount > FREE_TIER_MONTHLY_QUOTA + ) { + throw new Error( + `Max row count must be a whole number between 1 and ${FREE_TIER_MONTHLY_QUOTA}.`, + ); + } +} async function attachPreview(ctx: QueryCtx, dataset: Doc<"datasets">) { // Mini-table preview: just the first N rows. `.take` keeps the @@ -270,6 +283,7 @@ export const claimScheduledRefreshInternal = internalMutation({ description: dataset.description, columns: dataset.columns, ownerId: dataset.ownerId, + maxRowCount: dataset.maxRowCount ?? DEFAULT_MAX_ROW_COUNT, }, }; }, @@ -360,6 +374,7 @@ export const create = mutation({ name: v.string(), description: v.string(), refreshCadence: refreshCadenceValidator, + maxRowCount: v.number(), columns: v.array(columnValidator), retrievalStrategy: v.optional( v.union( @@ -373,10 +388,11 @@ export const create = mutation({ handler: async (ctx, args) => { const identity = await requireIdentity(ctx); assertNotReservedOwner(identity.subject); + validateMaxRowCount(args.maxRowCount); // Block dataset creation at full exhaustion — a dataset you can't // populate is just clutter. Row generation later will re-check, so // this is a UX safeguard, not the only line of defense. - await requireQuotaRemaining(ctx, identity.subject, 1); + await requireQuotaRemaining(ctx, identity.subject, args.maxRowCount); return await ctx.db.insert("datasets", { ...args, @@ -408,6 +424,23 @@ export const updateRefreshSettings = mutation({ }, }); +export const updateMaxRowCount = mutation({ + args: { + id: v.id("datasets"), + maxRowCount: v.number(), + }, + handler: async (ctx, args) => { + const dataset = await loadOwnedDataset(ctx, args.id); + validateMaxRowCount(args.maxRowCount); + const currentRowCount = dataset.rowCount ?? 0; + const additionalRowsNeeded = Math.max(0, args.maxRowCount - currentRowCount); + await requireQuotaRemaining(ctx, dataset.ownerId, additionalRowsNeeded); + await ctx.db.patch(dataset._id, { + maxRowCount: args.maxRowCount, + }); + }, +}); + export const backfillRefreshSettings = internalMutation({ args: { defaultCadence: v.optional(refreshCadenceValidator), diff --git a/frontend/convex/schema.ts b/frontend/convex/schema.ts index d1c1888..4e7b5f4 100644 --- a/frontend/convex/schema.ts +++ b/frontend/convex/schema.ts @@ -50,6 +50,9 @@ export default defineSchema({ // with rows created before this field existed — write paths self-heal // on first hit, and `datasets.backfillRowCounts` migrates all at once. rowCount: v.optional(v.number()), + // User-selected target/limit for populate runs. Optional so existing + // datasets keep the legacy 100-row behavior until touched. + maxRowCount: v.optional(v.number()), columns: v.array( v.object({ name: v.string(), diff --git a/frontend/lib/backend.ts b/frontend/lib/backend.ts index a4f316d..f023a35 100644 --- a/frontend/lib/backend.ts +++ b/frontend/lib/backend.ts @@ -211,6 +211,7 @@ export async function populate( datasetId: string, datasetName: string, description: string, + maxRowCount: number, columns: PopulateColumn[], token: string, ): Promise { @@ -220,7 +221,7 @@ export async function populate( "Content-Type": "application/json", Authorization: `Bearer ${token}`, }, - body: JSON.stringify({ datasetId, datasetName: datasetName, description, columns }), + body: JSON.stringify({ datasetId, datasetName, description, maxRowCount, columns }), }); if (!res.ok) {