From 4cdf47d32f78e38351aea06c10a5e7d6f83f52d1 Mon Sep 17 00:00:00 2001
From: pritpatel2412 <pritptl2412@gmail.com>
Date: Thu, 4 Jun 2026 14:46:02 +0530
Subject: [PATCH 1/3] feat: paginate runStats queries, fix verify-authz on
 Windows, and resolve React hook state effect warning

---
 .../table/use-row-change-detection.ts         | 14 +++--
 frontend/convex/runStats.ts                   | 54 +++++++++++++------
 frontend/convex/schema.ts                     |  2 +
 scripts/verify-authz.sh                       | 19 +++++--
 4 files changed, 64 insertions(+), 25 deletions(-)

diff --git a/frontend/components/table/use-row-change-detection.ts b/frontend/components/table/use-row-change-detection.ts
index 49ce219..e823372 100644
--- a/frontend/components/table/use-row-change-detection.ts
+++ b/frontend/components/table/use-row-change-detection.ts
@@ -53,11 +53,15 @@ export function useRowChangeDetection(rows: DatasetRow[]) {
     prevRowsRef.current = nextMap;
 
     if (newFlashes.size > 0) {
-      setFlashingCells((prev) => {
-        const merged = new Set(prev);
-        for (const key of newFlashes) merged.add(key);
-        return merged;
-      });
+      const updateTimer = setTimeout(() => {
+        setFlashingCells((prev) => {
+          const merged = new Set(prev);
+          for (const key of newFlashes) merged.add(key);
+          return merged;
+        });
+        flashTimersRef.current.delete(updateTimer);
+      }, 0);
+      flashTimersRef.current.add(updateTimer);
 
       const timer = setTimeout(() => {
         setFlashingCells((prev) => {
diff --git a/frontend/convex/runStats.ts b/frontend/convex/runStats.ts
index d1165ec..c225c40 100644
--- a/frontend/convex/runStats.ts
+++ b/frontend/convex/runStats.ts
@@ -1,6 +1,9 @@
 import { internalMutation, internalQuery } from "./_generated/server.js";
 import { v } from "convex/values";
 
+const DEFAULT_PAGE_SIZE = 50;
+const MAX_PAGE_SIZE = 200;
+
 /**
  * Insert a populate-run metrics record.
  *
@@ -68,33 +71,52 @@ export const getByWorkflowRunId = internalQuery({
 });
 
 /**
- * List all runs for a dataset, newest first.
- * TODO: paginate — .collect() loads all docs into memory and will degrade
- * as run history grows. Add cursor-based pagination when this is exposed
- * to the frontend or run counts become large.
+ * List runs for a dataset, newest first, one cursor-paginated page per call.
+ * `limit` defaults to DEFAULT_PAGE_SIZE and is clamped to [1, MAX_PAGE_SIZE].
  */
 export const listByDataset = internalQuery({
-  args: { datasetId: v.string() },
+  args: {
+    datasetId: v.string(),
+    cursor: v.optional(v.string()),
+    limit: v.optional(v.number()),
+  },
   handler: async (ctx, args) => {
-    const runs = await ctx.db
+    // v.number() admits fractions, 0, negatives and NaN: floor the request,
+    // fall back to the default for NaN/0, then clamp to [1, MAX_PAGE_SIZE].
+    const requested = Math.floor(args.limit ?? DEFAULT_PAGE_SIZE) || DEFAULT_PAGE_SIZE;
+    const limit = Math.max(1, Math.min(requested, MAX_PAGE_SIZE));
+    const { page, isDone, continueCursor } = await ctx.db
       .query("runStats")
-      .withIndex("by_dataset", (q) => q.eq("datasetId", args.datasetId))
-      .collect();
-    return runs.sort((a, b) => b.startedAt - a.startedAt);
+      .withIndex("by_dataset_started_at", (q) =>
+        q.eq("datasetId", args.datasetId),
+      )
+      .order("desc")
+      .paginate({ cursor: args.cursor ?? null, numItems: limit });
+
+    return { runs: page, isDone, continueCursor };
   },
 });
 
 /**
- * List all runs for a user, newest first.
- * TODO: paginate — same concern as listByDataset above.
+ * List runs for a user, newest first (cursor-paginated, limit clamped to [1, MAX_PAGE_SIZE]).
  */
 export const listByUser = internalQuery({
-  args: { userId: v.string() },
+  args: {
+    userId: v.string(),
+    cursor: v.optional(v.string()),
+    limit: v.optional(v.number()),
+  },
   handler: async (ctx, args) => {
-    const runs = await ctx.db
+    // v.number() admits fractions, 0, negatives and NaN: floor the request,
+    // fall back to the default for NaN/0, then clamp to [1, MAX_PAGE_SIZE].
+    const requested = Math.floor(args.limit ?? DEFAULT_PAGE_SIZE) || DEFAULT_PAGE_SIZE;
+    const limit = Math.max(1, Math.min(requested, MAX_PAGE_SIZE));
+    const { page, isDone, continueCursor } = await ctx.db
       .query("runStats")
-      .withIndex("by_user", (q) => q.eq("userId", args.userId))
-      .collect();
-    return runs.sort((a, b) => b.startedAt - a.startedAt);
+      .withIndex("by_user_started_at", (q) => q.eq("userId", args.userId))
+      .order("desc")
+      .paginate({ cursor: args.cursor ?? null, numItems: limit });
+
+    return { runs: page, isDone, continueCursor };
   },
 });
diff --git a/frontend/convex/schema.ts b/frontend/convex/schema.ts
index d1c1888..5918710 100644
--- a/frontend/convex/schema.ts
+++ b/frontend/convex/schema.ts
@@ -170,6 +170,8 @@ export default defineSchema({
     rowsUpdated: v.optional(v.number()),
   })
     .index("by_dataset", ["datasetId"])
+    .index("by_dataset_started_at", ["datasetId", "startedAt"])
     .index("by_user", ["userId"])
+    .index("by_user_started_at", ["userId", "startedAt"])
     .index("by_workflow_run", ["workflowRunId"]),
 });
diff --git a/scripts/verify-authz.sh b/scripts/verify-authz.sh
index b3496a1..3dd457e 100644
--- a/scripts/verify-authz.sh
+++ b/scripts/verify-authz.sh
@@ -6,6 +6,17 @@
 #   bash scripts/verify-authz.sh
 set -u
 
+# Pick an interpreter that actually runs. On Windows the Microsoft Store alias
+# makes `python3` resolvable via `command -v` even when nothing is installed.
+PYTHON=""
+for candidate in python3 python; do
+  if "$candidate" -c "import sys" >/dev/null 2>&1; then PYTHON="$candidate"; break; fi
+done
+if [ -z "$PYTHON" ]; then
+  echo "Error: python3 or python is required to run this script." >&2
+  exit 1
+fi
+
 CONVEX="${CONVEX_URL:-http://localhost:3210}"
 FRONTEND="${FRONTEND_URL:-http://localhost:3500}"
 FAIL=0
@@ -34,11 +45,11 @@ mutation() {
 }
 
 assert_success() {
-  python3 -c "import json,sys; d=json.load(sys.stdin); print('PASS' if d.get('status')=='success' else 'FAIL: '+d.get('errorMessage','?')[:60])"
+  $PYTHON -c "import json,sys; d=json.load(sys.stdin); print('PASS' if d.get('status')=='success' else 'FAIL: '+d.get('errorMessage','?')[:60])"
 }
 assert_error_contains() {
   local needle="$1"
-  python3 -c "import json,sys; d=json.load(sys.stdin); print('PASS' if '$needle' in d.get('errorMessage','') else 'FAIL: '+d.get('errorMessage','?')[:80])"
+  $PYTHON -c "import json,sys; d=json.load(sys.stdin); print('PASS' if '$needle' in d.get('errorMessage','') else 'FAIL: '+d.get('errorMessage','?')[:80])"
 }
 
 echo "════════════════════════════════════════════════════════════════"
@@ -47,7 +58,7 @@ echo "  convex=$CONVEX  frontend=$FRONTEND"
 echo "════════════════════════════════════════════════════════════════"
 
 PUB_ID=$(query '{"path":"datasets:listPublic","args":{},"format":"json"}' \
-  | python3 -c "import json,sys; print(json.load(sys.stdin)['value'][0]['_id'])")
+  | $PYTHON -c "import json,sys; print(json.load(sys.stdin)['value'][0]['_id'])")
 if [ -z "${PUB_ID:-}" ]; then
   echo "No public dataset found. Seed curated data first (publicSeed:seedPublicDatasets)."
   exit 1
@@ -65,7 +76,7 @@ section "Anonymous WRITES — must all be rejected"
 run_test "anon datasets.listMine -> Not authenticated" \
   "$(query '{"path":"datasets:listMine","args":{},"format":"json"}' | assert_error_contains 'Not authenticated')"
 run_test "anon datasets.create -> Not authenticated" \
-  "$(mutation '{"path":"datasets:create","args":{"name":"x","description":"x","cadence":"daily","columns":[]},"format":"json"}' | assert_error_contains 'Not authenticated')"
+  "$(mutation '{"path":"datasets:create","args":{"name":"x","description":"x","refreshCadence":"daily","columns":[]},"format":"json"}' | assert_error_contains 'Not authenticated')"
 run_test "anon datasets.updateStatus -> Not authenticated" \
   "$(mutation "{\"path\":\"datasets:updateStatus\",\"args\":{\"id\":\"$PUB_ID\",\"status\":\"paused\"},\"format\":\"json\"}" | assert_error_contains 'Not authenticated')"
 run_test "anon datasets.remove -> Not authenticated" \

From 0103e93e2048b90e979f5308307eee4a9c514da6 Mon Sep 17 00:00:00 2001
From: pritpatel2412 <pritptl2412@gmail.com>
Date: Fri, 5 Jun 2026 17:45:54 +0530
Subject: [PATCH 2/3] feat: implement per-cell source provenance

---
 backend/src/mastra/agents/investigate.ts  |  5 +-
 backend/src/mastra/tools/dataset-tools.ts | 28 +++++++++-
 frontend/app/dataset/[id]/page.tsx        | 13 ++++-
 frontend/components/SideSheet.tsx         | 67 ++++++++++++++++++++++-
 frontend/components/table/DataRow.tsx     | 13 ++++-
 frontend/components/table/types.ts        |  8 +++
 frontend/convex/datasetRows.ts            | 21 +++++++
 frontend/convex/schema.ts                 | 10 ++++
 8 files changed, 158 insertions(+), 7 deletions(-)

diff --git a/backend/src/mastra/agents/investigate.ts b/backend/src/mastra/agents/investigate.ts
index 4cdc32e..99c2a4d 100644
--- a/backend/src/mastra/agents/investigate.ts
+++ b/backend/src/mastra/agents/investigate.ts
@@ -28,18 +28,19 @@ RULES:
 - You have at most 6 tool calls total. Budget them: 1 fetch + 1 search + 1 fetch + 1 insert = done.
 - ALWAYS insert a row, even if some fields are incomplete. Use "" for unknown fields. Partial real data is better than no row.
 - Never fabricate values. Use "" for anything you cannot verify.
+- For every field value you extract and fill in "data", you MUST record the cell-level provenance (the source URL, the search query used to find it, and the exact text snippet context showing the value) in the "provenance" parameter of insert_row/update_row.
 - insert_row rejects duplicates based on primary key columns. If you get a "Duplicate" error, do NOT retry — report INSERTED: false and move on.
 
 TOOL CALL FORMAT — every tool call argument must be a JSON object wrapped in curly braces:
   search_web: {"query": "your search terms"}
   fetch_page: {"url": "https://example.com"}
-  insert_row: {"data": {${columnNames.map((n) => `"${n}": "value"`).join(", ")}}, "sources": ["https://url-you-fetched.com"], "row_summary": "one line about this entity", "how_found": "step by step guide on how to extract the data so an agent in the future can do it too"}
+  insert_row: {"data": {${columnNames.map((n) => `"${n}": "value"`).join(", ")}}, "sources": ["https://url-you-fetched.com"], "provenance": {${columnNames.map((n) => `"${n}": {"url": "https://url-you-fetched.com", "query": "search query used", "snippet": "exact context snippet from page"}`).join(", ")}}, "row_summary": "one line about this entity", "how_found": "step by step guide on how to extract the data so an agent in the future can do it too"}
 
 WORKFLOW:
 1. Fetch 1-2 of the provided URLs to get real data (if URLs were given).
 2. If you need more, run ONE search and fetch the best result.
 3. Call insert_row with whatever real data you have. Use "" for missing fields.
-   Include "sources" (URLs you fetched), "row_summary" (one line about this entity), and "how_found" (a step by step guide on how you found this data. eg, 1. fetch the contents of this url "<insert url>", 2. Look for the pricing field, and title name field, 3. etc...)
+   Include "sources" (URLs you fetched), "provenance" (mapping of column names to their detailed source details), "row_summary" (one line about this entity), and "how_found" (a step by step guide on how you found this data. eg, 1. fetch the contents of this url "<insert url>", 2. Look for the pricing field, and title name field, 3. etc...)
 4. Write your final response:
    INSERTED: true/false
    SUMMARY: one line
diff --git a/backend/src/mastra/tools/dataset-tools.ts b/backend/src/mastra/tools/dataset-tools.ts
index 1fc016e..f044ca3 100644
--- a/backend/src/mastra/tools/dataset-tools.ts
+++ b/backend/src/mastra/tools/dataset-tools.ts
@@ -131,6 +131,17 @@ export function buildPopulateTools(
         .array(z.string())
         .optional()
         .describe("URLs you visited or used to gather data for this row"),
+      provenance: z
+        .record(
+          z.string(),
+          z.object({
+            url: z.string(),
+            query: z.string().optional(),
+            snippet: z.string().optional(),
+          })
+        )
+        .optional()
+        .describe("Mapping of column names to their detailed source provenance (url, query, snippet)"),
       row_summary: z
         .string()
         .optional()
@@ -141,7 +152,7 @@ export function buildPopulateTools(
         .describe("Brief description of how you found and verified this data"),
     }),
     outputSchema: writeResultSchema,
-    execute: async ({ data, sources, row_summary, how_found }) => {
+    execute: async ({ data, sources, provenance, row_summary, how_found }) => {
       if (!data || Object.keys(data).length === 0)
         return {
           success: false,
@@ -158,6 +169,7 @@ export function buildPopulateTools(
           datasetId: authorizedDatasetId,
           data: cleanedData,
           ...(sources !== undefined ? { sources } : {}),
+          ...(provenance !== undefined ? { provenance } : {}),
           ...(row_summary !== undefined ? { rowSummary: row_summary } : {}),
           ...(how_found !== undefined ? { howFound: how_found } : {}),
         });
@@ -265,6 +277,17 @@ export function buildPopulateTools(
         .array(z.string())
         .optional()
         .describe("Updated source URLs where this data was verified"),
+      provenance: z
+        .record(
+          z.string(),
+          z.object({
+            url: z.string(),
+            query: z.string().optional(),
+            snippet: z.string().optional(),
+          })
+        )
+        .optional()
+        .describe("Updated mapping of column names to their detailed source provenance (url, query, snippet)"),
       row_summary: z
         .string()
         .optional()
@@ -275,7 +298,7 @@ export function buildPopulateTools(
         .describe("Brief description of how the updated data was found"),
     }),
     outputSchema: writeResultSchema,
-    execute: async ({ rowId, data, sources, row_summary, how_found }) => {
+    execute: async ({ rowId, data, sources, provenance, row_summary, how_found }) => {
       if (!rowId) return { success: false, error: "rowId is required." };
       if (!data || Object.keys(data).length === 0)
         return {
@@ -293,6 +316,7 @@ export function buildPopulateTools(
           expectedDatasetId: authorizedDatasetId,
           data: cleanedData,
           ...(sources !== undefined ? { sources } : {}),
+          ...(provenance !== undefined ? { provenance } : {}),
           ...(row_summary !== undefined ? { rowSummary: row_summary } : {}),
           ...(how_found !== undefined ? { howFound: how_found } : {}),
         });
diff --git a/frontend/app/dataset/[id]/page.tsx b/frontend/app/dataset/[id]/page.tsx
index d9ee1b6..956dd35 100644
--- a/frontend/app/dataset/[id]/page.tsx
+++ b/frontend/app/dataset/[id]/page.tsx
@@ -41,6 +41,11 @@ export default function DatasetPage() {
     column: DatasetColumn;
     value: unknown;
     sources?: string[];
+    provenance?: {
+      url: string;
+      query?: string;
+      snippet?: string;
+    };
   } | null>(null);
 
   const datasetId = params.id as Id<"datasets">;
@@ -95,7 +100,12 @@ export default function DatasetPage() {
     const col = dataset.columns.find((c) => c.name === columnName);
     if (!col) return;
     const row = rows.find((r) => r._id === rowId);
-    setCellDetail({ column: col, value, sources: row?.sources });
+    setCellDetail({
+      column: col,
+      value,
+      sources: row?.sources,
+      provenance: row?.provenance?.[columnName],
+    });
   }, [dataset, rows]);
 
   const openedFired = useRef<string | null>(null);
@@ -409,6 +419,7 @@ export default function DatasetPage() {
             column={cellDetail.column}
             value={cellDetail.value}
             sources={cellDetail.sources}
+            provenance={cellDetail.provenance}
           />
         )}
       </SideSheet>
diff --git a/frontend/components/SideSheet.tsx b/frontend/components/SideSheet.tsx
index 215d477..972631c 100644
--- a/frontend/components/SideSheet.tsx
+++ b/frontend/components/SideSheet.tsx
@@ -119,6 +119,12 @@ interface CellDetailProps {
   value: unknown;
   /** Row-level sources stored by the populate agent. */
   sources?: string[];
+  /** Cell-level provenance metadata. */
+  provenance?: {
+    url: string;
+    query?: string;
+    snippet?: string;
+  };
 }
 
 function isValidHttpUrl(src: string): boolean {
@@ -130,7 +136,7 @@ function isValidHttpUrl(src: string): boolean {
   }
 }
 
-export function CellDetail({ column, value, sources }: CellDetailProps) {
+export function CellDetail({ column, value, sources, provenance }: CellDetailProps) {
   const [copied, setCopied] = useState(false);
   const copyTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
   const displayValue = value == null || value === "" ? "—" : String(value);
@@ -192,6 +198,65 @@ export function CellDetail({ column, value, sources }: CellDetailProps) {
         </div>
       </div>
 
+      {/* Cell Provenance */}
+      {provenance && (
+        <div className="rounded-xl border border-emerald-500/15 bg-emerald-500/[0.02] p-4 space-y-3.5">
+          <div className="flex items-center gap-2 text-emerald-700 dark:text-emerald-400 font-medium text-xs">
+            <svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth="2" strokeLinecap="round" strokeLinejoin="round">
+              <path d="M12 22s8-4 8-10V5l-8-3-8 3v7c0 6 8 10 8 10z"/>
+            </svg>
+            <span>Verified Source Origin</span>
+          </div>
+
+          <div className="space-y-3">
+            {/* Source URL */}
+            <div>
+              <p className="text-[10px] font-semibold text-muted uppercase tracking-wider">Source URL</p>
+              {isValidHttpUrl(provenance.url) ? (
+                <a
+                  href={provenance.url}
+                  target="_blank"
+                  rel="noopener noreferrer"
+                  className="inline-flex items-center gap-1 text-xs text-link hover:underline break-all mt-0.5"
+                  data-ph-mask-text="true"
+                >
+                  <IconExternalLink />
+                  {provenance.url}
+                </a>
+              ) : (
+                <p className="text-xs text-foreground break-all mt-0.5" data-ph-mask-text="true">
+                  {provenance.url}
+                </p>
+              )}
+            </div>
+
+            {/* Search Query */}
+            {provenance.query && (
+              <div>
+                <p className="text-[10px] font-semibold text-muted uppercase tracking-wider">Search Query Used</p>
+                <div className="inline-flex items-center gap-1 px-1.5 py-0.5 rounded bg-foreground/[0.04] border border-border/60 text-xs text-foreground/80 mt-1" data-ph-mask-text="true">
+                  <svg width="10" height="10" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth="2.5" strokeLinecap="round" strokeLinejoin="round" className="opacity-60">
+                    <circle cx="11" cy="11" r="8"/><path d="m21 21-4.3-4.3"/>
+                  </svg>
+                  <span>{provenance.query}</span>
+                </div>
+              </div>
+            )}
+
+            {/* Text Snippet */}
+            {provenance.snippet && (
+              <div>
+                <p className="text-[10px] font-semibold text-muted uppercase tracking-wider mb-1">Snippet Context</p>
+                <div className="relative rounded-lg border border-border bg-background px-3 py-2 text-xs italic text-foreground/80 leading-relaxed" data-ph-mask-text="true">
+                  <span className="absolute left-2.5 top-1.5 text-foreground/10 text-2xl font-serif leading-none">&ldquo;</span>
+                  <p className="pl-4 pr-1">{provenance.snippet}</p>
+                </div>
+              </div>
+            )}
+          </div>
+        </div>
+      )}
+
       {/* Sources */}
       {sources && sources.length > 0 && (
         <div>
diff --git a/frontend/components/table/DataRow.tsx b/frontend/components/table/DataRow.tsx
index 2d54661..0a456d5 100644
--- a/frontend/components/table/DataRow.tsx
+++ b/frontend/components/table/DataRow.tsx
@@ -117,6 +117,7 @@ function DataRowImpl({
         const value = row.original.data[col.name];
         const isPending = pendingRowIds.has(row.original._id);
         const isFlashing = flashingCells.has(`${row.original._id}:${col.name}`);
+        const hasProvenance = !!row.original.provenance?.[col.name];
         return (
           <div
             key={col.name}
@@ -131,7 +132,17 @@ function DataRowImpl({
               padding: "var(--table-cell-py) var(--table-cell-px)",
             }}
           >
-            <CellValue value={value} type={col.type} />
+            <div className="flex items-center gap-1.5 min-w-0 pr-6">
+              <div className="truncate">
+                <CellValue value={value} type={col.type} />
+              </div>
+              {hasProvenance && (
+                <span
+                  title="Source provenance available"
+                  className="inline-flex shrink-0 w-1.5 h-1.5 rounded-full bg-emerald-500/70"
+                />
+              )}
+            </div>
             <button
               type="button"
               onClick={(e) => {
diff --git a/frontend/components/table/types.ts b/frontend/components/table/types.ts
index 6eff009..bbf468b 100644
--- a/frontend/components/table/types.ts
+++ b/frontend/components/table/types.ts
@@ -29,5 +29,13 @@ export interface DatasetRow {
   _creationTime: number;
   data: Record<string, unknown>;
   sources?: string[];
+  provenance?: Record<
+    string,
+    {
+      url: string;
+      query?: string;
+      snippet?: string;
+    }
+  >;
   updateStatus?: "pending";
 }
diff --git a/frontend/convex/datasetRows.ts b/frontend/convex/datasetRows.ts
index 0536f3b..cc8dd84 100644
--- a/frontend/convex/datasetRows.ts
+++ b/frontend/convex/datasetRows.ts
@@ -66,6 +66,16 @@ export const insert = internalMutation({
     datasetId: v.id("datasets"),
     data: v.record(v.string(), v.any()),
     sources: v.optional(v.array(v.string())),
+    provenance: v.optional(
+      v.record(
+        v.string(),
+        v.object({
+          url: v.string(),
+          query: v.optional(v.string()),
+          snippet: v.optional(v.string()),
+        })
+      )
+    ),
     rowSummary: v.optional(v.string()),
     howFound: v.optional(v.string()),
   },
@@ -150,6 +160,16 @@ export const update = internalMutation({
     expectedDatasetId: v.id("datasets"),
     data: v.record(v.string(), v.any()),
     sources: v.optional(v.array(v.string())),
+    provenance: v.optional(
+      v.record(
+        v.string(),
+        v.object({
+          url: v.string(),
+          query: v.optional(v.string()),
+          snippet: v.optional(v.string()),
+        })
+      )
+    ),
     rowSummary: v.optional(v.string()),
     howFound: v.optional(v.string()),
   },
@@ -182,6 +202,7 @@ export const update = internalMutation({
       updateStatus: undefined,
     };
     if (args.sources !== undefined) patch.sources = args.sources;
+    if (args.provenance !== undefined) patch.provenance = args.provenance;
     if (args.rowSummary !== undefined) patch.rowSummary = args.rowSummary;
     if (args.howFound !== undefined) patch.howFound = args.howFound;
     await ctx.db.patch(args.id, patch);
diff --git a/frontend/convex/schema.ts b/frontend/convex/schema.ts
index 5918710..60deaf1 100644
--- a/frontend/convex/schema.ts
+++ b/frontend/convex/schema.ts
@@ -82,6 +82,16 @@ export default defineSchema({
     datasetId: v.id("datasets"),
     data: v.record(v.string(), v.any()),
     sources: v.optional(v.array(v.string())),
+    provenance: v.optional(
+      v.record(
+        v.string(),
+        v.object({
+          url: v.string(),
+          query: v.optional(v.string()),
+          snippet: v.optional(v.string()),
+        })
+      )
+    ),
     rowSummary: v.optional(v.string()),
     howFound: v.optional(v.string()),
     updateStatus: v.optional(v.literal("pending")),

From ec4a8ece091b927609bc27fa10514a7df9eba22a Mon Sep 17 00:00:00 2001
From: pritpatel2412 <pritptl2412@gmail.com>
Date: Sun, 7 Jun 2026 21:20:33 +0530
Subject: [PATCH 3/3] fix: cap maxTokens in OpenRouter calls to prevent 402
 billing errors

---
 backend/src/mastra/agents/investigate.ts |  3 +-
 backend/src/mastra/agents/populate.ts    |  3 +-
 backend/src/mastra/agents/refresh.ts     |  3 +-
 backend/src/mastra/model-wrapper.test.ts | 62 ++++++++++++++++++++++++
 backend/src/mastra/model-wrapper.ts      | 40 +++++++++++++++
 backend/src/mastra/workflows/populate.ts |  3 +-
 backend/src/pipeline/schema-inference.ts |  3 +-
 7 files changed, 112 insertions(+), 5 deletions(-)
 create mode 100644 backend/src/mastra/model-wrapper.test.ts
 create mode 100644 backend/src/mastra/model-wrapper.ts

diff --git a/backend/src/mastra/agents/investigate.ts b/backend/src/mastra/agents/investigate.ts
index 99c2a4d..3e0d1b8 100644
--- a/backend/src/mastra/agents/investigate.ts
+++ b/backend/src/mastra/agents/investigate.ts
@@ -1,5 +1,6 @@
 import { Agent } from "@mastra/core/agent";
 import { createOpenRouter } from "@openrouter/ai-sdk-provider";
+import { wrapModelWithTokenLimit } from "../model-wrapper.js";
 import { buildPopulateTools } from "../tools/dataset-tools.js";
 import { searchWebTool, fetchPageTool } from "../tools/web-tools.js";
 import type { AuthContext } from "../workflows/populate.js";
@@ -71,7 +72,7 @@ export function buildInvestigateAgent(
     id: "investigate-agent",
     name: "Dataset Investigate Agent",
     instructions: buildInvestigateInstructions(columns),
-    model: openrouter(modelSlug),
+    model: wrapModelWithTokenLimit(openrouter(modelSlug)),
 
     tools: {
       insert_row,
diff --git a/backend/src/mastra/agents/populate.ts b/backend/src/mastra/agents/populate.ts
index 85edf53..ec10b73 100644
--- a/backend/src/mastra/agents/populate.ts
+++ b/backend/src/mastra/agents/populate.ts
@@ -1,5 +1,6 @@
 import { Agent } from "@mastra/core/agent";
 import { createOpenRouter } from "@openrouter/ai-sdk-provider";
+import { wrapModelWithTokenLimit } from "../model-wrapper.js";
 import { buildSubagentTool } from "../tools/investigate-tool.js";
 import { searchWebTool, fetchPageTool } from "../tools/web-tools.js";
 import type { AuthContext } from "../workflows/populate.js";
@@ -50,7 +51,7 @@ export function buildPopulateAgent(
     id: "populate-agent",
     name: "Dataset Populate Orchestrator",
     instructions: INSTRUCTIONS,
-    model: openrouter(modelSlug),
+    model: wrapModelWithTokenLimit(openrouter(modelSlug)),
     tools: {
       search_web: searchWebTool,
       fetch_page: fetchPageTool,
diff --git a/backend/src/mastra/agents/refresh.ts b/backend/src/mastra/agents/refresh.ts
index 2215686..fad49bc 100644
--- a/backend/src/mastra/agents/refresh.ts
+++ b/backend/src/mastra/agents/refresh.ts
@@ -1,5 +1,6 @@
 import { Agent } from "@mastra/core/agent";
 import { createOpenRouter } from "@openrouter/ai-sdk-provider";
+import { wrapModelWithTokenLimit } from "../model-wrapper.js";
 import { buildPopulateTools } from "../tools/dataset-tools.js";
 import { searchWebTool, fetchPageTool } from "../tools/web-tools.js";
 import type { AuthContext } from "../workflows/populate.js";
@@ -64,7 +65,7 @@ export function buildRefreshAgent(
     id: "refresh-agent",
     name: "Dataset Refresh Agent",
     instructions: buildRefreshInstructions(columns),
-    model: openrouter("qwen/qwen3.7-max"),
+    model: wrapModelWithTokenLimit(openrouter("qwen/qwen3.7-max")),
     tools: {
       update_row,
       search_web: searchWebTool,
diff --git a/backend/src/mastra/model-wrapper.test.ts b/backend/src/mastra/model-wrapper.test.ts
new file mode 100644
index 0000000..ebb810a
--- /dev/null
+++ b/backend/src/mastra/model-wrapper.test.ts
@@ -0,0 +1,62 @@
+import test from "node:test";
+import assert from "node:assert";
+import { wrapModelWithTokenLimit } from "./model-wrapper.js";
+
+test("wrapModelWithTokenLimit - doGenerate intercepts and caps maxTokens", async () => {
+  let receivedOptions: any = null;
+
+  const mockModel: any = {
+    provider: "test-provider",
+    modelId: "test-model",
+    doGenerate: async (options: any) => {
+      receivedOptions = options;
+      return { text: "mock response" };
+    },
+    doStream: async (options: any) => {
+      receivedOptions = options;
+      return { stream: "mock stream" };
+    },
+  };
+
+  const wrapped = wrapModelWithTokenLimit(mockModel, 4096);
+
+  // 1. Default maxTokens when not provided
+  await wrapped.doGenerate({ prompt: "hello" });
+  assert.strictEqual(receivedOptions.maxTokens, 4096);
+
+  // 2. Cap maxTokens when it exceeds the limit
+  await wrapped.doGenerate({ prompt: "hello", maxTokens: 99999 });
+  assert.strictEqual(receivedOptions.maxTokens, 4096);
+
+  // 3. Keep maxTokens when it is below the limit
+  await wrapped.doGenerate({ prompt: "hello", maxTokens: 1000 });
+  assert.strictEqual(receivedOptions.maxTokens, 1000);
+
+  // 4. Test doStream default
+  await wrapped.doStream({ prompt: "hello" });
+  assert.strictEqual(receivedOptions.maxTokens, 4096);
+
+  // 5. Test doStream cap
+  await wrapped.doStream({ prompt: "hello", maxTokens: 99999 });
+  assert.strictEqual(receivedOptions.maxTokens, 4096);
+
+  // 6. Test doStream keep below limit
+  await wrapped.doStream({ prompt: "hello", maxTokens: 1000 });
+  assert.strictEqual(receivedOptions.maxTokens, 1000);
+});
+
+test("wrapModelWithTokenLimit - forwards properties and binds functions", () => {
+  const mockModel: any = {
+    provider: "test-provider",
+    modelId: "test-model",
+    someFunc() {
+      return this.provider;
+    },
+  };
+
+  const wrapped = wrapModelWithTokenLimit(mockModel, 4096);
+
+  assert.strictEqual(wrapped.provider, "test-provider");
+  assert.strictEqual(wrapped.modelId, "test-model");
+  assert.strictEqual(wrapped.someFunc(), "test-provider");
+});
diff --git a/backend/src/mastra/model-wrapper.ts b/backend/src/mastra/model-wrapper.ts
new file mode 100644
index 0000000..b01e2d3
--- /dev/null
+++ b/backend/src/mastra/model-wrapper.ts
@@ -0,0 +1,40 @@
+/**
+ * Wraps a LanguageModel in a Proxy that caps (or defaults) the output-token
+ * budget on every `doGenerate` / `doStream` call. This prevents OpenRouter 402
+ * errors caused by requesting the default 65535 max tokens.
+ *
+ * Both spellings of the option are handled: `maxTokens` (older AI SDK model
+ * interface) and `maxOutputTokens` (AI SDK v5 / LanguageModelV2). Capping only
+ * `maxTokens` would leave a v5 request uncapped.
+ *
+ * @param model - The language model to wrap; all other members are forwarded.
+ * @param maxTokensLimit - Upper bound (and default) for the token budget.
+ * @returns A proxy exposing the same interface as `model`.
+ */
+export function wrapModelWithTokenLimit(
+  model: any,
+  maxTokensLimit: number = 8192,
+): any {
+  // Cap a caller-supplied budget; fall back to the limit when none is given.
+  const cap = (requested: unknown): number =>
+    typeof requested === "number"
+      ? Math.min(requested, maxTokensLimit)
+      : maxTokensLimit;
+
+  return new Proxy(model, {
+    get(target, prop, receiver) {
+      if (prop === "doGenerate" || prop === "doStream") {
+        return async function (options: any) {
+          return target[prop]({
+            ...options,
+            maxTokens: cap(options?.maxTokens),
+            maxOutputTokens: cap(options?.maxOutputTokens),
+          });
+        };
+      }
+      // Bind forwarded methods so `this` is the real model, not the proxy.
+      const val = Reflect.get(target, prop, receiver);
+      return typeof val === "function" ? val.bind(target) : val;
+    },
+  });
+}
diff --git a/backend/src/mastra/workflows/populate.ts b/backend/src/mastra/workflows/populate.ts
index a831616..ed47a91 100644
--- a/backend/src/mastra/workflows/populate.ts
+++ b/backend/src/mastra/workflows/populate.ts
@@ -2,6 +2,7 @@ import { createStep, createWorkflow } from "@mastra/core/workflows";
 import { z } from "zod";
 import { generateText } from "ai";
 import { createOpenRouter } from "@openrouter/ai-sdk-provider";
+import { wrapModelWithTokenLimit } from "../model-wrapper.js";
 import { datasetContextSchema, populateColumnSchema } from "../../pipeline/populate.js";
 import { convex, internal } from "../../convex.js";
 import { DEFAULT_MODEL_IDS } from "../../config/models.js";
@@ -114,7 +115,7 @@ Respond with EXACTLY one word: scraper or search`;
       const modelSlug =
         inputData.authContext?.modelConfig?.schemaInference ?? DEFAULT_MODEL_IDS.SCHEMA_INFERENCE;
       const result = await generateText({
-        model: openrouter(modelSlug),
+        model: wrapModelWithTokenLimit(openrouter(modelSlug)),
         prompt: classificationPrompt,
         maxOutputTokens: 10,
         abortSignal: getSignal(inputData.datasetId),
diff --git a/backend/src/pipeline/schema-inference.ts b/backend/src/pipeline/schema-inference.ts
index 1f1ea2a..a70e9ba 100644
--- a/backend/src/pipeline/schema-inference.ts
+++ b/backend/src/pipeline/schema-inference.ts
@@ -1,5 +1,6 @@
 import { generateText, Output, NoObjectGeneratedError } from "ai";
 import { createOpenRouter } from "@openrouter/ai-sdk-provider";
+import { wrapModelWithTokenLimit } from "../mastra/model-wrapper.js";
 
 import { DEFAULT_MODEL_IDS } from "../config/models.js";
 import { datasetSchemaSchema, type DatasetSchema } from "./types.js";
@@ -33,7 +34,7 @@ function getModel(modelSlug?: string) {
   }
   const openrouter = createOpenRouter({ apiKey });
   const resolvedSlug = modelSlug ?? DEFAULT_MODEL_IDS.SCHEMA_INFERENCE;
-  return openrouter(resolvedSlug);
+  return wrapModelWithTokenLimit(openrouter(resolvedSlug));
 }
 
 export async function inferSchema(prompt: string, modelSlug?: string): Promise<DatasetSchema> {