From 4cdf47d32f78e38351aea06c10a5e7d6f83f52d1 Mon Sep 17 00:00:00 2001 From: pritpatel2412 Date: Thu, 4 Jun 2026 14:46:02 +0530 Subject: [PATCH 1/3] feat: paginate runStats queries, fix verify-authz on Windows, and resolve React hook state effect warning --- .../table/use-row-change-detection.ts | 14 +++-- frontend/convex/runStats.ts | 54 +++++++++++++------ frontend/convex/schema.ts | 2 + scripts/verify-authz.sh | 19 +++++-- 4 files changed, 64 insertions(+), 25 deletions(-) diff --git a/frontend/components/table/use-row-change-detection.ts b/frontend/components/table/use-row-change-detection.ts index 49ce219..e823372 100644 --- a/frontend/components/table/use-row-change-detection.ts +++ b/frontend/components/table/use-row-change-detection.ts @@ -53,11 +53,15 @@ export function useRowChangeDetection(rows: DatasetRow[]) { prevRowsRef.current = nextMap; if (newFlashes.size > 0) { - setFlashingCells((prev) => { - const merged = new Set(prev); - for (const key of newFlashes) merged.add(key); - return merged; - }); + const updateTimer = setTimeout(() => { + setFlashingCells((prev) => { + const merged = new Set(prev); + for (const key of newFlashes) merged.add(key); + return merged; + }); + flashTimersRef.current.delete(updateTimer); + }, 0); + flashTimersRef.current.add(updateTimer); const timer = setTimeout(() => { setFlashingCells((prev) => { diff --git a/frontend/convex/runStats.ts b/frontend/convex/runStats.ts index d1165ec..c225c40 100644 --- a/frontend/convex/runStats.ts +++ b/frontend/convex/runStats.ts @@ -1,6 +1,9 @@ import { internalMutation, internalQuery } from "./_generated/server.js"; import { v } from "convex/values"; +const DEFAULT_PAGE_SIZE = 50; +const MAX_PAGE_SIZE = 200; + /** * Insert a populate-run metrics record. * @@ -68,33 +71,52 @@ export const getByWorkflowRunId = internalQuery({ }); /** - * List all runs for a dataset, newest first. - * TODO: paginate — .collect() loads all docs into memory and will degrade - * as run history grows. Add cursor-based pagination when this is exposed - * to the frontend or run counts become large. + * List runs for a dataset, newest first. + * Cursor-based pagination keeps memory bounded as run history grows. */ export const listByDataset = internalQuery({ - args: { datasetId: v.string() }, + args: { + datasetId: v.string(), + cursor: v.optional(v.string()), + limit: v.optional(v.number()), + }, handler: async (ctx, args) => { - const runs = await ctx.db + const limit = Math.min(args.limit ?? DEFAULT_PAGE_SIZE, MAX_PAGE_SIZE); + const { page, isDone, continueCursor } = await ctx.db .query("runStats") - .withIndex("by_dataset", (q) => q.eq("datasetId", args.datasetId)) - .collect(); - return runs.sort((a, b) => b.startedAt - a.startedAt); + .withIndex("by_dataset_started_at", (q) => + q.eq("datasetId", args.datasetId), + ) + .order("desc") + .paginate({ + cursor: args.cursor ?? null, + numItems: limit, + }); + + return { runs: page, isDone, continueCursor }; }, }); /** - * List all runs for a user, newest first. - * TODO: paginate — same concern as listByDataset above. + * List runs for a user, newest first. */ export const listByUser = internalQuery({ - args: { userId: v.string() }, + args: { + userId: v.string(), + cursor: v.optional(v.string()), + limit: v.optional(v.number()), + }, handler: async (ctx, args) => { - const runs = await ctx.db + const limit = Math.min(args.limit ?? DEFAULT_PAGE_SIZE, MAX_PAGE_SIZE); + const { page, isDone, continueCursor } = await ctx.db .query("runStats") - .withIndex("by_user", (q) => q.eq("userId", args.userId)) - .collect(); - return runs.sort((a, b) => b.startedAt - a.startedAt); + .withIndex("by_user_started_at", (q) => q.eq("userId", args.userId)) + .order("desc") + .paginate({ + cursor: args.cursor ?? null, + numItems: limit, + }); + + return { runs: page, isDone, continueCursor }; }, }); diff --git a/frontend/convex/schema.ts b/frontend/convex/schema.ts index d1c1888..5918710 100644 --- a/frontend/convex/schema.ts +++ b/frontend/convex/schema.ts @@ -170,6 +170,8 @@ export default defineSchema({ rowsUpdated: v.optional(v.number()), }) .index("by_dataset", ["datasetId"]) + .index("by_dataset_started_at", ["datasetId", "startedAt"]) .index("by_user", ["userId"]) + .index("by_user_started_at", ["userId", "startedAt"]) .index("by_workflow_run", ["workflowRunId"]), }); diff --git a/scripts/verify-authz.sh b/scripts/verify-authz.sh index b3496a1..3dd457e 100644 --- a/scripts/verify-authz.sh +++ b/scripts/verify-authz.sh @@ -6,6 +6,17 @@ # bash scripts/verify-authz.sh set -u +# Use python3 if available, fallback to python (common on Windows) +PYTHON="python3" +if ! command -v python3 &>/dev/null; then + if command -v python &>/dev/null; then + PYTHON="python" + else + echo "Error: python3 or python is required to run this script." + exit 1 + fi +fi + CONVEX="${CONVEX_URL:-http://localhost:3210}" FRONTEND="${FRONTEND_URL:-http://localhost:3500}" FAIL=0 @@ -34,11 +45,11 @@ mutation() { } assert_success() { - python3 -c "import json,sys; d=json.load(sys.stdin); print('PASS' if d.get('status')=='success' else 'FAIL: '+d.get('errorMessage','?')[:60])" + $PYTHON -c "import json,sys; d=json.load(sys.stdin); print('PASS' if d.get('status')=='success' else 'FAIL: '+d.get('errorMessage','?')[:60])" } assert_error_contains() { local needle="$1" - python3 -c "import json,sys; d=json.load(sys.stdin); print('PASS' if '$needle' in d.get('errorMessage','') else 'FAIL: '+d.get('errorMessage','?')[:80])" + $PYTHON -c "import json,sys; d=json.load(sys.stdin); print('PASS' if '$needle' in d.get('errorMessage','') else 'FAIL: '+d.get('errorMessage','?')[:80])" } echo "════════════════════════════════════════════════════════════════" @@ -47,7 +58,7 @@ echo " convex=$CONVEX frontend=$FRONTEND" echo "════════════════════════════════════════════════════════════════" PUB_ID=$(query '{"path":"datasets:listPublic","args":{},"format":"json"}' \ - | python3 -c "import json,sys; print(json.load(sys.stdin)['value'][0]['_id'])") + | $PYTHON -c "import json,sys; print(json.load(sys.stdin)['value'][0]['_id'])") if [ -z "${PUB_ID:-}" ]; then echo "No public dataset found. Seed curated data first (publicSeed:seedPublicDatasets)." exit 1 @@ -65,7 +76,7 @@ section "Anonymous WRITES — must all be rejected" run_test "anon datasets.listMine -> Not authenticated" \ "$(query '{"path":"datasets:listMine","args":{},"format":"json"}' | assert_error_contains 'Not authenticated')" run_test "anon datasets.create -> Not authenticated" \ - "$(mutation '{"path":"datasets:create","args":{"name":"x","description":"x","cadence":"daily","columns":[]},"format":"json"}' | assert_error_contains 'Not authenticated')" + "$(mutation '{"path":"datasets:create","args":{"name":"x","description":"x","refreshCadence":"daily","columns":[]},"format":"json"}' | assert_error_contains 'Not authenticated')" run_test "anon datasets.updateStatus -> Not authenticated" \ "$(mutation "{\"path\":\"datasets:updateStatus\",\"args\":{\"id\":\"$PUB_ID\",\"status\":\"paused\"},\"format\":\"json\"}" | assert_error_contains 'Not authenticated')" run_test "anon datasets.remove -> Not authenticated" \ From 0103e93e2048b90e979f5308307eee4a9c514da6 Mon Sep 17 00:00:00 2001 From: pritpatel2412 Date: Fri, 5 Jun 2026 17:45:54 +0530 Subject: [PATCH 2/3] feat: implement per-cell source provenance --- backend/src/mastra/agents/investigate.ts | 5 +- backend/src/mastra/tools/dataset-tools.ts | 28 +++++++++- frontend/app/dataset/[id]/page.tsx | 13 ++++- frontend/components/SideSheet.tsx | 67 ++++++++++++++++++++++- frontend/components/table/DataRow.tsx | 13 ++++- frontend/components/table/types.ts | 8 +++ frontend/convex/datasetRows.ts | 21 +++++++ frontend/convex/schema.ts | 10 ++++ 8 files changed, 158 insertions(+), 7 deletions(-) diff --git a/backend/src/mastra/agents/investigate.ts b/backend/src/mastra/agents/investigate.ts index 4cdc32e..99c2a4d 100644 --- a/backend/src/mastra/agents/investigate.ts +++ b/backend/src/mastra/agents/investigate.ts @@ -28,18 +28,19 @@ RULES: - You have at most 6 tool calls total. Budget them: 1 fetch + 1 search + 1 fetch + 1 insert = done. - ALWAYS insert a row, even if some fields are incomplete. Use "" for unknown fields. Partial real data is better than no row. - Never fabricate values. Use "" for anything you cannot verify. +- For every field value you extract and fill in "data", you MUST record the cell-level provenance (the source URL, the search query used to find it, and the exact text snippet context showing the value) in the "provenance" parameter of insert_row/update_row. - insert_row rejects duplicates based on primary key columns. If you get a "Duplicate" error, do NOT retry — report INSERTED: false and move on. TOOL CALL FORMAT — every tool call argument must be a JSON object wrapped in curly braces: search_web: {"query": "your search terms"} fetch_page: {"url": "https://example.com"} - insert_row: {"data": {${columnNames.map((n) => `"${n}": "value"`).join(", ")}}, "sources": ["https://url-you-fetched.com"], "row_summary": "one line about this entity", "how_found": "step by step guide on how to extract the data so an agent in the future can do it too"} + insert_row: {"data": {${columnNames.map((n) => `"${n}": "value"`).join(", ")}}, "sources": ["https://url-you-fetched.com"], "provenance": {${columnNames.map((n) => `"${n}": {"url": "https://url-you-fetched.com", "query": "search query used", "snippet": "exact context snippet from page"}`).join(", ")}}, "row_summary": "one line about this entity", "how_found": "step by step guide on how to extract the data so an agent in the future can do it too"} WORKFLOW: 1. Fetch 1-2 of the provided URLs to get real data (if URLs were given). 2. If you need more, run ONE search and fetch the best result. 3. Call insert_row with whatever real data you have. Use "" for missing fields. - Include "sources" (URLs you fetched), "row_summary" (one line about this entity), and "how_found" (a step by step guide on how you found this data. eg, 1. fetch the contents of this url "", 2. Look for the pricing field, and title name field, 3. etc...) + Include "sources" (URLs you fetched), "provenance" (mapping of column names to their detailed source details), "row_summary" (one line about this entity), and "how_found" (a step by step guide on how you found this data. eg, 1. fetch the contents of this url "", 2. Look for the pricing field, and title name field, 3. etc...) 4. Write your final response: INSERTED: true/false SUMMARY: one line diff --git a/backend/src/mastra/tools/dataset-tools.ts b/backend/src/mastra/tools/dataset-tools.ts index 1fc016e..f044ca3 100644 --- a/backend/src/mastra/tools/dataset-tools.ts +++ b/backend/src/mastra/tools/dataset-tools.ts @@ -131,6 +131,17 @@ export function buildPopulateTools( .array(z.string()) .optional() .describe("URLs you visited or used to gather data for this row"), + provenance: z + .record( + z.string(), + z.object({ + url: z.string(), + query: z.string().optional(), + snippet: z.string().optional(), + }) + ) + .optional() + .describe("Mapping of column names to their detailed source provenance (url, query, snippet)"), row_summary: z .string() .optional() @@ -141,7 +152,7 @@ export function buildPopulateTools( .describe("Brief description of how you found and verified this data"), }), outputSchema: writeResultSchema, - execute: async ({ data, sources, row_summary, how_found }) => { + execute: async ({ data, sources, provenance, row_summary, how_found }) => { if (!data || Object.keys(data).length === 0) return { success: false, @@ -158,6 +169,7 @@ export function buildPopulateTools( datasetId: authorizedDatasetId, data: cleanedData, ...(sources !== undefined ? { sources } : {}), + ...(provenance !== undefined ? { provenance } : {}), ...(row_summary !== undefined ? { rowSummary: row_summary } : {}), ...(how_found !== undefined ? { howFound: how_found } : {}), }); @@ -265,6 +277,17 @@ export function buildPopulateTools( .array(z.string()) .optional() .describe("Updated source URLs where this data was verified"), + provenance: z + .record( + z.string(), + z.object({ + url: z.string(), + query: z.string().optional(), + snippet: z.string().optional(), + }) + ) + .optional() + .describe("Updated mapping of column names to their detailed source provenance (url, query, snippet)"), row_summary: z .string() .optional() @@ -275,7 +298,7 @@ export function buildPopulateTools( .describe("Brief description of how the updated data was found"), }), outputSchema: writeResultSchema, - execute: async ({ rowId, data, sources, row_summary, how_found }) => { + execute: async ({ rowId, data, sources, provenance, row_summary, how_found }) => { if (!rowId) return { success: false, error: "rowId is required." }; if (!data || Object.keys(data).length === 0) return { @@ -293,6 +316,7 @@ export function buildPopulateTools( expectedDatasetId: authorizedDatasetId, data: cleanedData, ...(sources !== undefined ? { sources } : {}), + ...(provenance !== undefined ? { provenance } : {}), ...(row_summary !== undefined ? { rowSummary: row_summary } : {}), ...(how_found !== undefined ? { howFound: how_found } : {}), }); diff --git a/frontend/app/dataset/[id]/page.tsx b/frontend/app/dataset/[id]/page.tsx index d9ee1b6..956dd35 100644 --- a/frontend/app/dataset/[id]/page.tsx +++ b/frontend/app/dataset/[id]/page.tsx @@ -41,6 +41,11 @@ export default function DatasetPage() { column: DatasetColumn; value: unknown; sources?: string[]; + provenance?: { + url: string; + query?: string; + snippet?: string; + }; } | null>(null); const datasetId = params.id as Id<"datasets">; @@ -95,7 +100,12 @@ export default function DatasetPage() { const col = dataset.columns.find((c) => c.name === columnName); if (!col) return; const row = rows.find((r) => r._id === rowId); - setCellDetail({ column: col, value, sources: row?.sources }); + setCellDetail({ + column: col, + value, + sources: row?.sources, + provenance: row?.provenance?.[columnName], + }); }, [dataset, rows]); const openedFired = useRef(null); @@ -409,6 +419,7 @@ export default function DatasetPage() { column={cellDetail.column} value={cellDetail.value} sources={cellDetail.sources} + provenance={cellDetail.provenance} /> )} diff --git a/frontend/components/SideSheet.tsx b/frontend/components/SideSheet.tsx index 215d477..972631c 100644 --- a/frontend/components/SideSheet.tsx +++ b/frontend/components/SideSheet.tsx @@ -119,6 +119,12 @@ interface CellDetailProps { value: unknown; /** Row-level sources stored by the populate agent. */ sources?: string[]; + /** Cell-level provenance metadata. */ + provenance?: { + url: string; + query?: string; + snippet?: string; + }; } function isValidHttpUrl(src: string): boolean { @@ -130,7 +136,7 @@ function isValidHttpUrl(src: string): boolean { } } -export function CellDetail({ column, value, sources }: CellDetailProps) { +export function CellDetail({ column, value, sources, provenance }: CellDetailProps) { const [copied, setCopied] = useState(false); const copyTimerRef = useRef | null>(null); const displayValue = value == null || value === "" ? "—" : String(value); @@ -192,6 +198,65 @@ export function CellDetail({ column, value, sources }: CellDetailProps) { + {/* Cell Provenance */} + {provenance && ( +
+
+ + + + Verified Source Origin +
+ +
+ {/* Source URL */} +
+

Source URL

+ {isValidHttpUrl(provenance.url) ? ( + + + {provenance.url} + + ) : ( +

+ {provenance.url} +

+ )} +
+ + {/* Search Query */} + {provenance.query && ( +
+

Search Query Used

+
+ + + + {provenance.query} +
+
+ )} + + {/* Text Snippet */} + {provenance.snippet && ( +
+

Snippet Context

+
+ +

{provenance.snippet}

+
+
+ )} +
+
+ )} + {/* Sources */} {sources && sources.length > 0 && (
diff --git a/frontend/components/table/DataRow.tsx b/frontend/components/table/DataRow.tsx index 2d54661..0a456d5 100644 --- a/frontend/components/table/DataRow.tsx +++ b/frontend/components/table/DataRow.tsx @@ -117,6 +117,7 @@ function DataRowImpl({ const value = row.original.data[col.name]; const isPending = pendingRowIds.has(row.original._id); const isFlashing = flashingCells.has(`${row.original._id}:${col.name}`); + const hasProvenance = !!row.original.provenance?.[col.name]; return (
- +
+
+ +
+ {hasProvenance && ( + + )} +