diff --git a/bun.lock b/bun.lock index 083df85..0042f56 100644 --- a/bun.lock +++ b/bun.lock @@ -1,6 +1,5 @@ { "lockfileVersion": 1, - "configVersion": 0, "workspaces": { "": { "name": "bytebell-public", @@ -29,6 +28,7 @@ "dependencies": { "@bb/config": "workspace:*", "@bb/errors": "workspace:*", + "@bb/ingest-github": "workspace:*", "@bb/logger": "workspace:*", "@bb/types": "workspace:*", "commander": "^14.0.3", @@ -55,6 +55,20 @@ "@bb/types": "workspace:*", }, }, + "packages/ingest-business-context": { + "name": "@bb/ingest-business-context", + "version": "0.0.0", + "dependencies": { + "@bb/config": "workspace:*", + "@bb/errors": "workspace:*", + "@bb/ingest-github": "workspace:*", + "@bb/llm": "workspace:*", + "@bb/logger": "workspace:*", + "@bb/neo4j": "workspace:*", + "@bb/queue": "workspace:*", + "@bb/types": "workspace:*", + }, + }, "packages/ingest-github": { "name": "@bb/ingest-github", "version": "0.0.0", @@ -75,6 +89,7 @@ "dependencies": { "@bb/config": "workspace:*", "@bb/errors": "workspace:*", + "@bb/logger": "workspace:*", "@bb/mongo": "workspace:*", "@bb/types": "workspace:*", "tiktoken": "^1.0.22", @@ -122,6 +137,7 @@ "dependencies": { "@bb/config": "workspace:*", "@bb/errors": "workspace:*", + "@bb/mongo": "workspace:*", "@bb/types": "workspace:*", "neo4j-driver": "^6.0.1", }, @@ -188,6 +204,8 @@ "@bb/errors": ["@bb/errors@workspace:packages/errors"], + "@bb/ingest-business-context": ["@bb/ingest-business-context@workspace:packages/ingest-business-context"], + "@bb/ingest-github": ["@bb/ingest-github@workspace:packages/ingest-github"], "@bb/llm": ["@bb/llm@workspace:packages/llm"], diff --git a/infra/docker/docker-compose.yml b/infra/docker/docker-compose.yml index c73be99..3fdc22d 100644 --- a/infra/docker/docker-compose.yml +++ b/infra/docker/docker-compose.yml @@ -4,7 +4,7 @@ services: container_name: bytebell-mongo restart: unless-stopped ports: - - "127.0.0.1:27017:27017" + - "127.0.0.1:27117:27017" volumes: - mongo_data:/data/db environment: 
@@ -26,7 +26,7 @@ services: restart: unless-stopped ports: - "127.0.0.1:7474:7474" - - "127.0.0.1:7687:7687" + - "127.0.0.1:7787:7687" volumes: - neo4j_data:/data environment: @@ -47,7 +47,7 @@ services: container_name: bytebell-redis restart: unless-stopped ports: - - "127.0.0.1:6379:6379" + - "127.0.0.1:6479:6379" volumes: - redis_data:/data command: ["redis-server", "--appendonly", "yes"] diff --git a/package.json b/package.json index 5361baa..c70eb7b 100644 --- a/package.json +++ b/package.json @@ -22,7 +22,7 @@ "lint-staged": { "**/*.{ts,tsx,js,mjs,cjs}": [ "prettier --write", - "eslint --fix --max-warnings=0" + "eslint --fix --max-warnings=0 --no-warn-ignored" ], "**/*.{json,md}": [ "prettier --write" diff --git a/packages/cli/README.md b/packages/cli/README.md index 619a054..d1b0fd6 100644 --- a/packages/cli/README.md +++ b/packages/cli/README.md @@ -51,7 +51,8 @@ infra/docker/docker-compose.yml up -d`, polls prefer `bytebell boot`. - `bytebell index ` / `bytebell ingest [path]` / `bytebell ls` — talk HTTP to a running server (lazy-spawn via - `serverSpawn.ensureServerRunning` when the daemon is down). + `serverSpawn.ensureServerRunning` when the daemon is down). `ls` supports + an interactive mode (`-i`) for hierarchical browsing of repos and commits. - `bytebell delete` — list indexed knowledge in an Ink arrow-key picker (`DeleteSelector.tsx`, plain `useInput` — no extra dep), and on confirm `DELETE /api/v1/repos/:id` against the running server. The @@ -151,7 +152,7 @@ will touch when implemented. Only the **bolded** entries ship in v0. 
| **`bytebell server start`** | **Spawn `bytebell-server` in foreground.** | **Shipped** | | **`bytebell index `** | **POST `/api/v1/github/index` to local server.** | **Shipped** | | **`bytebell ingest [path]`** | **POST `/api/v1/local/index` for a directory tree.** | **Shipped** | -| **`bytebell ls`** | **Render `/api/v1/repos` as a table.** | **Shipped** | +| **`bytebell ls`** | **Render `/api/v1/repos` as a table or interactive explorer (`-i`). v0.** | **Shipped** | | **`bytebell delete`** | **Ink picker over `/api/v1/repos`, then DELETE `/api/v1/repos/:id` (Mongo + Neo4j + jobs).** | **Shipped** | | **`bytebell stats`** | **Render `/api/v1/stats` (totals + per-repo + per-commit token / cost rows).** | **Shipped** | | `bytebell` | Ink dashboard with Repos / Server / Activity / Cost panes ([docs/arch.md:172-184](../../docs/arch.md#L172-L184)) | After `@bb/server` HTTP API + activity feed | diff --git a/packages/cli/package.json b/packages/cli/package.json index e7297d3..17414e6 100644 --- a/packages/cli/package.json +++ b/packages/cli/package.json @@ -8,12 +8,16 @@ "exports": { ".": "./src/index.ts" }, + "imports": { + "#src/*": "./src/*" + }, "bin": { "bytebell": "./src/index.ts" }, "dependencies": { "@bb/config": "workspace:*", "@bb/errors": "workspace:*", + "@bb/ingest-github": "workspace:*", "@bb/logger": "workspace:*", "@bb/types": "workspace:*", "commander": "^14.0.3", diff --git a/packages/cli/src/BranchSelector.tsx b/packages/cli/src/BranchSelector.tsx new file mode 100644 index 0000000..73b4816 --- /dev/null +++ b/packages/cli/src/BranchSelector.tsx @@ -0,0 +1,130 @@ +import { useMemo, useState } from "react"; +import type { ReactElement } from "react"; +import { Box, Text, useApp, useInput } from "ink"; + +export interface BranchSelectorResult { + branch?: string; + typeManually?: boolean; + cancelled?: boolean; +} + +export interface BranchSelectorProps { + branches: string[]; + title?: string; + onDone: (result: BranchSelectorResult) => void; +} + 
+const MAX_VISIBLE = 12; + +type ItemKind = "branch" | "manual"; + +export function BranchSelector({ branches: rawBranches, title, onDone }: BranchSelectorProps): ReactElement { + const { exit } = useApp(); + const [filter, setFilter] = useState(""); + const [index, setIndex] = useState(0); + + const branches = useMemo(() => { + const items: Array<{ label: string; kind: ItemKind }> = rawBranches.map((b) => ({ + label: b, + kind: "branch", + })); + items.push({ label: "Type manually...", kind: "manual" }); + return items; + }, [rawBranches]); + + const filtered = useMemo(() => { + if (filter.length === 0) { + return branches; + } + const needle = filter.toLowerCase(); + return branches.filter((item) => item.label.toLowerCase().includes(needle)); + }, [branches, filter]); + + const boundedIndex = filtered.length === 0 ? 0 : Math.min(index, filtered.length - 1); + + useInput((input, key) => { + if (key.escape) { + exit(); + onDone({ cancelled: true }); + return; + } + if (key.return) { + const chosen = filtered[boundedIndex]; + if (!chosen) { + exit(); + onDone({ cancelled: true }); + return; + } + exit(); + if (chosen.kind === "manual") { + onDone({ typeManually: true }); + } else { + onDone({ branch: chosen.label }); + } + return; + } + if (key.upArrow || (input === "k" && filter.length === 0)) { + setIndex(() => (boundedIndex > 0 ? boundedIndex - 1 : Math.max(filtered.length - 1, 0))); + return; + } + if (key.downArrow || (input === "j" && filter.length === 0)) { + setIndex(() => (boundedIndex < filtered.length - 1 ? boundedIndex + 1 : 0)); + return; + } + if (key.backspace || key.delete) { + setFilter((s) => s.slice(0, -1)); + setIndex(0); + return; + } + if (input.length > 0 && !key.ctrl && !key.meta) { + setFilter((s) => s + input); + setIndex(0); + } + }); + + const heading = title ?? 
"Select a branch"; + const visibleStart = clampWindow(boundedIndex, filtered.length, MAX_VISIBLE); + const visible = filtered.slice(visibleStart, visibleStart + MAX_VISIBLE); + + return ( + + + {heading} + {` (${filtered.length}/${branches.length})`} + + + filter: + {filter.length > 0 ? filter : (type to filter)} + + {filtered.length === 0 ? ( + + No branches match the filter. Backspace to clear. + + ) : ( + visible.map((item, i) => { + const absoluteIndex = visibleStart + i; + const cursor = absoluteIndex === boundedIndex; + const isManual = item.kind === "manual"; + return ( + + {cursor ? "▶ " : " "} + {item.label} + + ); + }) + )} + + [type to filter] [↑/↓] move [Enter] choose [Backspace] clear [Esc] cancel + + + ); +} + +function clampWindow(index: number, total: number, size: number): number { + if (total <= size) { + return 0; + } + const halfWindow = Math.floor(size / 2); + const start = Math.max(0, Math.min(index - halfWindow, total - size)); + return start; +} diff --git a/packages/cli/src/IndexCommand.ts b/packages/cli/src/IndexCommand.ts index 26cc202..51fc392 100644 --- a/packages/cli/src/IndexCommand.ts +++ b/packages/cli/src/IndexCommand.ts @@ -3,8 +3,11 @@ import { Config } from "@bb/types"; import { getConfigValue } from "@bb/config"; import { ensureServerRunning, ServerStartTimeoutError } from "./serverSpawn.ts"; import { getJson, HttpClientError, postJson } from "./httpClient.ts"; -import { createProgressBar, createSpinner, error, type ProgressBar } from "./output.ts"; +import { createProgressBar, createSpinner, error, info, list, type ProgressBar } from "./output.ts"; import { startLogTailer, type LogTailer } from "./logTailer.ts"; +import { promptForToken } from "./pullPrompts.ts"; +import { promptInitialBranch, promptFullBranchSelector } from "./branchPrompts.ts"; +import { parseGithubRepo } from "@bb/ingest-github"; interface IndexResponse { knowledgeId: string; @@ -52,12 +55,16 @@ async function runIndex( if (options.verbose === true) { 
tailer = await startLogTailer("server"); } - const body: Record = { repoUrl: gitUrl }; - if (options.branch !== undefined) { - body["branch"] = options.branch; + + const { branch: resolvedBranch, token: activeToken } = await probeRepo(gitUrl, options.branch, options.token); + if (resolvedBranch === null) { + // User cancelled during token prompt + return; } - if (options.token !== undefined) { - body["gitToken"] = options.token; + + const body: Record = { repoUrl: gitUrl, branch: resolvedBranch }; + if (activeToken !== undefined) { + body["gitToken"] = activeToken; } const response = await postJson("/api/v1/github/index", body); await pollJobStatus(response.knowledgeId, response.jobId); @@ -126,6 +133,104 @@ async function pollJobStatus(knowledgeId: string, jobId: string): Promise } } +interface ProbeResponse { + status: "ok" | "not_found" | "unauthorized" | "rate_limited" | "error" | "branch_not_found"; + defaultBranch?: string; + branches?: string[]; + message?: string; +} + +async function probeRepo( + gitUrl: string, + suppliedBranch?: string, + suppliedToken?: string, +): Promise<{ branch: string | null; token?: string }> { + let token = suppliedToken; + const parsed = parseGithubRepo(gitUrl); + const repoLabel = parsed ? `${parsed.owner}/${parsed.repo}` : gitUrl; + + // 1. Initial probe to find default branch and check access + const callProbe = async (t?: string) => { + try { + return await postJson("/api/v1/github/probe", { repoUrl: gitUrl, gitToken: t }); + } catch (cause) { + if (cause instanceof HttpClientError && (cause.status === 401 || cause.status === 404)) { + return (cause.body as ProbeResponse) || { status: cause.status === 404 ? "not_found" : "unauthorized" }; + } + throw cause; + } + }; + + let probe = await callProbe(token); + + // 2. Handle private repo if needed + if (probe.status === "not_found" || probe.status === "unauthorized") { + const promptMessage = + probe.status === "unauthorized" + ? "The previous token was rejected. 
Try a different PAT." + : "This repo looks private. Paste a GitHub PAT with `repo` scope."; + const tokenResult = await promptForToken(repoLabel, promptMessage); + if (tokenResult === null) { + info("Cancelled."); + return { branch: null }; + } + token = tokenResult; + probe = await callProbe(token); + } + + if (probe.status !== "ok") { + error(probe.message ?? "Failed to probe repository."); + return { branch: null }; + } + + // 3. If a branch was already supplied (via flag or URL), just verify it + const branchFromUrl = parsed?.branch; + const initialBranch = suppliedBranch ?? branchFromUrl; + if (initialBranch !== undefined) { + if (probe.branches && !probe.branches.includes(initialBranch)) { + error(`Branch '${initialBranch}' not found.`); + if (probe.branches.length > 0) { + list("Available branches:", probe.branches.slice(0, 20)); + } + return { branch: null }; + } + const res: { branch: string | null; token?: string } = { branch: initialBranch }; + if (token) { + res.token = token; + } + return res; + } + + // 4. Interactive menu flow + const defaultBranch = probe.defaultBranch ?? "main"; + const choice = await promptInitialBranch(defaultBranch); + if (choice === null) { + info("Cancelled."); + return { branch: null }; + } + + if (choice === "default") { + const res: { branch: string | null; token?: string } = { branch: defaultBranch }; + if (token) { + res.token = token; + } + return res; + } + + // User selected "Other branch..." + const fullSelection = await promptFullBranchSelector(probe.branches ?? 
[]); + if (fullSelection === null) { + info("Cancelled."); + return { branch: null }; + } + + const res: { branch: string | null; token?: string } = { branch: fullSelection.branch }; + if (token) { + res.token = token; + } + return res; +} + function handleError(cause: unknown): void { if (cause instanceof ServerStartTimeoutError) { error(cause.message); diff --git a/packages/cli/src/InitialBranchSelector.tsx b/packages/cli/src/InitialBranchSelector.tsx new file mode 100644 index 0000000..70b76c3 --- /dev/null +++ b/packages/cli/src/InitialBranchSelector.tsx @@ -0,0 +1,67 @@ +import { useState } from "react"; +import type { ReactElement } from "react"; +import { Box, Text, useApp, useInput } from "ink"; + +export interface InitialBranchResult { + choice?: "default" | "other"; + cancelled?: boolean; +} + +export interface InitialBranchProps { + defaultBranch: string; + onDone: (result: InitialBranchResult) => void; +} + +export function InitialBranchSelector({ defaultBranch, onDone }: InitialBranchProps): ReactElement { + const { exit } = useApp(); + const [index, setIndex] = useState(0); + + const items = [ + { label: `Default branch (${defaultBranch})`, value: "default" as const }, + { label: "Other branch...", value: "other" as const }, + ]; + + useInput((_input, key) => { + if (key.escape) { + exit(); + onDone({ cancelled: true }); + return; + } + if (key.return) { + exit(); + const choice = items[index]?.value; + if (choice) { + onDone({ choice }); + } else { + onDone({ cancelled: true }); + } + return; + } + if (key.upArrow) { + setIndex(0); + } + if (key.downArrow) { + setIndex(1); + } + }); + + return ( + + + Which branch would you like to index? + + {items.map((item, i) => { + const cursor = i === index; + return ( + + {cursor ? 
"▶ " : " "} + {item.label} + + ); + })} + + [↑/↓] move [Enter] choose [Esc] cancel + + + ); +} diff --git a/packages/cli/src/LsCommand.ts b/packages/cli/src/LsCommand.ts index c0f103e..5da25cf 100644 --- a/packages/cli/src/LsCommand.ts +++ b/packages/cli/src/LsCommand.ts @@ -4,17 +4,8 @@ import { getConfigValue } from "@bb/config"; import { ensureServerRunning, ServerStartTimeoutError } from "./serverSpawn.ts"; import { getJson, HttpClientError } from "./httpClient.ts"; import { createSpinner, error } from "./output.ts"; - -interface RepoEntry { - knowledgeId: string; - source: - | { kind: "github"; repoUrl: string; branch?: string; commitId?: string; commitHashes?: string[] } - | { kind: "local"; sourcePath: string }; - state: string; - createdAt: string; - updatedAt: string; - fileCount: number; -} +import { promptLsInteractive } from "./lsInteractivePrompt.ts"; +import type { RepoEntry } from "./LsInteractive.tsx"; interface ListResponse { repos: RepoEntry[]; @@ -22,11 +13,15 @@ interface ListResponse { export function buildLsCommand(): Command { const cmd = new Command("ls"); - cmd.description("List indexed knowledge entries.").action(runLs); + cmd + .description("List indexed knowledge entries.") + .option("-i, --interactive", "Use interactive selector to browse entries.", true) + .option("--no-interactive", "Display a plain table instead of the interactive selector.") + .action(runLs); return cmd; } -async function runLs(): Promise { +async function runLs(options: { interactive?: boolean }): Promise { try { let ctx: Awaited>; if ( @@ -47,6 +42,12 @@ async function runLs(): Promise { ); return; } + + if (options.interactive !== false) { + await promptLsInteractive(repos); + return; + } + renderTable(repos); process.stdout.write(`\n${repos.length} ${repos.length === 1 ? 
"entry" : "entries"}.\n`); } catch (cause: unknown) { diff --git a/packages/cli/src/LsInteractive.tsx b/packages/cli/src/LsInteractive.tsx new file mode 100644 index 0000000..886328d --- /dev/null +++ b/packages/cli/src/LsInteractive.tsx @@ -0,0 +1,289 @@ +import { useState, useMemo } from "react"; +import type { ReactElement } from "react"; +import { Box, Text, useApp, useInput } from "ink"; + +export interface RepoEntry { + knowledgeId: string; + source: + | { kind: "github"; repoUrl: string; branch?: string; commitId?: string; commitHashes?: string[] } + | { kind: "local"; sourcePath: string }; + state: string; + createdAt: string; + updatedAt: string; + fileCount: number; +} + +export interface LsInteractiveProps { + repos: RepoEntry[]; + onDone: () => void; +} + +type ViewMode = "repos" | "branches" | "details"; + +export function LsInteractive({ repos, onDone }: LsInteractiveProps): ReactElement { + const { exit } = useApp(); + const [mode, setMode] = useState("repos"); + const [repoIndex, setRepoIndex] = useState(0); + const [branchIndex, setBranchIndex] = useState(0); + const [selectedRepoUrl, setSelectedRepoUrl] = useState(null); + const [selectedEntry, setSelectedEntry] = useState(null); + + // Group repos by their source URL or Path + const groupedRepos = useMemo(() => { + const groups: Record = {}; + for (const r of repos) { + const key = r.source.kind === "github" ? r.source.repoUrl : r.source.sourcePath; + if (!groups[key]) { + groups[key] = []; + } + groups[key].push(r); + } + return Object.entries(groups).map(([url, entries]) => { + const firstEntry = entries[0]; + if (!firstEntry) { + throw new Error("empty group"); + } + return { + url, + kind: firstEntry.source.kind, + entries, + }; + }); + }, [repos]); + + const currentBranches = useMemo(() => { + if (!selectedRepoUrl) { + return []; + } + const group = groupedRepos.find((g) => g.url === selectedRepoUrl); + return group ? 
group.entries : []; + }, [selectedRepoUrl, groupedRepos]); + + const handleBack = () => { + if (mode === "details") { + setMode("branches"); + } else if (mode === "branches") { + setMode("repos"); + setSelectedRepoUrl(null); + } else { + exit(); + onDone(); + } + }; + + useInput((input, key) => { + if (key.escape || (input === "q" && mode === "repos")) { + exit(); + onDone(); + return; + } + + if (key.backspace || input === "b" || key.leftArrow) { + handleBack(); + return; + } + + if (mode === "repos") { + if (key.upArrow || input === "k") { + setRepoIndex((i) => (i > 0 ? i - 1 : groupedRepos.length - 1)); + } else if (key.downArrow || input === "j") { + setRepoIndex((i) => (i < groupedRepos.length - 1 ? i + 1 : 0)); + } else if (key.return || key.rightArrow || input === "l") { + const selected = groupedRepos[repoIndex]; + if (selected) { + setSelectedRepoUrl(selected.url); + setBranchIndex(0); + setMode("branches"); + } + } + } else if (mode === "branches") { + if (key.upArrow || input === "k") { + setBranchIndex((i) => (i > 0 ? i - 1 : currentBranches.length - 1)); + } else if (key.downArrow || input === "j") { + setBranchIndex((i) => (i < currentBranches.length - 1 ? i + 1 : 0)); + } else if (key.return || key.rightArrow || input === "l") { + const selected = currentBranches[branchIndex]; + if (selected) { + setSelectedEntry(selected); + setMode("details"); + } + } + } + }); + + const renderRepos = () => ( + + + + Indexed Repositories ({groupedRepos.length}) + + + {groupedRepos.map((group, i) => ( + + {i === repoIndex ? "▶ " : " "} + + {group.kind === "github" ? parseGithubSlug(group.url) : group.url} + + ({group.entries.length} entries) + + ))} + + [↑/↓] move [Enter/→] branches [q/Esc] exit + + + ); + + const renderBranches = () => ( + + + + Repos /{" "} + + + {selectedRepoUrl + ? currentBranches[0]?.source.kind === "github" + ? parseGithubSlug(selectedRepoUrl) + : selectedRepoUrl + : ""} + + + {currentBranches.map((entry, i) => ( + + {i === branchIndex ? 
"▶ " : " "} + + {entry.source.kind === "github" ? (entry.source.branch ?? "default") : "local"} + + + {entry.state.padEnd(10)} + {entry.knowledgeId.slice(0, 8)}… + + + ))} + + [↑/↓] move [Enter/→] details [Esc/←] back + + + ); + + const renderDetails = () => { + if (!selectedEntry) { + return null; + } + const s = selectedEntry.source; + return ( + + + + Repos / {s.kind === "github" ? parseGithubSlug(s.repoUrl) : s.sourcePath} /{" "} + + + {s.kind === "github" ? (s.branch ?? "default") : "local"} + + + + + + + + + + + {s.kind === "github" && ( + <> + + + GitHub Details + + + + + + + + + Indexed Commits ({s.commitHashes?.length ?? 0}) + + + {(s.commitHashes ?? []).map((h, i) => ( + + {i + 1}. + {h.slice(0, 8)} + {h === s.commitId && (current head)} + + ))} + {(!s.commitHashes || s.commitHashes.length === 0) && ( + + No commit history recorded. + + )} + + )} + + {s.kind === "local" && ( + <> + + + Local Details + + + + + )} + + + + [Esc/←/Backspace] back + + + ); + }; + + return ( + + {mode === "repos" && renderRepos()} + {mode === "branches" && renderBranches()} + {mode === "details" && renderDetails()} + + ); +} + +function DetailRow({ label, value, color }: { label: string; value: string; color?: string }) { + return ( + + + {label}: + + {value} + + ); +} + +function getStateColor(state: string): string { + switch (state) { + case "PROCESSED": + return "green"; + case "PROCESSING": + return "yellow"; + case "FAILED": + return "red"; + default: + return "white"; + } +} + +function parseGithubSlug(repoUrl: string): string { + try { + const u = new URL(repoUrl); + return u.pathname.replace(/^\/+/u, "").replace(/\.git$/u, ""); + } catch { + return repoUrl; + } +} + +function formatDate(iso: string): string { + const d = new Date(iso); + if (Number.isNaN(d.getTime())) { + return iso; + } + return d.toLocaleString(); +} diff --git a/packages/cli/src/ManualBranchPrompt.tsx b/packages/cli/src/ManualBranchPrompt.tsx new file mode 100644 index 0000000..8d3836c --- /dev/null 
+++ b/packages/cli/src/ManualBranchPrompt.tsx @@ -0,0 +1,42 @@ +import { useState } from "react"; +import type { ReactElement } from "react"; +import { Box, Text, useApp, useInput } from "ink"; +import { Field } from "./Field.tsx"; + +export interface ManualBranchPromptResult { + branch?: string; + cancelled?: boolean; +} + +export interface ManualBranchPromptProps { + onDone: (result: ManualBranchPromptResult) => void; +} + +export function ManualBranchPrompt({ onDone }: ManualBranchPromptProps): ReactElement { + const { exit } = useApp(); + const [value, setValue] = useState(""); + + useInput((_input, key) => { + if (key.escape) { + exit(); + onDone({ cancelled: true }); + return; + } + if (key.return && value.length > 0) { + exit(); + onDone({ branch: value }); + } + }); + + return ( + + + Type branch name manually + + + + [Enter] submit [Esc] cancel + + + ); +} diff --git a/packages/cli/src/branchPrompts.ts b/packages/cli/src/branchPrompts.ts new file mode 100644 index 0000000..4e182a4 --- /dev/null +++ b/packages/cli/src/branchPrompts.ts @@ -0,0 +1,71 @@ +import React from "react"; +import { render } from "ink"; +import { InitialBranchSelector, type InitialBranchResult } from "./InitialBranchSelector.tsx"; +import { BranchSelector, type BranchSelectorResult } from "./BranchSelector.tsx"; +import { ManualBranchPrompt, type ManualBranchPromptResult } from "./ManualBranchPrompt.tsx"; + +export async function promptInitialBranch(defaultBranch: string): Promise<"default" | "other" | null> { + return new Promise<"default" | "other" | null>((resolve) => { + const onDone = (result: InitialBranchResult): void => { + if (result.choice !== undefined) { + resolve(result.choice); + return; + } + resolve(null); + }; + const { waitUntilExit } = render( + React.createElement(InitialBranchSelector, { + defaultBranch, + onDone, + }), + ); + waitUntilExit().catch(() => undefined); + }); +} + +export async function promptFullBranchSelector( + branches: string[], +): Promise<{ 
branch: string; manual: boolean } | null> { + const result = await new Promise((resolve) => { + const onDone = (res: BranchSelectorResult): void => { + resolve(res); + }; + const { waitUntilExit } = render( + React.createElement(BranchSelector, { + branches, + onDone, + }), + ); + waitUntilExit().catch(() => undefined); + }); + + if (result.cancelled) { + return null; + } + if (result.typeManually) { + const manual = await promptManualBranch(); + return manual ? { branch: manual, manual: true } : null; + } + if (result.branch) { + return { branch: result.branch, manual: false }; + } + return null; +} + +async function promptManualBranch(): Promise { + return new Promise((resolve) => { + const onDone = (result: ManualBranchPromptResult): void => { + if (result.branch !== undefined) { + resolve(result.branch); + return; + } + resolve(null); + }; + const { waitUntilExit } = render( + React.createElement(ManualBranchPrompt, { + onDone, + }), + ); + waitUntilExit().catch(() => undefined); + }); +} diff --git a/packages/cli/src/httpClient.ts b/packages/cli/src/httpClient.ts index 4bdf8a1..b2bb867 100644 --- a/packages/cli/src/httpClient.ts +++ b/packages/cli/src/httpClient.ts @@ -12,12 +12,14 @@ export function baseUrl(): string { export class HttpClientError extends Error { override readonly name = "HttpClientError"; readonly status: number | undefined; + readonly body: unknown | undefined; - constructor(message: string, status?: number) { + constructor(message: string, status?: number, body?: unknown) { super(message); if (status !== undefined) { this.status = status; } + this.body = body; } } @@ -74,17 +76,23 @@ async function parseResponse(res: Response): Promise { if (!res.ok) { const text = await res.text().catch(() => ""); let message = `HTTP ${res.status}`; + let body: unknown = undefined; try { - const parsed = JSON.parse(text) as { error?: unknown }; - if (typeof parsed.error === "string") { - message = parsed.error; + const parsed = JSON.parse(text); + body = 
parsed; + if (typeof parsed === "object" && parsed !== null) { + if ("error" in parsed && typeof parsed.error === "string") { + message = parsed.error; + } else if ("message" in parsed && typeof parsed.message === "string") { + message = parsed.message; + } } } catch { if (text.length > 0) { message = text.slice(0, 500); } } - throw new HttpClientError(message, res.status); + throw new HttpClientError(message, res.status, body); } return (await res.json()) as T; } diff --git a/packages/cli/src/lsInteractivePrompt.ts b/packages/cli/src/lsInteractivePrompt.ts new file mode 100644 index 0000000..64d5dad --- /dev/null +++ b/packages/cli/src/lsInteractivePrompt.ts @@ -0,0 +1,23 @@ +import React from "react"; +import { render } from "ink"; +import { LsInteractive, type RepoEntry } from "./LsInteractive.tsx"; + +/** + * Renders the interactive repository list and waits for the user to exit. + */ +export async function promptLsInteractive(repos: RepoEntry[]): Promise { + return new Promise((resolve) => { + const onDone = (): void => { + resolve(); + }; + + const { waitUntilExit } = render( + React.createElement(LsInteractive, { + repos, + onDone, + }), + ); + + waitUntilExit().catch(() => resolve()); + }); +} diff --git a/packages/cli/src/pullPrompts.ts b/packages/cli/src/pullPrompts.ts index 039b13d..aa7e6b5 100644 --- a/packages/cli/src/pullPrompts.ts +++ b/packages/cli/src/pullPrompts.ts @@ -63,7 +63,7 @@ export async function resolveCommit( return null; } -async function promptForToken(repoLabel: string, message: string): Promise { +export async function promptForToken(repoLabel: string, message: string): Promise { return new Promise((resolve) => { const onDone = (result: TokenPromptResult): void => { if (result.token !== undefined && result.token.length > 0) { diff --git a/packages/cli/tsconfig.json b/packages/cli/tsconfig.json index 614acd6..4ed0786 100644 --- a/packages/cli/tsconfig.json +++ b/packages/cli/tsconfig.json @@ -1,9 +1,4 @@ { "extends": 
"../../tsconfig.base.json", - "compilerOptions": { - "rootDir": "./src", - "outDir": "./dist", - "jsx": "react-jsx" - }, - "include": ["src/**/*"] + "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/config/README.md b/packages/config/README.md index 33882c7..97e74bc 100644 --- a/packages/config/README.md +++ b/packages/config/README.md @@ -40,9 +40,31 @@ function getBytebellHome(): string function getConfigPath(): string function ensureBytebellHome(): void +function seedConfig(value: unknown): BytebellConfig +function __isSeeded(): boolean +class ConfigSeededError extends Error + +function setBytebellHomeResolver(fn: (() => string | null) | null): void + +function __resetSeedForTests(): void // test-only function __setBytebellHomeForTests(home: string | null): void // test-only ``` +`setBytebellHomeResolver` registers an override function invoked on every +`getBytebellHome()` call (no caching). The resolver returns the home directory +to use for the current invocation, or `null` to fall through to the +`~/.bytebell` default. Pass `null` to clear. + +`seedConfig` injects a pre-parsed config object into the in-memory cache, +validated through `configSchema.parse`. When seeded, `loadConfig()` returns +the seeded values and **does not** call `ensureBytebellHome()` or read +`config.json`. The cache invalidator is also no-op while seeded, so the seed +survives unexpected `__notifyConfigChanged` events. `setConfigValue` throws +`ConfigSeededError` when invoked against a seeded cache — writes are disabled +in that mode. When `seedConfig` is never called, behaviour is bit-for-bit the +disk-backed path: `loadConfig()` materializes `~/.bytebell/config.json` on +first read and `setConfigValue` performs atomic writes. + The `Config` enum lives in `@bb/types`; `ConfigIncompleteError` lives in `@bb/errors`. Both are imported from those packages directly, not from `@bb/config`. @@ -74,8 +96,9 @@ This package does **not** own: 1. 
**No env var reads.** Source files contain no `process.env` references. Enforced at lint time ([eslint.config.mjs:71-94](../../eslint.config.mjs#L71-L94)). -2. **No `.env` / `dotenv` / `BYTEBELL_HOME`.** The only test seam is the - programmatic `__setBytebellHomeForTests`. +2. **No `.env` / `dotenv` / `BYTEBELL_HOME`.** Programmatic override seams + are `__setBytebellHomeForTests` (test-only, static) and + `setBytebellHomeResolver` (per-call function). 3. **Strict schema.** Unknown keys in `config.json` cause `loadConfig()` to throw — typo defense. 4. **Defaults always present.** `loadConfig()` never returns a partial config; diff --git a/packages/config/package.json b/packages/config/package.json index a38c0a2..d31ebe4 100644 --- a/packages/config/package.json +++ b/packages/config/package.json @@ -8,6 +8,9 @@ "exports": { ".": "./src/index.ts" }, + "imports": { + "#src/*": "./src/*" + }, "dependencies": { "@bb/types": "workspace:*", "zod": "^4.3.6" diff --git a/packages/config/src/README.md b/packages/config/src/README.md index 6ab129e..221e1e2 100644 --- a/packages/config/src/README.md +++ b/packages/config/src/README.md @@ -8,9 +8,11 @@ package-level contract; this file documents how the source tree is split. - **[index.ts](index.ts)** — public re-exports. The only entry point other packages may import. Anything not re-exported here is internal. - **[paths.ts](paths.ts)** — `getBytebellHome`, `getConfigPath`, and the - cache-invalidator registry. Holds the `testHomeOverride` state used by - `__setBytebellHomeForTests`. Pure: imports nothing from the rest of the - package. + cache-invalidator registry. Holds the `testHomeOverride` slot set by + `__setBytebellHomeForTests` and the `homeResolver` slot set by + `setBytebellHomeResolver`. `getBytebellHome` consults the test override + first, then the resolver (if set and returning non-null), then falls back + to `~/.bytebell`. Pure: imports nothing from the rest of the package. 
- **[schema.ts](schema.ts)** — Zod `configSchema`, `BytebellConfig` type, `ConfigValueMap`, `DEFAULT_CONFIG`, `REQUIRED_KEYS` (infra-always), `requiredKeysFor(provider)` (combines infra + provider-specific keys diff --git a/packages/config/src/index.ts b/packages/config/src/index.ts index de390ec..b4cefb3 100644 --- a/packages/config/src/index.ts +++ b/packages/config/src/index.ts @@ -1,9 +1,15 @@ export { LOG_LEVELS, LLM_PROVIDERS, HINTS } from "./schema.ts"; export type { BytebellConfig, ConfigValue, ConfigValueMap, LogLevel, LlmProvider } from "./schema.ts"; -export { loadConfig, getConfigValue, isConfigComplete } from "./loader.ts"; +export { loadConfig, getConfigValue, isConfigComplete, seedConfig, __isSeeded, __resetSeedForTests } from "./loader.ts"; export type { ConfigCompletenessResult } from "./loader.ts"; -export { setConfigValue, ensureBytebellHome } from "./writer.ts"; +export { setConfigValue, ensureBytebellHome, ConfigSeededError } from "./writer.ts"; -export { getBytebellHome, getConfigPath, isDevMode, __setBytebellHomeForTests } from "./paths.ts"; +export { + getBytebellHome, + getConfigPath, + isDevMode, + setBytebellHomeResolver, + __setBytebellHomeForTests, +} from "./paths.ts"; diff --git a/packages/config/src/loader.ts b/packages/config/src/loader.ts index 5de16c2..b3adcb4 100644 --- a/packages/config/src/loader.ts +++ b/packages/config/src/loader.ts @@ -12,11 +12,30 @@ import { __registerCacheInvalidator, getConfigPath } from "./paths.ts"; import { ensureBytebellHome } from "./writer.ts"; let cached: BytebellConfig | null = null; +let seeded = false; __registerCacheInvalidator(() => { + if (seeded) { + return; + } cached = null; }); +export function seedConfig(value: unknown): BytebellConfig { + cached = configSchema.parse(value); + seeded = true; + return cached; +} + +export function __isSeeded(): boolean { + return seeded; +} + +export function __resetSeedForTests(): void { + cached = null; + seeded = false; +} + export function 
loadConfig(): BytebellConfig { if (cached !== null) { return cached; diff --git a/packages/config/src/paths.ts b/packages/config/src/paths.ts index b93876e..b9d02f8 100644 --- a/packages/config/src/paths.ts +++ b/packages/config/src/paths.ts @@ -2,15 +2,33 @@ import os from "node:os"; import path from "node:path"; let testHomeOverride: string | null = null; +let homeResolver: (() => string | null) | null = null; const cacheInvalidators: Array<() => void> = []; export function getBytebellHome(): string { if (testHomeOverride !== null) { return testHomeOverride; } + if (homeResolver !== null) { + const resolved = homeResolver(); + if (resolved !== null) { + return resolved; + } + } return path.join(os.homedir(), ".bytebell"); } +/** + * Register an override resolver for `getBytebellHome()`. The resolver runs on + * every call (no caching) so it may return different values across invocations. + * Returning `null` falls through to the `~/.bytebell` default. Pass `null` to + * clear the resolver. 
+ */ +export function setBytebellHomeResolver(fn: (() => string | null) | null): void { + homeResolver = fn; + __notifyConfigChanged(); +} + export function getConfigPath(): string { return path.join(getBytebellHome(), "config.json"); } diff --git a/packages/config/src/schema.ts b/packages/config/src/schema.ts index 63a65d4..d5bae9d 100644 --- a/packages/config/src/schema.ts +++ b/packages/config/src/schema.ts @@ -41,6 +41,10 @@ export const configSchema = z "big.file.concurrency": z.number().int().positive().default(25), "absolute.file.size.cap": z.number().int().positive().default(52428800), "concurrent.workers": z.number().int().positive().default(4), + "llm.concurrency": z.number().int().positive().default(29), + "folder.summary.batch.size": z.number().int().positive().default(10), + "folder.summary.batch.max.files": z.number().int().positive().default(15), + "neo4j.batch.size": z.number().int().positive().default(50), "condense.context.limit": z.number().int().positive().default(12000), "condense.prompt.overhead": z.number().int().nonnegative().default(1500), "small.file.dedup.threshold": z.number().int().positive().default(3), @@ -81,6 +85,10 @@ export type ConfigValueMap = { [Config.BigFileConcurrency]: number; [Config.AbsoluteFileSizeCap]: number; [Config.ConcurrentWorkers]: number; + [Config.LlmConcurrency]: number; + [Config.FolderSummaryBatchSize]: number; + [Config.FolderSummaryBatchMaxFiles]: number; + [Config.Neo4jBatchSize]: number; [Config.CondenseContextLimit]: number; [Config.CondensePromptOverhead]: number; [Config.SmallFileDedupThreshold]: number; @@ -135,6 +143,10 @@ export const HINTS: Readonly> = { [Config.BigFileConcurrency]: "bytebell set big.file.concurrency ", [Config.AbsoluteFileSizeCap]: "bytebell set absolute.file.size.cap ", [Config.ConcurrentWorkers]: "bytebell set concurrent.workers ", + [Config.LlmConcurrency]: "bytebell set llm.concurrency ", + [Config.FolderSummaryBatchSize]: "bytebell set folder.summary.batch.size ", + 
[Config.FolderSummaryBatchMaxFiles]: "bytebell set folder.summary.batch.max.files ", + [Config.Neo4jBatchSize]: "bytebell set neo4j.batch.size ", [Config.CondenseContextLimit]: "bytebell set condense.context.limit ", [Config.CondensePromptOverhead]: "bytebell set condense.prompt.overhead ", [Config.SmallFileDedupThreshold]: "bytebell set small.file.dedup.threshold ", @@ -195,6 +207,14 @@ export function readField(cfg: BytebellConfig, key: K): Config return cfg["absolute.file.size.cap"] as ConfigValue; case Config.ConcurrentWorkers: return cfg["concurrent.workers"] as ConfigValue; + case Config.LlmConcurrency: + return cfg["llm.concurrency"] as ConfigValue; + case Config.FolderSummaryBatchSize: + return cfg["folder.summary.batch.size"] as ConfigValue; + case Config.FolderSummaryBatchMaxFiles: + return cfg["folder.summary.batch.max.files"] as ConfigValue; + case Config.Neo4jBatchSize: + return cfg["neo4j.batch.size"] as ConfigValue; case Config.CondenseContextLimit: return cfg["condense.context.limit"] as ConfigValue; case Config.CondensePromptOverhead: @@ -264,6 +284,14 @@ export function writeField(cfg: BytebellConfig, key: K, value: return { ...cfg, "absolute.file.size.cap": value as number }; case Config.ConcurrentWorkers: return { ...cfg, "concurrent.workers": value as number }; + case Config.LlmConcurrency: + return { ...cfg, "llm.concurrency": value as number }; + case Config.FolderSummaryBatchSize: + return { ...cfg, "folder.summary.batch.size": value as number }; + case Config.FolderSummaryBatchMaxFiles: + return { ...cfg, "folder.summary.batch.max.files": value as number }; + case Config.Neo4jBatchSize: + return { ...cfg, "neo4j.batch.size": value as number }; case Config.CondenseContextLimit: return { ...cfg, "condense.context.limit": value as number }; case Config.CondensePromptOverhead: diff --git a/packages/config/src/writer.ts b/packages/config/src/writer.ts index 04d73ae..c89a82c 100644 --- a/packages/config/src/writer.ts +++ 
b/packages/config/src/writer.ts @@ -1,7 +1,15 @@ import fs from "node:fs"; import { configSchema, Config, type BytebellConfig, type ConfigValue, DEFAULT_CONFIG, writeField } from "./schema.ts"; +import { __isSeeded } from "./loader.ts"; import { getBytebellHome, getConfigPath, __notifyConfigChanged } from "./paths.ts"; +export class ConfigSeededError extends Error { + constructor() { + super("config cache is seeded; setConfigValue is disabled"); + this.name = "ConfigSeededError"; + } +} + const FILE_MODE = 0o600; const DIR_MODE = 0o700; @@ -41,6 +49,9 @@ export function ensureBytebellHome(): void { } export function setConfigValue(key: K, value: ConfigValue): void { + if (__isSeeded()) { + throw new ConfigSeededError(); + } ensureBytebellHome(); const current = readConfigFile(); const next = writeField(current, key, value); diff --git a/packages/config/tsconfig.json b/packages/config/tsconfig.json index b2f9baa..4ed0786 100644 --- a/packages/config/tsconfig.json +++ b/packages/config/tsconfig.json @@ -1,10 +1,4 @@ { "extends": "../../tsconfig.base.json", - "compilerOptions": { - "rootDir": "./src", - "outDir": "./dist", - "noEmit": false, - "emitDeclarationOnly": true - }, - "include": ["src/**/*"] + "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/errors/package.json b/packages/errors/package.json index c8d42db..bf2da88 100644 --- a/packages/errors/package.json +++ b/packages/errors/package.json @@ -8,6 +8,9 @@ "exports": { ".": "./src/index.ts" }, + "imports": { + "#src/*": "./src/*" + }, "dependencies": { "@bb/types": "workspace:*" } diff --git a/packages/errors/src/README.md b/packages/errors/src/README.md index c2752d6..4edfc65 100644 --- a/packages/errors/src/README.md +++ b/packages/errors/src/README.md @@ -36,7 +36,13 @@ package-level contract; this file documents how the source tree is split. - **[llm-errors.ts](llm-errors.ts)** — errors thrown by `@bb/llm`. 
Today: `LlmConfigError` (missing OpenRouter API key; carries the `bytebell keys set` hint), `LlmError` (HTTP non-2xx, timeout, empty - completion; accepts an optional `cause`). + completion; accepts an optional `cause` plus an optional + `{ status?: number; detail?: string }` options bag — `status` is the + provider HTTP status when the failure originated from a non-OK response, + `detail` is the raw response body capped to 4000 chars. Downstream + classifiers like `@bb/ingest-github/src/pipeline/failure-classifier.ts` + map `status` → `KnowledgeFailureCategory` so operators see the right + remediation hint). - **[ingest-errors.ts](ingest-errors.ts)** — errors thrown by `@bb/ingest-*` workers and `@bb/cli`'s ingest command. Today: `GitCloneError` (git binary failed; redacts userinfo in the repo URL diff --git a/packages/errors/src/llm-errors.ts b/packages/errors/src/llm-errors.ts index 67e2a70..bc5c00c 100644 --- a/packages/errors/src/llm-errors.ts +++ b/packages/errors/src/llm-errors.ts @@ -10,11 +10,21 @@ export class LlmConfigError extends Error { export class LlmError extends Error { override readonly name = "LlmError"; + /** HTTP status code from the provider when the failure originated from a non-OK response. */ + readonly status?: number; + /** Raw provider response body (or other structured detail), capped to a sane size by the thrower. 
*/ + readonly detail?: string; - constructor(message: string, cause?: unknown) { + constructor(message: string, cause?: unknown, options?: { status?: number; detail?: string }) { super(message); if (cause !== undefined) { this.cause = cause; } + if (options?.status !== undefined) { + this.status = options.status; + } + if (options?.detail !== undefined) { + this.detail = options.detail; + } } } diff --git a/packages/errors/tsconfig.json b/packages/errors/tsconfig.json index c2104f6..4ed0786 100644 --- a/packages/errors/tsconfig.json +++ b/packages/errors/tsconfig.json @@ -1,8 +1,4 @@ { "extends": "../../tsconfig.base.json", - "compilerOptions": { - "rootDir": "./src", - "outDir": "./dist" - }, - "include": ["src/**/*"] + "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/ingest-business-context/README.md b/packages/ingest-business-context/README.md new file mode 100644 index 0000000..566f444 --- /dev/null +++ b/packages/ingest-business-context/README.md @@ -0,0 +1,54 @@ +# `@bb/ingest-business-context` — context + +## Tier + +Domain. Depends on Kernel (`@bb/types`, `@bb/errors`), Infrastructure (`@bb/config`, `@bb/neo4j`), +Cross-cutting (`@bb/llm`), and Strategy (`@bb/queue`). One horizontal Domain→Domain dependency on +`@bb/ingest-github` (read-only path helpers + the on-disk layout it owns). May be imported by +Binaries (`@bb/server` calls `registerBusinessContextWorker()` once at boot). Never by `@bb/cli`. + +## Responsibility + +Attaches human-authored business-context notes to a specific indexed commit of a GitHub knowledge. +The package consumes `JobType.BusinessContextProcessing` jobs. For each job it: + +1. Validates the commit is indexed (Neo4j contains either `:File {knowledgeId}` or + `:FileVersion {knowledgeId, commitHash}`). +2. Reads optional enrichment from disk (`metaRoot/repo-summary.json`, `metaRoot/org//*.json`). +3. 
Runs one LLM call to generate a concise title, then three parallel LLM calls covering + product fields, technical fields, and the shared overview. +4. Persists the result to disk at + `metaRoot/commits/<commitHash>/business-context/<sanitizedTitle>/{original.txt,analysis.json}`. +5. Projects the analysis into Neo4j as a `:BusinessContext` node plus a `:BusinessContextVersion` + snapshot keyed by `(knowledgeId, commitHash)`. The version node `:DESCRIBES` every + `:FileVersion {knowledgeId, commitHash}` that exists for the same commit; if none exist yet + (BC authored before the commit was snapshotted), zero edges are created and a later run will + backfill them via the same idempotent MERGE. +6. Creates `:OrgKeyword` nodes for each array field (10 typed relationship classes such as + `HAS_DOMAIN_KEYWORD`, `HAS_STAKEHOLDER`, `HAS_AFFECTED_MODULE`) connected to the parent + `:BusinessContext` via `:APPEARS_IN_BUSINESS_CONTEXT`. + +## Public exports + +- `registerBusinessContextWorker(deps?)` — boots the worker. Called by the deployable at startup. +- `executeBusinessContextStrategy(input)` — the disk pipeline (validate → enrichment → title → + analysis → save). Returns the resolved storage paths and the title. Safe to call directly from + HTTP for synchronous flows. +- `storeBusinessContextToNeo4j(input, analysis, sanitizedTitle)` — graph persistence. Separated + so callers can run it inline or defer it. +- `BUSINESS_CONTEXT_FIELD_DEFS` — single source of truth for the 16-field LLM analysis schema. +- Types: `BusinessContextInput`, `BusinessContextAnalysis`, `BusinessContextStorageResult`, + `BusinessContextNeo4jResult`, `BusinessContextAnalysisMetadata`, `CommitNotIndexedError`. + +## Invariants + +- Single LLM call surface — never bypass `@bb/llm`. Outputs are validated against the field-defs + schema before persistence. +- `:BusinessContext` and `:BusinessContextVersion` are addressed by `(knowledgeId, nodeId)` / + `(knowledgeId, nodeId, commitHash)`; all MERGEs are idempotent and re-runnable.
+- `nodeId` is the sanitized title (kebab-case, ≤80 chars). Two BC submissions that LLM-title to the + same string will MERGE onto the same node — by design. +- No outbound calls. No GitHub-API lookups. The strategy never clones or pulls — it operates on + the meta-output already produced by `@bb/ingest-github` for the indexed commit. +- All disk writes are scoped under `metaRootFor(knowledgeId)/commits/<commitHash>/business-context/` + via the `@bb/ingest-github` path helpers — this package never invents its own layout. diff --git a/packages/ingest-business-context/package.json b/packages/ingest-business-context/package.json new file mode 100644 index 0000000..da74346 --- /dev/null +++ b/packages/ingest-business-context/package.json @@ -0,0 +1,24 @@ +{ + "name": "@bb/ingest-business-context", + "version": "0.0.0", + "private": true, + "type": "module", + "main": "./src/index.ts", + "types": "./src/index.ts", + "exports": { + ".": "./src/index.ts" + }, + "imports": { + "#src/*": "./src/*" + }, + "dependencies": { + "@bb/config": "workspace:*", + "@bb/errors": "workspace:*", + "@bb/ingest-github": "workspace:*", + "@bb/llm": "workspace:*", + "@bb/logger": "workspace:*", + "@bb/neo4j": "workspace:*", + "@bb/queue": "workspace:*", + "@bb/types": "workspace:*" + } +} diff --git a/packages/ingest-business-context/src/README.md b/packages/ingest-business-context/src/README.md new file mode 100644 index 0000000..02cc971 --- /dev/null +++ b/packages/ingest-business-context/src/README.md @@ -0,0 +1,37 @@ +# `@bb/ingest-business-context/src` — implementation map + +See [../README.md](../README.md) for the package contract.
+ +## Layout + +``` +src/ + README.md + index.ts Public barrel + field-defs.ts 16-field analysis schema (single source of truth) + types.ts Input / output / metadata interfaces + errors.ts CommitNotIndexedError, BusinessContextAnalysisFailedError + + prompt/ System + user prompt builders (title, analysis, user-message) + disk/ Disk persistence (sanitize-title, save-original, save-analysis, load-cached) + llm/ Enrichment-reader, enrichment-format, call-builder, merge, title, analyze-parallel + neo4j/ Indexes, relationship-types, serialize, write-node, write-version, write-keywords + strategy/ commit-validator, execute, store-graph + worker/ handler, register +``` + +## Import rules + +- Cross-folder within the package → `src/folder/file.ts`. +- Sibling within the same folder → `./file.ts`. +- Cross-package → `@bb/foo`. +- **Never** `../` parent traversal. + +## Module-graph rules + +- `disk/**` depends only on `node:fs`, `@bb/ingest-github` (paths), `@bb/logger`, and `src/types.ts`. +- `llm/**` depends only on `@bb/llm`, `@bb/logger`, `@bb/ingest-github` (paths), and `src/prompt/`, `src/field-defs.ts`, `src/types.ts`. +- `neo4j/**` depends only on `@bb/neo4j`, `@bb/logger`, and `src/types.ts`. +- `strategy/**` depends on `disk/`, `llm/`, `neo4j/`, `src/errors.ts`, `@bb/ingest-github` (paths), `@bb/logger`, `@bb/neo4j`. +- `worker/**` depends on `strategy/`, `@bb/queue`, `@bb/types`, `@bb/config`, `@bb/logger`. +- No layer skips another. The public API (`index.ts`) re-exports from each layer. diff --git a/packages/ingest-business-context/src/disk/README.md b/packages/ingest-business-context/src/disk/README.md new file mode 100644 index 0000000..dca6384 --- /dev/null +++ b/packages/ingest-business-context/src/disk/README.md @@ -0,0 +1,17 @@ +# `disk/` — context + +Persists business-context artefacts under +`metaRoot/commits/<commitHash>/business-context/<sanitizedTitle>/`. Paths come +from `@bb/ingest-github`'s `businessContextDir()` — this folder never builds +its own paths.
+ +| File | Responsibility | +| ------------------- | ------------------------------------------------------------------------------- | +| `sanitize-title.ts` | LLM title → kebab-case filesystem-safe slug (≤80 chars). Also the Neo4j nodeId. | +| `save-original.ts` | Writes `original.txt` (raw user-authored text, mode 0600). | +| `save-analysis.ts` | Wraps the analysis in a metadata envelope and writes `analysis.json`. | +| `load-cached.ts` | Reads back a saved envelope; tolerant of missing / malformed files. | + +Cache key is the sanitized title alone. Two BC submissions whose LLM titles +sanitise to the same slug share the same cached analysis (intentional — same +idea, same node). diff --git a/packages/ingest-business-context/src/disk/load-cached.ts b/packages/ingest-business-context/src/disk/load-cached.ts new file mode 100644 index 0000000..a93f68b --- /dev/null +++ b/packages/ingest-business-context/src/disk/load-cached.ts @@ -0,0 +1,41 @@ +import { readFile } from "node:fs/promises"; +import path from "node:path"; +import { businessContextDir } from "@bb/ingest-github"; +import { logger } from "@bb/logger"; +import type { BusinessContextAnalysisMetadata } from "#src/types.ts"; + +/** + * Returns a previously-saved analysis envelope if one exists for this title, + * otherwise `null`. The cache key is the sanitized title — same title across + * re-runs returns the same envelope and skips a fresh LLM call. + * + * Tolerant of missing or malformed files: the strategy treats `null` as a + * cache miss and proceeds with a full LLM run. We never crash on stale JSON. 
+ */ +export async function loadCachedAnalysis( + knowledgeId: string, + commitHash: string, + sanitizedTitle: string, +): Promise { + const filePath = path.join(businessContextDir(knowledgeId, commitHash, sanitizedTitle), "analysis.json"); + let content: string; + try { + content = await readFile(filePath, "utf-8"); + } catch { + return null; + } + try { + const parsed = JSON.parse(content) as BusinessContextAnalysisMetadata; + if (parsed.analysis === undefined || parsed.analysis === null) { + logger.warn(`business-context: cached envelope at ${filePath} has no analysis field; ignoring`); + return null; + } + logger.info( + `business-context: cache HIT at ${filePath} (generated ${parsed.generatedAt}, model ${parsed.modelName})`, + ); + return parsed; + } catch (err) { + logger.warn(`business-context: failed to parse cached analysis ${filePath}: ${(err as Error).message}`); + return null; + } +} diff --git a/packages/ingest-business-context/src/disk/sanitize-title.ts b/packages/ingest-business-context/src/disk/sanitize-title.ts new file mode 100644 index 0000000..a953abf --- /dev/null +++ b/packages/ingest-business-context/src/disk/sanitize-title.ts @@ -0,0 +1,25 @@ +const NON_ALNUM_DASH = /[^a-z0-9\s-]/gu; +const WHITESPACE_RUN = /\s+/gu; +const DASH_RUN = /-{2,}/gu; +const LEADING_OR_TRAILING_DASH = /^-|-$/gu; + +/** + * Converts an LLM-generated title into a filesystem-safe, URL-safe slug. + * + * Lowercase. Characters other than a-z, 0-9, whitespace and `-` are removed; + * whitespace and hyphen runs then collapse to single hyphens. Capped at 80 + * chars so the directory name stays comfortably under filesystem limits on + * every platform. Used as both the on-disk directory name and the Neo4j + * `nodeId` — two BC submissions whose LLM titles sanitise to the same slug + * MERGE onto the same `:BusinessContext` node (by design — same idea, same node).
+ */ +export function sanitizeTitle(title: string): string { + return title + .toLowerCase() + .replace(NON_ALNUM_DASH, "") + .replace(WHITESPACE_RUN, "-") + .replace(DASH_RUN, "-") + .replace(LEADING_OR_TRAILING_DASH, "") + .slice(0, 80) + .replace(/-$/u, ""); +} diff --git a/packages/ingest-business-context/src/disk/save-analysis.ts b/packages/ingest-business-context/src/disk/save-analysis.ts new file mode 100644 index 0000000..56a4422 --- /dev/null +++ b/packages/ingest-business-context/src/disk/save-analysis.ts @@ -0,0 +1,48 @@ +import { mkdir, writeFile } from "node:fs/promises"; +import path from "node:path"; +import { businessContextDir } from "@bb/ingest-github"; +import { logger } from "@bb/logger"; +import type { BusinessContextAnalysis, BusinessContextAnalysisMetadata } from "#src/types.ts"; + +const DIR_MODE = 0o700; + +export interface SaveAnalysisMetadata { + commitHash: string; + modelName: string; + inputTokens: number; + outputTokens: number; + description?: string; +} + +/** + * Wraps the LLM analysis in a metadata envelope (provenance: model, tokens, + * timestamp) and writes it as `analysis.json` next to `original.txt`. The + * envelope is the cached value — loadCachedAnalysis() reads it back on the + * next run for the same knowledge, commit and sanitized title. + */ +export async function saveAnalysis( + knowledgeId: string, + commitHash: string, + sanitizedTitle: string, + analysis: BusinessContextAnalysis, + meta: SaveAnalysisMetadata, +): Promise { + const envelope: BusinessContextAnalysisMetadata = { + generatedAt: new Date().toISOString(), + commitHash: meta.commitHash, + modelName: meta.modelName, + inputTokens: meta.inputTokens, + outputTokens: meta.outputTokens, + ...(meta.description !== undefined ? 
{ description: meta.description } : {}), + analysis, + }; + + const dir = businessContextDir(knowledgeId, commitHash, sanitizedTitle); + await mkdir(dir, { recursive: true, mode: DIR_MODE }); + const filePath = path.join(dir, "analysis.json"); + await writeFile(filePath, JSON.stringify(envelope, null, 2), { encoding: "utf-8", mode: 0o600 }); + logger.info( + `business-context: saved analysis at ${filePath} (model=${meta.modelName}, ${meta.inputTokens} in / ${meta.outputTokens} out)`, + ); + return filePath; +} diff --git a/packages/ingest-business-context/src/disk/save-original.ts b/packages/ingest-business-context/src/disk/save-original.ts new file mode 100644 index 0000000..60a0073 --- /dev/null +++ b/packages/ingest-business-context/src/disk/save-original.ts @@ -0,0 +1,25 @@ +import { mkdir, writeFile } from "node:fs/promises"; +import path from "node:path"; +import { businessContextDir } from "@bb/ingest-github"; +import { logger } from "@bb/logger"; + +const DIR_MODE = 0o700; + +/** + * Persists the raw user-authored text. Mirror copy of the input — used for + * audit (proving what was analysed) and for re-running the analysis later + * against an updated field-defs schema without re-prompting the user. 
+ */ +export async function saveOriginalText( + knowledgeId: string, + commitHash: string, + sanitizedTitle: string, + text: string, +): Promise { + const dir = businessContextDir(knowledgeId, commitHash, sanitizedTitle); + await mkdir(dir, { recursive: true, mode: DIR_MODE }); + const filePath = path.join(dir, "original.txt"); + await writeFile(filePath, text, { encoding: "utf-8", mode: 0o600 }); + logger.info(`business-context: saved original text at ${filePath} (${text.length} chars)`); + return filePath; +} diff --git a/packages/ingest-business-context/src/errors.ts b/packages/ingest-business-context/src/errors.ts new file mode 100644 index 0000000..4af4339 --- /dev/null +++ b/packages/ingest-business-context/src/errors.ts @@ -0,0 +1,33 @@ +/** + * Thrown when the worker is asked to attach a business context to a commit + * whose files have not been indexed. The HTTP layer maps this to a 409. + */ +export class CommitNotIndexedError extends Error { + readonly knowledgeId: string; + readonly commitHash: string; + + constructor(knowledgeId: string, commitHash: string) { + super(`Commit ${commitHash.substring(0, 12)} is not indexed for knowledge ${knowledgeId}`); + this.name = "CommitNotIndexedError"; + this.knowledgeId = knowledgeId; + this.commitHash = commitHash; + } +} + +/** + * Thrown when every LLM analysis call returns null (no usable JSON). Distinct + * from upstream LLM errors (rate limits, transport) which propagate as-is. 
+ */ +export class BusinessContextAnalysisFailedError extends Error { + readonly knowledgeId: string; + readonly commitHash: string; + + constructor(knowledgeId: string, commitHash: string) { + super( + `All parallel LLM analysis calls returned null for knowledge ${knowledgeId} @ ${commitHash.substring(0, 12)}`, + ); + this.name = "BusinessContextAnalysisFailedError"; + this.knowledgeId = knowledgeId; + this.commitHash = commitHash; + } +} diff --git a/packages/ingest-business-context/src/field-defs.ts b/packages/ingest-business-context/src/field-defs.ts new file mode 100644 index 0000000..d3b7bec --- /dev/null +++ b/packages/ingest-business-context/src/field-defs.ts @@ -0,0 +1,203 @@ +/** + * Single source of truth for the LLM analysis schema. Each entry defines a + * field's expected type, the human-readable description shown to the LLM, + * special instructions that constrain output, whether the field is requested + * from the LLM (vs. populated by the pipeline), and an example value that + * appears in the prompt template. + * + * Changing any value here propagates to the prompt builders and the validation + * paths; nothing else needs to update. + */ +export interface BusinessContextFieldDef { + type: string; + description: string; + special_instructions: string; + requestedFromLLM: boolean; + example: string; +} + +const _FIELD_DEFS = { + // ── Product People Fields ───────────────────────────────────────────────── + + title: { + type: "string", + description: "Concise, descriptive title for this business context entry", + special_instructions: + "Max 50 words. Should be immediately recognizable to a product manager scanning a list. No technical jargon.", + requestedFromLLM: true, + example: '"Stripe Payment Processing Integration"', + }, + product_area: { + type: "string", + description: "Which product domain or area this context describes", + special_instructions: + "One or two words identifying the product area. Use standard product terminology. 
Empty string if unclear.", + requestedFromLLM: true, + example: '"Payments"', + }, + user_stories: { + type: "string[]", + description: 'User needs this context addresses, each in "As a [role], I want [goal]" format', + special_instructions: + "Max 5 stories. Each must follow the As a / I want pattern. Derive from the text, do not invent needs not mentioned.", + requestedFromLLM: true, + example: + '["As a customer, I want to pay with my saved card so checkout is faster", "As a finance team member, I want transaction reconciliation reports"]', + }, + business_value: { + type: "string", + description: "What measurable value this provides to the business", + special_instructions: + "2-3 sentences max. Focus on revenue, cost, risk, or user satisfaction impact. No technical implementation details.", + requestedFromLLM: true, + example: + '"Reduces checkout abandonment by 15% through one-click payments. Directly impacts monthly recurring revenue and customer retention metrics."', + }, + stakeholders: { + type: "string[]", + description: "Roles or teams who care about this context", + special_instructions: + "Max 6 entries. Use role titles not individual names. Include both business and technical stakeholders mentioned or implied.", + requestedFromLLM: true, + example: '["Product Manager", "Payments Team", "Finance", "Customer Support"]', + }, + success_metrics: { + type: "string[]", + description: "How success is measured for this business context", + special_instructions: + "Max 5 metrics. Each should be a measurable outcome, not a vague goal. Derive from text, infer reasonable metrics if not stated explicitly.", + requestedFromLLM: true, + example: + '["Checkout conversion rate > 85%", "Payment processing latency < 2s", "Zero failed transactions due to integration errors"]', + }, + user_impact: { + type: "string", + description: "How end users are affected, in plain language", + special_instructions: "2-3 sentences. Describe the before/after for the end user. 
No technical jargon.", + requestedFromLLM: true, + example: + '"Users can now complete purchases in under 30 seconds with saved payment methods. Previously, re-entering card details on every purchase caused significant drop-off."', + }, + domain_keywords: { + type: "string[]", + description: "Business domain search terms for cross-repo discoverability", + special_instructions: + "Max 10 keywords. Business language only — no code identifiers. Think: what would a product person search for?", + requestedFromLLM: true, + example: '["payments", "checkout", "subscription", "billing", "revenue", "PCI compliance"]', + }, + + // ── Developer Fields ────────────────────────────────────────────────────── + + technical_summary: { + type: "string", + description: "What the code actually does at a technical level", + special_instructions: + "3-5 sentences. Include architecture pattern, key technologies, and data stores involved. This is for senior engineers.", + requestedFromLLM: true, + example: + '"Implements a Stripe webhook handler using Express middleware that processes payment_intent events. Uses idempotency keys stored in Redis to prevent duplicate processing. Failed webhooks are retried via a BullMQ dead-letter queue with exponential backoff."', + }, + affected_modules: { + type: "string[]", + description: "Which parts of the codebase are involved (folder paths or module names)", + special_instructions: + "Max 10 entries. Use folder-level paths (e.g., src/payments/) or module names. Derive from context, do not guess paths not mentioned.", + requestedFromLLM: true, + example: '["src/payments/", "src/webhooks/stripe/", "src/queue/workers/payment-processor"]', + }, + architecture_decisions: { + type: "string[]", + description: 'Key technical choices, each as "Decision: X — Rationale: Y"', + special_instructions: + "Max 5 entries. 
Focus on decisions that would surprise a new developer or that have non-obvious rationale.", + requestedFromLLM: true, + example: + '["Decision: Use webhook-based flow instead of polling — Rationale: Stripe recommends webhooks for reliability", "Decision: Redis idempotency keys with 24h TTL — Rationale: Stripe may retry webhooks for up to 24 hours"]', + }, + dependencies: { + type: "string[]", + description: "Systems, services, or libraries this relies on", + special_instructions: + "Max 8 entries. Include both internal services and external dependencies. Format: 'name (type)' e.g., 'Stripe API (external)', 'Redis (cache)'.", + requestedFromLLM: true, + example: '["Stripe API (external)", "Redis (cache/idempotency)", "BullMQ (queue)", "PostgreSQL (transactions)"]', + }, + risk_areas: { + type: "string[]", + description: "What could go wrong — known fragilities, operational concerns", + special_instructions: "Max 5 entries. Be specific about failure modes. Include both technical and business risks.", + requestedFromLLM: true, + example: + '["Stripe webhook signing secret rotation requires coordinated deploy", "Redis downtime causes duplicate payment processing"]', + }, + data_flow: { + type: "string", + description: "How data moves through the system for this business context", + special_instructions: + "Describe the flow in plain English with arrow notation. Max 3-4 sentences. Include entry points, transforms, and storage.", + requestedFromLLM: true, + example: + '"User submits payment → Stripe processes charge → Webhook hits /api/webhooks/stripe → Handler validates signature → Event queued in BullMQ → Worker updates order status in PostgreSQL → Confirmation email sent via SendGrid."', + }, + api_surface: { + type: "string[]", + description: "APIs exposed or consumed", + special_instructions: + 'Max 8 entries. Format exposed as "METHOD /path — description". 
Format consumed as "Consumes: service.endpoint — purpose".', + requestedFromLLM: true, + example: + '["POST /api/webhooks/stripe — Receives Stripe webhook events", "GET /api/payments/:id — Retrieve payment status", "Consumes: Stripe PaymentIntents API — Create and confirm charges"]', + }, + + // ── Shared Fields ───────────────────────────────────────────────────────── + + summary: { + type: "string", + description: "2-3 sentence overview combining both business and technical perspectives", + special_instructions: + "First sentence: business context. Second sentence: technical approach. Optional third: key constraint or trade-off. Max 100 tokens.", + requestedFromLLM: true, + example: + '"Enables one-click checkout by integrating Stripe payment processing with saved card tokens. Implemented as an event-driven pipeline using webhooks and BullMQ for reliable async processing. Designed for PCI compliance with zero card data touching our servers."', + }, + keywords: { + type: "string[]", + description: "Searchable terms covering both business and technical vocabulary", + special_instructions: + "Max 15 keywords. Mix of business terms (from domain_keywords) and technical terms. 
No duplicates across domain_keywords and keywords.", + requestedFromLLM: true, + example: '["stripe", "webhook", "payment-intent", "idempotency", "BullMQ", "checkout", "PCI", "async-processing"]', + }, +} as const; + +export const BUSINESS_CONTEXT_FIELD_DEFS: Record = _FIELD_DEFS; + +export const PRODUCT_FIELDS: readonly string[] = [ + "title", + "product_area", + "user_stories", + "business_value", + "stakeholders", + "success_metrics", + "user_impact", + "domain_keywords", +]; + +export const TECHNICAL_FIELDS: readonly string[] = [ + "technical_summary", + "affected_modules", + "architecture_decisions", + "dependencies", + "risk_areas", + "data_flow", + "api_surface", +]; + +export const SHARED_FIELDS: readonly string[] = ["summary", "keywords"]; + +export const LLM_FIELD_NAMES: readonly string[] = Object.entries(_FIELD_DEFS) + .filter(([, def]) => def.requestedFromLLM) + .map(([name]) => name); + +export const LLM_FIELD_NAME_SET: ReadonlySet = new Set(LLM_FIELD_NAMES); diff --git a/packages/ingest-business-context/src/index.ts b/packages/ingest-business-context/src/index.ts new file mode 100644 index 0000000..79c6422 --- /dev/null +++ b/packages/ingest-business-context/src/index.ts @@ -0,0 +1,33 @@ +// Public API for @bb/ingest-business-context. 
+ +export { registerBusinessContextWorker } from "./worker/register.ts"; +export { handleBusinessContextProcessing } from "./worker/handler.ts"; + +export { executeBusinessContextStrategy } from "./strategy/execute.ts"; +export type { ExecuteOptions } from "./strategy/execute.ts"; +export { storeBusinessContextToNeo4j } from "./strategy/store-graph.ts"; +export type { StoreGraphInput } from "./strategy/store-graph.ts"; +export { assertCommitIndexed, checkCommitIndexed } from "./strategy/commit-validator.ts"; +export type { CommitIndexStatus } from "./strategy/commit-validator.ts"; + +export { BUSINESS_CONTEXT_FIELD_DEFS, LLM_FIELD_NAMES, LLM_FIELD_NAME_SET } from "./field-defs.ts"; +export type { BusinessContextFieldDef } from "./field-defs.ts"; + +export { BUSINESS_CONTEXT_KEYWORD_TYPES } from "./neo4j/relationship-types.ts"; +export { ensureBusinessContextIndexes } from "./neo4j/indexes.ts"; + +export { sanitizeTitle } from "./disk/sanitize-title.ts"; +export { loadCachedAnalysis } from "./disk/load-cached.ts"; + +export { CommitNotIndexedError, BusinessContextAnalysisFailedError } from "./errors.ts"; + +export type { + BusinessContextAnalysis, + BusinessContextAnalysisMetadata, + BusinessContextInput, + BusinessContextLlmOptions, + BusinessContextNeo4jResult, + BusinessContextStorageResult, + TitleGenerationResult, + AnalysisResult, +} from "./types.ts"; diff --git a/packages/ingest-business-context/src/llm/README.md b/packages/ingest-business-context/src/llm/README.md new file mode 100644 index 0000000..5cd3f94 --- /dev/null +++ b/packages/ingest-business-context/src/llm/README.md @@ -0,0 +1,16 @@ +# `llm/` — context + +LLM-driven analysis. All calls flow through `@bb/llm` (`askJsonLLM`). Per-job +overrides (apiKey, provider, model) come in via the worker payload and are +applied here. 
+ +| File | Responsibility | +| ---------------------- | ------------------------------------------------------------------------------------------ | +| `enrichment-reader.ts` | Reads optional org-level registries and repo-summary from disk. Tolerant of missing files. | +| `enrichment-format.ts` | Renders enrichment data into a per-focus prompt section with a token cap. | +| `call-builder.ts` | Composes one analysis call (system+user) and trims enrichment if over budget. | +| `merge.ts` | Merges three partial blobs into one fully-populated `BusinessContextAnalysis`. | +| `title.ts` | Title-generation call. Returns the fallback "Untitled Business Context" on null. | +| `analyze-parallel.ts` | Runs the 3 analysis calls concurrently and merges results. | + +The package never imports OpenAI / Anthropic SDKs. Only `@bb/llm`. diff --git a/packages/ingest-business-context/src/llm/analyze-parallel.ts b/packages/ingest-business-context/src/llm/analyze-parallel.ts new file mode 100644 index 0000000..cfdb60c --- /dev/null +++ b/packages/ingest-business-context/src/llm/analyze-parallel.ts @@ -0,0 +1,111 @@ +import { askJsonLLM, type AskJsonLlmOptions, type LlmProviderName, tokenLen } from "@bb/llm"; +import { logger } from "@bb/logger"; +import { PRODUCT_FIELDS, SHARED_FIELDS, TECHNICAL_FIELDS } from "#src/field-defs.ts"; +import { buildAnalysisPromptForCall } from "#src/llm/call-builder.ts"; +import type { EnrichmentData } from "#src/llm/enrichment-reader.ts"; +import type { EnrichmentFocus } from "#src/llm/enrichment-format.ts"; +import { mergeAnalysisFields } from "#src/llm/merge.ts"; +import type { AnalysisResult, BusinessContextAnalysis, BusinessContextLlmOptions } from "#src/types.ts"; + +const MAX_CONTEXT_WINDOW = 50_000; +const KNOWN_PROVIDERS: ReadonlySet = new Set(["openrouter", "ollama"]); + +interface AnalysisCall { + name: string; + fields: readonly string[]; + focus: EnrichmentFocus; +} + +const CALLS: readonly AnalysisCall[] = [ + { name: "product", fields: 
/**
 * The three focused analysis passes. Each entry carries the label used in log
 * lines, the field names requested from the model, and the enrichment focus
 * handed to the prompt builder.
 */
const CALLS: readonly AnalysisCall[] = [
  { name: "product", fields: PRODUCT_FIELDS, focus: "product" },
  { name: "technical", fields: TECHNICAL_FIELDS, focus: "technical" },
  { name: "shared", fields: SHARED_FIELDS, focus: "shared" },
];

/**
 * Translates the per-job LLM overrides into `askJsonLLM` options.
 *
 * Only overrides that are actually present are copied, so absent ones are
 * left for `askJsonLLM` to resolve. A provider outside `KNOWN_PROVIDERS` is
 * dropped rather than forwarded. Analysis calls always use `maxRetries: 3`.
 */
function buildLlmOpts(options: BusinessContextLlmOptions): AskJsonLlmOptions {
  const { apiKey, model, provider } = options;
  const llmOpts: AskJsonLlmOptions = { maxRetries: 3 };

  if (apiKey !== undefined) {
    llmOpts.apiKey = apiKey;
  }
  if (model !== undefined) {
    llmOpts.model = model;
  }

  const providerIsKnown = provider !== undefined && KNOWN_PROVIDERS.has(provider);
  if (providerIsKnown) {
    // The set membership check above is what justifies the assertion.
    llmOpts.provider = provider as LlmProviderName;
  }
  return llmOpts;
}

/**
 * Executes a single analysis call: builds the prompt pair (budgeted against
 * `MAX_CONTEXT_WINDOW`), logs the estimated prompt size, asks the LLM for
 * JSON, and flattens the response into the partial result plus usage figures.
 *
 * @returns `result` is `null` when the LLM call yielded nothing usable; the
 *          usage counters are reported either way.
 */
async function runOneCall(
  call: AnalysisCall,
  text: string,
  title: string,
  enrichment: EnrichmentData,
  baseOpts: AskJsonLlmOptions,
): Promise<{
  result: Partial<BusinessContextAnalysis> | null;
  model: string;
  inputTokens: number;
  outputTokens: number;
}> {
  const prompt = buildAnalysisPromptForCall(call, text, title, enrichment, MAX_CONTEXT_WINDOW);
  const promptTokens = tokenLen(prompt.systemPrompt) + tokenLen(prompt.userMessage);
  logger.info(`business-context: call "${call.name}" ~${promptTokens} tokens, ${call.fields.length} fields`);

  const response = await askJsonLLM<Partial<BusinessContextAnalysis>>(
    prompt.systemPrompt,
    prompt.userMessage,
    baseOpts,
  );
  const { model, inputTokens, outputTokens } = response.usage;
  return { result: response.result, model, inputTokens, outputTokens };
}
/**
 * Fires the product, technical and shared analysis calls concurrently and
 * folds whatever came back into one `BusinessContextAnalysis`.
 *
 * Token usage is summed over every call, including those whose result was
 * `null`. The reported model name is the first non-empty one seen. Partial
 * results are shallow-merged in `CALLS` order, so a later call overwrites a
 * key an earlier call also emitted.
 *
 * @returns `analysis: null` only when every call returned `null`; otherwise
 *          the merged analysis with defaults for any missing field.
 */
export async function analyzeBusinessContextParallel(
  text: string,
  title: string,
  enrichment: EnrichmentData,
  options: BusinessContextLlmOptions,
): Promise<AnalysisResult> {
  const llmOpts = buildLlmOpts(options);
  const outcomes = await Promise.all(CALLS.map((call) => runOneCall(call, text, title, enrichment, llmOpts)));

  const merged: Record<string, unknown> = {};
  let inputTokens = 0;
  let outputTokens = 0;
  let modelName = "";
  let succeeded = 0;

  for (const [index, outcome] of outcomes.entries()) {
    const callName = CALLS[index]?.name ?? "?";

    inputTokens += outcome.inputTokens;
    outputTokens += outcome.outputTokens;
    if (modelName.length === 0 && outcome.model.length > 0) {
      modelName = outcome.model;
    }

    if (outcome.result === null) {
      logger.warn(`business-context: call "${callName}" returned null — fields will use defaults`);
      continue;
    }

    succeeded += 1;
    Object.assign(merged, outcome.result);
    logger.info(`business-context: call "${callName}" done (${outcome.inputTokens} in / ${outcome.outputTokens} out)`);
  }

  if (succeeded === 0) {
    return { analysis: null, inputTokens, outputTokens, modelName };
  }
  return { analysis: mergeAnalysisFields(merged, title), inputTokens, outputTokens, modelName };
}
/** Minimal description of one analysis call, as needed to build its prompt. */
export interface AnalysisCallShape {
  /** Label used in log lines. */
  name: string;
  /** Field names the system prompt asks the model to produce. */
  fields: readonly string[];
  /** Which slice of the enrichment data is rendered into the user message. */
  focus: EnrichmentFocus;
}

/** The system/user prompt pair handed to the LLM. */
export interface BuiltCall {
  systemPrompt: string;
  userMessage: string;
}

/**
 * Builds the prompt pair for a single analysis call.
 *
 * If the combined system+user token estimate exceeds `maxContextWindow`, the
 * enrichment section is the only part that is cut. The cut is sized from the
 * actual overflow, so enrichment shrinks by at least the number of tokens the
 * prompt is over budget (previously it was scaled by budget/total, which left
 * the prompt over budget whenever the text dominated). When the text and
 * system prompt alone exceed the budget, enrichment is removed entirely and a
 * second warning records that the prompt is still too large.
 *
 * @param call             Which fields and enrichment focus to use.
 * @param text             The raw business-context text to analyse.
 * @param title            Pre-generated title passed into the user message.
 * @param enrichment       Optional repo/org context.
 * @param maxContextWindow Token budget for system prompt plus user message.
 * @returns The system prompt and the (possibly trimmed) user message.
 */
export function buildAnalysisPromptForCall(
  call: AnalysisCallShape,
  text: string,
  title: string,
  enrichment: EnrichmentData,
  maxContextWindow: number,
): BuiltCall {
  // Character counts only approximate token counts, so keep a margin below
  // the computed allowance (same 0.8 factor the original heuristic used).
  const safetyMargin = 0.8;

  const systemPrompt = buildPartialAnalysisPrompt(call.fields);
  const systemTokens = tokenLen(systemPrompt);
  const enrichmentSection = buildEnrichmentSection(enrichment, call.focus);
  const userMessage = buildEnrichedUserMessage(text, title, enrichmentSection);
  const totalTokens = systemTokens + tokenLen(userMessage);

  if (totalTokens <= maxContextWindow || enrichmentSection.length === 0) {
    return { systemPrompt, userMessage };
  }

  const enrichmentTokens = tokenLen(enrichmentSection);
  const overflowTokens = totalTokens - maxContextWindow;
  // Tokens the enrichment may keep once the overflow is taken out of it.
  const keepTokens = Math.max(0, enrichmentTokens - overflowTokens);
  const keepRatio = enrichmentTokens > 0 ? (keepTokens / enrichmentTokens) * safetyMargin : 0;

  const trimmedSection = enrichmentSection.slice(0, Math.floor(enrichmentSection.length * keepRatio));
  const trimmedMessage = buildEnrichedUserMessage(text, title, trimmedSection);
  const trimmedTotal = systemTokens + tokenLen(trimmedMessage);
  logger.warn(`business-context: call "${call.name}" trimmed enrichment to ~${trimmedTotal} tokens`);

  if (trimmedTotal > maxContextWindow) {
    // Nothing left to trim here: the text itself does not fit the budget.
    logger.warn(
      `business-context: call "${call.name}" still ~${trimmedTotal} tokens after trimming (budget ${maxContextWindow})`,
    );
  }

  return { systemPrompt, userMessage: trimmedMessage };
}

/** Token cap applied to a rendered enrichment section (enrichment-format.ts). */
const MAX_ENRICHMENT_TOKENS = 15_000;
/** Which audience a rendered enrichment section is aimed at. */
export type EnrichmentFocus = "product" | "technical" | "shared";

/** Renders keyword/count pairs one per line as ` keyword (count)`. */
function formatEntries(entries: readonly KeywordCount[]): string {
  const lines: string[] = [];
  for (const { keyword, count } of entries) {
    lines.push(` ${keyword} (${count})`);
  }
  return lines.join("\n");
}

/**
 * Appends the business-facing sections (keywords, business entities, ontology
 * concepts, major subsystems) to `sections`, skipping any that are empty.
 */
function appendProductSection(enrichment: EnrichmentData, sections: string[]): void {
  const countedSections: ReadonlyArray<readonly [string, readonly KeywordCount[]]> = [
    ["TOP REPOSITORY KEYWORDS (by frequency)", enrichment.topKeywords],
    ["TOP BUSINESS ENTITIES", enrichment.topBusinessEntities],
    ["TOP ONTOLOGY CONCEPTS", enrichment.topOntologyConcepts],
  ];
  for (const [heading, entries] of countedSections) {
    if (entries.length > 0) {
      sections.push(`${heading}:\n${formatEntries(entries)}`);
    }
  }

  const subsystems = enrichment.majorSubsystems;
  if (subsystems.length > 0) {
    const rendered = subsystems.map((subsystem) => ` ${subsystem.name}: ${subsystem.responsibility}`);
    sections.push(`MAJOR SUBSYSTEMS:\n${rendered.join("\n")}`);
  }
}

/**
 * Appends the engineering-facing sections to `sections` in a fixed order:
 * architecture, data flow, key patterns, then the counted registries
 * (integration surface, contracts, side effects, config dependencies, system
 * capabilities). Empty inputs produce no section.
 */
function appendTechnicalSection(enrichment: EnrichmentData, sections: string[]): void {
  const architecture = enrichment.repoArchitecture;
  if (architecture.length > 0) {
    sections.push(`REPOSITORY ARCHITECTURE:\n${architecture}`);
  }

  const dataFlow = enrichment.repoDataFlow;
  if (dataFlow.length > 0) {
    sections.push(`DATA FLOW:\n${dataFlow}`);
  }

  const patterns = enrichment.repoKeyPatterns;
  if (patterns.length > 0) {
    sections.push(`KEY PATTERNS:\n ${patterns.join(", ")}`);
  }

  const countedSections: ReadonlyArray<readonly [string, readonly KeywordCount[]]> = [
    ["INTEGRATION SURFACE", enrichment.integrationSurface],
    ["CONTRACTS PROVIDED", enrichment.contractsProvided],
    ["CONTRACTS CONSUMED", enrichment.contractsConsumed],
    ["SIDE EFFECTS", enrichment.sideEffects],
    ["CONFIG DEPENDENCIES", enrichment.configDependencies],
    ["SYSTEM CAPABILITIES", enrichment.topSystemCapabilities],
  ];
  for (const [heading, entries] of countedSections) {
    if (entries.length > 0) {
      sections.push(`${heading}:\n${formatEntries(entries)}`);
    }
  }
}

/**
 * Renders enrichment data into the text block for one LLM call.
 *
 * `"product"` gets the business sections, `"technical"` the engineering
 * sections, and `"shared"` both (business first). Sections are separated by a
 * blank line. When there is nothing to render the result is `""`.
 *
 * If the rendered text exceeds `MAX_ENRICHMENT_TOKENS`, it is cut by
 * character count in proportion to the token overshoot — an approximation,
 * not an exact token cap.
 */
export function buildEnrichmentSection(enrichment: EnrichmentData, focus: EnrichmentFocus): string {
  const parts: string[] = [];
  switch (focus) {
    case "product":
      appendProductSection(enrichment, parts);
      break;
    case "technical":
      appendTechnicalSection(enrichment, parts);
      break;
    case "shared":
      appendProductSection(enrichment, parts);
      appendTechnicalSection(enrichment, parts);
      break;
  }

  const rendered = parts.join("\n\n");
  if (rendered.length === 0) {
    return "";
  }

  const tokens = tokenLen(rendered);
  if (tokens <= MAX_ENRICHMENT_TOKENS) {
    return rendered;
  }

  logger.info(`business-context: enrichment (${focus}) ${tokens} tokens > cap ${MAX_ENRICHMENT_TOKENS}; truncating`);
  const keepChars = Math.floor(rendered.length * (MAX_ENRICHMENT_TOKENS / tokens));
  return rendered.slice(0, keepChars);
}

/** Maximum entries kept per org registry file (enrichment-reader.ts). */
const TOP_N = 50;
/**
 * Org-level keyword registry files the reader probes for. Per the original
 * note, none of these are produced by OSS today; downstream multi-tenant
 * deployments may produce them by aggregating across all knowledges in an
 * org. A missing file is normal and degrades silently to empty data.
 */
type OrgRegistryFile =
  | "keywords.json"
  | "business-entities.json"
  | "ontology-concepts.json"
  | "system-capabilities.json"
  | "integration-surface.json"
  | "contracts-provided.json"
  | "contracts-consumed.json"
  | "side-effects.json"
  | "config-dependencies.json";

/** One registry entry: a term and how often it was counted. */
export interface KeywordCount {
  keyword: string;
  count: number;
}

/** Everything the enrichment reader can contribute to a prompt. */
export interface EnrichmentData {
  topKeywords: KeywordCount[];
  topBusinessEntities: KeywordCount[];
  topOntologyConcepts: KeywordCount[];
  topSystemCapabilities: KeywordCount[];
  integrationSurface: KeywordCount[];
  contractsProvided: KeywordCount[];
  contractsConsumed: KeywordCount[];
  sideEffects: KeywordCount[];
  configDependencies: KeywordCount[];
  repoArchitecture: string;
  repoDataFlow: string;
  repoKeyPatterns: string[];
  majorSubsystems: Array<{ name: string; responsibility: string }>;
}

/**
 * Returns a fresh `EnrichmentData` with every list empty and every string
 * blank. Each call allocates new arrays, so results can be mutated freely.
 */
export function emptyEnrichment(): EnrichmentData {
  const blank: EnrichmentData = {
    topKeywords: [],
    topBusinessEntities: [],
    topOntologyConcepts: [],
    topSystemCapabilities: [],
    integrationSurface: [],
    contractsProvided: [],
    contractsConsumed: [],
    sideEffects: [],
    configDependencies: [],
    repoArchitecture: "",
    repoDataFlow: "",
    repoKeyPatterns: [],
    majorSubsystems: [],
  };
  return blank;
}

/**
 * Reads and parses a JSON file, returning `null` instead of throwing when the
 * file cannot be read or does not contain valid JSON.
 */
async function readJsonSafe(filePath: string): Promise<unknown> {
  let raw: string;
  try {
    raw = await readFile(filePath, "utf-8");
  } catch {
    // Unreadable or missing file: treated as "no data".
    return null;
  }

  try {
    return JSON.parse(raw) as unknown;
  } catch {
    // Malformed JSON is treated the same way as a missing file.
    return null;
  }
}
/**
 * Loads one org registry file and returns its entries sorted by descending
 * count, capped at `TOP_N`.
 *
 * The file is expected to hold a JSON object mapping keyword → count. Values
 * that are not numbers are skipped. A missing, unparseable, or non-object
 * file yields `[]`. JSON arrays are rejected explicitly, because
 * `typeof [] === "object"` and a bare array of numbers would otherwise be
 * read as keywords "0", "1", ….
 */
async function readOrgRegistry(dir: string, file: OrgRegistryFile): Promise<KeywordCount[]> {
  const data = await readJsonSafe(path.join(dir, file));
  if (data === null || typeof data !== "object" || Array.isArray(data)) {
    return [];
  }

  const entries: KeywordCount[] = [];
  for (const [keyword, count] of Object.entries(data as Record<string, unknown>)) {
    if (typeof count === "number") {
      entries.push({ keyword, count });
    }
  }
  entries.sort((a, b) => b.count - a.count);
  return entries.slice(0, TOP_N);
}

/** Loosely-typed view of `repo-summary.json`; every field is validated on read. */
interface RepoSummaryShape {
  architecture?: string;
  dataFlow?: string;
  keyPatterns?: unknown;
  majorSubsystems?: unknown;
}

/**
 * Copies whatever is usable from `repo-summary.json` (under the knowledge's
 * meta root) onto `enrichment`, in place.
 *
 * The summary may sit under a top-level `repoSummary` key or be the document
 * root itself. Fields of the wrong type are ignored, leaving the existing
 * value on `enrichment` untouched. Subsystems without a non-empty string
 * `name` are dropped; a missing `responsibility` becomes `""`.
 */
async function readRepoSummary(knowledgeId: string, enrichment: EnrichmentData): Promise<void> {
  const repoSummaryJson = path.join(metaRootFor(knowledgeId), "repo-summary.json");
  const data = await readJsonSafe(repoSummaryJson);
  if (data === null || typeof data !== "object") {
    return;
  }

  // Accept both `{ repoSummary: {...} }` and a bare summary object.
  const rs = ((data as { repoSummary?: RepoSummaryShape }).repoSummary ?? data) as RepoSummaryShape;

  if (typeof rs.architecture === "string") {
    enrichment.repoArchitecture = rs.architecture;
  }
  if (typeof rs.dataFlow === "string") {
    enrichment.repoDataFlow = rs.dataFlow;
  }
  if (Array.isArray(rs.keyPatterns)) {
    enrichment.repoKeyPatterns = rs.keyPatterns.filter((p): p is string => typeof p === "string");
  }
  if (Array.isArray(rs.majorSubsystems)) {
    enrichment.majorSubsystems = rs.majorSubsystems
      .filter((s): s is { name?: unknown; responsibility?: unknown } => typeof s === "object" && s !== null)
      .map((s) => ({
        name: typeof s.name === "string" ? s.name : "",
        responsibility: typeof s.responsibility === "string" ? s.responsibility : "",
      }))
      .filter((s) => s.name.length > 0);
  }
}
/**
 * Gathers all enrichment data for one knowledge from disk.
 *
 * The nine org registry files and the repo summary are independent of each
 * other, so they are read concurrently rather than one after another. Each
 * registry read resolves to `[]` when its file is missing or unusable, and
 * the repo summary leaves the blank defaults in place in that case.
 *
 * @param knowledgeId Knowledge whose meta root and registry directory are read.
 * @param orgId       Organisation used to locate the registry directory.
 * @returns A fully-populated `EnrichmentData` (possibly all-empty).
 */
export async function collectEnrichmentData(knowledgeId: string, orgId: string): Promise<EnrichmentData> {
  const enrichment = emptyEnrichment();
  const registryDir = orgRegistryDir(knowledgeId, orgId);

  const [
    topKeywords,
    topBusinessEntities,
    topOntologyConcepts,
    topSystemCapabilities,
    integrationSurface,
    contractsProvided,
    contractsConsumed,
    sideEffects,
    configDependencies,
  ] = await Promise.all([
    readOrgRegistry(registryDir, "keywords.json"),
    readOrgRegistry(registryDir, "business-entities.json"),
    readOrgRegistry(registryDir, "ontology-concepts.json"),
    readOrgRegistry(registryDir, "system-capabilities.json"),
    readOrgRegistry(registryDir, "integration-surface.json"),
    readOrgRegistry(registryDir, "contracts-provided.json"),
    readOrgRegistry(registryDir, "contracts-consumed.json"),
    readOrgRegistry(registryDir, "side-effects.json"),
    readOrgRegistry(registryDir, "config-dependencies.json"),
    // Writes only the repo* / majorSubsystems fields, which the registry
    // assignments below never touch, so running it alongside is safe.
    readRepoSummary(knowledgeId, enrichment),
  ]);

  enrichment.topKeywords = topKeywords;
  enrichment.topBusinessEntities = topBusinessEntities;
  enrichment.topOntologyConcepts = topOntologyConcepts;
  enrichment.topSystemCapabilities = topSystemCapabilities;
  enrichment.integrationSurface = integrationSurface;
  enrichment.contractsProvided = contractsProvided;
  enrichment.contractsConsumed = contractsConsumed;
  enrichment.sideEffects = sideEffects;
  enrichment.configDependencies = configDependencies;

  logger.info(
    `business-context: enrichment loaded — ${enrichment.topKeywords.length} kw, ${enrichment.topBusinessEntities.length} entities, architecture=${enrichment.repoArchitecture.length > 0}, subsystems=${enrichment.majorSubsystems.length}`,
  );
  return enrichment;
}
/** Returns `value` when it is a string, otherwise `fallback` (default `""`). */
function takeString(value: unknown, fallback = ""): string {
  return typeof value === "string" ? value : fallback;
}

/** Returns the string elements of `value`, or `[]` when it is not an array. */
function takeStringArray(value: unknown): string[] {
  if (!Array.isArray(value)) {
    return [];
  }
  return value.filter((v): v is string => typeof v === "string");
}

/**
 * Builds a fully-populated `BusinessContextAnalysis` from the shallow-merged
 * output of the product, technical and shared calls.
 *
 * Every field is type-checked: string fields that are missing or not strings
 * become `""`, and array fields keep only their string elements (or become
 * `[]`).
 *
 * The title falls back to `fallbackTitle` when the model emitted no title or
 * a blank one. An empty or whitespace-only title is treated as "not emitted",
 * matching how the title-generation call treats blank titles.
 *
 * @param merged        Raw key/value output merged from the LLM calls.
 * @param fallbackTitle Pre-generated title used when `merged.title` is unusable.
 */
export function mergeAnalysisFields(merged: Record<string, unknown>, fallbackTitle: string): BusinessContextAnalysis {
  const emittedTitle = takeString(merged["title"]);
  const title = emittedTitle.trim().length > 0 ? emittedTitle : fallbackTitle;

  return {
    title,
    product_area: takeString(merged["product_area"]),
    user_stories: takeStringArray(merged["user_stories"]),
    business_value: takeString(merged["business_value"]),
    stakeholders: takeStringArray(merged["stakeholders"]),
    success_metrics: takeStringArray(merged["success_metrics"]),
    user_impact: takeString(merged["user_impact"]),
    domain_keywords: takeStringArray(merged["domain_keywords"]),
    technical_summary: takeString(merged["technical_summary"]),
    affected_modules: takeStringArray(merged["affected_modules"]),
    architecture_decisions: takeStringArray(merged["architecture_decisions"]),
    dependencies: takeStringArray(merged["dependencies"]),
    risk_areas: takeStringArray(merged["risk_areas"]),
    data_flow: takeString(merged["data_flow"]),
    api_surface: takeStringArray(merged["api_surface"]),
    summary: takeString(merged["summary"]),
    keywords: takeStringArray(merged["keywords"]),
  };
}
/** Title used when the model returns no usable title. */
const FALLBACK_TITLE = "Untitled Business Context";

/** Provider names that may be forwarded to `askJsonLLM`. */
const KNOWN_PROVIDERS: ReadonlySet<string> = new Set(["openrouter", "ollama"]);

/**
 * Translates the per-job LLM overrides into `askJsonLLM` options for the
 * title call (`maxRetries: 2`). Absent overrides are not set; a provider
 * outside `KNOWN_PROVIDERS` is dropped.
 */
function buildLlmOpts(options: BusinessContextLlmOptions): AskJsonLlmOptions {
  const { apiKey, model, provider } = options;
  const llmOpts: AskJsonLlmOptions = { maxRetries: 2 };

  if (apiKey !== undefined) {
    llmOpts.apiKey = apiKey;
  }
  if (model !== undefined) {
    llmOpts.model = model;
  }
  if (provider !== undefined && KNOWN_PROVIDERS.has(provider)) {
    // Membership in KNOWN_PROVIDERS is what justifies the assertion.
    llmOpts.provider = provider as LlmProviderName;
  }
  return llmOpts;
}

/**
 * Asks the LLM for a title for `text`.
 *
 * The title is the trimmed `title` string from the response. If the response
 * is `null`, has no string `title`, or the title is blank after trimming,
 * `FALLBACK_TITLE` is used instead so the rest of the pipeline can proceed.
 *
 * @returns The chosen title plus the token usage and model name of the call.
 */
export async function generateBusinessContextTitle(
  text: string,
  options: BusinessContextLlmOptions,
): Promise<TitleGenerationResult> {
  const systemPrompt = buildTitleGenerationPrompt();
  const { result, usage } = await askJsonLLM<{ title?: unknown }>(systemPrompt, text, buildLlmOpts(options));

  const rawTitle = result === null ? undefined : result.title;
  const candidate = typeof rawTitle === "string" ? rawTitle.trim() : "";
  const title = candidate.length > 0 ? candidate : FALLBACK_TITLE;

  logger.info(
    `business-context: title generated — "${title}" (model=${usage.model}, ${usage.inputTokens} in / ${usage.outputTokens} out)`,
  );

  return {
    title,
    inputTokens: usage.inputTokens,
    outputTokens: usage.outputTokens,
    modelName: usage.model,
  };
}
All writes go through +`@bb/neo4j`'s `runCypher` — no driver imports here. + +## Schema + +``` +(:Knowledge {knowledgeId}) + -[:HAS_BUSINESS_CONTEXT]-> + (:BusinessContext {nodeId, knowledgeId, orgId, title, productArea, summary, + businessValue, technicalSummary, userImpact, + keywordsText, domainKeywordsText, updatedAt}) + -[:HAS_VERSION]-> + (:BusinessContextVersion {knowledgeId, nodeId, commitHash, orgId, + analysisJson, updatedAt}) + -[:DESCRIBES]-> (:FileVersion {knowledgeId, commitHash, …}) [zero or more] + +(:OrgKeyword {orgId, keyword, type}) + -[:APPEARS_IN_BUSINESS_CONTEXT]-> + (:BusinessContext) +``` + +| File | Responsibility | +| ----------------------- | ---------------------------------------------------------------------------- | +| `indexes.ts` | `ensureBusinessContextIndexes()` — 7 `IF NOT EXISTS` indexes. | +| `relationship-types.ts` | Field → relationship-class map (10 typed classes on `:OrgKeyword`). | +| `serialize.ts` | `string[] → "a \| b \| c"` for fulltext-friendly properties. | +| `write-node.ts` | Merges the parent `:BusinessContext` and links it from `:Knowledge`. | +| `write-version.ts` | Merges the per-commit `:BusinessContextVersion` and links to `:FileVersion`. | +| `write-keywords.ts` | Merges `:OrgKeyword` nodes and `:APPEARS_IN_BUSINESS_CONTEXT` edges. | + +Every MERGE is keyed so re-runs are no-ops (idempotency is the contract). 
/** DDL for the seven indexes; every statement is guarded by `IF NOT EXISTS`. */
const INDEX_DEFINITIONS: readonly string[] = [
  "CREATE INDEX business_context_by_knowledge IF NOT EXISTS FOR (bc:BusinessContext) ON (bc.knowledgeId)",
  "CREATE INDEX business_context_by_node_id IF NOT EXISTS FOR (bc:BusinessContext) ON (bc.nodeId)",
  "CREATE INDEX business_context_by_org IF NOT EXISTS FOR (bc:BusinessContext) ON (bc.orgId)",
  "CREATE INDEX business_context_version_by_knowledge_commit IF NOT EXISTS FOR (bv:BusinessContextVersion) ON (bv.knowledgeId, bv.commitHash)",
  "CREATE INDEX business_context_version_by_node_commit IF NOT EXISTS FOR (bv:BusinessContextVersion) ON (bv.nodeId, bv.commitHash)",
  "CREATE INDEX org_keyword_by_org_keyword IF NOT EXISTS FOR (k:OrgKeyword) ON (k.orgId, k.keyword)",
  "CREATE INDEX org_keyword_by_type IF NOT EXISTS FOR (k:OrgKeyword) ON (k.type)",
];

/**
 * Issues every statement in `INDEX_DEFINITIONS`, one at a time and in order,
 * then logs that the indexes are in place. Because each statement carries
 * `IF NOT EXISTS`, calling this repeatedly is safe.
 */
export async function ensureBusinessContextIndexes(): Promise<void> {
  for (let i = 0; i < INDEX_DEFINITIONS.length; i += 1) {
    const statement = INDEX_DEFINITIONS[i];
    if (statement === undefined) {
      continue;
    }
    // Deliberately sequential: each DDL statement finishes before the next starts.
    await runCypher(statement);
  }
  logger.info("business-context: indexes ensured");
}

/**
 * Maps each array-valued field of `BusinessContextAnalysis` to the
 * relationship class recorded for its keywords (relationship-types.ts).
 *
 * The edge label itself is always `:APPEARS_IN_BUSINESS_CONTEXT`; the value
 * here is stored in the `type` property of the `:OrgKeyword` node so queries
 * can tell a stakeholder from a risk area from a dependency.
 */
export const BUSINESS_CONTEXT_KEYWORD_TYPES: Readonly<Record<string, string>> = {
  domain_keywords: "HAS_DOMAIN_KEYWORD",
  keywords: "HAS_KEYWORD",
  stakeholders: "HAS_STAKEHOLDER",
  affected_modules: "HAS_AFFECTED_MODULE",
  risk_areas: "HAS_RISK_AREA",
  api_surface: "HAS_API_SURFACE",
  dependencies: "HAS_DEPENDENCY",
  user_stories: "HAS_USER_STORY",
  success_metrics: "HAS_SUCCESS_METRIC",
  architecture_decisions: "HAS_ARCHITECTURE_DECISION",
};
/**
 * Flattens a list of strings into one `" | "`-delimited string suitable for a
 * single Neo4j property. Entries that are not strings, or that are blank
 * after trimming, are left out; kept entries are joined exactly as given
 * (they are not trimmed). An empty input yields `""`.
 */
export function serializeArrayForNeo4j(values: readonly string[]): string {
  const kept: string[] = [];
  for (const value of values) {
    if (typeof value !== "string") {
      continue;
    }
    if (value.trim().length === 0) {
      continue;
    }
    kept.push(value);
  }
  return kept.join(" | ");
}

/** Identifies the knowledge and organisation a keyword write belongs to. */
export interface BusinessContextKeywordIdentity {
  knowledgeId: string;
  orgId: string;
}

/**
 * For each word: merge the `:OrgKeyword` keyed by (orgId, keyword, type),
 * then merge its edge to the matching `:BusinessContext`.
 */
const MERGE_KEYWORDS = `
UNWIND $keywords AS kwData
MERGE (kw:OrgKeyword {orgId: $orgId, keyword: kwData.word, type: $relType})
WITH kw
MATCH (bc:BusinessContext {nodeId: $nodeId, knowledgeId: $knowledgeId})
MERGE (kw)-[:APPEARS_IN_BUSINESS_CONTEXT]->(bc)
RETURN count(*) AS count
`;

/**
 * Reads `field` from the analysis by name and returns its usable words:
 * string elements only, trimmed, with blanks removed. Anything that is not an
 * array yields `[]`.
 */
function pickArrayField(analysis: BusinessContextAnalysis, field: string): string[] {
  const raw = (analysis as unknown as Record<string, unknown>)[field];
  if (!Array.isArray(raw)) {
    return [];
  }

  const words: string[] = [];
  for (const item of raw) {
    if (typeof item !== "string") {
      continue;
    }
    const word = item.trim();
    if (word.length > 0) {
      words.push(word);
    }
  }
  return words;
}
/**
 * Writes the `:OrgKeyword` nodes for every populated array field of the
 * analysis and links them to the `:BusinessContext` identified by
 * `sanitizedTitle` (used as `nodeId`) and `identity.knowledgeId`.
 *
 * One `MERGE_KEYWORDS` statement is issued per relationship class in
 * `BUSINESS_CONTEXT_KEYWORD_TYPES`; fields with no usable words are skipped
 * without a round-trip.
 *
 * @returns The sum of the `count` values reported by each statement.
 */
export async function createBusinessContextKeywords(
  identity: BusinessContextKeywordIdentity,
  analysis: BusinessContextAnalysis,
  sanitizedTitle: string,
): Promise<number> {
  let edgeCount = 0;

  for (const [field, relType] of Object.entries(BUSINESS_CONTEXT_KEYWORD_TYPES)) {
    const words = pickArrayField(analysis, field);
    if (words.length === 0) {
      continue;
    }

    const params = {
      keywords: words.map((word) => ({ word })),
      relType,
      orgId: identity.orgId,
      nodeId: sanitizedTitle,
      knowledgeId: identity.knowledgeId,
    };
    const rows = await runCypher<{ count: number }>(MERGE_KEYWORDS, params);

    // No rows means nothing to add; otherwise take the first row's count.
    const firstRow = rows[0];
    edgeCount += Number(firstRow?.count ?? 0);
  }

  return edgeCount;
}

/** Identifies the knowledge and organisation a `:BusinessContext` belongs to. */
export interface BusinessContextNodeIdentity {
  knowledgeId: string;
  orgId: string;
}

/**
 * Merge the `:BusinessContext` on (nodeId, knowledgeId), refresh its
 * properties, and merge the edge from the owning `:Knowledge`.
 */
const MERGE_BUSINESS_CONTEXT = `
MERGE (bc:BusinessContext {nodeId: $nodeId, knowledgeId: $knowledgeId})
SET bc.orgId = $orgId,
    bc.title = $title,
    bc.productArea = $productArea,
    bc.summary = $summary,
    bc.businessValue = $businessValue,
    bc.technicalSummary = $technicalSummary,
    bc.userImpact = $userImpact,
    bc.keywordsText = $keywordsText,
    bc.domainKeywordsText = $domainKeywordsText,
    bc.updatedAt = $updatedAt
WITH bc
MATCH (k:Knowledge {knowledgeId: $knowledgeId})
MERGE (k)-[:HAS_BUSINESS_CONTEXT]->(bc)
RETURN count(bc) AS count
`;
+ */ +export async function createBusinessContextNode( + identity: BusinessContextNodeIdentity, + analysis: BusinessContextAnalysis, + sanitizedTitle: string, +): Promise<number> { + const rows = await runCypher<{ count: number }>(MERGE_BUSINESS_CONTEXT, { + nodeId: sanitizedTitle, + knowledgeId: identity.knowledgeId, + orgId: identity.orgId, + title: analysis.title, + productArea: analysis.product_area, + summary: analysis.summary, + businessValue: analysis.business_value, + technicalSummary: analysis.technical_summary, + userImpact: analysis.user_impact, + keywordsText: serializeArrayForNeo4j(analysis.keywords), + domainKeywordsText: serializeArrayForNeo4j(analysis.domain_keywords), + updatedAt: new Date().toISOString(), + }); + return rows.length > 0 ? Number(rows[0]?.count ?? 0) : 0; +} diff --git a/packages/ingest-business-context/src/neo4j/write-version.ts b/packages/ingest-business-context/src/neo4j/write-version.ts new file mode 100644 index 0000000..dc9700e --- /dev/null +++ b/packages/ingest-business-context/src/neo4j/write-version.ts @@ -0,0 +1,71 @@ +import { runCypher } from "@bb/neo4j"; +import type { BusinessContextAnalysis } from "#src/types.ts"; + +export interface BusinessContextVersionIdentity { + knowledgeId: string; + orgId: string; + commitHash: string; +} + +const MERGE_VERSION = ` +MERGE (bv:BusinessContextVersion { + knowledgeId: $knowledgeId, + nodeId: $nodeId, + commitHash: $commitHash +}) +SET bv.orgId = $orgId, + bv.analysisJson = $analysisJson, + bv.updatedAt = $updatedAt +WITH bv +MATCH (bc:BusinessContext {nodeId: $nodeId, knowledgeId: $knowledgeId}) +MERGE (bc)-[:HAS_VERSION]->(bv) +RETURN count(bv) AS count +`; + +const LINK_TO_FILE_VERSIONS = ` +MATCH (bv:BusinessContextVersion {knowledgeId: $knowledgeId, nodeId: $nodeId, commitHash: $commitHash}) +WITH bv +MATCH (fv:FileVersion {knowledgeId: $knowledgeId, commitHash: $commitHash}) +MERGE (bv)-[:DESCRIBES]->(fv) +RETURN count(fv) AS count +`; + +/** + * Creates or merges the 
`:BusinessContextVersion` snapshot for this commit and + * connects it to the parent `:BusinessContext`. Stores the full analysis as a + * JSON property on the version node so historical queries can reconstruct it + * without re-reading disk. + */ +export async function createBusinessContextVersionNode( + identity: BusinessContextVersionIdentity, + analysis: BusinessContextAnalysis, + sanitizedTitle: string, +): Promise<number> { + const rows = await runCypher<{ count: number }>(MERGE_VERSION, { + nodeId: sanitizedTitle, + knowledgeId: identity.knowledgeId, + orgId: identity.orgId, + commitHash: identity.commitHash, + analysisJson: JSON.stringify(analysis), + updatedAt: new Date().toISOString(), + }); + return rows.length > 0 ? Number(rows[0]?.count ?? 0) : 0; +} + +/** + * Links the `:BusinessContextVersion` to every `:FileVersion` that exists for + * the same `(knowledgeId, commitHash)`. Returns the number of edges merged. + * Zero matches → zero edges; re-running after files are snapshot will create + * the missing edges (MERGE is idempotent). + */ +export async function linkVersionToFileVersions( + identity: BusinessContextVersionIdentity, + sanitizedTitle: string, +): Promise<number> { + const rows = await runCypher<{ count: number }>(LINK_TO_FILE_VERSIONS, { + nodeId: sanitizedTitle, + knowledgeId: identity.knowledgeId, + commitHash: identity.commitHash, + }); + return rows.length > 0 ? Number(rows[0]?.count ?? 0) : 0; +} diff --git a/packages/ingest-business-context/src/prompt/README.md b/packages/ingest-business-context/src/prompt/README.md new file mode 100644 index 0000000..9555cd5 --- /dev/null +++ b/packages/ingest-business-context/src/prompt/README.md @@ -0,0 +1,11 @@ +# `prompt/` — context + +Builds the system + user messages consumed by the LLM calls. + +| File | Responsibility | +| -------------------- | ------------------------------------------------------------------------------------ | +| `title-prompt.ts` | System prompt for the title-generation call. 
Returns `{ "title": "…" }`. | +| `analysis-prompt.ts` | System prompt for partial-fields analysis. Builds the JSON template from field-defs. | +| `user-message.ts` | Composes the user message (text + title + enrichment) for analysis calls. | + +All prompt content stays here. Nothing else in the package builds prompts. diff --git a/packages/ingest-business-context/src/prompt/analysis-prompt.ts b/packages/ingest-business-context/src/prompt/analysis-prompt.ts new file mode 100644 index 0000000..b93b2b6 --- /dev/null +++ b/packages/ingest-business-context/src/prompt/analysis-prompt.ts @@ -0,0 +1,48 @@ +import { BUSINESS_CONTEXT_FIELD_DEFS } from "#src/field-defs.ts"; + +/** + * Builds a system prompt asking the LLM to fill exactly the requested field + * subset (a slice of the full 16-field schema). Each call in the parallel + * pipeline targets one subset (product, technical, shared) so total context + * stays under budget and the JSON outputs are small enough to parse reliably. + * + * The prompt emits a JSON template that lists only the requested fields with + * their descriptions, special instructions, and an example value drawn from + * `BUSINESS_CONTEXT_FIELD_DEFS`. The LLM is asked to populate every key. + */ +export function buildPartialAnalysisPrompt(requestedFields: readonly string[]): string { + const fieldBlocks: string[] = []; + for (const name of requestedFields) { + const def = BUSINESS_CONTEXT_FIELD_DEFS[name]; + if (!def) { + continue; + } + fieldBlocks.push( + ` "${name}": ${def.example}\n // type: ${def.type}\n // description: ${def.description}\n // instructions: ${def.special_instructions}`, + ); + } + + return `You are an analyst combining business context with technical understanding of an indexed codebase. + +The user provides: + 1. Raw business-context text describing why a commit exists. + 2. A pre-generated title for context. + 3. 
(Optional) Aggregated enrichment data sampled from the repository (top keywords, architecture + summary, file tree, integration surface). Use these as evidence to ground your output; do not + invent claims that conflict with them. + +Your task: extract the following fields, populating EVERY key. If a field cannot be derived from +the text or enrichment, output an empty string or empty array — never null or undefined. + +Output format (strict JSON, no markdown fences, no commentary): + +{ +${fieldBlocks.join(",\n")} +} + +Rules: +- Honour every "instructions" line literally — they cap list lengths and dictate tone. +- Do not echo the field descriptions or instructions in your output. +- Do not introduce extra top-level keys beyond those listed. +- Output ONE JSON object. Nothing else.`; +} diff --git a/packages/ingest-business-context/src/prompt/title-prompt.ts b/packages/ingest-business-context/src/prompt/title-prompt.ts new file mode 100644 index 0000000..b1e60dd --- /dev/null +++ b/packages/ingest-business-context/src/prompt/title-prompt.ts @@ -0,0 +1,22 @@ +/** + * System prompt for the title-generation LLM call. Asks the model to read the + * raw business-context text and return a single JSON object with one key, + * `title`, holding a concise product-recognisable string. + */ +export function buildTitleGenerationPrompt(): string { + return `You are a senior product manager generating a concise title for a business-context entry. + +The user will provide raw text describing a piece of business context attached to a code commit. +Your task: produce ONE short, descriptive title that a product manager would recognise instantly +when scanning a list of business contexts. + +Requirements: +- Maximum 12 words. +- No technical jargon. No code identifiers (no camelCase, no file paths, no function names). +- Product-domain language. Capture the *what* and the *audience*, not the *how*. 
+- If the text is empty or unintelligible, output the literal string "Untitled Business Context". + +Output strictly as JSON: { "title": "<your title>" } + +No prose. No explanations. No markdown code fences.`; +} diff --git a/packages/ingest-business-context/src/prompt/user-message.ts b/packages/ingest-business-context/src/prompt/user-message.ts new file mode 100644 index 0000000..3cbd7de --- /dev/null +++ b/packages/ingest-business-context/src/prompt/user-message.ts @@ -0,0 +1,18 @@ +/** + * Composes the user-side message for the analysis LLM call. Bundles the raw + * business-context text, the pre-generated title, and (optional) enrichment + * data extracted from the repository's meta-output. The enrichment section is + * elided entirely when empty so the call works even before ingest-github has + * produced any repo-summary. + */ +export function buildEnrichedUserMessage(text: string, title: string, enrichmentSection: string): string { + const parts: string[] = [`TITLE (pre-generated):`, title, "", `BUSINESS CONTEXT TEXT (authored by a human):`, text]; + + if (enrichmentSection.trim().length > 0) { + parts.push(""); + parts.push("REPOSITORY ENRICHMENT (sampled from the indexed codebase):"); + parts.push(enrichmentSection); + } + + return parts.join("\n"); +} diff --git a/packages/ingest-business-context/src/strategy/README.md b/packages/ingest-business-context/src/strategy/README.md new file mode 100644 index 0000000..9d7de47 --- /dev/null +++ b/packages/ingest-business-context/src/strategy/README.md @@ -0,0 +1,13 @@ +# `strategy/` — context + +Orchestrates the per-job pipeline. + +| File | Responsibility | +| --------------------- | ------------------------------------------------------------------------------ | +| `commit-validator.ts` | `assertCommitIndexed()` — throws `CommitNotIndexedError` if files don't exist. | +| `execute.ts` | The disk pipeline: validate → enrich → title → analyse → persist. 
| +| `store-graph.ts` | The Neo4j pipeline: indexes → node → version → file-version edges → keywords. | + +`execute` and `store-graph` are separate by design — the worker calls them in +sequence, but a synchronous HTTP path can call them in the same request, and +a future scheduler can defer `store-graph` for later. diff --git a/packages/ingest-business-context/src/strategy/commit-validator.ts b/packages/ingest-business-context/src/strategy/commit-validator.ts new file mode 100644 index 0000000..983e091 --- /dev/null +++ b/packages/ingest-business-context/src/strategy/commit-validator.ts @@ -0,0 +1,46 @@ +import { runCypher } from "@bb/neo4j"; +import { CommitNotIndexedError } from "#src/errors.ts"; + +const CHECK_INDEXED = ` +OPTIONAL MATCH (fv:FileVersion {knowledgeId: $knowledgeId, commitHash: $commitHash}) +WITH count(fv) AS versionCount +OPTIONAL MATCH (f:File {knowledgeId: $knowledgeId}) +WITH versionCount, count(f) AS fileCount +RETURN versionCount AS versions, fileCount AS files +`; + +export interface CommitIndexStatus { + /** Number of `:FileVersion` rows matching `(knowledgeId, commitHash)`. */ + fileVersions: number; + /** Number of `:File` rows for the knowledge (any commit). */ + liveFiles: number; + /** True if either count is positive. */ + indexed: boolean; +} + +/** + * Reports whether the commit's files are indexed. Two evidence sources: + * + * 1. `:FileVersion {knowledgeId, commitHash}` — historical snapshot exists. + * 2. `:File {knowledgeId}` — live state exists, which implies the knowledge + * was indexed at *some* commit. We accept this because the latest commit + * may not yet have a snapshot (snapshots are taken before the next pull). + * + * If both are zero, the commit (or knowledge) is not indexed. + */ +export async function checkCommitIndexed(knowledgeId: string, commitHash: string): Promise<CommitIndexStatus> { + const rows = await runCypher<{ versions: number; files: number }>(CHECK_INDEXED, { knowledgeId, commitHash }); + const row = rows[0] ?? 
{ versions: 0, files: 0 }; + const fileVersions = Number(row.versions ?? 0); + const liveFiles = Number(row.files ?? 0); + return { fileVersions, liveFiles, indexed: fileVersions > 0 || liveFiles > 0 }; +} + +/** Throws `CommitNotIndexedError` if neither file-versions nor live files exist. */ +export async function assertCommitIndexed(knowledgeId: string, commitHash: string): Promise<CommitIndexStatus> { + const status = await checkCommitIndexed(knowledgeId, commitHash); + if (!status.indexed) { + throw new CommitNotIndexedError(knowledgeId, commitHash); + } + return status; +} diff --git a/packages/ingest-business-context/src/strategy/execute.ts b/packages/ingest-business-context/src/strategy/execute.ts new file mode 100644 index 0000000..f72c7a6 --- /dev/null +++ b/packages/ingest-business-context/src/strategy/execute.ts @@ -0,0 +1,96 @@ +import { logger } from "@bb/logger"; +import { loadCachedAnalysis } from "#src/disk/load-cached.ts"; +import { sanitizeTitle } from "#src/disk/sanitize-title.ts"; +import { saveAnalysis } from "#src/disk/save-analysis.ts"; +import { saveOriginalText } from "#src/disk/save-original.ts"; +import { BusinessContextAnalysisFailedError } from "#src/errors.ts"; +import { analyzeBusinessContextParallel } from "#src/llm/analyze-parallel.ts"; +import { collectEnrichmentData } from "#src/llm/enrichment-reader.ts"; +import { generateBusinessContextTitle } from "#src/llm/title.ts"; +import { assertCommitIndexed } from "#src/strategy/commit-validator.ts"; +import { businessContextDir } from "@bb/ingest-github"; +import path from "node:path"; +import type { BusinessContextInput, BusinessContextLlmOptions, BusinessContextStorageResult } from "#src/types.ts"; + +export interface ExecuteOptions { + llmOptions: BusinessContextLlmOptions; +} + +/** + * Main entry point for the BusinessContext disk pipeline. 
Validates the + * commit is indexed, reads enrichment, runs the title call + the 3 parallel + * analysis calls, persists both the original text and the analysis envelope + * to disk. Neo4j persistence is intentionally separate (`store-graph.ts`) so + * callers can defer it. + */ +export async function executeBusinessContextStrategy( + input: BusinessContextInput, + options: ExecuteOptions, +): Promise<BusinessContextStorageResult> { + logger.info( + `business-context: executing — knowledge=${input.knowledgeId}, commit=${input.commitHash.substring(0, 12)}, text=${input.text.length} chars`, + ); + + // 1. Validate the commit (or knowledge) is indexed. + await assertCommitIndexed(input.knowledgeId, input.commitHash); + + // 2. Generate the title. + const titleResult = await generateBusinessContextTitle(input.text, options.llmOptions); + const sanitizedTitle = sanitizeTitle(titleResult.title); + if (sanitizedTitle.length === 0) { + // Defensive: an empty slug would collide on every BC. Bail with a stable fallback. + logger.warn(`business-context: sanitized title was empty for "${titleResult.title}" — using fallback slug`); + } + const effectiveSlug = sanitizedTitle.length > 0 ? sanitizedTitle : "untitled-business-context"; + + // 3. Cache hit? Skip the analysis call and return the existing paths. + const cached = await loadCachedAnalysis(input.knowledgeId, input.commitHash, effectiveSlug); + if (cached !== null) { + const dir = businessContextDir(input.knowledgeId, input.commitHash, effectiveSlug); + return { + analysisPath: path.join(dir, "analysis.json"), + originalTextPath: path.join(dir, "original.txt"), + title: cached.analysis.title, + commitHash: input.commitHash, + sanitizedTitle: effectiveSlug, + }; + } + + // 4. Collect enrichment + run the parallel analysis. 
+ const enrichment = await collectEnrichmentData(input.knowledgeId, input.orgId); + const analysisResult = await analyzeBusinessContextParallel( + input.text, + titleResult.title, + enrichment, + options.llmOptions, + ); + if (analysisResult.analysis === null) { + throw new BusinessContextAnalysisFailedError(input.knowledgeId, input.commitHash); + } + + // 5. Persist to disk in parallel. + const totalInputTokens = titleResult.inputTokens + analysisResult.inputTokens; + const totalOutputTokens = titleResult.outputTokens + analysisResult.outputTokens; + const [originalTextPath, analysisPath] = await Promise.all([ + saveOriginalText(input.knowledgeId, input.commitHash, effectiveSlug, input.text), + saveAnalysis(input.knowledgeId, input.commitHash, effectiveSlug, analysisResult.analysis, { + commitHash: input.commitHash, + modelName: analysisResult.modelName, + inputTokens: totalInputTokens, + outputTokens: totalOutputTokens, + ...(input.description !== undefined ? { description: input.description } : {}), + }), + ]); + + logger.info( + `business-context: strategy complete — title="${analysisResult.analysis.title}", commit=${input.commitHash.substring(0, 12)}`, + ); + + return { + analysisPath, + originalTextPath, + title: analysisResult.analysis.title, + commitHash: input.commitHash, + sanitizedTitle: effectiveSlug, + }; +} diff --git a/packages/ingest-business-context/src/strategy/store-graph.ts b/packages/ingest-business-context/src/strategy/store-graph.ts new file mode 100644 index 0000000..5f25ca7 --- /dev/null +++ b/packages/ingest-business-context/src/strategy/store-graph.ts @@ -0,0 +1,63 @@ +import { logger } from "@bb/logger"; +import { ensureBusinessContextIndexes } from "#src/neo4j/indexes.ts"; +import { createBusinessContextKeywords } from "#src/neo4j/write-keywords.ts"; +import { createBusinessContextNode } from "#src/neo4j/write-node.ts"; +import { createBusinessContextVersionNode, linkVersionToFileVersions } from "#src/neo4j/write-version.ts"; +import 
type { BusinessContextAnalysis, BusinessContextNeo4jResult } from "#src/types.ts"; + +export interface StoreGraphInput { + knowledgeId: string; + orgId: string; + commitHash: string; +} + +/** + * Persists a completed `BusinessContextAnalysis` to Neo4j. Four steps: + * + * 1. Ensure indexes exist (idempotent). + * 2. Merge the parent `:BusinessContext` and link from `:Knowledge`. + * 3. Merge the per-commit `:BusinessContextVersion`, then MERGE `:DESCRIBES` + * edges to every `:FileVersion {knowledgeId, commitHash}` that exists. + * 4. Merge `:OrgKeyword` nodes and `:APPEARS_IN_BUSINESS_CONTEXT` edges. + */ +export async function storeBusinessContextToNeo4j( + input: StoreGraphInput, + analysis: BusinessContextAnalysis, + sanitizedTitle: string, +): Promise<BusinessContextNeo4jResult> { + await ensureBusinessContextIndexes(); + + const nodeCount = await createBusinessContextNode( + { knowledgeId: input.knowledgeId, orgId: input.orgId }, + analysis, + sanitizedTitle, + ); + + const versionCount = await createBusinessContextVersionNode( + { knowledgeId: input.knowledgeId, orgId: input.orgId, commitHash: input.commitHash }, + analysis, + sanitizedTitle, + ); + + const fileVersionEdges = await linkVersionToFileVersions( + { knowledgeId: input.knowledgeId, orgId: input.orgId, commitHash: input.commitHash }, + sanitizedTitle, + ); + + const keywordEdges = await createBusinessContextKeywords( + { knowledgeId: input.knowledgeId, orgId: input.orgId }, + analysis, + sanitizedTitle, + ); + + logger.info( + `business-context: graph stored — node=${nodeCount > 0}, version=${versionCount > 0}, fileVersion=${fileVersionEdges}, keywords=${keywordEdges}`, + ); + + return { + businessContextNodeCreated: nodeCount > 0, + versionNodeCreated: versionCount > 0, + keywordRelationships: keywordEdges, + fileVersionRelationships: fileVersionEdges, + }; +} diff --git a/packages/ingest-business-context/src/types.ts b/packages/ingest-business-context/src/types.ts new file mode 100644 index 0000000..ec3c6b7 --- /dev/null 
+++ b/packages/ingest-business-context/src/types.ts @@ -0,0 +1,117 @@ +/** + * The structured analysis produced by LLM from user-authored business-context text. + * Two audiences are served by one document: product people (title, user stories, + * stakeholders, business value) and engineers (technical summary, affected modules, + * architecture decisions, dependencies, data flow). + */ +export interface BusinessContextAnalysis { + // Product fields + title: string; + product_area: string; + user_stories: string[]; + business_value: string; + stakeholders: string[]; + success_metrics: string[]; + user_impact: string; + domain_keywords: string[]; + + // Technical fields + technical_summary: string; + affected_modules: string[]; + architecture_decisions: string[]; + dependencies: string[]; + risk_areas: string[]; + data_flow: string; + api_surface: string[]; + + // Shared fields + summary: string; + keywords: string[]; +} + +/** + * Input to the BusinessContext strategy. `orgId` is single-tenant (`"local"`) in + * OSS; downstream multi-tenant deployments stamp it from the request. + */ +export interface BusinessContextInput { + /** Raw business-context text authored by a human. */ + text: string; + /** Knowledge entity UUID. */ + knowledgeId: string; + /** 40-char hex SHA. Must reference an indexed commit. */ + commitHash: string; + /** Tenant binding. */ + orgId: string; + /** Optional human-supplied description, persisted alongside the analysis envelope. */ + description?: string; +} + +/** Result of the disk-side pipeline (validation → enrichment → LLM → write). */ +export interface BusinessContextStorageResult { + /** Absolute path to the saved `analysis.json`. */ + analysisPath: string; + /** Absolute path to the saved `original.txt`. */ + originalTextPath: string; + /** The LLM-generated title. */ + title: string; + /** The commit hash the analysis is anchored to. */ + commitHash: string; + /** Sanitized title used as the node_id and the on-disk directory name. 
*/ + sanitizedTitle: string; +} + +/** Result returned after persisting to Neo4j. */ +export interface BusinessContextNeo4jResult { + /** Whether the main `:BusinessContext` node was written — true both when first created and when an existing node was matched by MERGE. */ + businessContextNodeCreated: boolean; + /** Whether the per-commit `:BusinessContextVersion` was created or merged. */ + versionNodeCreated: boolean; + /** Total number of `:OrgKeyword` relationships merged (newly created or already present). */ + keywordRelationships: number; + /** Count of `[:DESCRIBES]` edges from the version node to file-version nodes for this commit. */ + fileVersionRelationships: number; +} + +/** Metadata envelope wrapping the analysis when persisted to disk. */ +export interface BusinessContextAnalysisMetadata { + /** ISO timestamp of when the analysis was generated. */ + generatedAt: string; + /** The commit hash this analysis is stored under. */ + commitHash: string; + /** LLM model name used. */ + modelName: string; + /** Total input tokens consumed (title + analysis calls combined). */ + inputTokens: number; + /** Total output tokens consumed (title + analysis calls combined). */ + outputTokens: number; + /** Optional human-supplied description carried through from the input. */ + description?: string; + /** The full analysis object. */ + analysis: BusinessContextAnalysis; +} + +/** Result of the title-generation LLM call. */ +export interface TitleGenerationResult { + title: string; + inputTokens: number; + outputTokens: number; + modelName: string; +} + +/** Result of the parallel analysis LLM calls. */ +export interface AnalysisResult { + analysis: BusinessContextAnalysis | null; + inputTokens: number; + outputTokens: number; + modelName: string; +} + +/** Options forwarded to the LLM layer (per-job credential overrides, etc.). */ +export interface BusinessContextLlmOptions { + /** Optional per-job LLM API key override. */ + apiKey?: string; + /** Optional per-job LLM provider override (`"openrouter"` or `"ollama"` in OSS). 
*/ + provider?: string; + /** Optional per-job LLM model override. */ + model?: string; +} diff --git a/packages/ingest-business-context/src/worker/README.md b/packages/ingest-business-context/src/worker/README.md new file mode 100644 index 0000000..d74e95f --- /dev/null +++ b/packages/ingest-business-context/src/worker/README.md @@ -0,0 +1,12 @@ +# `worker/` — context + +BullMQ worker registration. + +| File | Responsibility | +| ------------- | ---------------------------------------------------------------------- | +| `handler.ts` | Runs `execute → store-graph` for each `BusinessContextProcessing` job. | +| `register.ts` | `registerBusinessContextWorker()` — called once by the deployable. | + +The handler re-reads the persisted analysis from disk between the disk and +graph phases so a future split into two queue jobs produces the same result +as the current inline flow. diff --git a/packages/ingest-business-context/src/worker/handler.ts b/packages/ingest-business-context/src/worker/handler.ts new file mode 100644 index 0000000..5be25de --- /dev/null +++ b/packages/ingest-business-context/src/worker/handler.ts @@ -0,0 +1,73 @@ +import { getConfigValue } from "@bb/config"; +import { Config, type BusinessContextProcessingPayload, type JobMessage, JobType } from "@bb/types"; +import { logger } from "@bb/logger"; +import type { JobHandler } from "@bb/queue"; +import { executeBusinessContextStrategy } from "#src/strategy/execute.ts"; +import { storeBusinessContextToNeo4j } from "#src/strategy/store-graph.ts"; +import type { BusinessContextAnalysis, BusinessContextLlmOptions } from "#src/types.ts"; +import { readFile } from "node:fs/promises"; + +const DEFAULT_ORG_ID = "local"; + +function buildLlmOptions(payload: BusinessContextProcessingPayload): BusinessContextLlmOptions { + const opts: BusinessContextLlmOptions = {}; + if (payload.llmApiKey !== undefined) { + opts.apiKey = payload.llmApiKey; + } + if (payload.llmProvider !== undefined) { + opts.provider = 
payload.llmProvider; + } + if (payload.llmModel !== undefined) { + opts.model = payload.llmModel; + } + return opts; +} + +function resolveOrgId(payload: BusinessContextProcessingPayload): string { + if (payload.orgId !== undefined && payload.orgId.length > 0) { + return payload.orgId; + } + const configured = getConfigValue(Config.OrgId); + return configured.length > 0 ? configured : DEFAULT_ORG_ID; +} + +/** + * BullMQ job handler for `JobType.BusinessContextProcessing`. Runs the disk + * strategy then the graph store. Re-reads the persisted analysis from disk + * before the graph step so a deferred / split execution path produces the same + * result as the inline path. + */ +export const handleBusinessContextProcessing: JobHandler<JobType.BusinessContextProcessing> = async ( + msg: JobMessage<JobType.BusinessContextProcessing>, +): Promise<void> => { + const { payload } = msg; + const orgId = resolveOrgId(payload); + const input = { + text: payload.customText, + knowledgeId: payload.knowledgeId, + commitHash: payload.commitHash, + orgId, + ...(payload.description !== undefined ? { description: payload.description } : {}), + }; + + logger.info( + `business-context.handler: starting job=${msg.id} knowledge=${input.knowledgeId} commit=${input.commitHash.substring(0, 12)}`, + ); + + const storage = await executeBusinessContextStrategy(input, { llmOptions: buildLlmOptions(payload) }); + + // Re-load the persisted analysis to feed the graph step. Keeps the contract + // identical whether the graph step runs inline or is deferred to a follow-up + // job: in both cases the source of truth is what's on disk. 
+ const envelope = JSON.parse(await readFile(storage.analysisPath, "utf-8")) as { + analysis: BusinessContextAnalysis; + }; + + await storeBusinessContextToNeo4j( + { knowledgeId: input.knowledgeId, orgId, commitHash: input.commitHash }, + envelope.analysis, + storage.sanitizedTitle, + ); + + logger.info(`business-context.handler: completed job=${msg.id}`); +}; diff --git a/packages/ingest-business-context/src/worker/register.ts b/packages/ingest-business-context/src/worker/register.ts new file mode 100644 index 0000000..80a9686 --- /dev/null +++ b/packages/ingest-business-context/src/worker/register.ts @@ -0,0 +1,13 @@ +import { JobType } from "@bb/types"; +import { registerWorker, type WorkerRegistrationOptions } from "@bb/queue"; +import { handleBusinessContextProcessing } from "#src/worker/handler.ts"; + +/** + * Registers the BusinessContext worker against `JobType.BusinessContextProcessing`. + * Called once by the deployable at boot. The default concurrency is sourced + * from `Config.ConcurrencyGithub` (shared with other CPU/LLM-heavy workers); + * callers may override via `opts.concurrency`. 
+ */ +export function registerBusinessContextWorker(opts: WorkerRegistrationOptions = {}): void { + registerWorker(JobType.BusinessContextProcessing, handleBusinessContextProcessing, opts); +} diff --git a/packages/ingest-business-context/tsconfig.json b/packages/ingest-business-context/tsconfig.json new file mode 100644 index 0000000..4ed0786 --- /dev/null +++ b/packages/ingest-business-context/tsconfig.json @@ -0,0 +1,4 @@ +{ + "extends": "../../tsconfig.base.json", + "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] +} diff --git a/packages/ingest-github/README.md b/packages/ingest-github/README.md index f199dd0..93d7786 100644 --- a/packages/ingest-github/README.md +++ b/packages/ingest-github/README.md @@ -50,7 +50,12 @@ The package does **not** own: strategies can add this) - Semantic chunking, big-file processing, smart sampling (future strategies) -- Recovery / progress reporting / failed-files tracking +- Recovery / failed-files tracking +- Progress **transport** — the package now ships a `ProgressContext` + extension port under `src/progress/` (see that folder's README), but + the actual SSE / Pub-Sub plumbing lives in the host binary's progress + package. The OSS default (`nullProgressContextFactory`) discards every + event, consistent with the no-outbound-calls posture. 
- Provider abstraction (no Bitbucket support; GitHub-only) - Concurrency control (sequential per-file processing intentional for v0; revisit when users complain) @@ -58,15 +63,54 @@ The package does **not** own: ## Public exports ```ts -function registerGithubWorkers(): void // wires JobType.GithubIndex -function registerLocalIngestWorker(): void // wires JobType.LocalIngest +// High-level registration (OSS standalone wires this once at boot) +function registerGithubWorkers(deps?: RegisterGithubWorkersDeps): void; // wires GithubIndex + GithubPull +function registerLocalIngestWorker(): void; // wires LocalIngest -interface IngestionContext { knowledgeId: string; rootDir: string } -interface IngestionStrategy { readonly name: string; ingest(ctx: IngestionContext): Promise } +interface RegisterGithubWorkersDeps { + sourceFactory?: SourceFactory; // index-side hook + pullFactory?: PullFactory; // pull-side hook (provides reader + diff + targetCommit) + progressContextFactory?: ProgressContextFactory; // SSE progress hook (default: no-op) +} -class BasicFileAnalysisStrategy implements IngestionStrategy +// Lower-level building blocks (downstream consumers with their own queue +// skip registerGithubWorkers and wire these against their own registry) +function createPipelineRunner(deps: CreatePipelineRunnerDeps): IngestRunnerDeps; +function createGithubIngestHandler(deps: IngestJobHandlerDeps): (msg) => Promise; +function createLocalIngestHandler(deps: IngestJobHandlerDeps): (msg) => Promise; +function runPull(msg: JobMessage, pullFactory?: PullFactory): Promise; +function reposRoot(): string; +function repoCloneDir(knowledgeId: string): string; +function metaRootFor(knowledgeId: string): string; +function metaPathsFor(knowledgeId: string): MetaPaths; +function commitMetaDir(knowledgeId: string, commitHash: string): string; +function businessContextDir(knowledgeId: string, commitHash: string, sanitizedTitle: string): string; +function orgRegistryDir(knowledgeId: string, 
orgId: string): string; + +function createFlatFolderStrategy(deps): IngestStrategy; +function createLlmFileAnalyzer(deps): FileAnalyzer; +function createDiskSourceReader(deps): SourceReader; ``` +The optional `sourceFactory` lets downstream consumers inject a custom +`SourceReader` for index jobs (no local clone). The analogous +`pullFactory` does the same for pull jobs — its result carries the +resolved `targetCommit`, the diff between currentCommit and targetCommit, +and a reader pinned at the target. When unset, both fall back to the +default disk-backed paths (`git clone` for index, `git fetch + diff + +checkout` for pull). See [docs/extension-points.md](docs/extension-points.md) +for the design rationale. + +For per-job LLM credentials, downstream consumers set +`{ llmApiKey?, llmProvider?, llmModel?, llmKeyId? }` on the +`GithubIndexPayload` / `GithubPullPayload` they enqueue +(`PayloadLlmOverrides` from `@bb/types`). The runner extracts those into +`StrategyContext.llmCallContext` and every LLM call site forwards it to +`@bb/llm`. `llmProvider` is `string` (open) so multi-provider consumers +can carry richer taxonomies; the OSS LLM client narrows to +`openrouter`/`ollama` at the boundary. OSS standalone leaves the overrides +unset and falls back to `Config.OpenrouterApiKey` + `Config.LlmProvider`. + Both `register*Workers()` calls run once at `@bb/server` boot. The worker hardcodes a single `IngestionStrategy` instance (currently `new BasicFileAnalysisStrategy()`). Adding another strategy = new file @@ -88,25 +132,51 @@ worker hardcodes a single `IngestionStrategy` instance (currently - `:File` graph nodes + `:HAS_FILE` / `:HAS_KEYWORD` / `:HAS_CLASS` / `:HAS_FUNCTION` / `:HAS_IMPORT_INTERNAL` / `:HAS_IMPORT_EXTERNAL` relationships — written via `upsertFileNode` from `@bb/neo4j`. +- `meta-output/scan-manifest.json` — the canonical small/big/oversized + classification produced by Phase 1 (`scanAndClassify`). 
Per-file entries + carry `tokenCount`, `kind`, and (for big files) `estimatedChunks`. + Phases 2a (small) and 2b (big) consume the manifest in parallel. +- `meta-output/bigFiles.json` — legacy view written alongside the manifest + for the pull-path and backfill phases. The main strategy no longer + consumes it directly. +- `FileAnalysisCache` (in-memory only, not persisted) — single + `Map` loaded once between the + analyse and backfill phases via parallel `readdir + readFile`. Replaces + three sequential `iterateCondensed` walks (phases 3, 5, 7) with one + parallel preload + three in-memory iterations. The pull workflow loads + its own cache instance; only one strategy run owns a given + `metaPaths` directory at a time. For repos beyond ~50k analysed files + consider a streaming-mode fallback (not implemented today). ## Invariants -1. **Sequential per-file processing.** Intentionally degraded; one - `upsertRawFile` per file. The small-file path issues one `askLLM`; - the big-file path issues N (one per chunk) plus condensation calls, - all sequential — no `Promise.all`, no concurrency cap. Revisit when - the latency profile demands it. -2. **Clone idempotent.** Re-runs (BullMQ retries) call `git fetch` + +1. **Shared LLM concurrency limiter.** The flat-folder strategy + constructs one `withConcurrency(Config.LlmConcurrency)` instance at + entry (default 29). The small-file phase, the big-file chunk phase, + per-file condense calls, **and the folder-summary phase** all check + out from this single pool, so total in-flight LLM calls is bounded + by one knob. The pull-path constructs its own shared limiter at + `runPull` entry and threads it into the selective folder-summary + phase. The legacy `processBigFile` driver used by the pull-path + still uses its own per-file pool sized by `Config.BigFileConcurrency`. +2. 
**Folder-summary batching by default.** Phase 5 groups small folders + (`≤ Config.FolderSummaryBatchMaxFiles`, default 15) into batches of + up to `Config.FolderSummaryBatchSize` (default 10) and asks the LLM + for one JSON object keyed by integer label that returns one summary + per folder. Bigger folders take the individual single-folder path. + Roll back to one LLM call per folder via + `bytebell set folder.summary.batch.size 1`. +3. **Clone idempotent.** Re-runs (BullMQ retries) call `git fetch` + `git reset --hard` in the existing dir rather than re-cloning. Tokens are re-injected into the remote URL each time. -3. **Token redaction.** `GitCloneError` carries the **redacted** repo +4. **Token redaction.** `GitCloneError` carries the **redacted** repo URL (`https://user:***@host`) — the raw `gitToken` never appears in error messages or logs. -4. **State transition order.** `Processing` is set _before_ any clone +5. **State transition order.** `Processing` is set _before_ any clone work. `Processed` is set _only_ after the entire scan + analyze loop completes. On any thrown error, the handler best-effort sets `Failed` then re-throws so BullMQ records the retry. -5. **Fail-soft analysis, fail-hard infra.** A single file's LLM call +6. **Fail-soft analysis, fail-hard infra.** A single file's LLM call failing falls back to an empty-analysis Raw doc and processing continues. In the big-file path, a single chunk failure contributes an empty analysis to the merge but does not stop the file; a @@ -114,7 +184,7 @@ worker hardcodes a single `IngestionStrategy` instance (currently `dedupAnalyses` so the merged result is always well-formed. A clone failure or Mongo write failure throws and propagates to BullMQ for retry under the queue's `attempts: 3`. -6. **Hardcoded filters only.** No LLM-based ignore decisions in v0. The +7. **Hardcoded filters only.** No LLM-based ignore decisions in v0. 
The directory / file / extension blocklists in `scan.ts` are the only way files get skipped. @@ -135,7 +205,6 @@ worker hardcodes a single `IngestionStrategy` instance (currently - GitHub API streaming mode (always shell-clone) - Default-branch auto-detection (caller supplies `branch`; defaults to `"main"`) -- Concurrency control / parallel file processing - Folder-level summaries / `repoSummary.json` / `flat-folder` strategy - Semantic chunking (`SemanticChunker`) - Per-chunk persistence (we persist only the merged file-level diff --git a/packages/ingest-github/package.json b/packages/ingest-github/package.json index 4ca252c..936da74 100644 --- a/packages/ingest-github/package.json +++ b/packages/ingest-github/package.json @@ -8,6 +8,9 @@ "exports": { ".": "./src/index.ts" }, + "imports": { + "#src/*": "./src/*" + }, "dependencies": { "@bb/config": "workspace:*", "@bb/errors": "workspace:*", diff --git a/packages/ingest-github/src/README.md b/packages/ingest-github/src/README.md index 58b2bf6..e143560 100644 --- a/packages/ingest-github/src/README.md +++ b/packages/ingest-github/src/README.md @@ -11,16 +11,38 @@ Domain (composes infra: `@bb/config`, `@bb/llm`, `@bb/mongo`, `@bb/neo4j`, ## Top-level files -- **[index.ts](index.ts)** — public surface. `registerGithubWorkers`, - `registerLocalIngestWorker`, `createFlatFolderStrategy`, - `createLlmFileAnalyzer`, `createDiskSourceReader`, the - `SourceReader` / `ArchiveSink` / `SourceFactory` port types, plus - `parseGithubRepo` / `fetchLatestCommitHash` (kept for the pull plan). - `registerGithubWorkers` accepts one optional `sourceFactory` injection - parameter so downstream consumers can replace the default disk-based - clone-and-read; the open-source binary always leaves it undefined. for the - seam. `GithubPull` is registered but the handler throws - `IngestError("…being migrated…")` — the HTTP route mirrors this at 503. +- **[index.ts](index.ts)** — public surface. 
The high-level + registration helpers (`registerGithubWorkers`, `registerLocalIngestWorker`) + for the OSS standalone, plus the lower-level building blocks downstream + consumers wire against their own queue/registry: + - Factories: `createFlatFolderStrategy`, `createLlmFileAnalyzer`, + `createDiskSourceReader`, `createPipelineRunner` (the orchestrator), + `createGithubIngestHandler` / `createLocalIngestHandler` (the BullMQ + processor factories used internally by `registerGithubWorkers`). + - Direct runner: `runPull(msg, pullFactory?)` — the pull worker the + enterprise wrapper invokes directly from its own registry. + - Helper: `reposRoot()` — resolves `~/.bytebell/repos`. + - Port types: `SourceReader` / `ScanEntry` / `ScannedFile` / + `OversizedFile` / `ScanDeps` / `ArchiveSink` / `ArchiveSinkInput` / + `SourceFactory` / `SourceFactoryInput` / `SourceFactoryResult` / + `PullFactory` / `PullFactoryInput` / `PullFactoryResult` / + `DiffResult` / `RenamedFile` / `FileAnalyzer` / `AnalyzedFileResult`. + - Runner types: `IngestRunnerDeps` / `IngestRunnerInput` / + `IngestJobHandlerDeps` / `CreatePipelineRunnerDeps`. + - Strategy types: `IngestStrategy` / `StrategyInput` / `StrategyResult` / + `StrategyContext`. + - `CondensedFileAnalysis`. + - GitHub helpers: `parseGithubRepo` / `fetchLatestCommitHash` / + `fetchRecentCommits`. + `registerGithubWorkers` accepts optional `sourceFactory` (index) and + `pullFactory` (pull) injections through `RegisterGithubWorkersDeps`; + the open-source binary leaves both undefined. It registers both + `JobType.GithubIndex` (full re-index, via `runner.run` + optional + `sourceFactory`) and `JobType.GithubPull` (incremental diff-and-apply + via `runPull` + optional `pullFactory`). Downstream consumers that + bring their own queue (e.g. the enterprise wrapper using `@bytebell/queue`) + skip `registerGithubWorkers` entirely and call `createPipelineRunner`, + `createGithubIngestHandler`, and `runPull` directly. 
- **[githubApi.ts](githubApi.ts)** — `parseGithubRepo(repoUrl)` and `fetchLatestCommitHash(owner, repo, branch, gitToken?)`. **Pull-only utility**; revisit in the pull plan. Kept in place rather than deleted so @@ -75,10 +97,18 @@ Tier flow is strict: `types/` is the leaf; `pipeline/`, `adapters/`, ## Invariants enforced here - **One active strategy, factory-wired.** `createFlatFolderStrategy(deps)` - builds the strategy; `createPipelineRunner({ strategy })` wraps it; the - worker handlers are `(msg) => runner.run({ job, payload })`. Adding a - strategy means a new factory and a new wiring line — never editing the - worker. The archived `basic-file-analysis/` is `.archived` (not compiled). + builds the strategy; `createPipelineRunner({ strategy, sourceFactory? })` + wraps it; the worker handlers are `(msg) => runner.run({ job, payload })`. + Adding a strategy means a new factory and a new wiring line — never + editing the worker. The archived `basic-file-analysis/` is `.archived` + (not compiled). +- **Per-job LLM credentials flow payload → context → call site.** The + runner (`pipeline/run.ts` for index, `pipeline/pull.ts` for pull) reads + `{llmApiKey, llmProvider, llmModel}` from the payload, packs them into + an `AskLlmOptions` bag stored on `StrategyContext.llmCallContext`, and + every LLM-touching phase passes that bag into `askJsonLLM` / + `askYesNoLLM`. OSS standalone leaves these unset and falls back to + `Config.OpenrouterApiKey` + `Config.LlmProvider`. - **State transitions are explicit and dual-written.** `pipeline/run.ts` transitions Mongo state to `PROCESSING` before any work, `PROCESSED` on success, `FAILED` best-effort on uncaught errors. Each transition mirrors diff --git a/packages/ingest-github/src/adapters/README.md b/packages/ingest-github/src/adapters/README.md index cd3bfb7..d33d2c1 100644 --- a/packages/ingest-github/src/adapters/README.md +++ b/packages/ingest-github/src/adapters/README.md @@ -13,9 +13,12 @@ Domain. 
- `llm-file-analyzer.ts` — `createLlmFileAnalyzer(deps)` returns the `FileAnalyzer` port. Deps inject `buildSystemPrompt` and `buildUserPrompt` so the prompts live in `strategies/flat-folder/prompts/` (one-way tier flow - from strategies → adapters via DI, never via import). Also exports - `shapeAnalysis` (raw JSON → `FileAnalysis`, tolerates missing keys) and - `languageFromPath` (extension-based fallback when the LLM omits `language`). + from strategies → adapters via DI, never via import). The returned + `analyze({ relativePath, content, llmCallContext? })` forwards + `llmCallContext` to `askJsonLLM` so per-job LLM credential overrides + reach OpenRouter. Also exports `shapeAnalysis` (raw JSON → + `FileAnalysis`, tolerates missing keys) and `languageFromPath` + (extension-based fallback when the LLM omits `language`). - `index.ts` — barrel. ## Invariants diff --git a/packages/ingest-github/src/adapters/llm-file-analyzer.ts b/packages/ingest-github/src/adapters/llm-file-analyzer.ts index ac83274..8e42d74 100644 --- a/packages/ingest-github/src/adapters/llm-file-analyzer.ts +++ b/packages/ingest-github/src/adapters/llm-file-analyzer.ts @@ -1,8 +1,9 @@ -import { askJsonLLM } from "@bb/llm"; +import { askJsonLLM, type AskLlmOptions } from "@bb/llm"; +import { LlmConfigError, LlmError } from "@bb/errors"; import { logger } from "@bb/logger"; import type { FileAnalysis, FileAnalysisSection } from "@bb/mongo"; -import { FALLBACK_LANGUAGE, emptyFileAnalysis } from "src/types/file-analysis.ts"; -import type { AnalyzedFileResult, FileAnalyzer } from "src/types/pipeline.ts"; +import { FALLBACK_LANGUAGE, emptyFileAnalysis } from "#src/types/file-analysis.ts"; +import type { AnalyzedFileResult, FileAnalyzer } from "#src/types/pipeline.ts"; export interface LlmFileAnalyzerDeps { buildSystemPrompt: () => string; @@ -33,24 +34,45 @@ interface RawAnalysisJson { export function createLlmFileAnalyzer(deps: LlmFileAnalyzerDeps): FileAnalyzer { return { - async analyze(input: { 
relativePath: string; content: string }): Promise { + async analyze(input: { + relativePath: string; + content: string; + llmCallContext?: AskLlmOptions; + }): Promise { const systemPrompt = deps.buildSystemPrompt(); const userPrompt = deps.buildUserPrompt(input); + const t0 = performance.now(); let raw: RawAnalysisJson | null = null; + let usage: { inputTokens: number; outputTokens: number; costUsd: number } | undefined; try { - const response = await askJsonLLM(systemPrompt, userPrompt); + const response = await askJsonLLM(systemPrompt, userPrompt, input.llmCallContext ?? {}); raw = response.result; + usage = { + inputTokens: response.usage.inputTokens, + outputTokens: response.usage.outputTokens, + costUsd: response.usage.costUsd, + }; if (raw === null) { logger.warn(`llm-file-analyzer: ${input.relativePath} returned unparseable JSON`); } } catch (cause: unknown) { + if (cause instanceof LlmConfigError || cause instanceof LlmError) { + // LLM is unreachable / misconfigured — bubble up so the runner can + // mark the knowledge FAILED with a structured reason. + throw cause; + } const msg = cause instanceof Error ? 
cause.message : String(cause); logger.warn(`llm-file-analyzer: ${input.relativePath} askJsonLLM failed: ${msg}`); } if (raw === null) { - return { language: FALLBACK_LANGUAGE, analysis: emptyFileAnalysis() }; + return { language: FALLBACK_LANGUAGE, analysis: emptyFileAnalysis(), tokenUsage: usage }; } - return shapeAnalysis(raw); + const shaped = shapeAnalysis(raw); + shaped.tokenUsage = usage; + logger.info( + `llm-file-analyzer: ✓ ${input.relativePath} (${Math.round(performance.now() - t0)}ms, lang=${shaped.language})`, + ); + return shaped; }, }; } diff --git a/packages/ingest-github/src/bootstrap.ts b/packages/ingest-github/src/bootstrap.ts new file mode 100644 index 0000000..69bdda6 --- /dev/null +++ b/packages/ingest-github/src/bootstrap.ts @@ -0,0 +1,16 @@ +import { seedConfig } from "@bb/config"; +import { seedLoggerFactory, type LoggerFactory } from "@bb/logger"; +import { connectMongo } from "@bb/mongo"; +import { connectNeo4j } from "@bb/neo4j"; + +export interface BootstrapRuntimeOptions { + config: unknown; + loggerFactory: LoggerFactory; +} + +export async function bootstrapRuntime(opts: BootstrapRuntimeOptions): Promise { + seedConfig(opts.config); + seedLoggerFactory(opts.loggerFactory); + await connectMongo(); + await connectNeo4j(); +} diff --git a/packages/ingest-github/src/githubApi.ts b/packages/ingest-github/src/githubApi.ts index 7c88b8c..ad2301f 100644 --- a/packages/ingest-github/src/githubApi.ts +++ b/packages/ingest-github/src/githubApi.ts @@ -1,43 +1,10 @@ /** - * Minimal GitHub REST helpers used by the pull flow. + * Repository and branch information fetching from GitHub REST API. * - * Public repo only models GitHub (no Bitbucket), so this stays small — - * a URL parser and a single branch-head lookup. Both are best-effort: - * `null` on parse failure or non-2xx so callers can fall back without - * try/catch noise. 
+ * SPDX-License-Identifier: AGPL-3.0-only WITH non-commercial-clause */ -const USER_AGENT = "ByteBell"; - -export interface ParsedRepo { - owner: string; - repo: string; -} - -/** Parses `https://github.com/{owner}/{repo}(.git)?(/...)?` → `{owner, repo}`. */ -export function parseGithubRepo(repoUrl: string): ParsedRepo | null { - if (!repoUrl) { - return null; - } - try { - const url = new URL(repoUrl); - if (!url.hostname.endsWith("github.com")) { - return null; - } - const segments = url.pathname.split("/").filter((s) => s.length > 0); - if (segments.length < 2) { - return null; - } - const owner = segments[0]; - const repoRaw = segments[1]; - if (owner === undefined || repoRaw === undefined) { - return null; - } - return { owner, repo: repoRaw.replace(/\.git$/u, "") }; - } catch { - return null; - } -} +import { parseGithubRepo, USER_AGENT } from "./githubUrl.ts"; /** * Resolves the head SHA of `branch` on `repoUrl`. Returns `null` for any @@ -73,12 +40,109 @@ export async function fetchLatestCommitHash( return typeof sha === "string" && sha.length > 0 ? sha : null; } +export type DefaultBranchResult = + | { status: "ok"; branch: string } + | { status: "not_found" } + | { status: "unauthorized" } + | { status: "rate_limited" } + | { status: "error"; message: string }; + +/** + * Fetches the default branch name of `repoUrl`. Returns a detailed result + * so callers can distinguish between private repos, rate limits, and errors. 
+ */ +export async function fetchDefaultBranch(repoUrl: string, gitToken?: string): Promise { + const parsed = parseGithubRepo(repoUrl); + if (parsed === null) { + return { status: "error", message: `unparseable github url: ${repoUrl}` }; + } + + const headers: Record = { + Accept: "application/vnd.github+json", + "User-Agent": USER_AGENT, + "X-GitHub-Api-Version": "2022-11-28", + }; + if (gitToken !== undefined && gitToken.length > 0) { + headers["Authorization"] = `Bearer ${gitToken}`; + } + + const url = `https://api.github.com/repos/${parsed.owner}/${parsed.repo}`; + let response: Response; + try { + response = await fetch(url, { headers }); + } catch (cause: unknown) { + const msg = cause instanceof Error ? cause.message : String(cause); + return { status: "error", message: `github fetch failed: ${msg}` }; + } + + if (response.status === 404) { + return { status: "not_found" }; + } + if (response.status === 401) { + return { status: "unauthorized" }; + } + if (response.status === 403 && response.headers.get("x-ratelimit-remaining") === "0") { + return { status: "rate_limited" }; + } + if (!response.ok) { + const body = await response.text().catch(() => ""); + return { status: "error", message: `github ${response.status}: ${body.slice(0, 200)}` }; + } + + const body = (await response.json()) as { default_branch?: unknown }; + const branch = body.default_branch; + if (typeof branch === "string" && branch.length > 0) { + return { status: "ok", branch }; + } + return { status: "error", message: "github API returned empty default_branch" }; +} + +/** + * Fetches the list of branches for `repoUrl`. 
+ */ +export async function fetchBranches( + repoUrl: string, + gitToken?: string, + limit = 100, +): Promise<{ status: "ok"; branches: string[] } | { status: "error"; message: string }> { + const parsed = parseGithubRepo(repoUrl); + if (parsed === null) { + return { status: "error", message: `unparseable github url: ${repoUrl}` }; + } + + const headers: Record = { + Accept: "application/vnd.github+json", + "User-Agent": USER_AGENT, + "X-GitHub-Api-Version": "2022-11-28", + }; + if (gitToken !== undefined && gitToken.length > 0) { + headers["Authorization"] = `Bearer ${gitToken}`; + } + + const url = `https://api.github.com/repos/${parsed.owner}/${parsed.repo}/branches?per_page=${limit}`; + let response: Response; + try { + response = await fetch(url, { headers }); + } catch (cause: unknown) { + const msg = cause instanceof Error ? cause.message : String(cause); + return { status: "error", message: `github fetch failed: ${msg}` }; + } + + if (!response.ok) { + const body = await response.text().catch(() => ""); + return { status: "error", message: `github ${response.status}: ${body.slice(0, 200)}` }; + } + + const body = (await response.json()) as Array<{ name?: unknown }>; + const branches = body.map((b) => b.name).filter((name): name is string => typeof name === "string"); + return { status: "ok", branches }; +} + export interface CommitEntry { - hash: string; - shortHash: string; - subject: string; + sha: string; + message: string; author: string; - date: string; + timestamp: string; } export type FetchCommitsResult = @@ -88,40 +152,19 @@ export type FetchCommitsResult = | { status: "rate_limited" } | { status: "error"; message: string }; -const COMMITS_PAGE_SIZE = 100; - -interface GithubCommitPayload { - sha?: unknown; - commit?: { - message?: unknown; - author?: { name?: unknown; date?: unknown } | null; - committer?: { name?: unknown; date?: unknown } | null; - }; -} - /** - * Fetches up to `limit` commits on `branch` via GitHub's REST API. 
The - * server route uses this in place of `git log` against a shallow local - * clone — the picker should not depend on clone state. - * - * Paginates over `/commits` (capped at 100 per page) until either `limit` - * is reached or the upstream returns a short page. Unauthenticated calls - * work for public repos; private repos answer 404 until a token is - * supplied, at which point the CLI re-requests with `Authorization`. + * Fetches recent commits for a repository on a specific branch. */ export async function fetchRecentCommits( repoUrl: string, branch: string, - limit: number, + limit = 10, gitToken?: string, ): Promise { const parsed = parseGithubRepo(repoUrl); if (parsed === null) { return { status: "error", message: `unparseable github url: ${repoUrl}` }; } - if (limit <= 0) { - return { status: "ok", commits: [] }; - } const headers: Record = { Accept: "application/vnd.github+json", @@ -132,83 +175,47 @@ export async function fetchRecentCommits( headers["Authorization"] = `Bearer ${gitToken}`; } - const collected: CommitEntry[] = []; - let page = 1; - while (collected.length < limit) { - const remaining = limit - collected.length; - const perPage = Math.min(COMMITS_PAGE_SIZE, remaining); - const url = - `https://api.github.com/repos/${parsed.owner}/${parsed.repo}/commits` + - `?sha=${encodeURIComponent(branch)}&per_page=${perPage}&page=${page}`; - - let response: Response; - try { - response = await fetch(url, { headers }); - } catch (cause: unknown) { - const msg = cause instanceof Error ? 
cause.message : String(cause); - return { status: "error", message: `github fetch failed: ${msg}` }; - } - - if (response.status === 404) { - return { status: "not_found" }; - } - if (response.status === 401) { - return { status: "unauthorized" }; - } - if (response.status === 403 && response.headers.get("x-ratelimit-remaining") === "0") { - return { status: "rate_limited" }; - } - if (!response.ok) { - const body = await response.text().catch(() => ""); - return { status: "error", message: `github ${response.status}: ${body.slice(0, 200)}` }; - } - - const payload = (await response.json()) as GithubCommitPayload[]; - if (!Array.isArray(payload) || payload.length === 0) { - break; - } - for (const item of payload) { - const entry = toCommitEntry(item); - if (entry !== null) { - collected.push(entry); - if (collected.length >= limit) { - break; - } - } - } - if (payload.length < perPage) { - break; - } - page += 1; - } - - return { status: "ok", commits: collected }; -} + const url = `https://api.github.com/repos/${parsed.owner}/${parsed.repo}/commits?per_page=${limit}&sha=${encodeURIComponent(branch)}`; + let response: Response; + try { + response = await fetch(url, { headers }); + } catch (cause: unknown) { + const msg = cause instanceof Error ? cause.message : String(cause); + return { status: "error", message: `github fetch failed: ${msg}` }; + } -function toCommitEntry(raw: GithubCommitPayload): CommitEntry | null { - const sha = raw.sha; - if (typeof sha !== "string" || sha.length === 0) { - return null; + if (response.status === 404) { + return { status: "not_found" }; } - const message = typeof raw.commit?.message === "string" ? raw.commit.message : ""; - const subjectLine = message.split("\n", 1)[0] ?? ""; - const authorName = - typeof raw.commit?.author?.name === "string" - ? raw.commit.author.name - : typeof raw.commit?.committer?.name === "string" - ? 
raw.commit.committer.name - : ""; - const authorDate = - typeof raw.commit?.author?.date === "string" - ? raw.commit.author.date - : typeof raw.commit?.committer?.date === "string" - ? raw.commit.committer.date - : ""; - return { - hash: sha, - shortHash: sha.slice(0, 7), - subject: subjectLine, - author: authorName, - date: authorDate, - }; + if (response.status === 401) { + return { status: "unauthorized" }; + } + if (response.status === 403 && response.headers.get("x-ratelimit-remaining") === "0") { + return { status: "rate_limited" }; + } + if (!response.ok) { + const body = await response.text().catch(() => ""); + return { status: "error", message: `github ${response.status}: ${body.slice(0, 200)}` }; + } + + const body = (await response.json()) as Array<{ + sha?: unknown; + commit?: { message?: unknown; author?: { date?: unknown } }; + author?: { login?: unknown }; + }>; + + const commits = body + .map((c) => { + const sha = typeof c.sha === "string" ? c.sha : ""; + const message = typeof c.commit?.message === "string" ? c.commit.message : ""; + const author = typeof c.author?.login === "string" ? c.author.login : ""; + const timestamp = typeof c.commit?.author?.date === "string" ? c.commit.author.date : ""; + return { sha, message, author, timestamp }; + }) + .filter((c): c is CommitEntry => Boolean(c.sha && c.message)); + + return { status: "ok", commits }; } + +export { parseGithubRepo } from "./githubUrl.ts"; +export type { ParsedRepo } from "./githubUrl.ts"; diff --git a/packages/ingest-github/src/githubCommit.ts b/packages/ingest-github/src/githubCommit.ts new file mode 100644 index 0000000..8b2d789 --- /dev/null +++ b/packages/ingest-github/src/githubCommit.ts @@ -0,0 +1,147 @@ +/** + * Commit fetching from GitHub REST API. 
+ * + * SPDX-License-Identifier: AGPL-3.0-only WITH non-commercial-clause + */ + +import { parseGithubRepo, USER_AGENT } from "./githubUrl.ts"; + +export interface CommitEntry { + hash: string; + shortHash: string; + subject: string; + author: string; + date: string; +} + +export type FetchCommitsResult = + | { status: "ok"; commits: CommitEntry[] } + | { status: "not_found" } + | { status: "unauthorized" } + | { status: "rate_limited" } + | { status: "error"; message: string }; + +const COMMITS_PAGE_SIZE = 100; + +interface GithubCommitPayload { + sha?: unknown; + commit?: { + message?: unknown; + author?: { name?: unknown; date?: unknown } | null; + committer?: { name?: unknown; date?: unknown } | null; + }; +} + +/** + * Fetches up to `limit` commits on `branch` via GitHub's REST API. The + * server route uses this in place of `git log` against a shallow local + * clone — the picker should not depend on clone state. + * + * Paginates over `/commits` (capped at 100 per page) until either `limit` + * is reached or the upstream returns a short page. Unauthenticated calls + * work for public repos; private repos answer 404 until a token is + * supplied, at which point the CLI re-requests with `Authorization`. 
+ */ +export async function fetchRecentCommits( + repoUrl: string, + branch: string, + limit: number, + gitToken?: string, +): Promise { + const parsed = parseGithubRepo(repoUrl); + if (parsed === null) { + return { status: "error", message: `unparseable github url: ${repoUrl}` }; + } + if (limit <= 0) { + return { status: "ok", commits: [] }; + } + + const headers: Record = { + Accept: "application/vnd.github+json", + "User-Agent": USER_AGENT, + "X-GitHub-Api-Version": "2022-11-28", + }; + if (gitToken !== undefined && gitToken.length > 0) { + headers["Authorization"] = `Bearer ${gitToken}`; + } + + const collected: CommitEntry[] = []; + let page = 1; + while (collected.length < limit) { + const remaining = limit - collected.length; + const perPage = Math.min(COMMITS_PAGE_SIZE, remaining); + const url = + `https://api.github.com/repos/${parsed.owner}/${parsed.repo}/commits` + + `?sha=${encodeURIComponent(branch)}&per_page=${perPage}&page=${page}`; + + let response: Response; + try { + response = await fetch(url, { headers }); + } catch (cause: unknown) { + const msg = cause instanceof Error ? 
cause.message : String(cause); + return { status: "error", message: `github fetch failed: ${msg}` }; + } + + if (response.status === 404) { + return { status: "not_found" }; + } + if (response.status === 401) { + return { status: "unauthorized" }; + } + if (response.status === 403 && response.headers.get("x-ratelimit-remaining") === "0") { + return { status: "rate_limited" }; + } + if (!response.ok) { + const body = await response.text().catch(() => ""); + return { status: "error", message: `github ${response.status}: ${body.slice(0, 200)}` }; + } + + const payload = (await response.json()) as GithubCommitPayload[]; + if (!Array.isArray(payload) || payload.length === 0) { + break; + } + for (const item of payload) { + const entry = toCommitEntry(item); + if (entry !== null) { + collected.push(entry); + if (collected.length >= limit) { + break; + } + } + } + if (payload.length < perPage) { + break; + } + page += 1; + } + + return { status: "ok", commits: collected }; +} + +function toCommitEntry(raw: GithubCommitPayload): CommitEntry | null { + const sha = raw.sha; + if (typeof sha !== "string" || sha.length === 0) { + return null; + } + const message = typeof raw.commit?.message === "string" ? raw.commit.message : ""; + const subjectLine = message.split("\n", 1)[0] ?? ""; + const authorName = + typeof raw.commit?.author?.name === "string" + ? raw.commit.author.name + : typeof raw.commit?.committer?.name === "string" + ? raw.commit.committer.name + : ""; + const authorDate = + typeof raw.commit?.author?.date === "string" + ? raw.commit.author.date + : typeof raw.commit?.committer?.date === "string" + ? 
raw.commit.committer.date + : ""; + return { + hash: sha, + shortHash: sha.slice(0, 7), + subject: subjectLine, + author: authorName, + date: authorDate, + }; +} diff --git a/packages/ingest-github/src/githubUrl.ts b/packages/ingest-github/src/githubUrl.ts new file mode 100644 index 0000000..e080101 --- /dev/null +++ b/packages/ingest-github/src/githubUrl.ts @@ -0,0 +1,53 @@ +/** + * GitHub URL helpers shared by the REST modules. + * + * Public repo only models GitHub (no Bitbucket), so this stays small — + * the shared `USER_AGENT` constant and a repo-URL parser. Parsing is + * best-effort: `null` on any parse failure so callers can fall back without + * try/catch noise. + * + * SPDX-License-Identifier: AGPL-3.0-only WITH non-commercial-clause + */ + +export const USER_AGENT = "ByteBell"; + +export interface ParsedRepo { + owner: string; + repo: string; + branch?: string; +} + +/** + * Parses `https://github.com/{owner}/{repo}(.git)?(/tree/{branch})?` → `{owner, repo, branch?}`. + */ +export function parseGithubRepo(repoUrl: string): ParsedRepo | null { + if (!repoUrl) { + return null; + } + try { + const url = new URL(repoUrl); + if (!url.hostname.endsWith("github.com")) { + return null; + } + const segments = url.pathname.split("/").filter((s) => s.length > 0); + if (segments.length < 2) { + return null; + } + const owner = segments[0]; + const repoRaw = segments[1]; + if (owner === undefined || repoRaw === undefined) { + return null; + } + const repo = repoRaw.replace(/\.git$/u, ""); + const out: ParsedRepo = { owner, repo }; + + // Support https://github.com/owner/repo/tree/branch-name + if (segments[2] === "tree" && segments.length > 3) { + out.branch = segments.slice(3).join("/"); + } + + return out; + } catch { + return null; + } +} diff --git a/packages/ingest-github/src/handlers/README.md b/packages/ingest-github/src/handlers/README.md index 934ca3a..edfb36c 100644 --- a/packages/ingest-github/src/handlers/README.md +++ 
b/packages/ingest-github/src/handlers/README.md @@ -8,8 +8,11 @@ no clone — those belong in `pipeline/run.ts`. - `ingest-job.ts` — `createGithubIngestHandler(deps)` and `createLocalIngestHandler(deps)` both return BullMQ-shaped - `(msg) => Promise` callbacks. They throw `IngestError` on validation - failures; everything else propagates to BullMQ as the worker's failure path. + `(msg) => Promise<PipelineSummary>` callbacks (the summary carries + per-commit `tokenUsage` including `costUsd` so the enterprise wrapper + can mirror it to the knowledge record without a `processing_stats` + round-trip). They throw `IngestError` on validation failures; + everything else propagates to BullMQ as the worker's failure path. - `README.md` — this file. ## Invariants diff --git a/packages/ingest-github/src/handlers/ingest-job.ts b/packages/ingest-github/src/handlers/ingest-job.ts index 39580af..0853a5f 100644 --- a/packages/ingest-github/src/handlers/ingest-job.ts +++ b/packages/ingest-github/src/handlers/ingest-job.ts @@ -1,7 +1,8 @@ import type { GithubIndexPayload, JobMessage, LocalIngestPayload } from "@bb/types"; import { IngestError } from "@bb/errors"; -import { isEnvelopeCoherent, narrowGithubIngest, narrowLocalIngest } from "src/payload/narrow.ts"; -import type { IngestRunnerDeps } from "src/types/ingest-runner.ts"; +import { isEnvelopeCoherent, narrowGithubIngest, narrowLocalIngest } from "#src/payload/narrow.ts"; +import type { IngestRunnerDeps } from "#src/types/ingest-runner.ts"; +import type { PipelineSummary } from "#src/types/pipeline.ts"; export interface IngestJobHandlerDeps { runner: IngestRunnerDeps; @@ -9,8 +10,8 @@ export interface IngestJobHandlerDeps { export function createGithubIngestHandler( deps: IngestJobHandlerDeps, -): (msg: JobMessage) => Promise { - return async function handleGithubIngest(msg: JobMessage): Promise { +): (msg: JobMessage) => Promise { + return async function handleGithubIngest(msg: JobMessage): Promise { const payload = 
narrowGithubIngest(msg.knowledgeId, msg.payload); if (!isEnvelopeCoherent(msg.knowledgeId, payload.knowledgeId)) { throw new IngestError( @@ -18,14 +19,14 @@ export function createGithubIngestHandler( `envelope mismatch: job.knowledgeId=${msg.knowledgeId} payload.knowledgeId=${payload.knowledgeId}`, ); } - await deps.runner.run({ job: msg, payload }); + return await deps.runner.run({ job: msg, payload }); }; } export function createLocalIngestHandler( deps: IngestJobHandlerDeps, -): (msg: JobMessage) => Promise { - return async function handleLocalIngest(msg: JobMessage): Promise { +): (msg: JobMessage) => Promise { + return async function handleLocalIngest(msg: JobMessage): Promise { const payload = narrowLocalIngest(msg.knowledgeId, msg.payload); if (!isEnvelopeCoherent(msg.knowledgeId, payload.knowledgeId)) { throw new IngestError( @@ -33,6 +34,6 @@ export function createLocalIngestHandler( `envelope mismatch: job.knowledgeId=${msg.knowledgeId} payload.knowledgeId=${payload.knowledgeId}`, ); } - await deps.runner.run({ job: msg, payload }); + return await deps.runner.run({ job: msg, payload }); }; } diff --git a/packages/ingest-github/src/index.ts b/packages/ingest-github/src/index.ts index 49afde3..efd5348 100644 --- a/packages/ingest-github/src/index.ts +++ b/packages/ingest-github/src/index.ts @@ -10,25 +10,36 @@ import { COMBINED_CODE_ANALYSIS_SYSTEM_PROMPT, buildFileAnalysisUserPrompt, } from "./strategies/flat-folder/prompts/file-analysis.ts"; -import type { SourceFactory } from "./types/pipeline.ts"; +import type { PullFactory, SourceFactory } from "./types/pipeline.ts"; +import type { ProgressContextFactory } from "./progress/types.ts"; +import { nullProgressContextFactory } from "./progress/NullProgressReporter.ts"; /** - * Optional dependencies for the GitHub workers. Today only one field is - * exposed: a source factory. Documented in `docs/extension-points.md`. 
- * The open-source binary leaves this undefined — the default disk reader - * runs unchanged. + * Optional dependencies for the GitHub workers. Factories are documented in + * `docs/extension-points.md`. The open-source binary leaves them undefined — + * index and pull use the default disk-backed readers, and progress events + * are discarded by `nullProgressContextFactory`. */ export interface RegisterGithubWorkersDeps { sourceFactory?: SourceFactory; + pullFactory?: PullFactory; + progressContextFactory?: ProgressContextFactory; } -function buildRunner(sourceFactory: SourceFactory | undefined): ReturnType { +function buildRunner( + sourceFactory: SourceFactory | undefined, + progressContextFactory: ProgressContextFactory, +): ReturnType { const fileAnalyzer = createLlmFileAnalyzer({ buildSystemPrompt: () => COMBINED_CODE_ANALYSIS_SYSTEM_PROMPT, buildUserPrompt: buildFileAnalysisUserPrompt, }); - const strategy = createFlatFolderStrategy({ fileAnalyzer }); - const runnerDeps: Parameters[0] = { reposRootDir: reposRoot(), strategy }; + const strategy = createFlatFolderStrategy({ fileAnalyzer, progressContextFactory }); + const runnerDeps: Parameters[0] = { + reposRootDir: reposRoot(), + strategy, + progressContextFactory, + }; if (sourceFactory !== undefined) { runnerDeps.sourceFactory = sourceFactory; } @@ -36,19 +47,49 @@ function buildRunner(sourceFactory: SourceFactory | undefined): ReturnType`; the handler now returns + // `Promise` so the enterprise queue bridge can mirror + // per-commit tokens + cost into the knowledge record. The OSS in-process + // worker discards the summary — local stats are read off + // `source.commitHashes[]` via `bytebell stats` instead. 
+ const indexHandler = createGithubIngestHandler({ runner }); + registerWorker(JobType.GithubIndex, async (msg) => { + await indexHandler(msg); + }); + const pullFactory = deps.pullFactory; + registerWorker(JobType.GithubPull, async (msg) => { + await runPull(msg, pullFactory, progressContextFactory); + }); } export function registerLocalIngestWorker(): void { - const runner = buildRunner(undefined); - registerWorker(JobType.LocalIngest, createLocalIngestHandler({ runner })); + const runner = buildRunner(undefined, nullProgressContextFactory); + const localHandler = createLocalIngestHandler({ runner }); + registerWorker(JobType.LocalIngest, async (msg) => { + await localHandler(msg); + }); } export { createFlatFolderStrategy } from "./strategies/flat-folder/index.ts"; export { createLlmFileAnalyzer } from "./adapters/llm-file-analyzer.ts"; export { createDiskSourceReader } from "./pipeline/disk-source-reader.ts"; +export { createPipelineRunner } from "./pipeline/run.ts"; +export type { CreatePipelineRunnerDeps } from "./pipeline/run.ts"; +export { createGithubIngestHandler, createLocalIngestHandler } from "./handlers/ingest-job.ts"; +export type { IngestJobHandlerDeps } from "./handlers/ingest-job.ts"; +export { runPull } from "./pipeline/pull.ts"; +export { + reposRoot, + repoCloneDir, + metaRootFor, + metaPathsFor, + commitMetaDir, + businessContextDir, + orgRegistryDir, +} from "./pipeline/paths.ts"; +export type { IngestRunnerDeps, IngestRunnerInput } from "./types/ingest-runner.ts"; export type { IngestStrategy, StrategyInput, StrategyResult, StrategyContext } from "./types/strategy.ts"; export type { FileAnalyzer, @@ -63,7 +104,32 @@ export type { SourceFactory, SourceFactoryInput, SourceFactoryResult, + PullFactory, + PullFactoryInput, + PullFactoryResult, } from "./types/pipeline.ts"; +export type { DiffResult, RenamedFile } from "./pipeline/git-diff.ts"; export type { CondensedFileAnalysis } from "./types/condensed-file-analysis.ts"; -export { 
fetchLatestCommitHash, fetchRecentCommits, parseGithubRepo } from "./githubApi.ts"; -export type { CommitEntry, FetchCommitsResult, ParsedRepo } from "./githubApi.ts"; +export { + fetchLatestCommitHash, + fetchRecentCommits, + fetchDefaultBranch, + fetchBranches, + parseGithubRepo, +} from "./githubApi.ts"; +export type { CommitEntry, FetchCommitsResult, ParsedRepo, DefaultBranchResult } from "./githubApi.ts"; +export { bootstrapRuntime } from "./bootstrap.ts"; +export type { BootstrapRuntimeOptions } from "./bootstrap.ts"; +export { + COMBINED_CODE_ANALYSIS_SYSTEM_PROMPT, + buildFileAnalysisUserPrompt, +} from "./strategies/flat-folder/prompts/file-analysis.ts"; +export type { + ProgressContext, + ProgressContextFactory, + ProgressPhase, + ProgressReporter, + ProgressReporterInput, + ProgressTotalMode, +} from "./progress/types.ts"; +export { nullProgressContextFactory } from "./progress/NullProgressReporter.ts"; diff --git a/packages/ingest-github/src/payload/narrow.ts b/packages/ingest-github/src/payload/narrow.ts index e6a5b36..c7f818b 100644 --- a/packages/ingest-github/src/payload/narrow.ts +++ b/packages/ingest-github/src/payload/narrow.ts @@ -1,6 +1,32 @@ -import type { GithubIndexPayload, LocalIngestPayload } from "@bb/types"; +import type { GithubIndexPayload, LocalIngestPayload, PayloadLlmOverrides } from "@bb/types"; import { IngestError } from "@bb/errors"; +/** + * Copies optional LLM credential / model overrides from a payload record onto + * a typed payload. Enterprise wrappers resolve per-org credentials at the + * enqueue boundary and stamp them on the BullMQ payload; without this passthrough + * the worker would always fall back to global config (and the resolver work is + * wasted). OSS standalone leaves all four unset, so nothing happens here. 
+ */ +function attachLlmOverrides(rec: Record, target: PayloadLlmOverrides): void { + const apiKey = rec["llmApiKey"]; + if (typeof apiKey === "string" && apiKey.length > 0) { + target.llmApiKey = apiKey; + } + const provider = rec["llmProvider"]; + if (typeof provider === "string" && provider.length > 0) { + target.llmProvider = provider; + } + const model = rec["llmModel"]; + if (typeof model === "string" && model.length > 0) { + target.llmModel = model; + } + const keyId = rec["llmKeyId"]; + if (typeof keyId === "string" && keyId.length > 0) { + target.llmKeyId = keyId; + } +} + export function narrowGithubIngest(knowledgeId: string, payload: unknown): GithubIndexPayload { if (typeof payload !== "object" || payload === null) { throw new IngestError(knowledgeId, "github_index payload must be an object"); @@ -31,6 +57,7 @@ export function narrowGithubIngest(knowledgeId: string, payload: unknown): Githu if (typeof orgId === "string" && orgId.length > 0) { out.orgId = orgId; } + attachLlmOverrides(rec, out); return out; } @@ -52,6 +79,7 @@ export function narrowLocalIngest(knowledgeId: string, payload: unknown): LocalI if (typeof orgId === "string" && orgId.length > 0) { out.orgId = orgId; } + attachLlmOverrides(rec, out as PayloadLlmOverrides); return out; } diff --git a/packages/ingest-github/src/pipeline/README.md b/packages/ingest-github/src/pipeline/README.md index b3fecdb..7c7b0d6 100644 --- a/packages/ingest-github/src/pipeline/README.md +++ b/packages/ingest-github/src/pipeline/README.md @@ -28,7 +28,7 @@ Domain (sub-folder of `@bb/ingest-github`). - `skip-decisions/` — LLM-backed unknown-extension gate. See `skip-decisions/README.md`. Active when `Config.SkipDecisionEnabled = true` (default). Consumed by `scan.ts` via the optional `skipDecider` - dep; built by `classifyAndAnalyseSmall` if not injected. + dep; built by `scanAndClassify` (Phase 1) if not injected. 
- `disk-source-reader.ts` — `createDiskSourceReader({ repoDir, commitHash })` returns a `SourceReader` that wraps `scanRepository` + `node:fs.readFile`. The default reader the open-source binary always uses, unless the caller @@ -38,9 +38,30 @@ true` (default). Consumed by `scan.ts` via the optional `skipDecider` byte size exceeds `Config.AbsoluteFileSizeCap` (skipped before read) or when its line count exceeds `Config.BigFileLineThreshold` (default 1200; enters the big-file phase). Both thresholds are config-driven — no - magic numbers in this file. `readScannedFile` re-reads a file by - absolute path for the big-file phase which streams content lazily. -- `run.ts` — `createPipelineRunner({ reposRootDir, strategy, sourceFactory? })` + magic numbers in this file. `deps.llmCallContext` (when present) is + forwarded into every `SkipDeciderInput` so the LLM branch of the + unknown-extension gate uses per-job credentials. + + **Two scan modes:** + - **Two-pass (default for the flat-folder strategy)** — activated when + `deps.skipDecider` AND `deps.limiter` are both supplied. Pass 1 walks + the tree calling `decider.decideStatic(...)`; static-resolved files + yield immediately, "needs LLM" files go into a pending buffer with + their content. Pass 2 dedupes pending entries by `ext:` or + `filename:`, dispatches one `decider.decideAndDeferSave(...)` per + unique key through the shared limiter via `Promise.all`, then calls + `decider.persist()` exactly once. Pass 3 drains pending — every + `decideStatic` call is now a cache hit, so the drain is sync at the + decider boundary and yields each kept file with its buffered content. + - **Legacy inline (`walk()`)** — used when `deps.limiter` is omitted (e.g. + a custom `SourceFactory` consumer that didn't opt in). Inline `await +deps.skipDecider.decide(input)` per file. Same semantics as before this + refactor; preserved for backwards compatibility. 
+ + `readScannedFile` re-reads a file by absolute path for the big-file phase + which streams content lazily. + +- `run.ts` — `createPipelineRunner({ reposRootDir, strategy, sourceFactory?, progressContextFactory? })` builds an `IngestRunnerDeps`. GitHub payloads run: branch resolve, source-reader construction, strategy execute, commit persistence. Local payloads skip the clone. The source reader is chosen by the optional @@ -50,11 +71,64 @@ true` (default). Consumed by `scan.ts` via the optional `skipDecider` with `{ knowledgeId, payload, branch }` and uses the returned reader + commit hash; the local clone is skipped. The factory may also return an `archiveSink` which the strategy then threads through to its - analyse phase. `resolveOrgId(payload)` returns - `payload.orgId ?? getConfigValue(Config.OrgId)` — the only place orgId - is resolved. State transitions (`CREATED → QUEUED → INGESTED → …`) are - persisted to Mongo + Neo4j via `transitionState`, and - `CancellationError` is re-thrown without flipping to FAILED. + analyse phase. State transitions (`CREATED → QUEUED → INGESTED → …`) are + persisted to Mongo + Neo4j via `transitionState`, and `CancellationError` + is re-thrown without flipping to FAILED. The optional + `progressContextFactory` is the runner's own `ProgressContext` source: + `runGithub` emits `phaseChanged("clone")` before `syncRepository` (or before the + `sourceFactory` call) and `phaseChanged("scan")` before invoking + `strategy.execute`, so SSE clients see liveness during the + network/disk-bound prelude. On a non-`CancellationError` throw the + runner emits `failed(message)` only when the strategy has not yet + started — once `strategy.execute` is reached, the strategy owns + terminal emission and the runner stays silent to avoid double-FAILED. +- `pull.ts` — `runPull(msg, pullFactory?, progressContextFactory?)` orchestrates the pull job. 
Returns `Promise<PipelineSummary>` (was `Promise<void>`); the returned `tokenUsage` carries `inputTokens`, `outputTokens`, and `costUsd` summed across the pull phases for callers (e.g. the enterprise queue bridge) that need to mirror the run into a knowledge record. + Reads `repoUrl` and `branch` directly off `knowledge.info.*` (loaded via + `@bb/mongo.getKnowledge`). The `KnowledgeSource` discriminator (`kind`) is + still read off `knowledge.source` along with `commitId`/`commitHashes`, but + the repo coordinates themselves live on `info` — no fallback chain. + When `pullFactory` is provided, it returns `{source, diff, targetCommit, +archiveSink?}` and `runPull` skips `syncRepository`, `materialiseEndpoints`, + `assertReachableFromBranch`, `computePullDiff`, and `checkoutCommit` — + the factory is the sole source of truth. When `pullFactory` is undefined + (open-source default), the legacy git-based path runs. Either path + produces the same downstream pipeline: snapshot prior version, + `analyseChangedFiles` (now reading via `SourceReader`), + `processBigFilesQueue`, `backfillMissingFields`, + `runSelectiveFolderSummary`, `summariseRepo`, `storePullAnalysis`. + Mirrors the index-side strategy orchestrator for progress: builds one + `ProgressContext` per job from the optional `progressContextFactory` + (default `nullProgressContextFactory`), emits `phaseChanged` at + `file_analysis` / `folder_analysis` / `indexing` boundaries, threads + the context into every phase that takes a `progressContext?` field, + and finishes with `completed()` on success or `failed(message)` on a + non-`CancellationError` throw. +- `stats.ts` — small shared helpers: `repoNameFromUrl` parses an owner/repo + display name out of a GitHub URL with a graceful fallback, `localRepoName` + derives a name from a local path, and `describe` flattens an `unknown` + cause to a short string for `IngestError` messages.
The previous + `persistStats` write into the `processing_stats` collection has been + removed — per-commit token and cost data now lives on the knowledge + document's `source.commitHashes[]` (set by `setKnowledgeCommit` from + `@bb/mongo`), with the per-call `costUsd` sourced directly from + OpenRouter's `response.usage.cost`. +- `failure-classifier.ts` — `classifyFailure(cause)` returns + `{ reason, category, detail? }` for any thrown ingestion error. + `LlmConfigError` → `llm_config`. `LlmError` is subdivided by its + `status` field: `401`/`403` → `llm_auth`, `402` → `llm_quota`, `429` → + `llm_rate_limit`, `5xx`, no status, or any other status → `llm_unreachable`. + Any error that is neither `LlmConfigError` nor `LlmError` → `internal`. Each category produces a single short operator-readable + `reason` sentence; the raw provider response body lives in `detail`. + Used by `run.ts`/`pull.ts` catch blocks (Mongo persistence via + `markKnowledgeFailed`) and `strategies/flat-folder/index.ts` (SSE event + via `progressContext.failed`) so both paths share one classification. +- `context.ts` — shared helpers to resolve pipeline organization IDs and parse + optional LLM context parameter overrides from payload messages: + `resolveOrgId(payload)` returns `payload.orgId` when it is a non-empty string, else `getConfigValue(Config.OrgId)` + (the only place orgId is resolved), and `llmCallContextFromPayload(payload)` + extracts the optional `{ llmApiKey, llmProvider, llmModel }` overrides + from the payload and packs them into an `AskLlmOptions` bag (`undefined` when none are set) stored on + `StrategyContext.llmCallContext`. - `branch.ts` — `resolveBranch(knowledgeId, payload)`. Defaults to `main` when the payload omits it; rejects branch names that don't match `^[\w./-]+$` with `IngestError` (defence against shell-injection into git args). @@ -72,10 +146,10 @@ true` (default). Consumed by `scan.ts` via the optional `skipDecider` - Sibling files in this folder may import each other. - Down: `src/types/*` only (intra-package, via the `src/*` alias).
- Up: `@bb/config`, `@bb/types`, `@bb/errors`, `@bb/logger`, `node:*`. -- `run.ts` additionally imports `@bb/mongo`, `@bb/neo4j`, and `@bb/llm` - for state transitions, graph state writes, and cost estimation - respectively — it is the orchestrator that owns those side effects so - the strategies stay pure. +- `run.ts` and `pull.ts` additionally import `@bb/mongo` and `@bb/neo4j` + for state transitions and graph state writes respectively. +- `stats.ts` has no cross-package imports — it carries only pure helpers + (`repoNameFromUrl`, `localRepoName`, `describe`). - Forbidden: importing from `../strategies`, `../adapters`, `../handlers`. ## Invariants diff --git a/packages/ingest-github/src/pipeline/branch.ts b/packages/ingest-github/src/pipeline/branch.ts index 213a5e5..70186ab 100644 --- a/packages/ingest-github/src/pipeline/branch.ts +++ b/packages/ingest-github/src/pipeline/branch.ts @@ -1,15 +1,31 @@ import type { GithubIndexPayload } from "@bb/types"; import { IngestError } from "@bb/errors"; +import { fetchDefaultBranch } from "#src/githubApi.ts"; const DEFAULT_BRANCH = "main"; -export function resolveBranch(knowledgeId: string, payload: GithubIndexPayload): string { +export async function resolveBranch( + knowledgeId: string, + payload: GithubIndexPayload, + gitToken?: string, +): Promise { const branch = payload.branch; - if (branch === undefined || branch.length === 0) { - return DEFAULT_BRANCH; + if (branch !== undefined && branch.length > 0) { + if (!/^[\w./-]+$/u.test(branch)) { + throw new IngestError(knowledgeId, `invalid branch name: ${branch}`); + } + return branch; } - if (!/^[\w./-]+$/u.test(branch)) { - throw new IngestError(knowledgeId, `invalid branch name: ${branch}`); + + // No branch provided -> attempt to fetch the default branch from GitHub. 
+ try { + const result = await fetchDefaultBranch(payload.repoUrl, gitToken); + if (result.status === "ok") { + return result.branch; + } + } catch { + // Best-effort; fall back to the hardcoded default. } - return branch; + + return DEFAULT_BRANCH; } diff --git a/packages/ingest-github/src/pipeline/context.ts b/packages/ingest-github/src/pipeline/context.ts new file mode 100644 index 0000000..c392291 --- /dev/null +++ b/packages/ingest-github/src/pipeline/context.ts @@ -0,0 +1,28 @@ +import { Config } from "@bb/types"; +import { getConfigValue } from "@bb/config"; +import type { AskLlmOptions } from "@bb/llm"; + +export function resolveOrgId(payload: { orgId?: string }): string { + if (typeof payload.orgId === "string" && payload.orgId.length > 0) { + return payload.orgId; + } + return getConfigValue(Config.OrgId); +} + +export function llmCallContextFromPayload(payload: { + llmApiKey?: string; + llmProvider?: string; + llmModel?: string; +}): AskLlmOptions | undefined { + const ctx: AskLlmOptions = {}; + if (payload.llmApiKey !== undefined && payload.llmApiKey.length > 0) { + ctx.apiKey = payload.llmApiKey; + } + if (payload.llmProvider === "openrouter" || payload.llmProvider === "ollama") { + ctx.provider = payload.llmProvider; + } + if (payload.llmModel !== undefined && payload.llmModel.length > 0) { + ctx.model = payload.llmModel; + } + return Object.keys(ctx).length > 0 ? 
ctx : undefined; +} diff --git a/packages/ingest-github/src/pipeline/disk-source-reader.ts b/packages/ingest-github/src/pipeline/disk-source-reader.ts index 9e404be..43dfd3f 100644 --- a/packages/ingest-github/src/pipeline/disk-source-reader.ts +++ b/packages/ingest-github/src/pipeline/disk-source-reader.ts @@ -1,6 +1,6 @@ import path from "node:path"; import { readFile } from "node:fs/promises"; -import type { ScanDeps, ScanEntry, SourceReader } from "src/types/pipeline.ts"; +import type { ScanDeps, ScanEntry, SourceReader } from "#src/types/pipeline.ts"; import { scanRepository } from "./scan.ts"; export interface DiskSourceReaderDeps { diff --git a/packages/ingest-github/src/pipeline/failure-classifier.ts b/packages/ingest-github/src/pipeline/failure-classifier.ts new file mode 100644 index 0000000..38e9baa --- /dev/null +++ b/packages/ingest-github/src/pipeline/failure-classifier.ts @@ -0,0 +1,77 @@ +import { LlmConfigError, LlmError } from "@bb/errors"; +import type { KnowledgeFailureCategory } from "@bb/types"; +import { describe } from "./stats.ts"; + +export interface ClassifiedFailure { + /** Operator-readable single-sentence summary. UI surfaces this directly. */ + reason: string; + category: KnowledgeFailureCategory; + /** Raw provider response or structured debug payload. Optional. */ + detail?: string; +} + +/** + * Translates a thrown ingestion error into the structured `(reason, category, + * detail)` triple persisted on `KnowledgeDoc.failure` and stamped on the SSE + * FAILED event. + * + * For LLM transport errors, the provider's HTTP status drives the category so + * operators can distinguish "wrong key" (401/403) from "out of credits" (402) + * from "throttled" (429) from "infra down" (5xx). Each path produces a short + * sentence; the raw response body lands in `detail` for the disclosure UI. 
+ */ +export function classifyFailure(cause: unknown): ClassifiedFailure { + if (cause instanceof LlmConfigError) { + return { + category: "llm_config", + reason: "LLM provider is not configured. Set the API key and retry.", + detail: cause.message, + }; + } + if (cause instanceof LlmError) { + return classifyLlmTransport(cause); + } + return { category: "internal", reason: describe(cause) }; +} + +function classifyLlmTransport(cause: LlmError): ClassifiedFailure { + const status = cause.status; + const detail = cause.detail ?? cause.message; + if (status === 401 || status === 403) { + return { + category: "llm_auth", + reason: "LLM provider rejected the API key. Update the key and retry.", + detail, + }; + } + if (status === 402) { + return { + category: "llm_quota", + reason: "LLM provider is out of credits or over its spend limit. Top up and retry.", + detail, + }; + } + if (status === 429) { + return { + category: "llm_rate_limit", + reason: "LLM provider rate-limited the request. Wait and retry.", + detail, + }; + } + if (status !== undefined && status >= 500 && status < 600) { + return { + category: "llm_unreachable", + reason: `LLM provider responded with HTTP ${String(status)}. Provider is temporarily unavailable.`, + detail, + }; + } + // Network/timeout (no status) or any other non-OK status. + return { + category: "llm_unreachable", + reason: + status === undefined + ? "LLM provider is unreachable (network error or timeout)." 
+ : `LLM provider responded with HTTP ${String(status)}.`, + detail, + }; +} diff --git a/packages/ingest-github/src/pipeline/paths.ts b/packages/ingest-github/src/pipeline/paths.ts index 21db948..ac52215 100644 --- a/packages/ingest-github/src/pipeline/paths.ts +++ b/packages/ingest-github/src/pipeline/paths.ts @@ -1,7 +1,7 @@ import { mkdir } from "node:fs/promises"; import path from "node:path"; import { getBytebellHome } from "@bb/config"; -import type { MetaPaths } from "src/types/meta-paths.ts"; +import type { MetaPaths } from "#src/types/meta-paths.ts"; const DIR_MODE = 0o700; @@ -17,8 +17,12 @@ export async function ensureReposRoot(): Promise { await mkdir(reposRoot(), { recursive: true, mode: DIR_MODE }); } +export function metaRootFor(knowledgeId: string): string { + return path.join(reposRoot(), ".meta", knowledgeId); +} + export function metaPathsFor(knowledgeId: string): MetaPaths { - const metaRoot = path.join(reposRoot(), ".meta", knowledgeId); + const metaRoot = metaRootFor(knowledgeId); return { metaRoot, fileAnalysisDir: path.join(metaRoot, "file-analysis"), @@ -26,10 +30,40 @@ export function metaPathsFor(knowledgeId: string): MetaPaths { bigFileAnalysisDir: path.join(metaRoot, "big-file-analysis"), bigFileChunksDir: path.join(metaRoot, "big-file-analysis", "chunks"), bigFilesJson: path.join(metaRoot, "bigFiles.json"), + scanManifestJson: path.join(metaRoot, "scan-manifest.json"), repoSummaryJson: path.join(metaRoot, "repo-summary.json"), }; } +/** + * Per-commit meta directory for content scoped to a specific indexed commit. + * Sits under the knowledge's `metaRoot/commits/<commitHash>/` so it survives + * subsequent pulls that overwrite the live `:File` set. + */ +export function commitMetaDir(knowledgeId: string, commitHash: string): string { + return path.join(metaRootFor(knowledgeId), "commits", commitHash); +} + +/** + * Directory for business-context analyses authored against a specific commit.
+ * Each business context lives at `business-context/<sanitizedTitle>/` and contains + * `original.txt` (the raw user-authored text) and `analysis.json` (the LLM + * analysis wrapped in its metadata envelope). + */ +export function businessContextDir(knowledgeId: string, commitHash: string, sanitizedTitle: string): string { + return path.join(commitMetaDir(knowledgeId, commitHash), "business-context", sanitizedTitle); +} + +/** + * Org-level keyword registry directory. In single-tenant OSS this resolves to + * `metaRoot/org/<orgId>/` (orgId defaults to `"local"`); downstream multi-tenant + * deployments may aggregate registries across multiple knowledges into the same + * directory. The business-context enrichment reader tolerates missing files. + */ +export function orgRegistryDir(knowledgeId: string, orgId: string): string { + return path.join(metaRootFor(knowledgeId), "org", orgId); +} + export async function ensureMetaDirs(paths: MetaPaths): Promise { await mkdir(paths.fileAnalysisDir, { recursive: true, mode: DIR_MODE }); await mkdir(paths.folderSummariesDir, { recursive: true, mode: DIR_MODE }); diff --git a/packages/ingest-github/src/pipeline/pull.ts b/packages/ingest-github/src/pipeline/pull.ts index 2ce7452..be344a6 100644 --- a/packages/ingest-github/src/pipeline/pull.ts +++ b/packages/ingest-github/src/pipeline/pull.ts @@ -1,40 +1,47 @@ import { Config, KnowledgeState, type GithubPullPayload, type JobMessage } from "@bb/types"; import { getConfigValue } from "@bb/config"; -import { getKnowledge, recordProcessingStats, setKnowledgeCommit, setKnowledgeState } from "@bb/mongo"; +import { withConcurrency } from "./concurrency.ts"; +import { getKnowledge, markKnowledgeFailed, setKnowledgeCommit, setKnowledgeState } from "@bb/mongo"; import { setKnowledgeStateInGraph, snapshotFilesToVersion, type NodeScope } from "@bb/neo4j"; -import { estimateCostFromBreakdown } from "@bb/llm"; +import type { PipelineSummary } from "#src/types/pipeline.ts"; +import { resolveOrgId,
llmCallContextFromPayload } from "./context.ts"; import { IngestError, KnowledgeNotFoundError } from "@bb/errors"; +import { classifyFailure } from "./failure-classifier.ts"; import { logger } from "@bb/logger"; import { ensureMetaDirs, metaPathsFor, repoCloneDir, ensureReposRoot } from "./paths.ts"; import { readHeadCommitHash, syncRepository } from "./source.ts"; import { CancellationError, clearCancellation, throwIfCancelled } from "./cancellation.ts"; -import { assertReachableFromBranch, checkoutCommit } from "./git-diff.ts"; +import { assertReachableFromBranch, checkoutCommit, type DiffResult } from "./git-diff.ts"; import { computePullDiff, materialiseEndpoints } from "./pull-diff-resolver.ts"; import { affectedFoldersFromDiff } from "./affected-folders.ts"; import { createDiskSourceReader } from "./disk-source-reader.ts"; -import { analyseChangedFiles } from "src/strategies/flat-folder/analyse-changed.ts"; -import { processBigFilesQueue } from "src/strategies/flat-folder/phases/process-big-files.ts"; -import { backfillMissingFields } from "src/strategies/flat-folder/backfill/fields.ts"; -import { backfillBigFiles } from "src/strategies/flat-folder/backfill/big-files.ts"; -import { runSelectiveFolderSummary } from "src/strategies/flat-folder/folder-summary-selective.ts"; -import { makeRepoSummaryEnvelope, persistRepoSummary, summariseRepo } from "src/strategies/flat-folder/repo-summary.ts"; -import { storePullAnalysis } from "src/strategies/flat-folder/store-pull.ts"; -import { createLlmFileAnalyzer } from "src/adapters/llm-file-analyzer.ts"; +import type { PullFactory, SourceReader, ArchiveSink } from "#src/types/pipeline.ts"; +import type { ProgressContextFactory } from "#src/progress/types.ts"; +import { nullProgressContextFactory } from "#src/progress/NullProgressReporter.ts"; +import { analyseChangedFiles } from "#src/strategies/flat-folder/analyse-changed.ts"; +import { processBigFilesQueue } from 
"#src/strategies/flat-folder/phases/process-big-files.ts"; +import { backfillMissingFields } from "#src/strategies/flat-folder/backfill/fields.ts"; +import { FileAnalysisCache } from "#src/strategies/flat-folder/file-analysis-cache.ts"; +import { runSelectiveFolderSummary } from "#src/strategies/flat-folder/folder-summary-selective.ts"; +import { + makeRepoSummaryEnvelope, + persistRepoSummary, + summariseRepo, +} from "#src/strategies/flat-folder/repo-summary.ts"; +import { storePullAnalysis } from "#src/strategies/flat-folder/store-pull.ts"; +import { createLlmFileAnalyzer } from "#src/adapters/llm-file-analyzer.ts"; import { COMBINED_CODE_ANALYSIS_SYSTEM_PROMPT, buildFileAnalysisUserPrompt, -} from "src/strategies/flat-folder/prompts/file-analysis.ts"; +} from "#src/strategies/flat-folder/prompts/file-analysis.ts"; const COMMIT_HASH_RE = /^[0-9a-f]{40}$/u; -function resolveOrgId(payload: { orgId?: string }): string { - if (typeof payload.orgId === "string" && payload.orgId.length > 0) { - return payload.orgId; - } - return getConfigValue(Config.OrgId); -} - -export async function runPull(msg: JobMessage): Promise { +export async function runPull( + msg: JobMessage, + pullFactory?: PullFactory, + progressContextFactory: ProgressContextFactory = nullProgressContextFactory, +): Promise { const { knowledgeId } = msg.payload; if (msg.payload.targetCommitHash !== undefined && !COMMIT_HASH_RE.test(msg.payload.targetCommitHash)) { throw new IngestError( @@ -58,53 +65,78 @@ export async function runPull(msg: JobMessage): Promise ); } - const branch = knowledge.source.branch ?? "main"; - const repoUrl = knowledge.source.repoUrl; + const branch = knowledge.info.branch ?? 
"main"; + const repoUrl = knowledge.info.repoUrl; + if (repoUrl === undefined || repoUrl.length === 0) { + throw new IngestError(knowledgeId, "pull requires knowledge.info.repoUrl"); + } const gitToken = msg.payload.gitToken; clearCancellation(knowledgeId); - const startedAt = Date.now(); await transitionState(knowledgeId, KnowledgeState.Processing); + const progressContext = progressContextFactory(knowledgeId); try { throwIfCancelled(knowledgeId); - await ensureReposRoot(); - const repoDir = repoCloneDir(knowledgeId); - const cloneOpts: { repoUrl: string; branch: string; destinationDir: string; gitToken?: string } = { - repoUrl, - branch, - destinationDir: repoDir, - }; - if (gitToken !== undefined) { - cloneOpts.gitToken = gitToken; - } - await syncRepository(cloneOpts); - const branchHead = await readHeadCommitHash(repoDir); - if (branchHead === "unknown") { - throw new IngestError(knowledgeId, "could not resolve branch HEAD after clone"); - } - const targetCommit = msg.payload.targetCommitHash ?? 
branchHead; + let source: SourceReader; + let diff: DiffResult; + let targetCommit: string; + let archiveSink: ArchiveSink | undefined; - if (targetCommit === currentCommit) { - logger.info(`pull: ${knowledgeId} already at ${targetCommit.slice(0, 12)}; no-op`); - await transitionState(knowledgeId, KnowledgeState.Processed); - return; - } + if (pullFactory !== undefined) { + const factoryResult = await pullFactory({ knowledgeId, payload: msg.payload, currentCommit, branch }); + source = factoryResult.source; + diff = factoryResult.diff; + targetCommit = factoryResult.targetCommit; + archiveSink = factoryResult.archiveSink; + logger.info(`pull: pull factory wired (knowledgeId=${knowledgeId}, target=${targetCommit.slice(0, 12)})`); + if (targetCommit === currentCommit) { + logger.info(`pull: ${knowledgeId} already at ${targetCommit.slice(0, 12)}; no-op`); + await transitionState(knowledgeId, KnowledgeState.Processed); + return emptyPullSummary(targetCommit); + } + } else { + await ensureReposRoot(); + const repoDir = repoCloneDir(knowledgeId); + const cloneOpts: { repoUrl: string; branch: string; destinationDir: string; gitToken?: string } = { + repoUrl, + branch, + destinationDir: repoDir, + }; + if (gitToken !== undefined) { + cloneOpts.gitToken = gitToken; + } + await syncRepository(cloneOpts); - // Deepen the shallow clone first so historical commits selected via the - // picker become visible to `merge-base --is-ancestor`. Without this the - // assertion below rejects every non-HEAD pick on a `--depth=1` clone. - await materialiseEndpoints(repoDir, branch, currentCommit, targetCommit); + const branchHead = await readHeadCommitHash(repoDir); + if (branchHead === "unknown") { + throw new IngestError(knowledgeId, "could not resolve branch HEAD after clone"); + } + targetCommit = msg.payload.targetCommitHash ?? 
branchHead; - if (!(await assertReachableFromBranch(repoDir, targetCommit, branch))) { - throw new IngestError( - knowledgeId, - `target commit ${targetCommit} is not reachable from origin/${branch}. Cross-branch pulls are not supported; create a fresh github_index job for the new branch.`, - ); - } + if (targetCommit === currentCommit) { + logger.info(`pull: ${knowledgeId} already at ${targetCommit.slice(0, 12)}; no-op`); + await transitionState(knowledgeId, KnowledgeState.Processed); + return emptyPullSummary(targetCommit); + } + + // Deepen the shallow clone first so historical commits selected via the + // picker become visible to `merge-base --is-ancestor`. Without this the + // assertion below rejects every non-HEAD pick on a `--depth=1` clone. + await materialiseEndpoints(repoDir, branch, currentCommit, targetCommit); - const diff = await computePullDiff(repoDir, currentCommit, targetCommit); + if (!(await assertReachableFromBranch(repoDir, targetCommit, branch))) { + throw new IngestError( + knowledgeId, + `target commit ${targetCommit} is not reachable from origin/${branch}. 
Cross-branch pulls are not supported; create a fresh github_index job for the new branch.`, + ); + } + + diff = await computePullDiff(repoDir, currentCommit, targetCommit); + await checkoutCommit(repoDir, targetCommit); + source = createDiskSourceReader({ repoDir, commitHash: targetCommit }); + } throwIfCancelled(knowledgeId); await snapshotFilesToVersion({ knowledgeId, commitHash: currentCommit }).catch((cause: unknown) => { @@ -112,8 +144,6 @@ export async function runPull(msg: JobMessage): Promise logger.warn(`pull: snapshot of ${currentCommit.slice(0, 12)} failed (non-fatal): ${msgText}`); }); - await checkoutCommit(repoDir, targetCommit); - const metaPaths = metaPathsFor(knowledgeId); await ensureMetaDirs(metaPaths); @@ -124,39 +154,82 @@ export async function runPull(msg: JobMessage): Promise buildUserPrompt: buildFileAnalysisUserPrompt, }); + const llmCallContext = llmCallContextFromPayload(msg.payload); + + progressContext.phaseChanged("file_analysis"); logger.info(`pull: phase per-file dispatcher for ${knowledgeId} starting`); throwIfCancelled(knowledgeId); - await analyseChangedFiles({ + const analyseChangedInput: Parameters[0] = { knowledgeId, - repoDir, + source, metaPaths, analyzer: fileAnalyzer, diff, - }); - - const source = createDiskSourceReader({ repoDir, commitHash: targetCommit }); + progressContext, + }; + if (llmCallContext !== undefined) { + analyseChangedInput.llmCallContext = llmCallContext; + } + if (archiveSink !== undefined) { + analyseChangedInput.archiveSink = archiveSink; + } + const phase1 = await analyseChangedFiles(analyseChangedInput); + let totalInputTokens = phase1.tokenUsage.inputTokens; + let totalOutputTokens = phase1.tokenUsage.outputTokens; + let totalCostUsd = phase1.tokenUsage.costUsd; logger.info(`pull: phase process big files starting`); throwIfCancelled(knowledgeId); - await processBigFilesQueue({ knowledgeId, source, metaPaths }); + const processBigFilesInput: Parameters[0] = { + knowledgeId, + source, + metaPaths, + 
progressContext, + }; + if (llmCallContext !== undefined) { + processBigFilesInput.llmCallContext = llmCallContext; + } + const phase2 = await processBigFilesQueue(processBigFilesInput); + totalInputTokens += phase2.tokenUsage.inputTokens; + totalOutputTokens += phase2.tokenUsage.outputTokens; + totalCostUsd += phase2.tokenUsage.costUsd; - logger.info(`pull: phase backfill fields starting`); + logger.info(`pull: loading file-analysis cache`); throwIfCancelled(knowledgeId); - await backfillMissingFields(metaPaths); + const fileAnalysisCache = await FileAnalysisCache.loadAll(metaPaths); + const limiter = withConcurrency(getConfigValue(Config.LlmConcurrency)); - logger.info(`pull: phase backfill big-files starting`); + logger.info(`pull: phase backfill fields starting`); throwIfCancelled(knowledgeId); - await backfillBigFiles({ knowledgeId, source, metaPaths }); + await backfillMissingFields(metaPaths, fileAnalysisCache, limiter, llmCallContext, progressContext); + progressContext.phaseChanged("folder_analysis"); logger.info(`pull: phase selective folder summary (${affectedFolders.size} folders) starting`); throwIfCancelled(knowledgeId); - await runSelectiveFolderSummary({ knowledgeId, metaPaths, affectedFolders }); + const selectiveInput: Parameters[0] = { + knowledgeId, + metaPaths, + cache: fileAnalysisCache, + limiter, + affectedFolders, + }; + if (llmCallContext !== undefined) { + selectiveInput.llmCallContext = llmCallContext; + } + const phase5 = await runSelectiveFolderSummary(selectiveInput); + totalInputTokens += phase5.tokenUsage.inputTokens; + totalOutputTokens += phase5.tokenUsage.outputTokens; + totalCostUsd += phase5.tokenUsage.costUsd; + progressContext.phaseChanged("indexing"); logger.info(`pull: phase repo summary starting`); throwIfCancelled(knowledgeId); const orgId = resolveOrgId({ ...(knowledge.source.kind === "github" ? 
{} : {}) }); const scope: NodeScope = { orgId, knowledgeId, repoId: knowledgeId }; - const repoSummary = await summariseRepo(knowledgeId, metaPaths); + const { summary: repoSummary, tokenUsage: repoUsage } = await summariseRepo(knowledgeId, metaPaths, llmCallContext); + totalInputTokens += repoUsage.inputTokens; + totalOutputTokens += repoUsage.outputTokens; + totalCostUsd += repoUsage.costUsd; if (repoSummary !== null) { await persistRepoSummary(metaPaths, makeRepoSummaryEnvelope(knowledgeId, orgId, repoSummary)); } @@ -172,27 +245,37 @@ export async function runPull(msg: JobMessage): Promise affectedFolders, }); - await persistPullStats({ + await setKnowledgeCommit( knowledgeId, - repoName: repoNameFromUrl(repoUrl), - commitHash: targetCommit, - filesAnalyzed: stored.filesUpserted, - foldersSummarised: stored.foldersUpserted, - processingTimeMs: Date.now() - startedAt, - }); - await setKnowledgeCommit(knowledgeId, targetCommit); + targetCommit, + String(totalInputTokens), + String(totalOutputTokens), + String(totalCostUsd), + ); await transitionState(knowledgeId, KnowledgeState.Processed); + progressContext.completed("github_pull complete"); logger.info( `pull: ${knowledgeId} ${currentCommit.slice(0, 12)} -> ${targetCommit.slice(0, 12)} done (filesUpserted=${stored.filesUpserted} filesDeleted=${stored.filesDeleted} foldersUpserted=${stored.foldersUpserted})`, ); + return { + filesAnalyzed: stored.filesUpserted, + foldersSummarised: stored.foldersUpserted, + repoSummarised: repoSummary !== null, + graphNodesWritten: stored.filesUpserted + stored.foldersUpserted, + commitHash: targetCommit, + tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens, costUsd: totalCostUsd }, + }; } catch (cause: unknown) { if (cause instanceof CancellationError) { clearCancellation(knowledgeId); logger.info(`pull: cancelled for ${knowledgeId}`); throw cause; } - await transitionState(knowledgeId, KnowledgeState.Failed).catch(() => undefined); - throw new 
IngestError(knowledgeId, `github_pull failed: ${describe(cause)}`, cause); + const { category, reason, detail } = classifyFailure(cause); + await markKnowledgeFailed(knowledgeId, reason, category, detail).catch(() => undefined); + await setKnowledgeStateInGraph(knowledgeId, KnowledgeState.Failed).catch(() => undefined); + progressContext.failed(reason, undefined, category, detail); + throw new IngestError(knowledgeId, `github_pull failed: ${reason}`, cause); } } @@ -201,48 +284,13 @@ async function transitionState(knowledgeId: string, state: KnowledgeState): Prom await setKnowledgeStateInGraph(knowledgeId, state).catch(() => undefined); } -interface PersistPullStatsInput { - knowledgeId: string; - repoName: string; - commitHash: string; - filesAnalyzed: number; - foldersSummarised: number; - processingTimeMs: number; -} - -async function persistPullStats(input: PersistPullStatsInput): Promise { - const estimatedCost = await estimateCostFromBreakdown({}); - await recordProcessingStats({ - knowledgeId: input.knowledgeId, - repoName: input.repoName, - commitHash: input.commitHash, - modelTokens: {}, - estimatedCost, - totalBatches: 1, - totalFiles: input.filesAnalyzed, - totalFolders: input.foldersSummarised, - filesAnalyzed: input.filesAnalyzed, - processingTimeMs: input.processingTimeMs, - }); -} - -function repoNameFromUrl(repoUrl: string): string { - try { - const segments = new URL(repoUrl).pathname - .split("/") - .map((s) => s.trim()) - .filter((s) => s.length > 0); - const repo = segments.at(-1)?.replace(/\.git$/u, ""); - const owner = segments.at(-2); - if (owner !== undefined && repo !== undefined) { - return `${owner}/${repo}`; - } - } catch { - // fall through - } - return repoUrl; -} - -function describe(cause: unknown): string { - return cause instanceof Error ? 
cause.message : String(cause); +function emptyPullSummary(commitHash: string): PipelineSummary { + return { + filesAnalyzed: 0, + foldersSummarised: 0, + repoSummarised: false, + graphNodesWritten: 0, + commitHash, + tokenUsage: { inputTokens: 0, outputTokens: 0, costUsd: 0 }, + }; } diff --git a/packages/ingest-github/src/pipeline/run.ts b/packages/ingest-github/src/pipeline/run.ts index 5d76146..eca725e 100644 --- a/packages/ingest-github/src/pipeline/run.ts +++ b/packages/ingest-github/src/pipeline/run.ts @@ -1,25 +1,26 @@ -import { Config, KnowledgeState, type GithubIndexPayload, type LocalIngestPayload } from "@bb/types"; -import { getConfigValue } from "@bb/config"; -import { recordProcessingStats, setKnowledgeCommit, setKnowledgeState } from "@bb/mongo"; -import { setKnowledgeStateInGraph } from "@bb/neo4j"; -import { estimateCostFromBreakdown } from "@bb/llm"; +import { + KnowledgeState, + type GithubIndexPayload, + type KnowledgeFailureCategory, + type LocalIngestPayload, +} from "@bb/types"; +import { markKnowledgeFailed, setKnowledgeBranch, setKnowledgeCommit, setKnowledgeState } from "@bb/mongo"; +import { setKnowledgeBranchInGraph, setKnowledgeStateInGraph } from "@bb/neo4j"; import { IngestError } from "@bb/errors"; import { logger } from "@bb/logger"; -import type { IngestRunnerDeps, IngestRunnerInput } from "src/types/ingest-runner.ts"; -import type { IngestStrategy } from "src/types/strategy.ts"; -import type { ArchiveSink, PipelineSummary, SourceFactory, SourceReader } from "src/types/pipeline.ts"; +import { classifyFailure } from "./failure-classifier.ts"; +import type { IngestRunnerDeps, IngestRunnerInput } from "#src/types/ingest-runner.ts"; +import type { IngestStrategy } from "#src/types/strategy.ts"; +import type { ArchiveSink, PipelineSummary, SourceFactory, SourceReader } from "#src/types/pipeline.ts"; +import type { ProgressContextFactory } from "#src/progress/types.ts"; +import { nullProgressContextFactory } from 
"#src/progress/NullProgressReporter.ts"; import { ensureMetaDirs, ensureReposRoot, metaPathsFor, repoCloneDir } from "./paths.ts"; import { readHeadCommitHash, syncRepository } from "./source.ts"; import { resolveBranch } from "./branch.ts"; import { CancellationError, clearCancellation, throwIfCancelled } from "./cancellation.ts"; import { createDiskSourceReader } from "./disk-source-reader.ts"; - -function resolveOrgId(payload: { orgId?: string }): string { - if (typeof payload.orgId === "string" && payload.orgId.length > 0) { - return payload.orgId; - } - return getConfigValue(Config.OrgId); -} +import { resolveOrgId, llmCallContextFromPayload } from "./context.ts"; +import { localRepoName } from "./stats.ts"; export interface CreatePipelineRunnerDeps { reposRootDir: string; @@ -31,16 +32,23 @@ export interface CreatePipelineRunnerDeps { * supplies one. */ sourceFactory?: SourceFactory; + /** + * Optional progress context factory. When provided, the runner emits + * pre-strategy phase changes (`clone`, `scan`) so SSE clients see liveness + * during the network/disk-bound prelude. Defaults to a no-op. + */ + progressContextFactory?: ProgressContextFactory; } export function createPipelineRunner(deps: CreatePipelineRunnerDeps): IngestRunnerDeps { + const progressContextFactory = deps.progressContextFactory ?? 
nullProgressContextFactory; return { reposRootDir: deps.reposRootDir, strategy: deps.strategy, run: async (input: IngestRunnerInput): Promise => { const payload = input.payload; if (isGithubPayload(payload)) { - return await runGithub(deps.strategy, payload, deps.sourceFactory); + return await runGithub(deps.strategy, payload, deps.sourceFactory, progressContextFactory); } return await runLocal(deps.strategy, payload); }, @@ -51,19 +59,25 @@ async function runGithub( strategy: IngestStrategy, payload: GithubIndexPayload, sourceFactory: SourceFactory | undefined, + progressContextFactory: ProgressContextFactory, ): Promise { const { knowledgeId } = payload; clearCancellation(knowledgeId); const startedAt = Date.now(); await transitionState(knowledgeId, KnowledgeState.Processing); + const progressContext = progressContextFactory(knowledgeId); + let strategyStarted = false; try { throwIfCancelled(knowledgeId); - const branch = resolveBranch(knowledgeId, payload); + const branch = await resolveBranch(knowledgeId, payload, payload.gitToken); + await setKnowledgeBranch(knowledgeId, branch); + await setKnowledgeBranchInGraph(knowledgeId, branch).catch(() => undefined); let source: SourceReader; let archiveSink: ArchiveSink | undefined; let commitHash: string; + progressContext.phaseChanged("clone"); if (sourceFactory !== undefined) { const factoryResult = await sourceFactory({ knowledgeId, payload, branch }); source = factoryResult.source; @@ -89,38 +103,53 @@ async function runGithub( source = createDiskSourceReader({ repoDir, commitHash }); } + progressContext.phaseChanged("scan"); const metaPaths = metaPathsFor(knowledgeId); await ensureMetaDirs(metaPaths); + const baseContext: Parameters[0]["context"] = { + knowledgeId, + orgId: resolveOrgId(payload), + repoId: knowledgeId, + }; + const llmCallContext = llmCallContextFromPayload(payload); + if (llmCallContext !== undefined) { + baseContext.llmCallContext = llmCallContext; + } const strategyInput: Parameters[0] = { 
payload, branch, source, metaPaths, - context: { knowledgeId, orgId: resolveOrgId(payload), repoId: knowledgeId }, + context: baseContext, }; if (archiveSink !== undefined) { strategyInput.archiveSink = archiveSink; } + strategyStarted = true; const result = await strategy.execute(strategyInput); - await persistStats({ + await setKnowledgeCommit( knowledgeId, - repoName: repoNameFromUrl(payload.repoUrl), commitHash, - filesAnalyzed: result.filesAnalyzed, - foldersSummarised: result.foldersSummarised, - processingTimeMs: Date.now() - startedAt, - }); - await setKnowledgeCommit(knowledgeId, commitHash); + String(result.tokenUsage.inputTokens), + String(result.tokenUsage.outputTokens), + String(result.tokenUsage.costUsd), + ); await transitionState(knowledgeId, KnowledgeState.Processed); + const totalMs = Date.now() - startedAt; + logger.info( + `pipeline/run: ✓ github_index complete (knowledgeId=${knowledgeId}, commit=${commitHash.slice(0, 12)}, files=${result.filesAnalyzed}, folders=${result.foldersSummarised}, nodes=${result.graphNodesWritten}, ${totalMs}ms)`, + ); + return { filesAnalyzed: result.filesAnalyzed, foldersSummarised: result.foldersSummarised, repoSummarised: result.repoSummarised, graphNodesWritten: result.graphNodesWritten, commitHash, + tokenUsage: result.tokenUsage, }; } catch (cause: unknown) { if (cause instanceof CancellationError) { @@ -128,8 +157,12 @@ async function runGithub( logger.info(`pipeline/run: ingestion cancelled for ${knowledgeId}`); throw cause; } - await transitionState(knowledgeId, KnowledgeState.Failed).catch(() => undefined); - throw new IngestError(knowledgeId, `github_index pipeline failed: ${describe(cause)}`, cause); + const { category, reason, detail } = classifyFailure(cause); + await persistFailure(knowledgeId, category, reason, detail); + if (!strategyStarted) { + progressContext.failed(reason, undefined, category, detail); + } + throw new IngestError(knowledgeId, `github_index pipeline failed: ${reason}`, cause); } } 
@@ -154,14 +187,9 @@ async function runLocal(strategy: IngestStrategy, payload: LocalIngestPayload): }); const commitHash = `local-${startedAt}`; - await persistStats({ - knowledgeId, - repoName: localRepoName(rootDir), - commitHash, - filesAnalyzed: result.filesAnalyzed, - foldersSummarised: result.foldersSummarised, - processingTimeMs: Date.now() - startedAt, - }); + logger.info( + `pipeline/run: ✓ local_ingest complete (knowledgeId=${knowledgeId}, repo=${localRepoName(rootDir)}, files=${result.filesAnalyzed}, in=${result.tokenUsage.inputTokens}, out=${result.tokenUsage.outputTokens}, cost=$${result.tokenUsage.costUsd})`, + ); await transitionState(knowledgeId, KnowledgeState.Processed); return { filesAnalyzed: result.filesAnalyzed, @@ -169,14 +197,16 @@ async function runLocal(strategy: IngestStrategy, payload: LocalIngestPayload): repoSummarised: result.repoSummarised, graphNodesWritten: result.graphNodesWritten, commitHash, + tokenUsage: result.tokenUsage, }; } catch (cause: unknown) { if (cause instanceof CancellationError) { clearCancellation(knowledgeId); throw cause; } - await transitionState(knowledgeId, KnowledgeState.Failed).catch(() => undefined); - throw new IngestError(knowledgeId, `local_ingest pipeline failed: ${describe(cause)}`, cause); + const { category, reason, detail } = classifyFailure(cause); + await persistFailure(knowledgeId, category, reason, detail); + throw new IngestError(knowledgeId, `local_ingest pipeline failed: ${reason}`, cause); } } @@ -185,57 +215,21 @@ async function transitionState(knowledgeId: string, state: KnowledgeState): Prom await setKnowledgeStateInGraph(knowledgeId, state).catch(() => undefined); } -interface PersistStatsInput { - knowledgeId: string; - repoName: string; - commitHash: string; - filesAnalyzed: number; - foldersSummarised: number; - processingTimeMs: number; -} - -async function persistStats(input: PersistStatsInput): Promise { - const estimatedCost = await estimateCostFromBreakdown({}); - await 
recordProcessingStats({ - knowledgeId: input.knowledgeId, - repoName: input.repoName, - commitHash: input.commitHash, - modelTokens: {}, - estimatedCost, - totalBatches: 1, - totalFiles: input.filesAnalyzed, - totalFolders: input.foldersSummarised, - filesAnalyzed: input.filesAnalyzed, - processingTimeMs: input.processingTimeMs, - }); +/** + * Persists the FAILED state + structured failure reason to Mongo, then + * mirrors the state into Neo4j on a best-effort basis. Errors from both + * sides are swallowed so the throw path is preserved. + */ +async function persistFailure( + knowledgeId: string, + category: KnowledgeFailureCategory, + reason: string, + detail?: string, +): Promise { + await markKnowledgeFailed(knowledgeId, reason, category, detail).catch(() => undefined); + await setKnowledgeStateInGraph(knowledgeId, KnowledgeState.Failed).catch(() => undefined); } function isGithubPayload(payload: GithubIndexPayload | LocalIngestPayload): payload is GithubIndexPayload { return (payload as GithubIndexPayload).repoUrl !== undefined; } - -function repoNameFromUrl(repoUrl: string): string { - try { - const segments = new URL(repoUrl).pathname - .split("/") - .map((s) => s.trim()) - .filter((s) => s.length > 0); - const repo = segments.at(-1)?.replace(/\.git$/u, ""); - const owner = segments.at(-2); - if (owner !== undefined && repo !== undefined) { - return `${owner}/${repo}`; - } - } catch { - // fall through - } - return repoUrl; -} - -function localRepoName(rootDir: string): string { - const segments = rootDir.split("/").filter((s) => s.length > 0); - return segments.at(-1) ?? rootDir; -} - -function describe(cause: unknown): string { - return cause instanceof Error ? 
cause.message : String(cause); -} diff --git a/packages/ingest-github/src/pipeline/scan.ts b/packages/ingest-github/src/pipeline/scan.ts index 5bbf5e5..d7d9db6 100644 --- a/packages/ingest-github/src/pipeline/scan.ts +++ b/packages/ingest-github/src/pipeline/scan.ts @@ -2,9 +2,11 @@ import { opendir, readFile, stat } from "node:fs/promises"; import path from "node:path"; import { Config } from "@bb/types"; import { getConfigValue } from "@bb/config"; +import type { AskLlmOptions } from "@bb/llm"; import { logger } from "@bb/logger"; import { SKIP_DIRS, looksBinary, passesPathFilters } from "./filters.ts"; -import type { ScanEntry, SkipDecider } from "src/types/pipeline.ts"; +import type { ConcurrencyLimiter } from "./concurrency.ts"; +import type { ScanEntry, SkipDecider, SkipDeciderInput } from "#src/types/pipeline.ts"; interface ScanLimits { absoluteCap: number; @@ -13,18 +15,8 @@ interface ScanLimits { export interface ScanRepositoryDeps { skipDecider?: SkipDecider; -} - -export async function* scanRepository(rootDir: string, deps: ScanRepositoryDeps = {}): AsyncGenerator { - const limits: ScanLimits = { - absoluteCap: getConfigValue(Config.AbsoluteFileSizeCap), - bigFileLineThreshold: getConfigValue(Config.BigFileLineThreshold), - }; - const counts = { acceptStatic: 0, acceptLlm: 0, rejectStatic: 0, rejectLlm: 0, oversized: 0, binary: 0 }; - yield* walk(rootDir, rootDir, limits, deps, counts); - logger.info( - `scan: acceptStatic=${counts.acceptStatic} acceptLlm=${counts.acceptLlm} rejectStatic=${counts.rejectStatic} rejectLlm=${counts.rejectLlm} oversized=${counts.oversized} binary=${counts.binary}`, - ); + llmCallContext?: AskLlmOptions; + limiter?: ConcurrencyLimiter; } interface ScanCounts { @@ -36,6 +28,44 @@ interface ScanCounts { binary: number; } +interface PendingFile { + relativePath: string; + absolutePath: string; + sizeBytes: number; + content: string; + ext: string; + input: SkipDeciderInput; +} + +function newCounts(): ScanCounts { + return { 
acceptStatic: 0, acceptLlm: 0, rejectStatic: 0, rejectLlm: 0, oversized: 0, binary: 0 }; +} + +function logCounts(counts: ScanCounts): void { + logger.info( + `scan: acceptStatic=${counts.acceptStatic} acceptLlm=${counts.acceptLlm} rejectStatic=${counts.rejectStatic} rejectLlm=${counts.rejectLlm} oversized=${counts.oversized} binary=${counts.binary}`, + ); +} + +export async function* scanRepository(rootDir: string, deps: ScanRepositoryDeps = {}): AsyncGenerator { + const limits: ScanLimits = { + absoluteCap: getConfigValue(Config.AbsoluteFileSizeCap), + bigFileLineThreshold: getConfigValue(Config.BigFileLineThreshold), + }; + + // Two-pass parallel mode requires both a skip-decider AND a limiter so that + // pending LLM resolutions can be deduplicated and dispatched concurrently. + // Without either, fall back to the inline-await walk that's been here all along. + if (deps.skipDecider !== undefined && deps.limiter !== undefined) { + yield* twoPassScan(rootDir, limits, deps.skipDecider, deps.limiter, deps); + return; + } + + const counts = newCounts(); + yield* walk(rootDir, rootDir, limits, deps, counts); + logCounts(counts); +} + async function* walk( rootDir: string, currentDir: string, @@ -80,7 +110,11 @@ async function* walk( continue; } if (deps.skipDecider !== undefined) { - const decision = await deps.skipDecider.decide({ relativePath, absolutePath: abs, ext }); + const deciderInput: SkipDeciderInput = { relativePath, absolutePath: abs, ext }; + if (deps.llmCallContext !== undefined) { + deciderInput.llmCallContext = deps.llmCallContext; + } + const decision = await deps.skipDecider.decide(deciderInput); if (decision === "reject-static") { counts.rejectStatic += 1; continue; @@ -107,6 +141,145 @@ async function* walk( } } +async function* twoPassScan( + rootDir: string, + limits: ScanLimits, + decider: SkipDecider, + limiter: ConcurrencyLimiter, + deps: ScanRepositoryDeps, +): AsyncGenerator { + const counts = newCounts(); + const pending: PendingFile[] = 
[]; + + // Pass 1: walk + categorize. Static-decided files yield immediately; + // "needs LLM" files go into `pending` for batch resolution. + yield* walkAndCategorize(rootDir, rootDir, limits, deps, decider, counts, pending); + + // Pass 2: dedupe pending by decision key (extension or filename), schedule + // one LLM call per unique key through the shared limiter, then persist the + // decider's cache once. + if (pending.length > 0) { + const unique = new Map(); + for (const p of pending) { + const key = decisionKey(p); + if (!unique.has(key)) { + unique.set(key, p.input); + } + } + logger.info(`scan: resolving ${unique.size} unique skip-decision keys for ${pending.length} pending files`); + await Promise.all(Array.from(unique.values()).map((input) => limiter(() => decider.decideAndDeferSave(input)))); + decider.persist(); + } + + // Pass 3: drain pending. Every decideStatic call is now a cache hit. + for (const p of pending) { + const decision = decider.decideStatic(p.input); + if (decision === "reject-static" || decision === null) { + counts.rejectStatic += 1; + continue; + } + if (decision === "reject-llm") { + counts.rejectLlm += 1; + continue; + } + if (decision === "accept-llm") { + counts.acceptLlm += 1; + } else { + counts.acceptStatic += 1; + } + yield { + kind: "file", + relativePath: p.relativePath, + absolutePath: p.absolutePath, + sizeBytes: p.sizeBytes, + content: p.content, + }; + } + + logCounts(counts); +} + +async function* walkAndCategorize( + rootDir: string, + currentDir: string, + limits: ScanLimits, + deps: ScanRepositoryDeps, + decider: SkipDecider, + counts: ScanCounts, + pending: PendingFile[], +): AsyncGenerator { + const dir = await opendir(currentDir); + for await (const entry of dir) { + const abs = path.join(currentDir, entry.name); + if (entry.isDirectory()) { + if (SKIP_DIRS.has(entry.name)) { + continue; + } + yield* walkAndCategorize(rootDir, abs, limits, deps, decider, counts, pending); + continue; + } + if (!entry.isFile()) { + 
continue; + } + if (!passesPathFilters(entry.name, path.extname(entry.name))) { + counts.rejectStatic += 1; + continue; + } + const sizeBytes = (await stat(abs)).size; + const relativePath = path.relative(rootDir, abs); + const ext = path.extname(entry.name).toLowerCase(); + if (sizeBytes > limits.absoluteCap) { + counts.oversized += 1; + yield { kind: "oversized", relativePath, absolutePath: abs, sizeBytes }; + continue; + } + const buf = await readFile(abs); + if (looksBinary(buf)) { + counts.binary += 1; + continue; + } + const content = buf.toString("utf8"); + if (countLines(content) > limits.bigFileLineThreshold) { + counts.oversized += 1; + yield { kind: "oversized", relativePath, absolutePath: abs, sizeBytes }; + continue; + } + const deciderInput: SkipDeciderInput = { relativePath, absolutePath: abs, ext }; + if (deps.llmCallContext !== undefined) { + deciderInput.llmCallContext = deps.llmCallContext; + } + const sync = decider.decideStatic(deciderInput); + if (sync === "reject-static") { + counts.rejectStatic += 1; + continue; + } + if (sync === "reject-llm") { + counts.rejectLlm += 1; + continue; + } + if (sync === "accept-llm") { + counts.acceptLlm += 1; + yield { kind: "file", relativePath, absolutePath: abs, sizeBytes, content }; + continue; + } + if (sync === "accept") { + counts.acceptStatic += 1; + yield { kind: "file", relativePath, absolutePath: abs, sizeBytes, content }; + continue; + } + // sync === null → needs LLM. Defer to pass 2. + pending.push({ relativePath, absolutePath: abs, sizeBytes, content, ext, input: deciderInput }); + } +} + +function decisionKey(p: PendingFile): string { + if (p.ext.length > 0) { + return `ext:${p.ext}`; + } + const segments = p.relativePath.split("/"); + return `filename:${segments[segments.length - 1] ?? 
p.relativePath}`; +} + function countLines(content: string): number { if (content.length === 0) { return 0; diff --git a/packages/ingest-github/src/pipeline/skip-decisions/README.md b/packages/ingest-github/src/pipeline/skip-decisions/README.md index 90d7096..18d80bb 100644 --- a/packages/ingest-github/src/pipeline/skip-decisions/README.md +++ b/packages/ingest-github/src/pipeline/skip-decisions/README.md @@ -17,6 +17,36 @@ single-tenant public layout. 8. Persist verdict to ~/.bytebell/llmDecisions.json. LLM failure → reject + cache the rejection. ``` +Steps 1-6 are pure CPU + cached lookup — they run synchronously via +`decideStatic`. Step 7 is the slow LLM branch; `decide` performs it +inline, while `decideAndDeferSave` performs it without flushing the +cache to disk so a batched caller can `persist()` once at the end of +its batch. + +## Public methods (`SkipDecider`) + +```ts +interface SkipDecider { + decide(input): Promise; // legacy single-shot path + decideStatic(input): SkipDecision | null; // sync; null = needs LLM + decideAndDeferSave(input): Promise; // LLM call, no disk save + persist(): void; // flush cache to disk once +} +``` + +- `decide` — the original single-shot API. Calls `decideStatic`; if that + returns `null`, runs the LLM call and `persist()`s the cache. Used by + the legacy `walk()` in `scan.ts` when no shared limiter is passed + (e.g. custom `SourceFactory` consumers that don't opt into two-pass). +- `decideStatic` — synchronous. Returns the resolved `SkipDecision` for + steps 1-6; returns `null` to signal "would need an LLM call". Used by + the two-pass scan to categorise files without blocking the walk. +- `decideAndDeferSave` — runs the LLM call and mutates the in-memory + cache but does **not** flush to disk. Scan calls this concurrently + for unique extension/filename keys under a shared limiter; the disk + write happens once via `persist()` after the batch. +- `persist` — best-effort cache flush; swallows I/O errors. 
+ ## Files - `seed.ts` — loads the four bundled JSON files (directory/filename/pattern/extension lists) @@ -34,7 +64,12 @@ single-tenant public layout. - `decider.ts` — `makeSkipDecider(deps)` returns a `SkipDecider` (port type from `src/types/pipeline.ts`). Reads `Config.SkipDecisionEnabled` once at factory time; when disabled the decider degrades to "accept everything - past the static blocklist". + past the static blocklist". The LLM branch forwards + `SkipDeciderInput.llmCallContext` (when set by the runner) into + `askYesNoLLM` so per-job credentials reach the decision call. The four + methods (`decide`, `decideStatic`, `decideAndDeferSave`, `persist`) share + one internal `staticDecision()` helper so the seed-list + cache-lookup + branch is defined exactly once. - `seed-data/` — the five JSON files copied from kube's `shared/`: `directoryIgnore.json`, `filenameIgnore.json`, `ignorePatterns.json`, `extensions.json`, `llmDecisionsBase.json`. `llmDecisionsBase.json` is @@ -54,8 +89,15 @@ single-tenant public layout. beyond reading the cache file once at factory time. Only the LLM branch reads file content from disk, and even that is bounded by `Config.SkipDecisionMaxCharsForLlm`. -- Every LLM verdict is flushed to disk immediately so a crash mid-scan does - not lose decisions made earlier in the run. +- `decide` flushes to disk immediately after each LLM verdict — same + semantics as before this refactor, so crash mid-scan does not lose + decisions made earlier in the run when the legacy inline path is in use. +- `decideAndDeferSave` does **not** flush; the batched caller (two-pass + scan) is responsible for calling `persist()` exactly once after the + parallel batch resolves. This avoids racing tmp/rename writes when many + unique extensions resolve concurrently. Crash recovery in two-pass mode + is acceptable because the batch is short and re-running the scan + re-resolves the same decisions. 
- LLM failure defaults to reject and caches the rejection — matches kube's one-shot-rule behavior. Users can hand-edit the cache to revisit. - The decider is process-local: tests may construct one with `cachePath` diff --git a/packages/ingest-github/src/pipeline/skip-decisions/decider.ts b/packages/ingest-github/src/pipeline/skip-decisions/decider.ts index 91ad7ae..50185e8 100644 --- a/packages/ingest-github/src/pipeline/skip-decisions/decider.ts +++ b/packages/ingest-github/src/pipeline/skip-decisions/decider.ts @@ -2,9 +2,9 @@ import { readFile } from "node:fs/promises"; import path from "node:path"; import { Config } from "@bb/types"; import { getConfigValue } from "@bb/config"; -import { askYesNoLLM } from "@bb/llm"; +import { askYesNoLLM, type AskLlmOptions } from "@bb/llm"; import { logger } from "@bb/logger"; -import type { SkipDecider, SkipDeciderInput, SkipDecision } from "src/types/pipeline.ts"; +import type { SkipDecider, SkipDeciderInput, SkipDecision } from "#src/types/pipeline.ts"; import { defaultCachePath, emptyCache, @@ -29,6 +29,11 @@ export interface SkipDeciderDeps { cachePath?: string; } +interface StaticDecisionContext { + filename: string; + segments: string[]; +} + export function makeSkipDecider(deps: SkipDeciderDeps = {}): SkipDecider { const enabled = getConfigValue(Config.SkipDecisionEnabled); const cachePath = deps.cachePath ?? defaultCachePath(); @@ -37,58 +42,98 @@ export function makeSkipDecider(deps: SkipDeciderDeps = {}): SkipDecider { logCacheSummary(cache); } - return { - async decide(input: SkipDeciderInput): Promise { - const segments = input.relativePath.split("/"); - const filename = segments[segments.length - 1] ?? 
input.relativePath; - for (const segment of segments.slice(0, -1)) { - if (SEED_DIRECTORIES.has(segment)) { - return "reject-static"; - } - } - if (SEED_FILENAMES.has(filename)) { - return "reject-static"; - } - if (input.ext.length > 0 && SEED_EXTENSIONS.has(input.ext)) { - return "reject-static"; - } - if (matchesAnyGlob(filename)) { + function contextFor(input: SkipDeciderInput): StaticDecisionContext { + const segments = input.relativePath.split("/"); + const filename = segments[segments.length - 1] ?? input.relativePath; + return { filename, segments }; + } + + function staticDecision(input: SkipDeciderInput): SkipDecision | null { + const { filename, segments } = contextFor(input); + for (const segment of segments.slice(0, -1)) { + if (SEED_DIRECTORIES.has(segment)) { return "reject-static"; } + } + if (SEED_FILENAMES.has(filename)) { + return "reject-static"; + } + if (input.ext.length > 0 && SEED_EXTENSIONS.has(input.ext)) { + return "reject-static"; + } + if (matchesAnyGlob(filename)) { + return "reject-static"; + } - if (input.ext.length > 0 && KNOWN_LANGUAGE_EXTENSIONS.has(input.ext)) { - return "accept"; - } + if (input.ext.length > 0 && KNOWN_LANGUAGE_EXTENSIONS.has(input.ext)) { + return "accept"; + } - if (!enabled) { - return "accept"; - } + if (!enabled) { + return "accept"; + } - const cacheKey = input.ext.length > 0 ? input.ext : filename; - const section = input.ext.length > 0 ? cache.extensions : cache.filenames; - const cached = section[cacheKey]; - if (cached !== undefined) { - return cached.ignore ? "reject-llm" : "accept-llm"; - } + const cacheKey = input.ext.length > 0 ? input.ext : filename; + const section = input.ext.length > 0 ? cache.extensions : cache.filenames; + const cached = section[cacheKey]; + if (cached !== undefined) { + return cached.ignore ? 
"reject-llm" : "accept-llm"; + } + return null; + } + + async function resolveLlm(input: SkipDeciderInput): Promise { + const { filename } = contextFor(input); + const decision = await askLlmDecision(input, deps.repositoryName, input.llmCallContext); + if (input.ext.length > 0) { + setExtensionDecision(cache, input.ext, !decision, "llm", deps.repositoryName, input.relativePath); + } else { + setFilenameDecision(cache, filename, !decision, "llm", deps.repositoryName, input.relativePath); + } + return decision ? "accept-llm" : "reject-llm"; + } + + function persist(): void { + if (!enabled) { + return; + } + try { + saveCache(cachePath, cache); + } catch (cause: unknown) { + const msg = cause instanceof Error ? cause.message : String(cause); + logger.warn(`skip-decisions: failed to save cache to ${cachePath}: ${msg}`); + } + } - const decision = await askLlmDecision(input, deps.repositoryName); - if (input.ext.length > 0) { - setExtensionDecision(cache, input.ext, !decision, "llm", deps.repositoryName, input.relativePath); - } else { - setFilenameDecision(cache, filename, !decision, "llm", deps.repositoryName, input.relativePath); + return { + async decide(input: SkipDeciderInput): Promise { + const sync = staticDecision(input); + if (sync !== null) { + return sync; } - try { - saveCache(cachePath, cache); - } catch (cause: unknown) { - const msg = cause instanceof Error ? cause.message : String(cause); - logger.warn(`skip-decisions: failed to save cache to ${cachePath}: ${msg}`); + const result = await resolveLlm(input); + persist(); + return result; + }, + decideStatic(input: SkipDeciderInput): SkipDecision | null { + return staticDecision(input); + }, + async decideAndDeferSave(input: SkipDeciderInput): Promise { + const sync = staticDecision(input); + if (sync !== null) { + return sync; } - return decision ? 
/**
 * Matches scp-style Git remotes (`[user@]host:path`, e.g. `git@github.com:owner/repo.git`).
 * The URL parser rejects these, so they need their own fallback. The negative look-ahead
 * keeps `scheme://…` strings out of this branch.
 */
const SCP_STYLE_REMOTE = /^(?:[^@\/\s]+@)?[^:\/\s]+:(?!\/\/)(.+)$/u;

/** Text produced by `String(x)` for plain objects; carries no diagnostic value. */
const OPAQUE_OBJECT_TEXT = "[object Object]";

/**
 * Builds `owner/repo` from the last two non-empty, trimmed segments of a
 * slash-separated path, dropping a trailing `.git` from the repo segment.
 *
 * @returns `undefined` when fewer than two segments exist or the repo segment
 *          is empty once `.git` is removed.
 */
function ownerAndRepo(pathLike: string): string | undefined {
  const segments = pathLike
    .split("/")
    .map((segment) => segment.trim())
    .filter((segment) => segment.length > 0);
  if (segments.length < 2) {
    return undefined;
  }
  const owner: string | undefined = segments[segments.length - 2];
  const repo: string | undefined = segments[segments.length - 1]?.replace(/\.git$/u, "");
  if (owner === undefined || repo === undefined || repo.length === 0) {
    return undefined;
  }
  return `${owner}/${repo}`;
}

/**
 * Derives a short `owner/repo` label from a repository remote.
 *
 * Handles anything `new URL()` can parse (https, ssh://, …) and, as a
 * fallback, scp-style SSH remotes such as `git@github.com:owner/repo.git`.
 *
 * @param repoUrl Remote URL or scp-style remote.
 * @returns `owner/repo`, or `repoUrl` unchanged when no owner/repo pair can be extracted.
 */
export function repoNameFromUrl(repoUrl: string): string {
  let pathLike: string | undefined;
  try {
    pathLike = new URL(repoUrl).pathname;
  } catch {
    // Not a parseable URL: try the scp-like `[user@]host:path` form before giving up.
    pathLike = SCP_STYLE_REMOTE.exec(repoUrl.trim())?.[1];
  }
  const name = pathLike === undefined ? undefined : ownerAndRepo(pathLike);
  return name ?? repoUrl;
}

/**
 * Returns the last non-empty `/`-separated segment of a directory path.
 *
 * @param rootDir Directory path using `/` separators; trailing slashes are ignored.
 * @returns The final segment, or `rootDir` itself when it has no non-empty segment (e.g. `""` or `"/"`).
 */
export function localRepoName(rootDir: string): string {
  const segments = rootDir.split("/").filter((segment) => segment.length > 0);
  const last: string | undefined = segments[segments.length - 1];
  return last ?? rootDir;
}

/**
 * Renders a caught value as a human-readable message.
 *
 * - `Error` instances yield their `message`.
 * - Primitives, `null` and `undefined` go through `String()`.
 * - Objects carrying a string `message` property yield that property.
 * - Other objects use their own `toString()` when it is informative, otherwise
 *   their JSON form, so the result is never a bare `[object Object]` when
 *   something better is available.
 *
 * Never throws: this runs on error paths where a second failure would mask the first.
 */
export function describe(cause: unknown): string {
  if (cause instanceof Error) {
    return cause.message;
  }
  if (typeof cause !== "object" || cause === null) {
    return String(cause);
  }
  if ("message" in cause && typeof cause.message === "string") {
    return cause.message;
  }
  try {
    const text = String(cause);
    if (text !== OPAQUE_OBJECT_TEXT) {
      return text;
    }
    // JSON.stringify can return undefined at runtime (e.g. toJSON() returning undefined).
    const json: string | undefined = JSON.stringify(cause);
    return json ?? text;
  } catch {
    // Circular structures or objects without a usable toString end up here.
    return OPAQUE_OBJECT_TEXT;
  }
}
*/ +export const nullProgressContextFactory: ProgressContextFactory = (_knowledgeId: string) => SINGLETON; diff --git a/packages/ingest-github/src/progress/README.md b/packages/ingest-github/src/progress/README.md new file mode 100644 index 0000000..e6d1013 --- /dev/null +++ b/packages/ingest-github/src/progress/README.md @@ -0,0 +1,35 @@ +# `ingest-github / progress` + +**Tier:** Domain extension port + +## Responsibility + +Defines the host-binary extension port for observing ingestion-phase progress without coupling `@bb/ingest-github` to any transport. + +The strategy emits two kinds of signals through this port: + +- **Intra-phase ticks** via `ProgressReporter` — one reporter per phase or sub-phase of one job, driven by the strategy as it makes progress. +- **Phase boundaries and terminal state** via `ProgressContext.phaseChanged / completed / failed`. + +A host binary supplies a `ProgressContextFactory(knowledgeId)`. `@bb/server` does not — it falls back to `nullProgressContextFactory`, which discards every signal. + +## Public API + +- `ProgressPhase` — `"clone" | "scan" | "file_analysis" | "folder_analysis" | "indexing"`. `clone` and `scan` are emitted by `runGithub` (the runner) before the strategy starts, so SSE clients see liveness during the network/disk-bound prelude. `file_analysis`, `folder_analysis`, and `indexing` are emitted by the strategy. +- `ProgressTotalMode` — `{ kind: "fixed"; total }` or `{ kind: "growing"; initialTotal? }` +- `ProgressReporterInput` — phase + sub-phase + total mode + optional restart-seed hook +- `ProgressReporter` — `start / increment / incrementSeen / setTotal / stop` +- `ProgressContext` — bundles `reporter()` with boundary-event publishers +- `ProgressContextFactory` — `(knowledgeId) => ProgressContext` +- `nullProgressContextFactory` — no-op fallback used when the host does not supply one + +## Invariants + +- Pure types and a no-op default. No transport. No outbound calls. 
/**
 * Progress-reporting extension port.
 *
 * `@bb/ingest-github` exposes these types so a host binary can observe
 * phase progress without the strategy importing the host's transport. The
 * default is a no-op (`NullProgressContext`) — consistent with the
 * no-outbound-calls posture.
 */

/** Coarse phase of an ingestion job, used for boundary events and to label reporters. */
export type ProgressPhase = "clone" | "scan" | "file_analysis" | "folder_analysis" | "indexing";

/**
 * How a reporter's denominator behaves:
 * - `fixed`   — the total is known up front.
 * - `growing` — the total is discovered while the work streams in; an
 *   optional `initialTotal` seeds it.
 */
export type ProgressTotalMode = { kind: "fixed"; total: number } | { kind: "growing"; initialTotal?: number };

/** Arguments to `ProgressContext.reporter()`: phase, optional sub-phase label, and total mode. */
export interface ProgressReporterInput {
  readonly phase: ProgressPhase;
  readonly subPhase?: string;
  readonly total: ProgressTotalMode;
  /**
   * Optional restart-seed hook: supplies the initial processed count,
   * either synchronously or through a promise.
   */
  readonly resolveInitialProcessed?: () => Promise<number> | number;
}

/**
 * Per-phase progress sink. One instance per phase or sub-phase of a job.
 * The host implementation decides whether emissions are timer-sampled,
 * push-per-call, persisted, or discarded.
 */
export interface ProgressReporter {
  /** Begin reporting; resolves once the host side is ready. */
  start(): Promise<void>;
  /** Record completed work; `meta.fileName` optionally names the item just finished. */
  increment(delta?: number, meta?: { fileName?: string }): void;
  /** Grow the denominator when the work set is a streaming iterator. */
  incrementSeen(delta?: number): void;
  /** Replace the denominator with a known total. */
  setTotal(total: number): void;
  /** End reporting for this phase or sub-phase. */
  stop(): void;
}

/**
 * Bundle of progress facilities scoped to a single ingestion job. Returned
 * by `ProgressContextFactory(knowledgeId)`.
 */
export interface ProgressContext {
  /** Create a reporter for one phase or sub-phase. */
  reporter(input: ProgressReporterInput): ProgressReporter;
  /** Signal that the job crossed into `phase`. */
  phaseChanged(phase: ProgressPhase): void;
  /** Emit the terminal success event, with an optional message. */
  completed(message?: string): void;
  /**
   * Emit a terminal FAILED event. `error` is a short operator-readable
   * sentence (e.g. "OpenRouter is out of credits"). `category` is the
   * classification taxonomy (`"llm_config" | "llm_auth" | "llm_quota" |
   * "llm_rate_limit" | "llm_unreachable" | "cancelled" | "internal"`).
   * `detail` is the optional raw provider response or structured debug
   * payload — UIs typically hide it behind a disclosure.
   */
  failed(error: string, phase?: ProgressPhase, category?: string, detail?: string): void;
}

/** Builds the `ProgressContext` for one job, keyed by its `knowledgeId`. */
export type ProgressContextFactory = (knowledgeId: string) => ProgressContext;
Each phase persists artifacts on -disk before the next begins, so a crash resumes cleanly from the next -sub-phase boundary. +The v2 ingestion strategy: scan + classify → analyse small + big in parallel → +field backfill → folder summary → repo summary → graph store. Each phase +persists artifacts on disk before the next begins, so a crash resumes cleanly +from the next sub-phase boundary. + +The strategy constructs **one shared `ConcurrencyLimiter`** at entry (sized by +`Config.LlmConcurrency`, default 29). Every LLM call across small-file +analyses, big-file chunk analyses, per-file condense calls, the skip-decision +LLM gate (during scan), field backfill, and folder summaries checks out from +this single pool. One knob bounds total in-flight LLM concurrency. ## Phases -1. **classify-and-analyse-small** (`phases/classify-and-analyse-small.ts`) — - walks `source.scan({ skipDecider })`; small files → LLM file-analysis → - write `CondensedFileAnalysis` Oversized files → write a stub. Big-by-tokens - files → append to `bigFiles.json` for Phase 2. -2. **process-big-files** (`phases/process-big-files.ts`) — reads - `bigFiles.json`, calls `source.readFile(relativePath)` per entry, - dispatches `processBigFile` sequentially (chunk-level concurrency - inside). -3. **backfill-fields** (`backfill/fields.ts`) — top up `keywords`, - `sideEffects`, `configDependencies`, `dataFlowDirection` on condensed - entries that miss them. Idempotent. -4. **backfill-big-files** (`backfill/big-files.ts`) — re-condense entries - whose chunks exist but condensed JSON is stale or missing. -5. **summarise-folders** (`folder-summary.ts`) — group condensed entries by - `path.posix.dirname` (root = ""), one LLM call per folder, persist to - `folder-summaries/.json`. -6. **summarise-repo** (`repo-summary.ts`) — load folder summaries +1. 
**scan-and-classify** (`phases/scan-and-classify.ts`) — walks + `source.scan({ skipDecider, limiter })` once, tokenises each file, classifies + as `small` / `big` / `oversized`, and writes + `meta-output/scan-manifest.json` (canonical) plus the legacy + `bigFiles.json` (for the pull-path consumers). Scan internally uses a + **two-pass** strategy: walk + cache-only `decideStatic` first, then + parallel-deduplicated LLM resolution for unknown extensions/filenames + through the shared limiter, then drain. + 2a. **analyse-small** (`phases/analyse-small.ts`) — reads the manifest's + `kind: "small"` entries, re-opens content, runs the LLM file-analyser + per file under the shared limiter, writes `CondensedFileAnalysis` JSON. + Also writes oversized stubs. + 2b. **analyse-big-files** (`phases/process-big-files.ts` — + `analyseBigFiles`) — chunk-task queue across all big files. Every chunk + is an independent task on the shared limiter; per-file condense is + scheduled as soon as that file's last chunk lands (one in-place retry + on transient condense failures). Runs **concurrently with 2a**. +3. **backfill-fields** (`backfill/fields.ts`) — for each cached condensed + entry with missing extended fields (`keywords`, `sideEffects`, + `dataFlowDirection`, `sectionMap`, …) dispatches one LLM call through + the shared limiter to fill the gaps. Idempotent — no-op on a complete + entry. +4. **summarise-folders** (`folder-summary.ts`) — groups condensed entries + by direct parent folder. Small folders + (`≤ Config.FolderSummaryBatchMaxFiles`, default 15) are batched up to + `Config.FolderSummaryBatchSize` (default 10) per LLM call. Bigger + folders take the individual single-folder path. Both flows run through + the shared limiter. +5. **summarise-repo** (`repo-summary.ts`) — load folder summaries shallowest-first; one call if it fits `ContextWindowLimit`, batch + merge otherwise; persist `repo-summary.json` with the v2-flat envelope. -7. 
**store-flat-analysis** (`phases/store-flat-analysis.ts`) — ensure +6. **store-flat-analysis** (`phases/store-flat-analysis.ts`) — ensure flat-folder indexes, upsert `:Repo`, then every `:Folder`, then every `:File` with the extended analysis + Folder→File `CONTAINS` edge. +## Progress events + +The strategy emits progress through the `ProgressContext` port defined in +`src/progress/`. `createFlatFolderStrategy(deps)` accepts an optional +`progressContextFactory`; absent → `nullProgressContextFactory` +(no-op, OSS default). + +- **Boundary events** are split between the runner and the strategy: + - `phaseChanged("clone")` is emitted by `pipeline/run.ts` (the runner) + before `syncRepository`, so the SSE stream stays alive during the + network/disk-bound prelude. + - `phaseChanged("scan")` is emitted by `index.ts` before phase 1. + - `phaseChanged("file_analysis")` before the parallel 2a/2b block. + - `phaseChanged("folder_analysis")` before phase 4 (folder summaries). + - `phaseChanged("indexing")` before phase 5 (which feeds phase 6). + - `completed()` after phase 6 returns. + - `failed(message)` from a `try/catch` wrapping the whole `execute`. +- **Intra-phase ticks** are emitted via per-phase reporters created from + `progressContext.reporter(...)`. Sub-phase labels: + - phase 1 (scan) → no sub-phase, growing total (driven by `incrementSeen`). + - phase 2a (analyse-small) → `analyse_small`, fixed total = + `smallCount + oversizedCount`. + - phase 2b (analyse-big) → two reporters: `big_files_chunks` (fixed total + = sum of estimated chunks across all big files) and `big_files_condense` + (fixed total = `bigCount`). + - phase 3 → `backfill`, fixed total = `cache.size`. + - phase 4 → no sub-phase, fixed total = directly-grouped folder count. + - phase 6 → `folders` (growing) then `files` (fixed total = `cache.size`). 
+- **Pull-path-only sub-phases** (emitted by `pipeline/pull.ts` workflow, + not the main strategy): `big_files_queue` (legacy single-file driver), + `big_file:` (per-big-file chunk pulses inside the legacy + driver), `pull` (`analyse-changed.ts` selective file analysis). +- **Total mode**: scan is the only main-strategy phase that uses + `growing` mode. Everything else has fixed totals known up front from the + scan manifest, the file-analysis cache, or the folder grouping. +- The cancellation path in `execute` lets `CancellationError` propagate + past the orchestrator; `failed()` only fires for non-cancellation + errors. + ## Files -- `index.ts` — `createFlatFolderStrategy(deps)` orchestrates the 7 phases. +- `index.ts` — `createFlatFolderStrategy(deps)` orchestrates the phases. + Accepts `{ fileAnalyzer, progressContextFactory? }`. Constructs one + `ProgressContext` per job AND one shared `ConcurrencyLimiter` per job + (sized by `Config.LlmConcurrency`); threads both into every phase that + needs them. - `types.ts` — `AnalyzedFileEntry`, `FolderSummary`, `RepoSummary`, `RepoSummaryEnvelope`, `FlatFolderResult`. -- `analyse-file.ts` — `analyseScannedFile(analyzer, file)` + `buildOversizedStub`. +- `analyse-file.ts` — `analyseScannedFile(analyzer, file, llmCallContext?)` + `buildOversizedStub`. +- `analyse-changed.ts` — `analyseChangedFiles({knowledgeId, source, metaPaths, analyzer, diff, llmCallContext?, archiveSink?, progressContext?})`. Pull-time per-file dispatcher. Reads changed file content through `input.source` (a `SourceReader`) so it works with both the disk-backed reader (OSS default) and any HTTP-backed alternative supplied via the `pullFactory` hook. Mirrors `analyseSmallFiles`'s per-file path: filter → fetch → size cap → binary detect → line count → analyse → save + archive push. Does NOT invoke the skip-decision LLM gate. 
When `progressContext` is present it creates a fixed-total reporter (`subPhase: "pull"`, `total = dedupedPaths.length`) and increments per-path so the pull SSE stream stays live. +- `file-analysis-cache.ts` — in-memory `Map` + loaded once between phase 2 and phase 3; shared read-only by phases 3, 4, + 6; mutated by phase 3 backfill via `cache.set(entry)` so downstream phases + see updated entries without re-reading disk. +- `scan-manifest.ts` — `ScanManifest` shape, `readScanManifest`, + `writeScanManifest`. The canonical handoff between phase 1 and phases 2a/2b. - `folder-path.ts` — `directFolderOf`, `affectedFolderPaths`. -- `folder-summary.ts` — group + summarise + persist + iterate folder summaries. +- `folder-summary.ts` — group + summarise (individual or batched) + persist + - iterate folder summaries; shared `dispatchFolderSummaries` used by both + the main strategy and the pull-path's selective folder phase. +- `folder-summary-selective.ts` — pull-time selective folder summary phase. - `repo-summary.ts` — single-shot or batched repo summary with envelope writer. -- `phases/classify-and-analyse-small.ts` — Phase 1. -- `phases/process-big-files.ts` — Phase 2. -- `phases/store-flat-analysis.ts` — Phase 7. -- `backfill/fields.ts` — Phase 3. -- `backfill/big-files.ts` — Phase 4. -- `big-file/` — chunker, analyzer, condenser, storage, cache for Phase 2 & 4. +- `phases/scan-and-classify.ts` — Phase 1. +- `phases/analyse-small.ts` — Phase 2a. +- `phases/process-big-files.ts` — Phase 2b (`analyseBigFiles`, chunk-task + queue) plus the legacy `processBigFilesQueue` driver used by the pull-path. +- `phases/store-flat-analysis.ts` — Phase 6. +- `backfill/fields.ts` — Phase 3 (parallel via shared limiter). +- `big-file/` — chunker, analyzer, condenser, storage, cache used by both + big-file drivers. - `prompts/` — LLM prompts shared across the phases. ## Invariants @@ -66,3 +139,15 @@ sub-phase boundary. 
after `saveCondensed`; failures inside the sink are logged WARN and do not interrupt the analyse loop. The open-source binary never wires a sink — `archiveSink` is undefined and the call is skipped entirely. +- **Per-call LLM credentials thread through every phase.** The orchestrator + reads `context.llmCallContext` (an optional `AskLlmOptions` built by + the runner from `GithubIndexPayload.{llmApiKey, llmProvider, llmModel}`) + and forwards it into every phase that issues LLM calls: phase 1 via + `scanAndClassify` (forwarded into `source.scan({ llmCallContext })` for + the skip-decision LLM gate), phase 2a via `analyseSmallFiles`, phase 2b + via `analyseBigFiles` (which threads it into **both** the chunk analyzer + and `condenseChunks`), phase 3 via `backfillMissingFields`, phase 4 via + `runFolderSummaryPhase`, phase 5 via `summariseRepo`. The phases pass + the same option object through to `askJsonLLM` so the per-call override + reaches `@bb/llm` unchanged. When `llmCallContext` is undefined the call + falls back to `Config.OpenrouterApiKey` + `Config.LlmProvider`. 
diff --git a/packages/ingest-github/src/strategies/flat-folder/analyse-changed.ts b/packages/ingest-github/src/strategies/flat-folder/analyse-changed.ts index 1f10ae8..17f0125 100644 --- a/packages/ingest-github/src/strategies/flat-folder/analyse-changed.ts +++ b/packages/ingest-github/src/strategies/flat-folder/analyse-changed.ts @@ -1,33 +1,31 @@ import path from "node:path"; -import { readFile, stat } from "node:fs/promises"; -import { tokenLen } from "@bb/llm"; +import { tokenLen, type AskLlmOptions } from "@bb/llm"; +import { LlmConfigError, LlmError } from "@bb/errors"; import { logger } from "@bb/logger"; import { Config } from "@bb/types"; import { getConfigValue } from "@bb/config"; -import type { FileAnalyzer, ScannedFile } from "src/types/pipeline.ts"; -import type { MetaPaths } from "src/types/meta-paths.ts"; -import type { BigFileEntry } from "src/types/big-file.ts"; -import { looksBinary, passesPathFilters } from "src/pipeline/filters.ts"; -import { withConcurrency } from "src/pipeline/concurrency.ts"; -import { throwIfCancelled, CancellationError } from "src/pipeline/cancellation.ts"; -import type { DiffResult } from "src/pipeline/git-diff.ts"; -import { analyseScannedFile, buildOversizedStub } from "src/strategies/flat-folder/analyse-file.ts"; -import { saveCondensed } from "src/strategies/flat-folder/big-file/storage.ts"; -import { readBigFiles, writeBigFiles } from "src/strategies/flat-folder/big-file/detector.ts"; +import type { ArchiveSink, FileAnalyzer, ScannedFile, SourceReader } from "#src/types/pipeline.ts"; +import type { MetaPaths } from "#src/types/meta-paths.ts"; +import type { BigFileEntry } from "#src/types/big-file.ts"; +import type { ProgressContext } from "#src/progress/types.ts"; +import { looksBinary, passesPathFilters } from "#src/pipeline/filters.ts"; +import { withConcurrency } from "#src/pipeline/concurrency.ts"; +import { throwIfCancelled, CancellationError } from "#src/pipeline/cancellation.ts"; +import type { DiffResult } 
from "#src/pipeline/git-diff.ts"; +import { analyseScannedFile, buildOversizedStub } from "#src/strategies/flat-folder/analyse-file.ts"; +import { saveCondensed } from "#src/strategies/flat-folder/big-file/storage.ts"; +import { readBigFiles, writeBigFiles } from "#src/strategies/flat-folder/big-file/detector.ts"; export interface AnalyseChangedInput { knowledgeId: string; - repoDir: string; + source: SourceReader; metaPaths: MetaPaths; analyzer: FileAnalyzer; diff: DiffResult; - /** - * Invoked once per consumed path (analysed, stubbed, queued-as-big-file, - * filtered, or failed). Lets the caller drive a `processedFiles` counter - * for the progress bar without coupling this strategy to mongo. Best - * effort — errors from the callback are swallowed. - */ - onFileProcessed?: () => Promise | void; + llmCallContext?: AskLlmOptions; + /** Optional non-fatal archive sink. When set, analysed content is pushed after `saveCondensed`. */ + archiveSink?: ArchiveSink; + progressContext?: ProgressContext; } export interface AnalyseChangedResult { @@ -36,22 +34,28 @@ export interface AnalyseChangedResult { oversizedStubs: number; skipped: number; failed: number; + tokenUsage: { inputTokens: number; outputTokens: number; costUsd: number }; } /** - * Pull-time per-file dispatcher. Iterates the changed file set from the git - * diff and runs the same per-file work as `classifyAndAnalyseSmall`, but + * Pull-time per-file dispatcher. Iterates the changed file set from the + * diff and runs the same per-file work as `analyseSmallFiles`, but * targeted at known paths rather than a tree walk. * - * For added / modified / renamed-to paths: read content, apply static path - * filters, classify by tokens. Small files run the analyser inline and - * persist a `CondensedFileAnalysis`. Files above the context window join - * `bigFiles.json` for the big-file phase. Files above the absolute size cap - * get an oversized stub. 
+ * Reads file content through `input.source` (a `SourceReader`) so the + * dispatcher works with both the disk-backed reader (OSS default) and + * any HTTP-backed alternative supplied via the pull factory hook. * - * The dispatcher does NOT invoke the skip-decision LLM gate. Pulls re-analyse - * paths that already passed the gate during the initial index (or paths so - * new the gate has not seen them yet — for v1 we accept that lag). + * For added / modified / renamed-to paths: read content, apply static + * path filters, classify by tokens. Small files run the analyser inline + * and persist a `CondensedFileAnalysis`. Files above the context window + * join `bigFiles.json` for the big-file phase. Files above the absolute + * size cap get an oversized stub. + * + * The dispatcher does NOT invoke the skip-decision LLM gate. Pulls + * re-analyse paths that already passed the gate during the initial + * index (or paths so new the gate has not seen them yet — for v1 we + * accept that lag). 
*/ export async function analyseChangedFiles(input: AnalyseChangedInput): Promise { const contextWindowLimit = getConfigValue(Config.ContextWindowLimit); @@ -75,112 +79,147 @@ export async function analyseChangedFiles(input: AnalyseChangedInput): Promise[] = []; - for (const relativePath of dedupedPaths) { - throwIfCancelled(input.knowledgeId); - const filename = path.basename(relativePath); - const ext = path.extname(filename).toLowerCase(); - if (!passesPathFilters(filename, ext)) { - skipped += 1; - continue; - } - - const abs = path.join(input.repoDir, relativePath); - let sizeBytes: number; - try { - sizeBytes = (await stat(abs)).size; - } catch (cause: unknown) { - failed += 1; - logger.warn(`pull-analyse: stat failed for ${relativePath}: ${describe(cause)}`); - continue; - } + const reporter = input.progressContext?.reporter({ + phase: "file_analysis", + subPhase: "pull", + total: { kind: "fixed", total: dedupedPaths.length }, + }); + await reporter?.start(); - if (sizeBytes > absoluteCap) { - bigFileBuffer.push({ - relativePath, - sizeBytes, - tokenCount: 0, - reason: "too-large", - }); - try { - await saveCondensed(input.metaPaths, buildOversizedStub(relativePath, sizeBytes)); - oversizedStubs += 1; - } catch (cause: unknown) { - failed += 1; - logger.warn(`pull-analyse: oversized stub write failed for ${relativePath}: ${describe(cause)}`); + try { + for (const relativePath of dedupedPaths) { + throwIfCancelled(input.knowledgeId); + const filename = path.basename(relativePath); + const ext = path.extname(filename).toLowerCase(); + if (!passesPathFilters(filename, ext)) { + skipped += 1; + reporter?.increment(1, { fileName: relativePath }); + continue; } - continue; - } - let buf: Buffer; - try { - buf = await readFile(abs); - } catch (cause: unknown) { - failed += 1; - logger.warn(`pull-analyse: read failed for ${relativePath}: ${describe(cause)}`); - continue; - } - if (looksBinary(buf)) { - skipped += 1; - continue; - } - const content = 
buf.toString("utf8"); - if (countLines(content) > bigFileLineThreshold) { - bigFileBuffer.push({ - relativePath, - sizeBytes, - tokenCount: 0, - reason: "too-large", - }); + let content: string; try { - await saveCondensed(input.metaPaths, buildOversizedStub(relativePath, sizeBytes)); - oversizedStubs += 1; + content = await input.source.readFile(relativePath); } catch (cause: unknown) { failed += 1; - logger.warn(`pull-analyse: oversized stub write failed for ${relativePath}: ${describe(cause)}`); + logger.warn(`pull-analyse: read failed for ${relativePath}: ${describe(cause)}`); + reporter?.increment(1, { fileName: relativePath }); + continue; } - continue; - } + if (content.length === 0) { + skipped += 1; + reporter?.increment(1, { fileName: relativePath }); + continue; + } + const sizeBytes = Buffer.byteLength(content, "utf8"); - const tokenCount = tokenLen(content); - if (tokenCount > contextWindowLimit) { - bigFileBuffer.push({ - relativePath, - sizeBytes, - tokenCount, - reason: "context-window-exceeded", - }); - continue; - } + if (sizeBytes > absoluteCap) { + bigFileBuffer.push({ + relativePath, + sizeBytes, + tokenCount: 0, + reason: "too-large", + }); + try { + await saveCondensed(input.metaPaths, buildOversizedStub(relativePath, sizeBytes)); + oversizedStubs += 1; + } catch (cause: unknown) { + failed += 1; + logger.warn(`pull-analyse: oversized stub write failed for ${relativePath}: ${describe(cause)}`); + } + reporter?.increment(1, { fileName: relativePath }); + continue; + } - const scanned: ScannedFile = { - kind: "file", - relativePath, - absolutePath: abs, - sizeBytes, - content, - }; - pending.push( - limit(async () => { + if (looksBinary(Buffer.from(content, "utf8"))) { + skipped += 1; + reporter?.increment(1, { fileName: relativePath }); + continue; + } + if (countLines(content) > bigFileLineThreshold) { + bigFileBuffer.push({ + relativePath, + sizeBytes, + tokenCount: 0, + reason: "too-large", + }); try { - throwIfCancelled(input.knowledgeId); 
- const condensed = await analyseScannedFile(input.analyzer, scanned); - await saveCondensed(input.metaPaths, condensed); - smallFilesAnalysed += 1; + await saveCondensed(input.metaPaths, buildOversizedStub(relativePath, sizeBytes)); + oversizedStubs += 1; } catch (cause: unknown) { - if (cause instanceof CancellationError) { - throw cause; - } failed += 1; - logger.warn(`pull-analyse: analyse failed for ${relativePath}: ${describe(cause)}`); + logger.warn(`pull-analyse: oversized stub write failed for ${relativePath}: ${describe(cause)}`); } - }), - ); - } + reporter?.increment(1, { fileName: relativePath }); + continue; + } + + const tokenCount = tokenLen(content); + if (tokenCount > contextWindowLimit) { + bigFileBuffer.push({ + relativePath, + sizeBytes, + tokenCount, + reason: "context-window-exceeded", + }); + // Big-file path runs in its own phase; this entry leaves the small-loop accounting. + reporter?.increment(1, { fileName: relativePath }); + continue; + } - await Promise.all(pending); + const scanned: ScannedFile = { + kind: "file", + relativePath, + absolutePath: relativePath, + sizeBytes, + content, + }; + const fileContent = content; + const filePath = relativePath; + pending.push( + limit(async () => { + try { + throwIfCancelled(input.knowledgeId); + const condensed = await analyseScannedFile(input.analyzer, scanned, input.llmCallContext); + await saveCondensed(input.metaPaths, condensed); + if (input.archiveSink !== undefined) { + await input.archiveSink.push({ + knowledgeId: input.knowledgeId, + relativePath: filePath, + content: fileContent, + }); + } + if (condensed.tokenUsage) { + totalInputTokens += condensed.tokenUsage.inputTokens; + totalOutputTokens += condensed.tokenUsage.outputTokens; + totalCostUsd += condensed.tokenUsage.costUsd; + } + smallFilesAnalysed += 1; + } catch (cause: unknown) { + if (cause instanceof CancellationError) { + throw cause; + } + if (cause instanceof LlmConfigError || cause instanceof LlmError) { + throw cause; + 
} + failed += 1; + logger.warn(`pull-analyse: analyse failed for ${relativePath}: ${describe(cause)}`); + } + reporter?.increment(1, { fileName: filePath }); + }), + ); + } + + await Promise.all(pending); + } finally { + reporter?.stop(); + } if (bigFileBuffer.length > 0) { const existing = await readBigFiles(input.metaPaths); @@ -197,6 +236,7 @@ export async function analyseChangedFiles(input: AnalyseChangedInput): Promise { - const { language, analysis } = await analyzer.analyze({ +export async function analyseScannedFile( + analyzer: FileAnalyzer, + file: ScannedFile, + llmCallContext?: AskLlmOptions, +): Promise { + const analyzerInput: Parameters[0] = { relativePath: file.relativePath, content: file.content, - }); + }; + if (llmCallContext !== undefined) { + analyzerInput.llmCallContext = llmCallContext; + } + const { language, analysis, tokenUsage } = await analyzer.analyze(analyzerInput); return { relativePath: file.relativePath, language, @@ -19,6 +27,7 @@ export async function analyseScannedFile(analyzer: FileAnalyzer, file: ScannedFi totalTokenCount: 0, analysedAt: new Date().toISOString(), analysis, + tokenUsage, }; } diff --git a/packages/ingest-github/src/strategies/flat-folder/backfill/README.md b/packages/ingest-github/src/strategies/flat-folder/backfill/README.md index 7762c0f..f580f19 100644 --- a/packages/ingest-github/src/strategies/flat-folder/backfill/README.md +++ b/packages/ingest-github/src/strategies/flat-folder/backfill/README.md @@ -1,59 +1,72 @@ # `@bb/ingest-github/src/strategies/flat-folder/backfill` -Post-analysis top-up phases. After Phases 1 and 2 have produced -`CondensedFileAnalysis` JSON on disk, the backfill phases sweep the cache -to fill gaps left by per-file LLM noise or by interrupted big-file runs. -Both are idempotent and skip entries that already look complete. +Post-analysis top-up. 
After Phases 1 and 2 have produced +`CondensedFileAnalysis` JSON on disk, this phase sweeps the in-memory +cache to fill extended-analysis fields the main per-file prompt left +empty. Idempotent — entries that already look complete are skipped +without an LLM call. + +The big-file backfill phase that used to live here was removed: the +new chunk-task-queue model in `phases/process-big-files.ts` handles +crash recovery directly via the per-chunk disk cache and `inspect()`, +and same-run condense failures are now retried twice in-place before +being marked failed. ## Files -- `fields.ts` — Phase 3. `backfillMissingFields(metaPaths)` iterates every - condensed entry via `iterateCondensed`, computes which extended-analysis - fields are missing (`keywords`, `ontologyConcepts`, `businessEntities`, - `systemCapabilities`, `sideEffects`, `configDependencies`, - `dataFlowDirection`, `integrationSurface`, `contractsProvided`, - `contractsConsumed`, `sectionMap`), and asks one LLM call per file to - fill only the missing slots. The response is validated and normalised - (`pickStringArray`, `pickSections`) before being written back via - `saveCondensed`. Entries with nothing missing are skipped without an - LLM call. -- `big-files.ts` — Phase 4. `backfillBigFiles({knowledgeId, repoDir, -metaPaths})` re-reads `bigFiles.json`, skips `reason === "too-large"`, - and for each non-complete entry (per `inspect`) re-runs `processBigFile` - against the file on disk so the condensed JSON is rebuilt from cached - chunks where possible. +- `fields.ts` — Phase 3. 
`backfillMissingFields(metaPaths, cache, limiter, llmCallContext?, progressContext?)` + iterates every condensed entry from the shared `FileAnalysisCache`, + computes which extended-analysis fields are missing (`keywords`, + `ontologyConcepts`, `businessEntities`, `systemCapabilities`, + `sideEffects`, `configDependencies`, `dataFlowDirection`, + `integrationSurface`, `contractsProvided`, `contractsConsumed`, + `sectionMap`), and dispatches one LLM call per file **through the shared + `ConcurrencyLimiter`** to fill only the missing slots. Tasks run + concurrently up to `Config.LlmConcurrency`; the loop builds the task + array and awaits `Promise.all` at the end. The response is validated and + normalised (`pickStringArray`, `pickSections`) before being written back + via `saveCondensed` **and** mirrored into the cache via `cache.set(entry)` + so downstream phases (folder summary, graph store) see the updated entry + without re-reading disk. Entries with nothing missing are skipped + without an LLM call. Progress reporter is fixed-total sized by + `cache.size`. Emits `phase3 dispatching N backfill tasks` once every task has been queued (just before they are awaited) so the + caller can see how many tasks went through the limiter. ## Public interfaces -- `backfillMissingFields(metaPaths): Promise<{ updated, failed }>` -- `backfillBigFiles(input: BackfillBigFilesInput): Promise` +- `backfillMissingFields(metaPaths, cache, limiter, llmCallContext?, progressContext?): Promise<{ updated, failed }>` -Both return phase-summary counters consumed by `createFlatFolderStrategy` +Returns phase-summary counters consumed by `createFlatFolderStrategy` to roll up into the strategy result. ## Data ownership -These phases own no new on-disk artifacts. They mutate existing condensed -JSON in place via `saveCondensed`, and (Phase 4) drive `processBigFile` to -refresh the chunk and condensed caches under `big-file/storage.ts`. +This phase owns no new on-disk artifacts. 
It mutates existing +condensed JSON in place via `saveCondensed` and mirrors the same +mutation into `FileAnalysisCache`. ## Invariants - Idempotent: a second run is a no-op once every entry passes the completeness check. - Per-file LLM failure is logged and counted, never thrown. The phase - continues to the next entry. -- LLM output is untrusted: missing slots are filled only when the response - yields a non-empty value of the expected shape; partial responses leave - unfilled slots for a future pass. -- Phase 4 never touches `reason === "too-large"` entries — those stay as - stubs forever. + continues to the next entry. Only `LlmConfigError` / `LlmError` + propagate (treated as job-fatal upstream). +- LLM output is untrusted: missing slots are filled only when the + response yields a non-empty value of the expected shape; partial + responses leave unfilled slots for a future pass. +- Cache and disk stay in lockstep — every `saveCondensed` is paired + with a `cache.set(entry)` in the same code path. +- Concurrency is bounded by the shared `ConcurrencyLimiter` (today's + `Config.LlmConcurrency`). Counters (`updated`, `failed`, token totals) + are mutated from inside the concurrent tasks — safe under JS's + single-threaded event loop, no locking needed. ## External dependencies `@bb/llm` (`askJsonLLM`), `@bb/logger`, `@bb/mongo` (types only — `FileAnalysis`, `FileAnalysisSection`), the sibling -`flat-folder/big-file/` cache layer, and the prompts under +`flat-folder/file-analysis-cache.ts`, and the prompts under `flat-folder/prompts/backfill.ts`. 
## Tier diff --git a/packages/ingest-github/src/strategies/flat-folder/backfill/big-files.ts b/packages/ingest-github/src/strategies/flat-folder/backfill/big-files.ts deleted file mode 100644 index 9d1a393..0000000 --- a/packages/ingest-github/src/strategies/flat-folder/backfill/big-files.ts +++ /dev/null @@ -1,54 +0,0 @@ -import { logger } from "@bb/logger"; -import type { MetaPaths } from "src/types/meta-paths.ts"; -import type { SourceReader } from "src/types/pipeline.ts"; -import { readBigFiles } from "src/strategies/flat-folder/big-file/detector.ts"; -import { inspect } from "src/strategies/flat-folder/big-file/cache.ts"; -import { processBigFile } from "src/strategies/flat-folder/big-file/index.ts"; - -export interface BackfillBigFilesInput { - knowledgeId: string; - source: SourceReader; - metaPaths: MetaPaths; -} - -export interface BackfillBigFilesResult { - reCondensed: number; - failed: number; -} - -export async function backfillBigFiles(input: BackfillBigFilesInput): Promise { - const entries = await readBigFiles(input.metaPaths); - let reCondensed = 0; - let failed = 0; - for (const entry of entries) { - if (entry.reason === "too-large") { - continue; - } - const status = await inspect(input.metaPaths, entry.relativePath); - if (status === "complete") { - continue; - } - try { - const content = await input.source.readFile(entry.relativePath); - if (content.length === 0) { - failed += 1; - logger.warn(`phase4: empty content for ${entry.relativePath}; skipping`); - continue; - } - await processBigFile({ - knowledgeId: input.knowledgeId, - metaPaths: input.metaPaths, - relativePath: entry.relativePath, - content, - sizeBytes: entry.sizeBytes, - }); - reCondensed += 1; - } catch (cause: unknown) { - failed += 1; - const msg = cause instanceof Error ? 
cause.message : String(cause); - logger.warn(`phase4: re-condense failed for ${entry.relativePath}: ${msg}`); - } - } - logger.info(`phase4 done: reCondensed=${reCondensed} failed=${failed}`); - return { reCondensed, failed }; -} diff --git a/packages/ingest-github/src/strategies/flat-folder/backfill/fields.ts b/packages/ingest-github/src/strategies/flat-folder/backfill/fields.ts index 2d3e5d9..9effedb 100644 --- a/packages/ingest-github/src/strategies/flat-folder/backfill/fields.ts +++ b/packages/ingest-github/src/strategies/flat-folder/backfill/fields.ts @@ -1,10 +1,13 @@ -import { askJsonLLM } from "@bb/llm"; +import { askJsonLLM, type AskLlmOptions } from "@bb/llm"; +import { LlmConfigError, LlmError } from "@bb/errors"; import { logger } from "@bb/logger"; import type { FileAnalysis, FileAnalysisSection } from "@bb/mongo"; -import type { MetaPaths } from "src/types/meta-paths.ts"; -import { iterateCondensed } from "src/strategies/flat-folder/big-file/storage.ts"; -import { saveCondensed } from "src/strategies/flat-folder/big-file/storage.ts"; -import { BACKFILL_SYSTEM_PROMPT, buildBackfillUserPrompt } from "src/strategies/flat-folder/prompts/backfill.ts"; +import type { MetaPaths } from "#src/types/meta-paths.ts"; +import type { ProgressContext } from "#src/progress/types.ts"; +import type { ConcurrencyLimiter } from "#src/pipeline/concurrency.ts"; +import { saveCondensed } from "#src/strategies/flat-folder/big-file/storage.ts"; +import type { FileAnalysisCache } from "#src/strategies/flat-folder/file-analysis-cache.ts"; +import { BACKFILL_SYSTEM_PROMPT, buildBackfillUserPrompt } from "#src/strategies/flat-folder/prompts/backfill.ts"; const EXTENDED_ARRAY_KEYS = [ "ontologyConcepts", @@ -40,32 +43,65 @@ interface NeededFlags { sectionMap: boolean; } -export async function backfillMissingFields(metaPaths: MetaPaths): Promise<{ updated: number; failed: number }> { +export async function backfillMissingFields( + metaPaths: MetaPaths, + cache: FileAnalysisCache, + 
limiter: ConcurrencyLimiter, + llmCallContext?: AskLlmOptions, + progressContext?: ProgressContext, +): Promise<{ updated: number; failed: number }> { let updated = 0; let failed = 0; - for await (const entry of iterateCondensed(metaPaths)) { - const a = entry.analysis; - const needed = computeNeeded(a); - if (!hasAnyMissing(needed)) { - continue; - } - const userPrompt = buildBackfillUserPrompt(entry.relativePath, entry.analysis); - try { - const response = await askJsonLLM(BACKFILL_SYSTEM_PROMPT, userPrompt); - const result = response.result; - if (result === null) { + let dispatched = 0; + const reporter = progressContext?.reporter({ + phase: "file_analysis", + subPhase: "backfill", + total: { kind: "fixed", total: cache.size }, + }); + await reporter?.start(); + try { + const tasks: Promise[] = []; + for (const entry of cache.values()) { + const a = entry.analysis; + const needed = computeNeeded(a); + if (!hasAnyMissing(needed)) { + reporter?.increment(1, { fileName: entry.relativePath }); continue; } - applyBackfill(a, result, needed); - await saveCondensed(metaPaths, entry); - updated += 1; - } catch (cause: unknown) { - failed += 1; - logger.warn(`phase3: backfill failed for ${entry.relativePath}: ${describe(cause)}`); + dispatched += 1; + tasks.push( + limiter(async () => { + const userPrompt = buildBackfillUserPrompt(entry.relativePath, entry.analysis); + try { + const response = await askJsonLLM(BACKFILL_SYSTEM_PROMPT, userPrompt, llmCallContext ?? 
{}); + const result = response.result; + if (result === null) { + reporter?.increment(1, { fileName: entry.relativePath }); + return; + } + applyBackfill(a, result, needed); + await saveCondensed(metaPaths, entry); + cache.set(entry); + updated += 1; + } catch (cause: unknown) { + if (cause instanceof LlmConfigError || cause instanceof LlmError) { + throw cause; + } + failed += 1; + logger.warn(`phase3: backfill failed for ${entry.relativePath}: ${describe(cause)}`); + } finally { + reporter?.increment(1, { fileName: entry.relativePath }); + } + }), + ); } + logger.info(`phase3 dispatching ${dispatched} backfill tasks`); + await Promise.all(tasks); + logger.info(`phase3 done: updated=${updated} failed=${failed}`); + return { updated, failed }; + } finally { + reporter?.stop(); } - logger.info(`phase3 done: updated=${updated} failed=${failed}`); - return { updated, failed }; } function computeNeeded(a: FileAnalysis): NeededFlags { diff --git a/packages/ingest-github/src/strategies/flat-folder/big-file/README.md b/packages/ingest-github/src/strategies/flat-folder/big-file/README.md index 17b00ef..264d8ea 100644 --- a/packages/ingest-github/src/strategies/flat-folder/big-file/README.md +++ b/packages/ingest-github/src/strategies/flat-folder/big-file/README.md @@ -11,27 +11,66 @@ depending on chunk count and prompt budget. - `detector.ts` — `classifyByTokens`, `buildBigFileEntry`, plus the on-disk `bigFiles.json` reader / writer / appender (dedupe-by-path on write). - `chunker.ts` — `splitFileIntoChunks` (line-aligned, ≤ `MaxTokensPerChunk`). -- `chunk-analyzer.ts` — `analyzeChunk(chunk)` calls `askJsonLLM` with the - chunk prompt; tolerates failures by returning an empty analysis. -- `condenser.ts` — `condenseChunks(relativePath, chunks)`: +- `chunk-analyzer.ts` — `analyzeChunk(chunk, llmCallContext?)` calls + `askJsonLLM` with the chunk prompt; tolerates failures by returning an + empty analysis. 
`llmCallContext` forwards per-job LLM credentials + threaded through from `StrategyContext`. +- `condenser.ts` — `condenseChunks(relativePath, chunks, llmCallContext?)`: ≤ `SmallFileDedupThreshold` → deterministic merge (no LLM); - above → recursive map-reduce. Per-condense LLM failure falls back to - deterministic dedup so recursion always terminates. + above → recursive map-reduce. `llmCallContext` is threaded through + `condenseRecursively` and `condenseOne` to every `askJsonLLM` call so + the same per-call credential bag the chunk analyser uses also reaches + the condense step — without it, callers that rely on per-call overrides + instead of `Config.OpenrouterApiKey` would hit `LlmConfigError` here. + Per-condense LLM failure falls back to deterministic dedup so recursion + always terminates. - `storage.ts` — on-disk cache (chunk JSON, manifest, condensed analysis) + `iterateCondensed(metaPaths)` async iterator used by Phase 5. - `cache.ts` — `inspect(metaPaths, relativePath)` returns `complete`, - `stale-condensed`, or `missing`. Used by Phase 2 to short-circuit and by - Phase 4 to find candidates for cheap re-condense. + `stale-condensed`, or `missing`. Used by Phase 2 to short-circuit + already-finished big files on resume. The chunk task queue then + re-uses cached chunks via `loadChunkIfPresent` and re-runs condense + to recover any `stale-condensed` files — this is the crash-recovery + pathway that replaced the deleted Phase 4 backfill. - `index.ts` — `processBigFile({knowledgeId, metaPaths, relativePath, content, -sizeBytes})`. Sequential per file (chunk-level concurrency inside). - Persists every intermediate artifact, so a restart resumes from the next - unfinished chunk. +sizeBytes, llmCallContext?, progressContext?})`. Sequential per file + (chunk-level concurrency inside). Persists every intermediate artifact, + so a restart resumes from the next unfinished chunk. 
`llmCallContext` + is forwarded to **both** sides of the big-file pipeline — every + `analyzeChunk` call inside the worker loop **and** the final + `condenseChunks(...)` call — so per-call LLM credentials reach + `@bb/llm` consistently across chunk analysis and condense. When + `progressContext` is present, the chunk pool runs under a fixed-total + reporter (`subPhase: "big_file:"`, `total = chunks.length`) + so long single-file analyses surface as live `PHASE_TICK` envelopes + carrying per-chunk progress instead of looking frozen. + +## Two callers + +These leaf helpers (`splitFileIntoChunks`, `analyzeChunk`, `condenseChunks`, +the storage / cache primitives) are consumed by **two** drivers: + +- `processBigFile` (`index.ts`) — legacy serial driver. One big file at a + time, chunks-within-file parallel under `Config.BigFileConcurrency`, + followed by a blocking condense. Used today only by the pull-path + (`pipeline/pull.ts`) via `processBigFilesQueue`. +- `analyseBigFiles` (`phases/process-big-files.ts`) — manifest-driven + chunk-task queue used by the main strategy entry. Every chunk of every + big file is an independent task scheduled through a strategy-wide + shared `ConcurrencyLimiter`. As soon as a file's last chunk lands, + that file's `condenseChunks` is scheduled through the same limiter + (with one in-place retry on transient failure) — multiple condenses + run in parallel with chunks of slower files. Reuses + `splitFileIntoChunks`, `analyzeChunk`, `condenseChunks`, and the + storage helpers without modification. ## Invariants -- One big file at a time. Concurrency lives at the chunk level inside - `processBigFile`, never across files, to bound peak memory. - Every artifact is durable on disk before the next step. The chunk cache - short-circuits on re-runs; the manifest plus condensed JSON are the - Phase 7 graph-store inputs. -- Cancellation is checked between chunks (`throwIfCancelled(knowledgeId)`). 
+ short-circuits on re-runs (per-chunk granularity, not per-file); the + manifest plus condensed JSON are the Phase 7 graph-store inputs. +- Cancellation is checked between chunks and before each condense + dispatch (`throwIfCancelled(knowledgeId)`). +- `bigFiles.json` is now a derived view written by `scanAndClassify`. + The main strategy reads it indirectly via the manifest; the legacy + pull-path driver continues to read it directly. diff --git a/packages/ingest-github/src/strategies/flat-folder/big-file/cache.ts b/packages/ingest-github/src/strategies/flat-folder/big-file/cache.ts index 86bd85a..5f62fe9 100644 --- a/packages/ingest-github/src/strategies/flat-folder/big-file/cache.ts +++ b/packages/ingest-github/src/strategies/flat-folder/big-file/cache.ts @@ -1,5 +1,5 @@ import { readManifestIfPresent, readCondensed } from "./storage.ts"; -import type { MetaPaths } from "src/types/meta-paths.ts"; +import type { MetaPaths } from "#src/types/meta-paths.ts"; export type BigFileCacheStatus = "complete" | "stale-condensed" | "missing"; diff --git a/packages/ingest-github/src/strategies/flat-folder/big-file/chunk-analyzer.ts b/packages/ingest-github/src/strategies/flat-folder/big-file/chunk-analyzer.ts index 79fe4f6..c645587 100644 --- a/packages/ingest-github/src/strategies/flat-folder/big-file/chunk-analyzer.ts +++ b/packages/ingest-github/src/strategies/flat-folder/big-file/chunk-analyzer.ts @@ -1,11 +1,12 @@ -import { askJsonLLM } from "@bb/llm"; +import { askJsonLLM, type AskLlmOptions } from "@bb/llm"; +import { LlmConfigError, LlmError } from "@bb/errors"; import { logger } from "@bb/logger"; -import type { ChunkAnalysisResult, FileChunk } from "src/types/big-file.ts"; -import { FALLBACK_LANGUAGE, emptyFileAnalysis } from "src/types/file-analysis.ts"; -import { shapeAnalysis } from "src/adapters/llm-file-analyzer.ts"; -import { CHUNK_ANALYSIS_SYSTEM_PROMPT, buildChunkUserPrompt } from "src/strategies/flat-folder/prompts/chunk.ts"; +import type { 
ChunkAnalysisResult, FileChunk } from "#src/types/big-file.ts"; +import { FALLBACK_LANGUAGE, emptyFileAnalysis } from "#src/types/file-analysis.ts"; +import { shapeAnalysis } from "#src/adapters/llm-file-analyzer.ts"; +import { CHUNK_ANALYSIS_SYSTEM_PROMPT, buildChunkUserPrompt } from "#src/strategies/flat-folder/prompts/chunk.ts"; -export async function analyzeChunk(chunk: FileChunk): Promise { +export async function analyzeChunk(chunk: FileChunk, llmCallContext?: AskLlmOptions): Promise { const systemPrompt = CHUNK_ANALYSIS_SYSTEM_PROMPT; const userPrompt = buildChunkUserPrompt({ relativePath: chunk.relativePath, @@ -16,7 +17,7 @@ export async function analyzeChunk(chunk: FileChunk): Promise>(systemPrompt, userPrompt); + const response = await askJsonLLM>(systemPrompt, userPrompt, llmCallContext ?? {}); if (response.result === null) { logger.warn( `analyzeChunk: ${chunk.relativePath} chunk ${chunk.chunkIndex + 1}/${chunk.totalChunks} returned unparseable JSON`, @@ -32,8 +33,16 @@ export async function analyzeChunk(chunk: FileChunk): Promise { @@ -29,6 +33,13 @@ export async function processBigFile(input: ProcessBigFileInput): Promise => { while (nextIndex < chunks.length) { const idx = nextIndex; @@ -41,26 +52,40 @@ export async function processBigFile(input: ProcessBigFileInput): Promise[] = []; - for (let i = 0; i < workerCount; i += 1) { - workers.push(worker()); + try { + const workerCount = Math.min(concurrency, chunks.length); + const workers: Promise[] = []; + for (let i = 0; i < workerCount; i += 1) { + workers.push(worker()); + } + await Promise.all(workers); + } finally { + reporter?.stop(); } - await Promise.all(workers); throwIfCancelled(input.knowledgeId); - const merged = await condenseChunks(input.relativePath, results); + const merged = await condenseChunks(input.relativePath, results, input.llmCallContext); const chunkPaths = chunks.map((_, i) => `chunks/${encodeFolder(input.relativePath)}/chunk-${i}.json`); const totalTokenCount = 
chunks.reduce((acc, c) => acc + c.tokenCount, 0); + + const chunkInputTokens = results.reduce((acc, r) => acc + (r.tokenUsage?.inputTokens ?? 0), 0); + const chunkOutputTokens = results.reduce((acc, r) => acc + (r.tokenUsage?.outputTokens ?? 0), 0); + const chunkCostUsd = results.reduce((acc, r) => acc + (r.tokenUsage?.costUsd ?? 0), 0); + const totalInputTokens = chunkInputTokens + (merged.tokenUsage?.inputTokens ?? 0); + const totalOutputTokens = chunkOutputTokens + (merged.tokenUsage?.outputTokens ?? 0); + const totalCostUsd = chunkCostUsd + (merged.tokenUsage?.costUsd ?? 0); + const manifest: HugeFileManifest = { relativePath: input.relativePath, totalChunks: chunks.length, @@ -81,6 +106,7 @@ export async function processBigFile(input: ProcessBigFileInput): Promise; + + private constructor(map: Map) { + this.map = map; + } + + static async loadAll(metaPaths: MetaPaths): Promise { + const startedAt = Date.now(); + let filenames: string[]; + try { + filenames = await readdir(metaPaths.fileAnalysisDir); + } catch (cause: unknown) { + logger.warn(`file-analysis-cache: readdir failed for ${metaPaths.fileAnalysisDir}: ${describe(cause)}`); + return new FileAnalysisCache(new Map()); + } + const jsonFiles = filenames.filter((n) => n.endsWith(".json")); + const map = new Map(); + const limit = withConcurrency(LOAD_CONCURRENCY); + const tasks: Promise[] = []; + for (const name of jsonFiles) { + tasks.push( + limit(async () => { + const full = path.join(metaPaths.fileAnalysisDir, name); + try { + const raw = await readFile(full, "utf8"); + const parsed: unknown = JSON.parse(raw); + if (typeof parsed !== "object" || parsed === null) { + return; + } + const entry = parsed as CondensedFileAnalysis; + if (typeof entry.relativePath !== "string" || entry.relativePath.length === 0) { + return; + } + map.set(entry.relativePath, entry); + } catch (cause: unknown) { + logger.warn(`file-analysis-cache: failed to read ${name}: ${describe(cause)}`); + } + }), + ); + } + await 
Promise.all(tasks); + const elapsedMs = Date.now() - startedAt; + logger.info(`file-analysis-cache: loaded ${map.size} entries in ${elapsedMs} ms`); + return new FileAnalysisCache(map); + } + + get(relativePath: string): CondensedFileAnalysis | undefined { + return this.map.get(relativePath); + } + + set(entry: CondensedFileAnalysis): void { + this.map.set(entry.relativePath, entry); + } + + values(): IterableIterator { + return this.map.values(); + } + + entries(): IterableIterator<[string, CondensedFileAnalysis]> { + return this.map.entries(); + } + + get size(): number { + return this.map.size; + } +} + +function describe(cause: unknown): string { + return cause instanceof Error ? cause.message : String(cause); +} diff --git a/packages/ingest-github/src/strategies/flat-folder/folder-summary-selective.ts b/packages/ingest-github/src/strategies/flat-folder/folder-summary-selective.ts index 8b0ee45..a2d8791 100644 --- a/packages/ingest-github/src/strategies/flat-folder/folder-summary-selective.ts +++ b/packages/ingest-github/src/strategies/flat-folder/folder-summary-selective.ts @@ -1,69 +1,60 @@ import { logger } from "@bb/logger"; -import { Config } from "@bb/types"; -import { getConfigValue } from "@bb/config"; -import type { MetaPaths } from "src/types/meta-paths.ts"; -import { withConcurrency } from "src/pipeline/concurrency.ts"; -import { throwIfCancelled, CancellationError } from "src/pipeline/cancellation.ts"; -import { - groupByDirectFolder, - persistFolderSummary, - summariseFolder, -} from "src/strategies/flat-folder/folder-summary.ts"; +import type { AskLlmOptions } from "@bb/llm"; +import type { CondensedFileAnalysis } from "#src/types/condensed-file-analysis.ts"; +import type { MetaPaths } from "#src/types/meta-paths.ts"; +import type { ConcurrencyLimiter } from "#src/pipeline/concurrency.ts"; +import type { FileAnalysisCache } from "#src/strategies/flat-folder/file-analysis-cache.ts"; +import { dispatchFolderSummaries, groupByDirectFolder } from 
"#src/strategies/flat-folder/folder-summary.ts"; export interface SelectiveFolderSummaryInput { knowledgeId: string; metaPaths: MetaPaths; + cache: FileAnalysisCache; + limiter: ConcurrencyLimiter; affectedFolders: Set; + llmCallContext?: AskLlmOptions; } export interface SelectiveFolderSummaryResult { succeeded: number; failed: number; skipped: number; + tokenUsage: { inputTokens: number; outputTokens: number; costUsd: number }; } /** * Pull-time folder summary. Same machinery as `runFolderSummaryPhase` but - * only regenerates folders the caller flagged as affected. Reads condensed - * file analyses from disk; the dispatcher must have populated them already. + * only regenerates folders the caller flagged as affected. Filters by + * `affectedFolders` BEFORE batching so skipped folders never enter a batch. */ export async function runSelectiveFolderSummary( input: SelectiveFolderSummaryInput, ): Promise { - const concurrentWorkers = getConfigValue(Config.ConcurrentWorkers); - const limit = withConcurrency(concurrentWorkers); - const groups = await groupByDirectFolder(input.metaPaths); - let succeeded = 0; - let failed = 0; + const allGroups = groupByDirectFolder(input.cache); + const affectedGroups = new Map(); let skipped = 0; - const tasks: Promise[] = []; - for (const [folderPath, files] of groups.entries()) { - if (!input.affectedFolders.has(folderPath)) { + for (const [folderPath, files] of allGroups.entries()) { + if (input.affectedFolders.has(folderPath)) { + affectedGroups.set(folderPath, files); + } else { skipped += 1; - continue; } - tasks.push( - limit(async () => { - try { - throwIfCancelled(input.knowledgeId); - const summary = await summariseFolder(folderPath, files); - if (summary !== null) { - await persistFolderSummary(input.metaPaths, summary); - succeeded += 1; - } else { - failed += 1; - } - } catch (cause: unknown) { - if (cause instanceof CancellationError) { - throw cause; - } - failed += 1; - logger.warn(`pull-folder-summary: failed for 
${folderPath || ""}`); - } - }), - ); } - await Promise.all(tasks); - logger.info(`pull-folder-summary done: succeeded=${succeeded} failed=${failed} skipped=${skipped}`); - return { succeeded, failed, skipped }; + + const totals = await dispatchFolderSummaries( + affectedGroups, + input.metaPaths, + input.limiter, + input.llmCallContext, + undefined, + input.knowledgeId, + "pull-folder-summary", + ); + logger.info(`pull-folder-summary done: succeeded=${totals.succeeded} failed=${totals.failed} skipped=${skipped}`); + return { + succeeded: totals.succeeded, + failed: totals.failed, + skipped, + tokenUsage: { inputTokens: totals.inputTokens, outputTokens: totals.outputTokens, costUsd: totals.costUsd }, + }; } diff --git a/packages/ingest-github/src/strategies/flat-folder/folder-summary.ts b/packages/ingest-github/src/strategies/flat-folder/folder-summary.ts index 8f7d15c..cdd9c5d 100644 --- a/packages/ingest-github/src/strategies/flat-folder/folder-summary.ts +++ b/packages/ingest-github/src/strategies/flat-folder/folder-summary.ts @@ -1,22 +1,30 @@ import { readFile, readdir, writeFile } from "node:fs/promises"; import path from "node:path"; -import { askJsonLLM } from "@bb/llm"; +import { askJsonLLM, type AskLlmOptions } from "@bb/llm"; +import { LlmConfigError, LlmError } from "@bb/errors"; import { logger } from "@bb/logger"; import { Config } from "@bb/types"; import { getConfigValue } from "@bb/config"; -import type { CondensedFileAnalysis } from "src/types/condensed-file-analysis.ts"; -import type { MetaPaths } from "src/types/meta-paths.ts"; -import { encodeMetaPath } from "src/pipeline/paths.ts"; -import { withConcurrency } from "src/pipeline/concurrency.ts"; -import { throwIfCancelled, CancellationError } from "src/pipeline/cancellation.ts"; -import { iterateCondensed } from "./big-file/storage.ts"; +import type { CondensedFileAnalysis } from "#src/types/condensed-file-analysis.ts"; +import type { MetaPaths } from "#src/types/meta-paths.ts"; +import { 
encodeMetaPath } from "#src/pipeline/paths.ts"; +import type { ConcurrencyLimiter } from "#src/pipeline/concurrency.ts"; +import { throwIfCancelled, CancellationError } from "#src/pipeline/cancellation.ts"; +import type { ProgressContext } from "#src/progress/types.ts"; +import type { FileAnalysisCache } from "./file-analysis-cache.ts"; import { directFolderOf } from "./folder-path.ts"; -import { FOLDER_ANALYSIS_SYSTEM_PROMPT, folderAnalysisUserPrompt } from "./prompts/folder-summary.ts"; +import { + FOLDER_ANALYSIS_SYSTEM_PROMPT, + FOLDER_BATCH_SYSTEM_PROMPT, + folderAnalysisUserPrompt, + folderBatchUserPrompt, + type BatchedFolderInput, +} from "./prompts/folder-summary.ts"; import type { FolderSummary } from "./types.ts"; -export async function groupByDirectFolder(metaPaths: MetaPaths): Promise> { +export function groupByDirectFolder(cache: FileAnalysisCache): Map { const groups = new Map(); - for await (const entry of iterateCondensed(metaPaths)) { + for (const entry of cache.values()) { const folder = directFolderOf(entry.relativePath); const bucket = groups.get(folder) ?? []; bucket.push(entry); @@ -36,22 +44,159 @@ interface FolderSummaryJson { dependencyGraph?: unknown; } +export interface FolderBucket { + folderPath: string; + files: CondensedFileAnalysis[]; +} + +/** + * Splits the folder groups into "individual" (one LLM call per folder, used + * for big folders or when batching is disabled) and "batches" (N small + * folders summarised in one LLM call). Driven by `Config.FolderSummaryBatchSize` + * (set to 1 to disable batching entirely) and `Config.FolderSummaryBatchMaxFiles` + * (folders exceeding this file count always take the individual path). + * + * Folders are sorted by path so that two runs of the same repo produce the + * same batch composition — helpful when A/B-comparing outputs. 
+ */ +export function groupFoldersForBatching(groups: Map): { + individual: FolderBucket[]; + batches: FolderBucket[][]; +} { + const batchSize = getConfigValue(Config.FolderSummaryBatchSize); + const maxFiles = getConfigValue(Config.FolderSummaryBatchMaxFiles); + const sorted: FolderBucket[] = [...groups.entries()] + .map(([folderPath, files]) => ({ folderPath, files })) + .sort((a, b) => a.folderPath.localeCompare(b.folderPath)); + + if (batchSize <= 1) { + return { individual: sorted, batches: [] }; + } + + const individual: FolderBucket[] = []; + const batchable: FolderBucket[] = []; + for (const bucket of sorted) { + if (bucket.files.length > maxFiles) { + individual.push(bucket); + } else { + batchable.push(bucket); + } + } + + const batches: FolderBucket[][] = []; + for (let i = 0; i < batchable.length; i += batchSize) { + batches.push(batchable.slice(i, i + batchSize)); + } + return { individual, batches }; +} + export async function summariseFolder( folderPath: string, files: CondensedFileAnalysis[], -): Promise { + llmCallContext?: AskLlmOptions, +): Promise<{ + summary: FolderSummary | null; + tokenUsage: { inputTokens: number; outputTokens: number; costUsd: number }; +}> { const userPrompt = folderAnalysisUserPrompt(folderPath, files); try { - const response = await askJsonLLM(FOLDER_ANALYSIS_SYSTEM_PROMPT, userPrompt); + const response = await askJsonLLM( + FOLDER_ANALYSIS_SYSTEM_PROMPT, + userPrompt, + llmCallContext ?? 
{}, + ); if (response.result === null) { logger.warn(`summariseFolder: ${folderPath || ""} returned unparseable JSON`); - return null; + return { + summary: null, + tokenUsage: { + inputTokens: response.usage.inputTokens, + outputTokens: response.usage.outputTokens, + costUsd: response.usage.costUsd, + }, + }; } - return shapeFolderSummary(folderPath, response.result); + return { + summary: shapeFolderSummary(folderPath, response.result), + tokenUsage: { + inputTokens: response.usage.inputTokens, + outputTokens: response.usage.outputTokens, + costUsd: response.usage.costUsd, + }, + }; } catch (cause: unknown) { + if (cause instanceof LlmConfigError || cause instanceof LlmError) { + throw cause; + } const msg = cause instanceof Error ? cause.message : String(cause); logger.warn(`summariseFolder: ${folderPath || ""} askJsonLLM failed: ${msg}`); - return null; + return { summary: null, tokenUsage: { inputTokens: 0, outputTokens: 0, costUsd: 0 } }; + } +} + +/** + * Multi-folder summary. Builds a label-indexed prompt, parses the keyed JSON + * response, returns one `FolderSummary | null` per folder. Folders missing + * from the response (or whose entry fails shape validation) are surfaced as + * `null` with a warn log; the caller counts those as failed. + */ +export async function summariseFolderBatch( + batch: FolderBucket[], + llmCallContext?: AskLlmOptions, +): Promise<{ + summaries: Map; + tokenUsage: { inputTokens: number; outputTokens: number; costUsd: number }; +}> { + const labeled: BatchedFolderInput[] = batch.map((b, i) => ({ label: i, folderPath: b.folderPath, files: b.files })); + const userPrompt = folderBatchUserPrompt(labeled); + const summaries = new Map(); + try { + const response = await askJsonLLM>( + FOLDER_BATCH_SYSTEM_PROMPT, + userPrompt, + llmCallContext ?? 
{}, + ); + if (response.result === null) { + logger.warn(`summariseFolderBatch: batch of ${batch.length} returned unparseable JSON`); + for (const b of batch) { + summaries.set(b.folderPath, null); + } + return { + summaries, + tokenUsage: { + inputTokens: response.usage.inputTokens, + outputTokens: response.usage.outputTokens, + costUsd: response.usage.costUsd, + }, + }; + } + for (const b of labeled) { + const raw = response.result[String(b.label)]; + if (raw === undefined || typeof raw !== "object" || raw === null) { + logger.warn(`summariseFolderBatch: missing/invalid entry for label ${b.label} (${b.folderPath || ""})`); + summaries.set(b.folderPath, null); + continue; + } + summaries.set(b.folderPath, shapeFolderSummary(b.folderPath, raw)); + } + return { + summaries, + tokenUsage: { + inputTokens: response.usage.inputTokens, + outputTokens: response.usage.outputTokens, + costUsd: response.usage.costUsd, + }, + }; + } catch (cause: unknown) { + if (cause instanceof LlmConfigError || cause instanceof LlmError) { + throw cause; + } + const msg = cause instanceof Error ? cause.message : String(cause); + logger.warn(`summariseFolderBatch: batch of ${batch.length} askJsonLLM failed: ${msg}`); + for (const b of batch) { + summaries.set(b.folderPath, null); + } + return { summaries, tokenUsage: { inputTokens: 0, outputTokens: 0, costUsd: 0 } }; } } @@ -83,41 +228,163 @@ export async function* iterateFolderSummaries(metaPaths: MetaPaths): AsyncGenera } } -export async function runFolderSummaryPhase( +interface FolderSummaryTotals { + succeeded: number; + failed: number; + inputTokens: number; + outputTokens: number; + costUsd: number; +} + +/** + * Dispatches a single folder through `summariseFolder` and persists the + * result. Shared between `runFolderSummaryPhase` and `runSelectiveFolderSummary`. 
+ */ +async function dispatchIndividual( + bucket: FolderBucket, + metaPaths: MetaPaths, + totals: FolderSummaryTotals, + llmCallContext: AskLlmOptions | undefined, + reporter: ReturnType> | undefined, knowledgeId: string, + phaseLabel: string, +): Promise { + try { + throwIfCancelled(knowledgeId); + const { summary, tokenUsage } = await summariseFolder(bucket.folderPath, bucket.files, llmCallContext); + totals.inputTokens += tokenUsage.inputTokens; + totals.outputTokens += tokenUsage.outputTokens; + totals.costUsd += tokenUsage.costUsd; + if (summary !== null) { + await persistFolderSummary(metaPaths, summary); + totals.succeeded += 1; + } else { + totals.failed += 1; + } + } catch (cause: unknown) { + if (cause instanceof CancellationError) { + throw cause; + } + totals.failed += 1; + logger.warn(`${phaseLabel}: folder summary failed for ${bucket.folderPath || ""}`); + } finally { + reporter?.increment(1, { fileName: bucket.folderPath || "" }); + } +} + +/** + * Dispatches a multi-folder batch through `summariseFolderBatch`. Each + * non-null per-folder summary is persisted; missing/null entries count + * toward `failed`. Progress increments once per folder. 
+ */ +async function dispatchBatch( + batch: FolderBucket[], metaPaths: MetaPaths, -): Promise<{ succeeded: number; failed: number }> { - const concurrentWorkers = getConfigValue(Config.ConcurrentWorkers); - const limit = withConcurrency(concurrentWorkers); - const groups = await groupByDirectFolder(metaPaths); - let succeeded = 0; - let failed = 0; - const tasks: Promise[] = []; - for (const [folderPath, files] of groups.entries()) { - tasks.push( - limit(async () => { + totals: FolderSummaryTotals, + llmCallContext: AskLlmOptions | undefined, + reporter: ReturnType> | undefined, + knowledgeId: string, + phaseLabel: string, +): Promise { + try { + throwIfCancelled(knowledgeId); + const { summaries, tokenUsage } = await summariseFolderBatch(batch, llmCallContext); + totals.inputTokens += tokenUsage.inputTokens; + totals.outputTokens += tokenUsage.outputTokens; + totals.costUsd += tokenUsage.costUsd; + for (const bucket of batch) { + const summary = summaries.get(bucket.folderPath) ?? null; + if (summary !== null) { try { - throwIfCancelled(knowledgeId); - const summary = await summariseFolder(folderPath, files); - if (summary !== null) { - await persistFolderSummary(metaPaths, summary); - succeeded += 1; - } else { - failed += 1; - } + await persistFolderSummary(metaPaths, summary); + totals.succeeded += 1; } catch (cause: unknown) { - if (cause instanceof CancellationError) { - throw cause; - } - failed += 1; - logger.warn(`phase5: folder summary failed for ${folderPath || ""}`); + totals.failed += 1; + logger.warn( + `${phaseLabel}: persist failed for ${bucket.folderPath || ""}: ${cause instanceof Error ? 
cause.message : String(cause)}`, + ); } - }), + } else { + totals.failed += 1; + } + reporter?.increment(1, { fileName: bucket.folderPath || "" }); + } + } catch (cause: unknown) { + if (cause instanceof CancellationError) { + throw cause; + } + totals.failed += batch.length; + for (const bucket of batch) { + reporter?.increment(1, { fileName: bucket.folderPath || "" }); + } + logger.warn( + `${phaseLabel}: batch summary failed for ${batch.length} folders: ${cause instanceof Error ? cause.message : String(cause)}`, + ); + } +} + +/** + * Dispatch helper used by both `runFolderSummaryPhase` and + * `runSelectiveFolderSummary`. Splits `groups` into individual + batched + * buckets, schedules every task through the shared `limiter`, awaits all, + * and returns the aggregated totals. + */ +export async function dispatchFolderSummaries( + groups: Map, + metaPaths: MetaPaths, + limiter: ConcurrencyLimiter, + llmCallContext: AskLlmOptions | undefined, + reporter: ReturnType> | undefined, + knowledgeId: string, + phaseLabel: string, +): Promise { + const totals: FolderSummaryTotals = { succeeded: 0, failed: 0, inputTokens: 0, outputTokens: 0, costUsd: 0 }; + const { individual, batches } = groupFoldersForBatching(groups); + const tasks: Promise[] = []; + for (const bucket of individual) { + tasks.push( + limiter(() => dispatchIndividual(bucket, metaPaths, totals, llmCallContext, reporter, knowledgeId, phaseLabel)), + ); + } + for (const batch of batches) { + tasks.push( + limiter(() => dispatchBatch(batch, metaPaths, totals, llmCallContext, reporter, knowledgeId, phaseLabel)), ); } await Promise.all(tasks); - logger.info(`phase5 done: foldersSummarised=${succeeded} failed=${failed}`); - return { succeeded, failed }; + return totals; +} + +export async function runFolderSummaryPhase( + knowledgeId: string, + metaPaths: MetaPaths, + cache: FileAnalysisCache, + limiter: ConcurrencyLimiter, + llmCallContext?: AskLlmOptions, + progressContext?: ProgressContext, +): Promise<{ + 
succeeded: number; + failed: number; + tokenUsage: { inputTokens: number; outputTokens: number; costUsd: number }; +}> { + const groups = groupByDirectFolder(cache); + const reporter = progressContext?.reporter({ + phase: "folder_analysis", + total: { kind: "fixed", total: groups.size }, + }); + await reporter?.start(); + let totals: FolderSummaryTotals; + try { + totals = await dispatchFolderSummaries(groups, metaPaths, limiter, llmCallContext, reporter, knowledgeId, "phase5"); + } finally { + reporter?.stop(); + } + logger.info(`phase5 done: foldersSummarised=${totals.succeeded} failed=${totals.failed}`); + return { + succeeded: totals.succeeded, + failed: totals.failed, + tokenUsage: { inputTokens: totals.inputTokens, outputTokens: totals.outputTokens, costUsd: totals.costUsd }, + }; } function shapeFolderSummary(folderPath: string, raw: FolderSummaryJson): FolderSummary { diff --git a/packages/ingest-github/src/strategies/flat-folder/index.ts b/packages/ingest-github/src/strategies/flat-folder/index.ts index 76f16a3..86797a6 100644 --- a/packages/ingest-github/src/strategies/flat-folder/index.ts +++ b/packages/ingest-github/src/strategies/flat-folder/index.ts @@ -1,79 +1,159 @@ +import { Config } from "@bb/types"; +import { getConfigValue } from "@bb/config"; import { logger } from "@bb/logger"; -import type { FileAnalyzer } from "src/types/pipeline.ts"; -import type { IngestStrategy, StrategyInput, StrategyResult } from "src/types/strategy.ts"; -import { throwIfCancelled } from "src/pipeline/cancellation.ts"; -import { classifyAndAnalyseSmall } from "./phases/classify-and-analyse-small.ts"; -import { processBigFilesQueue } from "./phases/process-big-files.ts"; +import type { FileAnalyzer } from "#src/types/pipeline.ts"; +import type { IngestStrategy, StrategyInput, StrategyResult } from "#src/types/strategy.ts"; +import { throwIfCancelled } from "#src/pipeline/cancellation.ts"; +import { classifyFailure } from "#src/pipeline/failure-classifier.ts"; +import { 
withConcurrency } from "#src/pipeline/concurrency.ts"; +import { scanAndClassify } from "./phases/scan-and-classify.ts"; +import { analyseSmallFiles } from "./phases/analyse-small.ts"; +import { analyseBigFiles } from "./phases/analyse-big-files.ts"; import { backfillMissingFields } from "./backfill/fields.ts"; -import { backfillBigFiles } from "./backfill/big-files.ts"; +import { FileAnalysisCache } from "./file-analysis-cache.ts"; import { runFolderSummaryPhase } from "./folder-summary.ts"; import { makeRepoSummaryEnvelope, persistRepoSummary, summariseRepo } from "./repo-summary.ts"; import { storeFlatAnalysis } from "./phases/store-flat-analysis.ts"; +import type { ProgressContext, ProgressContextFactory } from "#src/progress/types.ts"; +import { nullProgressContextFactory } from "#src/progress/NullProgressReporter.ts"; export interface FlatFolderStrategyDeps { fileAnalyzer: FileAnalyzer; + progressContextFactory?: ProgressContextFactory; } export function createFlatFolderStrategy(deps: FlatFolderStrategyDeps): IngestStrategy { + const progressContextFactory = deps.progressContextFactory ?? 
nullProgressContextFactory; return { name: "flat-folder", async execute(input: StrategyInput): Promise { const { context, source, archiveSink, metaPaths, payload, branch } = input; - const { knowledgeId, orgId, repoId } = context; + const { knowledgeId, orgId, repoId, llmCallContext } = context; + const progressContext: ProgressContext = progressContextFactory(knowledgeId); - logger.info(`flat-folder: phase1 (classify + analyse small) starting for ${knowledgeId}`); - throwIfCancelled(knowledgeId); - const phase1Input: Parameters[0] = { - knowledgeId, - source, - metaPaths, - analyzer: deps.fileAnalyzer, - }; - if (archiveSink !== undefined) { - phase1Input.archiveSink = archiveSink; - } - const phase1 = await classifyAndAnalyseSmall(phase1Input); + try { + // Shared LLM limiter — small-file analyses, big-file chunk analyses, + // and per-file condense calls all check out from this single pool. + const llmConcurrency = getConfigValue(Config.LlmConcurrency); + const limiter = withConcurrency(llmConcurrency); - logger.info(`flat-folder: phase2 (process big files) starting`); - throwIfCancelled(knowledgeId); - const phase2 = await processBigFilesQueue({ knowledgeId, source, metaPaths }); + progressContext.phaseChanged("scan"); + logger.info(`flat-folder: phase1 (scan + classify) starting for ${knowledgeId} limit=${llmConcurrency}`); + throwIfCancelled(knowledgeId); + const scanInput: Parameters[0] = { + knowledgeId, + source, + metaPaths, + limiter, + progressContext, + }; + if (llmCallContext !== undefined) { + scanInput.llmCallContext = llmCallContext; + } + const { manifest } = await scanAndClassify(scanInput); - logger.info(`flat-folder: phase3 (backfill missing fields) starting`); - throwIfCancelled(knowledgeId); - await backfillMissingFields(metaPaths); + progressContext.phaseChanged("file_analysis"); + logger.info( + `flat-folder: phase2 (analyse small ${manifest.summary.smallCount} + big ${manifest.summary.bigCount}) starting in parallel`, + ); + 
throwIfCancelled(knowledgeId); + const smallInput: Parameters[0] = { + knowledgeId, + manifest, + source, + metaPaths, + analyzer: deps.fileAnalyzer, + limiter, + progressContext, + }; + if (archiveSink !== undefined) { + smallInput.archiveSink = archiveSink; + } + if (llmCallContext !== undefined) { + smallInput.llmCallContext = llmCallContext; + } + const bigInput: Parameters[0] = { + knowledgeId, + manifest, + source, + metaPaths, + limiter, + progressContext, + }; + if (llmCallContext !== undefined) { + bigInput.llmCallContext = llmCallContext; + } + const [smallResult, bigResult] = await Promise.all([analyseSmallFiles(smallInput), analyseBigFiles(bigInput)]); + let totalInputTokens = smallResult.tokenUsage.inputTokens + bigResult.tokenUsage.inputTokens; + let totalOutputTokens = smallResult.tokenUsage.outputTokens + bigResult.tokenUsage.outputTokens; + let totalCostUsd = smallResult.tokenUsage.costUsd + bigResult.tokenUsage.costUsd; - logger.info(`flat-folder: phase4 (backfill big files) starting`); - throwIfCancelled(knowledgeId); - await backfillBigFiles({ knowledgeId, source, metaPaths }); + logger.info(`flat-folder: loading file-analysis cache`); + throwIfCancelled(knowledgeId); + const fileAnalysisCache = await FileAnalysisCache.loadAll(metaPaths); - logger.info(`flat-folder: phase5 (folder summaries) starting`); - throwIfCancelled(knowledgeId); - const phase5 = await runFolderSummaryPhase(knowledgeId, metaPaths); + logger.info(`flat-folder: phase3 (backfill missing fields) starting`); + throwIfCancelled(knowledgeId); + await backfillMissingFields(metaPaths, fileAnalysisCache, limiter, llmCallContext, progressContext); - logger.info(`flat-folder: phase6 (repo summary) starting`); - throwIfCancelled(knowledgeId); - const repoSummary = await summariseRepo(knowledgeId, metaPaths); - let repoSummarised = false; - if (repoSummary !== null) { - await persistRepoSummary(metaPaths, makeRepoSummaryEnvelope(knowledgeId, orgId, repoSummary)); - repoSummarised = 
true; - } + progressContext.phaseChanged("folder_analysis"); + logger.info(`flat-folder: phase5 (folder summaries) starting`); + throwIfCancelled(knowledgeId); + const phase5 = await runFolderSummaryPhase( + knowledgeId, + metaPaths, + fileAnalysisCache, + limiter, + llmCallContext, + progressContext, + ); + totalInputTokens += phase5.tokenUsage.inputTokens; + totalOutputTokens += phase5.tokenUsage.outputTokens; + totalCostUsd += phase5.tokenUsage.costUsd; + + progressContext.phaseChanged("indexing"); + logger.info(`flat-folder: phase6 (repo summary) starting`); + throwIfCancelled(knowledgeId); + const { summary: repoSummary, tokenUsage: repoUsage } = await summariseRepo( + knowledgeId, + metaPaths, + llmCallContext, + ); + totalInputTokens += repoUsage.inputTokens; + totalOutputTokens += repoUsage.outputTokens; + totalCostUsd += repoUsage.costUsd; + let repoSummarised = false; + if (repoSummary !== null) { + await persistRepoSummary(metaPaths, makeRepoSummaryEnvelope(knowledgeId, orgId, repoSummary)); + repoSummarised = true; + } - logger.info(`flat-folder: phase7 (graph store) starting`); - throwIfCancelled(knowledgeId); - const phase7 = await storeFlatAnalysis({ - scope: { orgId, knowledgeId, repoId }, - payload, - branch, - metaPaths, - }); + logger.info(`flat-folder: phase7 (graph store) starting`); + throwIfCancelled(knowledgeId); + const phase7 = await storeFlatAnalysis({ + scope: { orgId, knowledgeId, repoId }, + payload, + branch, + metaPaths, + cache: fileAnalysisCache, + progressContext, + }); - return { - filesAnalyzed: phase1.smallFilesAnalysed + phase2.processed + phase2.cached + phase1.oversizedStubs, - foldersSummarised: phase5.succeeded, - repoSummarised, - graphNodesWritten: phase7.nodesWritten, - }; + progressContext.completed(); + + return { + filesAnalyzed: + smallResult.smallFilesAnalysed + smallResult.oversizedStubs + bigResult.processed + bigResult.cached, + foldersSummarised: phase5.succeeded, + repoSummarised, + graphNodesWritten: 
phase7.nodesWritten, + tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens, costUsd: totalCostUsd }, + }; + } catch (cause: unknown) { + const { category, reason, detail } = classifyFailure(cause); + progressContext.failed(reason, undefined, category, detail); + throw cause; + } }, }; } diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/README.md b/packages/ingest-github/src/strategies/flat-folder/phases/README.md index 4c7889d..64cfc96 100644 --- a/packages/ingest-github/src/strategies/flat-folder/phases/README.md +++ b/packages/ingest-github/src/strategies/flat-folder/phases/README.md @@ -6,72 +6,151 @@ Backfill (Phases 3 and 4) lives in the sibling `backfill/` folder; folder and repo summarisation (Phases 5 and 6) live as `folder-summary.ts` and `repo-summary.ts` at the strategy root. +The strategy constructs a **shared LLM limiter** (`withConcurrency(Config.LlmConcurrency)`, +default 29) once at entry. Every LLM call across the small-file phase, +the big-file chunk phase, and per-file condense calls checks out from +the same pool — the single tunable for total in-flight LLM calls. + ## Files -- `classify-and-analyse-small.ts` — Phase 1. - `classifyAndAnalyseSmall({knowledgeId, repoDir, metaPaths, analyzer})` - walks `scanRepository(repoDir)` and per entry: - - `kind === "oversized"` → write a stub via `buildOversizedStub` + - `saveCondensed`, and append a `too-large` row to `bigFiles.json`. - - token count > `Config.ContextWindowLimit` → buffer a - `context-window-exceeded` row for Phase 2. - - otherwise → run `analyseScannedFile(analyzer, entry)` and persist via - `saveCondensed`, under a `withConcurrency(Config.ConcurrentWorkers)` - limiter so analyses run in parallel. - Cancellation is checked at scan boundaries and inside each task; the - buffered big-file list is flushed via `writeBigFiles` after all tasks - drain. -- `process-big-files.ts` — Phase 2. 
- `processBigFilesQueue({knowledgeId, repoDir, metaPaths})` reads - `bigFiles.json`, skips `too-large` entries (counted as - `skippedOversized`), short-circuits when `inspect` returns `complete` - (counted as `cached`), reads the file from disk, and dispatches - `processBigFile` sequentially per file. Cancellation re-throws past the - phase; other errors are logged per file and counted as `failed`. +- `scan-and-classify.ts` — Phase 1. `scanAndClassify({knowledgeId, source, +metaPaths, skipDecider?, llmCallContext?, progressContext?})` walks + `source.scan({ skipDecider, llmCallContext })` exactly once, counts + tokens for every eligible entry, classifies each as `"small"`, + `"big"` (token count > `Config.ContextWindowLimit`), or `"oversized"` + (yielded as `kind === "oversized"` by `scanRepository`), and writes + `meta-output/scan-manifest.json` plus the legacy `bigFiles.json` (for + pull-path and backfill consumers that have not migrated). Big entries + get a cheap `estimatedChunks = ceil(tokenCount / Config.MaxTokensPerChunk)` + used by Phase 2's progress reporter. No LLM calls. No file analysis. +- `analyse-small.ts` — Phase 2a. `analyseSmallFiles({knowledgeId, manifest, +source, metaPaths, analyzer, limiter, archiveSink?, llmCallContext?, +progressContext?})` filters the manifest to `kind === "small"` entries, + re-reads each file via `source.readFile`, runs the LLM file analyser, + and persists via `saveCondensed`. Oversized entries also flow through + here as stub writes (no LLM). Every LLM dispatch goes through the + shared `limiter`. Progress is a fixed total — `smallCount + oversizedCount`. +- `process-big-files.ts` — Phase 2b plus the legacy queue. Exports two + functions: + - `analyseBigFiles({knowledgeId, manifest, source, metaPaths, limiter, +llmCallContext?, progressContext?})` — manifest-driven chunk-task + queue. Skips files already complete (manifest + condensed on disk). 
+ For each remaining big file: read content, split into chunks + via `splitFileIntoChunks`, register a per-file `pendingChunks` + counter. Every chunk becomes an independent task scheduled through + the shared limiter: cache-check via `loadChunkIfPresent`, otherwise + `analyzeChunk` + `saveChunk`. When a file's last chunk lands, that + file's condense is **immediately** scheduled through the same + limiter — condenses across multiple files run in parallel with + chunks of slower files. Two fixed-total progress sub-phases: + `"big_files_chunks"` (sum of `estimatedChunks`) and + `"big_files_condense"` (`bigCount`). + - `processBigFilesQueue({knowledgeId, source, metaPaths, llmCallContext?, +progressContext?})` — legacy serial driver kept for the pull-path + (`pipeline/pull.ts`) and any caller that has not migrated to + `analyseBigFiles(manifest, …)`. Reads `bigFiles.json`, dispatches + `processBigFile` once per file in a `for` loop. - `store-flat-analysis.ts` — Phase 7. - `storeFlatAnalysis({scope, payload, branch, metaPaths})` ensures + `storeFlatAnalysis({scope, payload, branch, metaPaths, cache})` ensures `flat-folder` Neo4j indexes, upserts `:Repo` (from `repo-summary.json` - if present, empty payload otherwise), then iterates folder summaries - via `iterateFolderSummaries` to upsert `:Folder`, then iterates - condensed entries via `iterateCondensed` to upsert `:File`. Files whose - containing folder was not in the summaries set get a synthesised empty - `:Folder` so the `CONTAINS` edge always lands. `languageFromPath` - fills `language` when the analysis left it blank. + if present, empty payload otherwise), then **dispatches `:Folder` and + `:File` upserts in batches of `Config.Neo4jBatchSize` (default 50)** + via `upsertFolderNodesBatch` / `upsertFileNodesBatch` from `@bb/neo4j`. 
+ Each batch is one Neo4j write transaction containing the same 12 + Cyphers (1 MERGE + 1 folder-attach + 5 rel CLEARs + 5 rel ATTACHes via + UNWIND) that a single upsert used to issue — so a 1 000-file repo + collapses from ~12 000 round-trips to ~240. Files whose containing + folder was not in the summaries set get a synthesised empty `:Folder` + entry added to the folder batch list **up front** (before any batch + dispatches) so the `CONTAINS` edge always lands. + `languageFromPath` fills `language` when the analysis left it blank. + Both progress reporters (`folders`, `files`) open at phase entry with + their fixed totals so the indexing overall-progress aggregate sees + both denominators from the first tick — fixes the prior "leaps to 100 + then sits there" UX bug. + +## Execution order + +``` +scanAndClassify + ↓ (manifest in-memory + on disk) +┌── analyseSmallFiles ──┐ +│ │ (Promise.all, share one limiter) +└── analyseBigFiles ────┘ + ↓ +FileAnalysisCache.loadAll (one parallel readdir+readFile pass) + ↓ +backfillMissingFields → folderSummary → repoSummary → storeFlatAnalysis + (cache read+write) (cache read) (cache read) +``` + +`FileAnalysisCache` is a `Map` loaded +once between phase 2 and phase 3. Phases 3, 5, 7 all consume the same +instance — phase 3 also calls `cache.set(...)` after each backfill write +so phases 5 and 7 see the updated entries without re-reading disk. ## Public interfaces -- `classifyAndAnalyseSmall(input): Promise` — - `{ smallFilesAnalysed, bigFilesQueued, oversizedStubs, failed }`. -- `processBigFilesQueue(input): Promise` — - `{ processed, cached, failed, skippedOversized }`. +- `scanAndClassify(input): Promise` — + `{ manifest }`. The manifest contains every eligible file plus a + `summary` with `totalFiles`, `smallCount`, `bigCount`, `oversizedCount`, + `totalTokens`, `estimatedBigChunks`. +- `analyseSmallFiles(input): Promise` — + `{ smallFilesAnalysed, oversizedStubs, failed, tokenUsage }`. 
+ Progress: fixed-total reporter sized by `smallCount + oversizedCount`. +- `analyseBigFiles(input): Promise` — + `{ processed, cached, failed, skippedOversized, tokenUsage }`. + Progress: two fixed-total reporters — one for chunks across all + big files, one for per-file condenses. +- `processBigFilesQueue(input): Promise` — same + result shape; legacy driver used by the pull path. - `storeFlatAnalysis(input): Promise` — `{ nodesWritten, foldersWritten, filesWritten }`. -Each phase returns its own counter shape; the strategy aggregates them -into `FlatFolderResult`. - ## Data ownership -- Phase 1 writes condensed JSON (small files + oversized stubs) and - `bigFiles.json`. -- Phase 2 writes chunk artifacts, the chunk manifest, and condensed JSON - for big files via `processBigFile`. -- Phase 7 owns no disk artifacts. It reads the on-disk state produced by +- Phase 1 writes `scan-manifest.json` (canonical) and `bigFiles.json` + (legacy view for backfill + pull). It does not write per-file + analyses. +- Phase 2a writes condensed JSON for small files + oversized stubs. +- Phase 2b writes per-chunk JSON (`chunks/<encoded-path>/chunk-N.json`), + per-file chunk manifests (`.manifest.json`), and condensed JSON + for big files. +- `FileAnalysisCache` is an in-memory artifact owned by the strategy + run (not persisted). It loads from `fileAnalysisDir` once and is + passed by reference to phases 3, 5, and 7. +- Phase 7 owns no disk artifacts. It reads on-disk state produced by Phases 1–6 and writes Neo4j nodes (`:Repo`, `:Folder`, `:File`) plus the `CONTAINS` edge. ## Invariants - Disk is the inter-phase contract; nothing crosses a phase boundary in - memory. + memory (except the manifest object that scan returns directly to the + orchestrator and the `FileAnalysisCache` shared by phases 3, 5, and 7; + both are conveniences — the copy on disk is what later resume/backfill runs read). - `throwIfCancelled(knowledgeId)` runs at every scan boundary, every - big-file boundary, and before each Neo4j upsert in Phase 7. 
-- Per-file LLM or I/O failures are logged and counted; phases do not - abort on a single bad file. Only `CancellationError` propagates. + per-chunk and per-file dispatch boundary, and before each Neo4j + upsert in Phase 7. +- Per-file or per-chunk LLM/I/O failures are logged and counted; phases + do not abort on a single bad file. Only `CancellationError`, + `LlmConfigError`, and `LlmError` propagate. +- The shared LLM limiter is the only place LLM concurrency is bounded + during the small/big phases **and the folder-summary phase**. + `Config.BigFileConcurrency` is no longer consulted from the chunk-queue + path (it is still consulted by the legacy `processBigFile` used by the + pull-path driver). `Config.ConcurrentWorkers` is no longer consulted + by the folder-summary phase. +- Phase 5 batches small folders by default. `Config.FolderSummaryBatchSize` + (default 10) controls batch size; set to 1 to disable and restore one + LLM call per folder. `Config.FolderSummaryBatchMaxFiles` (default 15) + is the per-folder file ceiling above which a folder always takes the + individual path so the LLM still sees the full per-file context. Large + folders run side-by-side with batches under the same shared limiter. +- Phase 1 respects `Config.ContextWindowLimit` and + `Config.MaxTokensPerChunk`; do not hardcode either. - Phase 7 always emits a `:Repo` node, even when `repo-summary.json` is absent (logged as a `phase7` warning). -- Phase 1 respects `Config.ContextWindowLimit` and - `Config.ConcurrentWorkers`; do not hardcode either. ## External dependencies @@ -80,8 +159,8 @@ into `FlatFolderResult`. `upsertRepoNode`, `upsertFolderNode`, `upsertFileNode`, `NodeScope`), `pipeline/scan.ts`, `pipeline/concurrency.ts`, `pipeline/cancellation.ts`, and the sibling `flat-folder/{analyse-file, big-file, folder-summary, -folder-path}` modules plus `adapters/llm-file-analyzer.ts` -(`languageFromPath`). 
+folder-path, scan-manifest}` modules plus +`adapters/llm-file-analyzer.ts` (`languageFromPath`). ## Tier diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/analyse-big-files.ts b/packages/ingest-github/src/strategies/flat-folder/phases/analyse-big-files.ts new file mode 100644 index 0000000..33f6446 --- /dev/null +++ b/packages/ingest-github/src/strategies/flat-folder/phases/analyse-big-files.ts @@ -0,0 +1,287 @@ +import { createHash } from "node:crypto"; +import { logger } from "@bb/logger"; +import { Config } from "@bb/types"; +import { getConfigValue } from "@bb/config"; +import type { AskLlmOptions } from "@bb/llm"; +import { LlmConfigError, LlmError } from "@bb/errors"; +import type { MetaPaths } from "#src/types/meta-paths.ts"; +import type { AnalyzedFileResult, SourceReader } from "#src/types/pipeline.ts"; +import type { ProgressContext } from "#src/progress/types.ts"; +import type { ConcurrencyLimiter } from "#src/pipeline/concurrency.ts"; +import type { ChunkAnalysisResult, FileChunk, HugeFileManifest } from "#src/types/big-file.ts"; +import type { CondensedFileAnalysis } from "#src/types/condensed-file-analysis.ts"; +import { throwIfCancelled, CancellationError } from "#src/pipeline/cancellation.ts"; +import { inspect } from "#src/strategies/flat-folder/big-file/cache.ts"; +import { splitFileIntoChunks } from "#src/strategies/flat-folder/big-file/chunker.ts"; +import { analyzeChunk } from "#src/strategies/flat-folder/big-file/chunk-analyzer.ts"; +import { condenseChunks } from "#src/strategies/flat-folder/big-file/condenser.ts"; +import { + loadChunkIfPresent, + saveChunk, + saveCondensed, + saveManifest, +} from "#src/strategies/flat-folder/big-file/storage.ts"; +import type { ScanManifest, ScanManifestEntry } from "#src/strategies/flat-folder/scan-manifest.ts"; +import type { ProcessBigFilesResult } from "#src/strategies/flat-folder/phases/process-big-files.ts"; +import { describe } from 
"#src/strategies/flat-folder/phases/process-big-files.ts"; + +const CONDENSE_MAX_ATTEMPTS = 2; +const CONDENSE_RETRY_BACKOFF_MS = 2000; + +export interface AnalyseBigFilesInput { + knowledgeId: string; + manifest: ScanManifest; + source: SourceReader; + metaPaths: MetaPaths; + limiter: ConcurrencyLimiter; + llmCallContext?: AskLlmOptions; + progressContext?: ProgressContext; +} + +interface BigFileState { + entry: ScanManifestEntry; + content: string; + chunks: FileChunk[]; + results: (ChunkAnalysisResult | undefined)[]; + pendingChunks: number; + fatal: boolean; +} + +/** + * Manifest-driven big-file phase. Every chunk of every big file is an + * independent task scheduled through the shared LLM limiter. As soon as the + * last chunk of a given file lands, that file's condense is scheduled — + * multiple condenses run in parallel with the still-pending chunks of slower + * files. All LLM calls (chunk + condense) check out from the same limiter. + * + * Files already fully processed (manifest + condensed on disk) are skipped. + */ +export async function analyseBigFiles(input: AnalyseBigFilesInput): Promise { + const maxTokensPerChunk = getConfigValue(Config.MaxTokensPerChunk); + const bigEntries = input.manifest.entries.filter((e) => e.kind === "big"); + + let cached = 0; + let failed = 0; + let processed = 0; + let totalInputTokens = 0; + let totalOutputTokens = 0; + let totalCostUsd = 0; + + // Per-file preparation: read content, chunk, record state. Sequential and + // cheap — no LLM calls here. 
+ const states: BigFileState[] = []; + for (const entry of bigEntries) { + throwIfCancelled(input.knowledgeId); + const status = await inspect(input.metaPaths, entry.relativePath); + if (status === "complete") { + cached += 1; + continue; + } + let content: string; + try { + content = await input.source.readFile(entry.relativePath); + } catch (cause: unknown) { + failed += 1; + logger.warn(`analyse-big: read failed for ${entry.relativePath}: ${describe(cause)}`); + continue; + } + if (content.length === 0) { + failed += 1; + logger.warn(`analyse-big: empty content for ${entry.relativePath}; skipping`); + continue; + } + const chunks = splitFileIntoChunks(entry.relativePath, content, maxTokensPerChunk); + states.push({ + entry, + content, + chunks, + results: new Array(chunks.length), + pendingChunks: chunks.length, + fatal: false, + }); + logger.info(`analyse-big: ${entry.relativePath} split into ${chunks.length} chunks`); + } + + const totalChunks = states.reduce((acc, s) => acc + s.chunks.length, 0); + const chunkReporter = input.progressContext?.reporter({ + phase: "file_analysis", + subPhase: "big_files_chunks", + total: { kind: "fixed", total: totalChunks }, + }); + await chunkReporter?.start(); + const condenseReporter = input.progressContext?.reporter({ + phase: "file_analysis", + subPhase: "big_files_condense", + total: { kind: "fixed", total: states.length }, + }); + await condenseReporter?.start(); + + // For oversized entries the legacy phase counted them; we accept the manifest + // already accounted for them via the small phase (which writes the stub). + // Surfaced here for parity with the legacy result shape. 
+ const skippedOversized = input.manifest.entries.filter((e) => e.kind === "oversized").length; + + const condensePromises: Promise[] = []; + + function maybeScheduleCondense(state: BigFileState): void { + if (state.pendingChunks > 0 || state.fatal) { + return; + } + const definedResults = state.results.filter((r): r is ChunkAnalysisResult => r !== undefined); + condensePromises.push( + input.limiter(async () => { + throwIfCancelled(input.knowledgeId); + let merged: AnalyzedFileResult | null = null; + for (let attempt = 1; attempt <= CONDENSE_MAX_ATTEMPTS; attempt += 1) { + try { + merged = await condenseChunks(state.entry.relativePath, definedResults, input.llmCallContext); + break; + } catch (cause: unknown) { + if (cause instanceof CancellationError) { + throw cause; + } + if (cause instanceof LlmConfigError || cause instanceof LlmError) { + throw cause; + } + if (attempt < CONDENSE_MAX_ATTEMPTS) { + logger.warn( + `analyse-big: condense attempt ${attempt}/${CONDENSE_MAX_ATTEMPTS} failed for ${state.entry.relativePath}; retrying: ${describe(cause)}`, + ); + await sleep(CONDENSE_RETRY_BACKOFF_MS); + continue; + } + failed += 1; + logger.warn( + `analyse-big: condense failed after ${CONDENSE_MAX_ATTEMPTS} attempts for ${state.entry.relativePath}: ${describe(cause)}`, + ); + } + } + if (merged === null) { + condenseReporter?.increment(1, { fileName: state.entry.relativePath }); + return; + } + + try { + const chunkInputTokens = definedResults.reduce((acc, r) => acc + (r.tokenUsage?.inputTokens ?? 0), 0); + const chunkOutputTokens = definedResults.reduce((acc, r) => acc + (r.tokenUsage?.outputTokens ?? 0), 0); + const chunkCostUsd = definedResults.reduce((acc, r) => acc + (r.tokenUsage?.costUsd ?? 0), 0); + const totalTokenCount = state.chunks.reduce((acc, c) => acc + c.tokenCount, 0); + const totalIn = chunkInputTokens + (merged.tokenUsage?.inputTokens ?? 0); + const totalOut = chunkOutputTokens + (merged.tokenUsage?.outputTokens ?? 
0); + const totalCost = chunkCostUsd + (merged.tokenUsage?.costUsd ?? 0); + + const manifest: HugeFileManifest = { + relativePath: state.entry.relativePath, + totalChunks: state.chunks.length, + totalTokenCount, + chunkPaths: state.chunks.map((_, i) => `chunks/${encodeFolder(state.entry.relativePath)}/chunk-${i}.json`), + generatedAt: new Date().toISOString(), + }; + await saveManifest(input.metaPaths, manifest); + + const condensed: CondensedFileAnalysis = { + relativePath: state.entry.relativePath, + language: merged.language, + sha256: sha256(state.content), + sizeBytes: state.entry.sizeBytes, + tokenCount: totalTokenCount, + isBigFile: true, + totalChunks: state.chunks.length, + totalTokenCount, + analysedAt: new Date().toISOString(), + analysis: merged.analysis, + tokenUsage: { inputTokens: totalIn, outputTokens: totalOut, costUsd: totalCost }, + }; + await saveCondensed(input.metaPaths, condensed); + + totalInputTokens += totalIn; + totalOutputTokens += totalOut; + totalCostUsd += totalCost; + processed += 1; + } catch (cause: unknown) { + if (cause instanceof CancellationError) { + throw cause; + } + failed += 1; + logger.warn(`analyse-big: persist failed for ${state.entry.relativePath}: ${describe(cause)}`); + } finally { + condenseReporter?.increment(1, { fileName: state.entry.relativePath }); + } + }), + ); + } + + const chunkPromises: Promise[] = []; + for (const state of states) { + for (let i = 0; i < state.chunks.length; i += 1) { + const idx = i; + const chunk = state.chunks[idx]; + if (chunk === undefined) { + continue; + } + chunkPromises.push( + input.limiter(async () => { + throwIfCancelled(input.knowledgeId); + try { + const cachedChunk = await loadChunkIfPresent(input.metaPaths, state.entry.relativePath, idx); + if (cachedChunk !== null) { + state.results[idx] = cachedChunk; + } else { + const analyzed = await analyzeChunk(chunk, input.llmCallContext); + await saveChunk(input.metaPaths, analyzed); + state.results[idx] = analyzed; + } + } catch 
(cause: unknown) { + if (cause instanceof CancellationError) { + throw cause; + } + if (cause instanceof LlmConfigError || cause instanceof LlmError) { + state.fatal = true; + throw cause; + } + logger.warn( + `analyse-big: chunk ${idx + 1}/${state.chunks.length} failed for ${state.entry.relativePath}: ${describe(cause)}`, + ); + } finally { + state.pendingChunks -= 1; + chunkReporter?.increment(1, { fileName: `${state.entry.relativePath}#chunk-${String(idx)}` }); + maybeScheduleCondense(state); + } + }), + ); + } + } + + try { + await Promise.all(chunkPromises); + await Promise.all(condensePromises); + } finally { + chunkReporter?.stop(); + condenseReporter?.stop(); + } + + logger.info( + `analyse-big done: processed=${processed} cached=${cached} failed=${failed} skippedOversized=${skippedOversized}`, + ); + return { + processed, + cached, + failed, + skippedOversized, + tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens, costUsd: totalCostUsd }, + }; +} + +function sha256(content: string): string { + return createHash("sha256").update(content).digest("hex"); +} + +function encodeFolder(relativePath: string): string { + return relativePath.replace(/\//gu, "__SL__").replace(/\\/gu, "__BS__"); +} + +function sleep(ms: number): Promise { + return new Promise((resolve) => { + setTimeout(resolve, ms); + }); +} diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/analyse-small.ts b/packages/ingest-github/src/strategies/flat-folder/phases/analyse-small.ts new file mode 100644 index 0000000..5176f7f --- /dev/null +++ b/packages/ingest-github/src/strategies/flat-folder/phases/analyse-small.ts @@ -0,0 +1,133 @@ +import { logger } from "@bb/logger"; +import type { AskLlmOptions } from "@bb/llm"; +import { LlmConfigError, LlmError } from "@bb/errors"; +import type { ArchiveSink, FileAnalyzer, ScannedFile, SourceReader } from "#src/types/pipeline.ts"; +import type { MetaPaths } from "#src/types/meta-paths.ts"; +import type { 
ProgressContext } from "#src/progress/types.ts"; +import type { ConcurrencyLimiter } from "#src/pipeline/concurrency.ts"; +import { throwIfCancelled, CancellationError } from "#src/pipeline/cancellation.ts"; +import { analyseScannedFile, buildOversizedStub } from "#src/strategies/flat-folder/analyse-file.ts"; +import { saveCondensed } from "#src/strategies/flat-folder/big-file/storage.ts"; +import type { ScanManifest } from "#src/strategies/flat-folder/scan-manifest.ts"; + +export interface AnalyseSmallInput { + knowledgeId: string; + manifest: ScanManifest; + source: SourceReader; + metaPaths: MetaPaths; + analyzer: FileAnalyzer; + limiter: ConcurrencyLimiter; + archiveSink?: ArchiveSink; + llmCallContext?: AskLlmOptions; + progressContext?: ProgressContext; +} + +export interface AnalyseSmallResult { + smallFilesAnalysed: number; + oversizedStubs: number; + failed: number; + tokenUsage: { inputTokens: number; outputTokens: number; costUsd: number }; +} + +/** + * Consumes the `scan-manifest.json` produced by `scanAndClassify` and + * analyses every `kind: "small"` entry through the shared LLM limiter. + * + * Oversized stubs are also written here (they don't go through the LLM but + * still need a placeholder analysis row on disk so downstream phases see a + * complete file set). 
+ */ +export async function analyseSmallFiles(input: AnalyseSmallInput): Promise { + const smallEntries = input.manifest.entries.filter((e) => e.kind === "small"); + const oversizedEntries = input.manifest.entries.filter((e) => e.kind === "oversized"); + + let smallFilesAnalysed = 0; + let oversizedStubs = 0; + let failed = 0; + let totalInputTokens = 0; + let totalOutputTokens = 0; + let totalCostUsd = 0; + + const reporter = input.progressContext?.reporter({ + phase: "file_analysis", + subPhase: "analyse_small", + total: { kind: "fixed", total: smallEntries.length + oversizedEntries.length }, + }); + await reporter?.start(); + + try { + for (const entry of oversizedEntries) { + throwIfCancelled(input.knowledgeId); + try { + await saveCondensed(input.metaPaths, buildOversizedStub(entry.relativePath, entry.sizeBytes)); + oversizedStubs += 1; + } catch (cause: unknown) { + failed += 1; + logger.warn(`analyse-small: oversized stub write failed for ${entry.relativePath}: ${describe(cause)}`); + } + reporter?.increment(1, { fileName: entry.relativePath }); + } + + const pending: Promise[] = []; + for (const entry of smallEntries) { + pending.push( + input.limiter(async () => { + throwIfCancelled(input.knowledgeId); + try { + const content = await input.source.readFile(entry.relativePath); + const scanned: ScannedFile = { + kind: "file", + relativePath: entry.relativePath, + absolutePath: entry.absolutePath, + sizeBytes: entry.sizeBytes, + content, + }; + const condensed = await analyseScannedFile(input.analyzer, scanned, input.llmCallContext); + await saveCondensed(input.metaPaths, condensed); + if (input.archiveSink !== undefined) { + await input.archiveSink.push({ + knowledgeId: input.knowledgeId, + relativePath: entry.relativePath, + content, + }); + } + if (condensed.tokenUsage) { + totalInputTokens += condensed.tokenUsage.inputTokens; + totalOutputTokens += condensed.tokenUsage.outputTokens; + totalCostUsd += condensed.tokenUsage.costUsd; + } + smallFilesAnalysed 
+= 1; + reporter?.increment(1, { fileName: entry.relativePath }); + } catch (cause: unknown) { + if (cause instanceof CancellationError) { + throw cause; + } + if (cause instanceof LlmConfigError || cause instanceof LlmError) { + throw cause; + } + failed += 1; + logger.warn(`analyse-small: analyse failed for ${entry.relativePath}: ${describe(cause)}`); + reporter?.increment(1, { fileName: entry.relativePath }); + } + }), + ); + } + await Promise.all(pending); + } finally { + reporter?.stop(); + } + + logger.info( + `analyse-small done: smallFilesAnalysed=${smallFilesAnalysed} oversizedStubs=${oversizedStubs} failed=${failed}`, + ); + return { + smallFilesAnalysed, + oversizedStubs, + failed, + tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens, costUsd: totalCostUsd }, + }; +} + +function describe(cause: unknown): string { + return cause instanceof Error ? cause.message : String(cause); +} diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/classify-and-analyse-small.ts b/packages/ingest-github/src/strategies/flat-folder/phases/classify-and-analyse-small.ts deleted file mode 100644 index 6cd5c7f..0000000 --- a/packages/ingest-github/src/strategies/flat-folder/phases/classify-and-analyse-small.ts +++ /dev/null @@ -1,122 +0,0 @@ -import path from "node:path"; -import { tokenLen } from "@bb/llm"; -import { logger } from "@bb/logger"; -import { Config } from "@bb/types"; -import { getConfigValue } from "@bb/config"; -import type { ArchiveSink, FileAnalyzer, SkipDecider, SourceReader } from "src/types/pipeline.ts"; -import type { MetaPaths } from "src/types/meta-paths.ts"; -import type { BigFileEntry } from "src/types/big-file.ts"; -import { withConcurrency } from "src/pipeline/concurrency.ts"; -import { throwIfCancelled, CancellationError } from "src/pipeline/cancellation.ts"; -import { makeSkipDecider } from "src/pipeline/skip-decisions/index.ts"; -import { analyseScannedFile, buildOversizedStub } from 
"src/strategies/flat-folder/analyse-file.ts"; -import { saveCondensed } from "src/strategies/flat-folder/big-file/storage.ts"; -import { writeBigFiles } from "src/strategies/flat-folder/big-file/detector.ts"; - -export interface ClassifyPhaseInput { - knowledgeId: string; - source: SourceReader; - metaPaths: MetaPaths; - analyzer: FileAnalyzer; - skipDecider?: SkipDecider; - archiveSink?: ArchiveSink; -} - -export interface ClassifyPhaseResult { - smallFilesAnalysed: number; - bigFilesQueued: number; - oversizedStubs: number; - failed: number; -} - -export async function classifyAndAnalyseSmall(input: ClassifyPhaseInput): Promise { - const contextWindowLimit = getConfigValue(Config.ContextWindowLimit); - const concurrentWorkers = getConfigValue(Config.ConcurrentWorkers); - const limit = withConcurrency(concurrentWorkers); - const bigFileBuffer: BigFileEntry[] = []; - let smallFilesAnalysed = 0; - let oversizedStubs = 0; - let failed = 0; - - const repositoryHint = - input.source.localRepoDir.length > 0 ? path.basename(input.source.localRepoDir) : input.knowledgeId; - const skipDecider = input.skipDecider ?? 
makeSkipDecider({ repositoryName: repositoryHint }); - - const pending: Promise[] = []; - - for await (const entry of input.source.scan({ skipDecider })) { - throwIfCancelled(input.knowledgeId); - - if (entry.kind === "oversized") { - bigFileBuffer.push({ - relativePath: entry.relativePath, - sizeBytes: entry.sizeBytes, - tokenCount: 0, - reason: "too-large", - }); - try { - await saveCondensed(input.metaPaths, buildOversizedStub(entry.relativePath, entry.sizeBytes)); - oversizedStubs += 1; - } catch (cause: unknown) { - failed += 1; - logger.warn(`phase1: oversized stub write failed for ${entry.relativePath}: ${describe(cause)}`); - } - continue; - } - - const tokenCount = tokenLen(entry.content); - if (tokenCount > contextWindowLimit) { - bigFileBuffer.push({ - relativePath: entry.relativePath, - sizeBytes: entry.sizeBytes, - tokenCount, - reason: "context-window-exceeded", - }); - continue; - } - - const fileContent = entry.content; - const filePath = entry.relativePath; - pending.push( - limit(async () => { - try { - throwIfCancelled(input.knowledgeId); - const condensed = await analyseScannedFile(input.analyzer, entry); - await saveCondensed(input.metaPaths, condensed); - if (input.archiveSink !== undefined) { - await input.archiveSink.push({ - knowledgeId: input.knowledgeId, - relativePath: filePath, - content: fileContent, - }); - } - smallFilesAnalysed += 1; - } catch (cause: unknown) { - if (cause instanceof CancellationError) { - throw cause; - } - failed += 1; - logger.warn(`phase1: analyse failed for ${entry.relativePath}: ${describe(cause)}`); - } - }), - ); - } - - await Promise.all(pending); - - await writeBigFiles(input.metaPaths, bigFileBuffer); - - logger.info( - `phase1 done: smallFilesAnalysed=${smallFilesAnalysed} bigFilesQueued=${bigFileBuffer.filter((e) => e.reason === "context-window-exceeded").length} oversizedStubs=${oversizedStubs} failed=${failed}`, - ); - return { - smallFilesAnalysed, - bigFilesQueued: bigFileBuffer.filter((e) => 
e.reason === "context-window-exceeded").length, - oversizedStubs, - failed, - }; -} - -function describe(cause: unknown): string { - return cause instanceof Error ? cause.message : String(cause); -} diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts b/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts index 174be8a..951b10e 100644 --- a/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts +++ b/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts @@ -1,15 +1,20 @@ import { logger } from "@bb/logger"; -import type { MetaPaths } from "src/types/meta-paths.ts"; -import type { SourceReader } from "src/types/pipeline.ts"; -import { throwIfCancelled, CancellationError } from "src/pipeline/cancellation.ts"; -import { readBigFiles } from "src/strategies/flat-folder/big-file/detector.ts"; -import { inspect } from "src/strategies/flat-folder/big-file/cache.ts"; -import { processBigFile } from "src/strategies/flat-folder/big-file/index.ts"; +import type { AskLlmOptions } from "@bb/llm"; +import { LlmConfigError, LlmError } from "@bb/errors"; +import type { MetaPaths } from "#src/types/meta-paths.ts"; +import type { SourceReader } from "#src/types/pipeline.ts"; +import type { ProgressContext } from "#src/progress/types.ts"; +import { throwIfCancelled, CancellationError } from "#src/pipeline/cancellation.ts"; +import { readBigFiles } from "#src/strategies/flat-folder/big-file/detector.ts"; +import { inspect } from "#src/strategies/flat-folder/big-file/cache.ts"; +import { processBigFile } from "#src/strategies/flat-folder/big-file/index.ts"; export interface ProcessBigFilesInput { knowledgeId: string; source: SourceReader; metaPaths: MetaPaths; + llmCallContext?: AskLlmOptions; + progressContext?: ProgressContext; } export interface ProcessBigFilesResult { @@ -17,62 +22,104 @@ export interface ProcessBigFilesResult { cached: number; failed: number; 
skippedOversized: number; + tokenUsage: { inputTokens: number; outputTokens: number; costUsd: number }; } +/** + * Legacy big-file driver. Reads the deprecated `bigFiles.json`, processes + * each entry serially via `processBigFile` (which internally does + * chunk-then-condense). Kept for the pull-path (`pipeline/pull.ts`) and any + * caller that has not migrated to `analyseBigFiles(manifest, …)` yet. + */ export async function processBigFilesQueue(input: ProcessBigFilesInput): Promise { const entries = await readBigFiles(input.metaPaths); let processed = 0; let cached = 0; let failed = 0; let skippedOversized = 0; + let totalInputTokens = 0; + let totalOutputTokens = 0; + let totalCostUsd = 0; - for (const entry of entries) { - throwIfCancelled(input.knowledgeId); - if (entry.reason === "too-large") { - skippedOversized += 1; - continue; - } - const status = await inspect(input.metaPaths, entry.relativePath); - if (status === "complete") { - cached += 1; - continue; - } - let content: string; - try { - content = await input.source.readFile(entry.relativePath); - } catch (cause: unknown) { - failed += 1; - logger.warn(`phase2: read failed for ${entry.relativePath}: ${describe(cause)}`); - continue; - } - if (content.length === 0) { - failed += 1; - logger.warn(`phase2: empty content for ${entry.relativePath}; skipping`); - continue; - } - try { - await processBigFile({ - knowledgeId: input.knowledgeId, - metaPaths: input.metaPaths, - relativePath: entry.relativePath, - content, - sizeBytes: entry.sizeBytes, - }); - processed += 1; - } catch (cause: unknown) { - if (cause instanceof CancellationError) { - throw cause; + const reporter = input.progressContext?.reporter({ + phase: "file_analysis", + subPhase: "big_files_queue", + total: { kind: "fixed", total: entries.length }, + }); + await reporter?.start(); + + try { + for (const entry of entries) { + throwIfCancelled(input.knowledgeId); + if (entry.reason === "too-large") { + skippedOversized += 1; + 
reporter?.increment(1, { fileName: entry.relativePath }); + continue; + } + const status = await inspect(input.metaPaths, entry.relativePath); + if (status === "complete") { + cached += 1; + reporter?.increment(1, { fileName: entry.relativePath }); + continue; + } + let content: string; + try { + content = await input.source.readFile(entry.relativePath); + } catch (cause: unknown) { + failed += 1; + logger.warn(`big-files-queue: read failed for ${entry.relativePath}: ${describe(cause)}`); + reporter?.increment(1, { fileName: entry.relativePath }); + continue; + } + if (content.length === 0) { + failed += 1; + logger.warn(`big-files-queue: empty content for ${entry.relativePath}; skipping`); + reporter?.increment(1, { fileName: entry.relativePath }); + continue; + } + try { + const condensed = await processBigFile({ + knowledgeId: input.knowledgeId, + metaPaths: input.metaPaths, + relativePath: entry.relativePath, + content, + sizeBytes: entry.sizeBytes, + ...(input.llmCallContext !== undefined ? { llmCallContext: input.llmCallContext } : {}), + ...(input.progressContext !== undefined ? 
{ progressContext: input.progressContext } : {}), + }); + processed += 1; + if (condensed.tokenUsage) { + totalInputTokens += condensed.tokenUsage.inputTokens; + totalOutputTokens += condensed.tokenUsage.outputTokens; + totalCostUsd += condensed.tokenUsage.costUsd; + } + } catch (cause: unknown) { + if (cause instanceof CancellationError) { + throw cause; + } + if (cause instanceof LlmConfigError || cause instanceof LlmError) { + throw cause; + } + failed += 1; + logger.warn(`big-files-queue: processBigFile failed for ${entry.relativePath}: ${describe(cause)}`); } - failed += 1; - logger.warn(`phase2: processBigFile failed for ${entry.relativePath}: ${describe(cause)}`); + reporter?.increment(1, { fileName: entry.relativePath }); } + logger.info( + `big-files-queue done: processed=${processed} cached=${cached} failed=${failed} skippedOversized=${skippedOversized}`, + ); + return { + processed, + cached, + failed, + skippedOversized, + tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens, costUsd: totalCostUsd }, + }; + } finally { + reporter?.stop(); } - logger.info( - `phase2 done: processed=${processed} cached=${cached} failed=${failed} skippedOversized=${skippedOversized}`, - ); - return { processed, cached, failed, skippedOversized }; } -function describe(cause: unknown): string { +export function describe(cause: unknown): string { return cause instanceof Error ? 
cause.message : String(cause); } diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/scan-and-classify.ts b/packages/ingest-github/src/strategies/flat-folder/phases/scan-and-classify.ts new file mode 100644 index 0000000..6dc92a7 --- /dev/null +++ b/packages/ingest-github/src/strategies/flat-folder/phases/scan-and-classify.ts @@ -0,0 +1,143 @@ +import path from "node:path"; +import { Config } from "@bb/types"; +import { getConfigValue } from "@bb/config"; +import { logger } from "@bb/logger"; +import type { AskLlmOptions } from "@bb/llm"; +import type { MetaPaths } from "#src/types/meta-paths.ts"; +import type { BigFileEntry } from "#src/types/big-file.ts"; +import type { SkipDecider, SourceReader } from "#src/types/pipeline.ts"; +import type { ProgressContext } from "#src/progress/types.ts"; +import type { ConcurrencyLimiter } from "#src/pipeline/concurrency.ts"; +import { throwIfCancelled } from "#src/pipeline/cancellation.ts"; +import { makeSkipDecider } from "#src/pipeline/skip-decisions/index.ts"; +import { classifyByTokens, writeBigFiles } from "#src/strategies/flat-folder/big-file/detector.ts"; +import { + emptyManifest, + writeScanManifest, + type ScanManifest, + type ScanManifestEntry, +} from "#src/strategies/flat-folder/scan-manifest.ts"; + +export interface ScanAndClassifyInput { + knowledgeId: string; + source: SourceReader; + metaPaths: MetaPaths; + skipDecider?: SkipDecider; + llmCallContext?: AskLlmOptions; + progressContext?: ProgressContext; + /** + * Shared LLM-concurrency limiter. When supplied the underlying + * `scanRepository` runs its two-pass strategy: walk + cache-only decisions + * first, then parallel-deduplicated LLM resolution for unknown + * extensions/filenames under this limiter. Optional so the function + * still works standalone. 
+ */ + limiter?: ConcurrencyLimiter; +} + +export interface ScanAndClassifyResult { + manifest: ScanManifest; +} + +/** + * Walks the repo once, classifies every eligible file as small / big / + * oversized by token count, and writes `scan-manifest.json`. The downstream + * small-file and big-file phases consume the manifest instead of re-walking. + * + * Also writes the legacy `bigFiles.json` so the pull-path and backfill phases + * (which still read it directly) keep working without migration. + */ +export async function scanAndClassify(input: ScanAndClassifyInput): Promise { + const contextWindowLimit = getConfigValue(Config.ContextWindowLimit); + const maxTokensPerChunk = getConfigValue(Config.MaxTokensPerChunk); + const manifest = emptyManifest(); + const bigFileEntries: BigFileEntry[] = []; + + const repositoryHint = + input.source.localRepoDir.length > 0 ? path.basename(input.source.localRepoDir) : input.knowledgeId; + const skipDecider = input.skipDecider ?? makeSkipDecider({ repositoryName: repositoryHint }); + + const reporter = input.progressContext?.reporter({ + phase: "scan", + total: { kind: "growing" }, + }); + await reporter?.start(); + + try { + const scanDeps: Parameters[0] = { skipDecider }; + if (input.limiter !== undefined) { + scanDeps.limiter = input.limiter; + } + if (input.llmCallContext !== undefined) { + scanDeps.llmCallContext = input.llmCallContext; + } + + for await (const entry of input.source.scan(scanDeps)) { + throwIfCancelled(input.knowledgeId); + reporter?.incrementSeen(); + + if (entry.kind === "oversized") { + const manifestEntry: ScanManifestEntry = { + relativePath: entry.relativePath, + absolutePath: entry.absolutePath, + sizeBytes: entry.sizeBytes, + tokenCount: 0, + kind: "oversized", + }; + manifest.entries.push(manifestEntry); + manifest.summary.oversizedCount += 1; + manifest.summary.totalFiles += 1; + bigFileEntries.push({ + relativePath: entry.relativePath, + sizeBytes: entry.sizeBytes, + tokenCount: 0, + reason: 
"too-large", + }); + reporter?.increment(1, { fileName: entry.relativePath }); + continue; + } + + const { tokenCount, isBigFile } = classifyByTokens(entry.content, contextWindowLimit); + manifest.summary.totalFiles += 1; + manifest.summary.totalTokens += tokenCount; + if (isBigFile) { + const estimatedChunks = Math.max(1, Math.ceil(tokenCount / maxTokensPerChunk)); + manifest.entries.push({ + relativePath: entry.relativePath, + absolutePath: entry.absolutePath, + sizeBytes: entry.sizeBytes, + tokenCount, + kind: "big", + estimatedChunks, + }); + manifest.summary.bigCount += 1; + manifest.summary.estimatedBigChunks += estimatedChunks; + bigFileEntries.push({ + relativePath: entry.relativePath, + sizeBytes: entry.sizeBytes, + tokenCount, + reason: "context-window-exceeded", + }); + } else { + manifest.entries.push({ + relativePath: entry.relativePath, + absolutePath: entry.absolutePath, + sizeBytes: entry.sizeBytes, + tokenCount, + kind: "small", + }); + manifest.summary.smallCount += 1; + } + reporter?.increment(1, { fileName: entry.relativePath }); + } + } finally { + reporter?.stop(); + } + + await writeScanManifest(input.metaPaths, manifest); + await writeBigFiles(input.metaPaths, bigFileEntries); + logger.info( + `scan-and-classify done: total=${manifest.summary.totalFiles} small=${manifest.summary.smallCount} big=${manifest.summary.bigCount} oversized=${manifest.summary.oversizedCount} totalTokens=${manifest.summary.totalTokens} estimatedBigChunks=${manifest.summary.estimatedBigChunks}`, + ); + return { manifest }; +} diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/store-flat-analysis.ts b/packages/ingest-github/src/strategies/flat-folder/phases/store-flat-analysis.ts index d6e14b7..7db4433 100644 --- a/packages/ingest-github/src/strategies/flat-folder/phases/store-flat-analysis.ts +++ b/packages/ingest-github/src/strategies/flat-folder/phases/store-flat-analysis.ts @@ -1,20 +1,33 @@ import { readFile } from "node:fs/promises"; +import { 
Config } from "@bb/types"; +import { getConfigValue } from "@bb/config"; import { logger } from "@bb/logger"; -import { ensureFlatFolderIndexes, upsertFileNode, upsertFolderNode, upsertRepoNode, type NodeScope } from "@bb/neo4j"; +import { + ensureFlatFolderIndexes, + upsertFileNodesBatch, + upsertFolderNodesBatch, + upsertRepoNode, + type NodeScope, + type UpsertFileNodeInput, + type UpsertFolderNodeInput, +} from "@bb/neo4j"; import type { GithubIndexPayload } from "@bb/types"; -import type { MetaPaths } from "src/types/meta-paths.ts"; -import { throwIfCancelled } from "src/pipeline/cancellation.ts"; -import { iterateCondensed } from "src/strategies/flat-folder/big-file/storage.ts"; -import { iterateFolderSummaries } from "src/strategies/flat-folder/folder-summary.ts"; -import { directFolderOf } from "src/strategies/flat-folder/folder-path.ts"; -import { languageFromPath } from "src/adapters/llm-file-analyzer.ts"; -import type { FolderSummary, RepoSummary, RepoSummaryEnvelope } from "src/strategies/flat-folder/types.ts"; +import type { MetaPaths } from "#src/types/meta-paths.ts"; +import { throwIfCancelled } from "#src/pipeline/cancellation.ts"; +import type { FileAnalysisCache } from "#src/strategies/flat-folder/file-analysis-cache.ts"; +import { iterateFolderSummaries } from "#src/strategies/flat-folder/folder-summary.ts"; +import { directFolderOf } from "#src/strategies/flat-folder/folder-path.ts"; +import { languageFromPath } from "#src/adapters/llm-file-analyzer.ts"; +import type { ProgressContext } from "#src/progress/types.ts"; +import type { FolderSummary, RepoSummary, RepoSummaryEnvelope } from "#src/strategies/flat-folder/types.ts"; export interface StoreFlatAnalysisInput { scope: NodeScope; payload: GithubIndexPayload; branch: string; metaPaths: MetaPaths; + cache: FileAnalysisCache; + progressContext?: ProgressContext; } export interface StoreFlatAnalysisResult { @@ -27,10 +40,10 @@ export async function storeFlatAnalysis(input: 
StoreFlatAnalysisInput): Promise< throwIfCancelled(input.scope.knowledgeId); await ensureFlatFolderIndexes(); - let nodesWritten = 0; - let foldersWritten = 0; - let filesWritten = 0; + const batchSize = getConfigValue(Config.Neo4jBatchSize); + // 1. :Repo node — single upsert, not batched (one repo per knowledge). + let nodesWritten = 0; const repoSummary = await readRepoSummary(input.metaPaths); if (repoSummary !== null) { await upsertRepoNode({ @@ -47,7 +60,6 @@ export async function storeFlatAnalysis(input: StoreFlatAnalysisInput): Promise< keyPatterns: repoSummary.keyPatterns, }, }); - nodesWritten += 1; } else { logger.warn(`phase7: no repo summary on disk; writing :Repo with empty summary`); await upsertRepoNode({ @@ -56,51 +68,104 @@ export async function storeFlatAnalysis(input: StoreFlatAnalysisInput): Promise< branch: input.branch, summary: emptyRepoSummaryPayload(), }); - nodesWritten += 1; } + nodesWritten += 1; + // 2. Collect every folder we'll upsert: the on-disk folder summaries plus + // synthesised parents for any file whose folder didn't get a summary. Doing + // this up front gives both reporters real fixed totals so `overallProgress` + // doesn't leap to 100 the moment the folder loop completes (the previous + // UX bug where the file sub-phase registered too late to dilute the + // indexing aggregate). 
+ const folderInputs: UpsertFolderNodeInput[] = []; const folderPaths = new Set(); for await (const folder of iterateFolderSummaries(input.metaPaths)) { - throwIfCancelled(input.scope.knowledgeId); - await upsertFolderNode({ + folderInputs.push({ scope: input.scope, folderPath: folder.folderPath, summary: shapeFolderPayload(folder), }); folderPaths.add(folder.folderPath); - foldersWritten += 1; - nodesWritten += 1; } - - for await (const file of iterateCondensed(input.metaPaths)) { - throwIfCancelled(input.scope.knowledgeId); + for (const file of input.cache.values()) { const folderPath = directFolderOf(file.relativePath); if (!folderPaths.has(folderPath)) { - await upsertFolderNode({ + folderInputs.push({ scope: input.scope, folderPath, summary: emptyFolderPayload(), }); folderPaths.add(folderPath); - foldersWritten += 1; - nodesWritten += 1; } - await upsertFileNode({ - orgId: input.scope.orgId, - knowledgeId: input.scope.knowledgeId, - repoId: input.scope.repoId, - relativePath: file.relativePath, - folderPath, - language: file.language.length > 0 ? file.language : languageFromPath(file.relativePath), - sha: file.sha256, - sizeBytes: file.sizeBytes, - analysis: file.analysis, - isBigFile: file.isBigFile, - totalChunks: file.totalChunks, - totalTokenCount: file.totalTokenCount, - }); - filesWritten += 1; - nodesWritten += 1; + } + + // 3. Both reporters open at phase entry with their true totals so the + // overall-progress aggregate sees both denominators from the first tick. + const folderReporter = input.progressContext?.reporter({ + phase: "indexing", + subPhase: "folders", + total: { kind: "fixed", total: folderInputs.length }, + }); + const fileReporter = input.progressContext?.reporter({ + phase: "indexing", + subPhase: "files", + total: { kind: "fixed", total: input.cache.size }, + }); + await folderReporter?.start(); + await fileReporter?.start(); + + let foldersWritten = 0; + let filesWritten = 0; + try { + // 4. Batched folder upserts. 
+ logger.info( + `phase7: folder upsert dispatching ${Math.ceil(folderInputs.length / batchSize)} batches of up to ${batchSize} folders (total=${folderInputs.length})`, + ); + for (let i = 0; i < folderInputs.length; i += batchSize) { + throwIfCancelled(input.scope.knowledgeId); + const batch = folderInputs.slice(i, i + batchSize); + await upsertFolderNodesBatch(batch); + foldersWritten += batch.length; + nodesWritten += batch.length; + for (const item of batch) { + folderReporter?.increment(1, { fileName: item.folderPath || "" }); + } + } + + // 5. Batched file upserts. + const fileInputs: UpsertFileNodeInput[] = []; + for (const file of input.cache.values()) { + fileInputs.push({ + orgId: input.scope.orgId, + knowledgeId: input.scope.knowledgeId, + repoId: input.scope.repoId, + relativePath: file.relativePath, + folderPath: directFolderOf(file.relativePath), + language: file.language.length > 0 ? file.language : languageFromPath(file.relativePath), + sha: file.sha256, + sizeBytes: file.sizeBytes, + analysis: file.analysis, + isBigFile: file.isBigFile, + totalChunks: file.totalChunks, + totalTokenCount: file.totalTokenCount, + }); + } + logger.info( + `phase7: file upsert dispatching ${Math.ceil(fileInputs.length / batchSize)} batches of up to ${batchSize} files (total=${fileInputs.length})`, + ); + for (let i = 0; i < fileInputs.length; i += batchSize) { + throwIfCancelled(input.scope.knowledgeId); + const batch = fileInputs.slice(i, i + batchSize); + await upsertFileNodesBatch(batch); + filesWritten += batch.length; + nodesWritten += batch.length; + for (const item of batch) { + fileReporter?.increment(1, { fileName: item.relativePath }); + } + } + } finally { + folderReporter?.stop(); + fileReporter?.stop(); } logger.info(`phase7 done: nodesWritten=${nodesWritten} folders=${foldersWritten} files=${filesWritten}`); diff --git a/packages/ingest-github/src/strategies/flat-folder/prompts/folder-summary.ts 
b/packages/ingest-github/src/strategies/flat-folder/prompts/folder-summary.ts index 465c9dc..30e110b 100644 --- a/packages/ingest-github/src/strategies/flat-folder/prompts/folder-summary.ts +++ b/packages/ingest-github/src/strategies/flat-folder/prompts/folder-summary.ts @@ -1,4 +1,4 @@ -import type { CondensedFileAnalysis } from "src/types/condensed-file-analysis.ts"; +import type { CondensedFileAnalysis } from "#src/types/condensed-file-analysis.ts"; export const FOLDER_ANALYSIS_SYSTEM_PROMPT = `You are summarising a single FOLDER of a source repository. The user will provide the per-file analyses of the files DIRECTLY inside that folder (subfolders are summarised separately and are NOT in your input). @@ -40,3 +40,57 @@ Per-file analyses (direct children only): ${serialised}`; } + +export const FOLDER_BATCH_SYSTEM_PROMPT = `You are summarising MULTIPLE small folders of a source repository in one pass. The user will provide several folders, each labeled with an integer ID (0, 1, 2, ...). Each folder lists the files directly inside it (subfolders are summarised separately and are NOT in your input). + +Return ONLY a JSON object whose keys are the integer labels as strings ("0", "1", ...) and whose values are folder-summary objects with EXACTLY these keys: + +- purpose : string — one-paragraph explanation of what this folder is responsible for. +- summary : string — natural-language summary of how the files in this folder work together. Plain English, no key-value pairs. ≤ 300 tokens. +- keywords : string[] — up to 10 domain keywords describing this folder. +- classes : string[] — most important class/type entries, deduplicated. Format "Name: short purpose". Max 15 entries. +- functions : string[] — most important function/method entries, deduplicated. Format "name: short purpose". Max 15 entries. +- importsInternal : string[] — significant relative imports observed across the folder's files. Max 15 entries. 
+- importsExternal : string[] — significant external packages observed across the folder's files. Max 15 entries. +- dependencyGraph : string — Mermaid \`graph LR\` block (no triple-backtick fences) of inter-file dependencies. Empty string if not enough signal. + +You MUST return one entry per labeled folder, even if some fields are empty arrays. Do NOT invent files not listed. Do NOT speculate about subfolders. Do NOT add keys outside the integer-label set; do NOT add commentary outside the JSON object.`; + +export interface BatchedFolderInput { + label: number; + folderPath: string; + files: CondensedFileAnalysis[]; +} + +export function folderBatchUserPrompt(batch: BatchedFolderInput[]): string { + const sections = batch.map((b) => { + const folderLabel = b.folderPath.length === 0 ? "" : b.folderPath; + const fileLines = b.files.map((f) => `- ${f.relativePath}: ${f.analysis.purpose}`).join("\n"); + const aggregatedKeywords = aggregateKeywords(b.files, 10); + return `### Folder ${b.label} :: ${folderLabel} +Files: ${b.files.length} +${fileLines} +Aggregated keywords: ${JSON.stringify(aggregatedKeywords)}`; + }); + return `You are summarising ${batch.length} folder(s). Produce one folder-summary object per labeled folder. 
+ +${sections.join("\n\n")}`; +} + +function aggregateKeywords(files: CondensedFileAnalysis[], cap: number): string[] { + const seen = new Set(); + const out: string[] = []; + for (const f of files) { + for (const k of f.analysis.keywords) { + if (typeof k !== "string" || k.length === 0 || seen.has(k)) { + continue; + } + seen.add(k); + out.push(k); + if (out.length >= cap) { + return out; + } + } + } + return out; +} diff --git a/packages/ingest-github/src/strategies/flat-folder/repo-summary.ts b/packages/ingest-github/src/strategies/flat-folder/repo-summary.ts index b41fd52..13d54fb 100644 --- a/packages/ingest-github/src/strategies/flat-folder/repo-summary.ts +++ b/packages/ingest-github/src/strategies/flat-folder/repo-summary.ts @@ -1,10 +1,11 @@ import { writeFile } from "node:fs/promises"; -import { askJsonLLM, tokenLen } from "@bb/llm"; +import { askJsonLLM, tokenLen, type AskLlmOptions } from "@bb/llm"; +import { LlmConfigError, LlmError } from "@bb/errors"; import { logger } from "@bb/logger"; import { Config } from "@bb/types"; import { getConfigValue } from "@bb/config"; -import type { MetaPaths } from "src/types/meta-paths.ts"; -import { throwIfCancelled } from "src/pipeline/cancellation.ts"; +import type { MetaPaths } from "#src/types/meta-paths.ts"; +import { throwIfCancelled } from "#src/pipeline/cancellation.ts"; import { iterateFolderSummaries } from "./folder-summary.ts"; import { REPO_SUMMARY_SYSTEM_PROMPT, @@ -25,14 +26,24 @@ interface RepoSummaryJson { keyPatterns?: unknown; } -export async function summariseRepo(knowledgeId: string, metaPaths: MetaPaths): Promise { +export async function summariseRepo( + knowledgeId: string, + metaPaths: MetaPaths, + llmCallContext?: AskLlmOptions, +): Promise<{ + summary: RepoSummary | null; + tokenUsage: { inputTokens: number; outputTokens: number; costUsd: number }; +}> { const folders: FolderSummary[] = []; for await (const f of iterateFolderSummaries(metaPaths)) { folders.push(f); } + let totalInputTokens 
= 0; + let totalOutputTokens = 0; + let totalCostUsd = 0; if (folders.length === 0) { logger.warn(`phase6: no folder summaries on disk; skipping repo summary`); - return null; + return { summary: null, tokenUsage: { inputTokens: 0, outputTokens: 0, costUsd: 0 } }; } folders.sort((a, b) => a.folderPath.split("/").length - b.folderPath.split("/").length); const infos = repoFolderInfosFrom(folders); @@ -42,7 +53,7 @@ export async function summariseRepo(knowledgeId: string, metaPaths: MetaPaths): const oneShotPrompt = buildRepoPromptFromFolders(infos); if (tokenLen(oneShotPrompt) + promptOverhead <= contextLimit) { throwIfCancelled(knowledgeId); - return await callRepoSummary(oneShotPrompt); + return await callRepoSummary(oneShotPrompt, llmCallContext); } logger.info(`phase6: repo prompt > ${contextLimit} tokens; batching`); @@ -50,32 +61,75 @@ export async function summariseRepo(knowledgeId: string, metaPaths: MetaPaths): const partials: string[] = []; for (const batch of batches) { throwIfCancelled(knowledgeId); - const partial = await callRepoSummary(buildRepoPromptFromFolders(batch)); + const { summary: partial, tokenUsage } = await callRepoSummary(buildRepoPromptFromFolders(batch), llmCallContext); + totalInputTokens += tokenUsage.inputTokens; + totalOutputTokens += tokenUsage.outputTokens; + totalCostUsd += tokenUsage.costUsd; if (partial !== null) { partials.push(JSON.stringify(partial)); } } if (partials.length === 0) { - return null; + return { + summary: null, + tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens, costUsd: totalCostUsd }, + }; } if (partials.length === 1) { - return JSON.parse(partials[0] ?? "null") as RepoSummary | null; + return { + summary: JSON.parse(partials[0] ?? 
"null") as RepoSummary | null, + tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens, costUsd: totalCostUsd }, + }; } throwIfCancelled(knowledgeId); - return await callRepoSummary(buildRepoMergePrompt(partials)); + const { summary: final, tokenUsage: finalUsage } = await callRepoSummary( + buildRepoMergePrompt(partials), + llmCallContext, + ); + return { + summary: final, + tokenUsage: { + inputTokens: totalInputTokens + finalUsage.inputTokens, + outputTokens: totalOutputTokens + finalUsage.outputTokens, + costUsd: totalCostUsd + finalUsage.costUsd, + }, + }; } -async function callRepoSummary(userPrompt: string): Promise { +async function callRepoSummary( + userPrompt: string, + llmCallContext?: AskLlmOptions, +): Promise<{ + summary: RepoSummary | null; + tokenUsage: { inputTokens: number; outputTokens: number; costUsd: number }; +}> { try { - const response = await askJsonLLM(REPO_SUMMARY_SYSTEM_PROMPT, userPrompt); + const response = await askJsonLLM(REPO_SUMMARY_SYSTEM_PROMPT, userPrompt, llmCallContext ?? {}); if (response.result === null) { - return null; + return { + summary: null, + tokenUsage: { + inputTokens: response.usage.inputTokens, + outputTokens: response.usage.outputTokens, + costUsd: response.usage.costUsd, + }, + }; } - return shapeRepoSummary(response.result); + return { + summary: shapeRepoSummary(response.result), + tokenUsage: { + inputTokens: response.usage.inputTokens, + outputTokens: response.usage.outputTokens, + costUsd: response.usage.costUsd, + }, + }; } catch (cause: unknown) { + if (cause instanceof LlmConfigError || cause instanceof LlmError) { + throw cause; + } const msg = cause instanceof Error ? 
cause.message : String(cause); logger.warn(`callRepoSummary: askJsonLLM failed: ${msg}`); - return null; + return { summary: null, tokenUsage: { inputTokens: 0, outputTokens: 0, costUsd: 0 } }; } } diff --git a/packages/ingest-github/src/strategies/flat-folder/scan-manifest.ts b/packages/ingest-github/src/strategies/flat-folder/scan-manifest.ts new file mode 100644 index 0000000..5caee3b --- /dev/null +++ b/packages/ingest-github/src/strategies/flat-folder/scan-manifest.ts @@ -0,0 +1,61 @@ +import { readFile, writeFile } from "node:fs/promises"; +import type { MetaPaths } from "#src/types/meta-paths.ts"; + +export type ScanEntryKind = "small" | "big" | "oversized"; + +export interface ScanManifestEntry { + relativePath: string; + absolutePath: string; + sizeBytes: number; + tokenCount: number; + kind: ScanEntryKind; + estimatedChunks?: number; +} + +export interface ScanManifestSummary { + totalFiles: number; + smallCount: number; + bigCount: number; + oversizedCount: number; + totalTokens: number; + estimatedBigChunks: number; +} + +export interface ScanManifest { + generatedAt: string; + summary: ScanManifestSummary; + entries: ScanManifestEntry[]; +} + +export function emptyManifest(): ScanManifest { + return { + generatedAt: new Date().toISOString(), + summary: { totalFiles: 0, smallCount: 0, bigCount: 0, oversizedCount: 0, totalTokens: 0, estimatedBigChunks: 0 }, + entries: [], + }; +} + +export async function writeScanManifest(metaPaths: MetaPaths, manifest: ScanManifest): Promise { + await writeFile(metaPaths.scanManifestJson, JSON.stringify(manifest, null, 2), "utf8"); +} + +export async function readScanManifest(metaPaths: MetaPaths): Promise { + try { + const raw = await readFile(metaPaths.scanManifestJson, "utf8"); + const parsed: unknown = JSON.parse(raw); + if (!isManifest(parsed)) { + return null; + } + return parsed; + } catch { + return null; + } +} + +function isManifest(value: unknown): value is ScanManifest { + if (typeof value !== "object" || 
value === null) { + return false; + } + const rec = value as Record; + return Array.isArray(rec["entries"]) && typeof rec["summary"] === "object" && typeof rec["generatedAt"] === "string"; +} diff --git a/packages/ingest-github/src/strategies/flat-folder/store-pull.ts b/packages/ingest-github/src/strategies/flat-folder/store-pull.ts index 9b74fad..d070c42 100644 --- a/packages/ingest-github/src/strategies/flat-folder/store-pull.ts +++ b/packages/ingest-github/src/strategies/flat-folder/store-pull.ts @@ -10,15 +10,15 @@ import { } from "@bb/neo4j"; import { deleteRawFiles } from "@bb/mongo"; import type { GithubIndexPayload } from "@bb/types"; -import type { MetaPaths } from "src/types/meta-paths.ts"; -import type { CondensedFileAnalysis } from "src/types/condensed-file-analysis.ts"; -import { throwIfCancelled } from "src/pipeline/cancellation.ts"; -import type { DiffResult } from "src/pipeline/git-diff.ts"; -import { readCondensed } from "src/strategies/flat-folder/big-file/storage.ts"; -import { iterateFolderSummaries } from "src/strategies/flat-folder/folder-summary.ts"; -import { directFolderOf } from "src/strategies/flat-folder/folder-path.ts"; -import { languageFromPath } from "src/adapters/llm-file-analyzer.ts"; -import type { FolderSummary, RepoSummary, RepoSummaryEnvelope } from "src/strategies/flat-folder/types.ts"; +import type { MetaPaths } from "#src/types/meta-paths.ts"; +import type { CondensedFileAnalysis } from "#src/types/condensed-file-analysis.ts"; +import { throwIfCancelled } from "#src/pipeline/cancellation.ts"; +import type { DiffResult } from "#src/pipeline/git-diff.ts"; +import { readCondensed } from "#src/strategies/flat-folder/big-file/storage.ts"; +import { iterateFolderSummaries } from "#src/strategies/flat-folder/folder-summary.ts"; +import { directFolderOf } from "#src/strategies/flat-folder/folder-path.ts"; +import { languageFromPath } from "#src/adapters/llm-file-analyzer.ts"; +import type { FolderSummary, RepoSummary, 
RepoSummaryEnvelope } from "#src/strategies/flat-folder/types.ts"; export interface StorePullInput { scope: NodeScope; diff --git a/packages/ingest-github/src/strategies/flat-folder/types.ts b/packages/ingest-github/src/strategies/flat-folder/types.ts index 9d33168..15ac29f 100644 --- a/packages/ingest-github/src/strategies/flat-folder/types.ts +++ b/packages/ingest-github/src/strategies/flat-folder/types.ts @@ -1,4 +1,4 @@ -import type { CondensedFileAnalysis } from "src/types/condensed-file-analysis.ts"; +import type { CondensedFileAnalysis } from "#src/types/condensed-file-analysis.ts"; export interface AnalyzedFileEntry { relativePath: string; diff --git a/packages/ingest-github/src/types/README.md b/packages/ingest-github/src/types/README.md index 99d26f5..1fd8479 100644 --- a/packages/ingest-github/src/types/README.md +++ b/packages/ingest-github/src/types/README.md @@ -11,16 +11,41 @@ Domain (sub-folder of `@bb/ingest-github`). - `strategy.ts` — `IngestStrategy`, `StrategyInput`, `StrategyResult`, `StrategyContext`. The strategy port the orchestrator dispatches to. + `StrategyContext` carries `{ knowledgeId, orgId, repoId, +llmCallContext? }`; `llmCallContext` is the optional `AskLlmOptions` + bag the runner builds from the job payload's LLM overrides and that + each phase forwards into its `askJsonLLM` / `askYesNoLLM` calls. Absent + in OSS standalone runs — calls fall back to `Config.OpenrouterApiKey`. - `pipeline.ts` — `ScannedFile`, `OversizedFile`, `ScanEntry`, `FileAnalyzer` port, `AnalyzedFileResult`, `PipelineDeps`, `PipelineSummary`, `SkipDecider` / `SkipDeciderInput` / `SkipDecision` (the unknown-extension - gate port; implementation lives under `pipeline/skip-decisions/`), + gate port; implementation lives under `pipeline/skip-decisions/`). 
The + `SkipDecider` interface exposes four methods: `decide` (legacy async + single-shot), `decideStatic` (synchronous; returns the resolved decision + or `null` to signal "needs an LLM call"), `decideAndDeferSave` (async LLM + call that mutates the in-memory cache without flushing to disk), and + `persist` (one-shot cache flush). The two-pass scan in `scan.ts` uses the + latter three so unknown-extension probes fan out under the shared LLM + limiter and the disk cache is written exactly once at the end of the + batch. `SourceReader` / `ScanDeps` (the repository-read abstraction; default - implementation in `pipeline/disk-source-reader.ts`), `ArchiveSink` / + implementation in `pipeline/disk-source-reader.ts`). `ScanDeps.limiter` + is the optional shared `ConcurrencyLimiter`; when supplied together with + `skipDecider`, `scanRepository` switches to its two-pass strategy + instead of the legacy inline-await walk. + `ArchiveSink` / `ArchiveSinkInput` (an optional non-fatal sink that the open-source - binary never calls), and `SourceFactory` / `SourceFactoryInput` / - `SourceFactoryResult` (the optional injection hook surfaced through - `registerGithubWorkers`; see `docs/extension-points.md`). + binary never calls), `SourceFactory` / `SourceFactoryInput` / + `SourceFactoryResult` (the optional index-side injection hook surfaced + through `registerGithubWorkers`), and `PullFactory` / `PullFactoryInput` + / `PullFactoryResult` (the analogous pull-side injection hook). + `FileAnalyzer.analyze()`, `SkipDeciderInput`, and `ScanDeps` each accept + an optional `llmCallContext?: AskLlmOptions` so per-job credentials + flow from `StrategyContext` into every LLM call site without breaking + the OSS standalone (defaults to undefined → config-driven). Both + factories are documented in `docs/extension-points.md`. The two are + separate because pull additionally needs a `diff` and a resolved + `targetCommit`, which index doesn't. 
- `meta-paths.ts` — `MetaPaths` shape (`~/.bytebell/repos/.meta//...`). - `file-analysis.ts` — `FALLBACK_LANGUAGE = "unknown"` and `emptyFileAnalysis()` factory. Both consumed by the LLM adapter and the big-file condenser. diff --git a/packages/ingest-github/src/types/big-file.ts b/packages/ingest-github/src/types/big-file.ts index b681670..4d73838 100644 --- a/packages/ingest-github/src/types/big-file.ts +++ b/packages/ingest-github/src/types/big-file.ts @@ -27,6 +27,7 @@ export interface ChunkAnalysisResult { endLine: number; language: string; analysis: FileAnalysis; + tokenUsage?: { inputTokens: number; outputTokens: number; costUsd: number } | undefined; } export interface HugeFileManifest { diff --git a/packages/ingest-github/src/types/condensed-file-analysis.ts b/packages/ingest-github/src/types/condensed-file-analysis.ts index 337555b..eeee56d 100644 --- a/packages/ingest-github/src/types/condensed-file-analysis.ts +++ b/packages/ingest-github/src/types/condensed-file-analysis.ts @@ -11,4 +11,5 @@ export interface CondensedFileAnalysis { totalTokenCount: number; analysedAt: string; analysis: FileAnalysis; + tokenUsage?: { inputTokens: number; outputTokens: number; costUsd: number } | undefined; } diff --git a/packages/ingest-github/src/types/meta-paths.ts b/packages/ingest-github/src/types/meta-paths.ts index 8898df3..5da4f89 100644 --- a/packages/ingest-github/src/types/meta-paths.ts +++ b/packages/ingest-github/src/types/meta-paths.ts @@ -5,5 +5,6 @@ export interface MetaPaths { bigFileAnalysisDir: string; bigFileChunksDir: string; bigFilesJson: string; + scanManifestJson: string; repoSummaryJson: string; } diff --git a/packages/ingest-github/src/types/pipeline.ts b/packages/ingest-github/src/types/pipeline.ts index b761cef..aaf13a5 100644 --- a/packages/ingest-github/src/types/pipeline.ts +++ b/packages/ingest-github/src/types/pipeline.ts @@ -1,5 +1,8 @@ -import type { GithubIndexPayload } from "@bb/types"; +import type { GithubIndexPayload, 
GithubPullPayload } from "@bb/types"; +import type { AskLlmOptions } from "@bb/llm"; import type { FileAnalysis } from "@bb/mongo"; +import type { ConcurrencyLimiter } from "#src/pipeline/concurrency.ts"; +import type { DiffResult } from "#src/pipeline/git-diff.ts"; export interface ScannedFile { kind: "file"; @@ -21,10 +24,20 @@ export type ScanEntry = ScannedFile | OversizedFile; export interface AnalyzedFileResult { language: string; analysis: FileAnalysis; + tokenUsage?: { inputTokens: number; outputTokens: number; costUsd: number } | undefined; } export interface FileAnalyzer { - analyze(input: { relativePath: string; content: string }): Promise; + analyze(input: { + relativePath: string; + content: string; + /** + * Per-job LLM credential overrides. When set, passed to `askJsonLLM` so + * the file analysis uses the caller-supplied credentials instead of + * `Config.OpenrouterApiKey`. Absent in OSS standalone. + */ + llmCallContext?: AskLlmOptions; + }): Promise; } export interface PipelineSummary { @@ -33,6 +46,7 @@ export interface PipelineSummary { repoSummarised: boolean; graphNodesWritten: number; commitHash: string; + tokenUsage: { inputTokens: number; outputTokens: number; costUsd: number }; } export interface PipelineDeps { @@ -41,6 +55,19 @@ export interface PipelineDeps { export interface ScanDeps { skipDecider?: SkipDecider; + /** + * Per-job LLM credential overrides forwarded to the skip-decider when it + * invokes the LLM branch. Absent in OSS standalone runs. + */ + llmCallContext?: AskLlmOptions; + /** + * Shared LLM-concurrency limiter. When set, `scanRepository` uses a + * two-pass strategy: walk + cache-only decisions in pass 1, parallel + * deduplicated LLM resolution under this limiter in pass 2, drain the + * pending list in pass 3 (all cache-hits). When absent (e.g. legacy + * `SourceFactory` consumers), scan falls back to inline-await per file. 
+ */ + limiter?: ConcurrencyLimiter; } export interface SourceReader { @@ -88,6 +115,35 @@ export interface SourceFactoryResult { */ export type SourceFactory = (input: SourceFactoryInput) => Promise; +export interface PullFactoryInput { + knowledgeId: string; + payload: GithubPullPayload; + /** The commit currently anchored on the knowledge in Mongo. The factory diffs from here to `targetCommit`. */ + currentCommit: string; + /** Branch the knowledge tracks. The factory resolves the target commit relative to this branch. */ + branch: string; +} + +export interface PullFactoryResult { + /** Reader pinned at the resolved target commit; used by every downstream phase for file I/O. */ + source: SourceReader; + /** Files changed between `currentCommit` and the resolved target. Same shape as `git diff --name-status`. */ + diff: DiffResult; + /** Resolved target commit hash. Either the payload's `targetCommitHash` or the branch HEAD chosen by the factory. */ + targetCommit: string; + /** Optional non-fatal sink. When set, the strategy archives analysed content via `push` after each file. */ + archiveSink?: ArchiveSink; +} + +/** + * Optional injection hook used by `registerGithubWorkers` for pull jobs. + * When provided, `runPull` skips `syncRepository` + `computePullDiff` + + * `checkoutCommit` and uses the factory's reader + diff directly. The + * open-source binary leaves this undefined and pull runs against a local + * git clone via `node:child_process`. + */ +export type PullFactory = (input: PullFactoryInput) => Promise; + export type SkipDecision = "accept" | "reject-static" | "reject-llm" | "accept-llm"; export interface SkipDeciderInput { @@ -96,8 +152,40 @@ export interface SkipDeciderInput { ext: string; /** Pre-loaded content. When set, the LLM branch uses this instead of reading absolutePath from disk. */ content?: string; + /** + * Per-job LLM credential overrides. 
When set and the decider invokes the + * LLM branch, these credentials override `Config.OpenrouterApiKey`. Absent + * in OSS standalone — the LLM branch falls back to the configured key. + */ + llmCallContext?: AskLlmOptions; } export interface SkipDecider { + /** + * Single-shot decision: applies static filters, consults the in-memory + * + on-disk caches, and falls through to the LLM when neither resolves + * the decision. Persists the cache to disk after each LLM call. + * Kept for non-scan callers and the legacy inline-await path. + */ decide(input: SkipDeciderInput): Promise; + /** + * Synchronous static-only decision. Returns the resolved `SkipDecision` + * when the static filters or a cache hit resolve it; returns `null` to signal + * "this needs an LLM call to resolve". Used by `scanRepository` in its + * two-pass mode to collect pending entries without blocking the walk. + */ + decideStatic(input: SkipDeciderInput): SkipDecision | null; + /** + * Asynchronous LLM-resolution path that **mutates the in-memory cache** + * but does NOT persist to disk. The caller (typically `scanRepository`) + * batches these under a `ConcurrencyLimiter` and then calls `persist()` + * exactly once at the end of the batch, so concurrent `saveCache` calls + * don't race on the tmp/rename atomicity. + */ + decideAndDeferSave(input: SkipDeciderInput): Promise; + /** + * Persist the in-memory decision cache to disk. Best-effort: swallows + * I/O errors. Called once at the end of a `decideAndDeferSave` batch. 
+ */ + persist(): void; } diff --git a/packages/ingest-github/src/types/strategy.ts b/packages/ingest-github/src/types/strategy.ts index 1b95537..a9e6bee 100644 --- a/packages/ingest-github/src/types/strategy.ts +++ b/packages/ingest-github/src/types/strategy.ts @@ -1,4 +1,5 @@ import type { GithubIndexPayload } from "@bb/types"; +import type { AskLlmOptions } from "@bb/llm"; import type { MetaPaths } from "./meta-paths.ts"; import type { ArchiveSink, SourceReader } from "./pipeline.ts"; @@ -6,6 +7,13 @@ export interface StrategyContext { knowledgeId: string; orgId: string; repoId: string; + /** + * Per-job LLM credential overrides extracted from the job payload. When + * present, phases pass these to every `askLLM` / `askJsonLLM` call so the + * per-org credential reaches the LLM provider. Absent in OSS standalone + * runs, where calls fall back to `Config.OpenrouterApiKey`. + */ + llmCallContext?: AskLlmOptions; } export interface StrategyInput { @@ -22,6 +30,7 @@ export interface StrategyResult { foldersSummarised: number; repoSummarised: boolean; graphNodesWritten: number; + tokenUsage: { inputTokens: number; outputTokens: number; costUsd: number }; } export interface IngestStrategy { diff --git a/packages/ingest-github/tsconfig.json b/packages/ingest-github/tsconfig.json index 0fd9bd8..4ed0786 100644 --- a/packages/ingest-github/tsconfig.json +++ b/packages/ingest-github/tsconfig.json @@ -1,15 +1,4 @@ { "extends": "../../tsconfig.base.json", - "compilerOptions": { - "rootDir": "./src", - "outDir": "./dist", - "baseUrl": ".", - "paths": { - "src/*": ["./src/*"] - }, - "ignoreDeprecations": "5.0", - "noEmit": false, - "emitDeclarationOnly": true - }, - "include": ["src/**/*", "src/**/*.json"] + "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/ingest-github/types/README.md b/packages/ingest-github/types/README.md new file mode 100644 index 0000000..7bea793 --- /dev/null +++ b/packages/ingest-github/types/README.md @@ -0,0 +1,39 
@@ +# `@bb/ingest-github/types` + +Hand-written type declarations for the `@bb/ingest-github` package's public surface. Consumed by TypeScript via `package.json` `"types": "./types/index.d.ts"`. Not executed at runtime — runtime resolves through `package.json` `"main": "./src/index.ts"`. + +## Tier + +Domain (sub-folder of `@bb/ingest-github`). + +## Responsibility + +Provide a stable, loosely-typed declaration of every public export of `@bb/ingest-github`. This shim short-circuits TypeScript before it walks into `src/`, which uses package-local `src/*` path aliases that don't resolve under a consumer's tsconfig context. + +Without this shim, any external project that imports `@bb/ingest-github` and runs `tsc -b` would fail on `TS2307: Cannot find module 'src/types/foo.ts'` errors from the package's internal imports. + +## Public Interface + +`./index.d.ts` declares every exported symbol of the runtime `src/index.ts`. Function signatures are intentionally permissive (`(...args: any[]) => any` in many cases) — full type fidelity is sacrificed for resolution stability. + +When `src/index.ts` adds or renames a public export, `index.d.ts` must be updated in the same commit. + +## Data Ownership + +None. This directory contains only type declaration files and does not own or persist any runtime data. + +## External Dependencies + +None. + +## Invariants + +1. **Never imported by `src/`.** This is a consumer-facing artifact only. +2. **Mirror of `src/index.ts` exports.** A symbol exported here that doesn't exist in `src/index.ts` is a leak; a symbol exported from `src/` but not here will appear as `any` to consumers at best, or break their typecheck at worst. +3. **No runtime code.** Pure `.d.ts` declarations only. + +## Out of Scope + +- Full structural types for complex shapes (use `any` / `unknown` where resolution stability is preferred). +- Generic constraints (keep signatures flat). +- Documentation comments (the source in `src/` is authoritative). 
diff --git a/packages/llm/README.md b/packages/llm/README.md index 96b3719..64e6cef 100644 --- a/packages/llm/README.md +++ b/packages/llm/README.md @@ -18,30 +18,34 @@ selected by `Config.LlmProvider` (`"openrouter"` default, or - `askLLM(prompt, opts?)` — dispatches to either `src/openrouter.ts` or `src/ollama.ts` depending on `Config.LlmProvider`. Returns - `{ content, usage: { model, inputTokens, outputTokens } }`. Caller - never sees the provider; the result shape is identical across - backends. + `{ content, usage: { model, inputTokens, outputTokens, costUsd } }`. + Caller never sees the provider; the result shape is identical across + backends. `costUsd` is the provider-reported USD cost for that single + call — taken straight from the provider's response, never computed + client-side. - **OpenRouter mode** — POST to OpenRouter's chat-completions endpoint using `Config.OpenrouterApiKey` + `Config.OpenrouterModel` as the primary model, plus `Config.OpenrouterFallbackModel1..4` as the fallback chain. The request body includes a `models: [...]` array - when the deduplicated chain has ≥2 non-empty entries; OpenRouter - routes among them and bills only the responder. `usage.model` is the - actual model the gateway picked. Tokens come straight from - OpenRouter's `usage.prompt_tokens` / `usage.completion_tokens`. + when the deduplicated chain has ≥2 non-empty entries and always sends + `usage: { include: true }` so OpenRouter populates `usage.cost` in + the response. The body also pins `provider: { allow_fallbacks: false }` + so OpenRouter does not silently cycle across upstream providers of the + same model — a slow or sick provider surfaces a real error to us + instead of consuming the wall-clock budget. Model-level fallback + through the `models` chain is unaffected. `usage.model` is the actual + model the gateway picked. 
Tokens come straight from OpenRouter's + `usage.prompt_tokens` / `usage.completion_tokens`; `costUsd` from + `usage.cost` (defaults to `0` when the provider omits it — common for + `:free` models). - **Ollama mode** — POST to `${Config.OllamaUrl}/api/chat` with `{ model: Config.OllamaModel, messages, stream: false }`. Single model per request — no fallback chain (Ollama does not have a multi-model fan-out). The model string is free-form: any model the user has pulled into their Ollama daemon works (`llama3.1`, `qwen2.5-coder:7b`, custom Modelfile names — we do not validate). - `inputTokens` ← `prompt_eval_count`, `outputTokens` ← `eval_count`. - Cost is reported as `$0` (see `estimateCostUsd` short-circuit). -- `estimateCostUsd(model, inputTokens, outputTokens)` and - `estimateCostFromBreakdown(modelTokens)` — async cost helpers backed - by a one-shot fetch of OpenRouter's `/api/v1/models` (cached in module - scope for the process lifetime). Returns `-1` when the model has no - published pricing. + `inputTokens` ← `prompt_eval_count`, `outputTokens` ← `eval_count`, + `costUsd` ← `0` (Ollama is keyless / local). 
- AbortController-based timeout (default 90s, matches the kube-package reference `askLLM` shape) - Typed errors via `@bb/errors`: `LlmConfigError` (missing key) and @@ -56,24 +60,29 @@ selected by `Config.LlmProvider` (`"openrouter"` default, or ```ts function askLLM(prompt: string, opts?: AskLlmOptions): Promise; -function estimateCostUsd(model: string, inputTokens: number, outputTokens: number): Promise; -function estimateCostFromBreakdown(modelTokens: ModelTokenBreakdown): Promise; function tokenLen(text: string): number; function encodeTokens(text: string): number[]; function decodeTokens(tokens: number[]): string; +type LlmProviderName = "openrouter" | "ollama"; interface AskLlmOptions { model?: string; // overrides Config.OpenrouterModel fallbackModels?: string[]; // overrides Config.OpenrouterFallbackModel1..4 timeoutMs?: number; // default 90_000 systemPrompt?: string; // optional system role message + apiKey?: string; // per-call OpenRouter key override (ignored for Ollama); skips Config.OpenrouterApiKey + provider?: LlmProviderName; // per-call provider override; skips Config.LlmProvider } interface AskLlmResult { content: string; - usage: { model: string; inputTokens: number; outputTokens: number }; + usage: { model: string; inputTokens: number; outputTokens: number; costUsd: number }; } ``` +Local-pricing helpers (`estimateCostUsd`, `estimateCostFromBreakdown`) +have been removed — cost is now sourced directly from +`response.usage.cost` returned by OpenRouter. + The package has no module-scoped HTTP client. Each `askLLM` call constructs its own `fetch` request. @@ -124,10 +133,19 @@ it. The cost ledger described in [docs/arch.md](../../docs/arch.md) is at `https://openrouter.ai/api/v1/chat/completions`; Ollama URL is user-configured via `Config.OllamaUrl` (default `http://localhost:11434`). Provider is selected by - `Config.LlmProvider`. -2. **No env reads.** API key + model come from `getConfigValue(...)`. No - `process.env`, no `.env`. 
Repo-wide ESLint rule blocks `process.env`. -3. **OpenRouter-native fallback chain.** The request body sends + `Config.LlmProvider`, or by `opts.provider` when the caller wants to + override on a per-call basis. +2. **Per-call credential override.** When `opts.apiKey` is set, the + OpenRouter call uses it directly and skips `Config.OpenrouterApiKey`. + This is the extension point that lets downstream consumers (e.g. the + enterprise wrapper) pre-resolve per-org credentials at the enqueue + boundary and pass them through job payloads, without the LLM client + knowing anything about per-org resolution. The Ollama provider is + keyless and ignores `opts.apiKey`. +3. **No env reads.** API key + model come from `getConfigValue(...)` or + `opts.apiKey`. No `process.env`, no `.env`. Repo-wide ESLint rule + blocks `process.env`. +4. **OpenRouter-native fallback chain.** The request body sends `models: [primary, ...fallbacks]` whenever the deduplicated chain has ≥2 entries. Primary is `Config.OpenrouterModel`; fallbacks come from four discrete slots `Config.OpenrouterFallbackModel1` through @@ -138,12 +156,21 @@ it. The cost ledger described in [docs/arch.md](../../docs/arch.md) is sees a single `AskLlmResult`. BullMQ's `attempts: 3` wraps the whole call — retries walk the chain again, useful when a transient OpenRouter outage clears between retries. -4. **Errors are typed, not strings.** `LlmConfigError` carries the exact + 4a. **No upstream-provider fallback.** Every request carries + `provider: { allow_fallbacks: false }`. This is orthogonal to the + `models` chain in invariant 4 — `models` controls _which model_ the + gateway tries; `allow_fallbacks` controls whether OpenRouter routes + to a different upstream backend serving the same model when the first + one stalls. 
We disable the latter so a slow provider cannot eat the + wall-clock without ever producing tokens; the surfaced error becomes + actionable (specific provider, specific status) instead of a generic + timeout. +5. **Errors are typed, not strings.** `LlmConfigError` carries the exact `bytebell keys set` hint; `LlmError` carries `cause`. -5. **Timeout is enforced.** AbortController fires at `timeoutMs`; the +6. **Timeout is enforced.** AbortController fires at `timeoutMs`; the resulting `AbortError` is wrapped in `LlmError` with the timeout in the message. -6. **Tokenizer is module-cached.** `tiktoken`'s `cl100k_base` encoder +7. **Tokenizer is module-cached.** `tiktoken`'s `cl100k_base` encoder is lazy-initialized on first `tokenLen` call and reused for the process lifetime. Chosen because every modern OpenRouter chat model tokenizes within ~10% of `cl100k_base` for code-shaped input. Char/4 diff --git a/packages/llm/package.json b/packages/llm/package.json index 3591373..32be323 100644 --- a/packages/llm/package.json +++ b/packages/llm/package.json @@ -8,9 +8,13 @@ "exports": { ".": "./src/index.ts" }, + "imports": { + "#src/*": "./src/*" + }, "dependencies": { "@bb/config": "workspace:*", "@bb/errors": "workspace:*", + "@bb/logger": "workspace:*", "@bb/mongo": "workspace:*", "@bb/types": "workspace:*", "tiktoken": "^1.0.22" diff --git a/packages/llm/src/README.md b/packages/llm/src/README.md index 33fbafa..1b3bba7 100644 --- a/packages/llm/src/README.md +++ b/packages/llm/src/README.md @@ -6,20 +6,40 @@ package-level contract; this file documents how the source tree is split. ## Files - **[index.ts](index.ts)** — public re-exports. The only entry point other - packages may import. Exposes `askLLM` and the `AskLlmOptions` type. - Anything not re-exported here is internal. -- **[client.ts](client.ts)** — the `askLLM` implementation. 
Reads - `Config.OpenrouterApiKey`, the primary `Config.OpenrouterModel`, and - the four fallback slots `Config.OpenrouterFallbackModel1..4` via - `@bb/config`. Builds the deduplicated chain `[primary, ...nonEmpty -(slot1..4)]`; if the chain has ≥2 entries the request body includes a - `models: [...]` array so OpenRouter routes among them natively. Builds - the `messages` array (optional system prompt + user prompt), POSTs to - OpenRouter via Bun's built-in `fetch` with an AbortController timeout, - parses the typed `OpenRouterResponse`, returns the first choice's - content. `usage.model` reflects which model OpenRouter actually - routed to. Throws `LlmConfigError` if the API key is empty, `LlmError` - on timeout / HTTP non-2xx / empty completion. + packages may import. Exposes `askLLM`, the `AskLlmOptions` type, the + `LlmProviderName` union (`"openrouter" | "ollama"`), plus the JSON + client surface. Anything not re-exported here is internal. +- **[client.ts](client.ts)** — the `askLLM` orchestrator. Selects the + active provider via `opts.provider ?? getConfigValue(Config.LlmProvider)` + (per-call override beats config), dispatches to `openrouter.ts` or + `ollama.ts`. Consults the filesystem decision cache before issuing a + request. Throws typed errors via `@bb/errors`. +- **[openrouter.ts](openrouter.ts)** — `callOpenRouter` and + `resolveOpenRouterChain`. Reads the API key as `opts.apiKey +?? getConfigValue(Config.OpenrouterApiKey)` (per-call override beats + config), reads the model chain (`opts.model`, `opts.fallbackModels`, + or `Config.OpenrouterModel` + four fallback slots), caps the chain at + 3 entries (OpenRouter's hard limit), POSTs to the chat-completions + endpoint with an AbortController timeout, parses the typed + `OpenRouterResponse`, returns the first choice's content. 
The body + always carries `provider: { allow_fallbacks: false }` so OpenRouter + cannot silently route across upstream providers of the same model; + see `OpenRouterProviderRouting` in this file and invariant 4a in the + package README. `usage.model` reflects which model OpenRouter actually + routed to. Throws `LlmConfigError` if the API key resolves to empty, + `LlmError` on timeout / HTTP non-2xx / empty completion. +- **[ollama.ts](ollama.ts)** — `callOllama` and `resolveOllamaChain`. + Single-model per request (Ollama has no fan-out). Reads model from + `opts.model ?? Config.OllamaModel`. Ignores `opts.apiKey` (Ollama is + keyless). +- **[jsonClient.ts](jsonClient.ts)** — `askJsonLLM`, `askYesNoLLM`, + `tryParseJson`, `stripJsonFence`. Wraps `askLLM` with JSON-strict + retry logic. Forwards `opts` (including `apiKey` / `provider` / `model`) + to `askLLM` unchanged. +- **[cache.ts](cache.ts)** — filesystem-backed decision cache. Key + includes `provider` and `modelChain`; `opts.apiKey` is intentionally + NOT part of the key (the cached decision is the same regardless of + which key produced it — keys are auth, not semantic input). - **[tokenizer.ts](tokenizer.ts)** — `tokenLen`, `encodeTokens`, `decodeTokens`. Module-cached `tiktoken` encoder using `cl100k_base`, lazy-initialized via `get_encoding`. All three helpers fall back to @@ -56,8 +76,9 @@ pricing). `bytebell keys set` hint; `LlmError` accepts an optional `cause` and composes a single-line message capped at 500 chars of any HTTP error body (so the logger doesn't blow up on multi-MB error responses). -- **No env reads.** Only `getConfigValue(Config.OpenrouterApiKey)` / - `getConfigValue(Config.OpenrouterModel)` provide secrets/config. +- **No env reads.** Secrets come from `opts.apiKey` first, then + `getConfigValue(Config.OpenrouterApiKey)`. Same fallback shape for the + provider switch via `opts.provider` → `Config.LlmProvider`. 
- **Empty completions are errors.** A 200 OK with no `choices[0].message .content` throws `LlmError("OpenRouter returned empty completion")` — do not silently return an empty string. diff --git a/packages/llm/src/cache.ts b/packages/llm/src/cache.ts index a98f3e3..c1e13a1 100644 --- a/packages/llm/src/cache.ts +++ b/packages/llm/src/cache.ts @@ -1,4 +1,3 @@ -// SPDX-License-Identifier: AGPL-3.0-only WITH non-commercial-clause import { createHash } from "node:crypto"; import fs from "node:fs/promises"; import path from "node:path"; @@ -58,6 +57,10 @@ export async function getCachedDecision(key: string): Promise { - const provider = getConfigValue(Config.LlmProvider); + const provider: LlmProviderName = opts.provider ?? (getConfigValue(Config.LlmProvider) as LlmProviderName); const timeoutMs = opts.timeoutMs ?? DEFAULT_TIMEOUT_MS; const chain = provider === "ollama" ? resolveOllamaChain(opts) : resolveOpenRouterChain(opts); @@ -43,11 +64,11 @@ export async function askLLM(prompt: string, opts: AskLlmOptions = {}): Promise< const cached = await getCachedDecision(cacheKey); if (cached !== null) { const saved = cached.usage.inputTokens + cached.usage.outputTokens; - console.info(`[LLM CACHE HIT] key=${cacheKey.slice(0, 8)} tokens-saved=${saved}`); + logger.debug(`llm: cache hit (key=${cacheKey.slice(0, 8)}, tokens-saved=${saved})`); void recordHit(cacheKey); return { content: cached.content, usage: cached.usage }; } - console.info(`[LLM CACHE MISS] key=${cacheKey.slice(0, 8)}`); + logger.debug(`llm: cache miss (key=${cacheKey.slice(0, 8)})`); } const result = diff --git a/packages/llm/src/index.ts b/packages/llm/src/index.ts index 8137b29..d65beaa 100644 --- a/packages/llm/src/index.ts +++ b/packages/llm/src/index.ts @@ -1,7 +1,6 @@ export { askLLM } from "./client.ts"; -export type { AskLlmOptions, AskLlmResult, AskLlmUsage } from "./client.ts"; +export type { AskLlmOptions, AskLlmResult, AskLlmUsage, LlmProviderName } from "./client.ts"; export { askJsonLLM, 
askYesNoLLM, tryParseJson, stripJsonFence } from "./jsonClient.ts"; export type { AskJsonLlmOptions, AskJsonLlmResult, AskYesNoLlmResult } from "./jsonClient.ts"; -export { estimateCostUsd, estimateCostFromBreakdown } from "./pricing.ts"; export { tokenLen, encodeTokens, decodeTokens } from "./tokenizer.ts"; export { UsageTracker } from "./usageTracker.ts"; diff --git a/packages/llm/src/jsonClient.ts b/packages/llm/src/jsonClient.ts index 55f0034..20337d2 100644 --- a/packages/llm/src/jsonClient.ts +++ b/packages/llm/src/jsonClient.ts @@ -61,7 +61,7 @@ export async function askYesNoLLM( } return { decision: null, usage, raw: content }; } catch { - return { decision: null, usage: { model: "", inputTokens: 0, outputTokens: 0 }, raw: "" }; + return { decision: null, usage: { model: "", inputTokens: 0, outputTokens: 0, costUsd: 0 }, raw: "" }; } } @@ -73,18 +73,37 @@ export async function askJsonLLM( const maxRetries = opts.maxRetries ?? 1; const baseOpts: AskLlmOptions = { ...opts, systemPrompt }; - let lastUsage: AskLlmUsage = { model: "", inputTokens: 0, outputTokens: 0 }; + let totalInputTokens = 0; + let totalOutputTokens = 0; + let totalCostUsd = 0; + let lastModel = ""; let lastRaw = ""; for (let attempt = 0; attempt <= maxRetries; attempt += 1) { const { content, usage } = await askLLM(userPrompt, baseOpts); - lastUsage = usage; + totalInputTokens += usage.inputTokens; + totalOutputTokens += usage.outputTokens; + totalCostUsd += usage.costUsd; + lastModel = usage.model; lastRaw = content; const parsed = tryParseJson(content); if (parsed !== null) { - return { result: parsed, usage, raw: content }; + return { + result: parsed, + usage: { + model: usage.model, + inputTokens: totalInputTokens, + outputTokens: totalOutputTokens, + costUsd: totalCostUsd, + }, + raw: content, + }; } } - return { result: null, usage: lastUsage, raw: lastRaw }; + return { + result: null, + usage: { model: lastModel, inputTokens: totalInputTokens, outputTokens: totalOutputTokens, 
costUsd: totalCostUsd }, + raw: lastRaw, + }; } diff --git a/packages/llm/src/ollama.ts b/packages/llm/src/ollama.ts index 2f131f2..096b7ac 100644 --- a/packages/llm/src/ollama.ts +++ b/packages/llm/src/ollama.ts @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: AGPL-3.0-only WITH non-commercial-clause import { getConfigValue } from "@bb/config"; import { Config } from "@bb/types"; import { LlmConfigError, LlmError } from "@bb/errors"; +import { tokenLen } from "./tokenizer.ts"; import type { AskLlmOptions, AskLlmResult } from "./client.ts"; interface OllamaMessage { @@ -85,8 +85,12 @@ export async function callOllama(prompt: string, opts: AskLlmOptions, timeoutMs: content, usage: { model: typeof json.model === "string" && json.model.length > 0 ? json.model : model, - inputTokens: typeof json.prompt_eval_count === "number" ? json.prompt_eval_count : 0, - outputTokens: typeof json.eval_count === "number" ? json.eval_count : 0, + inputTokens: + typeof json.prompt_eval_count === "number" + ? json.prompt_eval_count + : tokenLen((opts.systemPrompt ?? "") + prompt), + outputTokens: typeof json.eval_count === "number" ? 
json.eval_count : tokenLen(content), + costUsd: 0, }, }; } diff --git a/packages/llm/src/openrouter.ts b/packages/llm/src/openrouter.ts index fc71acb..a4f99e7 100644 --- a/packages/llm/src/openrouter.ts +++ b/packages/llm/src/openrouter.ts @@ -1,7 +1,7 @@ -// SPDX-License-Identifier: AGPL-3.0-only WITH non-commercial-clause import { getConfigValue } from "@bb/config"; import { Config } from "@bb/types"; import { LlmConfigError, LlmError } from "@bb/errors"; +import { tokenLen } from "./tokenizer.ts"; import type { AskLlmOptions, AskLlmResult } from "./client.ts"; const OPENROUTER_URL = "https://openrouter.ai/api/v1/chat/completions"; @@ -11,10 +11,28 @@ interface OpenRouterMessage { content: string; } +interface OpenRouterUsageAccounting { + /** + * Opt-in flag that asks OpenRouter to populate `usage.cost` in the + * response with the authoritative billed cost (in USD credits). Without + * this, OpenRouter omits the cost field. + */ + include: true; +} + +interface OpenRouterProviderRouting { + // Pin OpenRouter to the first viable upstream provider. Without this, + // OpenRouter silently cycles across providers on slow/failed calls and + // we lose the per-call wall-clock budget before a real error surfaces. + allow_fallbacks: boolean; +} + interface OpenRouterRequest { model: string; models?: string[]; messages: OpenRouterMessage[]; + usage: OpenRouterUsageAccounting; + provider: OpenRouterProviderRouting; } interface OpenRouterResponse { @@ -23,11 +41,12 @@ interface OpenRouterResponse { usage?: { prompt_tokens?: number; completion_tokens?: number; + cost?: number; }; } export function resolveOpenRouterChain(opts: AskLlmOptions): string[] { - const apiKey = getConfigValue(Config.OpenrouterApiKey); + const apiKey = opts.apiKey ?? 
getConfigValue(Config.OpenrouterApiKey); if (apiKey.length === 0) { throw new LlmConfigError("bytebell keys set"); } @@ -45,7 +64,7 @@ export function resolveOpenRouterChain(opts: AskLlmOptions): string[] { } export async function callOpenRouter(prompt: string, opts: AskLlmOptions, timeoutMs: number): Promise { - const apiKey = getConfigValue(Config.OpenrouterApiKey); + const apiKey = opts.apiKey ?? getConfigValue(Config.OpenrouterApiKey); const cappedChain = resolveOpenRouterChain(opts); const model = cappedChain[0] ?? opts.model ?? getConfigValue(Config.OpenrouterModel); @@ -55,8 +74,12 @@ export async function callOpenRouter(prompt: string, opts: AskLlmOptions, timeou } messages.push({ role: "user", content: prompt }); + const usageAccounting: OpenRouterUsageAccounting = { include: true }; + const providerRouting: OpenRouterProviderRouting = { allow_fallbacks: false }; const body: OpenRouterRequest = - cappedChain.length > 1 ? { model, models: cappedChain, messages } : { model, messages }; + cappedChain.length > 1 + ? { model, models: cappedChain, messages, usage: usageAccounting, provider: providerRouting } + : { model, messages, usage: usageAccounting, provider: providerRouting }; const controller = new AbortController(); const timer = setTimeout(() => controller.abort(), timeoutMs); @@ -82,7 +105,10 @@ export async function callOpenRouter(prompt: string, opts: AskLlmOptions, timeou if (!response.ok) { const text = await response.text().catch(() => ""); - throw new LlmError(`OpenRouter HTTP ${response.status}: ${text.slice(0, 500)}`); + throw new LlmError(`OpenRouter HTTP ${response.status}`, undefined, { + status: response.status, + detail: text.slice(0, 4000), + }); } const json = (await response.json()) as OpenRouterResponse; @@ -94,8 +120,13 @@ export async function callOpenRouter(prompt: string, opts: AskLlmOptions, timeou content, usage: { model: typeof json.model === "string" && json.model.length > 0 ? 
json.model : model, - inputTokens: typeof json.usage?.prompt_tokens === "number" ? json.usage.prompt_tokens : 0, - outputTokens: typeof json.usage?.completion_tokens === "number" ? json.usage.completion_tokens : 0, + inputTokens: + typeof json.usage?.prompt_tokens === "number" + ? json.usage.prompt_tokens + : tokenLen((opts.systemPrompt ?? "") + prompt), + outputTokens: + typeof json.usage?.completion_tokens === "number" ? json.usage.completion_tokens : tokenLen(content), + costUsd: typeof json.usage?.cost === "number" ? json.usage.cost : 0, }, }; } diff --git a/packages/llm/src/pricing.ts b/packages/llm/src/pricing.ts deleted file mode 100644 index 78da4d7..0000000 --- a/packages/llm/src/pricing.ts +++ /dev/null @@ -1,137 +0,0 @@ -import { getConfigValue } from "@bb/config"; -import { Config, type ModelTokenBreakdown } from "@bb/types"; - -const OPENROUTER_MODELS_URL = "https://openrouter.ai/api/v1/models"; -const PRICING_TIMEOUT_MS = 8_000; -const COST_UNKNOWN = -1; - -interface OpenRouterPricing { - prompt?: string; - completion?: string; -} - -interface OpenRouterModel { - id?: string; - pricing?: OpenRouterPricing; -} - -interface OpenRouterModelsResponse { - data?: OpenRouterModel[]; -} - -interface ModelPrice { - inputUsdPerToken: number; - outputUsdPerToken: number; -} - -let pricingCache: Map | null = null; -let pricingPromise: Promise> | null = null; - -async function fetchPricing(): Promise> { - const map = new Map(); - let response: Response; - try { - response = await fetch(OPENROUTER_MODELS_URL, { - signal: AbortSignal.timeout(PRICING_TIMEOUT_MS), - }); - } catch { - return map; - } - if (!response.ok) { - return map; - } - const json = (await response.json().catch(() => null)) as OpenRouterModelsResponse | null; - if (json === null || !Array.isArray(json.data)) { - return map; - } - for (const entry of json.data) { - if (typeof entry.id !== "string" || entry.id.length === 0) { - continue; - } - const promptStr = entry.pricing?.prompt; - const 
completionStr = entry.pricing?.completion; - const inputPrice = typeof promptStr === "string" ? Number.parseFloat(promptStr) : Number.NaN; - const outputPrice = typeof completionStr === "string" ? Number.parseFloat(completionStr) : Number.NaN; - if (!Number.isFinite(inputPrice) || !Number.isFinite(outputPrice)) { - continue; - } - map.set(entry.id, { inputUsdPerToken: inputPrice, outputUsdPerToken: outputPrice }); - } - return map; -} - -async function getPricing(): Promise> { - if (pricingCache !== null) { - return pricingCache; - } - if (pricingPromise === null) { - pricingPromise = fetchPricing().then((map) => { - pricingCache = map; - return map; - }); - } - return pricingPromise; -} - -function resolvePrice(prices: Map, model: string): ModelPrice | undefined { - const direct = prices.get(model); - if (direct !== undefined) { - return direct; - } - for (const [id, price] of prices.entries()) { - if (id.endsWith(`/${model}`) || model.endsWith(`/${id}`)) { - return price; - } - } - return undefined; -} - -function isOllamaProvider(): boolean { - try { - return getConfigValue(Config.LlmProvider) === "ollama"; - } catch { - return false; - } -} - -export async function estimateCostUsd(model: string, inputTokens: number, outputTokens: number): Promise { - if (isOllamaProvider()) { - return 0; - } - const prices = await getPricing(); - if (prices.size === 0) { - return COST_UNKNOWN; - } - const price = resolvePrice(prices, model); - if (price === undefined) { - return COST_UNKNOWN; - } - const cost = inputTokens * price.inputUsdPerToken + outputTokens * price.outputUsdPerToken; - return Math.round(cost * 1_000_000) / 1_000_000; -} - -export async function estimateCostFromBreakdown(modelTokens: ModelTokenBreakdown): Promise { - const entries = Object.entries(modelTokens); - if (entries.length === 0) { - return 0; - } - let total = 0; - let anyKnown = false; - for (const [model, usage] of entries) { - const cost = await estimateCostUsd(model, usage.inputTokens, 
usage.outputTokens); - if (cost === COST_UNKNOWN) { - continue; - } - anyKnown = true; - total += cost; - } - if (!anyKnown) { - return COST_UNKNOWN; - } - return Math.round(total * 1_000_000) / 1_000_000; -} - -export function _resetPricingForTests(): void { - pricingCache = null; - pricingPromise = null; -} diff --git a/packages/llm/tsconfig.json b/packages/llm/tsconfig.json index c2104f6..4ed0786 100644 --- a/packages/llm/tsconfig.json +++ b/packages/llm/tsconfig.json @@ -1,8 +1,4 @@ { "extends": "../../tsconfig.base.json", - "compilerOptions": { - "rootDir": "./src", - "outDir": "./dist" - }, - "include": ["src/**/*"] + "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/logger/README.md b/packages/logger/README.md index 7256095..113af4e 100644 --- a/packages/logger/README.md +++ b/packages/logger/README.md @@ -19,16 +19,32 @@ Single logging surface for the workspace. Two sinks: ```ts type LoggerScope = "server" | "cli" +type LoggerFactory = (scope: LoggerScope) => Logger type Logger // re-exported from winston +const logger: Logger // proxy → getLogger("server") function getLogger(scope: LoggerScope): Logger +function seedLoggerFactory(factory: LoggerFactory): void function shutdownLoggers(): Promise function getLogsDir(): string function ensureLogsDir(): void +function __isLoggerFactorySeeded(): boolean function __resetLoggersForTests(): void // test-only ``` +`logger` (the default export) is a Proxy that lazily resolves to +`getLogger("server")` on every access — necessary because the resolved logger +may change after `seedLoggerFactory` is called by a parent process. + +`seedLoggerFactory(factory)` registers a factory used by all subsequent +`getLogger(scope)` calls. The previous scope cache is cleared on registration +so any logger already imported via the `logger` proxy resolves to the new +factory's output on its next method call. 
When no factory is seeded, +`getLogger` falls back to `buildLogger(scope)` — the disk-backed +DailyRotateFile + Console transport setup. The standalone binary never seeds +and gets the original behaviour bit-for-bit. + `getLogger(scope)` is idempotent. Workers tag via `getLogger("server").child({ worker: "pdf-1" })` — there is no per-worker file split. diff --git a/packages/logger/package.json b/packages/logger/package.json index 3d25c86..f7a1152 100644 --- a/packages/logger/package.json +++ b/packages/logger/package.json @@ -8,6 +8,9 @@ "exports": { ".": "./src/index.ts" }, + "imports": { + "#src/*": "./src/*" + }, "dependencies": { "@bb/config": "workspace:*", "@bb/types": "workspace:*", diff --git a/packages/logger/src/index.ts b/packages/logger/src/index.ts index c6678a5..c2e1376 100644 --- a/packages/logger/src/index.ts +++ b/packages/logger/src/index.ts @@ -1,10 +1,23 @@ +import type winston from "winston"; import { getLogger } from "./logger.ts"; -export { getLogger, shutdownLoggers, __resetLoggersForTests } from "./logger.ts"; -export type { LoggerScope } from "./logger.ts"; +export { + getLogger, + seedLoggerFactory, + shutdownLoggers, + __isLoggerFactorySeeded, + __resetLoggersForTests, +} from "./logger.ts"; +export type { LoggerScope, LoggerFactory } from "./logger.ts"; export { getLogsDir, ensureLogsDir } from "./dirs.ts"; export type { Logger } from "winston"; -export const logger = getLogger("server"); +export const logger = new Proxy({} as winston.Logger, { + get(_target, prop, receiver) { + const actual = getLogger("server"); + const value = Reflect.get(actual, prop, receiver); + return typeof value === "function" ? 
(value as (...args: unknown[]) => unknown).bind(actual) : value; + }, +}); diff --git a/packages/logger/src/logger.ts b/packages/logger/src/logger.ts index e7c0248..d6adf83 100644 --- a/packages/logger/src/logger.ts +++ b/packages/logger/src/logger.ts @@ -6,7 +6,10 @@ import { flushTransport, makeConsoleTransport, makeFileTransport } from "./trans export type LoggerScope = "server" | "cli"; +export type LoggerFactory = (scope: LoggerScope) => winston.Logger; + const scopeLoggers = new Map(); +let seededFactory: LoggerFactory | null = null; function buildLogger(scope: LoggerScope): winston.Logger { ensureLogsDir(); @@ -17,12 +20,21 @@ function buildLogger(scope: LoggerScope): winston.Logger { }); } +export function seedLoggerFactory(factory: LoggerFactory): void { + seededFactory = factory; + scopeLoggers.clear(); +} + +export function __isLoggerFactorySeeded(): boolean { + return seededFactory !== null; +} + export function getLogger(scope: LoggerScope): winston.Logger { const cached = scopeLoggers.get(scope); if (cached !== undefined) { return cached; } - const logger = buildLogger(scope); + const logger = seededFactory !== null ? 
seededFactory(scope) : buildLogger(scope); scopeLoggers.set(scope, logger); return logger; } @@ -42,4 +54,5 @@ export function __resetLoggersForTests(): void { logger.close(); } scopeLoggers.clear(); + seededFactory = null; } diff --git a/packages/logger/tsconfig.json b/packages/logger/tsconfig.json index 5ae1abb..4ed0786 100644 --- a/packages/logger/tsconfig.json +++ b/packages/logger/tsconfig.json @@ -1,11 +1,4 @@ { "extends": "../../tsconfig.base.json", - "compilerOptions": { - "rootDir": "./src", - "outDir": "./dist", - "noEmit": false, - "emitDeclarationOnly": true - }, - "include": ["src/**/*"], - "references": [{ "path": "../config" }] + "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/mcp/package.json b/packages/mcp/package.json index 7d58faf..98fc5a5 100644 --- a/packages/mcp/package.json +++ b/packages/mcp/package.json @@ -8,6 +8,9 @@ "exports": { ".": "./src/index.ts" }, + "imports": { + "#src/*": "./src/*" + }, "dependencies": { "@bb/config": "workspace:*", "@bb/logger": "workspace:*", diff --git a/packages/mcp/tsconfig.json b/packages/mcp/tsconfig.json index c2104f6..4ed0786 100644 --- a/packages/mcp/tsconfig.json +++ b/packages/mcp/tsconfig.json @@ -1,8 +1,4 @@ { "extends": "../../tsconfig.base.json", - "compilerOptions": { - "rootDir": "./src", - "outDir": "./dist" - }, - "include": ["src/**/*"] + "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/mongo/README.md b/packages/mongo/README.md index 73d1b72..028a513 100644 --- a/packages/mongo/README.md +++ b/packages/mongo/README.md @@ -22,22 +22,27 @@ The package owns: - `upsertKnowledge` / `listKnowledge` — knowledge-doc upsert and list (with file count joined from `raw`). Used by the github / local index routes and by `@bb/cli`'s `ls` and `delete` flows. 
- - `deleteKnowledge` — hard delete: removes the `knowledge` doc, every - `raw` row tagged with that `knowledgeId`, and every - `processing_stats` commit row tagged with that `knowledgeId`. - Called by the server's `DELETE /api/v1/repos/:knowledgeId` route. + - `deleteKnowledge` — hard delete: removes the `knowledge` doc and + every `raw` row tagged with that `knowledgeId`. Called by the + server's `DELETE /api/v1/repos/:knowledgeId` route. - `upsertRawFile` — per-file Raw doc writer (compound key `{ knowledgeId, relativePath }`). Called by `@bb/ingest-github`'s worker for every scanned file. - - `recordProcessingStats` — upsert one `processing_stats` row keyed - on `{ knowledgeId, commitHash }`. Called by `@bb/ingest-github`'s - worker once per ingest run with the per-model token totals, - estimated cost, and processing time. - - `aggregateStats` — read every `knowledge` + `processing_stats` doc - and assemble the kube-shaped `StatsResponse` (totals, repos, - commitStats). Called by the server's `GET /api/v1/stats` route. + - `setKnowledgeCommit(knowledgeId, commitHash, inputTokens, outputTokens, costUsd)` + — appends `{ hash, inputTokens, outputTokens, costUsd }` to + `source.commitHashes[]` and sets `source.commitId`. `costUsd` is the + OpenRouter-reported USD cost (`response.usage.cost`) summed across + the pipeline phases for this commit — never computed client-side. + - `aggregateStats` — read every `knowledge` doc and assemble the + kube-shaped `StatsResponse` (totals, repos, commitStats) by summing + `source.commitHashes[].{inputTokens,outputTokens,costUsd}`. Called by + the server's `GET /api/v1/stats` route. The deprecated + `processing_stats` collection is no longer queried. - A central registry of collection name strings (`Collections` enum): - `knowledge`, `raw`, `processing_stats`. + `knowledge`, `raw`, `mcp_usage`, `mcp_activity`. 
The + `processing_stats` collection has been removed — per-commit token + + cost data lives on the knowledge document's `source.commitHashes[]` + instead. The package does **not** own: diff --git a/packages/mongo/package.json b/packages/mongo/package.json index c32bc30..54f058a 100644 --- a/packages/mongo/package.json +++ b/packages/mongo/package.json @@ -8,6 +8,9 @@ "exports": { ".": "./src/index.ts" }, + "imports": { + "#src/*": "./src/*" + }, "dependencies": { "@bb/config": "workspace:*", "@bb/errors": "workspace:*", diff --git a/packages/mongo/src/README.md b/packages/mongo/src/README.md index f89f382..6ae0c4e 100644 --- a/packages/mongo/src/README.md +++ b/packages/mongo/src/README.md @@ -7,9 +7,9 @@ package-level contract; this file documents how the source tree is split. - **[index.ts](index.ts)** — public re-exports. The only entry point other packages may import. Exposes `connectMongo`, `closeMongo`, `pingMongo`, - `setKnowledgeState`, `upsertRawFile`, and the `PingResult` / - `FileAnalysis` / `RawFileDoc` types. Anything not re-exported here is - internal. + `setKnowledgeState`, `markKnowledgeFailed`, `upsertRawFile`, and the + `PingResult` / `FileAnalysis` / `RawFileDoc` types. Anything not + re-exported here is internal. - **[client.ts](client.ts)** — module-scoped `MongoClient` singleton plus the lifecycle (`connectMongo`, `closeMongo`), the health probe (`pingMongo`), and the **internal** `_getDb()` accessor. Reads the URI via @@ -22,12 +22,19 @@ package-level contract; this file documents how the source tree is split. `Collections.Knowledge = "knowledge"`, `Collections.Raw = "raw"`. `Nodes` and `Jobs` join when their helpers land. **Internal** — not re-exported from `index.ts`; consumed only by helpers in this folder. -- **[knowledge.ts](knowledge.ts)** — domain CRUD helper: - `setKnowledgeState(knowledgeId, state)`. 
Uses `_getDb()` to access - `Collections.Knowledge`, runs `updateOne({ knowledgeId }, { $set: { -"status.state": state, updatedAt: } })`, and throws - `KnowledgeNotFoundError` on `matchedCount === 0`. Called by `@bb/queue` - publishers on enqueue. +- **[knowledge.ts](knowledge.ts)** — domain CRUD helpers: + - `setKnowledgeState(knowledgeId, state)` runs + `updateOne({ knowledgeId }, { $set: { "status.state": state, updatedAt }, $unset: { failure: "" } })`. + The `$unset` of `failure` makes the next successful transition out of + FAILED automatically clear any stale failure metadata. Throws + `KnowledgeNotFoundError` on `matchedCount === 0`. Called by `@bb/queue` + publishers on enqueue and by the runner on terminal success. + - `markKnowledgeFailed(knowledgeId, reason, category, detail?)` writes + the structured `failure: { reason, category, at, detail? }` subdoc + alongside `status.state = "FAILED"`. `reason` is short and + operator-readable; `detail` is the optional raw provider response. + Called by `@bb/ingest-github/src/pipeline/run.ts` (and `pull.ts`) + catch blocks via the shared `classifyFailure` helper. - **[raw.ts](raw.ts)** — domain CRUD helpers for `Collections.Raw`. Defines the `FileAnalysis` and `RawFileDoc` interfaces (package-local until promotion to `@bb/types`). Exports: diff --git a/packages/mongo/src/aggregateStats.ts b/packages/mongo/src/aggregateStats.ts new file mode 100644 index 0000000..95f7d59 --- /dev/null +++ b/packages/mongo/src/aggregateStats.ts @@ -0,0 +1,149 @@ +import type { KnowledgeDoc, StatsCommitEntry, StatsRepoEntry, StatsResponse, StatsTotals } from "@bb/types"; +import { _getDb } from "./client.ts"; +import { Collections } from "./collections.ts"; + +interface CommitHashRecord { + hash: string; + inputTokens: string; + outputTokens: string; + costUsd: string; +} + +/** + * Aggregates token + cost stats over the `knowledge` collection. 
Replaces the + * previous read against the deleted `processing_stats` collection — the + * authoritative per-commit numbers now live on the knowledge document's + * `source.commitHashes[]` (populated by `setKnowledgeCommit`). + * + * Fields that the old `processing_stats` row carried but the knowledge doc + * does not (per-commit `processingTimeMs`, `totalBatches`, `totalFolders`, + * `filesAnalyzed`, `createdAt`/`updatedAt`) are reported as 0 / empty — + * the `bytebell stats` UI tolerates that. + */ +export async function aggregateStats(): Promise { + const db = _getDb(); + const knowledgeDocs = (await db + .collection(Collections.Knowledge) + .find({}) + .sort({ updatedAt: -1 }) + .toArray()) as unknown as KnowledgeDoc[]; + + const repos: StatsRepoEntry[] = []; + const commitStats: StatsCommitEntry[] = []; + let totalInputTokens = 0; + let totalOutputTokens = 0; + let totalCost = 0; + let totalFiles = 0; + + for (const doc of knowledgeDocs) { + const commits = pickCommits(doc); + const fileCount = await db.collection(Collections.Raw).countDocuments({ knowledgeId: doc.knowledgeId }); + const repoName = deriveRepoName(doc); + const type = doc.source.kind === "github" ? 
("GITHUB" as const) : ("LOCAL" as const); + + let repoIn = 0; + let repoOut = 0; + let repoCost = 0; + for (const c of commits) { + const inT = parseNumber(c.inputTokens); + const outT = parseNumber(c.outputTokens); + const cost = parseNumber(c.costUsd); + repoIn += inT; + repoOut += outT; + repoCost += cost; + commitStats.push({ + knowledgeId: doc.knowledgeId, + repoName, + commitHash: c.hash, + inputTokens: inT, + outputTokens: outT, + estimatedCost: cost, + totalBatches: 0, + processingTimeMs: 0, + totalFiles: fileCount, + totalFolders: 0, + filesAnalyzed: fileCount, + createdAt: "", + updatedAt: "", + }); + } + + repos.push({ + knowledgeId: doc.knowledgeId, + repoName, + type, + fileCount, + folderCount: 0, + inputTokens: repoIn, + outputTokens: repoOut, + estimatedCost: repoCost, + }); + + totalInputTokens += repoIn; + totalOutputTokens += repoOut; + totalCost += repoCost; + totalFiles += fileCount; + } + + const totals: StatsTotals = { + totalRepos: knowledgeDocs.length, + totalFiles, + totalFolders: 0, + totalInputTokens, + totalOutputTokens, + totalEstimatedCost: Math.round(totalCost * 1_000_000) / 1_000_000, + }; + + return { totals, repos, commitStats }; +} + +function pickCommits(doc: KnowledgeDoc): CommitHashRecord[] { + const source = (doc as unknown as { source?: { commitHashes?: unknown } }).source; + const raw = source?.commitHashes; + if (!Array.isArray(raw)) { + return []; + } + const out: CommitHashRecord[] = []; + for (const entry of raw) { + if (typeof entry !== "object" || entry === null) { + continue; + } + const rec = entry as Partial; + if (typeof rec.hash !== "string") { + continue; + } + out.push({ + hash: rec.hash, + inputTokens: typeof rec.inputTokens === "string" ? rec.inputTokens : "0", + outputTokens: typeof rec.outputTokens === "string" ? rec.outputTokens : "0", + costUsd: typeof rec.costUsd === "string" ? 
rec.costUsd : "0", + }); + } + return out; +} + +function parseNumber(value: string): number { + const n = Number.parseFloat(value); + return Number.isFinite(n) ? n : 0; +} + +function deriveRepoName(doc: KnowledgeDoc): string { + if (doc.source.kind === "local") { + const segments = doc.source.sourcePath.split("/").filter((s) => s.length > 0); + return segments.at(-1) ?? doc.source.sourcePath; + } + try { + const segments = new URL(doc.info.repoUrl ?? "").pathname + .split("/") + .map((s) => s.trim()) + .filter((s) => s.length > 0); + const repo = segments.at(-1)?.replace(/\.git$/u, ""); + const owner = segments.at(-2); + if (owner !== undefined && repo !== undefined) { + return `${owner}/${repo}`; + } + } catch { + // fall through + } + return doc.info.repoUrl ?? ""; +} diff --git a/packages/mongo/src/collections.ts b/packages/mongo/src/collections.ts index 8a0714d..7737836 100644 --- a/packages/mongo/src/collections.ts +++ b/packages/mongo/src/collections.ts @@ -1,7 +1,6 @@ export enum Collections { Knowledge = "knowledge", Raw = "raw", - ProcessingStats = "processing_stats", Usage = "mcp_usage", Activity = "mcp_activity", } diff --git a/packages/mongo/src/index.ts b/packages/mongo/src/index.ts index 4303968..bcee0b0 100644 --- a/packages/mongo/src/index.ts +++ b/packages/mongo/src/index.ts @@ -5,6 +5,8 @@ export { getKnowledge, setKnowledgeCommit, setKnowledgeState, + markKnowledgeFailed, + setKnowledgeBranch, updateKnowledgeProgress, upsertKnowledge, listKnowledge, @@ -15,8 +17,7 @@ export type { KnowledgeListEntry, DeleteKnowledgeResult } from "./knowledge.ts"; export { upsertRawFile, listRawFileShas, deleteRawFiles } from "./raw.ts"; export type { FileAnalysis, FileAnalysisSection, RawFileDoc } from "./raw.ts"; -export { recordProcessingStats, aggregateStats } from "./processingStats.ts"; -export type { RecordProcessingStatsInput } from "./processingStats.ts"; +export { aggregateStats } from "./aggregateStats.ts"; export { incrementUsage, getMonthlyUsage, 
getGlobalUsage } from "./usage.ts"; export { recordActivity } from "./activity.ts"; diff --git a/packages/mongo/src/knowledge.ts b/packages/mongo/src/knowledge.ts index c1bfb15..a83e30e 100644 --- a/packages/mongo/src/knowledge.ts +++ b/packages/mongo/src/knowledge.ts @@ -1,4 +1,4 @@ -import { KnowledgeState, type KnowledgeDoc } from "@bb/types"; +import type { KnowledgeDoc, KnowledgeFailureCategory, KnowledgeState } from "@bb/types"; import { KnowledgeNotFoundError } from "@bb/errors"; import { _getDb } from "./client.ts"; import { Collections } from "./collections.ts"; @@ -10,9 +10,51 @@ export interface KnowledgeListEntry extends KnowledgeDoc { } export async function setKnowledgeState(knowledgeId: string, state: KnowledgeState): Promise { + const update: Record = { "status.state": state, updatedAt: new Date() }; const result = await _getDb() .collection(Collections.Knowledge) - .updateOne({ knowledgeId }, { $set: { "status.state": state, updatedAt: new Date() } }); + .updateOne({ knowledgeId }, { $set: update, $unset: { failure: "" } }); + if (result.matchedCount === 0) { + throw new KnowledgeNotFoundError(knowledgeId); + } +} + +/** + * Marks a knowledge as FAILED and records the structured failure reason on + * the top-level `failure` subdoc. The next successful transition out of + * FAILED automatically clears it (see `setKnowledgeState`'s `$unset`). + * + * `reason` is a short operator-readable sentence (UI surfaces it directly). + * `detail` is the raw provider response or structured debug payload (UI may + * hide behind a disclosure). 
+ */ +export async function markKnowledgeFailed( + knowledgeId: string, + reason: string, + category: KnowledgeFailureCategory, + detail?: string, +): Promise { + const now = new Date(); + const failure: { reason: string; category: KnowledgeFailureCategory; at: Date; detail?: string } = { + reason, + category, + at: now, + }; + if (detail !== undefined && detail.length > 0) { + failure.detail = detail; + } + const result = await _getDb() + .collection(Collections.Knowledge) + .updateOne( + { knowledgeId }, + { + $set: { + "status.state": "FAILED", + failure, + updatedAt: now, + }, + }, + ); if (result.matchedCount === 0) { throw new KnowledgeNotFoundError(knowledgeId); } @@ -26,14 +68,22 @@ export async function setKnowledgeState(knowledgeId: string, state: KnowledgeSta * * Throws `KnowledgeNotFoundError` if the document doesn't exist. */ -export async function setKnowledgeCommit(knowledgeId: string, commitHash: string): Promise { +export async function setKnowledgeCommit( + knowledgeId: string, + commitHash: string, + inputTokens: string = "", + outputTokens: string = "", + costUsd: string = "0", +): Promise { const result = await _getDb() .collection(Collections.Knowledge) .updateOne( { knowledgeId }, { $set: { "source.commitId": commitHash, updatedAt: new Date() }, - $addToSet: { "source.commitHashes": commitHash }, + $addToSet: { + "source.commitHashes": { hash: commitHash, inputTokens, outputTokens, costUsd }, + }, }, ); if (result.matchedCount === 0) { @@ -41,6 +91,18 @@ export async function setKnowledgeCommit(knowledgeId: string, commitHash: string } } +/** + * Updates the branch name of a GitHub knowledge entry. 
+ */ +export async function setKnowledgeBranch(knowledgeId: string, branch: string): Promise { + const result = await _getDb() + .collection(Collections.Knowledge) + .updateOne({ knowledgeId }, { $set: { "source.branch": branch, updatedAt: new Date() } }); + if (result.matchedCount === 0) { + throw new KnowledgeNotFoundError(knowledgeId); + } +} + export async function updateKnowledgeProgress( knowledgeId: string, processedFiles: number, @@ -83,7 +145,6 @@ export async function upsertKnowledge(doc: Omit & { u export interface DeleteKnowledgeResult { knowledgeDeleted: number; rawDeleted: number; - statsDeleted: number; } export async function deleteKnowledge(knowledgeId: string): Promise { @@ -93,11 +154,9 @@ export async function deleteKnowledge(knowledgeId: string): Promise { - const now = new Date(); - const totals = sumModelTokens(input.modelTokens); - await _getDb() - .collection(Collections.ProcessingStats) - .updateOne( - { knowledgeId: input.knowledgeId, commitHash: input.commitHash }, - { - $set: { - repoName: input.repoName, - modelTokens: input.modelTokens, - inputTokens: totals.inputTokens, - outputTokens: totals.outputTokens, - estimatedCost: input.estimatedCost, - totalBatches: input.totalBatches, - totalFiles: input.totalFiles, - totalFolders: input.totalFolders, - filesAnalyzed: input.filesAnalyzed, - processingTimeMs: input.processingTimeMs, - updatedAt: now, - }, - $setOnInsert: { - knowledgeId: input.knowledgeId, - commitHash: input.commitHash, - createdAt: now, - }, - }, - { upsert: true }, - ); -} - -export async function aggregateStats(): Promise { - const db = _getDb(); - const knowledgeDocs = (await db - .collection(Collections.Knowledge) - .find({}) - .sort({ updatedAt: -1 }) - .toArray()) as unknown as KnowledgeDoc[]; - - const statsDocs = (await db - .collection(Collections.ProcessingStats) - .find({}) - .sort({ updatedAt: -1 }) - .toArray()) as unknown as ProcessingStatsDoc[]; - - const repos: StatsRepoEntry[] = []; - for (const doc of 
knowledgeDocs) { - const matchedStats = statsDocs.filter((s) => s.knowledgeId === doc.knowledgeId); - const aggregate = aggregateRepoTokens(matchedStats); - const fileCount = await db.collection(Collections.Raw).countDocuments({ knowledgeId: doc.knowledgeId }); - repos.push({ - knowledgeId: doc.knowledgeId, - repoName: matchedStats[0]?.repoName ?? deriveRepoName(doc), - type: doc.source.kind === "github" ? "GITHUB" : "LOCAL", - fileCount, - folderCount: 0, - inputTokens: aggregate.inputTokens, - outputTokens: aggregate.outputTokens, - estimatedCost: aggregate.estimatedCost, - }); - } - - const commitStats: StatsCommitEntry[] = statsDocs.map((s) => ({ - knowledgeId: s.knowledgeId, - repoName: s.repoName, - commitHash: s.commitHash, - inputTokens: s.inputTokens, - outputTokens: s.outputTokens, - estimatedCost: s.estimatedCost, - totalBatches: s.totalBatches, - processingTimeMs: s.processingTimeMs, - totalFiles: s.totalFiles, - totalFolders: s.totalFolders, - filesAnalyzed: s.filesAnalyzed, - createdAt: toIso(s.createdAt), - updatedAt: toIso(s.updatedAt), - })); - - const totals: StatsTotals = { - totalRepos: knowledgeDocs.length, - totalFiles: repos.reduce((sum, r) => sum + r.fileCount, 0), - totalFolders: 0, - totalInputTokens: statsDocs.reduce((sum, s) => sum + (s.inputTokens ?? 0), 0), - totalOutputTokens: statsDocs.reduce((sum, s) => sum + (s.outputTokens ?? 
0), 0), - totalEstimatedCost: sumCost(statsDocs.map((s) => s.estimatedCost)), - }; - - return { totals, repos, commitStats }; -} - -function sumModelTokens(modelTokens: ModelTokenBreakdown): { inputTokens: number; outputTokens: number } { - let inputTokens = 0; - let outputTokens = 0; - for (const usage of Object.values(modelTokens)) { - inputTokens += usage.inputTokens; - outputTokens += usage.outputTokens; - } - return { inputTokens, outputTokens }; -} - -function aggregateRepoTokens(stats: ProcessingStatsDoc[]): { - inputTokens: number; - outputTokens: number; - estimatedCost: number; -} { - let inputTokens = 0; - let outputTokens = 0; - for (const s of stats) { - inputTokens += s.inputTokens ?? 0; - outputTokens += s.outputTokens ?? 0; - } - return { - inputTokens, - outputTokens, - estimatedCost: sumCost(stats.map((s) => s.estimatedCost)), - }; -} - -function sumCost(values: number[]): number { - let total = 0; - let anyKnown = false; - for (const v of values) { - if (typeof v !== "number" || v === COST_UNKNOWN) { - continue; - } - anyKnown = true; - total += v; - } - if (!anyKnown) { - return values.length === 0 ? 0 : COST_UNKNOWN; - } - return Math.round(total * 1_000_000) / 1_000_000; -} - -function deriveRepoName(doc: KnowledgeDoc): string { - if (doc.source.kind === "local") { - const segments = doc.source.sourcePath.split("/").filter((s) => s.length > 0); - return segments.at(-1) ?? 
doc.source.sourcePath; - } - try { - const segments = new URL(doc.source.repoUrl).pathname - .split("/") - .map((s) => s.trim()) - .filter((s) => s.length > 0); - const repo = segments.at(-1)?.replace(/\.git$/u, ""); - const owner = segments.at(-2); - if (owner !== undefined && repo !== undefined) { - return `${owner}/${repo}`; - } - } catch { - // fall through - } - return doc.source.repoUrl; -} - -function toIso(value: Date | string | undefined): string { - if (value === undefined) { - return new Date(0).toISOString(); - } - if (value instanceof Date) { - return value.toISOString(); - } - return new Date(value).toISOString(); -} diff --git a/packages/mongo/tsconfig.json b/packages/mongo/tsconfig.json index c2104f6..4ed0786 100644 --- a/packages/mongo/tsconfig.json +++ b/packages/mongo/tsconfig.json @@ -1,8 +1,4 @@ { "extends": "../../tsconfig.base.json", - "compilerOptions": { - "rootDir": "./src", - "outDir": "./dist" - }, - "include": ["src/**/*"] + "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/neo4j/README.md b/packages/neo4j/README.md index e363877..ba441b2 100644 --- a/packages/neo4j/README.md +++ b/packages/neo4j/README.md @@ -40,20 +40,25 @@ The package owns: function / import edges), and one to remove the `:Knowledge` node itself. Called by the server's `DELETE /api/v1/repos/:knowledgeId` route. -- File-node CRUD (`upsertFileNode`) — composes the per-file relationships - (`:HAS_KEYWORD / :HAS_CLASS / :HAS_FUNCTION / :HAS_IMPORT_INTERNAL / -:HAS_IMPORT_EXTERNAL`), clearing stale relationships before - re-attaching for re-runs. 
The two-`:HAS_IMPORT_*` split mirrors - kube-package's distinction between relative imports and external - packages — downstream MCP queries can ask "which files import this - internal module" vs "which files import this external package" - cleanly +- File-node CRUD (`upsertFileNode`, `upsertFileNodesBatch`) — composes + the per-file relationships (`:HAS_KEYWORD / :HAS_CLASS / :HAS_FUNCTION +/ :HAS_IMPORT_INTERNAL / :HAS_IMPORT_EXTERNAL`), clearing stale + relationships before re-attaching for re-runs. The two-`:HAS_IMPORT_*` + split mirrors kube-package's distinction between relative imports and + external packages — downstream MCP queries can ask "which files + import this internal module" vs "which files import this external + package" cleanly. The `*Batch` variant lands an arbitrary number of + files in **one transaction** via Cypher `UNWIND` — same Cypher shape, + wrapped with an outer UNWIND so 50+ files cost the same 12 Cyphers a + single file used to cost. +- Folder-node CRUD (`upsertFolderNode`, `upsertFolderNodesBatch`) — + same shape as file CRUD; batched variant for bulk indexing. The package does **not** own: - Read queries — defer to a future `@bb/graph` once `@bb/mcp` retrieval has a use case -- Telemetry, retry, or transaction batching — driver defaults apply +- Telemetry — driver defaults apply. 
- Migration tooling — the `IF NOT EXISTS` constraint creates handle schema drift; richer migrations land later @@ -69,6 +74,9 @@ function upsertKnowledgeNode(doc: KnowledgeDoc): Promise; function setKnowledgeStateInGraph(knowledgeId: string, state: KnowledgeState): Promise; function deleteKnowledgeGraph(knowledgeId: string): Promise; function upsertFileNode(input: UpsertFileNodeInput): Promise; +function upsertFileNodesBatch(inputs: readonly UpsertFileNodeInput[]): Promise; +function upsertFolderNode(input: UpsertFolderNodeInput): Promise; +function upsertFolderNodesBatch(inputs: readonly UpsertFolderNodeInput[]): Promise; function runCypher(query: string, params?: Record): Promise; @@ -160,9 +168,12 @@ Neo4jPassword`). Repo-wide ESLint rule blocks `process.env`. "already exists" errors (Neo4j refuses constraints when a matching plain index exists). Operators must drop conflicting indexes manually if uniqueness guarantees matter. -6. **`upsertFileNode` clears stale relationships before re-attaching.** - Re-runs of the same `(knowledgeId, relativePath)` produce a clean - relationship set rather than accumulating outdated keywords/imports. +6. **`upsertFileNode` and `upsertFileNodesBatch` clear stale relationships + before re-attaching.** Re-runs of the same `(knowledgeId, relativePath)` + produce a clean relationship set rather than accumulating outdated + keywords/imports. In the batched variant the clear-then-attach happens + atomically inside one transaction per batch — partial failures roll + back, so re-runs always start from a consistent state. 7. **No raw `Driver` leaks.** `_getDriver()` is not in `src/index.ts`. Higher tiers go through the typed helpers. @@ -174,7 +185,6 @@ Neo4jPassword`). Repo-wide ESLint rule blocks `process.env`. 
## What is intentionally out of scope (v0) - Read queries (defer to `@bb/graph`) -- Cypher transactions / batch writes (single-statement per call) - Schema migrations / drops / renames (only `IF NOT EXISTS` creates) - Multi-database support (we use the default `neo4j` db) - Pub/sub / change-data-capture diff --git a/packages/neo4j/package.json b/packages/neo4j/package.json index 6733044..7e16617 100644 --- a/packages/neo4j/package.json +++ b/packages/neo4j/package.json @@ -8,10 +8,14 @@ "exports": { ".": "./src/index.ts" }, + "imports": { + "#src/*": "./src/*" + }, "dependencies": { "@bb/config": "workspace:*", "@bb/errors": "workspace:*", "@bb/types": "workspace:*", + "@bb/mongo": "workspace:*", "neo4j-driver": "^6.0.1" } } diff --git a/packages/neo4j/src/client.ts b/packages/neo4j/src/client.ts index 56207d2..dac5fbb 100644 --- a/packages/neo4j/src/client.ts +++ b/packages/neo4j/src/client.ts @@ -81,6 +81,35 @@ export async function _runCypher(query: string, params: Record; +} + +/** + * Run multiple Cypher statements inside one write transaction. All-or-nothing: + * either every statement commits or none do. Used by the batched upsert APIs + * so a 50-file batch lands as one transaction instead of 12 × 50 sessions. + * + * Uses the driver's `executeWrite` which retries automatically on transient + * errors (deadlock, leader switch) up to a few attempts. 
+ */ +export async function _runInTransaction(steps: readonly CypherStep[]): Promise { + if (steps.length === 0) { + return; + } + const session: Session = _getDriver().session(); + try { + await session.executeWrite(async (tx) => { + for (const step of steps) { + await tx.run(step.query, step.params); + } + }); + } finally { + await session.close(); + } +} + export function toNeo4jInt(value: number): Integer { return int(value); } diff --git a/packages/neo4j/src/files.ts b/packages/neo4j/src/files.ts index eaf4182..7d049e3 100644 --- a/packages/neo4j/src/files.ts +++ b/packages/neo4j/src/files.ts @@ -1,5 +1,5 @@ import type { FileAnalysis } from "@bb/mongo"; -import { _runCypher } from "./client.ts"; +import { _runCypher, _runInTransaction, type CypherStep } from "./client.ts"; const UPSERT_FILE = ` MERGE (f:File {knowledgeId: $knowledgeId, relativePath: $relativePath}) @@ -133,6 +133,232 @@ export async function deleteFileNodes(knowledgeId: string, relativePaths: string await _runCypher(DELETE_FILES, { knowledgeId, relativePaths }); } +// ───────────────────────────────────────────────────────────────────────────── +// Batched upsert — used by the flat-folder indexing phase to land 50+ files in +// one transaction instead of 12 round-trips per file. Same Cypher shape as the +// single-shot path above; just wrapped with an outer UNWIND so one query +// services every file in the batch. The five rel types (HAS_KEYWORD / +// HAS_CLASS / HAS_FUNCTION / HAS_IMPORT_INTERNAL / HAS_IMPORT_EXTERNAL) each +// take two Cyphers: a batched DELETE that clears existing rels for every file +// in the batch by relativePath, then a batched UNWIND that attaches the new +// rels from flattened `(knowledgeId, relativePath, name)` triples. 
+// ───────────────────────────────────────────────────────────────────────────── + +const BATCH_UPSERT_FILES = ` +UNWIND $files AS f +MERGE (file:File {knowledgeId: f.knowledgeId, relativePath: f.relativePath}) +SET file.orgId = f.orgId, + file.repoId = f.repoId, + file.language = f.language, + file.sha = f.sha, + file.sizeBytes = f.sizeBytes, + file.purpose = f.purpose, + file.summary = f.summary, + file.businessContext = f.businessContext, + file.dataFlowDirection = f.dataFlowDirection, + file.ontologyConcepts = f.ontologyConcepts, + file.businessEntities = f.businessEntities, + file.systemCapabilities = f.systemCapabilities, + file.sideEffects = f.sideEffects, + file.configDependencies = f.configDependencies, + file.integrationSurface = f.integrationSurface, + file.contractsProvided = f.contractsProvided, + file.contractsConsumed = f.contractsConsumed, + file.sectionNames = f.sectionNames, + file.sectionDescriptions = f.sectionDescriptions, + file.isBigFile = f.isBigFile, + file.totalChunks = f.totalChunks, + file.totalTokenCount = f.totalTokenCount, + file.updatedAt = $updatedAt +WITH file, f +MATCH (k:Knowledge {knowledgeId: f.knowledgeId}) +MERGE (k)-[:HAS_FILE]->(file) +`; + +const BATCH_ATTACH_FILES_TO_FOLDERS = ` +UNWIND $pairs AS pair +MATCH (file:File {knowledgeId: pair.knowledgeId, relativePath: pair.relativePath}) +MATCH (folder:Folder {knowledgeId: pair.knowledgeId, folderPath: pair.folderPath}) +MERGE (folder)-[:CONTAINS]->(file) +`; + +const BATCH_CLEAR_RELS_BY_TYPE: Readonly> = { + HAS_KEYWORD: ` +UNWIND $files AS f +MATCH (file:File {knowledgeId: f.knowledgeId, relativePath: f.relativePath})-[r:HAS_KEYWORD]->() +DELETE r +`, + HAS_CLASS: ` +UNWIND $files AS f +MATCH (file:File {knowledgeId: f.knowledgeId, relativePath: f.relativePath})-[r:HAS_CLASS]->() +DELETE r +`, + HAS_FUNCTION: ` +UNWIND $files AS f +MATCH (file:File {knowledgeId: f.knowledgeId, relativePath: f.relativePath})-[r:HAS_FUNCTION]->() +DELETE r +`, + HAS_IMPORT_INTERNAL: ` 
+UNWIND $files AS f +MATCH (file:File {knowledgeId: f.knowledgeId, relativePath: f.relativePath})-[r:HAS_IMPORT_INTERNAL]->() +DELETE r +`, + HAS_IMPORT_EXTERNAL: ` +UNWIND $files AS f +MATCH (file:File {knowledgeId: f.knowledgeId, relativePath: f.relativePath})-[r:HAS_IMPORT_EXTERNAL]->() +DELETE r +`, +}; + +const BATCH_ATTACH_KEYWORDS = ` +UNWIND $pairs AS p +MATCH (file:File {knowledgeId: p.knowledgeId, relativePath: p.relativePath}) +MERGE (kw:Keyword {name: p.name}) +MERGE (file)-[:HAS_KEYWORD]->(kw) +`; + +const BATCH_ATTACH_CLASSES = ` +UNWIND $pairs AS p +MATCH (file:File {knowledgeId: p.knowledgeId, relativePath: p.relativePath}) +MERGE (c:Class {signature: p.signature}) +MERGE (file)-[:HAS_CLASS]->(c) +`; + +const BATCH_ATTACH_FUNCTIONS = ` +UNWIND $pairs AS p +MATCH (file:File {knowledgeId: p.knowledgeId, relativePath: p.relativePath}) +MERGE (fn:Function {signature: p.signature}) +MERGE (file)-[:HAS_FUNCTION]->(fn) +`; + +const BATCH_ATTACH_IMPORTS_INTERNAL = ` +UNWIND $pairs AS p +MATCH (file:File {knowledgeId: p.knowledgeId, relativePath: p.relativePath}) +MERGE (m:Module {name: p.name}) +MERGE (file)-[:HAS_IMPORT_INTERNAL]->(m) +`; + +const BATCH_ATTACH_IMPORTS_EXTERNAL = ` +UNWIND $pairs AS p +MATCH (file:File {knowledgeId: p.knowledgeId, relativePath: p.relativePath}) +MERGE (m:Module {name: p.name}) +MERGE (file)-[:HAS_IMPORT_EXTERNAL]->(m) +`; + +type RelType = "HAS_KEYWORD" | "HAS_CLASS" | "HAS_FUNCTION" | "HAS_IMPORT_INTERNAL" | "HAS_IMPORT_EXTERNAL"; + +interface FileRow { + knowledgeId: string; + relativePath: string; +} + +export async function upsertFileNodesBatch(inputs: readonly UpsertFileNodeInput[]): Promise { + if (inputs.length === 0) { + return; + } + const updatedAt = new Date().toISOString(); + const files = inputs.map((input) => fileRowFor(input)); + const fileKeys: FileRow[] = inputs.map((input) => ({ + knowledgeId: input.knowledgeId, + relativePath: input.relativePath, + })); + const folderPairs = inputs + .filter((input): 
input is UpsertFileNodeInput & { folderPath: string } => input.folderPath !== undefined) + .map((input) => ({ + knowledgeId: input.knowledgeId, + relativePath: input.relativePath, + folderPath: input.folderPath, + })); + + const keywordPairs = flattenPairs(inputs, "keywords", "name", (v) => v.toLowerCase()); + const classPairs = flattenPairs(inputs, "classes", "signature"); + const functionPairs = flattenPairs(inputs, "functions", "signature"); + const importsInternalPairs = flattenPairs(inputs, "importsInternal", "name"); + const importsExternalPairs = flattenPairs(inputs, "importsExternal", "name"); + + const steps: CypherStep[] = [{ query: BATCH_UPSERT_FILES, params: { files, updatedAt } }]; + if (folderPairs.length > 0) { + steps.push({ query: BATCH_ATTACH_FILES_TO_FOLDERS, params: { pairs: folderPairs } }); + } + // Clear existing rels of every type for every file in the batch. + for (const relType of [ + "HAS_KEYWORD", + "HAS_CLASS", + "HAS_FUNCTION", + "HAS_IMPORT_INTERNAL", + "HAS_IMPORT_EXTERNAL", + ] as const) { + steps.push({ query: BATCH_CLEAR_RELS_BY_TYPE[relType], params: { files: fileKeys } }); + } + if (keywordPairs.length > 0) { + steps.push({ query: BATCH_ATTACH_KEYWORDS, params: { pairs: keywordPairs } }); + } + if (classPairs.length > 0) { + steps.push({ query: BATCH_ATTACH_CLASSES, params: { pairs: classPairs } }); + } + if (functionPairs.length > 0) { + steps.push({ query: BATCH_ATTACH_FUNCTIONS, params: { pairs: functionPairs } }); + } + if (importsInternalPairs.length > 0) { + steps.push({ query: BATCH_ATTACH_IMPORTS_INTERNAL, params: { pairs: importsInternalPairs } }); + } + if (importsExternalPairs.length > 0) { + steps.push({ query: BATCH_ATTACH_IMPORTS_EXTERNAL, params: { pairs: importsExternalPairs } }); + } + + await _runInTransaction(steps); +} + +function fileRowFor(input: UpsertFileNodeInput): Record { + const sectionMap = input.analysis.sectionMap ?? 
[]; + return { + knowledgeId: input.knowledgeId, + relativePath: input.relativePath, + orgId: input.orgId ?? "local", + repoId: input.repoId ?? input.knowledgeId, + language: input.language, + sha: input.sha, + sizeBytes: input.sizeBytes, + purpose: input.analysis.purpose, + summary: input.analysis.summary, + businessContext: input.analysis.businessContext, + dataFlowDirection: input.analysis.dataFlowDirection ?? "", + ontologyConcepts: input.analysis.ontologyConcepts ?? [], + businessEntities: input.analysis.businessEntities ?? [], + systemCapabilities: input.analysis.systemCapabilities ?? [], + sideEffects: input.analysis.sideEffects ?? [], + configDependencies: input.analysis.configDependencies ?? [], + integrationSurface: input.analysis.integrationSurface ?? [], + contractsProvided: input.analysis.contractsProvided ?? [], + contractsConsumed: input.analysis.contractsConsumed ?? [], + sectionNames: sectionMap.map((s) => s.name), + sectionDescriptions: sectionMap.map((s) => s.description), + isBigFile: input.isBigFile ?? false, + totalChunks: input.totalChunks ?? 0, + totalTokenCount: input.totalTokenCount ?? 0, + }; +} + +function flattenPairs( + inputs: readonly UpsertFileNodeInput[], + field: "keywords" | "classes" | "functions" | "importsInternal" | "importsExternal", + valueKey: "name" | "signature", + normalize?: (v: string) => string, +): Array> { + const out: Array> = []; + for (const input of inputs) { + const values = input.analysis[field]; + if (!Array.isArray(values)) { + continue; + } + for (const raw of values) { + const value = normalize !== undefined ? normalize(raw) : raw; + out.push({ knowledgeId: input.knowledgeId, relativePath: input.relativePath, [valueKey]: value }); + } + } + return out; +} + export async function upsertFileNode(input: UpsertFileNodeInput): Promise { const params = { knowledgeId: input.knowledgeId, relativePath: input.relativePath }; const sectionMap = input.analysis.sectionMap ?? 
[]; diff --git a/packages/neo4j/src/folder.ts b/packages/neo4j/src/folder.ts index e862c3e..f4c8ad8 100644 --- a/packages/neo4j/src/folder.ts +++ b/packages/neo4j/src/folder.ts @@ -1,4 +1,4 @@ -import { _runCypher } from "./client.ts"; +import { _runCypher, _runInTransaction, type CypherStep } from "./client.ts"; import type { NodeScope } from "./repo.ts"; export interface FolderSummaryPayload { @@ -41,6 +41,80 @@ MERGE (kw:Keyword {name: name}) MERGE (folder)-[:HAS_KEYWORD]->(kw) `; +// ───────────────────────────────────────────────────────────────────────────── +// Batched folder upsert. Same Cypher shape as the single-shot path; wrapped +// with an outer UNWIND so one transaction lands every folder in the batch. +// ───────────────────────────────────────────────────────────────────────────── + +const BATCH_UPSERT_FOLDERS = ` +UNWIND $folders AS fld +MERGE (folder:Folder {orgId: fld.orgId, knowledgeId: fld.knowledgeId, repoId: fld.repoId, folderPath: fld.folderPath}) +SET folder.purpose = fld.purpose, + folder.summary = fld.summary, + folder.dependencyGraph = fld.dependencyGraph, + folder.updatedAt = $updatedAt +WITH folder, fld +MATCH (r:Repo {orgId: fld.orgId, knowledgeId: fld.knowledgeId, repoId: fld.repoId}) +MERGE (r)-[:CONTAINS]->(folder) +`; + +const BATCH_CLEAR_FOLDER_KEYWORDS = ` +UNWIND $folders AS fld +MATCH (folder:Folder {orgId: fld.orgId, knowledgeId: fld.knowledgeId, repoId: fld.repoId, folderPath: fld.folderPath})-[rel:HAS_KEYWORD]->() +DELETE rel +`; + +const BATCH_ATTACH_FOLDER_KEYWORDS = ` +UNWIND $pairs AS p +MATCH (folder:Folder {orgId: p.orgId, knowledgeId: p.knowledgeId, repoId: p.repoId, folderPath: p.folderPath}) +MERGE (kw:Keyword {name: p.name}) +MERGE (folder)-[:HAS_KEYWORD]->(kw) +`; + +export async function upsertFolderNodesBatch(inputs: readonly UpsertFolderNodeInput[]): Promise { + if (inputs.length === 0) { + return; + } + const updatedAt = new Date().toISOString(); + const folders = inputs.map((input) => ({ + orgId: 
input.scope.orgId, + knowledgeId: input.scope.knowledgeId, + repoId: input.scope.repoId, + folderPath: input.folderPath, + purpose: input.summary.purpose, + summary: input.summary.summary, + dependencyGraph: input.summary.dependencyGraph, + })); + const folderKeys = inputs.map((input) => ({ + orgId: input.scope.orgId, + knowledgeId: input.scope.knowledgeId, + repoId: input.scope.repoId, + folderPath: input.folderPath, + })); + const keywordPairs: Array> = []; + for (const input of inputs) { + for (const raw of input.summary.keywords) { + keywordPairs.push({ + orgId: input.scope.orgId, + knowledgeId: input.scope.knowledgeId, + repoId: input.scope.repoId, + folderPath: input.folderPath, + name: raw.toLowerCase(), + }); + } + } + + const steps: CypherStep[] = [ + { query: BATCH_UPSERT_FOLDERS, params: { folders, updatedAt } }, + { query: BATCH_CLEAR_FOLDER_KEYWORDS, params: { folders: folderKeys } }, + ]; + if (keywordPairs.length > 0) { + steps.push({ query: BATCH_ATTACH_FOLDER_KEYWORDS, params: { pairs: keywordPairs } }); + } + + await _runInTransaction(steps); +} + export async function upsertFolderNode(input: UpsertFolderNodeInput): Promise { const scope = input.scope; const params = { diff --git a/packages/neo4j/src/index.ts b/packages/neo4j/src/index.ts index e4e2d54..c581c80 100644 --- a/packages/neo4j/src/index.ts +++ b/packages/neo4j/src/index.ts @@ -5,15 +5,20 @@ export type { PingResult } from "./client.ts"; export { ensureKnowledgeIndexes } from "./indexes.ts"; export { ensureFlatFolderIndexes } from "./flatFolderIndexes.ts"; -export { upsertKnowledgeNode, setKnowledgeStateInGraph, deleteKnowledgeGraph } from "./knowledge.ts"; +export { + upsertKnowledgeNode, + setKnowledgeStateInGraph, + setKnowledgeBranchInGraph, + deleteKnowledgeGraph, +} from "./knowledge.ts"; -export { upsertFileNode, deleteFileNodes } from "./files.ts"; +export { upsertFileNode, upsertFileNodesBatch, deleteFileNodes } from "./files.ts"; export type { UpsertFileNodeInput } from 
"./files.ts"; export { upsertRepoNode } from "./repo.ts"; export type { NodeScope, RepoSummaryPayload, UpsertRepoNodeInput } from "./repo.ts"; -export { upsertFolderNode } from "./folder.ts"; +export { upsertFolderNode, upsertFolderNodesBatch } from "./folder.ts"; export type { FolderSummaryPayload, UpsertFolderNodeInput } from "./folder.ts"; export { snapshotFilesToVersion } from "./fileVersions.ts"; diff --git a/packages/neo4j/src/knowledge.ts b/packages/neo4j/src/knowledge.ts index fcb8043..8721817 100644 --- a/packages/neo4j/src/knowledge.ts +++ b/packages/neo4j/src/knowledge.ts @@ -1,5 +1,5 @@ import path from "node:path"; -import type { KnowledgeDoc, KnowledgeSource, KnowledgeState } from "@bb/types"; +import type { KnowledgeDoc, KnowledgeState } from "@bb/types"; import { _runCypher } from "./client.ts"; const UPSERT_KNOWLEDGE = ` @@ -18,6 +18,11 @@ MATCH (k:Knowledge {knowledgeId: $knowledgeId}) SET k.state = $state, k.updatedAt = $updatedAt `; +const SET_BRANCH = ` +MATCH (k:Knowledge {knowledgeId: $knowledgeId}) +SET k.branch = $branch, k.updatedAt = $updatedAt +`; + const DELETE_FILES_BY_KNOWLEDGE = ` MATCH (f:File {knowledgeId: $knowledgeId}) DETACH DELETE f @@ -52,14 +57,14 @@ DELETE n export async function upsertKnowledgeNode(doc: KnowledgeDoc): Promise { const sourceKind = doc.source.kind; - const sourceUrl = doc.source.kind === "github" ? doc.source.repoUrl : doc.source.sourcePath; - const branch = doc.source.kind === "github" ? (doc.source.branch ?? null) : null; + const sourceUrl = doc.source.kind === "github" ? (doc.info.repoUrl ?? "") : doc.source.sourcePath; + const branch = doc.source.kind === "github" ? (doc.info.branch ?? 
null) : null; await _runCypher(UPSERT_KNOWLEDGE, { knowledgeId: doc.knowledgeId, sourceKind, sourceUrl, branch, - repoName: deriveRepoName(doc.source), + repoName: deriveRepoName(doc), state: doc.status.state, createdAt: doc.createdAt.toISOString(), updatedAt: doc.updatedAt.toISOString(), @@ -74,6 +79,14 @@ export async function setKnowledgeStateInGraph(knowledgeId: string, state: Knowl }); } +export async function setKnowledgeBranchInGraph(knowledgeId: string, branch: string): Promise { + await _runCypher(SET_BRANCH, { + knowledgeId, + branch, + updatedAt: new Date().toISOString(), + }); +} + export async function deleteKnowledgeGraph(knowledgeId: string): Promise { await _runCypher(DELETE_FILES_BY_KNOWLEDGE, { knowledgeId }); await _runCypher(DELETE_ORPHAN_FILES); @@ -81,11 +94,11 @@ export async function deleteKnowledgeGraph(knowledgeId: string): Promise { await _runCypher(DELETE_ORPHAN_ENTITIES); } -function deriveRepoName(source: KnowledgeSource): string { - if (source.kind === "local") { - return path.basename(source.sourcePath); +function deriveRepoName(doc: KnowledgeDoc): string { + if (doc.source.kind === "local") { + return path.basename(doc.source.sourcePath); } - return repoNameFromGithubUrl(source.repoUrl); + return repoNameFromGithubUrl(doc.info.repoUrl ?? 
""); } function repoNameFromGithubUrl(repoUrl: string): string { diff --git a/packages/neo4j/tsconfig.json b/packages/neo4j/tsconfig.json index c2104f6..4ed0786 100644 --- a/packages/neo4j/tsconfig.json +++ b/packages/neo4j/tsconfig.json @@ -1,8 +1,4 @@ { "extends": "../../tsconfig.base.json", - "compilerOptions": { - "rootDir": "./src", - "outDir": "./dist" - }, - "include": ["src/**/*"] + "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/queue/package.json b/packages/queue/package.json index 308c1eb..99230be 100644 --- a/packages/queue/package.json +++ b/packages/queue/package.json @@ -8,6 +8,9 @@ "exports": { ".": "./src/index.ts" }, + "imports": { + "#src/*": "./src/*" + }, "dependencies": { "@bb/config": "workspace:*", "@bb/errors": "workspace:*", diff --git a/packages/queue/src/manager.ts b/packages/queue/src/manager.ts index 456f75d..3d2f45d 100644 --- a/packages/queue/src/manager.ts +++ b/packages/queue/src/manager.ts @@ -1,4 +1,5 @@ -import { Queue, Worker } from "bullmq"; +import type { Worker } from "bullmq"; +import { Queue } from "bullmq"; import { JobType } from "@bb/types"; import { QueueConnectError, QueueNotConnectedError } from "@bb/errors"; import { getRedisConnection } from "@bb/redis"; diff --git a/packages/queue/src/workers.ts b/packages/queue/src/workers.ts index 5ef8e24..3bfe659 100644 --- a/packages/queue/src/workers.ts +++ b/packages/queue/src/workers.ts @@ -40,6 +40,7 @@ function defaultConcurrencyFor(type: JobType): number { case JobType.GithubIndex: case JobType.GithubPull: case JobType.LocalIngest: + case JobType.BusinessContextProcessing: return getConfigValue(Config.ConcurrencyGithub); } } diff --git a/packages/queue/tsconfig.json b/packages/queue/tsconfig.json index c2104f6..4ed0786 100644 --- a/packages/queue/tsconfig.json +++ b/packages/queue/tsconfig.json @@ -1,8 +1,4 @@ { "extends": "../../tsconfig.base.json", - "compilerOptions": { - "rootDir": "./src", - "outDir": "./dist" - }, - "include": 
["src/**/*"] + "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/redis/package.json b/packages/redis/package.json index 400bfc7..3aeccf2 100644 --- a/packages/redis/package.json +++ b/packages/redis/package.json @@ -8,6 +8,9 @@ "exports": { ".": "./src/index.ts" }, + "imports": { + "#src/*": "./src/*" + }, "dependencies": { "@bb/config": "workspace:*", "@bb/errors": "workspace:*", diff --git a/packages/redis/tsconfig.json b/packages/redis/tsconfig.json index c2104f6..4ed0786 100644 --- a/packages/redis/tsconfig.json +++ b/packages/redis/tsconfig.json @@ -1,8 +1,4 @@ { "extends": "../../tsconfig.base.json", - "compilerOptions": { - "rootDir": "./src", - "outDir": "./dist" - }, - "include": ["src/**/*"] + "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/server/package.json b/packages/server/package.json index 2a3915c..aa828af 100644 --- a/packages/server/package.json +++ b/packages/server/package.json @@ -8,6 +8,9 @@ "exports": { ".": "./src/index.ts" }, + "imports": { + "#src/*": "./src/*" + }, "bin": { "bytebell-server": "./src/index.ts" }, diff --git a/packages/server/src/deleteRoute.ts b/packages/server/src/deleteRoute.ts index 72788eb..a35a63f 100644 --- a/packages/server/src/deleteRoute.ts +++ b/packages/server/src/deleteRoute.ts @@ -40,7 +40,6 @@ export function buildDeleteRoute(): Router { jobsRemoved: removedJobs.removed, mongoDeleted: mongoResult.knowledgeDeleted, rawDeleted: mongoResult.rawDeleted, - statsDeleted: mongoResult.statsDeleted, }); }); return router; diff --git a/packages/server/src/githubCommitsRoute.ts b/packages/server/src/githubCommitsRoute.ts index a8ee67a..2bc909b 100644 --- a/packages/server/src/githubCommitsRoute.ts +++ b/packages/server/src/githubCommitsRoute.ts @@ -51,13 +51,25 @@ export function buildGithubCommitsRoute(): Router { .json({ error: `commits endpoint is only supported for github knowledge (kind=${knowledge.source.kind})` }); return; } - const branch = 
knowledge.source.branch ?? "main"; + const branch = knowledge.info.branch ?? "main"; + const repoUrl = knowledge.info.repoUrl; + if (repoUrl === undefined || repoUrl.length === 0) { + res.status(422).json({ error: "commits endpoint requires knowledge.info.repoUrl" }); + return; + } const gitToken = extractBearerToken(req.headers["authorization"]); - const result = await fetchRecentCommits(knowledge.source.repoUrl, branch, limit, gitToken); + const result = await fetchRecentCommits(repoUrl, branch, limit, gitToken); switch (result.status) { case "ok": { - const payload: CommitsResponse = { knowledgeId, branch, commits: result.commits }; + const commits = result.commits.map((c) => ({ + hash: c.sha, + shortHash: c.sha.slice(0, 7), + subject: c.message.split("\n")[0] ?? "", + author: c.author, + date: c.timestamp, + })); + const payload: CommitsResponse = { knowledgeId, branch, commits }; res.status(200).json(payload); return; } diff --git a/packages/server/src/githubIndexRoute.ts b/packages/server/src/githubIndexRoute.ts index 13aa084..e92dde0 100644 --- a/packages/server/src/githubIndexRoute.ts +++ b/packages/server/src/githubIndexRoute.ts @@ -31,7 +31,8 @@ export function buildGithubIndexRoute(): Router { const now = new Date(); const doc: KnowledgeDoc = { knowledgeId, - source: { kind: "github", repoUrl, ...(branch !== undefined ? { branch } : {}) }, + source: { kind: "github" }, + info: { repoUrl, ...(branch !== undefined ? 
{ branch } : {}) }, status: { state: KnowledgeState.Created }, createdAt: now, updatedAt: now, diff --git a/packages/server/src/githubProbeRoute.ts b/packages/server/src/githubProbeRoute.ts new file mode 100644 index 0000000..fb1f596 --- /dev/null +++ b/packages/server/src/githubProbeRoute.ts @@ -0,0 +1,60 @@ +import type { Request, Response, Router } from "express"; +import express from "express"; +import { fetchDefaultBranch, fetchBranches } from "@bb/ingest-github"; + +interface ProbeBody { + repoUrl?: unknown; + gitToken?: unknown; + branch?: unknown; +} + +export function buildGithubProbeRoute(): Router { + const router = express.Router(); + router.post("/api/v1/github/probe", async (req: Request, res: Response) => { + const body = req.body as ProbeBody; + if (typeof body.repoUrl !== "string" || body.repoUrl.length === 0) { + res.status(400).json({ error: "repoUrl required" }); + return; + } + const repoUrl = body.repoUrl; + const gitToken = typeof body.gitToken === "string" && body.gitToken.length > 0 ? body.gitToken : undefined; + const targetBranch = typeof body.branch === "string" && body.branch.length > 0 ? body.branch : undefined; + + const result = await fetchDefaultBranch(repoUrl, gitToken); + switch (result.status) { + case "ok": { + const defaultBranch = result.branch; + const branchesResult = await fetchBranches(repoUrl, gitToken); + const branches = branchesResult.status === "ok" ? branchesResult.branches : []; + + if (targetBranch !== undefined && branchesResult.status === "ok" && !branches.includes(targetBranch)) { /* a failed branch listing leaves `branches` empty; only a successful listing may report branch_not_found */ + const suggestions = branches + .filter((b: string) => b.toLowerCase().includes(targetBranch.toLowerCase())) + .slice(0, 10); + res.status(404).json({ + status: "branch_not_found", + message: `Branch '${targetBranch}' not found.`, + branches: suggestions.length > 0 ? 
suggestions : branches.slice(0, 20), + }); + return; + } + + res.status(200).json({ status: "ok", defaultBranch, branches }); + break; + } + case "not_found": + res.status(404).json({ status: "not_found", message: "Repository not found or private." }); + break; + case "unauthorized": + res.status(401).json({ status: "unauthorized", message: "GitHub token rejected." }); + break; + case "rate_limited": + res.status(429).json({ status: "rate_limited", message: "GitHub rate limit reached." }); + break; + case "error": + res.status(502).json({ status: "error", message: result.message }); + break; + } + }); + return router; +} diff --git a/packages/server/src/githubPullRoute.ts b/packages/server/src/githubPullRoute.ts index f80d79f..0b72a81 100644 --- a/packages/server/src/githubPullRoute.ts +++ b/packages/server/src/githubPullRoute.ts @@ -71,11 +71,16 @@ export function buildGithubPullRoute(): Router { return; } - const branch = knowledge.source.branch ?? "main"; + const branch = knowledge.info.branch ?? 
"main"; + const repoUrl = knowledge.info.repoUrl; + if (repoUrl === undefined || repoUrl.length === 0) { + res.status(422).json({ error: "pull requires knowledge.info.repoUrl" }); + return; + } let targetCommit = suppliedTarget; if (targetCommit === undefined) { try { - const head = await fetchLatestCommitHash(knowledge.source.repoUrl, branch, gitToken); + const head = await fetchLatestCommitHash(repoUrl, branch, gitToken); if (head !== null && COMMIT_HASH_RE.test(head)) { targetCommit = head; } diff --git a/packages/server/src/localIndexRoute.ts b/packages/server/src/localIndexRoute.ts index 532f33b..326185e 100644 --- a/packages/server/src/localIndexRoute.ts +++ b/packages/server/src/localIndexRoute.ts @@ -48,6 +48,7 @@ export function buildLocalIndexRoute(): Router { const doc: KnowledgeDoc = { knowledgeId, source: { kind: "local", sourcePath }, + info: {}, status: { state: KnowledgeState.Created }, createdAt: now, updatedAt: now, diff --git a/packages/server/src/routes.ts b/packages/server/src/routes.ts index 7ddf100..1738790 100644 --- a/packages/server/src/routes.ts +++ b/packages/server/src/routes.ts @@ -2,6 +2,7 @@ import type { Application } from "express"; import { mountMcp } from "@bb/mcp"; import { buildHealthRoute } from "./healthRoute.ts"; import { buildGithubIndexRoute } from "./githubIndexRoute.ts"; +import { buildGithubProbeRoute } from "./githubProbeRoute.ts"; import { buildGithubPullRoute } from "./githubPullRoute.ts"; import { buildGithubCommitsRoute } from "./githubCommitsRoute.ts"; import { buildLocalIndexRoute } from "./localIndexRoute.ts"; @@ -13,6 +14,7 @@ import { buildMcpStatsRoute } from "./mcpStatsRoute.ts"; export function registerRoutes(app: Application): void { app.use(buildHealthRoute()); app.use(buildGithubIndexRoute()); + app.use(buildGithubProbeRoute()); app.use(buildGithubPullRoute()); app.use(buildGithubCommitsRoute()); app.use(buildLocalIndexRoute()); diff --git a/packages/server/tsconfig.json b/packages/server/tsconfig.json 
index 8d1771a..4ed0786 100644 --- a/packages/server/tsconfig.json +++ b/packages/server/tsconfig.json @@ -1,9 +1,4 @@ { "extends": "../../tsconfig.base.json", - "compilerOptions": { - "rootDir": "./src", - "outDir": "./dist" - }, - "include": ["src/**/*"], - "references": [{ "path": "../ingest-github" }] + "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/types/README.md b/packages/types/README.md index ba23e6c..7dd5780 100644 --- a/packages/types/README.md +++ b/packages/types/README.md @@ -15,36 +15,39 @@ Single home for shared types and enums that cross package boundaries: `@bb/logger`, `@bb/mongo` — refer to it without wanting an implementation dependency on `@bb/config`'s schema/loader/writer. - `JobType`, `JobPriority`, `JobMessage

`, `GithubIndexPayload`, - `GithubPullPayload`, `PayloadFor` — the queue/job vocabulary shared - between `@bb/queue` (publisher) and future `@bb/ingest-*` packages - (worker handlers). + `GithubPullPayload`, `LocalIngestPayload`, `PayloadFor`, + `PayloadLlmOverrides` — the queue/job vocabulary shared between + `@bb/queue` (publisher) and `@bb/ingest-*` packages (worker handlers). + `PayloadLlmOverrides` is the optional `{ llmApiKey?, llmProvider?, +llmModel?, llmKeyId? }` mixin that lets downstream consumers carry per-job + LLM credentials through the payload (the extension point used by the + enterprise wrapper to inject per-org credentials at the enqueue + boundary). `llmProvider` is intentionally typed as `string` rather than + a closed union — OSS standalone uses `"openrouter"`/`"ollama"`, but + downstream consumers may carry richer taxonomies (`"anthropic"`, + `"gemini"`, …) that OSS ignores at runtime. `llmKeyId` is opaque to OSS; + it's an audit pointer kept by downstream consumers. Mixed into both + GitHub payloads. - `KnowledgeState` — the processing-status lifecycle enum (`CREATED → QUEUED → INGESTED → PROCESSING → PROCESSED ↘ FAILED`) referenced by `@bb/queue` (writes `QUEUED`), `@bb/mongo` (`setKnowledgeState`), and future ingest workers. - -Future inhabitants (added on need basis): full `Knowledge`, `Raw`, -`Node`, `MCP*` document shapes — the cross-package domain types named in +- `KnowledgeDoc`, `KnowledgeSource`, `GithubKnowledgeSource`, + `LocalKnowledgeSource`, `KnowledgeInfo` — the cross-package shape of the + Mongo `knowledge` document. Split into two substructures with + non-overlapping responsibilities: `KnowledgeSource` discriminates the + upstream type (github vs local) and carries per-kind ingestion state — + for github, the current head commit and the full commit history; for + local, the on-disk path. 
`KnowledgeInfo` carries the repo coordinates the + pipeline reads on every run (URL and branch); it has an open shape so + downstream consumers can attach extra fields without forcing schema + changes here. The pull pipeline reads URL and branch off `KnowledgeInfo` + directly — there is no fallback chain to `KnowledgeSource`. + +Future inhabitants (added on need basis): full `Raw`, `Node`, `MCP*` +document shapes — the cross-package domain types named in [docs/arch.md:69](../../docs/arch.md#L69). -## Public exports - -```ts -enum Config { ... } - -enum JobType { GithubIndex, GithubPull } -enum JobPriority { Low, Normal, High } -interface GithubIndexPayload { knowledgeId, repoUrl, branch?, commitHash?, gitToken? } -interface GithubPullPayload { knowledgeId, targetCommitHash?, gitToken? } -interface JobMessage

{ id, type, priority, knowledgeId, attempt, createdAt, payload } -type PayloadFor - -enum KnowledgeState { Created, Queued, Ingested, Processing, Processed, Failed } -``` - -Add new shared types here only when **two or more** packages need to refer -to the same shape. - ## Data ownership None. This package owns no runtime state — only types and enum members. diff --git a/packages/types/package.json b/packages/types/package.json index 3fc568f..e29793f 100644 --- a/packages/types/package.json +++ b/packages/types/package.json @@ -8,5 +8,8 @@ "exports": { ".": "./src/index.ts" }, + "imports": { + "#src/*": "./src/*" + }, "dependencies": {} } diff --git a/packages/types/src/README.md b/packages/types/src/README.md index 3b21576..d9c4fa0 100644 --- a/packages/types/src/README.md +++ b/packages/types/src/README.md @@ -16,17 +16,55 @@ package-level contract; this file documents how the source tree is split. - **[job.ts](job.ts)** — the queue vocabulary: `JobType` (today: GitHub index + pull, local ingest), `JobPriority`, the per-type payload interfaces (`GithubIndexPayload`, `GithubPullPayload`, - `LocalIngestPayload`), the `JobMessage

` envelope wrapping payloads - as BullMQ `job.data`, and the `PayloadFor` type-level dispatcher. - Shared between `@bb/queue` (publisher) and future `@bb/ingest-*` - packages (worker handlers). Ingest payloads carry an optional - `orgId?: string` override; OSS callers omit it and the pipeline reads - `Config.OrgId` from `~/.bytebell/config.json` (locked to `"local"` - in OSS builds; downstream enterprise builds set `orgId` per-job). + `LocalIngestPayload`), the `PayloadLlmOverrides` mixin, the + `JobMessage

` envelope wrapping payloads as BullMQ `job.data`, and + the `PayloadFor` type-level dispatcher. Shared between `@bb/queue` + (publisher) and `@bb/ingest-*` packages (worker handlers). Ingest + payloads carry an optional `orgId?: string` override; OSS callers omit + it and the pipeline reads `Config.OrgId` from `~/.bytebell/config.json` + (locked to `"local"` in OSS builds; downstream enterprise builds set + `orgId` per-job). Both GitHub payloads also extend `PayloadLlmOverrides` + which adds optional `llmApiKey?`, `llmProvider?: string`, `llmModel?`, + `llmKeyId?` — the extension point that lets downstream enterprise + builds resolve per-org LLM credentials at the enqueue boundary and + pass them through the payload. `llmProvider` is `string` (not a closed + union) so multi-provider enterprise consumers can carry `"anthropic"`, + `"gemini"`, etc.; OSS narrows to `"openrouter"`/`"ollama"` at the LLM + client boundary. `llmKeyId` is opaque audit metadata OSS ignores. OSS + standalone leaves all four fields unset and the pipeline falls back to + `Config.OpenrouterApiKey` + `Config.LlmProvider`. `GithubPullPayload` + also carries an optional `orgId?` so downstream multi-tenant workers + can scope Mongo/Neo4j lookups by org. - **[knowledge.ts](knowledge.ts)** — the `KnowledgeState` enum modeling - the lifecycle in [CLAUDE.md](../../../CLAUDE.md). v0 only ships the - enum; the full `Knowledge` document interface lands when domain CRUD - helpers in `@bb/mongo` need it. + the lifecycle in [CLAUDE.md](../../../CLAUDE.md), plus the + `KnowledgeDoc` document interface and its substructures: + - `KnowledgeSource` is a discriminated union (`GithubKnowledgeSource | LocalKnowledgeSource`) + that captures **what kind of upstream produced this knowledge** plus per-kind + state. For github: `commitId` (current head) and `commitHashes` (history). + For local: `sourcePath`. `source` does **not** carry `repoUrl` or `branch` — + those live on `info` (see below). 
+ - `KnowledgeInfo` carries the human-readable repo coordinates the pipeline + needs every run: `repoUrl`, `branch`, plus an open index signature so + downstream consumers can stash extra fields without forcing schema changes + here. The pull pipeline reads `knowledge.info.repoUrl` / `knowledge.info.branch` + directly — that's the single source of truth for the URL/branch, no fallback. + - `KnowledgeFailureCategory` is a closed union covering the operator-facing + failure taxonomy: `"llm_config"` (no key), `"llm_auth"` (401/403), + `"llm_quota"` (402), `"llm_rate_limit"` (429), `"llm_unreachable"` + (5xx / network / timeout), `"cancelled"`, `"internal"`. The + HTTP-status → category mapping lives in + `@bb/ingest-github/src/pipeline/failure-classifier.ts`. + - `KnowledgeFailure` is the structured failure record: + `{ reason: string; category: KnowledgeFailureCategory; at: Date; detail?: string }`. + `reason` is a single short operator-readable sentence (UI surfaces it + directly), `detail` is the raw provider response body (UI hides it + behind a disclosure). + - `KnowledgeDoc` carries both: `source` for upstream-type + indexed-commit + state, `info` for repo coordinates. Both are required on every doc. The + optional `failure?: KnowledgeFailure` field is populated when + `status.state === FAILED` and cleared automatically by the next + `setKnowledgeState` call (the function `$unset`s it on transitions out + of FAILED). 
## Module dependency graph diff --git a/packages/types/src/config.ts b/packages/types/src/config.ts index 882381a..c878718 100644 --- a/packages/types/src/config.ts +++ b/packages/types/src/config.ts @@ -23,6 +23,10 @@ export enum Config { BigFileConcurrency = "big.file.concurrency", AbsoluteFileSizeCap = "absolute.file.size.cap", ConcurrentWorkers = "concurrent.workers", + LlmConcurrency = "llm.concurrency", + FolderSummaryBatchSize = "folder.summary.batch.size", + FolderSummaryBatchMaxFiles = "folder.summary.batch.max.files", + Neo4jBatchSize = "neo4j.batch.size", CondenseContextLimit = "condense.context.limit", CondensePromptOverhead = "condense.prompt.overhead", SmallFileDedupThreshold = "small.file.dedup.threshold", diff --git a/packages/types/src/index.ts b/packages/types/src/index.ts index e6ccf57..b5171f8 100644 --- a/packages/types/src/index.ts +++ b/packages/types/src/index.ts @@ -1,15 +1,23 @@ export { Config } from "./config.ts"; export { JobType, JobPriority } from "./job.ts"; -export type { GithubIndexPayload, GithubPullPayload, LocalIngestPayload, JobMessage, PayloadFor } from "./job.ts"; +export type { + GithubIndexPayload, + GithubPullPayload, + LocalIngestPayload, + BusinessContextProcessingPayload, + JobMessage, + PayloadFor, + PayloadLlmOverrides, +} from "./job.ts"; export { KnowledgeState } from "./knowledge.ts"; -export type { GithubKnowledgeSource, KnowledgeDoc, KnowledgeSource, LocalKnowledgeSource } from "./knowledge.ts"; export type { - ModelTokenBreakdown, - ModelTokenUsage, - ProcessingStatsDoc, - StatsCommitEntry, - StatsRepoEntry, - StatsResponse, - StatsTotals, -} from "./stats.ts"; + GithubKnowledgeSource, + KnowledgeDoc, + KnowledgeFailure, + KnowledgeFailureCategory, + KnowledgeInfo, + KnowledgeSource, + LocalKnowledgeSource, +} from "./knowledge.ts"; +export type { StatsCommitEntry, StatsRepoEntry, StatsResponse, StatsTotals } from "./stats.ts"; export type { UsageDoc, ActivityDoc, UsageIncrement, ActivityInput } from 
"./usage.ts"; diff --git a/packages/types/src/job.ts b/packages/types/src/job.ts index 9eccd26..7d42db6 100644 --- a/packages/types/src/job.ts +++ b/packages/types/src/job.ts @@ -2,6 +2,7 @@ export enum JobType { GithubIndex = "github_index", GithubPull = "github_pull", LocalIngest = "local_ingest", + BusinessContextProcessing = "CUSTOM_CONTEXT_PROCESSING", } export enum JobPriority { @@ -10,7 +11,28 @@ export enum JobPriority { High = 2, } -export interface GithubIndexPayload { +/** + * Optional per-job LLM credential overrides. When set, take precedence over + * `Config.OpenrouterApiKey` and `Config.LlmProvider` for the duration of this + * job's processing. Used by downstream consumers (e.g. the enterprise wrapper) + * that resolve per-org credentials at the enqueue boundary and infuse them + * into the payload — OSS standalone leaves all four unset. + * + * `llmProvider` is intentionally `string` rather than a closed union: OSS + * standalone uses `"openrouter"` or `"ollama"` (the only values the LLM + * client routes on today), but downstream consumers may carry richer + * provider taxonomies (`"anthropic"`, `"gemini"`, `"mistral"`, …) that the + * OSS client ignores. The `llmKeyId` field is opaque to OSS — kept as an + * audit pointer back to the resolver's source of truth. + */ +export interface PayloadLlmOverrides { + llmApiKey?: string; + llmProvider?: string; + llmModel?: string; + llmKeyId?: string; +} + +export interface GithubIndexPayload extends PayloadLlmOverrides { knowledgeId: string; repoUrl: string; branch?: string; @@ -19,8 +41,14 @@ export interface GithubIndexPayload { orgId?: string; } -export interface GithubPullPayload { +export interface GithubPullPayload extends PayloadLlmOverrides { knowledgeId: string; + /** + * Optional org binding. OSS standalone leaves this unset and the pipeline + * reads `Config.OrgId` (locked to `"local"`). Downstream multi-tenant + * deployments stamp it from the request so worker lookups can scope by org. 
+ */ + orgId?: string; /** * Optional commit to re-index the knowledge to. Must be a 40-character hex SHA * and must be reachable from `origin/<branch>`. When omitted, the @@ -38,6 +66,33 @@ export interface LocalIngestPayload { orgId?: string; } +/** + * Payload for the BusinessContext processing job. A BusinessContext is a free-text + * note authored by a human against a specific indexed commit of a GitHub knowledge. + * The worker analyses the text into structured product/technical fields, persists + * it to the per-commit meta tree on disk, and projects it into Neo4j as a + * `:BusinessContext` node plus a `:BusinessContextVersion` snapshot keyed by + * `(knowledgeId, commitHash)`. + * + * `orgId` is single-tenant (`"local"`) in OSS; downstream multi-tenant deployments + * stamp it from the request so org-scoped keyword nodes stay isolated. + */ +export interface BusinessContextProcessingPayload extends PayloadLlmOverrides { + knowledgeId: string; + /** 40-char hex SHA of the commit this business context applies to. */ + commitHash: string; + /** Raw, user-authored business-context text. */ + customText: string; + /** Optional human-supplied description for the job-tracking record. */ + description?: string; + /** Optional repo URL (carried for audit; ingestion does not re-clone). */ + repoUrl?: string; + /** Optional branch (carried for audit). */ + branch?: string; + /** Tenant binding. OSS standalone leaves this unset (defaults to `"local"`). */ + orgId?: string; +} + export interface JobMessage

{ id: string; type: JobType; @@ -54,4 +109,6 @@ export type PayloadFor = T extends JobType.GithubIndex ? GithubPullPayload : T extends JobType.LocalIngest ? LocalIngestPayload - : never; + : T extends JobType.BusinessContextProcessing + ? BusinessContextProcessingPayload + : never; diff --git a/packages/types/src/knowledge.ts b/packages/types/src/knowledge.ts index a6e1309..aa6f77b 100644 --- a/packages/types/src/knowledge.ts +++ b/packages/types/src/knowledge.ts @@ -7,14 +7,20 @@ export enum KnowledgeState { Failed = "FAILED", } +export interface CommitHashRecord { + hash: string; + inputTokens: string; + outputTokens: string; + /** Authoritative provider-reported cost in USD (OpenRouter `usage.cost`). "0" for Ollama or when omitted by provider. */ + costUsd: string; +} + export interface GithubKnowledgeSource { kind: "github"; - repoUrl: string; - branch?: string; /** Current head pointer — the most recently indexed commit. */ commitId?: string; /** Every commit this knowledge has been indexed at, oldest → newest. Pull appends to this list. */ - commitHashes?: string[]; + commitHashes?: (string | CommitHashRecord)[]; } export interface LocalKnowledgeSource { @@ -24,10 +30,54 @@ export interface LocalKnowledgeSource { export type KnowledgeSource = GithubKnowledgeSource | LocalKnowledgeSource; +export interface KnowledgeInfo { + repoUrl?: string; + branch?: string; + git_url?: string; + githubInfo?: { commitId?: string; commitHashes?: string[]; branchName?: string }; + [key: string]: unknown; +} + +/** + * Categorises why a knowledge ingestion failed. Drives operator triage and + * downstream UI hints. 
+ * + * - `llm_config` — missing or empty API key (operator action required) + * - `llm_auth` — 401/403 from provider, key invalid/expired (operator action) + * - `llm_quota` — 402, credit/billing exhausted (operator action) + * - `llm_rate_limit` — 429, transient — could be retried later by operator + * - `llm_unreachable` — 5xx / network / timeout (transient infra issue) + * - `cancelled` — operator-initiated cancellation + * - `internal` — anything else (bug, infra, unexpected exception) + */ +export type KnowledgeFailureCategory = + | "llm_config" + | "llm_auth" + | "llm_quota" + | "llm_rate_limit" + | "llm_unreachable" + | "cancelled" + | "internal"; + +export interface KnowledgeFailure { + /** Short, operator-readable sentence. UI can render this directly. */ + reason: string; + category: KnowledgeFailureCategory; + at: Date; + /** Raw provider response or structured detail for debugging. May be long. */ + detail?: string; +} + export interface KnowledgeDoc { knowledgeId: string; source: KnowledgeSource; status: { state: KnowledgeState; totalFiles?: number; processedFiles?: number }; createdAt: Date; updatedAt: Date; + info: KnowledgeInfo; + /** + * Populated when `status.state === KnowledgeState.Failed`. Cleared + * automatically on the next successful transition out of FAILED. 
+ */ + failure?: KnowledgeFailure; } diff --git a/packages/types/src/stats.ts b/packages/types/src/stats.ts index cdfee5b..5e1e1e7 100644 --- a/packages/types/src/stats.ts +++ b/packages/types/src/stats.ts @@ -1,27 +1,3 @@ -export interface ModelTokenUsage { - inputTokens: number; - outputTokens: number; -} - -export type ModelTokenBreakdown = Record; - -export interface ProcessingStatsDoc { - knowledgeId: string; - repoName: string; - commitHash: string; - modelTokens: ModelTokenBreakdown; - inputTokens: number; - outputTokens: number; - estimatedCost: number; - totalBatches: number; - totalFiles: number; - totalFolders: number; - filesAnalyzed: number; - processingTimeMs: number; - createdAt: Date; - updatedAt: Date; -} - export interface StatsTotals { totalRepos: number; totalFiles: number; diff --git a/packages/types/tsconfig.json b/packages/types/tsconfig.json index c2104f6..4ed0786 100644 --- a/packages/types/tsconfig.json +++ b/packages/types/tsconfig.json @@ -1,8 +1,4 @@ { "extends": "../../tsconfig.base.json", - "compilerOptions": { - "rootDir": "./src", - "outDir": "./dist" - }, - "include": ["src/**/*"] + "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/tsconfig.base.json b/tsconfig.base.json index 0275a27..9226217 100644 --- a/tsconfig.base.json +++ b/tsconfig.base.json @@ -6,6 +6,7 @@ "module": "ESNext", "moduleResolution": "bundler", "moduleDetection": "force", + "jsx": "react-jsx", "allowImportingTsExtensions": true, "verbatimModuleSyntax": true, "isolatedModules": true, @@ -36,11 +37,9 @@ "types": ["bun"], - "composite": true, - "declaration": true, - "declarationMap": true, - "sourceMap": true, - "incremental": true, - "noEmit": true + "composite": false, + "declaration": false, + "noEmit": true, + "incremental": true } } diff --git a/tsconfig.json b/tsconfig.json index 97edcbe..80c98f2 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -1,25 +1,8 @@ { "extends": "./tsconfig.base.json", "compilerOptions": { - "composite": 
false, - "declaration": false, - "declarationMap": false, "noEmit": true }, - "files": [], - "references": [ - { "path": "packages/types" }, - { "path": "packages/errors" }, - { "path": "packages/config" }, - { "path": "packages/logger" }, - { "path": "packages/mongo" }, - { "path": "packages/redis" }, - { "path": "packages/queue" }, - { "path": "packages/llm" }, - { "path": "packages/ingest-github" }, - { "path": "packages/cli" }, - { "path": "packages/server" }, - { "path": "packages/neo4j" }, - { "path": "packages/mcp" } - ] + "include": ["packages/*/src/**/*.ts", "packages/*/src/**/*.tsx", "packages/*/src/**/*.json"], + "exclude": ["**/node_modules", "**/dist", "**/*.d.ts"] }