From 8e48469be78dd4a69ef7719ca02c4cb6bd118247 Mon Sep 17 00:00:00 2001 From: Dead-Bytes <143434285+Dead-Bytes@users.noreply.github.com> Date: Wed, 13 May 2026 22:37:13 +0530 Subject: [PATCH 01/34] feat(internal): the llm providers config options enabled --- packages/llm/context.md | 26 +++++++++++++----- packages/llm/src/client.ts | 17 +++++++++++- packages/llm/src/context.md | 50 +++++++++++++++++++++++----------- packages/llm/src/index.ts | 2 +- packages/llm/src/openrouter.ts | 4 +-- packages/types/context.md | 19 +++++++++---- packages/types/src/context.md | 20 +++++++++----- packages/types/src/index.ts | 9 +++++- packages/types/src/job.ts | 17 ++++++++++-- 9 files changed, 121 insertions(+), 43 deletions(-) diff --git a/packages/llm/context.md b/packages/llm/context.md index 96b3719..dbddd0b 100644 --- a/packages/llm/context.md +++ b/packages/llm/context.md @@ -62,11 +62,14 @@ function tokenLen(text: string): number; function encodeTokens(text: string): number[]; function decodeTokens(tokens: number[]): string; +type LlmProviderName = "openrouter" | "ollama"; interface AskLlmOptions { model?: string; // overrides Config.OpenrouterModel fallbackModels?: string[]; // overrides Config.OpenrouterFallbackModel1..4 timeoutMs?: number; // default 90_000 systemPrompt?: string; // optional system role message + apiKey?: string; // per-call OpenRouter key override (ignored for Ollama); skips Config.OpenrouterApiKey + provider?: LlmProviderName; // per-call provider override; skips Config.LlmProvider } interface AskLlmResult { content: string; @@ -124,10 +127,19 @@ it. The cost ledger described in [docs/arch.md](../../docs/arch.md) is at `https://openrouter.ai/api/v1/chat/completions`; Ollama URL is user-configured via `Config.OllamaUrl` (default `http://localhost:11434`). Provider is selected by - `Config.LlmProvider`. -2. **No env reads.** API key + model come from `getConfigValue(...)`. No - `process.env`, no `.env`. Repo-wide ESLint rule blocks `process.env`. -3. 
**OpenRouter-native fallback chain.** The request body sends + `Config.LlmProvider`, or by `opts.provider` when the caller wants to + override on a per-call basis. +2. **Per-call credential override.** When `opts.apiKey` is set, the + OpenRouter call uses it directly and skips `Config.OpenrouterApiKey`. + This is the extension point that lets downstream consumers (e.g. the + enterprise wrapper) pre-resolve per-org credentials at the enqueue + boundary and pass them through job payloads, without the LLM client + knowing anything about per-org resolution. The Ollama provider is + keyless and ignores `opts.apiKey`. +3. **No env reads.** API key + model come from `getConfigValue(...)` or + `opts.apiKey`. No `process.env`, no `.env`. Repo-wide ESLint rule + blocks `process.env`. +4. **OpenRouter-native fallback chain.** The request body sends `models: [primary, ...fallbacks]` whenever the deduplicated chain has ≥2 entries. Primary is `Config.OpenrouterModel`; fallbacks come from four discrete slots `Config.OpenrouterFallbackModel1` through @@ -138,12 +150,12 @@ it. The cost ledger described in [docs/arch.md](../../docs/arch.md) is sees a single `AskLlmResult`. BullMQ's `attempts: 3` wraps the whole call — retries walk the chain again, useful when a transient OpenRouter outage clears between retries. -4. **Errors are typed, not strings.** `LlmConfigError` carries the exact +5. **Errors are typed, not strings.** `LlmConfigError` carries the exact `bytebell keys set` hint; `LlmError` carries `cause`. -5. **Timeout is enforced.** AbortController fires at `timeoutMs`; the +6. **Timeout is enforced.** AbortController fires at `timeoutMs`; the resulting `AbortError` is wrapped in `LlmError` with the timeout in the message. -6. **Tokenizer is module-cached.** `tiktoken`'s `cl100k_base` encoder +7. **Tokenizer is module-cached.** `tiktoken`'s `cl100k_base` encoder is lazy-initialized on first `tokenLen` call and reused for the process lifetime. 
Chosen because every modern OpenRouter chat model tokenizes within ~10% of `cl100k_base` for code-shaped input. Char/4 diff --git a/packages/llm/src/client.ts b/packages/llm/src/client.ts index bf2d837..0aa1634 100644 --- a/packages/llm/src/client.ts +++ b/packages/llm/src/client.ts @@ -7,11 +7,26 @@ import { callOpenRouter, resolveOpenRouterChain } from "./openrouter.ts"; const DEFAULT_TIMEOUT_MS = 360_000; +export type LlmProviderName = "openrouter" | "ollama"; + export interface AskLlmOptions { model?: string; fallbackModels?: string[]; timeoutMs?: number; systemPrompt?: string; + /** + * Per-call override of the OpenRouter API key. When set, takes precedence + * over `Config.OpenrouterApiKey`. Used by downstream consumers (e.g. the + * enterprise wrapper) that resolve per-org credentials at the enqueue + * boundary and pass them through the job payload. Ignored by the Ollama + * provider (which is keyless). + */ + apiKey?: string; + /** + * Per-call override of `Config.LlmProvider`. When set, routes the call to + * the named provider regardless of the configured default. + */ + provider?: LlmProviderName; } export interface AskLlmUsage { @@ -26,7 +41,7 @@ export interface AskLlmResult { } export async function askLLM(prompt: string, opts: AskLlmOptions = {}): Promise { - const provider = getConfigValue(Config.LlmProvider); + const provider: LlmProviderName = opts.provider ?? (getConfigValue(Config.LlmProvider) as LlmProviderName); const timeoutMs = opts.timeoutMs ?? DEFAULT_TIMEOUT_MS; const chain = provider === "ollama" ? resolveOllamaChain(opts) : resolveOpenRouterChain(opts); diff --git a/packages/llm/src/context.md b/packages/llm/src/context.md index 739b3cf..e151bf5 100644 --- a/packages/llm/src/context.md +++ b/packages/llm/src/context.md @@ -6,20 +6,37 @@ package-level contract; this file documents how the source tree is split. ## Files - **[index.ts](index.ts)** — public re-exports. The only entry point other - packages may import. 
Exposes `askLLM` and the `AskLlmOptions` type. - Anything not re-exported here is internal. -- **[client.ts](client.ts)** — the `askLLM` implementation. Reads - `Config.OpenrouterApiKey`, the primary `Config.OpenrouterModel`, and - the four fallback slots `Config.OpenrouterFallbackModel1..4` via - `@bb/config`. Builds the deduplicated chain `[primary, ...nonEmpty -(slot1..4)]`; if the chain has ≥2 entries the request body includes a - `models: [...]` array so OpenRouter routes among them natively. Builds - the `messages` array (optional system prompt + user prompt), POSTs to - OpenRouter via Bun's built-in `fetch` with an AbortController timeout, - parses the typed `OpenRouterResponse`, returns the first choice's - content. `usage.model` reflects which model OpenRouter actually - routed to. Throws `LlmConfigError` if the API key is empty, `LlmError` - on timeout / HTTP non-2xx / empty completion. + packages may import. Exposes `askLLM`, the `AskLlmOptions` type, the + `LlmProviderName` union (`"openrouter" | "ollama"`), plus the JSON + client surface. Anything not re-exported here is internal. +- **[client.ts](client.ts)** — the `askLLM` orchestrator. Selects the + active provider via `opts.provider ?? getConfigValue(Config.LlmProvider)` + (per-call override beats config), dispatches to `openrouter.ts` or + `ollama.ts`. Consults the filesystem decision cache before issuing a + request. Throws typed errors via `@bb/errors`. +- **[openrouter.ts](openrouter.ts)** — `callOpenRouter` and + `resolveOpenRouterChain`. Reads the API key as `opts.apiKey +?? getConfigValue(Config.OpenrouterApiKey)` (per-call override beats + config), reads the model chain (`opts.model`, `opts.fallbackModels`, + or `Config.OpenrouterModel` + four fallback slots), caps the chain at + 3 entries (OpenRouter's hard limit), POSTs to the chat-completions + endpoint with an AbortController timeout, parses the typed + `OpenRouterResponse`, returns the first choice's content. 
`usage.model` + reflects which model OpenRouter actually routed to. Throws + `LlmConfigError` if the API key resolves to empty, `LlmError` on + timeout / HTTP non-2xx / empty completion. +- **[ollama.ts](ollama.ts)** — `callOllama` and `resolveOllamaChain`. + Single-model per request (Ollama has no fan-out). Reads model from + `opts.model ?? Config.OllamaModel`. Ignores `opts.apiKey` (Ollama is + keyless). +- **[jsonClient.ts](jsonClient.ts)** — `askJsonLLM`, `askYesNoLLM`, + `tryParseJson`, `stripJsonFence`. Wraps `askLLM` with JSON-strict + retry logic. Forwards `opts` (including `apiKey` / `provider` / `model`) + to `askLLM` unchanged. +- **[cache.ts](cache.ts)** — filesystem-backed decision cache. Key + includes `provider` and `modelChain`; `opts.apiKey` is intentionally + NOT part of the key (the cached decision is the same regardless of + which key produced it — keys are auth, not semantic input). - **[tokenizer.ts](tokenizer.ts)** — `tokenLen`, `encodeTokens`, `decodeTokens`. Module-cached `tiktoken` encoder using `cl100k_base`, lazy-initialized via `get_encoding`. All three helpers fall back to @@ -56,8 +73,9 @@ pricing). `bytebell keys set` hint; `LlmError` accepts an optional `cause` and composes a single-line message capped at 500 chars of any HTTP error body (so the logger doesn't blow up on multi-MB error responses). -- **No env reads.** Only `getConfigValue(Config.OpenrouterApiKey)` / - `getConfigValue(Config.OpenrouterModel)` provide secrets/config. +- **No env reads.** Secrets come from `opts.apiKey` first, then + `getConfigValue(Config.OpenrouterApiKey)`. Same fallback shape for the + provider switch via `opts.provider` → `Config.LlmProvider`. - **Empty completions are errors.** A 200 OK with no `choices[0].message .content` throws `LlmError("OpenRouter returned empty completion")` — do not silently return an empty string. 
diff --git a/packages/llm/src/index.ts b/packages/llm/src/index.ts index 8137b29..52d2f72 100644 --- a/packages/llm/src/index.ts +++ b/packages/llm/src/index.ts @@ -1,5 +1,5 @@ export { askLLM } from "./client.ts"; -export type { AskLlmOptions, AskLlmResult, AskLlmUsage } from "./client.ts"; +export type { AskLlmOptions, AskLlmResult, AskLlmUsage, LlmProviderName } from "./client.ts"; export { askJsonLLM, askYesNoLLM, tryParseJson, stripJsonFence } from "./jsonClient.ts"; export type { AskJsonLlmOptions, AskJsonLlmResult, AskYesNoLlmResult } from "./jsonClient.ts"; export { estimateCostUsd, estimateCostFromBreakdown } from "./pricing.ts"; diff --git a/packages/llm/src/openrouter.ts b/packages/llm/src/openrouter.ts index fc71acb..6f90228 100644 --- a/packages/llm/src/openrouter.ts +++ b/packages/llm/src/openrouter.ts @@ -27,7 +27,7 @@ interface OpenRouterResponse { } export function resolveOpenRouterChain(opts: AskLlmOptions): string[] { - const apiKey = getConfigValue(Config.OpenrouterApiKey); + const apiKey = opts.apiKey ?? getConfigValue(Config.OpenrouterApiKey); if (apiKey.length === 0) { throw new LlmConfigError("bytebell keys set"); } @@ -45,7 +45,7 @@ export function resolveOpenRouterChain(opts: AskLlmOptions): string[] { } export async function callOpenRouter(prompt: string, opts: AskLlmOptions, timeoutMs: number): Promise { - const apiKey = getConfigValue(Config.OpenrouterApiKey); + const apiKey = opts.apiKey ?? getConfigValue(Config.OpenrouterApiKey); const cappedChain = resolveOpenRouterChain(opts); const model = cappedChain[0] ?? opts.model ?? getConfigValue(Config.OpenrouterModel); diff --git a/packages/types/context.md b/packages/types/context.md index ba23e6c..ad0d679 100644 --- a/packages/types/context.md +++ b/packages/types/context.md @@ -15,9 +15,14 @@ Single home for shared types and enums that cross package boundaries: `@bb/logger`, `@bb/mongo` — refer to it without wanting an implementation dependency on `@bb/config`'s schema/loader/writer. 
- `JobType`, `JobPriority`, `JobMessage<P>
`, `GithubIndexPayload`, - `GithubPullPayload`, `PayloadFor` — the queue/job vocabulary shared - between `@bb/queue` (publisher) and future `@bb/ingest-*` packages - (worker handlers). + `GithubPullPayload`, `LocalIngestPayload`, `PayloadFor`, + `PayloadLlmOverrides` — the queue/job vocabulary shared between + `@bb/queue` (publisher) and `@bb/ingest-*` packages (worker handlers). + `PayloadLlmOverrides` is the optional `{ llmApiKey?, llmProvider?, +llmModel? }` mixin that lets downstream consumers carry per-job LLM + credentials through the payload (the extension point used by + the enterprise wrapper to inject per-org credentials at the enqueue + boundary). Mixed into both GitHub payloads. - `KnowledgeState` — the processing-status lifecycle enum (`CREATED → QUEUED → INGESTED → PROCESSING → PROCESSED ↘ FAILED`) referenced by `@bb/queue` (writes `QUEUED`), `@bb/mongo` (`setKnowledgeState`), and @@ -32,10 +37,12 @@ Future inhabitants (added on need basis): full `Knowledge`, `Raw`, ```ts enum Config { ... } -enum JobType { GithubIndex, GithubPull } +enum JobType { GithubIndex, GithubPull, LocalIngest } enum JobPriority { Low, Normal, High } -interface GithubIndexPayload { knowledgeId, repoUrl, branch?, commitHash?, gitToken? } -interface GithubPullPayload { knowledgeId, targetCommitHash?, gitToken? } +interface PayloadLlmOverrides { llmApiKey?, llmProvider?: "openrouter" | "ollama", llmModel? } +interface GithubIndexPayload extends PayloadLlmOverrides { knowledgeId, repoUrl, branch?, commitHash?, gitToken?, orgId? } +interface GithubPullPayload extends PayloadLlmOverrides { knowledgeId, targetCommitHash?, gitToken? } +interface LocalIngestPayload { knowledgeId, rootDir, orgId? } interface JobMessage<P>
{ id, type, priority, knowledgeId, attempt, createdAt, payload } type PayloadFor diff --git a/packages/types/src/context.md b/packages/types/src/context.md index edce406..e2a6ed1 100644 --- a/packages/types/src/context.md +++ b/packages/types/src/context.md @@ -16,13 +16,19 @@ package-level contract; this file documents how the source tree is split. - **[job.ts](job.ts)** — the queue vocabulary: `JobType` (today: GitHub index + pull, local ingest), `JobPriority`, the per-type payload interfaces (`GithubIndexPayload`, `GithubPullPayload`, - `LocalIngestPayload`), the `JobMessage<P>
` envelope wrapping payloads - as BullMQ `job.data`, and the `PayloadFor` type-level dispatcher. - Shared between `@bb/queue` (publisher) and future `@bb/ingest-*` - packages (worker handlers). Ingest payloads carry an optional - `orgId?: string` override; OSS callers omit it and the pipeline reads - `Config.OrgId` from `~/.bytebell/config.json` (locked to `"local"` - in OSS builds; downstream enterprise builds set `orgId` per-job). + `LocalIngestPayload`), the `PayloadLlmOverrides` mixin, the + `JobMessage<P>
` envelope wrapping payloads as BullMQ `job.data`, and + the `PayloadFor` type-level dispatcher. Shared between `@bb/queue` + (publisher) and `@bb/ingest-*` packages (worker handlers). Ingest + payloads carry an optional `orgId?: string` override; OSS callers omit + it and the pipeline reads `Config.OrgId` from `~/.bytebell/config.json` + (locked to `"local"` in OSS builds; downstream enterprise builds set + `orgId` per-job). Both GitHub payloads also extend `PayloadLlmOverrides` + which adds optional `llmApiKey?`, `llmProvider?`, `llmModel?` — the + extension point that lets downstream enterprise builds resolve per-org + LLM credentials at the enqueue boundary and pass them through the + payload. OSS standalone leaves the LLM fields unset and the pipeline + falls back to `Config.OpenrouterApiKey` + `Config.LlmProvider`. - **[knowledge.ts](knowledge.ts)** — the `KnowledgeState` enum modeling the lifecycle in [CLAUDE.md](../../../CLAUDE.md). v0 only ships the enum; the full `Knowledge` document interface lands when domain CRUD diff --git a/packages/types/src/index.ts b/packages/types/src/index.ts index e6ccf57..99ef34d 100644 --- a/packages/types/src/index.ts +++ b/packages/types/src/index.ts @@ -1,6 +1,13 @@ export { Config } from "./config.ts"; export { JobType, JobPriority } from "./job.ts"; -export type { GithubIndexPayload, GithubPullPayload, LocalIngestPayload, JobMessage, PayloadFor } from "./job.ts"; +export type { + GithubIndexPayload, + GithubPullPayload, + LocalIngestPayload, + JobMessage, + PayloadFor, + PayloadLlmOverrides, +} from "./job.ts"; export { KnowledgeState } from "./knowledge.ts"; export type { GithubKnowledgeSource, KnowledgeDoc, KnowledgeSource, LocalKnowledgeSource } from "./knowledge.ts"; export type { diff --git a/packages/types/src/job.ts b/packages/types/src/job.ts index 9eccd26..6fa8142 100644 --- a/packages/types/src/job.ts +++ b/packages/types/src/job.ts @@ -10,7 +10,20 @@ export enum JobPriority { High = 2, } -export interface 
GithubIndexPayload { +/** + * Optional per-job LLM credential overrides. When set, take precedence over + * `Config.OpenrouterApiKey` and `Config.LlmProvider` for the duration of this + * job's processing. Used by downstream consumers (e.g. the enterprise wrapper) + * that resolve per-org credentials at the enqueue boundary and infuse them + * into the payload — OSS standalone leaves all three unset. + */ +export interface PayloadLlmOverrides { + llmApiKey?: string; + llmProvider?: "openrouter" | "ollama"; + llmModel?: string; +} + +export interface GithubIndexPayload extends PayloadLlmOverrides { knowledgeId: string; repoUrl: string; branch?: string; @@ -19,7 +32,7 @@ export interface GithubIndexPayload { orgId?: string; } -export interface GithubPullPayload { +export interface GithubPullPayload extends PayloadLlmOverrides { knowledgeId: string; /** * Optional commit to re-index the knowledge to. Must be a 40-character hex SHA From 385104b238b7b8b8e3d8d024fcf128d1f2b6ce22 Mon Sep 17 00:00:00 2001 From: Dead-Bytes <143434285+Dead-Bytes@users.noreply.github.com> Date: Wed, 13 May 2026 22:53:16 +0530 Subject: [PATCH 02/34] fix(internal): call sites pass on llm-context according to prev commit --- .../ingest-github/src/adapters/context.md | 9 ++- .../src/adapters/llm-file-analyzer.ts | 10 +++- .../ingest-github/src/pipeline/context.md | 22 ++++++-- packages/ingest-github/src/pipeline/pull.ts | 56 ++++++++++++++++--- packages/ingest-github/src/pipeline/run.ts | 31 +++++++++- packages/ingest-github/src/pipeline/scan.ts | 8 ++- .../src/pipeline/skip-decisions/context.md | 4 +- .../src/pipeline/skip-decisions/decider.ts | 11 +++- .../strategies/flat-folder/analyse-file.ts | 16 ++++-- .../flat-folder/backfill/big-files.ts | 3 + .../flat-folder/backfill/context.md | 3 +- .../strategies/flat-folder/backfill/fields.ts | 9 ++- .../flat-folder/big-file/chunk-analyzer.ts | 6 +- .../flat-folder/big-file/context.md | 14 +++-- .../strategies/flat-folder/big-file/index.ts | 4 +- 
.../src/strategies/flat-folder/context.md | 11 ++++ .../flat-folder/folder-summary-selective.ts | 4 +- .../strategies/flat-folder/folder-summary.ts | 12 +++- .../src/strategies/flat-folder/index.ts | 23 ++++++-- .../phases/classify-and-analyse-small.ts | 11 +++- .../strategies/flat-folder/phases/context.md | 14 +++-- .../flat-folder/phases/process-big-files.ts | 3 + .../strategies/flat-folder/repo-summary.ts | 18 +++--- packages/ingest-github/src/types/context.md | 9 +++ packages/ingest-github/src/types/pipeline.ts | 55 +++++++++++++++++- packages/ingest-github/src/types/strategy.ts | 8 +++ 26 files changed, 303 insertions(+), 71 deletions(-) diff --git a/packages/ingest-github/src/adapters/context.md b/packages/ingest-github/src/adapters/context.md index cd3bfb7..d33d2c1 100644 --- a/packages/ingest-github/src/adapters/context.md +++ b/packages/ingest-github/src/adapters/context.md @@ -13,9 +13,12 @@ Domain. - `llm-file-analyzer.ts` — `createLlmFileAnalyzer(deps)` returns the `FileAnalyzer` port. Deps inject `buildSystemPrompt` and `buildUserPrompt` so the prompts live in `strategies/flat-folder/prompts/` (one-way tier flow - from strategies → adapters via DI, never via import). Also exports - `shapeAnalysis` (raw JSON → `FileAnalysis`, tolerates missing keys) and - `languageFromPath` (extension-based fallback when the LLM omits `language`). + from strategies → adapters via DI, never via import). The returned + `analyze({ relativePath, content, llmCallContext? })` forwards + `llmCallContext` to `askJsonLLM` so per-job LLM credential overrides + reach OpenRouter. Also exports `shapeAnalysis` (raw JSON → + `FileAnalysis`, tolerates missing keys) and `languageFromPath` + (extension-based fallback when the LLM omits `language`). - `index.ts` — barrel. 
## Invariants diff --git a/packages/ingest-github/src/adapters/llm-file-analyzer.ts b/packages/ingest-github/src/adapters/llm-file-analyzer.ts index ac83274..13bda00 100644 --- a/packages/ingest-github/src/adapters/llm-file-analyzer.ts +++ b/packages/ingest-github/src/adapters/llm-file-analyzer.ts @@ -1,4 +1,4 @@ -import { askJsonLLM } from "@bb/llm"; +import { askJsonLLM, type AskLlmOptions } from "@bb/llm"; import { logger } from "@bb/logger"; import type { FileAnalysis, FileAnalysisSection } from "@bb/mongo"; import { FALLBACK_LANGUAGE, emptyFileAnalysis } from "src/types/file-analysis.ts"; @@ -33,12 +33,16 @@ interface RawAnalysisJson { export function createLlmFileAnalyzer(deps: LlmFileAnalyzerDeps): FileAnalyzer { return { - async analyze(input: { relativePath: string; content: string }): Promise { + async analyze(input: { + relativePath: string; + content: string; + llmCallContext?: AskLlmOptions; + }): Promise { const systemPrompt = deps.buildSystemPrompt(); const userPrompt = deps.buildUserPrompt(input); let raw: RawAnalysisJson | null = null; try { - const response = await askJsonLLM(systemPrompt, userPrompt); + const response = await askJsonLLM(systemPrompt, userPrompt, input.llmCallContext ?? {}); raw = response.result; if (raw === null) { logger.warn(`llm-file-analyzer: ${input.relativePath} returned unparseable JSON`); diff --git a/packages/ingest-github/src/pipeline/context.md b/packages/ingest-github/src/pipeline/context.md index 9e379e2..9fdf38f 100644 --- a/packages/ingest-github/src/pipeline/context.md +++ b/packages/ingest-github/src/pipeline/context.md @@ -38,8 +38,11 @@ true` (default). Consumed by `scan.ts` via the optional `skipDecider` byte size exceeds `Config.AbsoluteFileSizeCap` (skipped before read) or when its line count exceeds `Config.BigFileLineThreshold` (default 1200; enters the big-file phase). Both thresholds are config-driven — no - magic numbers in this file. 
`readScannedFile` re-reads a file by - absolute path for the big-file phase which streams content lazily. + magic numbers in this file. `deps.llmCallContext` (when present) is + forwarded into every `SkipDeciderInput` so the LLM branch of the + unknown-extension gate uses per-job credentials. `readScannedFile` + re-reads a file by absolute path for the big-file phase which streams + content lazily. - `run.ts` — `createPipelineRunner({ reposRootDir, strategy, sourceFactory? })` builds an `IngestRunnerDeps`. GitHub payloads run: branch resolve, source-reader construction, strategy execute, commit persistence. Local @@ -52,9 +55,18 @@ true` (default). Consumed by `scan.ts` via the optional `skipDecider` `archiveSink` which the strategy then threads through to its analyse phase. `resolveOrgId(payload)` returns `payload.orgId ?? getConfigValue(Config.OrgId)` — the only place orgId - is resolved. State transitions (`CREATED → QUEUED → INGESTED → …`) are - persisted to Mongo + Neo4j via `transitionState`, and - `CancellationError` is re-thrown without flipping to FAILED. + is resolved. `llmCallContextFromPayload(payload)` extracts the optional + `{ llmApiKey, llmProvider, llmModel }` overrides from the payload and + packs them into an `AskLlmOptions` bag stored on `StrategyContext. +llmCallContext`, which every LLM call site downstream consumes. State + transitions (`CREATED → QUEUED → INGESTED → …`) are persisted to Mongo + - Neo4j via `transitionState`, and `CancellationError` is re-thrown + without flipping to FAILED. +- `pull.ts` — `runPull(msg)` orchestrates the pull job. Mirrors the same + payload-to-`llmCallContext` extraction as `run.ts` and threads the + resulting `AskLlmOptions` bag into every phase invocation + (`analyseChangedFiles`, `processBigFilesQueue`, `backfillMissingFields`, + `backfillBigFiles`, `runSelectiveFolderSummary`, `summariseRepo`). - `branch.ts` — `resolveBranch(knowledgeId, payload)`. 
Defaults to `main` when the payload omits it; rejects branch names that don't match `^[\w./-]+$` with `IngestError` (defence against shell-injection into git args). diff --git a/packages/ingest-github/src/pipeline/pull.ts b/packages/ingest-github/src/pipeline/pull.ts index 2ce7452..ea2c9d3 100644 --- a/packages/ingest-github/src/pipeline/pull.ts +++ b/packages/ingest-github/src/pipeline/pull.ts @@ -2,7 +2,7 @@ import { Config, KnowledgeState, type GithubPullPayload, type JobMessage } from import { getConfigValue } from "@bb/config"; import { getKnowledge, recordProcessingStats, setKnowledgeCommit, setKnowledgeState } from "@bb/mongo"; import { setKnowledgeStateInGraph, snapshotFilesToVersion, type NodeScope } from "@bb/neo4j"; -import { estimateCostFromBreakdown } from "@bb/llm"; +import { estimateCostFromBreakdown, type AskLlmOptions } from "@bb/llm"; import { IngestError, KnowledgeNotFoundError } from "@bb/errors"; import { logger } from "@bb/logger"; import { ensureMetaDirs, metaPathsFor, repoCloneDir, ensureReposRoot } from "./paths.ts"; @@ -34,6 +34,24 @@ function resolveOrgId(payload: { orgId?: string }): string { return getConfigValue(Config.OrgId); } +function llmCallContextFromPayload(payload: { + llmApiKey?: string; + llmProvider?: "openrouter" | "ollama"; + llmModel?: string; +}): AskLlmOptions | undefined { + const ctx: AskLlmOptions = {}; + if (payload.llmApiKey !== undefined && payload.llmApiKey.length > 0) { + ctx.apiKey = payload.llmApiKey; + } + if (payload.llmProvider !== undefined) { + ctx.provider = payload.llmProvider; + } + if (payload.llmModel !== undefined && payload.llmModel.length > 0) { + ctx.model = payload.llmModel; + } + return Object.keys(ctx).length > 0 ? 
ctx : undefined; +} + export async function runPull(msg: JobMessage): Promise { const { knowledgeId } = msg.payload; if (msg.payload.targetCommitHash !== undefined && !COMMIT_HASH_RE.test(msg.payload.targetCommitHash)) { @@ -124,39 +142,61 @@ export async function runPull(msg: JobMessage): Promise buildUserPrompt: buildFileAnalysisUserPrompt, }); + const llmCallContext = llmCallContextFromPayload(msg.payload); + logger.info(`pull: phase per-file dispatcher for ${knowledgeId} starting`); throwIfCancelled(knowledgeId); - await analyseChangedFiles({ + const analyseChangedInput: Parameters[0] = { knowledgeId, repoDir, metaPaths, analyzer: fileAnalyzer, diff, - }); + }; + if (llmCallContext !== undefined) { + analyseChangedInput.llmCallContext = llmCallContext; + } + await analyseChangedFiles(analyseChangedInput); const source = createDiskSourceReader({ repoDir, commitHash: targetCommit }); logger.info(`pull: phase process big files starting`); throwIfCancelled(knowledgeId); - await processBigFilesQueue({ knowledgeId, source, metaPaths }); + const processBigFilesInput: Parameters[0] = { knowledgeId, source, metaPaths }; + if (llmCallContext !== undefined) { + processBigFilesInput.llmCallContext = llmCallContext; + } + await processBigFilesQueue(processBigFilesInput); logger.info(`pull: phase backfill fields starting`); throwIfCancelled(knowledgeId); - await backfillMissingFields(metaPaths); + await backfillMissingFields(metaPaths, llmCallContext); logger.info(`pull: phase backfill big-files starting`); throwIfCancelled(knowledgeId); - await backfillBigFiles({ knowledgeId, source, metaPaths }); + const backfillBigFilesInput: Parameters[0] = { knowledgeId, source, metaPaths }; + if (llmCallContext !== undefined) { + backfillBigFilesInput.llmCallContext = llmCallContext; + } + await backfillBigFiles(backfillBigFilesInput); logger.info(`pull: phase selective folder summary (${affectedFolders.size} folders) starting`); throwIfCancelled(knowledgeId); - await 
runSelectiveFolderSummary({ knowledgeId, metaPaths, affectedFolders }); + const selectiveInput: Parameters[0] = { + knowledgeId, + metaPaths, + affectedFolders, + }; + if (llmCallContext !== undefined) { + selectiveInput.llmCallContext = llmCallContext; + } + await runSelectiveFolderSummary(selectiveInput); logger.info(`pull: phase repo summary starting`); throwIfCancelled(knowledgeId); const orgId = resolveOrgId({ ...(knowledge.source.kind === "github" ? {} : {}) }); const scope: NodeScope = { orgId, knowledgeId, repoId: knowledgeId }; - const repoSummary = await summariseRepo(knowledgeId, metaPaths); + const repoSummary = await summariseRepo(knowledgeId, metaPaths, llmCallContext); if (repoSummary !== null) { await persistRepoSummary(metaPaths, makeRepoSummaryEnvelope(knowledgeId, orgId, repoSummary)); } diff --git a/packages/ingest-github/src/pipeline/run.ts b/packages/ingest-github/src/pipeline/run.ts index 5d76146..31924f3 100644 --- a/packages/ingest-github/src/pipeline/run.ts +++ b/packages/ingest-github/src/pipeline/run.ts @@ -2,7 +2,7 @@ import { Config, KnowledgeState, type GithubIndexPayload, type LocalIngestPayloa import { getConfigValue } from "@bb/config"; import { recordProcessingStats, setKnowledgeCommit, setKnowledgeState } from "@bb/mongo"; import { setKnowledgeStateInGraph } from "@bb/neo4j"; -import { estimateCostFromBreakdown } from "@bb/llm"; +import { estimateCostFromBreakdown, type AskLlmOptions } from "@bb/llm"; import { IngestError } from "@bb/errors"; import { logger } from "@bb/logger"; import type { IngestRunnerDeps, IngestRunnerInput } from "src/types/ingest-runner.ts"; @@ -21,6 +21,24 @@ function resolveOrgId(payload: { orgId?: string }): string { return getConfigValue(Config.OrgId); } +function llmCallContextFromPayload(payload: { + llmApiKey?: string; + llmProvider?: "openrouter" | "ollama"; + llmModel?: string; +}): AskLlmOptions | undefined { + const ctx: AskLlmOptions = {}; + if (payload.llmApiKey !== undefined && 
payload.llmApiKey.length > 0) { + ctx.apiKey = payload.llmApiKey; + } + if (payload.llmProvider !== undefined) { + ctx.provider = payload.llmProvider; + } + if (payload.llmModel !== undefined && payload.llmModel.length > 0) { + ctx.model = payload.llmModel; + } + return Object.keys(ctx).length > 0 ? ctx : undefined; +} + export interface CreatePipelineRunnerDeps { reposRootDir: string; strategy: IngestStrategy; @@ -92,12 +110,21 @@ async function runGithub( const metaPaths = metaPathsFor(knowledgeId); await ensureMetaDirs(metaPaths); + const baseContext: Parameters[0]["context"] = { + knowledgeId, + orgId: resolveOrgId(payload), + repoId: knowledgeId, + }; + const llmCallContext = llmCallContextFromPayload(payload); + if (llmCallContext !== undefined) { + baseContext.llmCallContext = llmCallContext; + } const strategyInput: Parameters[0] = { payload, branch, source, metaPaths, - context: { knowledgeId, orgId: resolveOrgId(payload), repoId: knowledgeId }, + context: baseContext, }; if (archiveSink !== undefined) { strategyInput.archiveSink = archiveSink; diff --git a/packages/ingest-github/src/pipeline/scan.ts b/packages/ingest-github/src/pipeline/scan.ts index 5bbf5e5..a6b03ef 100644 --- a/packages/ingest-github/src/pipeline/scan.ts +++ b/packages/ingest-github/src/pipeline/scan.ts @@ -2,6 +2,7 @@ import { opendir, readFile, stat } from "node:fs/promises"; import path from "node:path"; import { Config } from "@bb/types"; import { getConfigValue } from "@bb/config"; +import type { AskLlmOptions } from "@bb/llm"; import { logger } from "@bb/logger"; import { SKIP_DIRS, looksBinary, passesPathFilters } from "./filters.ts"; import type { ScanEntry, SkipDecider } from "src/types/pipeline.ts"; @@ -13,6 +14,7 @@ interface ScanLimits { export interface ScanRepositoryDeps { skipDecider?: SkipDecider; + llmCallContext?: AskLlmOptions; } export async function* scanRepository(rootDir: string, deps: ScanRepositoryDeps = {}): AsyncGenerator { @@ -80,7 +82,11 @@ async function* 
walk( continue; } if (deps.skipDecider !== undefined) { - const decision = await deps.skipDecider.decide({ relativePath, absolutePath: abs, ext }); + const deciderInput: Parameters[0] = { relativePath, absolutePath: abs, ext }; + if (deps.llmCallContext !== undefined) { + deciderInput.llmCallContext = deps.llmCallContext; + } + const decision = await deps.skipDecider.decide(deciderInput); if (decision === "reject-static") { counts.rejectStatic += 1; continue; diff --git a/packages/ingest-github/src/pipeline/skip-decisions/context.md b/packages/ingest-github/src/pipeline/skip-decisions/context.md index 90d7096..f4e0273 100644 --- a/packages/ingest-github/src/pipeline/skip-decisions/context.md +++ b/packages/ingest-github/src/pipeline/skip-decisions/context.md @@ -34,7 +34,9 @@ single-tenant public layout. - `decider.ts` — `makeSkipDecider(deps)` returns a `SkipDecider` (port type from `src/types/pipeline.ts`). Reads `Config.SkipDecisionEnabled` once at factory time; when disabled the decider degrades to "accept everything - past the static blocklist". + past the static blocklist". The LLM branch forwards + `SkipDeciderInput.llmCallContext` (when set by the runner) into + `askYesNoLLM` so per-job credentials reach the decision call. - `seed-data/` — the five JSON files copied from kube's `shared/`: `directoryIgnore.json`, `filenameIgnore.json`, `ignorePatterns.json`, `extensions.json`, `llmDecisionsBase.json`. 
`llmDecisionsBase.json` is diff --git a/packages/ingest-github/src/pipeline/skip-decisions/decider.ts b/packages/ingest-github/src/pipeline/skip-decisions/decider.ts index 91ad7ae..9c5d3cd 100644 --- a/packages/ingest-github/src/pipeline/skip-decisions/decider.ts +++ b/packages/ingest-github/src/pipeline/skip-decisions/decider.ts @@ -2,7 +2,7 @@ import { readFile } from "node:fs/promises"; import path from "node:path"; import { Config } from "@bb/types"; import { getConfigValue } from "@bb/config"; -import { askYesNoLLM } from "@bb/llm"; +import { askYesNoLLM, type AskLlmOptions } from "@bb/llm"; import { logger } from "@bb/logger"; import type { SkipDecider, SkipDeciderInput, SkipDecision } from "src/types/pipeline.ts"; import { @@ -71,7 +71,7 @@ export function makeSkipDecider(deps: SkipDeciderDeps = {}): SkipDecider { return cached.ignore ? "reject-llm" : "accept-llm"; } - const decision = await askLlmDecision(input, deps.repositoryName); + const decision = await askLlmDecision(input, deps.repositoryName, input.llmCallContext); if (input.ext.length > 0) { setExtensionDecision(cache, input.ext, !decision, "llm", deps.repositoryName, input.relativePath); } else { @@ -88,7 +88,11 @@ export function makeSkipDecider(deps: SkipDeciderDeps = {}): SkipDecider { }; } -async function askLlmDecision(input: SkipDeciderInput, repositoryName: string | undefined): Promise<boolean> { +async function askLlmDecision( + input: SkipDeciderInput, + repositoryName: string | undefined, + llmCallContext: AskLlmOptions | undefined, +): Promise<boolean> { const maxChars = getConfigValue(Config.SkipDecisionMaxCharsForLlm); let content: string; if (input.content !== undefined) { @@ -115,6 +119,7 @@ async function askLlmDecision(input: SkipDeciderInput, repositoryName: string | content, truncatedTo: content.length, }), + llmCallContext ??
{}, ); if (result.decision === null) { logger.warn(`skip-decisions: LLM returned no decision for ${input.relativePath}; defaulting to reject`); diff --git a/packages/ingest-github/src/strategies/flat-folder/analyse-file.ts b/packages/ingest-github/src/strategies/flat-folder/analyse-file.ts index 21279ca..ea0576a 100644 --- a/packages/ingest-github/src/strategies/flat-folder/analyse-file.ts +++ b/packages/ingest-github/src/strategies/flat-folder/analyse-file.ts @@ -1,13 +1,21 @@ import { createHash } from "node:crypto"; -import { tokenLen } from "@bb/llm"; +import { tokenLen, type AskLlmOptions } from "@bb/llm"; import type { CondensedFileAnalysis } from "src/types/condensed-file-analysis.ts"; import type { FileAnalyzer, ScannedFile } from "src/types/pipeline.ts"; -export async function analyseScannedFile(analyzer: FileAnalyzer, file: ScannedFile): Promise<CondensedFileAnalysis> { - const { language, analysis } = await analyzer.analyze({ +export async function analyseScannedFile( + analyzer: FileAnalyzer, + file: ScannedFile, + llmCallContext?: AskLlmOptions, +): Promise<CondensedFileAnalysis> { + const analyzerInput: Parameters<FileAnalyzer["analyze"]>[0] = { relativePath: file.relativePath, content: file.content, - }); + }; + if (llmCallContext !== undefined) { + analyzerInput.llmCallContext = llmCallContext; + } + const { language, analysis } = await analyzer.analyze(analyzerInput); return { relativePath: file.relativePath, language, diff --git a/packages/ingest-github/src/strategies/flat-folder/backfill/big-files.ts b/packages/ingest-github/src/strategies/flat-folder/backfill/big-files.ts index 9d1a393..7aad4e2 100644 --- a/packages/ingest-github/src/strategies/flat-folder/backfill/big-files.ts +++ b/packages/ingest-github/src/strategies/flat-folder/backfill/big-files.ts @@ -1,4 +1,5 @@ import { logger } from "@bb/logger"; +import type { AskLlmOptions } from "@bb/llm"; import type { MetaPaths } from "src/types/meta-paths.ts"; import type { SourceReader } from "src/types/pipeline.ts"; import { readBigFiles } from
"src/strategies/flat-folder/big-file/detector.ts"; @@ -9,6 +10,7 @@ export interface BackfillBigFilesInput { knowledgeId: string; source: SourceReader; metaPaths: MetaPaths; + llmCallContext?: AskLlmOptions; } export interface BackfillBigFilesResult { @@ -41,6 +43,7 @@ export async function backfillBigFiles(input: BackfillBigFilesInput): Promise` +- `backfillMissingFields(metaPaths, llmCallContext?): Promise<{ updated, failed }>` - `backfillBigFiles(input: BackfillBigFilesInput): Promise` + — `BackfillBigFilesInput` carries an optional `llmCallContext?: AskLlmOptions` that the inner `processBigFile` call uses to forward per-job LLM credentials. Both return phase-summary counters consumed by `createFlatFolderStrategy` to roll up into the strategy result. diff --git a/packages/ingest-github/src/strategies/flat-folder/backfill/fields.ts b/packages/ingest-github/src/strategies/flat-folder/backfill/fields.ts index 2d3e5d9..e9bb72e 100644 --- a/packages/ingest-github/src/strategies/flat-folder/backfill/fields.ts +++ b/packages/ingest-github/src/strategies/flat-folder/backfill/fields.ts @@ -1,4 +1,4 @@ -import { askJsonLLM } from "@bb/llm"; +import { askJsonLLM, type AskLlmOptions } from "@bb/llm"; import { logger } from "@bb/logger"; import type { FileAnalysis, FileAnalysisSection } from "@bb/mongo"; import type { MetaPaths } from "src/types/meta-paths.ts"; @@ -40,7 +40,10 @@ interface NeededFlags { sectionMap: boolean; } -export async function backfillMissingFields(metaPaths: MetaPaths): Promise<{ updated: number; failed: number }> { +export async function backfillMissingFields( + metaPaths: MetaPaths, + llmCallContext?: AskLlmOptions, +): Promise<{ updated: number; failed: number }> { let updated = 0; let failed = 0; for await (const entry of iterateCondensed(metaPaths)) { @@ -51,7 +54,7 @@ export async function backfillMissingFields(metaPaths: MetaPaths): Promise<{ upd } const userPrompt = buildBackfillUserPrompt(entry.relativePath, entry.analysis); try { - const 
response = await askJsonLLM(BACKFILL_SYSTEM_PROMPT, userPrompt); + const response = await askJsonLLM(BACKFILL_SYSTEM_PROMPT, userPrompt, llmCallContext ?? {}); const result = response.result; if (result === null) { continue; diff --git a/packages/ingest-github/src/strategies/flat-folder/big-file/chunk-analyzer.ts b/packages/ingest-github/src/strategies/flat-folder/big-file/chunk-analyzer.ts index 79fe4f6..46876bf 100644 --- a/packages/ingest-github/src/strategies/flat-folder/big-file/chunk-analyzer.ts +++ b/packages/ingest-github/src/strategies/flat-folder/big-file/chunk-analyzer.ts @@ -1,11 +1,11 @@ -import { askJsonLLM } from "@bb/llm"; +import { askJsonLLM, type AskLlmOptions } from "@bb/llm"; import { logger } from "@bb/logger"; import type { ChunkAnalysisResult, FileChunk } from "src/types/big-file.ts"; import { FALLBACK_LANGUAGE, emptyFileAnalysis } from "src/types/file-analysis.ts"; import { shapeAnalysis } from "src/adapters/llm-file-analyzer.ts"; import { CHUNK_ANALYSIS_SYSTEM_PROMPT, buildChunkUserPrompt } from "src/strategies/flat-folder/prompts/chunk.ts"; -export async function analyzeChunk(chunk: FileChunk): Promise { +export async function analyzeChunk(chunk: FileChunk, llmCallContext?: AskLlmOptions): Promise { const systemPrompt = CHUNK_ANALYSIS_SYSTEM_PROMPT; const userPrompt = buildChunkUserPrompt({ relativePath: chunk.relativePath, @@ -16,7 +16,7 @@ export async function analyzeChunk(chunk: FileChunk): Promise>(systemPrompt, userPrompt); + const response = await askJsonLLM>(systemPrompt, userPrompt, llmCallContext ?? 
{}); if (response.result === null) { logger.warn( `analyzeChunk: ${chunk.relativePath} chunk ${chunk.chunkIndex + 1}/${chunk.totalChunks} returned unparseable JSON`, diff --git a/packages/ingest-github/src/strategies/flat-folder/big-file/context.md b/packages/ingest-github/src/strategies/flat-folder/big-file/context.md index 17b00ef..0c26ca6 100644 --- a/packages/ingest-github/src/strategies/flat-folder/big-file/context.md +++ b/packages/ingest-github/src/strategies/flat-folder/big-file/context.md @@ -11,8 +11,10 @@ depending on chunk count and prompt budget. - `detector.ts` — `classifyByTokens`, `buildBigFileEntry`, plus the on-disk `bigFiles.json` reader / writer / appender (dedupe-by-path on write). - `chunker.ts` — `splitFileIntoChunks` (line-aligned, ≤ `MaxTokensPerChunk`). -- `chunk-analyzer.ts` — `analyzeChunk(chunk)` calls `askJsonLLM` with the - chunk prompt; tolerates failures by returning an empty analysis. +- `chunk-analyzer.ts` — `analyzeChunk(chunk, llmCallContext?)` calls + `askJsonLLM` with the chunk prompt; tolerates failures by returning an + empty analysis. `llmCallContext` forwards per-job LLM credentials + threaded through from `StrategyContext`. - `condenser.ts` — `condenseChunks(relativePath, chunks)`: ≤ `SmallFileDedupThreshold` → deterministic merge (no LLM); above → recursive map-reduce. Per-condense LLM failure falls back to @@ -23,9 +25,11 @@ depending on chunk count and prompt budget. `stale-condensed`, or `missing`. Used by Phase 2 to short-circuit and by Phase 4 to find candidates for cheap re-condense. - `index.ts` — `processBigFile({knowledgeId, metaPaths, relativePath, content, -sizeBytes})`. Sequential per file (chunk-level concurrency inside). - Persists every intermediate artifact, so a restart resumes from the next - unfinished chunk. +sizeBytes, llmCallContext?})`. Sequential per file (chunk-level + concurrency inside). Persists every intermediate artifact, so a + restart resumes from the next unfinished chunk. 
`llmCallContext` is + forwarded to every chunk analyzer call so per-job LLM credentials + reach `@bb/llm`. ## Invariants diff --git a/packages/ingest-github/src/strategies/flat-folder/big-file/index.ts b/packages/ingest-github/src/strategies/flat-folder/big-file/index.ts index ffcbbd2..893e416 100644 --- a/packages/ingest-github/src/strategies/flat-folder/big-file/index.ts +++ b/packages/ingest-github/src/strategies/flat-folder/big-file/index.ts @@ -1,6 +1,7 @@ import { createHash } from "node:crypto"; import { Config } from "@bb/types"; import { getConfigValue } from "@bb/config"; +import type { AskLlmOptions } from "@bb/llm"; import { logger } from "@bb/logger"; import type { ChunkAnalysisResult, HugeFileManifest } from "src/types/big-file.ts"; import type { CondensedFileAnalysis } from "src/types/condensed-file-analysis.ts"; @@ -17,6 +18,7 @@ export interface ProcessBigFileInput { relativePath: string; content: string; sizeBytes: number; + llmCallContext?: AskLlmOptions; } export async function processBigFile(input: ProcessBigFileInput): Promise { @@ -43,7 +45,7 @@ export async function processBigFile(input: ProcessBigFileInput): Promise; + llmCallContext?: AskLlmOptions; } export interface SelectiveFolderSummaryResult { @@ -46,7 +48,7 @@ export async function runSelectiveFolderSummary( limit(async () => { try { throwIfCancelled(input.knowledgeId); - const summary = await summariseFolder(folderPath, files); + const summary = await summariseFolder(folderPath, files, input.llmCallContext); if (summary !== null) { await persistFolderSummary(input.metaPaths, summary); succeeded += 1; diff --git a/packages/ingest-github/src/strategies/flat-folder/folder-summary.ts b/packages/ingest-github/src/strategies/flat-folder/folder-summary.ts index 8f7d15c..7aa3e07 100644 --- a/packages/ingest-github/src/strategies/flat-folder/folder-summary.ts +++ b/packages/ingest-github/src/strategies/flat-folder/folder-summary.ts @@ -1,6 +1,6 @@ import { readFile, readdir, writeFile } from 
"node:fs/promises"; import path from "node:path"; -import { askJsonLLM } from "@bb/llm"; +import { askJsonLLM, type AskLlmOptions } from "@bb/llm"; import { logger } from "@bb/logger"; import { Config } from "@bb/types"; import { getConfigValue } from "@bb/config"; @@ -39,10 +39,15 @@ interface FolderSummaryJson { export async function summariseFolder( folderPath: string, files: CondensedFileAnalysis[], + llmCallContext?: AskLlmOptions, ): Promise { const userPrompt = folderAnalysisUserPrompt(folderPath, files); try { - const response = await askJsonLLM(FOLDER_ANALYSIS_SYSTEM_PROMPT, userPrompt); + const response = await askJsonLLM( + FOLDER_ANALYSIS_SYSTEM_PROMPT, + userPrompt, + llmCallContext ?? {}, + ); if (response.result === null) { logger.warn(`summariseFolder: ${folderPath || ""} returned unparseable JSON`); return null; @@ -86,6 +91,7 @@ export async function* iterateFolderSummaries(metaPaths: MetaPaths): AsyncGenera export async function runFolderSummaryPhase( knowledgeId: string, metaPaths: MetaPaths, + llmCallContext?: AskLlmOptions, ): Promise<{ succeeded: number; failed: number }> { const concurrentWorkers = getConfigValue(Config.ConcurrentWorkers); const limit = withConcurrency(concurrentWorkers); @@ -98,7 +104,7 @@ export async function runFolderSummaryPhase( limit(async () => { try { throwIfCancelled(knowledgeId); - const summary = await summariseFolder(folderPath, files); + const summary = await summariseFolder(folderPath, files, llmCallContext); if (summary !== null) { await persistFolderSummary(metaPaths, summary); succeeded += 1; diff --git a/packages/ingest-github/src/strategies/flat-folder/index.ts b/packages/ingest-github/src/strategies/flat-folder/index.ts index 76f16a3..5304318 100644 --- a/packages/ingest-github/src/strategies/flat-folder/index.ts +++ b/packages/ingest-github/src/strategies/flat-folder/index.ts @@ -19,7 +19,7 @@ export function createFlatFolderStrategy(deps: FlatFolderStrategyDeps): IngestSt name: "flat-folder", async 
execute(input: StrategyInput): Promise { const { context, source, archiveSink, metaPaths, payload, branch } = input; - const { knowledgeId, orgId, repoId } = context; + const { knowledgeId, orgId, repoId, llmCallContext } = context; logger.info(`flat-folder: phase1 (classify + analyse small) starting for ${knowledgeId}`); throwIfCancelled(knowledgeId); @@ -32,27 +32,38 @@ export function createFlatFolderStrategy(deps: FlatFolderStrategyDeps): IngestSt if (archiveSink !== undefined) { phase1Input.archiveSink = archiveSink; } + if (llmCallContext !== undefined) { + phase1Input.llmCallContext = llmCallContext; + } const phase1 = await classifyAndAnalyseSmall(phase1Input); logger.info(`flat-folder: phase2 (process big files) starting`); throwIfCancelled(knowledgeId); - const phase2 = await processBigFilesQueue({ knowledgeId, source, metaPaths }); + const phase2Input: Parameters[0] = { knowledgeId, source, metaPaths }; + if (llmCallContext !== undefined) { + phase2Input.llmCallContext = llmCallContext; + } + const phase2 = await processBigFilesQueue(phase2Input); logger.info(`flat-folder: phase3 (backfill missing fields) starting`); throwIfCancelled(knowledgeId); - await backfillMissingFields(metaPaths); + await backfillMissingFields(metaPaths, llmCallContext); logger.info(`flat-folder: phase4 (backfill big files) starting`); throwIfCancelled(knowledgeId); - await backfillBigFiles({ knowledgeId, source, metaPaths }); + const phase4Input: Parameters[0] = { knowledgeId, source, metaPaths }; + if (llmCallContext !== undefined) { + phase4Input.llmCallContext = llmCallContext; + } + await backfillBigFiles(phase4Input); logger.info(`flat-folder: phase5 (folder summaries) starting`); throwIfCancelled(knowledgeId); - const phase5 = await runFolderSummaryPhase(knowledgeId, metaPaths); + const phase5 = await runFolderSummaryPhase(knowledgeId, metaPaths, llmCallContext); logger.info(`flat-folder: phase6 (repo summary) starting`); throwIfCancelled(knowledgeId); - const repoSummary 
= await summariseRepo(knowledgeId, metaPaths); + const repoSummary = await summariseRepo(knowledgeId, metaPaths, llmCallContext); let repoSummarised = false; if (repoSummary !== null) { await persistRepoSummary(metaPaths, makeRepoSummaryEnvelope(knowledgeId, orgId, repoSummary)); diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/classify-and-analyse-small.ts b/packages/ingest-github/src/strategies/flat-folder/phases/classify-and-analyse-small.ts index 6cd5c7f..3306d23 100644 --- a/packages/ingest-github/src/strategies/flat-folder/phases/classify-and-analyse-small.ts +++ b/packages/ingest-github/src/strategies/flat-folder/phases/classify-and-analyse-small.ts @@ -1,5 +1,5 @@ import path from "node:path"; -import { tokenLen } from "@bb/llm"; +import { tokenLen, type AskLlmOptions } from "@bb/llm"; import { logger } from "@bb/logger"; import { Config } from "@bb/types"; import { getConfigValue } from "@bb/config"; @@ -20,6 +20,7 @@ export interface ClassifyPhaseInput { analyzer: FileAnalyzer; skipDecider?: SkipDecider; archiveSink?: ArchiveSink; + llmCallContext?: AskLlmOptions; } export interface ClassifyPhaseResult { @@ -44,7 +45,11 @@ export async function classifyAndAnalyseSmall(input: ClassifyPhaseInput): Promis const pending: Promise[] = []; - for await (const entry of input.source.scan({ skipDecider })) { + const scanDeps: Parameters[0] = { skipDecider }; + if (input.llmCallContext !== undefined) { + scanDeps.llmCallContext = input.llmCallContext; + } + for await (const entry of input.source.scan(scanDeps)) { throwIfCancelled(input.knowledgeId); if (entry.kind === "oversized") { @@ -81,7 +86,7 @@ export async function classifyAndAnalyseSmall(input: ClassifyPhaseInput): Promis limit(async () => { try { throwIfCancelled(input.knowledgeId); - const condensed = await analyseScannedFile(input.analyzer, entry); + const condensed = await analyseScannedFile(input.analyzer, entry, input.llmCallContext); await saveCondensed(input.metaPaths, condensed); 
if (input.archiveSink !== undefined) { await input.archiveSink.push({ diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/context.md b/packages/ingest-github/src/strategies/flat-folder/phases/context.md index 4c7889d..71ac8d8 100644 --- a/packages/ingest-github/src/strategies/flat-folder/phases/context.md +++ b/packages/ingest-github/src/strategies/flat-folder/phases/context.md @@ -9,8 +9,9 @@ and repo summarisation (Phases 5 and 6) live as `folder-summary.ts` and ## Files - `classify-and-analyse-small.ts` — Phase 1. - `classifyAndAnalyseSmall({knowledgeId, repoDir, metaPaths, analyzer})` - walks `scanRepository(repoDir)` and per entry: + `classifyAndAnalyseSmall({knowledgeId, source, metaPaths, analyzer, +skipDecider?, archiveSink?, llmCallContext?})` walks `source.scan({ +skipDecider, llmCallContext })` and per entry: - `kind === "oversized"` → write a stub via `buildOversizedStub` + `saveCondensed`, and append a `too-large` row to `bigFiles.json`. - token count > `Config.ContextWindowLimit` → buffer a @@ -22,11 +23,12 @@ and repo summarisation (Phases 5 and 6) live as `folder-summary.ts` and buffered big-file list is flushed via `writeBigFiles` after all tasks drain. - `process-big-files.ts` — Phase 2. - `processBigFilesQueue({knowledgeId, repoDir, metaPaths})` reads - `bigFiles.json`, skips `too-large` entries (counted as + `processBigFilesQueue({knowledgeId, source, metaPaths, llmCallContext?})` + reads `bigFiles.json`, skips `too-large` entries (counted as `skippedOversized`), short-circuits when `inspect` returns `complete` - (counted as `cached`), reads the file from disk, and dispatches - `processBigFile` sequentially per file. Cancellation re-throws past the + (counted as `cached`), reads the file via `source.readFile`, and + dispatches `processBigFile` sequentially per file with the per-job + `llmCallContext` threaded through. Cancellation re-throws past the phase; other errors are logged per file and counted as `failed`. 
- `store-flat-analysis.ts` — Phase 7. `storeFlatAnalysis({scope, payload, branch, metaPaths})` ensures diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts b/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts index 174be8a..26357af 100644 --- a/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts +++ b/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts @@ -1,4 +1,5 @@ import { logger } from "@bb/logger"; +import type { AskLlmOptions } from "@bb/llm"; import type { MetaPaths } from "src/types/meta-paths.ts"; import type { SourceReader } from "src/types/pipeline.ts"; import { throwIfCancelled, CancellationError } from "src/pipeline/cancellation.ts"; @@ -10,6 +11,7 @@ export interface ProcessBigFilesInput { knowledgeId: string; source: SourceReader; metaPaths: MetaPaths; + llmCallContext?: AskLlmOptions; } export interface ProcessBigFilesResult { @@ -57,6 +59,7 @@ export async function processBigFilesQueue(input: ProcessBigFilesInput): Promise relativePath: entry.relativePath, content, sizeBytes: entry.sizeBytes, + ...(input.llmCallContext !== undefined ? 
{ llmCallContext: input.llmCallContext } : {}), }); processed += 1; } catch (cause: unknown) { diff --git a/packages/ingest-github/src/strategies/flat-folder/repo-summary.ts b/packages/ingest-github/src/strategies/flat-folder/repo-summary.ts index b41fd52..2fe6c06 100644 --- a/packages/ingest-github/src/strategies/flat-folder/repo-summary.ts +++ b/packages/ingest-github/src/strategies/flat-folder/repo-summary.ts @@ -1,5 +1,5 @@ import { writeFile } from "node:fs/promises"; -import { askJsonLLM, tokenLen } from "@bb/llm"; +import { askJsonLLM, tokenLen, type AskLlmOptions } from "@bb/llm"; import { logger } from "@bb/logger"; import { Config } from "@bb/types"; import { getConfigValue } from "@bb/config"; @@ -25,7 +25,11 @@ interface RepoSummaryJson { keyPatterns?: unknown; } -export async function summariseRepo(knowledgeId: string, metaPaths: MetaPaths): Promise { +export async function summariseRepo( + knowledgeId: string, + metaPaths: MetaPaths, + llmCallContext?: AskLlmOptions, +): Promise { const folders: FolderSummary[] = []; for await (const f of iterateFolderSummaries(metaPaths)) { folders.push(f); @@ -42,7 +46,7 @@ export async function summariseRepo(knowledgeId: string, metaPaths: MetaPaths): const oneShotPrompt = buildRepoPromptFromFolders(infos); if (tokenLen(oneShotPrompt) + promptOverhead <= contextLimit) { throwIfCancelled(knowledgeId); - return await callRepoSummary(oneShotPrompt); + return await callRepoSummary(oneShotPrompt, llmCallContext); } logger.info(`phase6: repo prompt > ${contextLimit} tokens; batching`); @@ -50,7 +54,7 @@ export async function summariseRepo(knowledgeId: string, metaPaths: MetaPaths): const partials: string[] = []; for (const batch of batches) { throwIfCancelled(knowledgeId); - const partial = await callRepoSummary(buildRepoPromptFromFolders(batch)); + const partial = await callRepoSummary(buildRepoPromptFromFolders(batch), llmCallContext); if (partial !== null) { partials.push(JSON.stringify(partial)); } @@ -62,12 +66,12 
@@ export async function summariseRepo(knowledgeId: string, metaPaths: MetaPaths): return JSON.parse(partials[0] ?? "null") as RepoSummary | null; } throwIfCancelled(knowledgeId); - return await callRepoSummary(buildRepoMergePrompt(partials)); + return await callRepoSummary(buildRepoMergePrompt(partials), llmCallContext); } -async function callRepoSummary(userPrompt: string): Promise { +async function callRepoSummary(userPrompt: string, llmCallContext?: AskLlmOptions): Promise { try { - const response = await askJsonLLM(REPO_SUMMARY_SYSTEM_PROMPT, userPrompt); + const response = await askJsonLLM(REPO_SUMMARY_SYSTEM_PROMPT, userPrompt, llmCallContext ?? {}); if (response.result === null) { return null; } diff --git a/packages/ingest-github/src/types/context.md b/packages/ingest-github/src/types/context.md index 99d26f5..6e9b378 100644 --- a/packages/ingest-github/src/types/context.md +++ b/packages/ingest-github/src/types/context.md @@ -11,6 +11,11 @@ Domain (sub-folder of `@bb/ingest-github`). - `strategy.ts` — `IngestStrategy`, `StrategyInput`, `StrategyResult`, `StrategyContext`. The strategy port the orchestrator dispatches to. + `StrategyContext` carries `{ knowledgeId, orgId, repoId, +llmCallContext? }`; `llmCallContext` is the optional `AskLlmOptions` + bag the runner builds from the job payload's LLM overrides and that + each phase forwards into its `askJsonLLM` / `askYesNoLLM` calls. Absent + in OSS standalone runs — calls fall back to `Config.OpenrouterApiKey`. - `pipeline.ts` — `ScannedFile`, `OversizedFile`, `ScanEntry`, `FileAnalyzer` port, `AnalyzedFileResult`, `PipelineDeps`, `PipelineSummary`, `SkipDecider` / `SkipDeciderInput` / `SkipDecision` (the unknown-extension @@ -21,6 +26,10 @@ Domain (sub-folder of `@bb/ingest-github`). binary never calls), and `SourceFactory` / `SourceFactoryInput` / `SourceFactoryResult` (the optional injection hook surfaced through `registerGithubWorkers`; see `docs/extension-points.md`). 
+ `FileAnalyzer.analyze()`, `SkipDeciderInput`, and `ScanDeps` each accept + an optional `llmCallContext?: AskLlmOptions` so per-job credentials + flow from `StrategyContext` into every LLM call site without breaking + the OSS standalone (defaults to undefined → config-driven). - `meta-paths.ts` — `MetaPaths` shape (`~/.bytebell/repos/.meta//...`). - `file-analysis.ts` — `FALLBACK_LANGUAGE = "unknown"` and `emptyFileAnalysis()` factory. Both consumed by the LLM adapter and the big-file condenser. diff --git a/packages/ingest-github/src/types/pipeline.ts b/packages/ingest-github/src/types/pipeline.ts index b761cef..196900e 100644 --- a/packages/ingest-github/src/types/pipeline.ts +++ b/packages/ingest-github/src/types/pipeline.ts @@ -1,5 +1,7 @@ -import type { GithubIndexPayload } from "@bb/types"; +import type { GithubIndexPayload, GithubPullPayload } from "@bb/types"; +import type { AskLlmOptions } from "@bb/llm"; import type { FileAnalysis } from "@bb/mongo"; +import type { DiffResult } from "src/pipeline/git-diff.ts"; export interface ScannedFile { kind: "file"; @@ -24,7 +26,16 @@ export interface AnalyzedFileResult { } export interface FileAnalyzer { - analyze(input: { relativePath: string; content: string }): Promise; + analyze(input: { + relativePath: string; + content: string; + /** + * Per-job LLM credential overrides. When set, passed to `askJsonLLM` so + * the file analysis uses the caller-supplied credentials instead of + * `Config.OpenrouterApiKey`. Absent in OSS standalone. + */ + llmCallContext?: AskLlmOptions; + }): Promise; } export interface PipelineSummary { @@ -41,6 +52,11 @@ export interface PipelineDeps { export interface ScanDeps { skipDecider?: SkipDecider; + /** + * Per-job LLM credential overrides forwarded to the skip-decider when it + * invokes the LLM branch. Absent in OSS standalone runs. 
+ */ + llmCallContext?: AskLlmOptions; } export interface SourceReader { @@ -88,6 +104,35 @@ export interface SourceFactoryResult { */ export type SourceFactory = (input: SourceFactoryInput) => Promise; +export interface PullFactoryInput { + knowledgeId: string; + payload: GithubPullPayload; + /** The commit currently anchored on the knowledge in Mongo. The factory diffs from here to `targetCommit`. */ + currentCommit: string; + /** Branch the knowledge tracks. The factory resolves the target commit relative to this branch. */ + branch: string; +} + +export interface PullFactoryResult { + /** Reader pinned at the resolved target commit; used by every downstream phase for file I/O. */ + source: SourceReader; + /** Files changed between `currentCommit` and the resolved target. Same shape as `git diff --name-status`. */ + diff: DiffResult; + /** Resolved target commit hash. Either the payload's `targetCommitHash` or the branch HEAD chosen by the factory. */ + targetCommit: string; + /** Optional non-fatal sink. When set, the strategy archives analysed content via `push` after each file. */ + archiveSink?: ArchiveSink; +} + +/** + * Optional injection hook used by `registerGithubWorkers` for pull jobs. + * When provided, `runPull` skips `syncRepository` + `computePullDiff` + + * `checkoutCommit` and uses the factory's reader + diff directly. The + * open-source binary leaves this undefined and pull runs against a local + * git clone via `node:child_process`. + */ +export type PullFactory = (input: PullFactoryInput) => Promise; + export type SkipDecision = "accept" | "reject-static" | "reject-llm" | "accept-llm"; export interface SkipDeciderInput { @@ -96,6 +141,12 @@ export interface SkipDeciderInput { ext: string; /** Pre-loaded content. When set, the LLM branch uses this instead of reading absolutePath from disk. */ content?: string; + /** + * Per-job LLM credential overrides. 
When set and the decider invokes the + * LLM branch, these credentials override `Config.OpenrouterApiKey`. Absent + * in OSS standalone — the LLM branch falls back to the configured key. + */ + llmCallContext?: AskLlmOptions; } export interface SkipDecider { diff --git a/packages/ingest-github/src/types/strategy.ts b/packages/ingest-github/src/types/strategy.ts index 1b95537..2f079b0 100644 --- a/packages/ingest-github/src/types/strategy.ts +++ b/packages/ingest-github/src/types/strategy.ts @@ -1,4 +1,5 @@ import type { GithubIndexPayload } from "@bb/types"; +import type { AskLlmOptions } from "@bb/llm"; import type { MetaPaths } from "./meta-paths.ts"; import type { ArchiveSink, SourceReader } from "./pipeline.ts"; @@ -6,6 +7,13 @@ export interface StrategyContext { knowledgeId: string; orgId: string; repoId: string; + /** + * Per-job LLM credential overrides extracted from the job payload. When + * present, phases pass these to every `askLLM` / `askJsonLLM` call so the + * per-org credential reaches the LLM provider. Absent in OSS standalone + * runs, where calls fall back to `Config.OpenrouterApiKey`. 
+ */ + llmCallContext?: AskLlmOptions; } export interface StrategyInput { From 286412aa86333e1d7e6df33d38ea81d8d1446637 Mon Sep 17 00:00:00 2001 From: Dead-Bytes <143434285+Dead-Bytes@users.noreply.github.com> Date: Wed, 13 May 2026 23:03:44 +0530 Subject: [PATCH 03/34] feat(internal): enhance GitHub worker with pullfactory for pulls --- packages/ingest-github/context.md | 28 ++++- packages/ingest-github/src/context.md | 37 ++++-- packages/ingest-github/src/index.ts | 18 ++- .../ingest-github/src/pipeline/context.md | 16 ++- packages/ingest-github/src/pipeline/pull.ts | 106 +++++++++++------- .../strategies/flat-folder/analyse-changed.ts | 69 +++++++----- .../src/strategies/flat-folder/context.md | 3 +- packages/ingest-github/src/types/context.md | 12 +- 8 files changed, 185 insertions(+), 104 deletions(-) diff --git a/packages/ingest-github/context.md b/packages/ingest-github/context.md index f199dd0..ba9599d 100644 --- a/packages/ingest-github/context.md +++ b/packages/ingest-github/context.md @@ -58,15 +58,31 @@ The package does **not** own: ## Public exports ```ts -function registerGithubWorkers(): void // wires JobType.GithubIndex -function registerLocalIngestWorker(): void // wires JobType.LocalIngest +function registerGithubWorkers(deps?: RegisterGithubWorkersDeps): void // wires GithubIndex + GithubPull +function registerLocalIngestWorker(): void // wires LocalIngest -interface IngestionContext { knowledgeId: string; rootDir: string } -interface IngestionStrategy { readonly name: string; ingest(ctx: IngestionContext): Promise } - -class BasicFileAnalysisStrategy implements IngestionStrategy +interface RegisterGithubWorkersDeps { + sourceFactory?: SourceFactory; // index-side hook + pullFactory?: PullFactory; // pull-side hook (provides reader + diff + targetCommit) +} ``` +The optional `sourceFactory` lets downstream consumers inject a custom +`SourceReader` for index jobs (no local clone). 
The analogous +`pullFactory` does the same for pull jobs — its result carries the +resolved `targetCommit`, the diff between currentCommit and targetCommit, +and a reader pinned at the target. When unset, both fall back to the +default disk-backed paths (`git clone` for index, `git fetch + diff + +checkout` for pull). See [docs/extension-points.md](docs/extension-points.md) +for the design rationale. + +For per-job LLM credentials, downstream consumers set +`{ llmApiKey?, llmProvider?, llmModel? }` on the `GithubIndexPayload` / +`GithubPullPayload` they enqueue (`PayloadLlmOverrides` from `@bb/types`). +The runner extracts those into `StrategyContext.llmCallContext` and every +LLM call site forwards it to `@bb/llm`. OSS standalone leaves the overrides +unset and falls back to `Config.OpenrouterApiKey` + `Config.LlmProvider`. + Both `register*Workers()` calls run once at `@bb/server` boot. The worker hardcodes a single `IngestionStrategy` instance (currently `new BasicFileAnalysisStrategy()`). Adding another strategy = new file diff --git a/packages/ingest-github/src/context.md b/packages/ingest-github/src/context.md index 02355e0..c71cd2f 100644 --- a/packages/ingest-github/src/context.md +++ b/packages/ingest-github/src/context.md @@ -14,13 +14,20 @@ Domain (composes infra: `@bb/config`, `@bb/llm`, `@bb/mongo`, `@bb/neo4j`, - **[index.ts](index.ts)** — public surface. `registerGithubWorkers`, `registerLocalIngestWorker`, `createFlatFolderStrategy`, `createLlmFileAnalyzer`, `createDiskSourceReader`, the - `SourceReader` / `ArchiveSink` / `SourceFactory` port types, plus - `parseGithubRepo` / `fetchLatestCommitHash` (kept for the pull plan). - `registerGithubWorkers` accepts one optional `sourceFactory` injection - parameter so downstream consumers can replace the default disk-based - clone-and-read; the open-source binary always leaves it undefined. for the - seam. 
`GithubPull` is registered but the handler throws - `IngestError("…being migrated…")` — the HTTP route mirrors this at 503. + `SourceReader` / `ScanEntry` / `ScannedFile` / `OversizedFile` / + `ScanDeps` / `ArchiveSink` / `ArchiveSinkInput` / `SourceFactory` / + `SourceFactoryInput` / `SourceFactoryResult` / `PullFactory` / + `PullFactoryInput` / `PullFactoryResult` / `DiffResult` / + `RenamedFile` / `FileAnalyzer` / `AnalyzedFileResult` port types, the + `IngestStrategy` / `StrategyInput` / `StrategyResult` / + `StrategyContext` types, and `CondensedFileAnalysis`. Plus + `parseGithubRepo` / `fetchLatestCommitHash` / `fetchRecentCommits` + (used by the pull route). `registerGithubWorkers` accepts optional + `sourceFactory` (index) and `pullFactory` (pull) injections through + `RegisterGithubWorkersDeps`; the open-source binary leaves both + undefined. It registers both `JobType.GithubIndex` (full re-index, via + `runner.run` + optional `sourceFactory`) and `JobType.GithubPull` + (incremental diff-and-apply via `runPull` + optional `pullFactory`). - **[githubApi.ts](githubApi.ts)** — `parseGithubRepo(repoUrl)` and `fetchLatestCommitHash(owner, repo, branch, gitToken?)`. **Pull-only utility**; revisit in the pull plan. Kept in place rather than deleted so @@ -75,10 +82,18 @@ Tier flow is strict: `types/` is the leaf; `pipeline/`, `adapters/`, ## Invariants enforced here - **One active strategy, factory-wired.** `createFlatFolderStrategy(deps)` - builds the strategy; `createPipelineRunner({ strategy })` wraps it; the - worker handlers are `(msg) => runner.run({ job, payload })`. Adding a - strategy means a new factory and a new wiring line — never editing the - worker. The archived `basic-file-analysis/` is `.archived` (not compiled). + builds the strategy; `createPipelineRunner({ strategy, sourceFactory? })` + wraps it; the worker handlers are `(msg) => runner.run({ job, payload })`. 
+ Adding a strategy means a new factory and a new wiring line — never + editing the worker. The archived `basic-file-analysis/` is `.archived` + (not compiled). +- **Per-job LLM credentials flow payload → context → call site.** The + runner (`pipeline/run.ts` for index, `pipeline/pull.ts` for pull) reads + `{llmApiKey, llmProvider, llmModel}` from the payload, packs them into + an `AskLlmOptions` bag stored on `StrategyContext.llmCallContext`, and + every LLM-touching phase passes that bag into `askJsonLLM` / + `askYesNoLLM`. OSS standalone leaves these unset and falls back to + `Config.OpenrouterApiKey` + `Config.LlmProvider`. - **State transitions are explicit and dual-written.** `pipeline/run.ts` transitions Mongo state to `PROCESSING` before any work, `PROCESSED` on success, `FAILED` best-effort on uncaught errors. Each transition mirrors diff --git a/packages/ingest-github/src/index.ts b/packages/ingest-github/src/index.ts index 49afde3..e24d58b 100644 --- a/packages/ingest-github/src/index.ts +++ b/packages/ingest-github/src/index.ts @@ -10,16 +10,17 @@ import { COMBINED_CODE_ANALYSIS_SYSTEM_PROMPT, buildFileAnalysisUserPrompt, } from "./strategies/flat-folder/prompts/file-analysis.ts"; -import type { SourceFactory } from "./types/pipeline.ts"; +import type { PullFactory, SourceFactory } from "./types/pipeline.ts"; /** - * Optional dependencies for the GitHub workers. Today only one field is - * exposed: a source factory. Documented in `docs/extension-points.md`. - * The open-source binary leaves this undefined — the default disk reader - * runs unchanged. + * Optional dependencies for the GitHub workers. Both factories are + * documented in `docs/extension-points.md`. The open-source binary + * leaves both undefined — index and pull use the default disk-backed + * readers and a local `git clone` / `git diff`. 
*/ export interface RegisterGithubWorkersDeps { sourceFactory?: SourceFactory; + pullFactory?: PullFactory; } function buildRunner(sourceFactory: SourceFactory | undefined): ReturnType { @@ -38,7 +39,8 @@ function buildRunner(sourceFactory: SourceFactory | undefined): ReturnType runPull(msg, pullFactory)); } export function registerLocalIngestWorker(): void { @@ -63,7 +65,11 @@ export type { SourceFactory, SourceFactoryInput, SourceFactoryResult, + PullFactory, + PullFactoryInput, + PullFactoryResult, } from "./types/pipeline.ts"; +export type { DiffResult, RenamedFile } from "./pipeline/git-diff.ts"; export type { CondensedFileAnalysis } from "./types/condensed-file-analysis.ts"; export { fetchLatestCommitHash, fetchRecentCommits, parseGithubRepo } from "./githubApi.ts"; export type { CommitEntry, FetchCommitsResult, ParsedRepo } from "./githubApi.ts"; diff --git a/packages/ingest-github/src/pipeline/context.md b/packages/ingest-github/src/pipeline/context.md index 9fdf38f..6c8cf3d 100644 --- a/packages/ingest-github/src/pipeline/context.md +++ b/packages/ingest-github/src/pipeline/context.md @@ -62,11 +62,17 @@ llmCallContext`, which every LLM call site downstream consumes. State transitions (`CREATED → QUEUED → INGESTED → …`) are persisted to Mongo - Neo4j via `transitionState`, and `CancellationError` is re-thrown without flipping to FAILED. -- `pull.ts` — `runPull(msg)` orchestrates the pull job. Mirrors the same - payload-to-`llmCallContext` extraction as `run.ts` and threads the - resulting `AskLlmOptions` bag into every phase invocation - (`analyseChangedFiles`, `processBigFilesQueue`, `backfillMissingFields`, - `backfillBigFiles`, `runSelectiveFolderSummary`, `summariseRepo`). +- `pull.ts` — `runPull(msg, pullFactory?)` orchestrates the pull job. 
+ When `pullFactory` is provided, it returns `{source, diff, targetCommit, +archiveSink?}` and `runPull` skips `syncRepository` + `materialiseEndpoints` + + `assertReachableFromBranch` + `computePullDiff` + `checkoutCommit` — + the factory is the sole source of truth. When `pullFactory` is undefined + (open-source default), the legacy git-based path runs. Either path + produces the same downstream pipeline: snapshot prior version, + `analyseChangedFiles` (now reading via `SourceReader`), + `processBigFilesQueue`, `backfillMissingFields`, `backfillBigFiles`, + `runSelectiveFolderSummary`, `summariseRepo`, `storePullAnalysis`. + Mirrors `run.ts` for `llmCallContext` extraction from payload. - `branch.ts` — `resolveBranch(knowledgeId, payload)`. Defaults to `main` when the payload omits it; rejects branch names that don't match `^[\w./-]+$` with `IngestError` (defence against shell-injection into git args). diff --git a/packages/ingest-github/src/pipeline/pull.ts b/packages/ingest-github/src/pipeline/pull.ts index ea2c9d3..24cf87e 100644 --- a/packages/ingest-github/src/pipeline/pull.ts +++ b/packages/ingest-github/src/pipeline/pull.ts @@ -8,10 +8,11 @@ import { logger } from "@bb/logger"; import { ensureMetaDirs, metaPathsFor, repoCloneDir, ensureReposRoot } from "./paths.ts"; import { readHeadCommitHash, syncRepository } from "./source.ts"; import { CancellationError, clearCancellation, throwIfCancelled } from "./cancellation.ts"; -import { assertReachableFromBranch, checkoutCommit } from "./git-diff.ts"; +import { assertReachableFromBranch, checkoutCommit, type DiffResult } from "./git-diff.ts"; import { computePullDiff, materialiseEndpoints } from "./pull-diff-resolver.ts"; import { affectedFoldersFromDiff } from "./affected-folders.ts"; import { createDiskSourceReader } from "./disk-source-reader.ts"; +import type { PullFactory, SourceReader, ArchiveSink } from "src/types/pipeline.ts"; import { analyseChangedFiles } from 
"src/strategies/flat-folder/analyse-changed.ts"; import { processBigFilesQueue } from "src/strategies/flat-folder/phases/process-big-files.ts"; import { backfillMissingFields } from "src/strategies/flat-folder/backfill/fields.ts"; @@ -52,7 +53,7 @@ function llmCallContextFromPayload(payload: { return Object.keys(ctx).length > 0 ? ctx : undefined; } -export async function runPull(msg: JobMessage): Promise { +export async function runPull(msg: JobMessage, pullFactory?: PullFactory): Promise { const { knowledgeId } = msg.payload; if (msg.payload.targetCommitHash !== undefined && !COMMIT_HASH_RE.test(msg.payload.targetCommitHash)) { throw new IngestError( @@ -86,52 +87,72 @@ export async function runPull(msg: JobMessage): Promise try { throwIfCancelled(knowledgeId); - await ensureReposRoot(); - const repoDir = repoCloneDir(knowledgeId); - const cloneOpts: { repoUrl: string; branch: string; destinationDir: string; gitToken?: string } = { - repoUrl, - branch, - destinationDir: repoDir, - }; - if (gitToken !== undefined) { - cloneOpts.gitToken = gitToken; - } - await syncRepository(cloneOpts); - - const branchHead = await readHeadCommitHash(repoDir); - if (branchHead === "unknown") { - throw new IngestError(knowledgeId, "could not resolve branch HEAD after clone"); - } - const targetCommit = msg.payload.targetCommitHash ?? branchHead; - - if (targetCommit === currentCommit) { - logger.info(`pull: ${knowledgeId} already at ${targetCommit.slice(0, 12)}; no-op`); - await transitionState(knowledgeId, KnowledgeState.Processed); - return; - } - - // Deepen the shallow clone first so historical commits selected via the - // picker become visible to `merge-base --is-ancestor`. Without this the - // assertion below rejects every non-HEAD pick on a `--depth=1` clone. 
- await materialiseEndpoints(repoDir, branch, currentCommit, targetCommit); - if (!(await assertReachableFromBranch(repoDir, targetCommit, branch))) { - throw new IngestError( - knowledgeId, - `target commit ${targetCommit} is not reachable from origin/${branch}. Cross-branch pulls are not supported; create a fresh github_index job for the new branch.`, - ); + let source: SourceReader; + let diff: DiffResult; + let targetCommit: string; + let archiveSink: ArchiveSink | undefined; + + if (pullFactory !== undefined) { + const factoryResult = await pullFactory({ knowledgeId, payload: msg.payload, currentCommit, branch }); + source = factoryResult.source; + diff = factoryResult.diff; + targetCommit = factoryResult.targetCommit; + archiveSink = factoryResult.archiveSink; + logger.info(`pull: pull factory wired (knowledgeId=${knowledgeId}, target=${targetCommit.slice(0, 12)})`); + if (targetCommit === currentCommit) { + logger.info(`pull: ${knowledgeId} already at ${targetCommit.slice(0, 12)}; no-op`); + await transitionState(knowledgeId, KnowledgeState.Processed); + return; + } + } else { + await ensureReposRoot(); + const repoDir = repoCloneDir(knowledgeId); + const cloneOpts: { repoUrl: string; branch: string; destinationDir: string; gitToken?: string } = { + repoUrl, + branch, + destinationDir: repoDir, + }; + if (gitToken !== undefined) { + cloneOpts.gitToken = gitToken; + } + await syncRepository(cloneOpts); + + const branchHead = await readHeadCommitHash(repoDir); + if (branchHead === "unknown") { + throw new IngestError(knowledgeId, "could not resolve branch HEAD after clone"); + } + targetCommit = msg.payload.targetCommitHash ?? 
branchHead; + + if (targetCommit === currentCommit) { + logger.info(`pull: ${knowledgeId} already at ${targetCommit.slice(0, 12)}; no-op`); + await transitionState(knowledgeId, KnowledgeState.Processed); + return; + } + + // Deepen the shallow clone first so historical commits selected via the + // picker become visible to `merge-base --is-ancestor`. Without this the + // assertion below rejects every non-HEAD pick on a `--depth=1` clone. + await materialiseEndpoints(repoDir, branch, currentCommit, targetCommit); + + if (!(await assertReachableFromBranch(repoDir, targetCommit, branch))) { + throw new IngestError( + knowledgeId, + `target commit ${targetCommit} is not reachable from origin/${branch}. Cross-branch pulls are not supported; create a fresh github_index job for the new branch.`, + ); + } + + diff = await computePullDiff(repoDir, currentCommit, targetCommit); + await checkoutCommit(repoDir, targetCommit); + source = createDiskSourceReader({ repoDir, commitHash: targetCommit }); } - const diff = await computePullDiff(repoDir, currentCommit, targetCommit); - throwIfCancelled(knowledgeId); await snapshotFilesToVersion({ knowledgeId, commitHash: currentCommit }).catch((cause: unknown) => { const msgText = cause instanceof Error ? 
cause.message : String(cause); logger.warn(`pull: snapshot of ${currentCommit.slice(0, 12)} failed (non-fatal): ${msgText}`); }); - await checkoutCommit(repoDir, targetCommit); - const metaPaths = metaPathsFor(knowledgeId); await ensureMetaDirs(metaPaths); @@ -148,7 +169,7 @@ export async function runPull(msg: JobMessage): Promise throwIfCancelled(knowledgeId); const analyseChangedInput: Parameters[0] = { knowledgeId, - repoDir, + source, metaPaths, analyzer: fileAnalyzer, diff, @@ -156,10 +177,11 @@ export async function runPull(msg: JobMessage): Promise if (llmCallContext !== undefined) { analyseChangedInput.llmCallContext = llmCallContext; } + if (archiveSink !== undefined) { + analyseChangedInput.archiveSink = archiveSink; + } await analyseChangedFiles(analyseChangedInput); - const source = createDiskSourceReader({ repoDir, commitHash: targetCommit }); - logger.info(`pull: phase process big files starting`); throwIfCancelled(knowledgeId); const processBigFilesInput: Parameters[0] = { knowledgeId, source, metaPaths }; diff --git a/packages/ingest-github/src/strategies/flat-folder/analyse-changed.ts b/packages/ingest-github/src/strategies/flat-folder/analyse-changed.ts index 1f10ae8..6db4a7a 100644 --- a/packages/ingest-github/src/strategies/flat-folder/analyse-changed.ts +++ b/packages/ingest-github/src/strategies/flat-folder/analyse-changed.ts @@ -1,10 +1,9 @@ import path from "node:path"; -import { readFile, stat } from "node:fs/promises"; -import { tokenLen } from "@bb/llm"; +import { tokenLen, type AskLlmOptions } from "@bb/llm"; import { logger } from "@bb/logger"; import { Config } from "@bb/types"; import { getConfigValue } from "@bb/config"; -import type { FileAnalyzer, ScannedFile } from "src/types/pipeline.ts"; +import type { ArchiveSink, FileAnalyzer, ScannedFile, SourceReader } from "src/types/pipeline.ts"; import type { MetaPaths } from "src/types/meta-paths.ts"; import type { BigFileEntry } from "src/types/big-file.ts"; import { looksBinary, 
passesPathFilters } from "src/pipeline/filters.ts"; @@ -17,10 +16,13 @@ import { readBigFiles, writeBigFiles } from "src/strategies/flat-folder/big-file export interface AnalyseChangedInput { knowledgeId: string; - repoDir: string; + source: SourceReader; metaPaths: MetaPaths; analyzer: FileAnalyzer; diff: DiffResult; + llmCallContext?: AskLlmOptions; + /** Optional non-fatal archive sink. When set, analysed content is pushed after `saveCondensed`. */ + archiveSink?: ArchiveSink; /** * Invoked once per consumed path (analysed, stubbed, queued-as-big-file, * filtered, or failed). Lets the caller drive a `processedFiles` counter @@ -39,19 +41,24 @@ export interface AnalyseChangedResult { } /** - * Pull-time per-file dispatcher. Iterates the changed file set from the git + * Pull-time per-file dispatcher. Iterates the changed file set from the * diff and runs the same per-file work as `classifyAndAnalyseSmall`, but * targeted at known paths rather than a tree walk. * - * For added / modified / renamed-to paths: read content, apply static path - * filters, classify by tokens. Small files run the analyser inline and - * persist a `CondensedFileAnalysis`. Files above the context window join - * `bigFiles.json` for the big-file phase. Files above the absolute size cap - * get an oversized stub. + * Reads file content through `input.source` (a `SourceReader`) so the + * dispatcher works with both the disk-backed reader (OSS default) and + * any HTTP-backed alternative supplied via the pull factory hook. * - * The dispatcher does NOT invoke the skip-decision LLM gate. Pulls re-analyse - * paths that already passed the gate during the initial index (or paths so - * new the gate has not seen them yet — for v1 we accept that lag). + * For added / modified / renamed-to paths: read content, apply static + * path filters, classify by tokens. Small files run the analyser inline + * and persist a `CondensedFileAnalysis`. 
Files above the context window + * join `bigFiles.json` for the big-file phase. Files above the absolute + * size cap get an oversized stub. + * + * The dispatcher does NOT invoke the skip-decision LLM gate. Pulls + * re-analyse paths that already passed the gate during the initial + * index (or paths so new the gate has not seen them yet — for v1 we + * accept that lag). */ export async function analyseChangedFiles(input: AnalyseChangedInput): Promise { const contextWindowLimit = getConfigValue(Config.ContextWindowLimit); @@ -87,15 +94,19 @@ export async function analyseChangedFiles(input: AnalyseChangedInput): Promise absoluteCap) { bigFileBuffer.push({ @@ -114,19 +125,10 @@ export async function analyseChangedFiles(input: AnalyseChangedInput): Promise bigFileLineThreshold) { bigFileBuffer.push({ relativePath, @@ -158,16 +160,25 @@ export async function analyseChangedFiles(input: AnalyseChangedInput): Promise { try { throwIfCancelled(input.knowledgeId); - const condensed = await analyseScannedFile(input.analyzer, scanned); + const condensed = await analyseScannedFile(input.analyzer, scanned, input.llmCallContext); await saveCondensed(input.metaPaths, condensed); + if (input.archiveSink !== undefined) { + await input.archiveSink.push({ + knowledgeId: input.knowledgeId, + relativePath: filePath, + content: fileContent, + }); + } smallFilesAnalysed += 1; } catch (cause: unknown) { if (cause instanceof CancellationError) { diff --git a/packages/ingest-github/src/strategies/flat-folder/context.md b/packages/ingest-github/src/strategies/flat-folder/context.md index a5e6c07..ee75ffc 100644 --- a/packages/ingest-github/src/strategies/flat-folder/context.md +++ b/packages/ingest-github/src/strategies/flat-folder/context.md @@ -35,7 +35,8 @@ sub-phase boundary. - `index.ts` — `createFlatFolderStrategy(deps)` orchestrates the 7 phases. - `types.ts` — `AnalyzedFileEntry`, `FolderSummary`, `RepoSummary`, `RepoSummaryEnvelope`, `FlatFolderResult`. 
-- `analyse-file.ts` — `analyseScannedFile(analyzer, file)` + `buildOversizedStub`. +- `analyse-file.ts` — `analyseScannedFile(analyzer, file, llmCallContext?)` + `buildOversizedStub`. +- `analyse-changed.ts` — `analyseChangedFiles({knowledgeId, source, metaPaths, analyzer, diff, llmCallContext?, archiveSink?})`. Pull-time per-file dispatcher. Reads changed file content through `input.source` (a `SourceReader`) so it works with both the disk-backed reader (OSS default) and any HTTP-backed alternative supplied via the `pullFactory` hook. Mirrors `classifyAndAnalyseSmall`'s small-file path: filter → fetch → size cap → binary detect → line count → analyse → save + archive push. Does NOT invoke the skip-decision LLM gate. - `folder-path.ts` — `directFolderOf`, `affectedFolderPaths`. - `folder-summary.ts` — group + summarise + persist + iterate folder summaries. - `repo-summary.ts` — single-shot or batched repo summary with envelope writer. diff --git a/packages/ingest-github/src/types/context.md b/packages/ingest-github/src/types/context.md index 6e9b378..87b2cea 100644 --- a/packages/ingest-github/src/types/context.md +++ b/packages/ingest-github/src/types/context.md @@ -23,13 +23,17 @@ llmCallContext? }`; `llmCallContext` is the optional `AskLlmOptions` `SourceReader` / `ScanDeps` (the repository-read abstraction; default implementation in `pipeline/disk-source-reader.ts`), `ArchiveSink` / `ArchiveSinkInput` (an optional non-fatal sink that the open-source - binary never calls), and `SourceFactory` / `SourceFactoryInput` / - `SourceFactoryResult` (the optional injection hook surfaced through - `registerGithubWorkers`; see `docs/extension-points.md`). + binary never calls), `SourceFactory` / `SourceFactoryInput` / + `SourceFactoryResult` (the optional index-side injection hook surfaced + through `registerGithubWorkers`), and `PullFactory` / `PullFactoryInput` + / `PullFactoryResult` (the analogous pull-side injection hook). 
`FileAnalyzer.analyze()`, `SkipDeciderInput`, and `ScanDeps` each accept an optional `llmCallContext?: AskLlmOptions` so per-job credentials flow from `StrategyContext` into every LLM call site without breaking - the OSS standalone (defaults to undefined → config-driven). + the OSS standalone (defaults to undefined → config-driven). Both + factories are documented in `docs/extension-points.md`. The two are + separate because pull additionally needs a `diff` and a resolved + `targetCommit`, which index doesn't. - `meta-paths.ts` — `MetaPaths` shape (`~/.bytebell/repos/.meta//...`). - `file-analysis.ts` — `FALLBACK_LANGUAGE = "unknown"` and `emptyFileAnalysis()` factory. Both consumed by the LLM adapter and the big-file condenser. From d0fbbfa17c40fbc08245a1b4ce6472c5ec6b33e7 Mon Sep 17 00:00:00 2001 From: Dead-Bytes <143434285+Dead-Bytes@users.noreply.github.com> Date: Thu, 14 May 2026 00:36:30 +0530 Subject: [PATCH 04/34] feat(internal): enhance GitHub ingestion with new pipeline and LLM support --- packages/ingest-github/context.md | 24 ++++++++-- packages/ingest-github/src/context.md | 49 ++++++++++++++------- packages/ingest-github/src/index.ts | 7 +++ packages/ingest-github/src/pipeline/pull.ts | 4 +- packages/ingest-github/src/pipeline/run.ts | 4 +- packages/types/context.md | 17 ++++--- packages/types/src/context.md | 16 ++++--- packages/types/src/job.ts | 18 +++++++- 8 files changed, 101 insertions(+), 38 deletions(-) diff --git a/packages/ingest-github/context.md b/packages/ingest-github/context.md index ba9599d..be7800b 100644 --- a/packages/ingest-github/context.md +++ b/packages/ingest-github/context.md @@ -58,6 +58,7 @@ The package does **not** own: ## Public exports ```ts +// High-level registration (OSS standalone wires this once at boot) function registerGithubWorkers(deps?: RegisterGithubWorkersDeps): void // wires GithubIndex + GithubPull function registerLocalIngestWorker(): void // wires LocalIngest @@ -65,6 +66,18 @@ interface 
RegisterGithubWorkersDeps { sourceFactory?: SourceFactory; // index-side hook pullFactory?: PullFactory; // pull-side hook (provides reader + diff + targetCommit) } + +// Lower-level building blocks (downstream consumers with their own queue +// skip registerGithubWorkers and wire these against their own registry) +function createPipelineRunner(deps: CreatePipelineRunnerDeps): IngestRunnerDeps +function createGithubIngestHandler(deps: IngestJobHandlerDeps): (msg) => Promise +function createLocalIngestHandler(deps: IngestJobHandlerDeps): (msg) => Promise +function runPull(msg: JobMessage, pullFactory?: PullFactory): Promise +function reposRoot(): string + +function createFlatFolderStrategy(deps): IngestStrategy +function createLlmFileAnalyzer(deps): FileAnalyzer +function createDiskSourceReader(deps): SourceReader ``` The optional `sourceFactory` lets downstream consumers inject a custom @@ -77,10 +90,13 @@ checkout` for pull). See [docs/extension-points.md](docs/extension-points.md) for the design rationale. For per-job LLM credentials, downstream consumers set -`{ llmApiKey?, llmProvider?, llmModel? }` on the `GithubIndexPayload` / -`GithubPullPayload` they enqueue (`PayloadLlmOverrides` from `@bb/types`). -The runner extracts those into `StrategyContext.llmCallContext` and every -LLM call site forwards it to `@bb/llm`. OSS standalone leaves the overrides +`{ llmApiKey?, llmProvider?, llmModel?, llmKeyId? }` on the +`GithubIndexPayload` / `GithubPullPayload` they enqueue +(`PayloadLlmOverrides` from `@bb/types`). The runner extracts those into +`StrategyContext.llmCallContext` and every LLM call site forwards it to +`@bb/llm`. `llmProvider` is `string` (open) so multi-provider consumers +can carry richer taxonomies; the OSS LLM client narrows to +`openrouter`/`ollama` at the boundary. OSS standalone leaves the overrides unset and falls back to `Config.OpenrouterApiKey` + `Config.LlmProvider`. Both `register*Workers()` calls run once at `@bb/server` boot. 
The diff --git a/packages/ingest-github/src/context.md b/packages/ingest-github/src/context.md index c71cd2f..2c22181 100644 --- a/packages/ingest-github/src/context.md +++ b/packages/ingest-github/src/context.md @@ -11,23 +11,38 @@ Domain (composes infra: `@bb/config`, `@bb/llm`, `@bb/mongo`, `@bb/neo4j`, ## Top-level files -- **[index.ts](index.ts)** — public surface. `registerGithubWorkers`, - `registerLocalIngestWorker`, `createFlatFolderStrategy`, - `createLlmFileAnalyzer`, `createDiskSourceReader`, the - `SourceReader` / `ScanEntry` / `ScannedFile` / `OversizedFile` / - `ScanDeps` / `ArchiveSink` / `ArchiveSinkInput` / `SourceFactory` / - `SourceFactoryInput` / `SourceFactoryResult` / `PullFactory` / - `PullFactoryInput` / `PullFactoryResult` / `DiffResult` / - `RenamedFile` / `FileAnalyzer` / `AnalyzedFileResult` port types, the - `IngestStrategy` / `StrategyInput` / `StrategyResult` / - `StrategyContext` types, and `CondensedFileAnalysis`. Plus - `parseGithubRepo` / `fetchLatestCommitHash` / `fetchRecentCommits` - (used by the pull route). `registerGithubWorkers` accepts optional - `sourceFactory` (index) and `pullFactory` (pull) injections through - `RegisterGithubWorkersDeps`; the open-source binary leaves both - undefined. It registers both `JobType.GithubIndex` (full re-index, via - `runner.run` + optional `sourceFactory`) and `JobType.GithubPull` - (incremental diff-and-apply via `runPull` + optional `pullFactory`). +- **[index.ts](index.ts)** — public surface. The high-level + registration helpers (`registerGithubWorkers`, `registerLocalIngestWorker`) + for the OSS standalone, plus the lower-level building blocks downstream + consumers wire against their own queue/registry: + - Factories: `createFlatFolderStrategy`, `createLlmFileAnalyzer`, + `createDiskSourceReader`, `createPipelineRunner` (the orchestrator), + `createGithubIngestHandler` / `createLocalIngestHandler` (the BullMQ + processor factories used internally by `registerGithubWorkers`). 
+ - Direct runner: `runPull(msg, pullFactory?)` — the pull worker the + enterprise wrapper invokes directly from its own registry. + - Helper: `reposRoot()` — resolves `~/.bytebell/repos`. + - Port types: `SourceReader` / `ScanEntry` / `ScannedFile` / + `OversizedFile` / `ScanDeps` / `ArchiveSink` / `ArchiveSinkInput` / + `SourceFactory` / `SourceFactoryInput` / `SourceFactoryResult` / + `PullFactory` / `PullFactoryInput` / `PullFactoryResult` / + `DiffResult` / `RenamedFile` / `FileAnalyzer` / `AnalyzedFileResult`. + - Runner types: `IngestRunnerDeps` / `IngestRunnerInput` / + `IngestJobHandlerDeps` / `CreatePipelineRunnerDeps`. + - Strategy types: `IngestStrategy` / `StrategyInput` / `StrategyResult` / + `StrategyContext`. + - `CondensedFileAnalysis`. + - GitHub helpers: `parseGithubRepo` / `fetchLatestCommitHash` / + `fetchRecentCommits`. + `registerGithubWorkers` accepts optional `sourceFactory` (index) and + `pullFactory` (pull) injections through `RegisterGithubWorkersDeps`; + the open-source binary leaves both undefined. It registers both + `JobType.GithubIndex` (full re-index, via `runner.run` + optional + `sourceFactory`) and `JobType.GithubPull` (incremental diff-and-apply + via `runPull` + optional `pullFactory`). Downstream consumers that + bring their own queue (e.g. the enterprise wrapper using `@bytebell/queue`) + skip `registerGithubWorkers` entirely and call `createPipelineRunner`, + `createGithubIngestHandler`, and `runPull` directly. - **[githubApi.ts](githubApi.ts)** — `parseGithubRepo(repoUrl)` and `fetchLatestCommitHash(owner, repo, branch, gitToken?)`. **Pull-only utility**; revisit in the pull plan. 
Kept in place rather than deleted so diff --git a/packages/ingest-github/src/index.ts b/packages/ingest-github/src/index.ts index e24d58b..ead22c6 100644 --- a/packages/ingest-github/src/index.ts +++ b/packages/ingest-github/src/index.ts @@ -51,6 +51,13 @@ export function registerLocalIngestWorker(): void { export { createFlatFolderStrategy } from "./strategies/flat-folder/index.ts"; export { createLlmFileAnalyzer } from "./adapters/llm-file-analyzer.ts"; export { createDiskSourceReader } from "./pipeline/disk-source-reader.ts"; +export { createPipelineRunner } from "./pipeline/run.ts"; +export type { CreatePipelineRunnerDeps } from "./pipeline/run.ts"; +export { createGithubIngestHandler, createLocalIngestHandler } from "./handlers/ingest-job.ts"; +export type { IngestJobHandlerDeps } from "./handlers/ingest-job.ts"; +export { runPull } from "./pipeline/pull.ts"; +export { reposRoot } from "./pipeline/paths.ts"; +export type { IngestRunnerDeps, IngestRunnerInput } from "./types/ingest-runner.ts"; export type { IngestStrategy, StrategyInput, StrategyResult, StrategyContext } from "./types/strategy.ts"; export type { FileAnalyzer, diff --git a/packages/ingest-github/src/pipeline/pull.ts b/packages/ingest-github/src/pipeline/pull.ts index 24cf87e..c3f08f3 100644 --- a/packages/ingest-github/src/pipeline/pull.ts +++ b/packages/ingest-github/src/pipeline/pull.ts @@ -37,14 +37,14 @@ function resolveOrgId(payload: { orgId?: string }): string { function llmCallContextFromPayload(payload: { llmApiKey?: string; - llmProvider?: "openrouter" | "ollama"; + llmProvider?: string; llmModel?: string; }): AskLlmOptions | undefined { const ctx: AskLlmOptions = {}; if (payload.llmApiKey !== undefined && payload.llmApiKey.length > 0) { ctx.apiKey = payload.llmApiKey; } - if (payload.llmProvider !== undefined) { + if (payload.llmProvider === "openrouter" || payload.llmProvider === "ollama") { ctx.provider = payload.llmProvider; } if (payload.llmModel !== undefined && 
payload.llmModel.length > 0) { diff --git a/packages/ingest-github/src/pipeline/run.ts b/packages/ingest-github/src/pipeline/run.ts index 31924f3..e0b5482 100644 --- a/packages/ingest-github/src/pipeline/run.ts +++ b/packages/ingest-github/src/pipeline/run.ts @@ -23,14 +23,14 @@ function resolveOrgId(payload: { orgId?: string }): string { function llmCallContextFromPayload(payload: { llmApiKey?: string; - llmProvider?: "openrouter" | "ollama"; + llmProvider?: string; llmModel?: string; }): AskLlmOptions | undefined { const ctx: AskLlmOptions = {}; if (payload.llmApiKey !== undefined && payload.llmApiKey.length > 0) { ctx.apiKey = payload.llmApiKey; } - if (payload.llmProvider !== undefined) { + if (payload.llmProvider === "openrouter" || payload.llmProvider === "ollama") { ctx.provider = payload.llmProvider; } if (payload.llmModel !== undefined && payload.llmModel.length > 0) { diff --git a/packages/types/context.md b/packages/types/context.md index ad0d679..31c18a4 100644 --- a/packages/types/context.md +++ b/packages/types/context.md @@ -19,10 +19,15 @@ Single home for shared types and enums that cross package boundaries: `PayloadLlmOverrides` — the queue/job vocabulary shared between `@bb/queue` (publisher) and `@bb/ingest-*` packages (worker handlers). `PayloadLlmOverrides` is the optional `{ llmApiKey?, llmProvider?, -llmModel? }` mixin that lets downstream consumers carry per-job LLM - credentials through the payload (the extension point used by - the enterprise wrapper to inject per-org credentials at the enqueue - boundary). Mixed into both GitHub payloads. +llmModel?, llmKeyId? }` mixin that lets downstream consumers carry per-job + LLM credentials through the payload (the extension point used by the + enterprise wrapper to inject per-org credentials at the enqueue + boundary). 
`llmProvider` is intentionally typed as `string` rather than + a closed union — OSS standalone uses `"openrouter"`/`"ollama"`, but + downstream consumers may carry richer taxonomies (`"anthropic"`, + `"gemini"`, …) that OSS ignores at runtime. `llmKeyId` is opaque to OSS; + it's an audit pointer kept by downstream consumers. Mixed into both + GitHub payloads. - `KnowledgeState` — the processing-status lifecycle enum (`CREATED → QUEUED → INGESTED → PROCESSING → PROCESSED ↘ FAILED`) referenced by `@bb/queue` (writes `QUEUED`), `@bb/mongo` (`setKnowledgeState`), and @@ -39,9 +44,9 @@ enum Config { ... } enum JobType { GithubIndex, GithubPull, LocalIngest } enum JobPriority { Low, Normal, High } -interface PayloadLlmOverrides { llmApiKey?, llmProvider?: "openrouter" | "ollama", llmModel? } +interface PayloadLlmOverrides { llmApiKey?, llmProvider?: string, llmModel?, llmKeyId? } interface GithubIndexPayload extends PayloadLlmOverrides { knowledgeId, repoUrl, branch?, commitHash?, gitToken?, orgId? } -interface GithubPullPayload extends PayloadLlmOverrides { knowledgeId, targetCommitHash?, gitToken? } +interface GithubPullPayload extends PayloadLlmOverrides { knowledgeId, orgId?, targetCommitHash?, gitToken? } interface LocalIngestPayload { knowledgeId, rootDir, orgId? } interface JobMessage<P> { id, type, priority, knowledgeId, attempt, createdAt, payload } type PayloadFor diff --git a/packages/types/src/context.md b/packages/types/src/context.md index e2a6ed1..6786ddd 100644 --- a/packages/types/src/context.md +++ b/packages/types/src/context.md @@ -24,11 +24,17 @@ package-level contract; this file documents how the source tree is split. it and the pipeline reads `Config.OrgId` from `~/.bytebell/config.json` (locked to `"local"` in OSS builds; downstream enterprise builds set `orgId` per-job). Both GitHub payloads also extend `PayloadLlmOverrides` - which adds optional `llmApiKey?`, `llmProvider?`, `llmModel?` — the - extension point that lets downstream enterprise builds resolve per-org - LLM credentials at the enqueue boundary and pass them through the - payload. OSS standalone leaves the LLM fields unset and the pipeline - falls back to `Config.OpenrouterApiKey` + `Config.LlmProvider`. + which adds optional `llmApiKey?`, `llmProvider?: string`, `llmModel?`, + `llmKeyId?` — the extension point that lets downstream enterprise + builds resolve per-org LLM credentials at the enqueue boundary and + pass them through the payload. `llmProvider` is `string` (not a closed + union) so multi-provider enterprise consumers can carry `"anthropic"`, + `"gemini"`, etc.; OSS narrows to `"openrouter"`/`"ollama"` at the LLM + client boundary. `llmKeyId` is opaque audit metadata OSS ignores. OSS + standalone leaves all four fields unset and the pipeline falls back to + `Config.OpenrouterApiKey` + `Config.LlmProvider`. `GithubPullPayload` + also carries an optional `orgId?` so downstream multi-tenant workers + can scope Mongo/Neo4j lookups by org. - **[knowledge.ts](knowledge.ts)** — the `KnowledgeState` enum modeling the lifecycle in [CLAUDE.md](../../../CLAUDE.md). 
v0 only ships the enum; the full `Knowledge` document interface lands when domain CRUD diff --git a/packages/types/src/job.ts b/packages/types/src/job.ts index 6fa8142..befc5df 100644 --- a/packages/types/src/job.ts +++ b/packages/types/src/job.ts @@ -15,12 +15,20 @@ export enum JobPriority { * `Config.OpenrouterApiKey` and `Config.LlmProvider` for the duration of this * job's processing. Used by downstream consumers (e.g. the enterprise wrapper) * that resolve per-org credentials at the enqueue boundary and infuse them - * into the payload — OSS standalone leaves all three unset. + * into the payload — OSS standalone leaves all four unset. + * + * `llmProvider` is intentionally `string` rather than a closed union: OSS + * standalone uses `"openrouter"` or `"ollama"` (the only values the LLM + * client routes on today), but downstream consumers may carry richer + * provider taxonomies (`"anthropic"`, `"gemini"`, `"mistral"`, …) that the + * OSS client ignores. The `llmKeyId` field is opaque to OSS — kept as an + * audit pointer back to the resolver's source of truth. */ export interface PayloadLlmOverrides { llmApiKey?: string; - llmProvider?: "openrouter" | "ollama"; + llmProvider?: string; llmModel?: string; + llmKeyId?: string; } export interface GithubIndexPayload extends PayloadLlmOverrides { @@ -34,6 +42,12 @@ export interface GithubIndexPayload extends PayloadLlmOverrides { export interface GithubPullPayload extends PayloadLlmOverrides { knowledgeId: string; + /** + * Optional org binding. OSS standalone leaves this unset and the pipeline + * reads `Config.OrgId` (locked to `"local"`). Downstream multi-tenant + * deployments stamp it from the request so worker lookups can scope by org. + */ + orgId?: string; /** * Optional commit to re-index the knowledge to. Must be a 40-character hex SHA * and must be reachable from `origin/`. 
When omitted, the From a4cb935a41637323aaf2ccb1699c1d501aa2cfb0 Mon Sep 17 00:00:00 2001 From: lovanshu garg Date: Thu, 14 May 2026 00:50:57 +0530 Subject: [PATCH 05/34] feat(interactive): branch fetching --- packages/cli/README.md | 5 +- packages/cli/src/BranchSelector.tsx | 130 ++++++++ packages/cli/src/IndexCommand.ts | 117 ++++++- packages/cli/src/InitialBranchSelector.tsx | 67 ++++ packages/cli/src/LsCommand.ts | 26 +- packages/cli/src/LsInteractive.tsx | 289 ++++++++++++++++++ packages/cli/src/ManualBranchPrompt.tsx | 42 +++ packages/cli/src/branchPrompts.ts | 71 +++++ packages/cli/src/httpClient.ts | 18 +- packages/cli/src/lsInteractivePrompt.ts | 23 ++ packages/cli/src/pullPrompts.ts | 2 +- .../src/{githubApi.ts => githubCommit.ts} | 73 +---- packages/ingest-github/src/githubRepo.ts | 139 +++++++++ packages/ingest-github/src/githubUrl.ts | 53 ++++ packages/ingest-github/src/index.ts | 10 +- packages/ingest-github/src/pipeline/branch.ts | 28 +- packages/ingest-github/src/pipeline/run.ts | 8 +- packages/mongo/src/index.ts | 1 + packages/mongo/src/knowledge.ts | 12 + packages/neo4j/src/index.ts | 7 +- packages/neo4j/src/knowledge.ts | 13 + packages/server/src/githubProbeRoute.ts | 60 ++++ packages/server/src/routes.ts | 2 + 23 files changed, 1087 insertions(+), 109 deletions(-) create mode 100644 packages/cli/src/BranchSelector.tsx create mode 100644 packages/cli/src/InitialBranchSelector.tsx create mode 100644 packages/cli/src/LsInteractive.tsx create mode 100644 packages/cli/src/ManualBranchPrompt.tsx create mode 100644 packages/cli/src/branchPrompts.ts create mode 100644 packages/cli/src/lsInteractivePrompt.ts rename packages/ingest-github/src/{githubApi.ts => githubCommit.ts} (66%) create mode 100644 packages/ingest-github/src/githubRepo.ts create mode 100644 packages/ingest-github/src/githubUrl.ts create mode 100644 packages/server/src/githubProbeRoute.ts diff --git a/packages/cli/README.md b/packages/cli/README.md index 619a054..d1b0fd6 100644 ---
a/packages/cli/README.md +++ b/packages/cli/README.md @@ -51,7 +51,8 @@ infra/docker/docker-compose.yml up -d`, polls prefer `bytebell boot`. - `bytebell index ` / `bytebell ingest [path]` / `bytebell ls` — talk HTTP to a running server (lazy-spawn via - `serverSpawn.ensureServerRunning` when the daemon is down). + `serverSpawn.ensureServerRunning` when the daemon is down). `ls` supports + an interactive mode (`-i`) for hierarchical browsing of repos and commits. - `bytebell delete` — list indexed knowledge in an Ink arrow-key picker (`DeleteSelector.tsx`, plain `useInput` — no extra dep), and on confirm `DELETE /api/v1/repos/:id` against the running server. The @@ -151,7 +152,7 @@ will touch when implemented. Only the **bolded** entries ship in v0. | **`bytebell server start`** | **Spawn `bytebell-server` in foreground.** | **Shipped** | | **`bytebell index `** | **POST `/api/v1/github/index` to local server.** | **Shipped** | | **`bytebell ingest [path]`** | **POST `/api/v1/local/index` for a directory tree.** | **Shipped** | -| **`bytebell ls`** | **Render `/api/v1/repos` as a table.** | **Shipped** | +| **`bytebell ls`** | **Render `/api/v1/repos` as a table or interactive explorer (`-i`). 
v0.** | **Shipped** | | **`bytebell delete`** | **Ink picker over `/api/v1/repos`, then DELETE `/api/v1/repos/:id` (Mongo + Neo4j + jobs).** | **Shipped** | | **`bytebell stats`** | **Render `/api/v1/stats` (totals + per-repo + per-commit token / cost rows).** | **Shipped** | | `bytebell` | Ink dashboard with Repos / Server / Activity / Cost panes ([docs/arch.md:172-184](../../docs/arch.md#L172-L184)) | After `@bb/server` HTTP API + activity feed | diff --git a/packages/cli/src/BranchSelector.tsx b/packages/cli/src/BranchSelector.tsx new file mode 100644 index 0000000..73b4816 --- /dev/null +++ b/packages/cli/src/BranchSelector.tsx @@ -0,0 +1,130 @@ +import { useMemo, useState } from "react"; +import type { ReactElement } from "react"; +import { Box, Text, useApp, useInput } from "ink"; + +export interface BranchSelectorResult { + branch?: string; + typeManually?: boolean; + cancelled?: boolean; +} + +export interface BranchSelectorProps { + branches: string[]; + title?: string; + onDone: (result: BranchSelectorResult) => void; +} + +const MAX_VISIBLE = 12; + +type ItemKind = "branch" | "manual"; + +export function BranchSelector({ branches: rawBranches, title, onDone }: BranchSelectorProps): ReactElement { + const { exit } = useApp(); + const [filter, setFilter] = useState(""); + const [index, setIndex] = useState(0); + + const branches = useMemo(() => { + const items: Array<{ label: string; kind: ItemKind }> = rawBranches.map((b) => ({ + label: b, + kind: "branch", + })); + items.push({ label: "Type manually...", kind: "manual" }); + return items; + }, [rawBranches]); + + const filtered = useMemo(() => { + if (filter.length === 0) { + return branches; + } + const needle = filter.toLowerCase(); + return branches.filter((item) => item.label.toLowerCase().includes(needle)); + }, [branches, filter]); + + const boundedIndex = filtered.length === 0 ? 
0 : Math.min(index, filtered.length - 1); + + useInput((input, key) => { + if (key.escape) { + exit(); + onDone({ cancelled: true }); + return; + } + if (key.return) { + const chosen = filtered[boundedIndex]; + if (!chosen) { + exit(); + onDone({ cancelled: true }); + return; + } + exit(); + if (chosen.kind === "manual") { + onDone({ typeManually: true }); + } else { + onDone({ branch: chosen.label }); + } + return; + } + if (key.upArrow || (input === "k" && filter.length === 0)) { + setIndex(() => (boundedIndex > 0 ? boundedIndex - 1 : Math.max(filtered.length - 1, 0))); + return; + } + if (key.downArrow || (input === "j" && filter.length === 0)) { + setIndex(() => (boundedIndex < filtered.length - 1 ? boundedIndex + 1 : 0)); + return; + } + if (key.backspace || key.delete) { + setFilter((s) => s.slice(0, -1)); + setIndex(0); + return; + } + if (input.length > 0 && !key.ctrl && !key.meta) { + setFilter((s) => s + input); + setIndex(0); + } + }); + + const heading = title ?? "Select a branch"; + const visibleStart = clampWindow(boundedIndex, filtered.length, MAX_VISIBLE); + const visible = filtered.slice(visibleStart, visibleStart + MAX_VISIBLE); + + return ( + + + {heading} + {` (${filtered.length}/${branches.length})`} + + + filter: + {filter.length > 0 ? filter : (type to filter)} + + {filtered.length === 0 ? ( + + No branches match the filter. Backspace to clear. + + ) : ( + visible.map((item, i) => { + const absoluteIndex = visibleStart + i; + const cursor = absoluteIndex === boundedIndex; + const isManual = item.kind === "manual"; + return ( + + {cursor ? 
"▶ " : " "} + {item.label} + + ); + }) + )} + + [type to filter] [↑/↓] move [Enter] choose [Backspace] clear [Esc] cancel + + + ); +} + +function clampWindow(index: number, total: number, size: number): number { + if (total <= size) { + return 0; + } + const halfWindow = Math.floor(size / 2); + const start = Math.max(0, Math.min(index - halfWindow, total - size)); + return start; +} diff --git a/packages/cli/src/IndexCommand.ts b/packages/cli/src/IndexCommand.ts index 26cc202..51fc392 100644 --- a/packages/cli/src/IndexCommand.ts +++ b/packages/cli/src/IndexCommand.ts @@ -3,8 +3,11 @@ import { Config } from "@bb/types"; import { getConfigValue } from "@bb/config"; import { ensureServerRunning, ServerStartTimeoutError } from "./serverSpawn.ts"; import { getJson, HttpClientError, postJson } from "./httpClient.ts"; -import { createProgressBar, createSpinner, error, type ProgressBar } from "./output.ts"; +import { createProgressBar, createSpinner, error, info, list, type ProgressBar } from "./output.ts"; import { startLogTailer, type LogTailer } from "./logTailer.ts"; +import { promptForToken } from "./pullPrompts.ts"; +import { promptInitialBranch, promptFullBranchSelector } from "./branchPrompts.ts"; +import { parseGithubRepo } from "@bb/ingest-github"; interface IndexResponse { knowledgeId: string; @@ -52,12 +55,16 @@ async function runIndex( if (options.verbose === true) { tailer = await startLogTailer("server"); } - const body: Record = { repoUrl: gitUrl }; - if (options.branch !== undefined) { - body["branch"] = options.branch; + + const { branch: resolvedBranch, token: activeToken } = await probeRepo(gitUrl, options.branch, options.token); + if (resolvedBranch === null) { + // User cancelled during token prompt + return; } - if (options.token !== undefined) { - body["gitToken"] = options.token; + + const body: Record = { repoUrl: gitUrl, branch: resolvedBranch }; + if (activeToken !== undefined) { + body["gitToken"] = activeToken; } const response = await 
postJson("/api/v1/github/index", body); await pollJobStatus(response.knowledgeId, response.jobId); @@ -126,6 +133,104 @@ async function pollJobStatus(knowledgeId: string, jobId: string): Promise } } +interface ProbeResponse { + status: "ok" | "not_found" | "unauthorized" | "rate_limited" | "error" | "branch_not_found"; + defaultBranch?: string; + branches?: string[]; + message?: string; +} + +async function probeRepo( + gitUrl: string, + suppliedBranch?: string, + suppliedToken?: string, +): Promise<{ branch: string | null; token?: string }> { + let token = suppliedToken; + const parsed = parseGithubRepo(gitUrl); + const repoLabel = parsed ? `${parsed.owner}/${parsed.repo}` : gitUrl; + + // 1. Initial probe to find default branch and check access + const callProbe = async (t?: string) => { + try { + return await postJson("/api/v1/github/probe", { repoUrl: gitUrl, gitToken: t }); + } catch (cause) { + if (cause instanceof HttpClientError && (cause.status === 401 || cause.status === 404)) { + return (cause.body as ProbeResponse) || { status: cause.status === 404 ? "not_found" : "unauthorized" }; + } + throw cause; + } + }; + + let probe = await callProbe(token); + + // 2. Handle private repo if needed + if (probe.status === "not_found" || probe.status === "unauthorized") { + const promptMessage = + probe.status === "unauthorized" + ? "The previous token was rejected. Try a different PAT." + : "This repo looks private. Paste a GitHub PAT with `repo` scope."; + const tokenResult = await promptForToken(repoLabel, promptMessage); + if (tokenResult === null) { + info("Cancelled."); + return { branch: null }; + } + token = tokenResult; + probe = await callProbe(token); + } + + if (probe.status !== "ok") { + error(probe.message ?? "Failed to probe repository."); + return { branch: null }; + } + + // 3. If a branch was already supplied (via flag or URL), just verify it + const branchFromUrl = parsed?.branch; + const initialBranch = suppliedBranch ?? 
branchFromUrl; + if (initialBranch !== undefined) { + if (probe.branches && !probe.branches.includes(initialBranch)) { + error(`Branch '${initialBranch}' not found.`); + if (probe.branches.length > 0) { + list("Available branches:", probe.branches.slice(0, 20)); + } + return { branch: null }; + } + const res: { branch: string | null; token?: string } = { branch: initialBranch }; + if (token) { + res.token = token; + } + return res; + } + + // 4. Interactive menu flow + const defaultBranch = probe.defaultBranch ?? "main"; + const choice = await promptInitialBranch(defaultBranch); + if (choice === null) { + info("Cancelled."); + return { branch: null }; + } + + if (choice === "default") { + const res: { branch: string | null; token?: string } = { branch: defaultBranch }; + if (token) { + res.token = token; + } + return res; + } + + // User selected "Other branch..." + const fullSelection = await promptFullBranchSelector(probe.branches ?? []); + if (fullSelection === null) { + info("Cancelled."); + return { branch: null }; + } + + const res: { branch: string | null; token?: string } = { branch: fullSelection.branch }; + if (token) { + res.token = token; + } + return res; +} + function handleError(cause: unknown): void { if (cause instanceof ServerStartTimeoutError) { error(cause.message); diff --git a/packages/cli/src/InitialBranchSelector.tsx b/packages/cli/src/InitialBranchSelector.tsx new file mode 100644 index 0000000..70b76c3 --- /dev/null +++ b/packages/cli/src/InitialBranchSelector.tsx @@ -0,0 +1,67 @@ +import { useState } from "react"; +import type { ReactElement } from "react"; +import { Box, Text, useApp, useInput } from "ink"; + +export interface InitialBranchResult { + choice?: "default" | "other"; + cancelled?: boolean; +} + +export interface InitialBranchProps { + defaultBranch: string; + onDone: (result: InitialBranchResult) => void; +} + +export function InitialBranchSelector({ defaultBranch, onDone }: InitialBranchProps): ReactElement { + const { exit 
} = useApp(); + const [index, setIndex] = useState(0); + + const items = [ + { label: `Default branch (${defaultBranch})`, value: "default" as const }, + { label: "Other branch...", value: "other" as const }, + ]; + + useInput((_input, key) => { + if (key.escape) { + exit(); + onDone({ cancelled: true }); + return; + } + if (key.return) { + exit(); + const choice = items[index]?.value; + if (choice) { + onDone({ choice }); + } else { + onDone({ cancelled: true }); + } + return; + } + if (key.upArrow) { + setIndex(0); + } + if (key.downArrow) { + setIndex(1); + } + }); + + return ( + + + Which branch would you like to index? + + {items.map((item, i) => { + const cursor = i === index; + return ( + + {cursor ? "▶ " : " "} + {item.label} + + ); + })} + + [↑/↓] move [Enter] choose [Esc] cancel + + + ); +} diff --git a/packages/cli/src/LsCommand.ts b/packages/cli/src/LsCommand.ts index c0f103e..c58f9ee 100644 --- a/packages/cli/src/LsCommand.ts +++ b/packages/cli/src/LsCommand.ts @@ -4,17 +4,8 @@ import { getConfigValue } from "@bb/config"; import { ensureServerRunning, ServerStartTimeoutError } from "./serverSpawn.ts"; import { getJson, HttpClientError } from "./httpClient.ts"; import { createSpinner, error } from "./output.ts"; - -interface RepoEntry { - knowledgeId: string; - source: - | { kind: "github"; repoUrl: string; branch?: string; commitId?: string; commitHashes?: string[] } - | { kind: "local"; sourcePath: string }; - state: string; - createdAt: string; - updatedAt: string; - fileCount: number; -} +import { promptLsInteractive } from "./lsInteractivePrompt.ts"; +import type { RepoEntry } from "./LsInteractive.tsx"; interface ListResponse { repos: RepoEntry[]; @@ -22,11 +13,14 @@ interface ListResponse { export function buildLsCommand(): Command { const cmd = new Command("ls"); - cmd.description("List indexed knowledge entries.").action(runLs); + cmd + .description("List indexed knowledge entries.") + .option("-i, --interactive", "Use interactive selector to 
browse entries.") + .action(runLs); return cmd; } -async function runLs(): Promise { +async function runLs(options: { interactive?: boolean }): Promise { try { let ctx: Awaited>; if ( @@ -47,6 +41,12 @@ async function runLs(): Promise { ); return; } + + if (options.interactive === true) { + await promptLsInteractive(repos); + return; + } + renderTable(repos); process.stdout.write(`\n${repos.length} ${repos.length === 1 ? "entry" : "entries"}.\n`); } catch (cause: unknown) { diff --git a/packages/cli/src/LsInteractive.tsx b/packages/cli/src/LsInteractive.tsx new file mode 100644 index 0000000..886328d --- /dev/null +++ b/packages/cli/src/LsInteractive.tsx @@ -0,0 +1,289 @@ +import { useState, useMemo } from "react"; +import type { ReactElement } from "react"; +import { Box, Text, useApp, useInput } from "ink"; + +export interface RepoEntry { + knowledgeId: string; + source: + | { kind: "github"; repoUrl: string; branch?: string; commitId?: string; commitHashes?: string[] } + | { kind: "local"; sourcePath: string }; + state: string; + createdAt: string; + updatedAt: string; + fileCount: number; +} + +export interface LsInteractiveProps { + repos: RepoEntry[]; + onDone: () => void; +} + +type ViewMode = "repos" | "branches" | "details"; + +export function LsInteractive({ repos, onDone }: LsInteractiveProps): ReactElement { + const { exit } = useApp(); + const [mode, setMode] = useState("repos"); + const [repoIndex, setRepoIndex] = useState(0); + const [branchIndex, setBranchIndex] = useState(0); + const [selectedRepoUrl, setSelectedRepoUrl] = useState(null); + const [selectedEntry, setSelectedEntry] = useState(null); + + // Group repos by their source URL or Path + const groupedRepos = useMemo(() => { + const groups: Record = {}; + for (const r of repos) { + const key = r.source.kind === "github" ? 
r.source.repoUrl : r.source.sourcePath; + if (!groups[key]) { + groups[key] = []; + } + groups[key].push(r); + } + return Object.entries(groups).map(([url, entries]) => { + const firstEntry = entries[0]; + if (!firstEntry) { + throw new Error("empty group"); + } + return { + url, + kind: firstEntry.source.kind, + entries, + }; + }); + }, [repos]); + + const currentBranches = useMemo(() => { + if (!selectedRepoUrl) { + return []; + } + const group = groupedRepos.find((g) => g.url === selectedRepoUrl); + return group ? group.entries : []; + }, [selectedRepoUrl, groupedRepos]); + + const handleBack = () => { + if (mode === "details") { + setMode("branches"); + } else if (mode === "branches") { + setMode("repos"); + setSelectedRepoUrl(null); + } else { + exit(); + onDone(); + } + }; + + useInput((input, key) => { + if (key.escape || (input === "q" && mode === "repos")) { + exit(); + onDone(); + return; + } + + if (key.backspace || input === "b" || key.leftArrow) { + handleBack(); + return; + } + + if (mode === "repos") { + if (key.upArrow || input === "k") { + setRepoIndex((i) => (i > 0 ? i - 1 : groupedRepos.length - 1)); + } else if (key.downArrow || input === "j") { + setRepoIndex((i) => (i < groupedRepos.length - 1 ? i + 1 : 0)); + } else if (key.return || key.rightArrow || input === "l") { + const selected = groupedRepos[repoIndex]; + if (selected) { + setSelectedRepoUrl(selected.url); + setBranchIndex(0); + setMode("branches"); + } + } + } else if (mode === "branches") { + if (key.upArrow || input === "k") { + setBranchIndex((i) => (i > 0 ? i - 1 : currentBranches.length - 1)); + } else if (key.downArrow || input === "j") { + setBranchIndex((i) => (i < currentBranches.length - 1 ? 
i + 1 : 0)); + } else if (key.return || key.rightArrow || input === "l") { + const selected = currentBranches[branchIndex]; + if (selected) { + setSelectedEntry(selected); + setMode("details"); + } + } + } + }); + + const renderRepos = () => ( + + + + Indexed Repositories ({groupedRepos.length}) + + + {groupedRepos.map((group, i) => ( + + {i === repoIndex ? "▶ " : " "} + + {group.kind === "github" ? parseGithubSlug(group.url) : group.url} + + ({group.entries.length} entries) + + ))} + + [↑/↓] move [Enter/→] branches [q/Esc] exit + + + ); + + const renderBranches = () => ( + + + + Repos /{" "} + + + {selectedRepoUrl + ? currentBranches[0]?.source.kind === "github" + ? parseGithubSlug(selectedRepoUrl) + : selectedRepoUrl + : ""} + + + {currentBranches.map((entry, i) => ( + + {i === branchIndex ? "▶ " : " "} + + {entry.source.kind === "github" ? (entry.source.branch ?? "default") : "local"} + + + {entry.state.padEnd(10)} + {entry.knowledgeId.slice(0, 8)}… + + + ))} + + [↑/↓] move [Enter/→] details [Esc/←] back + + + ); + + const renderDetails = () => { + if (!selectedEntry) { + return null; + } + const s = selectedEntry.source; + return ( + + + + Repos / {s.kind === "github" ? parseGithubSlug(s.repoUrl) : s.sourcePath} /{" "} + + + {s.kind === "github" ? (s.branch ?? "default") : "local"} + + + + + + + + + + + {s.kind === "github" && ( + <> + + + GitHub Details + + + + + + + + + Indexed Commits ({s.commitHashes?.length ?? 0}) + + + {(s.commitHashes ?? []).map((h, i) => ( + + {i + 1}. + {h.slice(0, 8)} + {h === s.commitId && (current head)} + + ))} + {(!s.commitHashes || s.commitHashes.length === 0) && ( + + No commit history recorded. 
+ + )} + + )} + + {s.kind === "local" && ( + <> + + + Local Details + + + + + )} + + + + [Esc/←/Backspace] back + + + ); + }; + + return ( + + {mode === "repos" && renderRepos()} + {mode === "branches" && renderBranches()} + {mode === "details" && renderDetails()} + + ); +} + +function DetailRow({ label, value, color }: { label: string; value: string; color?: string }) { + return ( + + + {label}: + + {value} + + ); +} + +function getStateColor(state: string): string { + switch (state) { + case "PROCESSED": + return "green"; + case "PROCESSING": + return "yellow"; + case "FAILED": + return "red"; + default: + return "white"; + } +} + +function parseGithubSlug(repoUrl: string): string { + try { + const u = new URL(repoUrl); + return u.pathname.replace(/^\/+/u, "").replace(/\.git$/u, ""); + } catch { + return repoUrl; + } +} + +function formatDate(iso: string): string { + const d = new Date(iso); + if (Number.isNaN(d.getTime())) { + return iso; + } + return d.toLocaleString(); +} diff --git a/packages/cli/src/ManualBranchPrompt.tsx b/packages/cli/src/ManualBranchPrompt.tsx new file mode 100644 index 0000000..8d3836c --- /dev/null +++ b/packages/cli/src/ManualBranchPrompt.tsx @@ -0,0 +1,42 @@ +import { useState } from "react"; +import type { ReactElement } from "react"; +import { Box, Text, useApp, useInput } from "ink"; +import { Field } from "./Field.tsx"; + +export interface ManualBranchPromptResult { + branch?: string; + cancelled?: boolean; +} + +export interface ManualBranchPromptProps { + onDone: (result: ManualBranchPromptResult) => void; +} + +export function ManualBranchPrompt({ onDone }: ManualBranchPromptProps): ReactElement { + const { exit } = useApp(); + const [value, setValue] = useState(""); + + useInput((_input, key) => { + if (key.escape) { + exit(); + onDone({ cancelled: true }); + return; + } + if (key.return && value.length > 0) { + exit(); + onDone({ branch: value }); + } + }); + + return ( + + + Type branch name manually + + + + [Enter] submit 
[Esc] cancel + + + ); +} diff --git a/packages/cli/src/branchPrompts.ts b/packages/cli/src/branchPrompts.ts new file mode 100644 index 0000000..4e182a4 --- /dev/null +++ b/packages/cli/src/branchPrompts.ts @@ -0,0 +1,71 @@ +import React from "react"; +import { render } from "ink"; +import { InitialBranchSelector, type InitialBranchResult } from "./InitialBranchSelector.tsx"; +import { BranchSelector, type BranchSelectorResult } from "./BranchSelector.tsx"; +import { ManualBranchPrompt, type ManualBranchPromptResult } from "./ManualBranchPrompt.tsx"; + +export async function promptInitialBranch(defaultBranch: string): Promise<"default" | "other" | null> { + return new Promise<"default" | "other" | null>((resolve) => { + const onDone = (result: InitialBranchResult): void => { + if (result.choice !== undefined) { + resolve(result.choice); + return; + } + resolve(null); + }; + const { waitUntilExit } = render( + React.createElement(InitialBranchSelector, { + defaultBranch, + onDone, + }), + ); + waitUntilExit().catch(() => undefined); + }); +} + +export async function promptFullBranchSelector( + branches: string[], +): Promise<{ branch: string; manual: boolean } | null> { + const result = await new Promise((resolve) => { + const onDone = (res: BranchSelectorResult): void => { + resolve(res); + }; + const { waitUntilExit } = render( + React.createElement(BranchSelector, { + branches, + onDone, + }), + ); + waitUntilExit().catch(() => undefined); + }); + + if (result.cancelled) { + return null; + } + if (result.typeManually) { + const manual = await promptManualBranch(); + return manual ? 
{ branch: manual, manual: true } : null; + } + if (result.branch) { + return { branch: result.branch, manual: false }; + } + return null; +} + +async function promptManualBranch(): Promise { + return new Promise((resolve) => { + const onDone = (result: ManualBranchPromptResult): void => { + if (result.branch !== undefined) { + resolve(result.branch); + return; + } + resolve(null); + }; + const { waitUntilExit } = render( + React.createElement(ManualBranchPrompt, { + onDone, + }), + ); + waitUntilExit().catch(() => undefined); + }); +} diff --git a/packages/cli/src/httpClient.ts b/packages/cli/src/httpClient.ts index 4bdf8a1..b2bb867 100644 --- a/packages/cli/src/httpClient.ts +++ b/packages/cli/src/httpClient.ts @@ -12,12 +12,14 @@ export function baseUrl(): string { export class HttpClientError extends Error { override readonly name = "HttpClientError"; readonly status: number | undefined; + readonly body: unknown | undefined; - constructor(message: string, status?: number) { + constructor(message: string, status?: number, body?: unknown) { super(message); if (status !== undefined) { this.status = status; } + this.body = body; } } @@ -74,17 +76,23 @@ async function parseResponse(res: Response): Promise { if (!res.ok) { const text = await res.text().catch(() => ""); let message = `HTTP ${res.status}`; + let body: unknown = undefined; try { - const parsed = JSON.parse(text) as { error?: unknown }; - if (typeof parsed.error === "string") { - message = parsed.error; + const parsed = JSON.parse(text); + body = parsed; + if (typeof parsed === "object" && parsed !== null) { + if ("error" in parsed && typeof parsed.error === "string") { + message = parsed.error; + } else if ("message" in parsed && typeof parsed.message === "string") { + message = parsed.message; + } } } catch { if (text.length > 0) { message = text.slice(0, 500); } } - throw new HttpClientError(message, res.status); + throw new HttpClientError(message, res.status, body); } return (await res.json()) as T; 
} diff --git a/packages/cli/src/lsInteractivePrompt.ts b/packages/cli/src/lsInteractivePrompt.ts new file mode 100644 index 0000000..64d5dad --- /dev/null +++ b/packages/cli/src/lsInteractivePrompt.ts @@ -0,0 +1,23 @@ +import React from "react"; +import { render } from "ink"; +import { LsInteractive, type RepoEntry } from "./LsInteractive.tsx"; + +/** + * Renders the interactive repository list and waits for the user to exit. + */ +export async function promptLsInteractive(repos: RepoEntry[]): Promise { + return new Promise((resolve) => { + const onDone = (): void => { + resolve(); + }; + + const { waitUntilExit } = render( + React.createElement(LsInteractive, { + repos, + onDone, + }), + ); + + waitUntilExit().catch(() => resolve()); + }); +} diff --git a/packages/cli/src/pullPrompts.ts b/packages/cli/src/pullPrompts.ts index 039b13d..aa7e6b5 100644 --- a/packages/cli/src/pullPrompts.ts +++ b/packages/cli/src/pullPrompts.ts @@ -63,7 +63,7 @@ export async function resolveCommit( return null; } -async function promptForToken(repoLabel: string, message: string): Promise { +export async function promptForToken(repoLabel: string, message: string): Promise { return new Promise((resolve) => { const onDone = (result: TokenPromptResult): void => { if (result.token !== undefined && result.token.length > 0) { diff --git a/packages/ingest-github/src/githubApi.ts b/packages/ingest-github/src/githubCommit.ts similarity index 66% rename from packages/ingest-github/src/githubApi.ts rename to packages/ingest-github/src/githubCommit.ts index 7c88b8c..8b2d789 100644 --- a/packages/ingest-github/src/githubApi.ts +++ b/packages/ingest-github/src/githubCommit.ts @@ -1,77 +1,10 @@ /** - * Minimal GitHub REST helpers used by the pull flow. + * Commit fetching from GitHub REST API. * - * Public repo only models GitHub (no Bitbucket), so this stays small — - * a URL parser and a single branch-head lookup. 
Both are best-effort: - * `null` on parse failure or non-2xx so callers can fall back without - * try/catch noise. + * SPDX-License-Identifier: AGPL-3.0-only WITH non-commercial-clause */ -const USER_AGENT = "ByteBell"; - -export interface ParsedRepo { - owner: string; - repo: string; -} - -/** Parses `https://github.com/{owner}/{repo}(.git)?(/...)?` → `{owner, repo}`. */ -export function parseGithubRepo(repoUrl: string): ParsedRepo | null { - if (!repoUrl) { - return null; - } - try { - const url = new URL(repoUrl); - if (!url.hostname.endsWith("github.com")) { - return null; - } - const segments = url.pathname.split("/").filter((s) => s.length > 0); - if (segments.length < 2) { - return null; - } - const owner = segments[0]; - const repoRaw = segments[1]; - if (owner === undefined || repoRaw === undefined) { - return null; - } - return { owner, repo: repoRaw.replace(/\.git$/u, "") }; - } catch { - return null; - } -} - -/** - * Resolves the head SHA of `branch` on `repoUrl`. Returns `null` for any - * non-2xx, parse failure, or unparsable URL — callers treat `null` as - * "couldn't anchor, proceed without it". - */ -export async function fetchLatestCommitHash( - repoUrl: string, - branch: string, - gitToken?: string, -): Promise { - const parsed = parseGithubRepo(repoUrl); - if (parsed === null) { - return null; - } - - const headers: Record = { - Accept: "application/vnd.github+json", - "User-Agent": USER_AGENT, - "X-GitHub-Api-Version": "2022-11-28", - }; - if (gitToken !== undefined && gitToken.length > 0) { - headers["Authorization"] = `Bearer ${gitToken}`; - } - - const url = `https://api.github.com/repos/${parsed.owner}/${parsed.repo}/branches/${encodeURIComponent(branch)}`; - const response = await fetch(url, { headers }); - if (!response.ok) { - return null; - } - const body = (await response.json()) as { commit?: { sha?: unknown } }; - const sha = body.commit?.sha; - return typeof sha === "string" && sha.length > 0 ? 
sha : null; -} +import { parseGithubRepo, USER_AGENT } from "./githubUrl.ts"; export interface CommitEntry { hash: string; diff --git a/packages/ingest-github/src/githubRepo.ts b/packages/ingest-github/src/githubRepo.ts new file mode 100644 index 0000000..7d3c266 --- /dev/null +++ b/packages/ingest-github/src/githubRepo.ts @@ -0,0 +1,139 @@ +/** + * Repository and branch information fetching from GitHub REST API. + * + * SPDX-License-Identifier: AGPL-3.0-only WITH non-commercial-clause + */ + +import { parseGithubRepo, USER_AGENT } from "./githubUrl.ts"; + +/** + * Resolves the head SHA of `branch` on `repoUrl`. Returns `null` for any + * non-2xx, parse failure, or unparsable URL — callers treat `null` as + * "couldn't anchor, proceed without it". + */ +export async function fetchLatestCommitHash( + repoUrl: string, + branch: string, + gitToken?: string, +): Promise { + const parsed = parseGithubRepo(repoUrl); + if (parsed === null) { + return null; + } + + const headers: Record = { + Accept: "application/vnd.github+json", + "User-Agent": USER_AGENT, + "X-GitHub-Api-Version": "2022-11-28", + }; + if (gitToken !== undefined && gitToken.length > 0) { + headers["Authorization"] = `Bearer ${gitToken}`; + } + + const url = `https://api.github.com/repos/${parsed.owner}/${parsed.repo}/branches/${encodeURIComponent(branch)}`; + const response = await fetch(url, { headers }); + if (!response.ok) { + return null; + } + const body = (await response.json()) as { commit?: { sha?: unknown } }; + const sha = body.commit?.sha; + return typeof sha === "string" && sha.length > 0 ? sha : null; +} + +export type DefaultBranchResult = + | { status: "ok"; branch: string } + | { status: "not_found" } + | { status: "unauthorized" } + | { status: "rate_limited" } + | { status: "error"; message: string }; + +/** + * Fetches the default branch name of `repoUrl`. Returns a detailed result + * so callers can distinguish between private repos, rate limits, and errors. 
+ */ +export async function fetchDefaultBranch(repoUrl: string, gitToken?: string): Promise { + const parsed = parseGithubRepo(repoUrl); + if (parsed === null) { + return { status: "error", message: `unparseable github url: ${repoUrl}` }; + } + + const headers: Record = { + Accept: "application/vnd.github+json", + "User-Agent": USER_AGENT, + "X-GitHub-Api-Version": "2022-11-28", + }; + if (gitToken !== undefined && gitToken.length > 0) { + headers["Authorization"] = `Bearer ${gitToken}`; + } + + const url = `https://api.github.com/repos/${parsed.owner}/${parsed.repo}`; + let response: Response; + try { + response = await fetch(url, { headers }); + } catch (cause: unknown) { + const msg = cause instanceof Error ? cause.message : String(cause); + return { status: "error", message: `github fetch failed: ${msg}` }; + } + + if (response.status === 404) { + return { status: "not_found" }; + } + if (response.status === 401) { + return { status: "unauthorized" }; + } + if (response.status === 403 && response.headers.get("x-ratelimit-remaining") === "0") { + return { status: "rate_limited" }; + } + if (!response.ok) { + const body = await response.text().catch(() => ""); + return { status: "error", message: `github ${response.status}: ${body.slice(0, 200)}` }; + } + + const body = (await response.json()) as { default_branch?: unknown }; + const branch = body.default_branch; + if (typeof branch === "string" && branch.length > 0) { + return { status: "ok", branch }; + } + return { status: "error", message: "github API returned empty default_branch" }; +} + +/** + * Fetches the list of branches for `repoUrl`. 
+ */ +export async function fetchBranches( + repoUrl: string, + gitToken?: string, + limit = 100, +): Promise<{ status: "ok"; branches: string[] } | { status: "error"; message: string }> { + const parsed = parseGithubRepo(repoUrl); + if (parsed === null) { + return { status: "error", message: `unparseable github url: ${repoUrl}` }; + } + + const headers: Record = { + Accept: "application/vnd.github+json", + "User-Agent": USER_AGENT, + "X-GitHub-Api-Version": "2022-11-28", + }; + if (gitToken !== undefined && gitToken.length > 0) { + headers["Authorization"] = `Bearer ${gitToken}`; + } + + const url = `https://api.github.com/repos/${parsed.owner}/${parsed.repo}/branches?per_page=${limit}`; + let response: Response; + try { + response = await fetch(url, { headers }); + } catch (cause: unknown) { + const msg = cause instanceof Error ? cause.message : String(cause); + return { status: "error", message: `github fetch failed: ${msg}` }; + } + + if (!response.ok) { + const body = await response.text().catch(() => ""); + return { status: "error", message: `github ${response.status}: ${body.slice(0, 200)}` }; + } + + const body = (await response.json()) as Array<{ name?: unknown }>; + const branches = body.map((b) => b.name).filter((name): name is string => typeof name === "string"); + return { status: "ok", branches }; +} diff --git a/packages/ingest-github/src/githubUrl.ts b/packages/ingest-github/src/githubUrl.ts new file mode 100644 index 0000000..e080101 --- /dev/null +++ b/packages/ingest-github/src/githubUrl.ts @@ -0,0 +1,53 @@ +/** + * Minimal GitHub REST helpers used by the pull flow. + * + * Public repo only models GitHub (no Bitbucket), so this stays small — + * a URL parser and a single branch-head lookup. Both are best-effort: + * `null` on parse failure or non-2xx so callers can fall back without + * try/catch noise. 
+ * + * SPDX-License-Identifier: AGPL-3.0-only WITH non-commercial-clause + */ + +export const USER_AGENT = "ByteBell"; + +export interface ParsedRepo { + owner: string; + repo: string; + branch?: string; +} + +/** + * Parses `https://github.com/{owner}/{repo}(/tree/{branch})?` → `{owner, repo, branch?}`. + */ +export function parseGithubRepo(repoUrl: string): ParsedRepo | null { + if (!repoUrl) { + return null; + } + try { + const url = new URL(repoUrl); + if (!url.hostname.endsWith("github.com")) { + return null; + } + const segments = url.pathname.split("/").filter((s) => s.length > 0); + if (segments.length < 2) { + return null; + } + const owner = segments[0]; + const repoRaw = segments[1]; + if (owner === undefined || repoRaw === undefined) { + return null; + } + const repo = repoRaw.replace(/\.git$/u, ""); + const out: ParsedRepo = { owner, repo }; + + // Support https://github.com/owner/repo/tree/branch-name + if (segments[2] === "tree" && segments.length > 3) { + out.branch = segments.slice(3).join("/"); + } + + return out; + } catch { + return null; + } +} diff --git a/packages/ingest-github/src/index.ts b/packages/ingest-github/src/index.ts index 49afde3..84e25da 100644 --- a/packages/ingest-github/src/index.ts +++ b/packages/ingest-github/src/index.ts @@ -65,5 +65,11 @@ export type { SourceFactoryResult, } from "./types/pipeline.ts"; export type { CondensedFileAnalysis } from "./types/condensed-file-analysis.ts"; -export { fetchLatestCommitHash, fetchRecentCommits, parseGithubRepo } from "./githubApi.ts"; -export type { CommitEntry, FetchCommitsResult, ParsedRepo } from "./githubApi.ts"; +export { + fetchLatestCommitHash, + fetchRecentCommits, + fetchDefaultBranch, + fetchBranches, + parseGithubRepo, +} from "./githubApi.ts"; +export type { CommitEntry, FetchCommitsResult, ParsedRepo, DefaultBranchResult } from "./githubApi.ts"; diff --git a/packages/ingest-github/src/pipeline/branch.ts b/packages/ingest-github/src/pipeline/branch.ts index 
213a5e5..dedc693 100644 --- a/packages/ingest-github/src/pipeline/branch.ts +++ b/packages/ingest-github/src/pipeline/branch.ts @@ -1,15 +1,31 @@ import type { GithubIndexPayload } from "@bb/types"; import { IngestError } from "@bb/errors"; +import { fetchDefaultBranch } from "../githubRepo.ts"; const DEFAULT_BRANCH = "main"; -export function resolveBranch(knowledgeId: string, payload: GithubIndexPayload): string { +export async function resolveBranch( + knowledgeId: string, + payload: GithubIndexPayload, + gitToken?: string, +): Promise { const branch = payload.branch; - if (branch === undefined || branch.length === 0) { - return DEFAULT_BRANCH; + if (branch !== undefined && branch.length > 0) { + if (!/^[\w./-]+$/u.test(branch)) { + throw new IngestError(knowledgeId, `invalid branch name: ${branch}`); + } + return branch; } - if (!/^[\w./-]+$/u.test(branch)) { - throw new IngestError(knowledgeId, `invalid branch name: ${branch}`); + + // No branch provided -> attempt to fetch the default branch from GitHub. + try { + const result = await fetchDefaultBranch(payload.repoUrl, gitToken); + if (result.status === "ok") { + return result.branch; + } + } catch { + // Best-effort; fall back to the hardcoded default. 
} - return branch; + + return DEFAULT_BRANCH; } diff --git a/packages/ingest-github/src/pipeline/run.ts b/packages/ingest-github/src/pipeline/run.ts index 5d76146..5987432 100644 --- a/packages/ingest-github/src/pipeline/run.ts +++ b/packages/ingest-github/src/pipeline/run.ts @@ -1,7 +1,7 @@ import { Config, KnowledgeState, type GithubIndexPayload, type LocalIngestPayload } from "@bb/types"; import { getConfigValue } from "@bb/config"; -import { recordProcessingStats, setKnowledgeCommit, setKnowledgeState } from "@bb/mongo"; -import { setKnowledgeStateInGraph } from "@bb/neo4j"; +import { recordProcessingStats, setKnowledgeCommit, setKnowledgeState, setKnowledgeBranch } from "@bb/mongo"; +import { setKnowledgeStateInGraph, setKnowledgeBranchInGraph } from "@bb/neo4j"; import { estimateCostFromBreakdown } from "@bb/llm"; import { IngestError } from "@bb/errors"; import { logger } from "@bb/logger"; @@ -58,7 +58,9 @@ async function runGithub( await transitionState(knowledgeId, KnowledgeState.Processing); try { throwIfCancelled(knowledgeId); - const branch = resolveBranch(knowledgeId, payload); + const branch = await resolveBranch(knowledgeId, payload, payload.gitToken); + await setKnowledgeBranch(knowledgeId, branch); + await setKnowledgeBranchInGraph(knowledgeId, branch).catch(() => undefined); let source: SourceReader; let archiveSink: ArchiveSink | undefined; diff --git a/packages/mongo/src/index.ts b/packages/mongo/src/index.ts index 4303968..9f7780a 100644 --- a/packages/mongo/src/index.ts +++ b/packages/mongo/src/index.ts @@ -5,6 +5,7 @@ export { getKnowledge, setKnowledgeCommit, setKnowledgeState, + setKnowledgeBranch, updateKnowledgeProgress, upsertKnowledge, listKnowledge, diff --git a/packages/mongo/src/knowledge.ts b/packages/mongo/src/knowledge.ts index c1bfb15..c91f5e3 100644 --- a/packages/mongo/src/knowledge.ts +++ b/packages/mongo/src/knowledge.ts @@ -41,6 +41,18 @@ export async function setKnowledgeCommit(knowledgeId: string, commitHash: string } } 
+/** + * Updates the branch name of a GitHub knowledge entry. + */ +export async function setKnowledgeBranch(knowledgeId: string, branch: string): Promise { + const result = await _getDb() + .collection(Collections.Knowledge) + .updateOne({ knowledgeId }, { $set: { "source.branch": branch, updatedAt: new Date() } }); + if (result.matchedCount === 0) { + throw new KnowledgeNotFoundError(knowledgeId); + } +} + export async function updateKnowledgeProgress( knowledgeId: string, processedFiles: number, diff --git a/packages/neo4j/src/index.ts b/packages/neo4j/src/index.ts index e4e2d54..03b51c0 100644 --- a/packages/neo4j/src/index.ts +++ b/packages/neo4j/src/index.ts @@ -5,7 +5,12 @@ export type { PingResult } from "./client.ts"; export { ensureKnowledgeIndexes } from "./indexes.ts"; export { ensureFlatFolderIndexes } from "./flatFolderIndexes.ts"; -export { upsertKnowledgeNode, setKnowledgeStateInGraph, deleteKnowledgeGraph } from "./knowledge.ts"; +export { + upsertKnowledgeNode, + setKnowledgeStateInGraph, + setKnowledgeBranchInGraph, + deleteKnowledgeGraph, +} from "./knowledge.ts"; export { upsertFileNode, deleteFileNodes } from "./files.ts"; export type { UpsertFileNodeInput } from "./files.ts"; diff --git a/packages/neo4j/src/knowledge.ts b/packages/neo4j/src/knowledge.ts index fcb8043..c155f17 100644 --- a/packages/neo4j/src/knowledge.ts +++ b/packages/neo4j/src/knowledge.ts @@ -18,6 +18,11 @@ MATCH (k:Knowledge {knowledgeId: $knowledgeId}) SET k.state = $state, k.updatedAt = $updatedAt `; +const SET_BRANCH = ` +MATCH (k:Knowledge {knowledgeId: $knowledgeId}) +SET k.branch = $branch, k.updatedAt = $updatedAt +`; + const DELETE_FILES_BY_KNOWLEDGE = ` MATCH (f:File {knowledgeId: $knowledgeId}) DETACH DELETE f @@ -74,6 +79,14 @@ export async function setKnowledgeStateInGraph(knowledgeId: string, state: Knowl }); } +export async function setKnowledgeBranchInGraph(knowledgeId: string, branch: string): Promise { + await _runCypher(SET_BRANCH, { + knowledgeId, + 
branch, + updatedAt: new Date().toISOString(), + }); +} + export async function deleteKnowledgeGraph(knowledgeId: string): Promise { await _runCypher(DELETE_FILES_BY_KNOWLEDGE, { knowledgeId }); await _runCypher(DELETE_ORPHAN_FILES); diff --git a/packages/server/src/githubProbeRoute.ts b/packages/server/src/githubProbeRoute.ts new file mode 100644 index 0000000..fb1f596 --- /dev/null +++ b/packages/server/src/githubProbeRoute.ts @@ -0,0 +1,60 @@ +import type { Request, Response, Router } from "express"; +import express from "express"; +import { fetchDefaultBranch, fetchBranches } from "@bb/ingest-github"; + +interface ProbeBody { + repoUrl?: unknown; + gitToken?: unknown; + branch?: unknown; +} + +export function buildGithubProbeRoute(): Router { + const router = express.Router(); + router.post("/api/v1/github/probe", async (req: Request, res: Response) => { + const body = req.body as ProbeBody; + if (typeof body.repoUrl !== "string" || body.repoUrl.length === 0) { + res.status(400).json({ error: "repoUrl required" }); + return; + } + const repoUrl = body.repoUrl; + const gitToken = typeof body.gitToken === "string" && body.gitToken.length > 0 ? body.gitToken : undefined; + const targetBranch = typeof body.branch === "string" && body.branch.length > 0 ? body.branch : undefined; + + const result = await fetchDefaultBranch(repoUrl, gitToken); + switch (result.status) { + case "ok": { + const defaultBranch = result.branch; + const branchesResult = await fetchBranches(repoUrl, gitToken); + const branches = branchesResult.status === "ok" ? branchesResult.branches : []; + + if (targetBranch !== undefined && !branches.includes(targetBranch)) { + const suggestions = branches + .filter((b: string) => b.toLowerCase().includes(targetBranch.toLowerCase())) + .slice(0, 10); + res.status(404).json({ + status: "branch_not_found", + message: `Branch '${targetBranch}' not found.`, + branches: suggestions.length > 0 ? 
suggestions : branches.slice(0, 20), + }); + return; + } + + res.status(200).json({ status: "ok", defaultBranch, branches }); + break; + } + case "not_found": + res.status(404).json({ status: "not_found", message: "Repository not found or private." }); + break; + case "unauthorized": + res.status(401).json({ status: "unauthorized", message: "GitHub token rejected." }); + break; + case "rate_limited": + res.status(429).json({ status: "rate_limited", message: "GitHub rate limit reached." }); + break; + case "error": + res.status(502).json({ status: "error", message: result.message }); + break; + } + }); + return router; +} diff --git a/packages/server/src/routes.ts b/packages/server/src/routes.ts index 7ddf100..1738790 100644 --- a/packages/server/src/routes.ts +++ b/packages/server/src/routes.ts @@ -2,6 +2,7 @@ import type { Application } from "express"; import { mountMcp } from "@bb/mcp"; import { buildHealthRoute } from "./healthRoute.ts"; import { buildGithubIndexRoute } from "./githubIndexRoute.ts"; +import { buildGithubProbeRoute } from "./githubProbeRoute.ts"; import { buildGithubPullRoute } from "./githubPullRoute.ts"; import { buildGithubCommitsRoute } from "./githubCommitsRoute.ts"; import { buildLocalIndexRoute } from "./localIndexRoute.ts"; @@ -13,6 +14,7 @@ import { buildMcpStatsRoute } from "./mcpStatsRoute.ts"; export function registerRoutes(app: Application): void { app.use(buildHealthRoute()); app.use(buildGithubIndexRoute()); + app.use(buildGithubProbeRoute()); app.use(buildGithubPullRoute()); app.use(buildGithubCommitsRoute()); app.use(buildLocalIndexRoute()); From dd1946c3f487e692943de626657f217d262d5413 Mon Sep 17 00:00:00 2001 From: lovanshu garg Date: Thu, 14 May 2026 12:02:36 +0530 Subject: [PATCH 06/34] feat(interactive): build fixes and linting fixes --- bun.lock | 104 ++++++++---------- packages/cli/package.json | 1 + packages/cli/tsconfig.json | 9 +- packages/config/tsconfig.json | 3 +- packages/errors/tsconfig.json | 3 +- 
.../src/{githubRepo.ts => githubApi.ts} | 82 ++++++++++++++ packages/ingest-github/src/pipeline/branch.ts | 2 +- packages/ingest-github/tsconfig.json | 12 +- packages/llm/tsconfig.json | 3 +- packages/logger/tsconfig.json | 2 +- packages/mcp/tsconfig.json | 9 +- packages/mongo/src/knowledge.ts | 2 +- packages/mongo/tsconfig.json | 3 +- packages/neo4j/package.json | 1 + packages/neo4j/tsconfig.json | 3 +- packages/queue/src/manager.ts | 3 +- packages/queue/tsconfig.json | 9 +- packages/redis/tsconfig.json | 3 +- packages/server/src/githubCommitsRoute.ts | 9 +- packages/server/tsconfig.json | 12 +- tsconfig.base.json | 3 +- 21 files changed, 205 insertions(+), 73 deletions(-) rename packages/ingest-github/src/{githubRepo.ts => githubApi.ts} (64%) diff --git a/bun.lock b/bun.lock index 083df85..5b75687 100644 --- a/bun.lock +++ b/bun.lock @@ -1,6 +1,6 @@ { "lockfileVersion": 1, - "configVersion": 0, + "configVersion": 1, "workspaces": { "": { "name": "bytebell-public", @@ -29,6 +29,7 @@ "dependencies": { "@bb/config": "workspace:*", "@bb/errors": "workspace:*", + "@bb/ingest-github": "workspace:*", "@bb/logger": "workspace:*", "@bb/types": "workspace:*", "commander": "^14.0.3", @@ -122,6 +123,7 @@ "dependencies": { "@bb/config": "workspace:*", "@bb/errors": "workspace:*", + "@bb/mongo": "workspace:*", "@bb/types": "workspace:*", "neo4j-driver": "^6.0.1", }, @@ -210,13 +212,13 @@ "@colors/colors": ["@colors/colors@1.6.0", "", {}, "sha512-Ir+AOibqzrIsL6ajt3Rz3LskB7OiMVHqltZmspbW/TJuTVuyOMirVqAkjfY6JISiLHgyNqicAC8AyHHGzNd/dA=="], - "@commitlint/cli": ["@commitlint/cli@20.5.2", "", { "dependencies": { "@commitlint/format": "^20.5.0", "@commitlint/lint": "^20.5.0", "@commitlint/load": "^20.5.2", "@commitlint/read": "^20.5.0", "@commitlint/types": "^20.5.0", "tinyexec": "^1.0.0", "yargs": "^17.0.0" }, "bin": { "commitlint": "./cli.js" } }, "sha512-IXr5xd3IX8SEG936P8gcpozRplkDeDSwJlt8UvoY1winwIy2udTbQ/cOCgbaaxcjdDqVoS29VUcz/wkwnSozbA=="], + "@commitlint/cli": 
["@commitlint/cli@20.5.3", "", { "dependencies": { "@commitlint/format": "^20.5.0", "@commitlint/lint": "^20.5.3", "@commitlint/load": "^20.5.3", "@commitlint/read": "^20.5.0", "@commitlint/types": "^20.5.0", "tinyexec": "^1.0.0", "yargs": "^17.0.0" }, "bin": { "commitlint": "./cli.js" } }, "sha512-OJdL0EXWD5y9LPa0nr/geOwzaS8BsdaybKkcloB0JgsguGxNv2R+hC2FTPqrAcprg35zF33KOQerY0x8W1aesA=="], - "@commitlint/config-conventional": ["@commitlint/config-conventional@20.5.0", "", { "dependencies": { "@commitlint/types": "^20.5.0", "conventional-changelog-conventionalcommits": "^9.2.0" } }, "sha512-t3Ni88rFw1XMa4nZHgOKJ8fIAT9M2j5TnKyTqJzsxea7FUetlNdYFus9dz+MhIRZmc16P0PPyEfh6X2d/qw8SA=="], + "@commitlint/config-conventional": ["@commitlint/config-conventional@20.5.3", "", { "dependencies": { "@commitlint/types": "^20.5.0", "conventional-changelog-conventionalcommits": "^9.2.0" } }, "sha512-j34Qqeaa152chJgz2ysyk0BCpHenJn1lV0Rx0VXf8k3ccQcED+48EZrzMvo9jLmJUyBrrBwvu89I+2er4gW7QQ=="], "@commitlint/config-validator": ["@commitlint/config-validator@20.5.0", "", { "dependencies": { "@commitlint/types": "^20.5.0", "ajv": "^8.11.0" } }, "sha512-T/Uh6iJUzyx7j35GmHWdIiGRQB+ouZDk0pwAaYq4SXgB54KZhFdJ0vYmxiW6AMYICTIWuyMxDBl1jK74oFp/Gw=="], - "@commitlint/ensure": ["@commitlint/ensure@20.5.0", "", { "dependencies": { "@commitlint/types": "^20.5.0", "lodash.camelcase": "^4.3.0", "lodash.kebabcase": "^4.1.1", "lodash.snakecase": "^4.1.1", "lodash.startcase": "^4.4.0", "lodash.upperfirst": "^4.3.1" } }, "sha512-IpHqAUesBeW1EDDdjzJeaOxU9tnogLAyXLRBn03SHlj1SGENn2JGZqSWGkFvBJkJzfXAuCNtsoYzax+ZPS+puw=="], + "@commitlint/ensure": ["@commitlint/ensure@20.5.3", "", { "dependencies": { "@commitlint/types": "^20.5.0", "es-toolkit": "^1.46.0" } }, "sha512-4i4AgNvH62owG9MwSiWKrle7HGNpBHHdLnWFIp5fTsHUYe5kRuh15t08L/0pdbbrRk8JKXQxxN4hZQcn+szkrw=="], "@commitlint/execute-rule": ["@commitlint/execute-rule@20.0.0", "", {}, 
"sha512-xyCoOShoPuPL44gVa+5EdZsBVao/pNzpQhkzq3RdtlFdKZtjWcLlUFQHSWBuhk5utKYykeJPSz2i8ABHQA+ZZw=="], @@ -224,9 +226,9 @@ "@commitlint/is-ignored": ["@commitlint/is-ignored@20.5.0", "", { "dependencies": { "@commitlint/types": "^20.5.0", "semver": "^7.6.0" } }, "sha512-JWLarAsurHJhPozbuAH6GbP4p/hdOCoqS9zJMfqwswne+/GPs5V0+rrsfOkP68Y8PSLphwtFXV0EzJ+GTXTTGg=="], - "@commitlint/lint": ["@commitlint/lint@20.5.0", "", { "dependencies": { "@commitlint/is-ignored": "^20.5.0", "@commitlint/parse": "^20.5.0", "@commitlint/rules": "^20.5.0", "@commitlint/types": "^20.5.0" } }, "sha512-jiM3hNUdu04jFBf1VgPdjtIPvbuVfDTBAc6L98AWcoLjF5sYqkulBHBzlVWll4rMF1T5zeQFB6r//a+s+BBKlA=="], + "@commitlint/lint": ["@commitlint/lint@20.5.3", "", { "dependencies": { "@commitlint/is-ignored": "^20.5.0", "@commitlint/parse": "^20.5.0", "@commitlint/rules": "^20.5.3", "@commitlint/types": "^20.5.0" } }, "sha512-M7JbWBNr2gXKaPc4i/KipsuW1gkDHpj35KPjWtKy3Z+2AQw5wu1gBi1LIO0uoaij67CqY4K8PxPZSGens4evCw=="], - "@commitlint/load": ["@commitlint/load@20.5.2", "", { "dependencies": { "@commitlint/config-validator": "^20.5.0", "@commitlint/execute-rule": "^20.0.0", "@commitlint/resolve-extends": "^20.5.2", "@commitlint/types": "^20.5.0", "cosmiconfig": "^9.0.1", "cosmiconfig-typescript-loader": "^6.1.0", "is-plain-obj": "^4.1.0", "lodash.mergewith": "^4.6.2", "picocolors": "^1.1.1" } }, "sha512-zmr0RGDz7vThxW1I8ohb9yBjnGuH9mqwJpn21hInjGla+IlLOkS9ey0+dD5HlkzFlY0lX2NYdA2lDW6/0rO7Gw=="], + "@commitlint/load": ["@commitlint/load@20.5.3", "", { "dependencies": { "@commitlint/config-validator": "^20.5.0", "@commitlint/execute-rule": "^20.0.0", "@commitlint/resolve-extends": "^20.5.3", "@commitlint/types": "^20.5.0", "cosmiconfig": "^9.0.1", "cosmiconfig-typescript-loader": "^6.1.0", "es-toolkit": "^1.46.0", "is-plain-obj": "^4.1.0", "picocolors": "^1.1.1" } }, "sha512-1FDZWuKyu98Myb8i7Tp31jPU2rZpOwAdYRyJcy2KoGg7Xk2A+bgHN8smhMaaNSNkmE8fwt53BokywZq8Gv/5XQ=="], "@commitlint/message": ["@commitlint/message@20.4.3", "", 
{}, "sha512-6akwCYrzcrFcTYz9GyUaWlhisY4lmQ3KvrnabmhoeAV8nRH4dXJAh4+EUQ3uArtxxKQkvxJS78hNX2EU3USgxQ=="], @@ -234,9 +236,9 @@ "@commitlint/read": ["@commitlint/read@20.5.0", "", { "dependencies": { "@commitlint/top-level": "^20.4.3", "@commitlint/types": "^20.5.0", "git-raw-commits": "^5.0.0", "minimist": "^1.2.8", "tinyexec": "^1.0.0" } }, "sha512-JDEIJ2+GnWpK8QqwfmW7O42h0aycJEWNqcdkJnyzLD11nf9dW2dWLTVEa8Wtlo4IZFGLPATjR5neA5QlOvIH1w=="], - "@commitlint/resolve-extends": ["@commitlint/resolve-extends@20.5.2", "", { "dependencies": { "@commitlint/config-validator": "^20.5.0", "@commitlint/types": "^20.5.0", "global-directory": "^5.0.0", "import-meta-resolve": "^4.0.0", "lodash.mergewith": "^4.6.2", "resolve-from": "^5.0.0" } }, "sha512-8EhSCU9eNos/5cI1yg64GW79UH1c64O69AfStCsj4zqy6An/qIphVEXj4/+2M6056T8coz00f+UXFn4WUUP1HQ=="], + "@commitlint/resolve-extends": ["@commitlint/resolve-extends@20.5.3", "", { "dependencies": { "@commitlint/config-validator": "^20.5.0", "@commitlint/types": "^20.5.0", "es-toolkit": "^1.46.0", "global-directory": "^5.0.0", "import-meta-resolve": "^4.0.0", "resolve-from": "^5.0.0" } }, "sha512-+ogW9v/u9JqpvAgTrLra/YTFo0KkjU6iNblF89pPsj4NebNc+DAWctsludwezI8YnsjBmfHpApSwcXprN/f/ew=="], - "@commitlint/rules": ["@commitlint/rules@20.5.0", "", { "dependencies": { "@commitlint/ensure": "^20.5.0", "@commitlint/message": "^20.4.3", "@commitlint/to-lines": "^20.0.0", "@commitlint/types": "^20.5.0" } }, "sha512-5NdQXQEdnDPT5pK8O39ZA7HohzPRHEsDGU23cyVCNPQy4WegAbAwrQk3nIu7p2sl3dutPk8RZd91yKTrMTnRkQ=="], + "@commitlint/rules": ["@commitlint/rules@20.5.3", "", { "dependencies": { "@commitlint/ensure": "^20.5.3", "@commitlint/message": "^20.4.3", "@commitlint/to-lines": "^20.0.0", "@commitlint/types": "^20.5.0" } }, "sha512-MPlMnb9D3wbszYMp+1hPtuhtPJndRo6I6yfkZVA4+jR8w7Kqp0u2u/Y+gzbaItx5Lltq5rw7FSZQWJMoXUC4NQ=="], "@commitlint/to-lines": ["@commitlint/to-lines@20.0.0", "", {}, 
"sha512-2l9gmwiCRqZNWgV+pX1X7z4yP0b3ex/86UmUFgoRt672Ez6cAM2lOQeHFRUTuE6sPpi8XBCGnd8Kh3bMoyHwJw=="], @@ -280,7 +282,7 @@ "@modelcontextprotocol/sdk": ["@modelcontextprotocol/sdk@1.29.0", "", { "dependencies": { "@hono/node-server": "^1.19.9", "ajv": "^8.17.1", "ajv-formats": "^3.0.1", "content-type": "^1.0.5", "cors": "^2.8.5", "cross-spawn": "^7.0.5", "eventsource": "^3.0.2", "eventsource-parser": "^3.0.0", "express": "^5.2.1", "express-rate-limit": "^8.2.1", "hono": "^4.11.4", "jose": "^6.1.3", "json-schema-typed": "^8.0.2", "pkce-challenge": "^5.0.0", "raw-body": "^3.0.0", "zod": "^3.25 || ^4.0", "zod-to-json-schema": "^3.25.1" }, "peerDependencies": { "@cfworker/json-schema": "^4.1.1" }, "optionalPeers": ["@cfworker/json-schema"] }, "sha512-zo37mZA9hJWpULgkRpowewez1y6ML5GsXJPY8FI0tBBCd77HEvza4jDqRKOXgHNn867PVGCyTdzqpz0izu5ZjQ=="], - "@mongodb-js/saslprep": ["@mongodb-js/saslprep@1.4.9", "", { "dependencies": { "sparse-bitfield": "^3.0.3" } }, "sha512-RXSxsokhAF/4nWys8An8npsqOI33Ex1Hlzqjw2pZOO+GKtMAR2noGnUdsFiGwsaO/xXI+56mtjTmDA3JXJsvmA=="], + "@mongodb-js/saslprep": ["@mongodb-js/saslprep@1.4.11", "", { "dependencies": { "sparse-bitfield": "^3.0.3" } }, "sha512-o9rAHc0IpIjuPSxRutWpE1F62x7n+4mVS4rCNHkzhIUMQcc18bb6xEq5wd2NdN0WjepIyXIppRshYI2kQDOZVA=="], "@msgpackr-extract/msgpackr-extract-darwin-arm64": ["@msgpackr-extract/msgpackr-extract-darwin-arm64@3.0.3", "", { "os": "darwin", "cpu": "arm64" }, "sha512-QZHtlVgbAdy2zAqNA9Gu1UpIuI8Xvsd1v8ic6B2pZmeFnFcMWiPLfWXh7TVw4eGEZ/C9TH281KwhVoeQUKbyjw=="], @@ -304,13 +306,13 @@ "@types/body-parser": ["@types/body-parser@1.19.6", "", { "dependencies": { "@types/connect": "*", "@types/node": "*" } }, "sha512-HLFeCYgz89uk22N5Qg3dvGvsv46B8GLvKKo1zKG4NybA8U2DiEO3w9lqGg29t/tfLRJpJ6iQxnVw4OnB7MoM9g=="], - "@types/bun": ["@types/bun@1.3.13", "", { "dependencies": { "bun-types": "1.3.13" } }, "sha512-9fqXWk5YIHGGnUau9TEi+qdlTYDAnOj+xLCmSTwXfAIqXr2x4tytJb43E9uCvt09zJURKXwAtkoH4nLQfzeTXw=="], + "@types/bun": ["@types/bun@1.3.14", "", 
{ "dependencies": { "bun-types": "1.3.14" } }, "sha512-h1hFqFVcvAvD9j9K7ZW7vd82aSA+rTdznZa+5bwvCwqSB1jmmfLcbIWhOLx1/+boy/xmjgCs/OMUL8hRJSmnPw=="], "@types/connect": ["@types/connect@3.4.38", "", { "dependencies": { "@types/node": "*" } }, "sha512-K6uROf1LD88uDQqJCktA4yzL1YYAK6NgfsI0v/mTgyPKWsX1CnJ0XPSDhViejru1GcRkLWb8RlzFYJRqGUbaug=="], "@types/esrecurse": ["@types/esrecurse@4.3.1", "", {}, "sha512-xJBAbDifo5hpffDBuHl0Y8ywswbiAp/Wi7Y/GtAgSlZyIABppyurxVueOPE8LUQOxdlgi6Zqce7uoEpqNTeiUw=="], - "@types/estree": ["@types/estree@1.0.8", "", {}, "sha512-dWHzHa2WqEXI/O1E9OjrocMTKJl2mSrEolh1Iomrv6U+JuNwaHXsXx9bLu5gG7BUWFIN0skIQJQ/L1rIex4X6w=="], + "@types/estree": ["@types/estree@1.0.9", "", {}, "sha512-GhdPgy1el4/ImP05X05Uw4cw2/M93BCUmnEvWZNStlCzEKME4Fkk+YpoA5OiHNQmoS7Cafb8Xa3Pya8m1Qrzeg=="], "@types/express": ["@types/express@5.0.6", "", { "dependencies": { "@types/body-parser": "*", "@types/express-serve-static-core": "^5.0.0", "@types/serve-static": "^2" } }, "sha512-sKYVuV7Sv9fbPIt/442koC7+IIwK5olP1KWeD88e/idgoJqDm3JV/YUiPwkoKK92ylff2MGxSz1CSjsXelx0YA=="], @@ -320,9 +322,9 @@ "@types/json-schema": ["@types/json-schema@7.0.15", "", {}, "sha512-5+fP8P8MFNC+AyZCDxrB2pkZFPGzqQWUzpSeuuVLvm8VMcorNYavBqoFcxK8bQz4Qsbn4oUEEem4wDLfcysGHA=="], - "@types/node": ["@types/node@25.6.0", "", { "dependencies": { "undici-types": "~7.19.0" } }, "sha512-+qIYRKdNYJwY3vRCZMdJbPLJAtGjQBudzZzdzwQYkEPQd+PJGixUL5QfvCLDaULoLv+RhT3LDkwEfKaAkgSmNQ=="], + "@types/node": ["@types/node@25.7.0", "", { "dependencies": { "undici-types": "~7.21.0" } }, "sha512-z+pdZyxE+RTQE9AcboAZCb4otwcrvgHD+GlBpPgn0emDVt0ohrTMhAwlr2Wd9nZ+nihhYFxO2pThz3C5qSu2Eg=="], - "@types/qs": ["@types/qs@6.15.0", "", {}, "sha512-JawvT8iBVWpzTrz3EGw9BTQFg3BQNmwERdKE22vlTxawwtbyUSlMppvZYKLZzB5zgACXdXxbD3m1bXaMqP/9ow=="], + "@types/qs": ["@types/qs@6.15.1", "", {}, "sha512-GZHUBZR9hckSUhrxmp1nG6NwdpM9fCunJwyThLW1X3AyHgd9IlHb6VANpQQqDr2o/qQp6McZ3y/IA2rVzKzSbw=="], "@types/range-parser": ["@types/range-parser@1.2.7", "", {}, 
"sha512-hKormJbkJqzQGhziax5PItDUTMAM9uE2XXQmM37dyd4hVM+5aVl7oVxMVUiVQn2oCQFN/LKCZdvSM0pFRqbSmQ=="], @@ -338,25 +340,25 @@ "@types/whatwg-url": ["@types/whatwg-url@13.0.0", "", { "dependencies": { "@types/webidl-conversions": "*" } }, "sha512-N8WXpbE6Wgri7KUSvrmQcqrMllKZ9uxkYWMt+mCSGwNc0Hsw9VQTW7ApqI4XNrx6/SaM2QQJCzMPDEXE058s+Q=="], - "@typescript-eslint/eslint-plugin": ["@typescript-eslint/eslint-plugin@8.59.0", "", { "dependencies": { "@eslint-community/regexpp": "^4.12.2", "@typescript-eslint/scope-manager": "8.59.0", "@typescript-eslint/type-utils": "8.59.0", "@typescript-eslint/utils": "8.59.0", "@typescript-eslint/visitor-keys": "8.59.0", "ignore": "^7.0.5", "natural-compare": "^1.4.0", "ts-api-utils": "^2.5.0" }, "peerDependencies": { "@typescript-eslint/parser": "^8.59.0", "eslint": "^8.57.0 || ^9.0.0 || ^10.0.0", "typescript": ">=4.8.4 <6.1.0" } }, "sha512-HyAZtpdkgZwpq8Sz3FSUvCR4c+ScbuWa9AksK2Jweub7w4M3yTz4O11AqVJzLYjy/B9ZWPyc81I+mOdJU/bDQw=="], + "@typescript-eslint/eslint-plugin": ["@typescript-eslint/eslint-plugin@8.59.3", "", { "dependencies": { "@eslint-community/regexpp": "^4.12.2", "@typescript-eslint/scope-manager": "8.59.3", "@typescript-eslint/type-utils": "8.59.3", "@typescript-eslint/utils": "8.59.3", "@typescript-eslint/visitor-keys": "8.59.3", "ignore": "^7.0.5", "natural-compare": "^1.4.0", "ts-api-utils": "^2.5.0" }, "peerDependencies": { "@typescript-eslint/parser": "^8.59.3", "eslint": "^8.57.0 || ^9.0.0 || ^10.0.0", "typescript": ">=4.8.4 <6.1.0" } }, "sha512-PwFvSKsXGShKGW6n5bZOhGHEcCZXM8HofLK9fNsEwZXzFRjoY+XT1Vsf1zgyXdwTr0ZYz1/2tkZ0DBTT9jZjhw=="], - "@typescript-eslint/parser": ["@typescript-eslint/parser@8.59.0", "", { "dependencies": { "@typescript-eslint/scope-manager": "8.59.0", "@typescript-eslint/types": "8.59.0", "@typescript-eslint/typescript-estree": "8.59.0", "@typescript-eslint/visitor-keys": "8.59.0", "debug": "^4.4.3" }, "peerDependencies": { "eslint": "^8.57.0 || ^9.0.0 || ^10.0.0", "typescript": ">=4.8.4 <6.1.0" } }, 
"sha512-TI1XGwKbDpo9tRW8UDIXCOeLk55qe9ZFGs8MTKU6/M08HWTw52DD/IYhfQtOEhEdPhLMT26Ka/x7p70nd3dzDg=="], + "@typescript-eslint/parser": ["@typescript-eslint/parser@8.59.3", "", { "dependencies": { "@typescript-eslint/scope-manager": "8.59.3", "@typescript-eslint/types": "8.59.3", "@typescript-eslint/typescript-estree": "8.59.3", "@typescript-eslint/visitor-keys": "8.59.3", "debug": "^4.4.3" }, "peerDependencies": { "eslint": "^8.57.0 || ^9.0.0 || ^10.0.0", "typescript": ">=4.8.4 <6.1.0" } }, "sha512-HPwA+hVkfcriajbNvTmZv4VRauibay+cWArYUYq7u7W7PmGShMxbPxLvrwDme55a6d5alG3nrYfhyJ/G28XlLg=="], - "@typescript-eslint/project-service": ["@typescript-eslint/project-service@8.59.0", "", { "dependencies": { "@typescript-eslint/tsconfig-utils": "^8.59.0", "@typescript-eslint/types": "^8.59.0", "debug": "^4.4.3" }, "peerDependencies": { "typescript": ">=4.8.4 <6.1.0" } }, "sha512-Lw5ITrR5s5TbC19YSvlr63ZfLaJoU6vtKTHyB0GQOpX0W7d5/Ir6vUahWi/8Sps/nOukZQ0IB3SmlxZnjaKVnw=="], + "@typescript-eslint/project-service": ["@typescript-eslint/project-service@8.59.3", "", { "dependencies": { "@typescript-eslint/tsconfig-utils": "^8.59.3", "@typescript-eslint/types": "^8.59.3", "debug": "^4.4.3" }, "peerDependencies": { "typescript": ">=4.8.4 <6.1.0" } }, "sha512-ECiUWa/KYRGDFUqTNehaRgzDshnJfkTABJxVemHk4ko22gcr0ukloKjWvyQ64g8YCV/UI47kN1dbmjf/GaQYng=="], - "@typescript-eslint/scope-manager": ["@typescript-eslint/scope-manager@8.59.0", "", { "dependencies": { "@typescript-eslint/types": "8.59.0", "@typescript-eslint/visitor-keys": "8.59.0" } }, "sha512-UzR16Ut8IpA3Mc4DbgAShlPPkVm8xXMWafXxB0BocaVRHs8ZGakAxGRskF7FId3sdk9lgGD73GSFaWmWFDE4dg=="], + "@typescript-eslint/scope-manager": ["@typescript-eslint/scope-manager@8.59.3", "", { "dependencies": { "@typescript-eslint/types": "8.59.3", "@typescript-eslint/visitor-keys": "8.59.3" } }, "sha512-t2LvZnoEfzKtnPjgeEu41xw5gxq9mQVfYy4OoZ4Vlt0sk3JwxmhCca/AR7DwOiHrjWgjAj6as4AhRLKSDfvZIA=="], - "@typescript-eslint/tsconfig-utils": 
["@typescript-eslint/tsconfig-utils@8.59.0", "", { "peerDependencies": { "typescript": ">=4.8.4 <6.1.0" } }, "sha512-91Sbl3s4Kb3SybliIY6muFBmHVv+pYXfybC4Oolp3dvk8BvIE3wOPc+403CWIT7mJNkfQRGtdqghzs2+Z91Tqg=="], + "@typescript-eslint/tsconfig-utils": ["@typescript-eslint/tsconfig-utils@8.59.3", "", { "peerDependencies": { "typescript": ">=4.8.4 <6.1.0" } }, "sha512-PcIJHjmaREXLgIAIzLnSY9VucEzz8FKXsRgFa1DmdGCK/5tJpW03TKJF01Q6VZd1lLdz2sIKPWaDUZN9dp//dw=="], - "@typescript-eslint/type-utils": ["@typescript-eslint/type-utils@8.59.0", "", { "dependencies": { "@typescript-eslint/types": "8.59.0", "@typescript-eslint/typescript-estree": "8.59.0", "@typescript-eslint/utils": "8.59.0", "debug": "^4.4.3", "ts-api-utils": "^2.5.0" }, "peerDependencies": { "eslint": "^8.57.0 || ^9.0.0 || ^10.0.0", "typescript": ">=4.8.4 <6.1.0" } }, "sha512-3TRiZaQSltGqGeNrJzzr1+8YcEobKH9rHnqIp/1psfKFmhRQDNMGP5hBufanYTGznwShzVLs3Mz+gDN7HkWfXg=="], + "@typescript-eslint/type-utils": ["@typescript-eslint/type-utils@8.59.3", "", { "dependencies": { "@typescript-eslint/types": "8.59.3", "@typescript-eslint/typescript-estree": "8.59.3", "@typescript-eslint/utils": "8.59.3", "debug": "^4.4.3", "ts-api-utils": "^2.5.0" }, "peerDependencies": { "eslint": "^8.57.0 || ^9.0.0 || ^10.0.0", "typescript": ">=4.8.4 <6.1.0" } }, "sha512-g71d8QD8UaiHGvrJwyIS1hCX5r63w6Jll+4VEYhEAHXTDIqX1JgxhTAbEHtKntL9kuc4jRo7/GWw5xfCepSccQ=="], - "@typescript-eslint/types": ["@typescript-eslint/types@8.59.0", "", {}, "sha512-nLzdsT1gdOgFxxxwrlNVUBzSNBEEHJ86bblmk4QAS6stfig7rcJzWKqCyxFy3YRRHXDWEkb2NralA1nOYkkm/A=="], + "@typescript-eslint/types": ["@typescript-eslint/types@8.59.3", "", {}, "sha512-ePFoH0g4ludssdRFqqDxQePCxU4WQyRa9+XVwjm7yLn0FKhMeoetC+qBEEI1Eyb1pGSDveTIT09Bvw2WhlGayg=="], - "@typescript-eslint/typescript-estree": ["@typescript-eslint/typescript-estree@8.59.0", "", { "dependencies": { "@typescript-eslint/project-service": "8.59.0", "@typescript-eslint/tsconfig-utils": "8.59.0", "@typescript-eslint/types": "8.59.0", 
"@typescript-eslint/visitor-keys": "8.59.0", "debug": "^4.4.3", "minimatch": "^10.2.2", "semver": "^7.7.3", "tinyglobby": "^0.2.15", "ts-api-utils": "^2.5.0" }, "peerDependencies": { "typescript": ">=4.8.4 <6.1.0" } }, "sha512-O9Re9P1BmBLFJyikRbQpLku/QA3/AueZNO9WePLBwQrvkixTmDe8u76B6CYUAITRl/rHawggEqUGn5QIkVRLMw=="], + "@typescript-eslint/typescript-estree": ["@typescript-eslint/typescript-estree@8.59.3", "", { "dependencies": { "@typescript-eslint/project-service": "8.59.3", "@typescript-eslint/tsconfig-utils": "8.59.3", "@typescript-eslint/types": "8.59.3", "@typescript-eslint/visitor-keys": "8.59.3", "debug": "^4.4.3", "minimatch": "^10.2.2", "semver": "^7.7.3", "tinyglobby": "^0.2.15", "ts-api-utils": "^2.5.0" }, "peerDependencies": { "typescript": ">=4.8.4 <6.1.0" } }, "sha512-CbRjVRAf7Lr9Kr8RopKcbY45p2VfmmHrm0ygOCYFi7oU8q19m0Fs/6iHS7kNOmwpp+ob07ZVcAqlxUod9lYdmg=="], - "@typescript-eslint/utils": ["@typescript-eslint/utils@8.59.0", "", { "dependencies": { "@eslint-community/eslint-utils": "^4.9.1", "@typescript-eslint/scope-manager": "8.59.0", "@typescript-eslint/types": "8.59.0", "@typescript-eslint/typescript-estree": "8.59.0" }, "peerDependencies": { "eslint": "^8.57.0 || ^9.0.0 || ^10.0.0", "typescript": ">=4.8.4 <6.1.0" } }, "sha512-I1R/K7V07XsMJ12Oaxg/O9GfrysGTmCRhvZJBv0RE0NcULMzjqVpR5kRRQjHsz3J/bElU7HwCO7zkqL+MSUz+g=="], + "@typescript-eslint/utils": ["@typescript-eslint/utils@8.59.3", "", { "dependencies": { "@eslint-community/eslint-utils": "^4.9.1", "@typescript-eslint/scope-manager": "8.59.3", "@typescript-eslint/types": "8.59.3", "@typescript-eslint/typescript-estree": "8.59.3" }, "peerDependencies": { "eslint": "^8.57.0 || ^9.0.0 || ^10.0.0", "typescript": ">=4.8.4 <6.1.0" } }, "sha512-JAvT14goBzRzzzZyqq3P9BLArIxTtQURUtFgQ/V7FO+eU+Gg6ES+5ymOPP1wRxXcxAYeivCk4uS3jCKWI1K8Zg=="], - "@typescript-eslint/visitor-keys": ["@typescript-eslint/visitor-keys@8.59.0", "", { "dependencies": { "@typescript-eslint/types": "8.59.0", "eslint-visitor-keys": "^5.0.0" 
} }, "sha512-/uejZt4dSere1bx12WLlPfv8GktzcaDtuJ7s42/HEZ5zGj9oxRaD4bj7qwSunXkf+pbAhFt2zjpHYUiT5lHf0Q=="], + "@typescript-eslint/visitor-keys": ["@typescript-eslint/visitor-keys@8.59.3", "", { "dependencies": { "@typescript-eslint/types": "8.59.3", "eslint-visitor-keys": "^5.0.0" } }, "sha512-f1UQF7ggd42YiwI5wGrRaPsa+P0CINBlrkLPmGfpq/u/I/oVtecoEIfFR9ag/oa1sLOsRNZ6xehf6qMZhQGBDg=="], "accepts": ["accepts@2.0.0", "", { "dependencies": { "mime-types": "^3.0.0", "negotiator": "^1.0.0" } }, "sha512-5cvg6CtKwfgdmVqY1WIiXKc3Q1bkRqGLi+2W/6ao+6Y7gu/RCwRuAhGEzh5B4KlszSuTLgZYuqFqo5bImjNKng=="], @@ -388,15 +390,15 @@ "body-parser": ["body-parser@2.2.2", "", { "dependencies": { "bytes": "^3.1.2", "content-type": "^1.0.5", "debug": "^4.4.3", "http-errors": "^2.0.0", "iconv-lite": "^0.7.0", "on-finished": "^2.4.1", "qs": "^6.14.1", "raw-body": "^3.0.1", "type-is": "^2.0.1" } }, "sha512-oP5VkATKlNwcgvxi0vM0p/D3n2C3EReYVX+DNYs5TjZFn/oQt2j+4sVJtSMr18pdRr8wjTcBl6LoV+FUwzPmNA=="], - "brace-expansion": ["brace-expansion@5.0.5", "", { "dependencies": { "balanced-match": "^4.0.2" } }, "sha512-VZznLgtwhn+Mact9tfiwx64fA9erHH/MCXEUfB/0bX/6Fz6ny5EGTXYltMocqg4xFAQZtnO3DHWWXi8RiuN7cQ=="], + "brace-expansion": ["brace-expansion@5.0.6", "", { "dependencies": { "balanced-match": "^4.0.2" } }, "sha512-kLpxurY4Z4r9sgMsyG0Z9uzsBlgiU/EFKhj/h91/8yHu0edo7XuixOIH3VcJ8kkxs6/jPzoI6U9Vj3WqbMQ94g=="], "bson": ["bson@7.2.0", "", {}, "sha512-YCEo7KjMlbNlyHhz7zAZNDpIpQbd+wOEHJYezv0nMYTn4x31eIUM2yomNNubclAt63dObUzKHWsBLJ9QcZNSnQ=="], "buffer": ["buffer@6.0.3", "", { "dependencies": { "base64-js": "^1.3.1", "ieee754": "^1.2.1" } }, "sha512-FTiCpNxtwiZZHEZbcbTIcZjERVICn9yq/pDFkTl95/AxzD1naBctN7YO68riM/gLSDY7sdrMby8hofADYuuqOA=="], - "bullmq": ["bullmq@5.76.3", "", { "dependencies": { "cron-parser": "4.9.0", "ioredis": "5.10.1", "msgpackr": "1.11.5", "node-abort-controller": "3.1.1", "semver": "7.7.4", "tslib": "2.8.1" } }, 
"sha512-UBICMeWLYa+Dz7IGBNebXApQ1OIxNd4t6nX+AFPQ5gFA3sosW34PENe8Q1cvbjcbMTaU3xrKPorb6tM1czRSsw=="], + "bullmq": ["bullmq@5.76.8", "", { "dependencies": { "cron-parser": "4.9.0", "ioredis": "5.10.1", "msgpackr": "2.0.1", "node-abort-controller": "3.1.1", "semver": "7.8.0", "tslib": "2.8.1" } }, "sha512-v3WTwA8diFtsADaJ8eK2ozyi2CYK9rDZCeoKF+dIPF/MUL8HxAOa3H72Gmu1lC4yKlho6t1PwNr/QpDVqaNEZQ=="], - "bun-types": ["bun-types@1.3.13", "", { "dependencies": { "@types/node": "*" } }, "sha512-QXKeHLlOLqQX9LgYaHJfzdBaV21T63HhFJnvuRCcjZiaUDpbs5ED1MgxbMra71CsryN/1dAoXuJJJwIv/2drVA=="], + "bun-types": ["bun-types@1.3.14", "", { "dependencies": { "@types/node": "*" } }, "sha512-4N0ig0fEomHt5R0KCFWjovxow98rIoRwKolrYdCcknNwMekCXRnWEUvgu5soYV8QXtVsrUD8B95MBOZGPvr6KQ=="], "bytes": ["bytes@3.1.2", "", {}, "sha512-/Nf7TyzTx6S3yRJObOAV7956r8cr2+Oj8AC5dt8wSP3BQAoeX58NoHyCU8P8zGkNXStjTSi6fzO6F0pBdcYbEg=="], @@ -504,7 +506,7 @@ "escape-string-regexp": ["escape-string-regexp@4.0.0", "", {}, "sha512-TtpcNJ3XAzx3Gq8sWRzJaVajRs0uVxA2YAkdb1jm2YkPz4G6egUFAyA3n5vtEIZefPk5Wa4UXbKuS5fKkJWdgA=="], - "eslint": ["eslint@10.2.1", "", { "dependencies": { "@eslint-community/eslint-utils": "^4.8.0", "@eslint-community/regexpp": "^4.12.2", "@eslint/config-array": "^0.23.5", "@eslint/config-helpers": "^0.5.5", "@eslint/core": "^1.2.1", "@eslint/plugin-kit": "^0.7.1", "@humanfs/node": "^0.16.6", "@humanwhocodes/module-importer": "^1.0.1", "@humanwhocodes/retry": "^0.4.2", "@types/estree": "^1.0.6", "ajv": "^6.14.0", "cross-spawn": "^7.0.6", "debug": "^4.3.2", "escape-string-regexp": "^4.0.0", "eslint-scope": "^9.1.2", "eslint-visitor-keys": "^5.0.1", "espree": "^11.2.0", "esquery": "^1.7.0", "esutils": "^2.0.2", "fast-deep-equal": "^3.1.3", "file-entry-cache": "^8.0.0", "find-up": "^5.0.0", "glob-parent": "^6.0.2", "ignore": "^5.2.0", "imurmurhash": "^0.1.4", "is-glob": "^4.0.0", "json-stable-stringify-without-jsonify": "^1.0.1", "minimatch": "^10.2.4", "natural-compare": "^1.4.0", "optionator": "^0.9.3" }, 
"peerDependencies": { "jiti": "*" }, "optionalPeers": ["jiti"], "bin": { "eslint": "bin/eslint.js" } }, "sha512-wiyGaKsDgqXvF40P8mDwiUp/KQjE1FdrIEJsM8PZ3XCiniTMXS3OHWWUe5FI5agoCnr8x4xPrTDZuxsBlNHl+Q=="], + "eslint": ["eslint@10.3.0", "", { "dependencies": { "@eslint-community/eslint-utils": "^4.8.0", "@eslint-community/regexpp": "^4.12.2", "@eslint/config-array": "^0.23.5", "@eslint/config-helpers": "^0.5.5", "@eslint/core": "^1.2.1", "@eslint/plugin-kit": "^0.7.1", "@humanfs/node": "^0.16.6", "@humanwhocodes/module-importer": "^1.0.1", "@humanwhocodes/retry": "^0.4.2", "@types/estree": "^1.0.6", "ajv": "^6.14.0", "cross-spawn": "^7.0.6", "debug": "^4.3.2", "escape-string-regexp": "^4.0.0", "eslint-scope": "^9.1.2", "eslint-visitor-keys": "^5.0.1", "espree": "^11.2.0", "esquery": "^1.7.0", "esutils": "^2.0.2", "fast-deep-equal": "^3.1.3", "file-entry-cache": "^8.0.0", "find-up": "^5.0.0", "glob-parent": "^6.0.2", "ignore": "^5.2.0", "imurmurhash": "^0.1.4", "is-glob": "^4.0.0", "json-stable-stringify-without-jsonify": "^1.0.1", "minimatch": "^10.2.4", "natural-compare": "^1.4.0", "optionator": "^0.9.3" }, "peerDependencies": { "jiti": "*" }, "optionalPeers": ["jiti"], "bin": { "eslint": "bin/eslint.js" } }, "sha512-XbEXaRva5cF0ZQB8w6MluHA0kZZfV2DuCMJ3ozyEOHLwDpZX2Lmm/7Pp0xdJmI0GL1W05VH5VwIFHEm1Vcw2gw=="], "eslint-config-prettier": ["eslint-config-prettier@10.1.8", "", { "peerDependencies": { "eslint": ">=7.0.0" }, "bin": { "eslint-config-prettier": "bin/cli.js" } }, "sha512-82GZUjRS0p/jganf6q1rEO25VSoHH0hKPCTrgillPjdI/3bgBhAE1QzHrHTizjpRvy6pGAvKjDJtk2pF9NDq8w=="], @@ -534,7 +536,7 @@ "express": ["express@5.2.1", "", { "dependencies": { "accepts": "^2.0.0", "body-parser": "^2.2.1", "content-disposition": "^1.0.0", "content-type": "^1.0.5", "cookie": "^0.7.1", "cookie-signature": "^1.2.1", "debug": "^4.4.0", "depd": "^2.0.0", "encodeurl": "^2.0.0", "escape-html": "^1.0.3", "etag": "^1.8.1", "finalhandler": "^2.1.0", "fresh": "^2.0.0", "http-errors": "^2.0.0", 
"merge-descriptors": "^2.0.0", "mime-types": "^3.0.0", "on-finished": "^2.4.1", "once": "^1.4.0", "parseurl": "^1.3.3", "proxy-addr": "^2.0.7", "qs": "^6.14.0", "range-parser": "^1.2.1", "router": "^2.2.0", "send": "^1.1.0", "serve-static": "^2.2.0", "statuses": "^2.0.1", "type-is": "^2.0.1", "vary": "^1.1.2" } }, "sha512-hIS4idWWai69NezIdRt2xFVofaF4j+6INOpJlVOLDO8zXGpUVEVzIYk12UUi2JzjEzWL3IOAxcTubgz9Po0yXw=="], - "express-rate-limit": ["express-rate-limit@8.5.0", "", { "dependencies": { "ip-address": "10.1.0" }, "peerDependencies": { "express": ">= 4.11" } }, "sha512-XKhFohWaSBdVJNTi5TaHziqnPkv04I9UQV6q1Wy7Ui6GGQZVW12ojDFwqer14EvCXxjvPG0CyWXx7cAXpALB4Q=="], + "express-rate-limit": ["express-rate-limit@8.5.1", "", { "dependencies": { "ip-address": "^10.2.0" }, "peerDependencies": { "express": ">= 4.11" } }, "sha512-5O6KYmyJEpuPJV5hNTXKbAHWRqrzyu+OI3vUnSd2kXFubIVpG7ezpgxQy76Zo5GQZtrQBg86hF+CM/NX+cioiQ=="], "fast-deep-equal": ["fast-deep-equal@3.1.3", "", {}, "sha512-f3qQ9oQy9j2AhBe/H9VC91wLmKBCCU/gDOnKNAYG5hswO7BLKj09Hc5HYNz9cGI++xlpDCIgDaitVs03ATR84Q=="], @@ -544,7 +546,7 @@ "fast-levenshtein": ["fast-levenshtein@2.0.6", "", {}, "sha512-DCXu6Ifhqcks7TZKY3Hxp3y6qphY5SJZmrWMDrKcERSOXWQdMhU9Ig/PYrzyw/ul9jOIyh0N4M0tbC5hodg8dw=="], - "fast-uri": ["fast-uri@3.1.0", "", {}, "sha512-iPeeDKJSWf4IEOasVVrknXpaBV0IApz/gp7S2bb7Z4Lljbl2MGJRqInZiUrQwV16cpzw/D3S5j5Julj/gT52AA=="], + "fast-uri": ["fast-uri@3.1.2", "", {}, "sha512-rVjf7ArG3LTk+FS6Yw81V1DLuZl1bRbNrev6Tmd/9RaroeeRRJhAt7jg/6YFxbvAQXUCavSoZhPPj6oOx+5KjQ=="], "fdir": ["fdir@6.5.0", "", { "peerDependencies": { "picomatch": "^3 || ^4" }, "optionalPeers": ["picomatch"] }, "sha512-tIbYtZbucOs0BRGqPJkshJUYdL+SDH7dVM8gjy+ERp3WAUjLEFJE+02kanyHtwjWOnwrKYBiwAmM0p4kLJAnXg=="], @@ -572,7 +574,7 @@ "get-caller-file": ["get-caller-file@2.0.5", "", {}, "sha512-DyFP3BM/3YHTQOCUL/w0OZHR0lpKeGrxotcHWcqNEdnltqFwXVfhEBQ94eIo34AfQpo0rGki4cyIiftY06h2Fg=="], - "get-east-asian-width": ["get-east-asian-width@1.5.0", "", {}, 
"sha512-CQ+bEO+Tva/qlmw24dCejulK5pMzVnUOFOijVogd3KQs07HnRIgp8TGipvCCRT06xeYEbpbgwaCxglFyiuIcmA=="], + "get-east-asian-width": ["get-east-asian-width@1.6.0", "", {}, "sha512-QRbvDIbx6YklUe6RxeTeleMR0yv3cYH6PsPZHcnVn7xv7zO1BHN8r0XETu8n6Ye3Q+ahtSarc3WgtNWmehIBfA=="], "get-intrinsic": ["get-intrinsic@1.3.0", "", { "dependencies": { "call-bind-apply-helpers": "^1.0.2", "es-define-property": "^1.0.1", "es-errors": "^1.3.0", "es-object-atoms": "^1.1.1", "function-bind": "^1.1.2", "get-proto": "^1.0.1", "gopd": "^1.2.0", "has-symbols": "^1.1.0", "hasown": "^2.0.2", "math-intrinsics": "^1.1.0" } }, "sha512-9fSjSaos/fRIVIp+xSJlE6lfwhES7LNtKaCBIamHsjr2na1BiABJPo0mOjjz8GJDURarmCPGqaiVg5mfjb98CQ=="], @@ -584,7 +586,7 @@ "global-directory": ["global-directory@5.0.0", "", { "dependencies": { "ini": "6.0.0" } }, "sha512-1pgFdhK3J2LeM+dVf2Pd424yHx2ou338lC0ErNP2hPx4j8eW1Sp0XqSjNxtk6Tc4Kr5wlWtSvz8cn2yb7/SG/w=="], - "globals": ["globals@17.5.0", "", {}, "sha512-qoV+HK2yFl/366t2/Cb3+xxPUo5BuMynomoDmiaZBIdbs+0pYbjfZU+twLhGKp4uCZ/+NbtpVepH5bGCxRyy2g=="], + "globals": ["globals@17.6.0", "", {}, "sha512-sepffkT8stwnIYbsMBpoCHJuJM5l98FUF2AnE07hfvE0m/qp3R586hw4jF4uadbhvg1ooIdzuu7CsfD2jzCaNA=="], "gopd": ["gopd@1.2.0", "", {}, "sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg=="], @@ -592,7 +594,7 @@ "hasown": ["hasown@2.0.3", "", { "dependencies": { "function-bind": "^1.1.2" } }, "sha512-ej4AhfhfL2Q2zpMmLo7U1Uv9+PyhIZpgQLGT1F9miIGmiCJIoCgSmczFdrc97mWT4kVY72KA+WnnhJ5pghSvSg=="], - "hono": ["hono@4.12.17", "", {}, "sha512-FbJJNb/XgX7YW0hX/V8w5oYLztKEsRLykCMZWt1WdLtsfjzMvmoqWBA4H4t5norinq8/rh20oiZYr+WSl4UzAQ=="], + "hono": ["hono@4.12.18", "", {}, "sha512-RWzP96k/yv0PQfyXnWjs6zot20TqfpfsNXhOnev8d1InAxubW93L11/oNUc3tQqn2G0bSdAOBpX+2uDFHV7kdQ=="], "http-errors": ["http-errors@2.0.1", "", { "dependencies": { "depd": "~2.0.0", "inherits": "~2.0.4", "setprototypeof": "~1.2.0", "statuses": "~2.0.2", "toidentifier": "~1.0.1" } }, 
"sha512-4FbRdAX+bSdmo4AUFuS0WNiPz8NgFt+r8ThgNWmlrjQjt1Q7ZR9+zTlce2859x4KSXrwIsaeTqDoKQmtP8pLmQ=="], @@ -616,13 +618,13 @@ "ini": ["ini@6.0.0", "", {}, "sha512-IBTdIkzZNOpqm7q3dRqJvMaldXjDHWkEDfrwGEQTs5eaQMWV+djAhR+wahyNNMAa+qpbDUhBMVt4ZKNwpPm7xQ=="], - "ink": ["ink@7.0.1", "", { "dependencies": { "@alcalzone/ansi-tokenize": "^0.3.0", "ansi-escapes": "^7.3.0", "ansi-styles": "^6.2.3", "auto-bind": "^5.0.1", "chalk": "^5.6.2", "cli-boxes": "^4.0.1", "cli-cursor": "^4.0.0", "cli-truncate": "^6.0.0", "code-excerpt": "^4.0.0", "es-toolkit": "^1.45.1", "indent-string": "^5.0.0", "is-in-ci": "^2.0.0", "patch-console": "^2.0.0", "react-reconciler": "^0.33.0", "scheduler": "^0.27.0", "signal-exit": "^3.0.7", "slice-ansi": "^9.0.0", "stack-utils": "^2.0.6", "string-width": "^8.2.0", "terminal-size": "^4.0.1", "type-fest": "^5.5.0", "widest-line": "^6.0.0", "wrap-ansi": "^10.0.0", "ws": "^8.20.0", "yoga-layout": "~3.2.1" }, "peerDependencies": { "@types/react": ">=19.2.0", "react": ">=19.2.0", "react-devtools-core": ">=6.1.2" }, "optionalPeers": ["@types/react", "react-devtools-core"] }, "sha512-o6LAC268PLawlGVYrXTyaTfke4VtJftEheuwbgkQf7yvSXyWp1nRwBbAyKEkWXFZZsW/la5wrMuNbuBvZK2C1w=="], + "ink": ["ink@7.0.3", "", { "dependencies": { "@alcalzone/ansi-tokenize": "^0.3.0", "ansi-escapes": "^7.3.0", "ansi-styles": "^6.2.3", "auto-bind": "^5.0.1", "chalk": "^5.6.2", "cli-boxes": "^4.0.1", "cli-cursor": "^4.0.0", "cli-truncate": "^6.0.0", "code-excerpt": "^4.0.0", "es-toolkit": "^1.45.1", "indent-string": "^5.0.0", "is-in-ci": "^2.0.0", "patch-console": "^2.0.0", "react-reconciler": "^0.33.0", "scheduler": "^0.27.0", "signal-exit": "^3.0.7", "slice-ansi": "^9.0.0", "stack-utils": "^2.0.6", "string-width": "^8.2.0", "terminal-size": "^4.0.1", "type-fest": "^5.5.0", "widest-line": "^6.0.0", "wrap-ansi": "^10.0.0", "ws": "^8.20.0", "yoga-layout": "~3.2.1" }, "peerDependencies": { "@types/react": ">=19.2.0", "react": ">=19.2.0", "react-devtools-core": ">=6.1.2" }, "optionalPeers": 
["@types/react", "react-devtools-core"] }, "sha512-5kxHkIj9+RuqCU3zyvP4qvYWNOSHP2TW/SHayHGHOmk87KwfVcZwvJGemi9ch+ci2gXUqerK/Eh2DGEDt5q45g=="], "ink-text-input": ["ink-text-input@6.0.0", "", { "dependencies": { "chalk": "^5.3.0", "type-fest": "^4.18.2" }, "peerDependencies": { "ink": ">=5", "react": ">=18" } }, "sha512-Fw64n7Yha5deb1rHY137zHTAbSTNelUKuB5Kkk2HACXEtwIHBCf9OH2tP/LQ9fRYTl1F0dZgbW0zPnZk6FA9Lw=="], "ioredis": ["ioredis@5.10.1", "", { "dependencies": { "@ioredis/commands": "1.5.1", "cluster-key-slot": "^1.1.0", "debug": "^4.3.4", "denque": "^2.1.0", "lodash.defaults": "^4.2.0", "lodash.isarguments": "^3.1.0", "redis-errors": "^1.2.0", "redis-parser": "^3.0.0", "standard-as-callback": "^2.1.0" } }, "sha512-HuEDBTI70aYdx1v6U97SbNx9F1+svQKBDo30o0b9fw055LMepzpOOd0Ccg9Q6tbqmBSJaMuY0fB7yw9/vjBYCA=="], - "ip-address": ["ip-address@10.1.0", "", {}, "sha512-XXADHxXmvT9+CRxhXg56LJovE+bmWnEWB78LB83VZTprKTmaC5QfruXocxzTZ2Kl0DNwKuBdlIhjL8LeY8Sf8Q=="], + "ip-address": ["ip-address@10.2.0", "", {}, "sha512-/+S6j4E9AHvW9SWMSEY9Xfy66O5PWvVEJ08O0y5JGyEKQpojb0K0GKpz/v5HJ/G0vi3D2sjGK78119oXZeE0qA=="], "ipaddr.js": ["ipaddr.js@1.9.1", "", {}, "sha512-0KI/607xoxSToH7GjN1FfSbLoU0+btTicjsQSWQlh/hZykN8KpmMf7uYwPW3R+akZ6R/w18ZlXSHBYXiYUPO3g=="], @@ -678,22 +680,10 @@ "locate-path": ["locate-path@6.0.0", "", { "dependencies": { "p-locate": "^5.0.0" } }, "sha512-iPZK6eYjbxRu3uB4/WZ3EsEIMJFMqAoopl3R+zuq0UjcAm/MO6KCweDgPfP3elTztoKP3KtnVHxTn2NHBSDVUw=="], - "lodash.camelcase": ["lodash.camelcase@4.3.0", "", {}, "sha512-TwuEnCnxbc3rAvhf/LbG7tJUDzhqXyFnv3dtzLOPgCG/hODL7WFnsbwktkD7yUV0RrreP/l1PALq/YSg6VvjlA=="], - "lodash.defaults": ["lodash.defaults@4.2.0", "", {}, "sha512-qjxPLHd3r5DnsdGacqOMU6pb/avJzdh9tFX2ymgoZE27BmjXrNy/y4LoaiTeAb+O3gL8AfpJGtqfX/ae2leYYQ=="], "lodash.isarguments": ["lodash.isarguments@3.1.0", "", {}, "sha512-chi4NHZlZqZD18a0imDHnZPrDeBbTtVN7GXMwuGdRH9qotxAjYs3aVLKc7zNOG9eddR5Ksd8rvFEBc9SsggPpg=="], - "lodash.kebabcase": ["lodash.kebabcase@4.1.1", "", {}, 
"sha512-N8XRTIMMqqDgSy4VLKPnJ/+hpGZN+PHQiJnSenYqPaVV/NCqEogTnAdZLQiGKhxX+JCs8waWq2t1XHWKOmlY8g=="], - - "lodash.mergewith": ["lodash.mergewith@4.6.2", "", {}, "sha512-GK3g5RPZWTRSeLSpgP8Xhra+pnjBC56q9FZYe1d5RN3TJ35dbkGy3YqBSMbyCrlbi+CM9Z3Jk5yTL7RCsqboyQ=="], - - "lodash.snakecase": ["lodash.snakecase@4.1.1", "", {}, "sha512-QZ1d4xoBHYUeuouhEq3lk3Uq7ldgyFXGBhg04+oRLnIz8o9T65Eh+8YdroUwn846zchkA9yDsDl5CVVaV2nqYw=="], - - "lodash.startcase": ["lodash.startcase@4.4.0", "", {}, "sha512-+WKqsK294HMSc2jEbNgpHpd0JfIBhp7rEV4aqXWqFr6AlXov+SlcgB1Fv01y2kGe3Gc8nMW7VA0SrGuSkRfIEg=="], - - "lodash.upperfirst": ["lodash.upperfirst@4.3.1", "", {}, "sha512-sReKOYJIJf74dhJONhU4e0/shzi1trVbSWDOhKYE5XV2O+H7Sb2Dihwuc7xWxVl+DgFPyTqIN3zMfT9cq5iWDg=="], - "log-update": ["log-update@6.1.0", "", { "dependencies": { "ansi-escapes": "^7.0.0", "cli-cursor": "^5.0.0", "slice-ansi": "^7.1.0", "strip-ansi": "^7.1.0", "wrap-ansi": "^9.0.0" } }, "sha512-9ie8ItPR6tjY5uYJh8K/Zrv/RMZ5VOlOWvtZdEHYSTFKZfIBPQa9tOAEeAWhd+AnIneLJ22w5fjOYtoutpWq5w=="], "logform": ["logform@2.7.0", "", { "dependencies": { "@colors/colors": "1.6.0", "@types/triple-beam": "^1.3.2", "fecha": "^4.2.0", "ms": "^2.1.1", "safe-stable-stringify": "^2.3.1", "triple-beam": "^1.3.0" } }, "sha512-TFYA4jnP7PVbmlBIfhlSe+WKxs9dklXMTEGcBCIvLhE/Tn3H6Gk1norupVW7m5Cnd4bLcr08AytbyV/xj7f/kQ=="], @@ -730,7 +720,7 @@ "ms": ["ms@2.1.3", "", {}, "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA=="], - "msgpackr": ["msgpackr@1.11.5", "", { "optionalDependencies": { "msgpackr-extract": "^3.0.2" } }, "sha512-UjkUHN0yqp9RWKy0Lplhh+wlpdt9oQBYgULZOiFhV3VclSF1JnSQWZ5r9gORQlNYaUKQoR8itv7g7z1xDDuACA=="], + "msgpackr": ["msgpackr@2.0.1", "", { "optionalDependencies": { "msgpackr-extract": "^3.0.2" } }, "sha512-9J+tqTEsbHqY8YohazYgty7LgerFIWxvMLpUjqETSmjHojtJm2WnX2kK/2a1fLI7CO7ERP1YSEUXMucz4j+yBA=="], "msgpackr-extract": ["msgpackr-extract@3.0.3", "", { "dependencies": { "node-gyp-build-optional-packages": "5.2.2" }, 
"optionalDependencies": { "@msgpackr-extract/msgpackr-extract-darwin-arm64": "3.0.3", "@msgpackr-extract/msgpackr-extract-darwin-x64": "3.0.3", "@msgpackr-extract/msgpackr-extract-linux-arm": "3.0.3", "@msgpackr-extract/msgpackr-extract-linux-arm64": "3.0.3", "@msgpackr-extract/msgpackr-extract-linux-x64": "3.0.3", "@msgpackr-extract/msgpackr-extract-win32-x64": "3.0.3" }, "bin": { "download-msgpackr-prebuilds": "bin/download-prebuilds.js" } }, "sha512-P0efT1C9jIdVRefqjzOQ9Xml57zpOXnIuS+csaB4MdZbTdmGDLo8XhzBG1N7aO11gKDDkJvBLULeFTo46wwreA=="], @@ -804,7 +794,7 @@ "raw-body": ["raw-body@3.0.2", "", { "dependencies": { "bytes": "~3.1.2", "http-errors": "~2.0.1", "iconv-lite": "~0.7.0", "unpipe": "~1.0.0" } }, "sha512-K5zQjDllxWkf7Z5xJdV0/B0WTNqx6vxG70zJE4N0kBs4LovmEYWJzQGxC9bS9RAKu3bgM40lrd5zoLJ12MQ5BA=="], - "react": ["react@19.2.5", "", {}, "sha512-llUJLzz1zTUBrskt2pwZgLq59AemifIftw4aB7JxOqf1HY2FDaGDxgwpAPVzHU1kdWabH7FauP4i1oEeer2WCA=="], + "react": ["react@19.2.6", "", {}, "sha512-sfWGGfavi0xr8Pg0sVsyHMAOziVYKgPLNrS7ig+ivMNb3wbCBw3KxtflsGBAwD3gYQlE/AEZsTLgToRrSCjb0Q=="], "react-reconciler": ["react-reconciler@0.33.0", "", { "dependencies": { "scheduler": "^0.27.0" }, "peerDependencies": { "react": "^19.2.0" } }, "sha512-KetWRytFv1epdpJc3J4G75I4WrplZE5jOL7Yq0p34+OVOKF4Se7WrdIdVC45XsSSmUTlht2FM/fM1FZb1mfQeA=="], @@ -836,7 +826,7 @@ "scheduler": ["scheduler@0.27.0", "", {}, "sha512-eNv+WrVbKu1f3vbYJT/xtiF5syA5HPIMtf9IgY/nKg0sWqzAUEvqY/xm7OcZc/qafLx/iO9FgOmeSAp4v5ti/Q=="], - "semver": ["semver@7.7.4", "", { "bin": { "semver": "bin/semver.js" } }, "sha512-vFKC2IEtQnVhpT78h1Yp8wzwrf8CM+MzKMHGJZfBtzhZNycRFnXsHk6E5TxIkkMsgNS7mdX3AGB7x2QM2di4lA=="], + "semver": ["semver@7.8.0", "", { "bin": { "semver": "bin/semver.js" } }, "sha512-AcM7dV/5ul4EekoQ29Agm5vri8JNqRyj39o0qpX6vDF2GZrtutZl5RwgD1XnZjiTAfncsJhMI48QQH3sN87YNA=="], "send": ["send@1.2.1", "", { "dependencies": { "debug": "^4.4.3", "encodeurl": "^2.0.0", "escape-html": "^1.0.3", "etag": "^1.8.1", "fresh": "^2.0.0", 
"http-errors": "^2.0.1", "mime-types": "^3.0.2", "ms": "^2.1.3", "on-finished": "^2.4.1", "range-parser": "^1.2.1", "statuses": "^2.0.2" } }, "sha512-1gnZf7DFcoIcajTjTwjwuDjzuz4PPcY2StKPlsGAQ1+YH20IRVrBaXSWmdjowTJ6u8Rc01PoYOGHXfP1mYcZNQ=="], @@ -888,7 +878,7 @@ "tiktoken": ["tiktoken@1.0.22", "", {}, "sha512-PKvy1rVF1RibfF3JlXBSP0Jrcw2uq3yXdgcEXtKTYn3QJ/cBRBHDnrJ5jHky+MENZ6DIPwNUGWpkVx+7joCpNA=="], - "tinyexec": ["tinyexec@1.1.1", "", {}, "sha512-VKS/ZaQhhkKFMANmAOhhXVoIfBXblQxGX1myCQ2faQrfmobMftXeJPcZGp0gS07ocvGJWDLZGyOZDadDBqYIJg=="], + "tinyexec": ["tinyexec@1.1.2", "", {}, "sha512-dAqSqE/RabpBKI8+h26GfLq6Vb3JVXs30XYQjdMjaj/c2tS8IYYMbIzP599KtRj7c57/wYApb3QjgRgXmrCukA=="], "tinyglobby": ["tinyglobby@0.2.16", "", { "dependencies": { "fdir": "^6.5.0", "picomatch": "^4.0.4" } }, "sha512-pn99VhoACYR8nFHhxqix+uvsbXineAasWm5ojXoN8xEwK5Kd3/TrhNn1wByuD52UxWRLy8pu+kRMniEi6Eq9Zg=="], @@ -906,13 +896,13 @@ "type-fest": ["type-fest@5.6.0", "", { "dependencies": { "tagged-tag": "^1.0.0" } }, "sha512-8ZiHFm91orbSAe2PSAiSVBVko18pbhbiB3U9GglSzF/zCGkR+rxpHx6sEMCUm4kxY4LjDIUGgCfUMtwfZfjfUA=="], - "type-is": ["type-is@2.0.1", "", { "dependencies": { "content-type": "^1.0.5", "media-typer": "^1.1.0", "mime-types": "^3.0.0" } }, "sha512-OZs6gsjF4vMp32qrCbiVSkrFmXtG/AZhY3t0iAMrMBiAZyV9oALtXO8hsrHbMXF9x6L3grlFuwW2oAz7cav+Gw=="], + "type-is": ["type-is@2.1.0", "", { "dependencies": { "content-type": "^2.0.0", "media-typer": "^1.1.0", "mime-types": "^3.0.0" } }, "sha512-faYHw0anBbc/kWF3zFTEnxSFOAGUX9GFbOBthvDdLsIlEoWOFOtS0zgCiQYwIskL9iGXZL3kAXD8OoZ4GmMATA=="], "typescript": ["typescript@5.9.3", "", { "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" } }, "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw=="], - "typescript-eslint": ["typescript-eslint@8.59.0", "", { "dependencies": { "@typescript-eslint/eslint-plugin": "8.59.0", "@typescript-eslint/parser": "8.59.0", "@typescript-eslint/typescript-estree": "8.59.0", 
"@typescript-eslint/utils": "8.59.0" }, "peerDependencies": { "eslint": "^8.57.0 || ^9.0.0 || ^10.0.0", "typescript": ">=4.8.4 <6.1.0" } }, "sha512-BU3ONW9X+v90EcCH9ZS6LMackcVtxRLlI3XrYyqZIwVSHIk7Qf7bFw1z0M9Q0IUxhTMZCf8piY9hTYaNEIASrw=="], + "typescript-eslint": ["typescript-eslint@8.59.3", "", { "dependencies": { "@typescript-eslint/eslint-plugin": "8.59.3", "@typescript-eslint/parser": "8.59.3", "@typescript-eslint/typescript-estree": "8.59.3", "@typescript-eslint/utils": "8.59.3" }, "peerDependencies": { "eslint": "^8.57.0 || ^9.0.0 || ^10.0.0", "typescript": ">=4.8.4 <6.1.0" } }, "sha512-KgusgyDgG4LI8Ih/sWaCtZ06tckLAS5CvT5A4D1Q7bYVoAAyzwiZvE4BmwDHkhRVkvhRBepKeASoFzQetha7Fg=="], - "undici-types": ["undici-types@7.19.2", "", {}, "sha512-qYVnV5OEm2AW8cJMCpdV20CDyaN3g0AjDlOGf1OW4iaDEx8MwdtChUp4zu4H0VP3nDRF/8RKWH+IPp9uW0YGZg=="], + "undici-types": ["undici-types@7.21.0", "", {}, "sha512-w9IMgQrz4O0YN1LtB7K5P63vhlIOvC7opSmouCJ+ZywlPAlO9gIkJ+otk6LvGpAs2wg4econaCz3TvQ9xPoyuQ=="], "unpipe": ["unpipe@1.0.0", "", {}, "sha512-pjy2bYhSsufwWlKwPc+l3cN7+wuJlK6uz0YdJEOlQDbl6jo/YlPi4mb8agUkVC8BF7V8NuzeyPNqRksA3hztKQ=="], @@ -942,11 +932,11 @@ "wrappy": ["wrappy@1.0.2", "", {}, "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ=="], - "ws": ["ws@8.20.0", "", { "peerDependencies": { "bufferutil": "^4.0.1", "utf-8-validate": ">=5.0.2" }, "optionalPeers": ["bufferutil", "utf-8-validate"] }, "sha512-sAt8BhgNbzCtgGbt2OxmpuryO63ZoDk/sqaB/znQm94T4fCEsy/yV+7CdC1kJhOU9lboAEU7R3kquuycDoibVA=="], + "ws": ["ws@8.20.1", "", { "peerDependencies": { "bufferutil": "^4.0.1", "utf-8-validate": ">=5.0.2" }, "optionalPeers": ["bufferutil", "utf-8-validate"] }, "sha512-It4dO0K5v//JtTXuPkfEOaI3uUN87iYPnqo/ZzqCoG3g8uhA66QUMs/SrM0YK7/NAu+r4LMh/9dq2A7k+rHs+w=="], "y18n": ["y18n@5.0.8", "", {}, "sha512-0pfFzegeDWJHJIAmTLRP2DwHjdF5s7jo9tuztdQxAhINCdvS+3nGINqPd00AphqJR/0LhANUS6/+7SCb98YOfA=="], - "yaml": ["yaml@2.8.3", "", { "bin": { "yaml": "bin.mjs" } }, 
"sha512-AvbaCLOO2Otw/lW5bmh9d/WEdcDFdQp2Z2ZUH3pX9U2ihyUY0nvLv7J6TrWowklRGPYbB/IuIMfYgxaCPg5Bpg=="], + "yaml": ["yaml@2.9.0", "", { "bin": { "yaml": "bin.mjs" } }, "sha512-2AvhNX3mb8zd6Zy7INTtSpl1F15HW6Wnqj0srWlkKLcpYl/gMIMJiyuGq2KeI2YFxUPjdlB+3Lc10seMLtL4cA=="], "yargs": ["yargs@17.7.2", "", { "dependencies": { "cliui": "^8.0.1", "escalade": "^3.1.1", "get-caller-file": "^2.0.5", "require-directory": "^2.1.1", "string-width": "^4.2.3", "y18n": "^5.0.5", "yargs-parser": "^21.1.1" } }, "sha512-7dSzzRQ++CKnNI/krKnYRV7JKKPUXMEh61soaHKg9mrWEhzFWhFnxPxGl+69cD1Ou63C13NUPCnmIcrvqCuM6w=="], @@ -956,7 +946,7 @@ "yoga-layout": ["yoga-layout@3.2.1", "", {}, "sha512-0LPOt3AxKqMdFBZA3HBAt/t/8vIKq7VaQYbuA8WxCgung+p9TVyKRYdpvCb80HcdTN2NkbIKbhNwKUfm3tQywQ=="], - "zod": ["zod@4.3.6", "", {}, "sha512-rftlrkhHZOcjDwkGlnUtZZkvaPHCsDATp4pGpuOOMDaTdDDXF91wuVDJoWoPsKX/3YPQ5fHuF3STjcYyKr+Qhg=="], + "zod": ["zod@4.4.3", "", {}, "sha512-ytENFjIJFl2UwYglde2jchW2Hwm4GJFLDiSXWdTrJQBIN9Fcyp7n4DhxJEiWNAJMV1/BqWfW/kkg71UDcHJyTQ=="], "zod-to-json-schema": ["zod-to-json-schema@3.25.2", "", { "peerDependencies": { "zod": "^3.25.28 || ^4" } }, "sha512-O/PgfnpT1xKSDeQYSCfRI5Gy3hPf91mKVDuYLUHZJMiDFptvP41MSnWofm8dnCm0256ZNfZIM7DSzuSMAFnjHA=="], @@ -992,6 +982,8 @@ "stack-utils/escape-string-regexp": ["escape-string-regexp@2.0.0", "", {}, "sha512-UpzcLCXolUWcNu5HtVMHYdXJjArjsF9C0aNnquZYY4uW/Vu0miy5YoWvbV345HauVvcAUnpRuhMMcqTcGOY2+w=="], + "type-is/content-type": ["content-type@2.0.0", "", {}, "sha512-j/O/d7GcZCyNl7/hwZAb606rzqkyvaDctLmckbxLzHvFBzTJHuGEdodATcP3yIRoDrLHkIATJuvzbFlp/ki2cQ=="], + "yargs/string-width": ["string-width@4.2.3", "", { "dependencies": { "emoji-regex": "^8.0.0", "is-fullwidth-code-point": "^3.0.0", "strip-ansi": "^6.0.1" } }, "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g=="], "@commitlint/config-validator/ajv/json-schema-traverse": ["json-schema-traverse@1.0.0", "", {}, 
"sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug=="], diff --git a/packages/cli/package.json b/packages/cli/package.json index e7297d3..0cb0936 100644 --- a/packages/cli/package.json +++ b/packages/cli/package.json @@ -14,6 +14,7 @@ "dependencies": { "@bb/config": "workspace:*", "@bb/errors": "workspace:*", + "@bb/ingest-github": "workspace:*", "@bb/logger": "workspace:*", "@bb/types": "workspace:*", "commander": "^14.0.3", diff --git a/packages/cli/tsconfig.json b/packages/cli/tsconfig.json index 614acd6..19ed7d8 100644 --- a/packages/cli/tsconfig.json +++ b/packages/cli/tsconfig.json @@ -5,5 +5,12 @@ "outDir": "./dist", "jsx": "react-jsx" }, - "include": ["src/**/*"] + "include": ["src/**/*"], + "references": [ + { "path": "../config" }, + { "path": "../errors" }, + { "path": "../ingest-github" }, + { "path": "../logger" }, + { "path": "../types" } + ] } diff --git a/packages/config/tsconfig.json b/packages/config/tsconfig.json index b2f9baa..fd6c909 100644 --- a/packages/config/tsconfig.json +++ b/packages/config/tsconfig.json @@ -6,5 +6,6 @@ "noEmit": false, "emitDeclarationOnly": true }, - "include": ["src/**/*"] + "include": ["src/**/*"], + "references": [{ "path": "../types" }] } diff --git a/packages/errors/tsconfig.json b/packages/errors/tsconfig.json index c2104f6..2d2ce73 100644 --- a/packages/errors/tsconfig.json +++ b/packages/errors/tsconfig.json @@ -4,5 +4,6 @@ "rootDir": "./src", "outDir": "./dist" }, - "include": ["src/**/*"] + "include": ["src/**/*"], + "references": [{ "path": "../types" }] } diff --git a/packages/ingest-github/src/githubRepo.ts b/packages/ingest-github/src/githubApi.ts similarity index 64% rename from packages/ingest-github/src/githubRepo.ts rename to packages/ingest-github/src/githubApi.ts index 7d3c266..ad2301f 100644 --- a/packages/ingest-github/src/githubRepo.ts +++ b/packages/ingest-github/src/githubApi.ts @@ -137,3 +137,85 @@ export async function fetchBranches( const branches 
= body.map((b) => b.name).filter((name): name is string => typeof name === "string"); return { status: "ok", branches }; } + +export interface CommitEntry { + sha: string; + message: string; + author: string; + timestamp: string; +} + +export type FetchCommitsResult = + | { status: "ok"; commits: CommitEntry[] } + | { status: "not_found" } + | { status: "unauthorized" } + | { status: "rate_limited" } + | { status: "error"; message: string }; + +/** + * Fetches recent commits for a repository on a specific branch. + */ +export async function fetchRecentCommits( + repoUrl: string, + branch: string, + limit = 10, + gitToken?: string, +): Promise { + const parsed = parseGithubRepo(repoUrl); + if (parsed === null) { + return { status: "error", message: `unparseable github url: ${repoUrl}` }; + } + + const headers: Record = { + Accept: "application/vnd.github+json", + "User-Agent": USER_AGENT, + "X-GitHub-Api-Version": "2022-11-28", + }; + if (gitToken !== undefined && gitToken.length > 0) { + headers["Authorization"] = `Bearer ${gitToken}`; + } + + const url = `https://api.github.com/repos/${parsed.owner}/${parsed.repo}/commits?per_page=${limit}&sha=${encodeURIComponent(branch)}`; + let response: Response; + try { + response = await fetch(url, { headers }); + } catch (cause: unknown) { + const msg = cause instanceof Error ? 
cause.message : String(cause); + return { status: "error", message: `github fetch failed: ${msg}` }; + } + + if (response.status === 404) { + return { status: "not_found" }; + } + if (response.status === 401) { + return { status: "unauthorized" }; + } + if (response.status === 403 && response.headers.get("x-ratelimit-remaining") === "0") { + return { status: "rate_limited" }; + } + if (!response.ok) { + const body = await response.text().catch(() => ""); + return { status: "error", message: `github ${response.status}: ${body.slice(0, 200)}` }; + } + + const body = (await response.json()) as Array<{ + sha?: unknown; + commit?: { message?: unknown; author?: { date?: unknown } }; + author?: { login?: unknown }; + }>; + + const commits = body + .map((c) => { + const sha = typeof c.sha === "string" ? c.sha : ""; + const message = typeof c.commit?.message === "string" ? c.commit.message : ""; + const author = typeof c.author?.login === "string" ? c.author.login : ""; + const timestamp = typeof c.commit?.author?.date === "string" ? 
c.commit.author.date : ""; + return { sha, message, author, timestamp }; + }) + .filter((c): c is CommitEntry => Boolean(c.sha && c.message)); + + return { status: "ok", commits }; +} + +export { parseGithubRepo } from "./githubUrl.ts"; +export type { ParsedRepo } from "./githubUrl.ts"; diff --git a/packages/ingest-github/src/pipeline/branch.ts b/packages/ingest-github/src/pipeline/branch.ts index dedc693..b048b49 100644 --- a/packages/ingest-github/src/pipeline/branch.ts +++ b/packages/ingest-github/src/pipeline/branch.ts @@ -1,6 +1,6 @@ import type { GithubIndexPayload } from "@bb/types"; import { IngestError } from "@bb/errors"; -import { fetchDefaultBranch } from "../githubRepo.ts"; +import { fetchDefaultBranch } from "src/githubApi.ts"; const DEFAULT_BRANCH = "main"; diff --git a/packages/ingest-github/tsconfig.json b/packages/ingest-github/tsconfig.json index 0fd9bd8..07da1a4 100644 --- a/packages/ingest-github/tsconfig.json +++ b/packages/ingest-github/tsconfig.json @@ -11,5 +11,15 @@ "noEmit": false, "emitDeclarationOnly": true }, - "include": ["src/**/*", "src/**/*.json"] + "include": ["src/**/*", "src/**/*.json"], + "references": [ + { "path": "../config" }, + { "path": "../errors" }, + { "path": "../llm" }, + { "path": "../logger" }, + { "path": "../mongo" }, + { "path": "../neo4j" }, + { "path": "../queue" }, + { "path": "../types" } + ] } diff --git a/packages/llm/tsconfig.json b/packages/llm/tsconfig.json index c2104f6..c69f55e 100644 --- a/packages/llm/tsconfig.json +++ b/packages/llm/tsconfig.json @@ -4,5 +4,6 @@ "rootDir": "./src", "outDir": "./dist" }, - "include": ["src/**/*"] + "include": ["src/**/*"], + "references": [{ "path": "../config" }, { "path": "../errors" }, { "path": "../mongo" }, { "path": "../types" }] } diff --git a/packages/logger/tsconfig.json b/packages/logger/tsconfig.json index 5ae1abb..c705055 100644 --- a/packages/logger/tsconfig.json +++ b/packages/logger/tsconfig.json @@ -7,5 +7,5 @@ "emitDeclarationOnly": true }, 
"include": ["src/**/*"], - "references": [{ "path": "../config" }] + "references": [{ "path": "../config" }, { "path": "../types" }] } diff --git a/packages/mcp/tsconfig.json b/packages/mcp/tsconfig.json index c2104f6..e3950ee 100644 --- a/packages/mcp/tsconfig.json +++ b/packages/mcp/tsconfig.json @@ -4,5 +4,12 @@ "rootDir": "./src", "outDir": "./dist" }, - "include": ["src/**/*"] + "include": ["src/**/*"], + "references": [ + { "path": "../config" }, + { "path": "../logger" }, + { "path": "../llm" }, + { "path": "../neo4j" }, + { "path": "../types" } + ] } diff --git a/packages/mongo/src/knowledge.ts b/packages/mongo/src/knowledge.ts index c91f5e3..766888a 100644 --- a/packages/mongo/src/knowledge.ts +++ b/packages/mongo/src/knowledge.ts @@ -1,4 +1,4 @@ -import { KnowledgeState, type KnowledgeDoc } from "@bb/types"; +import type { KnowledgeDoc, KnowledgeState } from "@bb/types"; import { KnowledgeNotFoundError } from "@bb/errors"; import { _getDb } from "./client.ts"; import { Collections } from "./collections.ts"; diff --git a/packages/mongo/tsconfig.json b/packages/mongo/tsconfig.json index c2104f6..df37b8d 100644 --- a/packages/mongo/tsconfig.json +++ b/packages/mongo/tsconfig.json @@ -4,5 +4,6 @@ "rootDir": "./src", "outDir": "./dist" }, - "include": ["src/**/*"] + "include": ["src/**/*"], + "references": [{ "path": "../config" }, { "path": "../errors" }, { "path": "../types" }] } diff --git a/packages/neo4j/package.json b/packages/neo4j/package.json index 6733044..6335062 100644 --- a/packages/neo4j/package.json +++ b/packages/neo4j/package.json @@ -12,6 +12,7 @@ "@bb/config": "workspace:*", "@bb/errors": "workspace:*", "@bb/types": "workspace:*", + "@bb/mongo": "workspace:*", "neo4j-driver": "^6.0.1" } } diff --git a/packages/neo4j/tsconfig.json b/packages/neo4j/tsconfig.json index c2104f6..31a7e4d 100644 --- a/packages/neo4j/tsconfig.json +++ b/packages/neo4j/tsconfig.json @@ -4,5 +4,6 @@ "rootDir": "./src", "outDir": "./dist" }, - "include": ["src/**/*"] 
+ "include": ["src/**/*"], + "references": [{ "path": "../config" }, { "path": "../errors" }, { "path": "../types" }, { "path": "../mongo" }] } diff --git a/packages/queue/src/manager.ts b/packages/queue/src/manager.ts index 456f75d..3d2f45d 100644 --- a/packages/queue/src/manager.ts +++ b/packages/queue/src/manager.ts @@ -1,4 +1,5 @@ -import { Queue, Worker } from "bullmq"; +import type { Worker } from "bullmq"; +import { Queue } from "bullmq"; import { JobType } from "@bb/types"; import { QueueConnectError, QueueNotConnectedError } from "@bb/errors"; import { getRedisConnection } from "@bb/redis"; diff --git a/packages/queue/tsconfig.json b/packages/queue/tsconfig.json index c2104f6..89d8570 100644 --- a/packages/queue/tsconfig.json +++ b/packages/queue/tsconfig.json @@ -4,5 +4,12 @@ "rootDir": "./src", "outDir": "./dist" }, - "include": ["src/**/*"] + "include": ["src/**/*"], + "references": [ + { "path": "../config" }, + { "path": "../errors" }, + { "path": "../mongo" }, + { "path": "../redis" }, + { "path": "../types" } + ] } diff --git a/packages/redis/tsconfig.json b/packages/redis/tsconfig.json index c2104f6..df37b8d 100644 --- a/packages/redis/tsconfig.json +++ b/packages/redis/tsconfig.json @@ -4,5 +4,6 @@ "rootDir": "./src", "outDir": "./dist" }, - "include": ["src/**/*"] + "include": ["src/**/*"], + "references": [{ "path": "../config" }, { "path": "../errors" }, { "path": "../types" }] } diff --git a/packages/server/src/githubCommitsRoute.ts b/packages/server/src/githubCommitsRoute.ts index a8ee67a..61f7863 100644 --- a/packages/server/src/githubCommitsRoute.ts +++ b/packages/server/src/githubCommitsRoute.ts @@ -57,7 +57,14 @@ export function buildGithubCommitsRoute(): Router { const result = await fetchRecentCommits(knowledge.source.repoUrl, branch, limit, gitToken); switch (result.status) { case "ok": { - const payload: CommitsResponse = { knowledgeId, branch, commits: result.commits }; + const commits = result.commits.map((c) => ({ + hash: c.sha, + 
shortHash: c.sha.slice(0, 7), + subject: c.message.split("\n")[0] ?? "", + author: c.author, + date: c.timestamp, + })); + const payload: CommitsResponse = { knowledgeId, branch, commits }; res.status(200).json(payload); return; } diff --git a/packages/server/tsconfig.json b/packages/server/tsconfig.json index 8d1771a..b195e2f 100644 --- a/packages/server/tsconfig.json +++ b/packages/server/tsconfig.json @@ -5,5 +5,15 @@ "outDir": "./dist" }, "include": ["src/**/*"], - "references": [{ "path": "../ingest-github" }] + "references": [ + { "path": "../config" }, + { "path": "../errors" }, + { "path": "../ingest-github" }, + { "path": "../mcp" }, + { "path": "../mongo" }, + { "path": "../neo4j" }, + { "path": "../queue" }, + { "path": "../redis" }, + { "path": "../types" } + ] } diff --git a/tsconfig.base.json b/tsconfig.base.json index 0275a27..6903d08 100644 --- a/tsconfig.base.json +++ b/tsconfig.base.json @@ -41,6 +41,7 @@ "declarationMap": true, "sourceMap": true, "incremental": true, - "noEmit": true + "noEmit": false, + "emitDeclarationOnly": true } } From 983218d5f399d289d2b37255458303957c509b39 Mon Sep 17 00:00:00 2001 From: Dead-Bytes <143434285+Dead-Bytes@users.noreply.github.com> Date: Thu, 14 May 2026 17:10:21 +0530 Subject: [PATCH 07/34] fix(boot): startup-config and logger factorized --- packages/config/context.md | 15 +++++++++++++++ packages/config/src/index.ts | 4 ++-- packages/config/src/loader.ts | 19 +++++++++++++++++++ packages/config/src/writer.ts | 11 +++++++++++ packages/logger/context.md | 16 ++++++++++++++++ packages/logger/src/index.ts | 13 ++++++++++--- packages/logger/src/logger.ts | 15 ++++++++++++++- 7 files changed, 87 insertions(+), 6 deletions(-) diff --git a/packages/config/context.md b/packages/config/context.md index 2b5666e..541ee5f 100644 --- a/packages/config/context.md +++ b/packages/config/context.md @@ -40,9 +40,24 @@ function getBytebellHome(): string function getConfigPath(): string function ensureBytebellHome(): void 
+function seedConfig(value: unknown): BytebellConfig +function __isSeeded(): boolean +class ConfigSeededError extends Error + +function __resetSeedForTests(): void // test-only function __setBytebellHomeForTests(home: string | null): void // test-only ``` +`seedConfig` injects a pre-parsed config object into the in-memory cache, +validated through `configSchema.parse`. When seeded, `loadConfig()` returns +the seeded values and **does not** call `ensureBytebellHome()` or read +`config.json`. The cache invalidator is also a no-op while seeded, so the seed +survives unexpected `__notifyConfigChanged` events. `setConfigValue` throws +`ConfigSeededError` when invoked against a seeded cache — writes are disabled +in that mode. When `seedConfig` is never called, behaviour is bit-for-bit identical to the +disk-backed path: `loadConfig()` materializes `~/.bytebell/config.json` on +first read and `setConfigValue` performs atomic writes. + The `Config` enum lives in `@bb/types`; `ConfigIncompleteError` lives in `@bb/errors`. Both are imported from those packages directly, not from `@bb/config`.
diff --git a/packages/config/src/index.ts b/packages/config/src/index.ts index de390ec..c60e1aa 100644 --- a/packages/config/src/index.ts +++ b/packages/config/src/index.ts @@ -1,9 +1,9 @@ export { LOG_LEVELS, LLM_PROVIDERS, HINTS } from "./schema.ts"; export type { BytebellConfig, ConfigValue, ConfigValueMap, LogLevel, LlmProvider } from "./schema.ts"; -export { loadConfig, getConfigValue, isConfigComplete } from "./loader.ts"; +export { loadConfig, getConfigValue, isConfigComplete, seedConfig, __isSeeded, __resetSeedForTests } from "./loader.ts"; export type { ConfigCompletenessResult } from "./loader.ts"; -export { setConfigValue, ensureBytebellHome } from "./writer.ts"; +export { setConfigValue, ensureBytebellHome, ConfigSeededError } from "./writer.ts"; export { getBytebellHome, getConfigPath, isDevMode, __setBytebellHomeForTests } from "./paths.ts"; diff --git a/packages/config/src/loader.ts b/packages/config/src/loader.ts index 5de16c2..b3adcb4 100644 --- a/packages/config/src/loader.ts +++ b/packages/config/src/loader.ts @@ -12,11 +12,30 @@ import { __registerCacheInvalidator, getConfigPath } from "./paths.ts"; import { ensureBytebellHome } from "./writer.ts"; let cached: BytebellConfig | null = null; +let seeded = false; __registerCacheInvalidator(() => { + if (seeded) { + return; + } cached = null; }); +export function seedConfig(value: unknown): BytebellConfig { + cached = configSchema.parse(value); + seeded = true; + return cached; +} + +export function __isSeeded(): boolean { + return seeded; +} + +export function __resetSeedForTests(): void { + cached = null; + seeded = false; +} + export function loadConfig(): BytebellConfig { if (cached !== null) { return cached; diff --git a/packages/config/src/writer.ts b/packages/config/src/writer.ts index 04d73ae..c89a82c 100644 --- a/packages/config/src/writer.ts +++ b/packages/config/src/writer.ts @@ -1,7 +1,15 @@ import fs from "node:fs"; import { configSchema, Config, type BytebellConfig, type ConfigValue, 
DEFAULT_CONFIG, writeField } from "./schema.ts"; +import { __isSeeded } from "./loader.ts"; import { getBytebellHome, getConfigPath, __notifyConfigChanged } from "./paths.ts"; +export class ConfigSeededError extends Error { + constructor() { + super("config cache is seeded; setConfigValue is disabled"); + this.name = "ConfigSeededError"; + } +} + const FILE_MODE = 0o600; const DIR_MODE = 0o700; @@ -41,6 +49,9 @@ export function ensureBytebellHome(): void { } export function setConfigValue(key: K, value: ConfigValue): void { + if (__isSeeded()) { + throw new ConfigSeededError(); + } ensureBytebellHome(); const current = readConfigFile(); const next = writeField(current, key, value); diff --git a/packages/logger/context.md b/packages/logger/context.md index 7256095..113af4e 100644 --- a/packages/logger/context.md +++ b/packages/logger/context.md @@ -19,16 +19,32 @@ Single logging surface for the workspace. Two sinks: ```ts type LoggerScope = "server" | "cli" +type LoggerFactory = (scope: LoggerScope) => Logger type Logger // re-exported from winston +const logger: Logger // proxy → getLogger("server") function getLogger(scope: LoggerScope): Logger +function seedLoggerFactory(factory: LoggerFactory): void function shutdownLoggers(): Promise function getLogsDir(): string function ensureLogsDir(): void +function __isLoggerFactorySeeded(): boolean function __resetLoggersForTests(): void // test-only ``` +`logger` (a named export of `@bb/logger`) is a Proxy that lazily resolves to +`getLogger("server")` on every access — necessary because the resolved logger +may change after `seedLoggerFactory` is called by a parent process. + +`seedLoggerFactory(factory)` registers a factory used by all subsequent +`getLogger(scope)` calls. The previous scope cache is cleared on registration +so any logger already imported via the `logger` proxy resolves to the new +factory's output on its next method call.
When no factory is seeded, +`getLogger` falls back to `buildLogger(scope)` — the disk-backed +DailyRotateFile + Console transport setup. The standalone binary never seeds +and gets the original behaviour bit-for-bit. + `getLogger(scope)` is idempotent. Workers tag via `getLogger("server").child({ worker: "pdf-1" })` — there is no per-worker file split. diff --git a/packages/logger/src/index.ts b/packages/logger/src/index.ts index c6678a5..22ed5d0 100644 --- a/packages/logger/src/index.ts +++ b/packages/logger/src/index.ts @@ -1,10 +1,17 @@ +import type winston from "winston"; import { getLogger } from "./logger.ts"; -export { getLogger, shutdownLoggers, __resetLoggersForTests } from "./logger.ts"; -export type { LoggerScope } from "./logger.ts"; +export { getLogger, seedLoggerFactory, shutdownLoggers, __isLoggerFactorySeeded, __resetLoggersForTests } from "./logger.ts"; +export type { LoggerScope, LoggerFactory } from "./logger.ts"; export { getLogsDir, ensureLogsDir } from "./dirs.ts"; export type { Logger } from "winston"; -export const logger = getLogger("server"); +export const logger = new Proxy({} as winston.Logger, { + get(_target, prop, receiver) { + const actual = getLogger("server"); + const value = Reflect.get(actual, prop, receiver); + return typeof value === "function" ? 
(value as (...args: unknown[]) => unknown).bind(actual) : value; + }, +}); diff --git a/packages/logger/src/logger.ts b/packages/logger/src/logger.ts index e7c0248..d6adf83 100644 --- a/packages/logger/src/logger.ts +++ b/packages/logger/src/logger.ts @@ -6,7 +6,10 @@ import { flushTransport, makeConsoleTransport, makeFileTransport } from "./trans export type LoggerScope = "server" | "cli"; +export type LoggerFactory = (scope: LoggerScope) => winston.Logger; + const scopeLoggers = new Map(); +let seededFactory: LoggerFactory | null = null; function buildLogger(scope: LoggerScope): winston.Logger { ensureLogsDir(); @@ -17,12 +20,21 @@ function buildLogger(scope: LoggerScope): winston.Logger { }); } +export function seedLoggerFactory(factory: LoggerFactory): void { + seededFactory = factory; + scopeLoggers.clear(); +} + +export function __isLoggerFactorySeeded(): boolean { + return seededFactory !== null; +} + export function getLogger(scope: LoggerScope): winston.Logger { const cached = scopeLoggers.get(scope); if (cached !== undefined) { return cached; } - const logger = buildLogger(scope); + const logger = seededFactory !== null ? 
seededFactory(scope) : buildLogger(scope); scopeLoggers.set(scope, logger); return logger; } @@ -42,4 +54,5 @@ export function __resetLoggersForTests(): void { logger.close(); } scopeLoggers.clear(); + seededFactory = null; } From 821c15cb21c74d73d9bcdecb26c58aa27c621390 Mon Sep 17 00:00:00 2001 From: Dead-Bytes <143434285+Dead-Bytes@users.noreply.github.com> Date: Thu, 14 May 2026 17:26:21 +0530 Subject: [PATCH 08/34] refactor(logger): format export statements for better readability --- packages/ingest-github/context.md | 26 +++++++++---------- packages/ingest-github/src/context.md | 18 ++++++------- .../ingest-github/src/pipeline/context.md | 16 ++++++------ packages/logger/src/index.ts | 8 +++++- 4 files changed, 37 insertions(+), 31 deletions(-) diff --git a/packages/ingest-github/context.md b/packages/ingest-github/context.md index be7800b..7979474 100644 --- a/packages/ingest-github/context.md +++ b/packages/ingest-github/context.md @@ -59,25 +59,25 @@ The package does **not** own: ```ts // High-level registration (OSS standalone wires this once at boot) -function registerGithubWorkers(deps?: RegisterGithubWorkersDeps): void // wires GithubIndex + GithubPull -function registerLocalIngestWorker(): void // wires LocalIngest +function registerGithubWorkers(deps?: RegisterGithubWorkersDeps): void; // wires GithubIndex + GithubPull +function registerLocalIngestWorker(): void; // wires LocalIngest interface RegisterGithubWorkersDeps { - sourceFactory?: SourceFactory; // index-side hook - pullFactory?: PullFactory; // pull-side hook (provides reader + diff + targetCommit) + sourceFactory?: SourceFactory; // index-side hook + pullFactory?: PullFactory; // pull-side hook (provides reader + diff + targetCommit) } // Lower-level building blocks (downstream consumers with their own queue // skip registerGithubWorkers and wire these against their own registry) -function createPipelineRunner(deps: CreatePipelineRunnerDeps): IngestRunnerDeps -function 
createGithubIngestHandler(deps: IngestJobHandlerDeps): (msg) => Promise -function createLocalIngestHandler(deps: IngestJobHandlerDeps): (msg) => Promise -function runPull(msg: JobMessage, pullFactory?: PullFactory): Promise -function reposRoot(): string - -function createFlatFolderStrategy(deps): IngestStrategy -function createLlmFileAnalyzer(deps): FileAnalyzer -function createDiskSourceReader(deps): SourceReader +function createPipelineRunner(deps: CreatePipelineRunnerDeps): IngestRunnerDeps; +function createGithubIngestHandler(deps: IngestJobHandlerDeps): (msg) => Promise; +function createLocalIngestHandler(deps: IngestJobHandlerDeps): (msg) => Promise; +function runPull(msg: JobMessage, pullFactory?: PullFactory): Promise; +function reposRoot(): string; + +function createFlatFolderStrategy(deps): IngestStrategy; +function createLlmFileAnalyzer(deps): FileAnalyzer; +function createDiskSourceReader(deps): SourceReader; ``` The optional `sourceFactory` lets downstream consumers inject a custom diff --git a/packages/ingest-github/src/context.md b/packages/ingest-github/src/context.md index 2c22181..acaadf7 100644 --- a/packages/ingest-github/src/context.md +++ b/packages/ingest-github/src/context.md @@ -34,15 +34,15 @@ Domain (composes infra: `@bb/config`, `@bb/llm`, `@bb/mongo`, `@bb/neo4j`, - `CondensedFileAnalysis`. - GitHub helpers: `parseGithubRepo` / `fetchLatestCommitHash` / `fetchRecentCommits`. - `registerGithubWorkers` accepts optional `sourceFactory` (index) and - `pullFactory` (pull) injections through `RegisterGithubWorkersDeps`; - the open-source binary leaves both undefined. It registers both - `JobType.GithubIndex` (full re-index, via `runner.run` + optional - `sourceFactory`) and `JobType.GithubPull` (incremental diff-and-apply - via `runPull` + optional `pullFactory`). Downstream consumers that - bring their own queue (e.g. 
the enterprise wrapper using `@bytebell/queue`) - skip `registerGithubWorkers` entirely and call `createPipelineRunner`, - `createGithubIngestHandler`, and `runPull` directly. + `registerGithubWorkers` accepts optional `sourceFactory` (index) and + `pullFactory` (pull) injections through `RegisterGithubWorkersDeps`; + the open-source binary leaves both undefined. It registers both + `JobType.GithubIndex` (full re-index, via `runner.run` + optional + `sourceFactory`) and `JobType.GithubPull` (incremental diff-and-apply + via `runPull` + optional `pullFactory`). Downstream consumers that + bring their own queue (e.g. the enterprise wrapper using `@bytebell/queue`) + skip `registerGithubWorkers` entirely and call `createPipelineRunner`, + `createGithubIngestHandler`, and `runPull` directly. - **[githubApi.ts](githubApi.ts)** — `parseGithubRepo(repoUrl)` and `fetchLatestCommitHash(owner, repo, branch, gitToken?)`. **Pull-only utility**; revisit in the pull plan. Kept in place rather than deleted so diff --git a/packages/ingest-github/src/pipeline/context.md b/packages/ingest-github/src/pipeline/context.md index 6c8cf3d..049441f 100644 --- a/packages/ingest-github/src/pipeline/context.md +++ b/packages/ingest-github/src/pipeline/context.md @@ -65,14 +65,14 @@ llmCallContext`, which every LLM call site downstream consumes. State - `pull.ts` — `runPull(msg, pullFactory?)` orchestrates the pull job. When `pullFactory` is provided, it returns `{source, diff, targetCommit, archiveSink?}` and `runPull` skips `syncRepository` + `materialiseEndpoints` - + `assertReachableFromBranch` + `computePullDiff` + `checkoutCommit` — - the factory is the sole source of truth. When `pullFactory` is undefined - (open-source default), the legacy git-based path runs. 
Either path - produces the same downstream pipeline: snapshot prior version, - `analyseChangedFiles` (now reading via `SourceReader`), - `processBigFilesQueue`, `backfillMissingFields`, `backfillBigFiles`, - `runSelectiveFolderSummary`, `summariseRepo`, `storePullAnalysis`. - Mirrors `run.ts` for `llmCallContext` extraction from payload. + - `assertReachableFromBranch` + `computePullDiff` + `checkoutCommit` — + the factory is the sole source of truth. When `pullFactory` is undefined + (open-source default), the legacy git-based path runs. Either path + produces the same downstream pipeline: snapshot prior version, + `analyseChangedFiles` (now reading via `SourceReader`), + `processBigFilesQueue`, `backfillMissingFields`, `backfillBigFiles`, + `runSelectiveFolderSummary`, `summariseRepo`, `storePullAnalysis`. + Mirrors `run.ts` for `llmCallContext` extraction from payload. - `branch.ts` — `resolveBranch(knowledgeId, payload)`. Defaults to `main` when the payload omits it; rejects branch names that don't match `^[\w./-]+$` with `IngestError` (defence against shell-injection into git args). 
diff --git a/packages/logger/src/index.ts b/packages/logger/src/index.ts index 22ed5d0..c2e1376 100644 --- a/packages/logger/src/index.ts +++ b/packages/logger/src/index.ts @@ -1,7 +1,13 @@ import type winston from "winston"; import { getLogger } from "./logger.ts"; -export { getLogger, seedLoggerFactory, shutdownLoggers, __isLoggerFactorySeeded, __resetLoggersForTests } from "./logger.ts"; +export { + getLogger, + seedLoggerFactory, + shutdownLoggers, + __isLoggerFactorySeeded, + __resetLoggersForTests, +} from "./logger.ts"; export type { LoggerScope, LoggerFactory } from "./logger.ts"; export { getLogsDir, ensureLogsDir } from "./dirs.ts"; From 6f0cc24aa6ea8121079a55c1fcc38283b469ecac Mon Sep 17 00:00:00 2001 From: Dead-Bytes <143434285+Dead-Bytes@users.noreply.github.com> Date: Thu, 14 May 2026 17:29:37 +0530 Subject: [PATCH 09/34] fix(shim): shims added for wrapper --- packages/ingest-github/package.json | 7 +++- packages/ingest-github/types/index.d.ts | 49 +++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 2 deletions(-) create mode 100644 packages/ingest-github/types/index.d.ts diff --git a/packages/ingest-github/package.json b/packages/ingest-github/package.json index 4ca252c..ad516c4 100644 --- a/packages/ingest-github/package.json +++ b/packages/ingest-github/package.json @@ -4,9 +4,12 @@ "private": true, "type": "module", "main": "./src/index.ts", - "types": "./src/index.ts", + "types": "./types/index.d.ts", "exports": { - ".": "./src/index.ts" + ".": { + "types": "./types/index.d.ts", + "default": "./src/index.ts" + } }, "dependencies": { "@bb/config": "workspace:*", diff --git a/packages/ingest-github/types/index.d.ts b/packages/ingest-github/types/index.d.ts new file mode 100644 index 0000000..a8bdcef --- /dev/null +++ b/packages/ingest-github/types/index.d.ts @@ -0,0 +1,49 @@ +export interface RegisterGithubWorkersDeps { + sourceFactory?: SourceFactory; + pullFactory?: PullFactory; +} + +export declare function registerGithubWorkers(deps?: 
RegisterGithubWorkersDeps): void; +export declare function registerLocalIngestWorker(): void; + +export declare const createFlatFolderStrategy: (...args: any[]) => any; +export declare const createLlmFileAnalyzer: (...args: any[]) => any; +export declare const createDiskSourceReader: (...args: any[]) => any; +export declare const createPipelineRunner: (...args: any[]) => any; +export declare const createGithubIngestHandler: (...args: any[]) => any; +export declare const createLocalIngestHandler: (...args: any[]) => any; +export declare const runPull: (...args: any[]) => any; +export declare const reposRoot: (...args: any[]) => string; +export declare const fetchLatestCommitHash: (...args: any[]) => any; +export declare const fetchRecentCommits: (...args: any[]) => any; +export declare const parseGithubRepo: (...args: any[]) => any; + +export type CreatePipelineRunnerDeps = any; +export type IngestJobHandlerDeps = any; +export type IngestRunnerDeps = any; +export type IngestRunnerInput = any; +export type IngestStrategy = any; +export type StrategyInput = any; +export type StrategyResult = any; +export type StrategyContext = any; +export type FileAnalyzer = any; +export type AnalyzedFileResult = any; +export type ScanEntry = any; +export type ScannedFile = any; +export type OversizedFile = any; +export type ScanDeps = any; +export type SourceReader = any; +export type ArchiveSink = any; +export type ArchiveSinkInput = any; +export type SourceFactory = any; +export type SourceFactoryInput = any; +export type SourceFactoryResult = any; +export type PullFactory = any; +export type PullFactoryInput = any; +export type PullFactoryResult = any; +export type DiffResult = any; +export type RenamedFile = any; +export type CondensedFileAnalysis = any; +export type CommitEntry = any; +export type FetchCommitsResult = any; +export type ParsedRepo = any; From 5777ada2903a1c978556b271528cb48b1cda4843 Mon Sep 17 00:00:00 2001 From: Dead-Bytes 
<143434285+Dead-Bytes@users.noreply.github.com> Date: Thu, 14 May 2026 17:54:11 +0530 Subject: [PATCH 10/34] feat(logging): add performance logging to LLM file analyzer and pipeline run --- package.json | 2 +- .../src/adapters/llm-file-analyzer.ts | 7 +++- packages/ingest-github/src/bootstrap.ts | 16 +++++++ packages/ingest-github/src/index.ts | 6 +++ packages/ingest-github/src/pipeline/run.ts | 5 +++ packages/ingest-github/types/context.md | 42 +++++++++++++++++++ packages/ingest-github/types/index.d.ts | 9 ++++ packages/llm/package.json | 1 + packages/llm/src/client.ts | 5 ++- 9 files changed, 89 insertions(+), 4 deletions(-) create mode 100644 packages/ingest-github/src/bootstrap.ts create mode 100644 packages/ingest-github/types/context.md diff --git a/package.json b/package.json index 5361baa..c70eb7b 100644 --- a/package.json +++ b/package.json @@ -22,7 +22,7 @@ "lint-staged": { "**/*.{ts,tsx,js,mjs,cjs}": [ "prettier --write", - "eslint --fix --max-warnings=0" + "eslint --fix --max-warnings=0 --no-warn-ignored" ], "**/*.{json,md}": [ "prettier --write" diff --git a/packages/ingest-github/src/adapters/llm-file-analyzer.ts b/packages/ingest-github/src/adapters/llm-file-analyzer.ts index 13bda00..88d0b17 100644 --- a/packages/ingest-github/src/adapters/llm-file-analyzer.ts +++ b/packages/ingest-github/src/adapters/llm-file-analyzer.ts @@ -40,6 +40,7 @@ export function createLlmFileAnalyzer(deps: LlmFileAnalyzerDeps): FileAnalyzer { }): Promise { const systemPrompt = deps.buildSystemPrompt(); const userPrompt = deps.buildUserPrompt(input); + const t0 = performance.now(); let raw: RawAnalysisJson | null = null; try { const response = await askJsonLLM(systemPrompt, userPrompt, input.llmCallContext ?? 
{}); @@ -54,7 +55,11 @@ export function createLlmFileAnalyzer(deps: LlmFileAnalyzerDeps): FileAnalyzer { if (raw === null) { return { language: FALLBACK_LANGUAGE, analysis: emptyFileAnalysis() }; } - return shapeAnalysis(raw); + const shaped = shapeAnalysis(raw); + logger.info( + `llm-file-analyzer: ✓ ${input.relativePath} (${Math.round(performance.now() - t0)}ms, lang=${shaped.language})`, + ); + return shaped; }, }; } diff --git a/packages/ingest-github/src/bootstrap.ts b/packages/ingest-github/src/bootstrap.ts new file mode 100644 index 0000000..69bdda6 --- /dev/null +++ b/packages/ingest-github/src/bootstrap.ts @@ -0,0 +1,16 @@ +import { seedConfig } from "@bb/config"; +import { seedLoggerFactory, type LoggerFactory } from "@bb/logger"; +import { connectMongo } from "@bb/mongo"; +import { connectNeo4j } from "@bb/neo4j"; + +export interface BootstrapRuntimeOptions { + config: unknown; + loggerFactory: LoggerFactory; +} + +export async function bootstrapRuntime(opts: BootstrapRuntimeOptions): Promise { + seedConfig(opts.config); + seedLoggerFactory(opts.loggerFactory); + await connectMongo(); + await connectNeo4j(); +} diff --git a/packages/ingest-github/src/index.ts b/packages/ingest-github/src/index.ts index ead22c6..c9dc8bd 100644 --- a/packages/ingest-github/src/index.ts +++ b/packages/ingest-github/src/index.ts @@ -80,3 +80,9 @@ export type { DiffResult, RenamedFile } from "./pipeline/git-diff.ts"; export type { CondensedFileAnalysis } from "./types/condensed-file-analysis.ts"; export { fetchLatestCommitHash, fetchRecentCommits, parseGithubRepo } from "./githubApi.ts"; export type { CommitEntry, FetchCommitsResult, ParsedRepo } from "./githubApi.ts"; +export { bootstrapRuntime } from "./bootstrap.ts"; +export type { BootstrapRuntimeOptions } from "./bootstrap.ts"; +export { + COMBINED_CODE_ANALYSIS_SYSTEM_PROMPT, + buildFileAnalysisUserPrompt, +} from "./strategies/flat-folder/prompts/file-analysis.ts"; diff --git 
a/packages/ingest-github/src/pipeline/run.ts b/packages/ingest-github/src/pipeline/run.ts index e0b5482..f62e870 100644 --- a/packages/ingest-github/src/pipeline/run.ts +++ b/packages/ingest-github/src/pipeline/run.ts @@ -142,6 +142,11 @@ async function runGithub( await setKnowledgeCommit(knowledgeId, commitHash); await transitionState(knowledgeId, KnowledgeState.Processed); + const totalMs = Date.now() - startedAt; + logger.info( + `pipeline/run: ✓ github_index complete (knowledgeId=${knowledgeId}, commit=${commitHash.slice(0, 12)}, files=${result.filesAnalyzed}, folders=${result.foldersSummarised}, nodes=${result.graphNodesWritten}, ${totalMs}ms)`, + ); + return { filesAnalyzed: result.filesAnalyzed, foldersSummarised: result.foldersSummarised, diff --git a/packages/ingest-github/types/context.md b/packages/ingest-github/types/context.md new file mode 100644 index 0000000..03ed483 --- /dev/null +++ b/packages/ingest-github/types/context.md @@ -0,0 +1,42 @@ +# `types/` — context + +## Tier + +Hand-written type declarations for the package's public surface. Consumed by +TypeScript via `package.json` `"types": "./types/index.d.ts"`. Not executed at +runtime — runtime resolves through `package.json` `"main": "./src/index.ts"`. + +## Responsibility + +Provide a stable, loosely-typed declaration of every public export of +`@bb/ingest-github`. The shim short-circuits TypeScript before it walks into +`src/`, which uses package-local `src/*` path aliases that don't resolve under +a consumer's tsconfig context. + +Without this shim, any external project that imports `@bb/ingest-github` and +runs `tsc -b` would trip on `TS2307: Cannot find module 'src/types/foo.ts'` +errors from the package's internal imports. + +## Public interface + +`./index.d.ts` declares every exported symbol of the runtime `src/index.ts`. +Function signatures are intentionally permissive (`(...args: any[]) => any` in +many cases) — full type fidelity is sacrificed for resolution stability. 
+ +When `src/index.ts` adds or renames a public export, this file must be updated +in the same commit. + +## Invariants + +1. **Never imported by `src/`.** This is a consumer-facing artifact only. +2. **Mirror of `src/index.ts` exports.** A symbol exported here that doesn't + exist in `src/index.ts` is a leak; a symbol exported from `src/` but not + here will appear as `any` to consumers at best, or break their typecheck + at worst. +3. **No runtime code.** Pure `.d.ts` declarations. + +## What is intentionally out of scope + +- Full structural types for complex shapes (use `any` / `unknown`) +- Generic constraints (keep signatures flat) +- Documentation comments (the source is authoritative) diff --git a/packages/ingest-github/types/index.d.ts b/packages/ingest-github/types/index.d.ts index a8bdcef..ac23419 100644 --- a/packages/ingest-github/types/index.d.ts +++ b/packages/ingest-github/types/index.d.ts @@ -18,6 +18,15 @@ export declare const fetchLatestCommitHash: (...args: any[]) => any; export declare const fetchRecentCommits: (...args: any[]) => any; export declare const parseGithubRepo: (...args: any[]) => any; +export interface BootstrapRuntimeOptions { + config: unknown; + loggerFactory: (scope: string) => unknown; +} +export declare function bootstrapRuntime(opts: BootstrapRuntimeOptions): Promise; + +export declare const COMBINED_CODE_ANALYSIS_SYSTEM_PROMPT: string; +export declare function buildFileAnalysisUserPrompt(input: { relativePath: string; content: string }): string; + export type CreatePipelineRunnerDeps = any; export type IngestJobHandlerDeps = any; export type IngestRunnerDeps = any; diff --git a/packages/llm/package.json b/packages/llm/package.json index 3591373..a972e7a 100644 --- a/packages/llm/package.json +++ b/packages/llm/package.json @@ -11,6 +11,7 @@ "dependencies": { "@bb/config": "workspace:*", "@bb/errors": "workspace:*", + "@bb/logger": "workspace:*", "@bb/mongo": "workspace:*", "@bb/types": "workspace:*", "tiktoken": "^1.0.22" 
diff --git a/packages/llm/src/client.ts b/packages/llm/src/client.ts index 0aa1634..77c3329 100644 --- a/packages/llm/src/client.ts +++ b/packages/llm/src/client.ts @@ -1,5 +1,6 @@ // SPDX-License-Identifier: AGPL-3.0-only WITH non-commercial-clause import { getConfigValue } from "@bb/config"; +import { logger } from "@bb/logger"; import { Config } from "@bb/types"; import { computeCacheKey, getCachedDecision, isCacheEnabled, recordDecision, recordHit } from "./cache.ts"; import { callOllama, resolveOllamaChain } from "./ollama.ts"; @@ -58,11 +59,11 @@ export async function askLLM(prompt: string, opts: AskLlmOptions = {}): Promise< const cached = await getCachedDecision(cacheKey); if (cached !== null) { const saved = cached.usage.inputTokens + cached.usage.outputTokens; - console.info(`[LLM CACHE HIT] key=${cacheKey.slice(0, 8)} tokens-saved=${saved}`); + logger.debug(`llm: cache hit (key=${cacheKey.slice(0, 8)}, tokens-saved=${saved})`); void recordHit(cacheKey); return { content: cached.content, usage: cached.usage }; } - console.info(`[LLM CACHE MISS] key=${cacheKey.slice(0, 8)}`); + logger.debug(`llm: cache miss (key=${cacheKey.slice(0, 8)})`); } const result = From 3ff7218e0a07293048786277e457103d8ffef007 Mon Sep 17 00:00:00 2001 From: Dead-Bytes <143434285+Dead-Bytes@users.noreply.github.com> Date: Thu, 14 May 2026 18:37:02 +0530 Subject: [PATCH 11/34] feat(pipeline): refactor knowledge structure to include info for repoUrl and branch --- packages/cli/src/output.d.ts | 16 +++++ .../ingest-github/src/pipeline/context.md | 9 +++ .../src/pipeline/pull-helpers.ts | 48 +++++++++++++++ packages/ingest-github/src/pipeline/pull.ts | 58 +++---------------- packages/mongo/src/processingStats.ts | 4 +- packages/neo4j/src/knowledge.ts | 16 ++--- packages/server/src/githubCommitsRoute.ts | 9 ++- packages/server/src/githubIndexRoute.ts | 3 +- packages/server/src/githubPullRoute.ts | 9 ++- packages/server/src/localIndexRoute.ts | 1 + packages/types/context.md | 37 
+++++------- packages/types/src/context.md | 17 +++++- packages/types/src/index.ts | 8 ++- packages/types/src/knowledge.ts | 11 +++- 14 files changed, 152 insertions(+), 94 deletions(-) create mode 100644 packages/cli/src/output.d.ts create mode 100644 packages/ingest-github/src/pipeline/pull-helpers.ts diff --git a/packages/cli/src/output.d.ts b/packages/cli/src/output.d.ts new file mode 100644 index 0000000..e20f44b --- /dev/null +++ b/packages/cli/src/output.d.ts @@ -0,0 +1,16 @@ +export declare function success(line: string): void; +export declare function error(line: string, hint?: string): void; +export declare function list(label: string, items: readonly string[]): void; +export interface Spinner { + update(text: string): void; + stop(success: boolean, finalMsg?: string): void; +} +export declare function createSpinner(initialText: string): Spinner; +export interface ProgressBar { + update(current: number, total: number, text?: string): void; + stop(success: boolean, finalMsg?: string): void; +} +export declare function createProgressBar(initialText: string): ProgressBar; +export declare function table(headers: string[], rows: string[][]): void; +export declare function info(line: string): void; +//# sourceMappingURL=output.d.ts.map diff --git a/packages/ingest-github/src/pipeline/context.md b/packages/ingest-github/src/pipeline/context.md index 049441f..af56ee0 100644 --- a/packages/ingest-github/src/pipeline/context.md +++ b/packages/ingest-github/src/pipeline/context.md @@ -63,6 +63,10 @@ llmCallContext`, which every LLM call site downstream consumes. State - Neo4j via `transitionState`, and `CancellationError` is re-thrown without flipping to FAILED. - `pull.ts` — `runPull(msg, pullFactory?)` orchestrates the pull job. + Reads `repoUrl` and `branch` directly off `knowledge.info.*` (loaded via + `@bb/mongo.getKnowledge`). 
The `KnowledgeSource` discriminator (`kind`) is + still read off `knowledge.source` along with `commitId`/`commitHashes`, but + the repo coordinates themselves live on `info` — no fallback chain. When `pullFactory` is provided, it returns `{source, diff, targetCommit, archiveSink?}` and `runPull` skips `syncRepository` + `materialiseEndpoints` - `assertReachableFromBranch` + `computePullDiff` + `checkoutCommit` — @@ -73,6 +77,11 @@ archiveSink?}` and `runPull` skips `syncRepository` + `materialiseEndpoints` `processBigFilesQueue`, `backfillMissingFields`, `backfillBigFiles`, `runSelectiveFolderSummary`, `summariseRepo`, `storePullAnalysis`. Mirrors `run.ts` for `llmCallContext` extraction from payload. +- `pull-helpers.ts` — small pure helpers extracted from `pull.ts` to keep it + under the 300-line cap: `persistPullStats` writes the per-commit row into + `processing_stats`, `repoNameFromUrl` parses an owner/repo display name out + of a GitHub URL with a graceful fallback, and `describe` flattens an + `unknown` cause to a short string for `IngestError` messages. - `branch.ts` — `resolveBranch(knowledgeId, payload)`. Defaults to `main` when the payload omits it; rejects branch names that don't match `^[\w./-]+$` with `IngestError` (defence against shell-injection into git args). 
diff --git a/packages/ingest-github/src/pipeline/pull-helpers.ts b/packages/ingest-github/src/pipeline/pull-helpers.ts new file mode 100644 index 0000000..0330a03 --- /dev/null +++ b/packages/ingest-github/src/pipeline/pull-helpers.ts @@ -0,0 +1,48 @@ +import { recordProcessingStats } from "@bb/mongo"; +import { estimateCostFromBreakdown } from "@bb/llm"; + +export interface PersistPullStatsInput { + knowledgeId: string; + repoName: string; + commitHash: string; + filesAnalyzed: number; + foldersSummarised: number; + processingTimeMs: number; +} + +export async function persistPullStats(input: PersistPullStatsInput): Promise { + const estimatedCost = await estimateCostFromBreakdown({}); + await recordProcessingStats({ + knowledgeId: input.knowledgeId, + repoName: input.repoName, + commitHash: input.commitHash, + modelTokens: {}, + estimatedCost, + totalBatches: 1, + totalFiles: input.filesAnalyzed, + totalFolders: input.foldersSummarised, + filesAnalyzed: input.filesAnalyzed, + processingTimeMs: input.processingTimeMs, + }); +} + +export function repoNameFromUrl(repoUrl: string): string { + try { + const segments = new URL(repoUrl).pathname + .split("/") + .map((s) => s.trim()) + .filter((s) => s.length > 0); + const repo = segments.at(-1)?.replace(/\.git$/u, ""); + const owner = segments.at(-2); + if (owner !== undefined && repo !== undefined) { + return `${owner}/${repo}`; + } + } catch { + // fall through + } + return repoUrl; +} + +export function describe(cause: unknown): string { + return cause instanceof Error ? 
cause.message : String(cause); +} diff --git a/packages/ingest-github/src/pipeline/pull.ts b/packages/ingest-github/src/pipeline/pull.ts index c3f08f3..ff5d0aa 100644 --- a/packages/ingest-github/src/pipeline/pull.ts +++ b/packages/ingest-github/src/pipeline/pull.ts @@ -1,8 +1,9 @@ import { Config, KnowledgeState, type GithubPullPayload, type JobMessage } from "@bb/types"; import { getConfigValue } from "@bb/config"; -import { getKnowledge, recordProcessingStats, setKnowledgeCommit, setKnowledgeState } from "@bb/mongo"; +import { getKnowledge, setKnowledgeCommit, setKnowledgeState } from "@bb/mongo"; import { setKnowledgeStateInGraph, snapshotFilesToVersion, type NodeScope } from "@bb/neo4j"; -import { estimateCostFromBreakdown, type AskLlmOptions } from "@bb/llm"; +import { type AskLlmOptions } from "@bb/llm"; +import { describe, persistPullStats, repoNameFromUrl } from "./pull-helpers.ts"; import { IngestError, KnowledgeNotFoundError } from "@bb/errors"; import { logger } from "@bb/logger"; import { ensureMetaDirs, metaPathsFor, repoCloneDir, ensureReposRoot } from "./paths.ts"; @@ -77,8 +78,11 @@ export async function runPull(msg: JobMessage, pullFactory?: ); } - const branch = knowledge.source.branch ?? "main"; - const repoUrl = knowledge.source.repoUrl; + const branch = knowledge.info.branch ?? 
"main"; + const repoUrl = knowledge.info.repoUrl; + if (repoUrl === undefined || repoUrl.length === 0) { + throw new IngestError(knowledgeId, "pull requires knowledge.info.repoUrl"); + } const gitToken = msg.payload.gitToken; clearCancellation(knowledgeId); @@ -262,49 +266,3 @@ async function transitionState(knowledgeId: string, state: KnowledgeState): Prom await setKnowledgeState(knowledgeId, state); await setKnowledgeStateInGraph(knowledgeId, state).catch(() => undefined); } - -interface PersistPullStatsInput { - knowledgeId: string; - repoName: string; - commitHash: string; - filesAnalyzed: number; - foldersSummarised: number; - processingTimeMs: number; -} - -async function persistPullStats(input: PersistPullStatsInput): Promise { - const estimatedCost = await estimateCostFromBreakdown({}); - await recordProcessingStats({ - knowledgeId: input.knowledgeId, - repoName: input.repoName, - commitHash: input.commitHash, - modelTokens: {}, - estimatedCost, - totalBatches: 1, - totalFiles: input.filesAnalyzed, - totalFolders: input.foldersSummarised, - filesAnalyzed: input.filesAnalyzed, - processingTimeMs: input.processingTimeMs, - }); -} - -function repoNameFromUrl(repoUrl: string): string { - try { - const segments = new URL(repoUrl).pathname - .split("/") - .map((s) => s.trim()) - .filter((s) => s.length > 0); - const repo = segments.at(-1)?.replace(/\.git$/u, ""); - const owner = segments.at(-2); - if (owner !== undefined && repo !== undefined) { - return `${owner}/${repo}`; - } - } catch { - // fall through - } - return repoUrl; -} - -function describe(cause: unknown): string { - return cause instanceof Error ? cause.message : String(cause); -} diff --git a/packages/mongo/src/processingStats.ts b/packages/mongo/src/processingStats.ts index c87f5db..b7034e6 100644 --- a/packages/mongo/src/processingStats.ts +++ b/packages/mongo/src/processingStats.ts @@ -165,7 +165,7 @@ function deriveRepoName(doc: KnowledgeDoc): string { return segments.at(-1) ?? 
doc.source.sourcePath; } try { - const segments = new URL(doc.source.repoUrl).pathname + const segments = new URL(doc.info.repoUrl ?? "").pathname .split("/") .map((s) => s.trim()) .filter((s) => s.length > 0); @@ -177,7 +177,7 @@ function deriveRepoName(doc: KnowledgeDoc): string { } catch { // fall through } - return doc.source.repoUrl; + return doc.info.repoUrl ?? ""; } function toIso(value: Date | string | undefined): string { diff --git a/packages/neo4j/src/knowledge.ts b/packages/neo4j/src/knowledge.ts index fcb8043..78c6f3d 100644 --- a/packages/neo4j/src/knowledge.ts +++ b/packages/neo4j/src/knowledge.ts @@ -1,5 +1,5 @@ import path from "node:path"; -import type { KnowledgeDoc, KnowledgeSource, KnowledgeState } from "@bb/types"; +import type { KnowledgeDoc, KnowledgeState } from "@bb/types"; import { _runCypher } from "./client.ts"; const UPSERT_KNOWLEDGE = ` @@ -52,14 +52,14 @@ DELETE n export async function upsertKnowledgeNode(doc: KnowledgeDoc): Promise { const sourceKind = doc.source.kind; - const sourceUrl = doc.source.kind === "github" ? doc.source.repoUrl : doc.source.sourcePath; - const branch = doc.source.kind === "github" ? (doc.source.branch ?? null) : null; + const sourceUrl = doc.source.kind === "github" ? (doc.info.repoUrl ?? "") : doc.source.sourcePath; + const branch = doc.source.kind === "github" ? (doc.info.branch ?? 
null) : null; await _runCypher(UPSERT_KNOWLEDGE, { knowledgeId: doc.knowledgeId, sourceKind, sourceUrl, branch, - repoName: deriveRepoName(doc.source), + repoName: deriveRepoName(doc), state: doc.status.state, createdAt: doc.createdAt.toISOString(), updatedAt: doc.updatedAt.toISOString(), @@ -81,11 +81,11 @@ export async function deleteKnowledgeGraph(knowledgeId: string): Promise { await _runCypher(DELETE_ORPHAN_ENTITIES); } -function deriveRepoName(source: KnowledgeSource): string { - if (source.kind === "local") { - return path.basename(source.sourcePath); +function deriveRepoName(doc: KnowledgeDoc): string { + if (doc.source.kind === "local") { + return path.basename(doc.source.sourcePath); } - return repoNameFromGithubUrl(source.repoUrl); + return repoNameFromGithubUrl(doc.info.repoUrl ?? ""); } function repoNameFromGithubUrl(repoUrl: string): string { diff --git a/packages/server/src/githubCommitsRoute.ts b/packages/server/src/githubCommitsRoute.ts index a8ee67a..3399236 100644 --- a/packages/server/src/githubCommitsRoute.ts +++ b/packages/server/src/githubCommitsRoute.ts @@ -51,10 +51,15 @@ export function buildGithubCommitsRoute(): Router { .json({ error: `commits endpoint is only supported for github knowledge (kind=${knowledge.source.kind})` }); return; } - const branch = knowledge.source.branch ?? "main"; + const branch = knowledge.info.branch ?? 
"main"; + const repoUrl = knowledge.info.repoUrl; + if (repoUrl === undefined || repoUrl.length === 0) { + res.status(422).json({ error: "commits endpoint requires knowledge.info.repoUrl" }); + return; + } const gitToken = extractBearerToken(req.headers["authorization"]); - const result = await fetchRecentCommits(knowledge.source.repoUrl, branch, limit, gitToken); + const result = await fetchRecentCommits(repoUrl, branch, limit, gitToken); switch (result.status) { case "ok": { const payload: CommitsResponse = { knowledgeId, branch, commits: result.commits }; diff --git a/packages/server/src/githubIndexRoute.ts b/packages/server/src/githubIndexRoute.ts index 13aa084..e92dde0 100644 --- a/packages/server/src/githubIndexRoute.ts +++ b/packages/server/src/githubIndexRoute.ts @@ -31,7 +31,8 @@ export function buildGithubIndexRoute(): Router { const now = new Date(); const doc: KnowledgeDoc = { knowledgeId, - source: { kind: "github", repoUrl, ...(branch !== undefined ? { branch } : {}) }, + source: { kind: "github" }, + info: { repoUrl, ...(branch !== undefined ? { branch } : {}) }, status: { state: KnowledgeState.Created }, createdAt: now, updatedAt: now, diff --git a/packages/server/src/githubPullRoute.ts b/packages/server/src/githubPullRoute.ts index f80d79f..0b72a81 100644 --- a/packages/server/src/githubPullRoute.ts +++ b/packages/server/src/githubPullRoute.ts @@ -71,11 +71,16 @@ export function buildGithubPullRoute(): Router { return; } - const branch = knowledge.source.branch ?? "main"; + const branch = knowledge.info.branch ?? 
"main"; + const repoUrl = knowledge.info.repoUrl; + if (repoUrl === undefined || repoUrl.length === 0) { + res.status(422).json({ error: "pull requires knowledge.info.repoUrl" }); + return; + } let targetCommit = suppliedTarget; if (targetCommit === undefined) { try { - const head = await fetchLatestCommitHash(knowledge.source.repoUrl, branch, gitToken); + const head = await fetchLatestCommitHash(repoUrl, branch, gitToken); if (head !== null && COMMIT_HASH_RE.test(head)) { targetCommit = head; } diff --git a/packages/server/src/localIndexRoute.ts b/packages/server/src/localIndexRoute.ts index 532f33b..326185e 100644 --- a/packages/server/src/localIndexRoute.ts +++ b/packages/server/src/localIndexRoute.ts @@ -48,6 +48,7 @@ export function buildLocalIndexRoute(): Router { const doc: KnowledgeDoc = { knowledgeId, source: { kind: "local", sourcePath }, + info: {}, status: { state: KnowledgeState.Created }, createdAt: now, updatedAt: now, diff --git a/packages/types/context.md b/packages/types/context.md index 31c18a4..7dd5780 100644 --- a/packages/types/context.md +++ b/packages/types/context.md @@ -32,31 +32,22 @@ llmModel?, llmKeyId? }` mixin that lets downstream consumers carry per-job QUEUED → INGESTED → PROCESSING → PROCESSED ↘ FAILED`) referenced by `@bb/queue` (writes `QUEUED`), `@bb/mongo` (`setKnowledgeState`), and future ingest workers. - -Future inhabitants (added on need basis): full `Knowledge`, `Raw`, -`Node`, `MCP*` document shapes — the cross-package domain types named in +- `KnowledgeDoc`, `KnowledgeSource`, `GithubKnowledgeSource`, + `LocalKnowledgeSource`, `KnowledgeInfo` — the cross-package shape of the + Mongo `knowledge` document. Split into two substructures with + non-overlapping responsibilities: `KnowledgeSource` discriminates the + upstream type (github vs local) and carries per-kind ingestion state — + for github, the current head commit and the full commit history; for + local, the on-disk path. 
`KnowledgeInfo` carries the repo coordinates the + pipeline reads on every run (URL and branch); it has an open shape so + downstream consumers can attach extra fields without forcing schema + changes here. The pull pipeline reads URL and branch off `KnowledgeInfo` + directly — there is no fallback chain to `KnowledgeSource`. + +Future inhabitants (added on need basis): full `Raw`, `Node`, `MCP*` +document shapes — the cross-package domain types named in [docs/arch.md:69](../../docs/arch.md#L69). -## Public exports - -```ts -enum Config { ... } - -enum JobType { GithubIndex, GithubPull, LocalIngest } -enum JobPriority { Low, Normal, High } -interface PayloadLlmOverrides { llmApiKey?, llmProvider?: string, llmModel?, llmKeyId? } -interface GithubIndexPayload extends PayloadLlmOverrides { knowledgeId, repoUrl, branch?, commitHash?, gitToken?, orgId? } -interface GithubPullPayload extends PayloadLlmOverrides { knowledgeId, orgId?, targetCommitHash?, gitToken? } -interface LocalIngestPayload { knowledgeId, rootDir, orgId? } -interface JobMessage<P>
{ id, type, priority, knowledgeId, attempt, createdAt, payload } -type PayloadFor - -enum KnowledgeState { Created, Queued, Ingested, Processing, Processed, Failed } -``` - -Add new shared types here only when **two or more** packages need to refer -to the same shape. - ## Data ownership None. This package owns no runtime state — only types and enum members. diff --git a/packages/types/src/context.md b/packages/types/src/context.md index 6786ddd..2ca3488 100644 --- a/packages/types/src/context.md +++ b/packages/types/src/context.md @@ -36,9 +36,20 @@ package-level contract; this file documents how the source tree is split. also carries an optional `orgId?` so downstream multi-tenant workers can scope Mongo/Neo4j lookups by org. - **[knowledge.ts](knowledge.ts)** — the `KnowledgeState` enum modeling - the lifecycle in [CLAUDE.md](../../../CLAUDE.md). v0 only ships the - enum; the full `Knowledge` document interface lands when domain CRUD - helpers in `@bb/mongo` need it. + the lifecycle in [CLAUDE.md](../../../CLAUDE.md), plus the + `KnowledgeDoc` document interface and its substructures: + - `KnowledgeSource` is a discriminated union (`GithubKnowledgeSource | LocalKnowledgeSource`) + that captures **what kind of upstream produced this knowledge** plus per-kind + state. For github: `commitId` (current head) and `commitHashes` (history). + For local: `sourcePath`. `source` does **not** carry `repoUrl` or `branch` — + those live on `info` (see below). + - `KnowledgeInfo` carries the human-readable repo coordinates the pipeline + needs every run: `repoUrl`, `branch`, plus an open index signature so + downstream consumers can stash extra fields without forcing schema changes + here. The pull pipeline reads `knowledge.info.repoUrl` / `knowledge.info.branch` + directly — that's the single source of truth for the URL/branch, no fallback. + - `KnowledgeDoc` carries both: `source` for upstream-type + indexed-commit + state, `info` for repo coordinates. 
Both are required on every doc. ## Module dependency graph diff --git a/packages/types/src/index.ts b/packages/types/src/index.ts index 99ef34d..2871e7b 100644 --- a/packages/types/src/index.ts +++ b/packages/types/src/index.ts @@ -9,7 +9,13 @@ export type { PayloadLlmOverrides, } from "./job.ts"; export { KnowledgeState } from "./knowledge.ts"; -export type { GithubKnowledgeSource, KnowledgeDoc, KnowledgeSource, LocalKnowledgeSource } from "./knowledge.ts"; +export type { + GithubKnowledgeSource, + KnowledgeDoc, + KnowledgeInfo, + KnowledgeSource, + LocalKnowledgeSource, +} from "./knowledge.ts"; export type { ModelTokenBreakdown, ModelTokenUsage, diff --git a/packages/types/src/knowledge.ts b/packages/types/src/knowledge.ts index a6e1309..6537ba5 100644 --- a/packages/types/src/knowledge.ts +++ b/packages/types/src/knowledge.ts @@ -9,8 +9,6 @@ export enum KnowledgeState { export interface GithubKnowledgeSource { kind: "github"; - repoUrl: string; - branch?: string; /** Current head pointer — the most recently indexed commit. */ commitId?: string; /** Every commit this knowledge has been indexed at, oldest → newest. Pull appends to this list. 
*/ @@ -24,10 +22,19 @@ export interface LocalKnowledgeSource { export type KnowledgeSource = GithubKnowledgeSource | LocalKnowledgeSource; +export interface KnowledgeInfo { + repoUrl?: string; + branch?: string; + git_url?: string; + githubInfo?: { commitId?: string; commitHashes?: string[]; branchName?: string }; + [key: string]: unknown; +} + export interface KnowledgeDoc { knowledgeId: string; source: KnowledgeSource; status: { state: KnowledgeState; totalFiles?: number; processedFiles?: number }; createdAt: Date; updatedAt: Date; + info: KnowledgeInfo; } From 7fae33f8476a9a27fe71ba11c6b07fd6e3f4ab69 Mon Sep 17 00:00:00 2001 From: Dead-Bytes <143434285+Dead-Bytes@users.noreply.github.com> Date: Thu, 14 May 2026 18:49:20 +0530 Subject: [PATCH 12/34] Merge branch 'pre-release' into feat/per-call-llm-creds --- .../src/strategies/flat-folder/analyse-changed.ts | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/packages/ingest-github/src/strategies/flat-folder/analyse-changed.ts b/packages/ingest-github/src/strategies/flat-folder/analyse-changed.ts index 1f10ae8..6a39d06 100644 --- a/packages/ingest-github/src/strategies/flat-folder/analyse-changed.ts +++ b/packages/ingest-github/src/strategies/flat-folder/analyse-changed.ts @@ -1,6 +1,6 @@ import path from "node:path"; import { readFile, stat } from "node:fs/promises"; -import { tokenLen } from "@bb/llm"; +import { tokenLen, type AskLlmOptions } from "@bb/llm"; import { logger } from "@bb/logger"; import { Config } from "@bb/types"; import { getConfigValue } from "@bb/config"; @@ -28,6 +28,7 @@ export interface AnalyseChangedInput { * effort — errors from the callback are swallowed. 
*/ onFileProcessed?: () => Promise | void; + llmCallContext?: AskLlmOptions; } export interface AnalyseChangedResult { @@ -166,7 +167,7 @@ export async function analyseChangedFiles(input: AnalyseChangedInput): Promise { try { throwIfCancelled(input.knowledgeId); - const condensed = await analyseScannedFile(input.analyzer, scanned); + const condensed = await analyseScannedFile(input.analyzer, scanned, input.llmCallContext); await saveCondensed(input.metaPaths, condensed); smallFilesAnalysed += 1; } catch (cause: unknown) { From 251a1a051d3b4fbe31dcd2c0d97789ab7f9ef057 Mon Sep 17 00:00:00 2001 From: Dead-Bytes <143434285+Dead-Bytes@users.noreply.github.com> Date: Thu, 14 May 2026 22:48:01 +0530 Subject: [PATCH 13/34] feat(progress): integrate progress reporting into flat-folder strategy phases --- packages/ingest-github/README.md | 8 +- packages/ingest-github/src/index.ts | 34 ++- packages/ingest-github/src/pipeline/README.md | 9 +- packages/ingest-github/src/pipeline/pull.ts | 31 ++- .../src/progress/NullProgressReporter.ts | 45 ++++ packages/ingest-github/src/progress/README.md | 35 +++ packages/ingest-github/src/progress/types.ts | 48 ++++ .../src/strategies/flat-folder/README.md | 32 ++- .../strategies/flat-folder/analyse-changed.ts | 217 ++++++++++-------- .../strategies/flat-folder/backfill/README.md | 38 +-- .../flat-folder/backfill/big-files.ts | 73 +++--- .../strategies/flat-folder/backfill/fields.ts | 54 +++-- .../strategies/flat-folder/big-file/README.md | 14 +- .../strategies/flat-folder/big-file/index.ts | 25 +- .../src/strategies/flat-folder/index.ts | 142 +++++++----- .../strategies/flat-folder/phases/README.md | 20 +- .../phases/classify-and-analyse-small.ts | 130 ++++++----- .../flat-folder/phases/process-big-files.ts | 105 +++++---- packages/ingest-github/types/index.d.ts | 31 +++ 19 files changed, 740 insertions(+), 351 deletions(-) create mode 100644 packages/ingest-github/src/progress/NullProgressReporter.ts create mode 100644 
packages/ingest-github/src/progress/README.md create mode 100644 packages/ingest-github/src/progress/types.ts diff --git a/packages/ingest-github/README.md b/packages/ingest-github/README.md index 7979474..bb624cf 100644 --- a/packages/ingest-github/README.md +++ b/packages/ingest-github/README.md @@ -50,7 +50,12 @@ The package does **not** own: strategies can add this) - Semantic chunking, big-file processing, smart sampling (future strategies) -- Recovery / progress reporting / failed-files tracking +- Recovery / failed-files tracking +- Progress **transport** — the package now ships a `ProgressContext` + extension port under `src/progress/` (see that folder's README), but + the actual SSE / Pub-Sub plumbing lives in the host binary's progress + package. The OSS default (`nullProgressContextFactory`) discards every + event, consistent with the no-outbound-calls posture. - Provider abstraction (no Bitbucket support; GitHub-only) - Concurrency control (sequential per-file processing intentional for v0; revisit when users complain) @@ -65,6 +70,7 @@ function registerLocalIngestWorker(): void; // wires LocalIngest interface RegisterGithubWorkersDeps { sourceFactory?: SourceFactory; // index-side hook pullFactory?: PullFactory; // pull-side hook (provides reader + diff + targetCommit) + progressContextFactory?: ProgressContextFactory; // SSE progress hook (default: no-op) } // Lower-level building blocks (downstream consumers with their own queue diff --git a/packages/ingest-github/src/index.ts b/packages/ingest-github/src/index.ts index c9dc8bd..e777df6 100644 --- a/packages/ingest-github/src/index.ts +++ b/packages/ingest-github/src/index.ts @@ -11,24 +11,30 @@ import { buildFileAnalysisUserPrompt, } from "./strategies/flat-folder/prompts/file-analysis.ts"; import type { PullFactory, SourceFactory } from "./types/pipeline.ts"; +import type { ProgressContextFactory } from "./progress/types.ts"; +import { nullProgressContextFactory } from 
"./progress/NullProgressReporter.ts"; /** - * Optional dependencies for the GitHub workers. Both factories are - * documented in `docs/extension-points.md`. The open-source binary - * leaves both undefined — index and pull use the default disk-backed - * readers and a local `git clone` / `git diff`. + * Optional dependencies for the GitHub workers. Factories are documented in + * `docs/extension-points.md`. The open-source binary leaves them undefined — + * index and pull use the default disk-backed readers, and progress events + * are discarded by `nullProgressContextFactory`. */ export interface RegisterGithubWorkersDeps { sourceFactory?: SourceFactory; pullFactory?: PullFactory; + progressContextFactory?: ProgressContextFactory; } -function buildRunner(sourceFactory: SourceFactory | undefined): ReturnType { +function buildRunner( + sourceFactory: SourceFactory | undefined, + progressContextFactory: ProgressContextFactory, +): ReturnType { const fileAnalyzer = createLlmFileAnalyzer({ buildSystemPrompt: () => COMBINED_CODE_ANALYSIS_SYSTEM_PROMPT, buildUserPrompt: buildFileAnalysisUserPrompt, }); - const strategy = createFlatFolderStrategy({ fileAnalyzer }); + const strategy = createFlatFolderStrategy({ fileAnalyzer, progressContextFactory }); const runnerDeps: Parameters[0] = { reposRootDir: reposRoot(), strategy }; if (sourceFactory !== undefined) { runnerDeps.sourceFactory = sourceFactory; @@ -37,14 +43,15 @@ function buildRunner(sourceFactory: SourceFactory | undefined): ReturnType runPull(msg, pullFactory)); + registerWorker(JobType.GithubPull, (msg) => runPull(msg, pullFactory, progressContextFactory)); } export function registerLocalIngestWorker(): void { - const runner = buildRunner(undefined); + const runner = buildRunner(undefined, nullProgressContextFactory); registerWorker(JobType.LocalIngest, createLocalIngestHandler({ runner })); } @@ -86,3 +93,12 @@ export { COMBINED_CODE_ANALYSIS_SYSTEM_PROMPT, buildFileAnalysisUserPrompt, } from 
"./strategies/flat-folder/prompts/file-analysis.ts"; +export type { + ProgressContext, + ProgressContextFactory, + ProgressPhase, + ProgressReporter, + ProgressReporterInput, + ProgressTotalMode, +} from "./progress/types.ts"; +export { nullProgressContextFactory } from "./progress/NullProgressReporter.ts"; diff --git a/packages/ingest-github/src/pipeline/README.md b/packages/ingest-github/src/pipeline/README.md index 5ef8bd8..97b7192 100644 --- a/packages/ingest-github/src/pipeline/README.md +++ b/packages/ingest-github/src/pipeline/README.md @@ -62,7 +62,7 @@ llmCallContext`, which every LLM call site downstream consumes. State transitions (`CREATED → QUEUED → INGESTED → …`) are persisted to Mongo - Neo4j via `transitionState`, and `CancellationError` is re-thrown without flipping to FAILED. -- `pull.ts` — `runPull(msg, pullFactory?)` orchestrates the pull job. +- `pull.ts` — `runPull(msg, pullFactory?, progressContextFactory?)` orchestrates the pull job. Reads `repoUrl` and `branch` directly off `knowledge.info.*` (loaded via `@bb/mongo.getKnowledge`). The `KnowledgeSource` discriminator (`kind`) is still read off `knowledge.source` along with `commitId`/`commitHashes`, but @@ -77,6 +77,13 @@ archiveSink?}` and `runPull` skips `syncRepository` + `materialiseEndpoints` `processBigFilesQueue`, `backfillMissingFields`, `backfillBigFiles`, `runSelectiveFolderSummary`, `summariseRepo`, `storePullAnalysis`. Mirrors `run.ts` for `llmCallContext` extraction from payload. + Mirrors the index-side strategy orchestrator for progress: builds one + `ProgressContext` per job from the optional `progressContextFactory` + (default `nullProgressContextFactory`), emits `phaseChanged` at + `file_analysis` / `folder_analysis` / `indexing` boundaries, threads + the context into every phase that takes a `progressContext?` field, + and finishes with `completed()` on success or `failed(message)` on a + non-`CancellationError` throw. 
- `pull-helpers.ts` — small pure helpers extracted from `pull.ts` to keep it under the 300-line cap: `persistPullStats` writes the per-commit row into `processing_stats`, `repoNameFromUrl` parses an owner/repo display name out diff --git a/packages/ingest-github/src/pipeline/pull.ts b/packages/ingest-github/src/pipeline/pull.ts index ff5d0aa..17c753c 100644 --- a/packages/ingest-github/src/pipeline/pull.ts +++ b/packages/ingest-github/src/pipeline/pull.ts @@ -14,6 +14,8 @@ import { computePullDiff, materialiseEndpoints } from "./pull-diff-resolver.ts"; import { affectedFoldersFromDiff } from "./affected-folders.ts"; import { createDiskSourceReader } from "./disk-source-reader.ts"; import type { PullFactory, SourceReader, ArchiveSink } from "src/types/pipeline.ts"; +import type { ProgressContextFactory } from "src/progress/types.ts"; +import { nullProgressContextFactory } from "src/progress/NullProgressReporter.ts"; import { analyseChangedFiles } from "src/strategies/flat-folder/analyse-changed.ts"; import { processBigFilesQueue } from "src/strategies/flat-folder/phases/process-big-files.ts"; import { backfillMissingFields } from "src/strategies/flat-folder/backfill/fields.ts"; @@ -54,7 +56,11 @@ function llmCallContextFromPayload(payload: { return Object.keys(ctx).length > 0 ? 
ctx : undefined; } -export async function runPull(msg: JobMessage, pullFactory?: PullFactory): Promise { +export async function runPull( + msg: JobMessage, + pullFactory?: PullFactory, + progressContextFactory: ProgressContextFactory = nullProgressContextFactory, +): Promise { const { knowledgeId } = msg.payload; if (msg.payload.targetCommitHash !== undefined && !COMMIT_HASH_RE.test(msg.payload.targetCommitHash)) { throw new IngestError( @@ -88,6 +94,7 @@ export async function runPull(msg: JobMessage, pullFactory?: clearCancellation(knowledgeId); const startedAt = Date.now(); await transitionState(knowledgeId, KnowledgeState.Processing); + const progressContext = progressContextFactory(knowledgeId); try { throwIfCancelled(knowledgeId); @@ -169,6 +176,7 @@ export async function runPull(msg: JobMessage, pullFactory?: const llmCallContext = llmCallContextFromPayload(msg.payload); + progressContext.phaseChanged("file_analysis"); logger.info(`pull: phase per-file dispatcher for ${knowledgeId} starting`); throwIfCancelled(knowledgeId); const analyseChangedInput: Parameters[0] = { @@ -177,6 +185,7 @@ export async function runPull(msg: JobMessage, pullFactory?: metaPaths, analyzer: fileAnalyzer, diff, + progressContext, }; if (llmCallContext !== undefined) { analyseChangedInput.llmCallContext = llmCallContext; @@ -188,7 +197,12 @@ export async function runPull(msg: JobMessage, pullFactory?: logger.info(`pull: phase process big files starting`); throwIfCancelled(knowledgeId); - const processBigFilesInput: Parameters[0] = { knowledgeId, source, metaPaths }; + const processBigFilesInput: Parameters[0] = { + knowledgeId, + source, + metaPaths, + progressContext, + }; if (llmCallContext !== undefined) { processBigFilesInput.llmCallContext = llmCallContext; } @@ -196,16 +210,22 @@ export async function runPull(msg: JobMessage, pullFactory?: logger.info(`pull: phase backfill fields starting`); throwIfCancelled(knowledgeId); - await backfillMissingFields(metaPaths, 
llmCallContext); + await backfillMissingFields(metaPaths, llmCallContext, progressContext); logger.info(`pull: phase backfill big-files starting`); throwIfCancelled(knowledgeId); - const backfillBigFilesInput: Parameters[0] = { knowledgeId, source, metaPaths }; + const backfillBigFilesInput: Parameters[0] = { + knowledgeId, + source, + metaPaths, + progressContext, + }; if (llmCallContext !== undefined) { backfillBigFilesInput.llmCallContext = llmCallContext; } await backfillBigFiles(backfillBigFilesInput); + progressContext.phaseChanged("folder_analysis"); logger.info(`pull: phase selective folder summary (${affectedFolders.size} folders) starting`); throwIfCancelled(knowledgeId); const selectiveInput: Parameters[0] = { @@ -218,6 +238,7 @@ export async function runPull(msg: JobMessage, pullFactory?: } await runSelectiveFolderSummary(selectiveInput); + progressContext.phaseChanged("indexing"); logger.info(`pull: phase repo summary starting`); throwIfCancelled(knowledgeId); const orgId = resolveOrgId({ ...(knowledge.source.kind === "github" ? 
{} : {}) }); @@ -248,6 +269,7 @@ export async function runPull(msg: JobMessage, pullFactory?: }); await setKnowledgeCommit(knowledgeId, targetCommit); await transitionState(knowledgeId, KnowledgeState.Processed); + progressContext.completed("github_pull complete"); logger.info( `pull: ${knowledgeId} ${currentCommit.slice(0, 12)} -> ${targetCommit.slice(0, 12)} done (filesUpserted=${stored.filesUpserted} filesDeleted=${stored.filesDeleted} foldersUpserted=${stored.foldersUpserted})`, ); @@ -258,6 +280,7 @@ export async function runPull(msg: JobMessage, pullFactory?: throw cause; } await transitionState(knowledgeId, KnowledgeState.Failed).catch(() => undefined); + progressContext.failed(describe(cause)); throw new IngestError(knowledgeId, `github_pull failed: ${describe(cause)}`, cause); } } diff --git a/packages/ingest-github/src/progress/NullProgressReporter.ts b/packages/ingest-github/src/progress/NullProgressReporter.ts new file mode 100644 index 0000000..8c3d394 --- /dev/null +++ b/packages/ingest-github/src/progress/NullProgressReporter.ts @@ -0,0 +1,45 @@ +import type { + ProgressContext, + ProgressContextFactory, + ProgressPhase, + ProgressReporter, + ProgressReporterInput, +} from "src/progress/types.ts"; + +class NullProgressReporter implements ProgressReporter { + async start(): Promise<void> { + /* no-op */ + } + increment(_delta?: number, _meta?: { fileName?: string }): void { + /* no-op */ + } + incrementSeen(_delta?: number): void { + /* no-op */ + } + setTotal(_total: number): void { + /* no-op */ + } + stop(): void { + /* no-op */ + } +} + +class NullProgressContext implements ProgressContext { + reporter(_input: ProgressReporterInput): ProgressReporter { + return new NullProgressReporter(); + } + phaseChanged(_phase: ProgressPhase): void { + /* no-op */ + } + completed(_message?: string): void { + /* no-op */ + } + failed(_error: string, _phase?: ProgressPhase): void { + /* no-op */ + } +} + +const SINGLETON: ProgressContext = new 
NullProgressContext(); +
+/** Default factory used when no host binary supplies one. */ +export const nullProgressContextFactory: ProgressContextFactory = (_knowledgeId: string) => SINGLETON; diff --git a/packages/ingest-github/src/progress/README.md b/packages/ingest-github/src/progress/README.md new file mode 100644 index 0000000..cfed8d0 --- /dev/null +++ b/packages/ingest-github/src/progress/README.md @@ -0,0 +1,35 @@ +# `ingest-github / progress` + +**Tier:** Domain extension port + +## Responsibility + +Defines the host-binary extension port for observing ingestion-phase progress without coupling `@bb/ingest-github` to any transport. + +The strategy emits two kinds of signals through this port: + +- **Intra-phase ticks** via `ProgressReporter` — one reporter per phase or sub-phase of one job, driven by the strategy as it makes progress. +- **Phase boundaries and terminal state** via `ProgressContext.phaseChanged / completed / failed`. + +A host binary supplies a `ProgressContextFactory(knowledgeId)`. `@bb/server` does not — it falls back to `nullProgressContextFactory`, which discards every signal. + +## Public API + +- `ProgressPhase` — `"file_analysis" | "folder_analysis" | "indexing"` +- `ProgressTotalMode` — `{ kind: "fixed"; total }` or `{ kind: "growing"; initialTotal? }` +- `ProgressReporterInput` — phase + sub-phase + total mode + optional restart-seed hook +- `ProgressReporter` — `start / increment / incrementSeen / setTotal / stop` +- `ProgressContext` — bundles `reporter()` with boundary-event publishers +- `ProgressContextFactory` — `(knowledgeId) => ProgressContext` +- `nullProgressContextFactory` — no-op fallback used when the host does not supply one + +## Invariants + +- Pure types and a no-op default. No transport. No outbound calls. +- Tracker decisions (sampling cadence, persistence, fanout) belong to the host implementation. +- The strategy must call `reporter.stop()` in a `finally` so the host can emit a final tick deterministically. 
+- Reporters returned for the same `(knowledgeId, phase, subPhase)` are not reused across invocations — each `reporter()` call returns a fresh instance. + +## External dependencies + +None. diff --git a/packages/ingest-github/src/progress/types.ts b/packages/ingest-github/src/progress/types.ts new file mode 100644 index 0000000..698dad9 --- /dev/null +++ b/packages/ingest-github/src/progress/types.ts @@ -0,0 +1,48 @@ +/** + * Progress-reporting extension port. + * + * `@bb/ingest-github` exposes this interface so a host binary can observe + * phase progress without the strategy importing the host's transport. The + * default is a no-op (`NullProgressContext`) — consistent with the + * no-outbound-calls posture. + */ + +export type ProgressPhase = "file_analysis" | "folder_analysis" | "indexing"; + +export type ProgressTotalMode = + | { kind: "fixed"; total: number } + | { kind: "growing"; initialTotal?: number }; + +export interface ProgressReporterInput { + readonly phase: ProgressPhase; + readonly subPhase?: string; + readonly total: ProgressTotalMode; + readonly resolveInitialProcessed?: () => Promise<number> | number; +} + +/** + * Per-phase progress sink. One instance per phase or sub-phase of a job. + * The host implementation decides whether emissions are timer-sampled, + * push-per-call, persisted, or discarded. + */ +export interface ProgressReporter { + start(): Promise<void>; + increment(delta?: number, meta?: { fileName?: string }): void; + /** Grow the denominator when the work set is a streaming iterator. */ + incrementSeen(delta?: number): void; + setTotal(total: number): void; + stop(): void; +} + +/** + * Bundle of progress facilities scoped to a single ingestion job. Returned + * by `ProgressContextFactory(knowledgeId)`.
+ */ +export interface ProgressContext { + reporter(input: ProgressReporterInput): ProgressReporter; + phaseChanged(phase: ProgressPhase): void; + completed(message?: string): void; + failed(error: string, phase?: ProgressPhase): void; +} + +export type ProgressContextFactory = (knowledgeId: string) => ProgressContext; diff --git a/packages/ingest-github/src/strategies/flat-folder/README.md b/packages/ingest-github/src/strategies/flat-folder/README.md index ee75ffc..624534e 100644 --- a/packages/ingest-github/src/strategies/flat-folder/README.md +++ b/packages/ingest-github/src/strategies/flat-folder/README.md @@ -30,13 +30,43 @@ sub-phase boundary. flat-folder indexes, upsert `:Repo`, then every `:Folder`, then every `:File` with the extended analysis + Folder→File `CONTAINS` edge. +## Progress events + +The strategy emits progress through the `ProgressContext` port defined in +`src/progress/`. `createFlatFolderStrategy(deps)` accepts an optional +`progressContextFactory`; absent → `nullProgressContextFactory` +(no-op, OSS default). + +- **Boundary events** are emitted by `index.ts`: + - `phaseChanged("file_analysis")` before phase 1 + - `phaseChanged("folder_analysis")` before phase 5 + - `phaseChanged("indexing")` before phase 6 (which feeds phase 7) + - `completed()` after phase 7 returns + - `failed(message)` from a `try/catch` wrapping the whole `execute` +- **Intra-phase ticks** are emitted by each phase via per-phase reporters + created from `progressContext.reporter(...)`. Sub-phase labels: + - phase 1 → no sub-phase (the main file-analysis loop) + - phase 2 → `big_files_queue`; inner `processBigFile` adds + `big_file:` for chunk pulses + - phase 3 → `backfill`; phase 4 → `backfill:big_files` +- **Total mode**: phase 1, phase 3, and any other streaming-iterator loop + use `total: { kind: "growing" }` (denominator grows as `source.scan` + yields). 
Phases 2 and 4, plus the big-file chunk pool, know their size + up front and use `total: { kind: "fixed", total: N }`. +- The cancellation path in `execute` lets `CancellationError` propagate + past the orchestrator; `failed()` only fires for non-cancellation + errors. + ## Files - `index.ts` — `createFlatFolderStrategy(deps)` orchestrates the 7 phases. + Accepts `{ fileAnalyzer, progressContextFactory? }`. Constructs one + `ProgressContext` per job and threads it into every phase that takes a + `progressContext?` field. - `types.ts` — `AnalyzedFileEntry`, `FolderSummary`, `RepoSummary`, `RepoSummaryEnvelope`, `FlatFolderResult`. - `analyse-file.ts` — `analyseScannedFile(analyzer, file, llmCallContext?)` + `buildOversizedStub`. -- `analyse-changed.ts` — `analyseChangedFiles({knowledgeId, source, metaPaths, analyzer, diff, llmCallContext?, archiveSink?})`. Pull-time per-file dispatcher. Reads changed file content through `input.source` (a `SourceReader`) so it works with both the disk-backed reader (OSS default) and any HTTP-backed alternative supplied via the `pullFactory` hook. Mirrors `classifyAndAnalyseSmall`'s small-file path: filter → fetch → size cap → binary detect → line count → analyse → save + archive push. Does NOT invoke the skip-decision LLM gate. +- `analyse-changed.ts` — `analyseChangedFiles({knowledgeId, source, metaPaths, analyzer, diff, llmCallContext?, archiveSink?, progressContext?})`. Pull-time per-file dispatcher. Reads changed file content through `input.source` (a `SourceReader`) so it works with both the disk-backed reader (OSS default) and any HTTP-backed alternative supplied via the `pullFactory` hook. Mirrors `classifyAndAnalyseSmall`'s small-file path: filter → fetch → size cap → binary detect → line count → analyse → save + archive push. Does NOT invoke the skip-decision LLM gate. 
When `progressContext` is present it creates a fixed-total reporter (`subPhase: "pull"`, `total = dedupedPaths.length`) and increments per-path so the pull SSE stream stays live. - `folder-path.ts` — `directFolderOf`, `affectedFolderPaths`. - `folder-summary.ts` — group + summarise + persist + iterate folder summaries. - `repo-summary.ts` — single-shot or batched repo summary with envelope writer. diff --git a/packages/ingest-github/src/strategies/flat-folder/analyse-changed.ts b/packages/ingest-github/src/strategies/flat-folder/analyse-changed.ts index 6db4a7a..357b2aa 100644 --- a/packages/ingest-github/src/strategies/flat-folder/analyse-changed.ts +++ b/packages/ingest-github/src/strategies/flat-folder/analyse-changed.ts @@ -6,6 +6,7 @@ import { getConfigValue } from "@bb/config"; import type { ArchiveSink, FileAnalyzer, ScannedFile, SourceReader } from "src/types/pipeline.ts"; import type { MetaPaths } from "src/types/meta-paths.ts"; import type { BigFileEntry } from "src/types/big-file.ts"; +import type { ProgressContext } from "src/progress/types.ts"; import { looksBinary, passesPathFilters } from "src/pipeline/filters.ts"; import { withConcurrency } from "src/pipeline/concurrency.ts"; import { throwIfCancelled, CancellationError } from "src/pipeline/cancellation.ts"; @@ -23,13 +24,7 @@ export interface AnalyseChangedInput { llmCallContext?: AskLlmOptions; /** Optional non-fatal archive sink. When set, analysed content is pushed after `saveCondensed`. */ archiveSink?: ArchiveSink; - /** - * Invoked once per consumed path (analysed, stubbed, queued-as-big-file, - * filtered, or failed). Lets the caller drive a `processedFiles` counter - * for the progress bar without coupling this strategy to mongo. Best - * effort — errors from the callback are swallowed. 
- */ - onFileProcessed?: () => Promise | void; + progressContext?: ProgressContext; } export interface AnalyseChangedResult { @@ -85,114 +80,134 @@ export async function analyseChangedFiles(input: AnalyseChangedInput): Promise[] = []; - for (const relativePath of dedupedPaths) { - throwIfCancelled(input.knowledgeId); - const filename = path.basename(relativePath); - const ext = path.extname(filename).toLowerCase(); - if (!passesPathFilters(filename, ext)) { - skipped += 1; - continue; - } - - let content: string; - try { - content = await input.source.readFile(relativePath); - } catch (cause: unknown) { - failed += 1; - logger.warn(`pull-analyse: read failed for ${relativePath}: ${describe(cause)}`); - continue; - } - if (content.length === 0) { - skipped += 1; - continue; - } - const sizeBytes = Buffer.byteLength(content, "utf8"); + const reporter = input.progressContext?.reporter({ + phase: "file_analysis", + subPhase: "pull", + total: { kind: "fixed", total: dedupedPaths.length }, + }); + await reporter?.start(); + + try { + for (const relativePath of dedupedPaths) { + throwIfCancelled(input.knowledgeId); + const filename = path.basename(relativePath); + const ext = path.extname(filename).toLowerCase(); + if (!passesPathFilters(filename, ext)) { + skipped += 1; + reporter?.increment(1, { fileName: relativePath }); + continue; + } - if (sizeBytes > absoluteCap) { - bigFileBuffer.push({ - relativePath, - sizeBytes, - tokenCount: 0, - reason: "too-large", - }); + let content: string; try { - await saveCondensed(input.metaPaths, buildOversizedStub(relativePath, sizeBytes)); - oversizedStubs += 1; + content = await input.source.readFile(relativePath); } catch (cause: unknown) { failed += 1; - logger.warn(`pull-analyse: oversized stub write failed for ${relativePath}: ${describe(cause)}`); + logger.warn(`pull-analyse: read failed for ${relativePath}: ${describe(cause)}`); + reporter?.increment(1, { fileName: relativePath }); + continue; + } + if (content.length === 0) 
{ + skipped += 1; + reporter?.increment(1, { fileName: relativePath }); + continue; + } + const sizeBytes = Buffer.byteLength(content, "utf8"); + + if (sizeBytes > absoluteCap) { + bigFileBuffer.push({ + relativePath, + sizeBytes, + tokenCount: 0, + reason: "too-large", + }); + try { + await saveCondensed(input.metaPaths, buildOversizedStub(relativePath, sizeBytes)); + oversizedStubs += 1; + } catch (cause: unknown) { + failed += 1; + logger.warn(`pull-analyse: oversized stub write failed for ${relativePath}: ${describe(cause)}`); + } + reporter?.increment(1, { fileName: relativePath }); + continue; } - continue; - } - if (looksBinary(Buffer.from(content, "utf8"))) { - skipped += 1; - continue; - } - if (countLines(content) > bigFileLineThreshold) { - bigFileBuffer.push({ - relativePath, - sizeBytes, - tokenCount: 0, - reason: "too-large", - }); - try { - await saveCondensed(input.metaPaths, buildOversizedStub(relativePath, sizeBytes)); - oversizedStubs += 1; - } catch (cause: unknown) { - failed += 1; - logger.warn(`pull-analyse: oversized stub write failed for ${relativePath}: ${describe(cause)}`); + if (looksBinary(Buffer.from(content, "utf8"))) { + skipped += 1; + reporter?.increment(1, { fileName: relativePath }); + continue; + } + if (countLines(content) > bigFileLineThreshold) { + bigFileBuffer.push({ + relativePath, + sizeBytes, + tokenCount: 0, + reason: "too-large", + }); + try { + await saveCondensed(input.metaPaths, buildOversizedStub(relativePath, sizeBytes)); + oversizedStubs += 1; + } catch (cause: unknown) { + failed += 1; + logger.warn(`pull-analyse: oversized stub write failed for ${relativePath}: ${describe(cause)}`); + } + reporter?.increment(1, { fileName: relativePath }); + continue; + } + + const tokenCount = tokenLen(content); + if (tokenCount > contextWindowLimit) { + bigFileBuffer.push({ + relativePath, + sizeBytes, + tokenCount, + reason: "context-window-exceeded", + }); + // Big-file path runs in its own phase; this entry leaves the 
small-loop accounting. + reporter?.increment(1, { fileName: relativePath }); + continue; } - continue; - } - const tokenCount = tokenLen(content); - if (tokenCount > contextWindowLimit) { - bigFileBuffer.push({ + const scanned: ScannedFile = { + kind: "file", relativePath, + absolutePath: relativePath, sizeBytes, - tokenCount, - reason: "context-window-exceeded", - }); - continue; + content, + }; + const fileContent = content; + const filePath = relativePath; + pending.push( + limit(async () => { + try { + throwIfCancelled(input.knowledgeId); + const condensed = await analyseScannedFile(input.analyzer, scanned, input.llmCallContext); + await saveCondensed(input.metaPaths, condensed); + if (input.archiveSink !== undefined) { + await input.archiveSink.push({ + knowledgeId: input.knowledgeId, + relativePath: filePath, + content: fileContent, + }); + } + smallFilesAnalysed += 1; + } catch (cause: unknown) { + if (cause instanceof CancellationError) { + throw cause; + } + failed += 1; + logger.warn(`pull-analyse: analyse failed for ${relativePath}: ${describe(cause)}`); + } + reporter?.increment(1, { fileName: filePath }); + }), + ); } - const scanned: ScannedFile = { - kind: "file", - relativePath, - absolutePath: relativePath, - sizeBytes, - content, - }; - const fileContent = content; - const filePath = relativePath; - pending.push( - limit(async () => { - try { - throwIfCancelled(input.knowledgeId); - const condensed = await analyseScannedFile(input.analyzer, scanned, input.llmCallContext); - await saveCondensed(input.metaPaths, condensed); - if (input.archiveSink !== undefined) { - await input.archiveSink.push({ - knowledgeId: input.knowledgeId, - relativePath: filePath, - content: fileContent, - }); - } - smallFilesAnalysed += 1; - } catch (cause: unknown) { - if (cause instanceof CancellationError) { - throw cause; - } - failed += 1; - logger.warn(`pull-analyse: analyse failed for ${relativePath}: ${describe(cause)}`); - } - }), - ); + await Promise.all(pending); 
+ } finally { + reporter?.stop(); } - await Promise.all(pending); - if (bigFileBuffer.length > 0) { const existing = await readBigFiles(input.metaPaths); const merged = mergeBigFileEntries(existing, bigFileBuffer); diff --git a/packages/ingest-github/src/strategies/flat-folder/backfill/README.md b/packages/ingest-github/src/strategies/flat-folder/backfill/README.md index 87d1872..dfa3d72 100644 --- a/packages/ingest-github/src/strategies/flat-folder/backfill/README.md +++ b/packages/ingest-github/src/strategies/flat-folder/backfill/README.md @@ -7,27 +7,33 @@ Both are idempotent and skip entries that already look complete. ## Files -- `fields.ts` — Phase 3. `backfillMissingFields(metaPaths)` iterates every - condensed entry via `iterateCondensed`, computes which extended-analysis - fields are missing (`keywords`, `ontologyConcepts`, `businessEntities`, - `systemCapabilities`, `sideEffects`, `configDependencies`, - `dataFlowDirection`, `integrationSurface`, `contractsProvided`, - `contractsConsumed`, `sectionMap`), and asks one LLM call per file to - fill only the missing slots. The response is validated and normalised - (`pickStringArray`, `pickSections`) before being written back via - `saveCondensed`. Entries with nothing missing are skipped without an - LLM call. +- `fields.ts` — Phase 3. `backfillMissingFields(metaPaths, llmCallContext?, progressContext?)` + iterates every condensed entry via `iterateCondensed`, computes which + extended-analysis fields are missing (`keywords`, `ontologyConcepts`, + `businessEntities`, `systemCapabilities`, `sideEffects`, + `configDependencies`, `dataFlowDirection`, `integrationSurface`, + `contractsProvided`, `contractsConsumed`, `sectionMap`), and asks one + LLM call per file to fill only the missing slots. The response is + validated and normalised (`pickStringArray`, `pickSections`) before + being written back via `saveCondensed`. Entries with nothing missing + are skipped without an LLM call. 
When `progressContext` is present + this phase opens a growing-total reporter (`subPhase: "backfill"`) + because `iterateCondensed`'s size is not known up front. - `big-files.ts` — Phase 4. `backfillBigFiles({knowledgeId, repoDir, -metaPaths})` re-reads `bigFiles.json`, skips `reason === "too-large"`, - and for each non-complete entry (per `inspect`) re-runs `processBigFile` - against the file on disk so the condensed JSON is rebuilt from cached - chunks where possible. +metaPaths, llmCallContext?, progressContext?})` re-reads + `bigFiles.json`, skips `reason === "too-large"`, and for each + non-complete entry (per `inspect`) re-runs `processBigFile` against + the file on disk so the condensed JSON is rebuilt from cached chunks + where possible. When `progressContext` is present this phase opens a + fixed-total reporter (`subPhase: "backfill:big_files"`, sized by + `bigFiles.json`) and forwards itself into `processBigFile` so per-file + chunk pulses also surface. ## Public interfaces -- `backfillMissingFields(metaPaths, llmCallContext?): Promise<{ updated, failed }>` +- `backfillMissingFields(metaPaths, llmCallContext?, progressContext?): Promise<{ updated, failed }>` - `backfillBigFiles(input: BackfillBigFilesInput): Promise` - — `BackfillBigFilesInput` carries an optional `llmCallContext?: AskLlmOptions` that the inner `processBigFile` call uses to forward per-job LLM credentials. + — `BackfillBigFilesInput` carries an optional `llmCallContext?: AskLlmOptions` that the inner `processBigFile` call uses to forward per-job LLM credentials, and an optional `progressContext?: ProgressContext` for the per-phase reporter described above. Both return phase-summary counters consumed by `createFlatFolderStrategy` to roll up into the strategy result. 
diff --git a/packages/ingest-github/src/strategies/flat-folder/backfill/big-files.ts b/packages/ingest-github/src/strategies/flat-folder/backfill/big-files.ts index 7aad4e2..0925eb1 100644 --- a/packages/ingest-github/src/strategies/flat-folder/backfill/big-files.ts +++ b/packages/ingest-github/src/strategies/flat-folder/backfill/big-files.ts @@ -2,6 +2,7 @@ import { logger } from "@bb/logger"; import type { AskLlmOptions } from "@bb/llm"; import type { MetaPaths } from "src/types/meta-paths.ts"; import type { SourceReader } from "src/types/pipeline.ts"; +import type { ProgressContext } from "src/progress/types.ts"; import { readBigFiles } from "src/strategies/flat-folder/big-file/detector.ts"; import { inspect } from "src/strategies/flat-folder/big-file/cache.ts"; import { processBigFile } from "src/strategies/flat-folder/big-file/index.ts"; @@ -11,6 +12,7 @@ export interface BackfillBigFilesInput { source: SourceReader; metaPaths: MetaPaths; llmCallContext?: AskLlmOptions; + progressContext?: ProgressContext; } export interface BackfillBigFilesResult { @@ -22,36 +24,51 @@ export async function backfillBigFiles(input: BackfillBigFilesInput): Promise { let updated = 0; let failed = 0; - for await (const entry of iterateCondensed(metaPaths)) { - const a = entry.analysis; - const needed = computeNeeded(a); - if (!hasAnyMissing(needed)) { - continue; - } - const userPrompt = buildBackfillUserPrompt(entry.relativePath, entry.analysis); - try { - const response = await askJsonLLM(BACKFILL_SYSTEM_PROMPT, userPrompt, llmCallContext ?? 
{}); - const result = response.result; - if (result === null) { + const reporter = progressContext?.reporter({ + phase: "file_analysis", + subPhase: "backfill", + total: { kind: "growing" }, + }); + await reporter?.start(); + try { + for await (const entry of iterateCondensed(metaPaths)) { + reporter?.incrementSeen(); + const a = entry.analysis; + const needed = computeNeeded(a); + if (!hasAnyMissing(needed)) { + reporter?.increment(1, { fileName: entry.relativePath }); continue; } - applyBackfill(a, result, needed); - await saveCondensed(metaPaths, entry); - updated += 1; - } catch (cause: unknown) { - failed += 1; - logger.warn(`phase3: backfill failed for ${entry.relativePath}: ${describe(cause)}`); + const userPrompt = buildBackfillUserPrompt(entry.relativePath, entry.analysis); + try { + const response = await askJsonLLM(BACKFILL_SYSTEM_PROMPT, userPrompt, llmCallContext ?? {}); + const result = response.result; + if (result === null) { + reporter?.increment(1, { fileName: entry.relativePath }); + continue; + } + applyBackfill(a, result, needed); + await saveCondensed(metaPaths, entry); + updated += 1; + } catch (cause: unknown) { + failed += 1; + logger.warn(`phase3: backfill failed for ${entry.relativePath}: ${describe(cause)}`); + } + reporter?.increment(1, { fileName: entry.relativePath }); } + logger.info(`phase3 done: updated=${updated} failed=${failed}`); + return { updated, failed }; + } finally { + reporter?.stop(); } - logger.info(`phase3 done: updated=${updated} failed=${failed}`); - return { updated, failed }; } function computeNeeded(a: FileAnalysis): NeededFlags { diff --git a/packages/ingest-github/src/strategies/flat-folder/big-file/README.md b/packages/ingest-github/src/strategies/flat-folder/big-file/README.md index 0c26ca6..b1c974a 100644 --- a/packages/ingest-github/src/strategies/flat-folder/big-file/README.md +++ b/packages/ingest-github/src/strategies/flat-folder/big-file/README.md @@ -25,11 +25,15 @@ depending on chunk count and prompt 
budget. `stale-condensed`, or `missing`. Used by Phase 2 to short-circuit and by Phase 4 to find candidates for cheap re-condense. - `index.ts` — `processBigFile({knowledgeId, metaPaths, relativePath, content, -sizeBytes, llmCallContext?})`. Sequential per file (chunk-level - concurrency inside). Persists every intermediate artifact, so a - restart resumes from the next unfinished chunk. `llmCallContext` is - forwarded to every chunk analyzer call so per-job LLM credentials - reach `@bb/llm`. +sizeBytes, llmCallContext?, progressContext?})`. Sequential per file + (chunk-level concurrency inside). Persists every intermediate artifact, + so a restart resumes from the next unfinished chunk. `llmCallContext` + is forwarded to every chunk analyzer call so per-job LLM credentials + reach `@bb/llm`. When `progressContext` is present, the chunk pool runs + under a fixed-total reporter + (`subPhase: "big_file:"`, `total = chunks.length`) so + long single-file analyses surface as live `PHASE_TICK` envelopes + carrying per-chunk progress instead of looking frozen. 
## Invariants diff --git a/packages/ingest-github/src/strategies/flat-folder/big-file/index.ts b/packages/ingest-github/src/strategies/flat-folder/big-file/index.ts index 893e416..250c318 100644 --- a/packages/ingest-github/src/strategies/flat-folder/big-file/index.ts +++ b/packages/ingest-github/src/strategies/flat-folder/big-file/index.ts @@ -6,6 +6,7 @@ import { logger } from "@bb/logger"; import type { ChunkAnalysisResult, HugeFileManifest } from "src/types/big-file.ts"; import type { CondensedFileAnalysis } from "src/types/condensed-file-analysis.ts"; import type { MetaPaths } from "src/types/meta-paths.ts"; +import type { ProgressContext } from "src/progress/types.ts"; import { throwIfCancelled } from "src/pipeline/cancellation.ts"; import { splitFileIntoChunks } from "./chunker.ts"; import { analyzeChunk } from "./chunk-analyzer.ts"; @@ -19,6 +20,7 @@ export interface ProcessBigFileInput { content: string; sizeBytes: number; llmCallContext?: AskLlmOptions; + progressContext?: ProgressContext; } export async function processBigFile(input: ProcessBigFileInput): Promise { @@ -31,6 +33,13 @@ export async function processBigFile(input: ProcessBigFileInput): Promise => { while (nextIndex < chunks.length) { const idx = nextIndex; @@ -43,20 +52,26 @@ export async function processBigFile(input: ProcessBigFileInput): Promise[] = []; - for (let i = 0; i < workerCount; i += 1) { - workers.push(worker()); + try { + const workerCount = Math.min(concurrency, chunks.length); + const workers: Promise[] = []; + for (let i = 0; i < workerCount; i += 1) { + workers.push(worker()); + } + await Promise.all(workers); + } finally { + reporter?.stop(); } - await Promise.all(workers); throwIfCancelled(input.knowledgeId); const merged = await condenseChunks(input.relativePath, results); diff --git a/packages/ingest-github/src/strategies/flat-folder/index.ts b/packages/ingest-github/src/strategies/flat-folder/index.ts index 5304318..dd47955 100644 --- 
a/packages/ingest-github/src/strategies/flat-folder/index.ts +++ b/packages/ingest-github/src/strategies/flat-folder/index.ts @@ -9,82 +9,115 @@ import { backfillBigFiles } from "./backfill/big-files.ts"; import { runFolderSummaryPhase } from "./folder-summary.ts"; import { makeRepoSummaryEnvelope, persistRepoSummary, summariseRepo } from "./repo-summary.ts"; import { storeFlatAnalysis } from "./phases/store-flat-analysis.ts"; +import type { ProgressContext, ProgressContextFactory } from "src/progress/types.ts"; +import { nullProgressContextFactory } from "src/progress/NullProgressReporter.ts"; +import { CancellationError } from "src/pipeline/cancellation.ts"; export interface FlatFolderStrategyDeps { fileAnalyzer: FileAnalyzer; + progressContextFactory?: ProgressContextFactory; } export function createFlatFolderStrategy(deps: FlatFolderStrategyDeps): IngestStrategy { + const progressContextFactory = deps.progressContextFactory ?? nullProgressContextFactory; return { name: "flat-folder", async execute(input: StrategyInput): Promise { const { context, source, archiveSink, metaPaths, payload, branch } = input; const { knowledgeId, orgId, repoId, llmCallContext } = context; + const progressContext: ProgressContext = progressContextFactory(knowledgeId); - logger.info(`flat-folder: phase1 (classify + analyse small) starting for ${knowledgeId}`); - throwIfCancelled(knowledgeId); - const phase1Input: Parameters[0] = { - knowledgeId, - source, - metaPaths, - analyzer: deps.fileAnalyzer, - }; - if (archiveSink !== undefined) { - phase1Input.archiveSink = archiveSink; - } - if (llmCallContext !== undefined) { - phase1Input.llmCallContext = llmCallContext; - } - const phase1 = await classifyAndAnalyseSmall(phase1Input); + try { + progressContext.phaseChanged("file_analysis"); - logger.info(`flat-folder: phase2 (process big files) starting`); - throwIfCancelled(knowledgeId); - const phase2Input: Parameters[0] = { knowledgeId, source, metaPaths }; - if (llmCallContext !== 
undefined) { - phase2Input.llmCallContext = llmCallContext; - } - const
phase2 = await processBigFilesQueue(phase2Input); + logger.info(`flat-folder: phase1 (classify + analyse small) starting for ${knowledgeId}`); + throwIfCancelled(knowledgeId); + const phase1Input: Parameters[0] = { + knowledgeId, + source, + metaPaths, + analyzer: deps.fileAnalyzer, + progressContext, + }; + if (archiveSink !== undefined) { + phase1Input.archiveSink = archiveSink; + } + if (llmCallContext !== undefined) { + phase1Input.llmCallContext = llmCallContext; + } + const phase1 = await classifyAndAnalyseSmall(phase1Input); - logger.info(`flat-folder: phase3 (backfill missing fields) starting`); - throwIfCancelled(knowledgeId); - await backfillMissingFields(metaPaths, llmCallContext); + logger.info(`flat-folder: phase2 (process big files) starting`); + throwIfCancelled(knowledgeId); + const phase2Input: Parameters[0] = { + knowledgeId, + source, + metaPaths, + progressContext, + }; + if (llmCallContext !== undefined) { + phase2Input.llmCallContext = llmCallContext; + } + const phase2 = await processBigFilesQueue(phase2Input); - logger.info(`flat-folder: phase4 (backfill big files) starting`); - throwIfCancelled(knowledgeId); - const phase4Input: Parameters[0] = { knowledgeId, source, metaPaths }; - if (llmCallContext !== undefined) { - phase4Input.llmCallContext = llmCallContext; - } - await backfillBigFiles(phase4Input); + logger.info(`flat-folder: phase3 (backfill missing fields) starting`); + throwIfCancelled(knowledgeId); + await backfillMissingFields(metaPaths, llmCallContext, progressContext); - logger.info(`flat-folder: phase5 (folder summaries) starting`); - throwIfCancelled(knowledgeId); - const phase5 = await runFolderSummaryPhase(knowledgeId, metaPaths, llmCallContext); + logger.info(`flat-folder: phase4 (backfill big files) starting`); + throwIfCancelled(knowledgeId); + const phase4Input: Parameters[0] = { + knowledgeId, + source, + metaPaths, + progressContext, + }; + if (llmCallContext !== undefined) { + phase4Input.llmCallContext =
llmCallContext; + } + await backfillBigFiles(phase4Input); - logger.info(`flat-folder: phase6 (repo summary) starting`); - throwIfCancelled(knowledgeId); - const repoSummary = await summariseRepo(knowledgeId, metaPaths, llmCallContext); - let repoSummarised = false; - if (repoSummary !== null) { - await persistRepoSummary(metaPaths, makeRepoSummaryEnvelope(knowledgeId, orgId, repoSummary)); - repoSummarised = true; - } + progressContext.phaseChanged("folder_analysis"); + logger.info(`flat-folder: phase5 (folder summaries) starting`); + throwIfCancelled(knowledgeId); + const phase5 = await runFolderSummaryPhase(knowledgeId, metaPaths, llmCallContext); + + progressContext.phaseChanged("indexing"); + logger.info(`flat-folder: phase6 (repo summary) starting`); + throwIfCancelled(knowledgeId); + const repoSummary = await summariseRepo(knowledgeId, metaPaths, llmCallContext); + let repoSummarised = false; + if (repoSummary !== null) { + await persistRepoSummary(metaPaths, makeRepoSummaryEnvelope(knowledgeId, orgId, repoSummary)); + repoSummarised = true; + } - logger.info(`flat-folder: phase7 (graph store) starting`); - throwIfCancelled(knowledgeId); - const phase7 = await storeFlatAnalysis({ - scope: { orgId, knowledgeId, repoId }, - payload, - branch, - metaPaths, - }); + logger.info(`flat-folder: phase7 (graph store) starting`); + throwIfCancelled(knowledgeId); + const phase7 = await storeFlatAnalysis({ + scope: { orgId, knowledgeId, repoId }, + payload, + branch, + metaPaths, + }); - return { - filesAnalyzed: phase1.smallFilesAnalysed + phase2.processed + phase2.cached + phase1.oversizedStubs, - foldersSummarised: phase5.succeeded, - repoSummarised, - graphNodesWritten: phase7.nodesWritten, - }; + progressContext.completed(); + + return { + filesAnalyzed: phase1.smallFilesAnalysed + phase2.processed + phase2.cached + phase1.oversizedStubs, + foldersSummarised: phase5.succeeded, + repoSummarised, + graphNodesWritten: phase7.nodesWritten, + }; + } catch (cause: 
unknown) { + // Cancellation is not a failure: let it propagate without emitting `failed()`. + if (cause instanceof CancellationError) { + throw cause; + } + const message = cause instanceof Error ? cause.message : String(cause); + progressContext.failed(message); + throw cause; + } }, }; } diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/README.md b/packages/ingest-github/src/strategies/flat-folder/phases/README.md index 71ac8d8..f0701a7 100644 --- a/packages/ingest-github/src/strategies/flat-folder/phases/README.md +++ b/packages/ingest-github/src/strategies/flat-folder/phases/README.md @@ -10,8 +10,8 @@ and repo summarisation (Phases 5 and 6) live as `folder-summary.ts` and - `classify-and-analyse-small.ts` — Phase 1. `classifyAndAnalyseSmall({knowledgeId, source, metaPaths, analyzer, -skipDecider?, archiveSink?, llmCallContext?})` walks `source.scan({ -skipDecider, llmCallContext })` and per entry: +skipDecider?, archiveSink?, llmCallContext?, progressContext?})` walks + `source.scan({ skipDecider, llmCallContext })` and per entry: - `kind === "oversized"` → write a stub via `buildOversizedStub` + `saveCondensed`, and append a `too-large` row to `bigFiles.json`. - token count > `Config.ContextWindowLimit` → buffer a @@ -23,12 +23,17 @@ skipDecider, llmCallContext })` and per entry: buffered big-file list is flushed via `writeBigFiles` after all tasks drain. - `process-big-files.ts` — Phase 2. - `processBigFilesQueue({knowledgeId, source, metaPaths, llmCallContext?})` + `processBigFilesQueue({knowledgeId, source, metaPaths, llmCallContext?, progressContext?})` reads `bigFiles.json`, skips `too-large` entries (counted as `skippedOversized`), short-circuits when `inspect` returns `complete` (counted as `cached`), reads the file via `source.readFile`, and dispatches `processBigFile` sequentially per file with the per-job - `llmCallContext` threaded through. Cancellation re-throws past the + `llmCallContext` threaded through.
When `progressContext` is present + this phase opens a fixed-total reporter (`subPhase: "big_files_queue"`, + `total = entries.length`) and increments per entry — including + skipped/cached/failed paths so the percentage never stalls. The same + `progressContext` is forwarded into `processBigFile` so each big file + gets its own per-chunk sub-phase. Cancellation re-throws past the phase; other errors are logged per file and counted as `failed`. - `store-flat-analysis.ts` — Phase 7. `storeFlatAnalysis({scope, payload, branch, metaPaths})` ensures @@ -44,8 +49,13 @@ skipDecider, llmCallContext })` and per entry: - `classifyAndAnalyseSmall(input): Promise` — `{ smallFilesAnalysed, bigFilesQueued, oversizedStubs, failed }`. + `input.progressContext?` opens a growing-total reporter + (`source.scan` size is not known up front); `incrementSeen()` fires per + scan yield and `increment()` fires per persisted entry. - `processBigFilesQueue(input): Promise` — - `{ processed, cached, failed, skippedOversized }`. + `{ processed, cached, failed, skippedOversized }`. `input.progressContext?` + opens a fixed-total reporter sized by `bigFiles.json` and forwards + itself into the per-file `processBigFile` call. - `storeFlatAnalysis(input): Promise` — `{ nodesWritten, foldersWritten, filesWritten }`. 
diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/classify-and-analyse-small.ts b/packages/ingest-github/src/strategies/flat-folder/phases/classify-and-analyse-small.ts index 3306d23..4f5e996 100644 --- a/packages/ingest-github/src/strategies/flat-folder/phases/classify-and-analyse-small.ts +++ b/packages/ingest-github/src/strategies/flat-folder/phases/classify-and-analyse-small.ts @@ -6,6 +6,7 @@ import { getConfigValue } from "@bb/config"; import type { ArchiveSink, FileAnalyzer, SkipDecider, SourceReader } from "src/types/pipeline.ts"; import type { MetaPaths } from "src/types/meta-paths.ts"; import type { BigFileEntry } from "src/types/big-file.ts"; +import type { ProgressContext } from "src/progress/types.ts"; import { withConcurrency } from "src/pipeline/concurrency.ts"; import { throwIfCancelled, CancellationError } from "src/pipeline/cancellation.ts"; import { makeSkipDecider } from "src/pipeline/skip-decisions/index.ts"; @@ -21,6 +22,7 @@ export interface ClassifyPhaseInput { skipDecider?: SkipDecider; archiveSink?: ArchiveSink; llmCallContext?: AskLlmOptions; + progressContext?: ProgressContext; } export interface ClassifyPhaseResult { @@ -45,71 +47,87 @@ export async function classifyAndAnalyseSmall(input: ClassifyPhaseInput): Promis const pending: Promise[] = []; - const scanDeps: Parameters[0] = { skipDecider }; - if (input.llmCallContext !== undefined) { - scanDeps.llmCallContext = input.llmCallContext; - } - for await (const entry of input.source.scan(scanDeps)) { - throwIfCancelled(input.knowledgeId); - - if (entry.kind === "oversized") { - bigFileBuffer.push({ - relativePath: entry.relativePath, - sizeBytes: entry.sizeBytes, - tokenCount: 0, - reason: "too-large", - }); - try { - await saveCondensed(input.metaPaths, buildOversizedStub(entry.relativePath, entry.sizeBytes)); - oversizedStubs += 1; - } catch (cause: unknown) { - failed += 1; - logger.warn(`phase1: oversized stub write failed for ${entry.relativePath}: 
${describe(cause)}`); - } - continue; - } + const reporter = input.progressContext?.reporter({ + phase: "file_analysis", + total: { kind: "growing" }, + }); + await reporter?.start(); - const tokenCount = tokenLen(entry.content); - if (tokenCount > contextWindowLimit) { - bigFileBuffer.push({ - relativePath: entry.relativePath, - sizeBytes: entry.sizeBytes, - tokenCount, - reason: "context-window-exceeded", - }); - continue; + try { + const scanDeps: Parameters[0] = { skipDecider }; + if (input.llmCallContext !== undefined) { + scanDeps.llmCallContext = input.llmCallContext; } + for await (const entry of input.source.scan(scanDeps)) { + throwIfCancelled(input.knowledgeId); + reporter?.incrementSeen(); - const fileContent = entry.content; - const filePath = entry.relativePath; - pending.push( - limit(async () => { + if (entry.kind === "oversized") { + bigFileBuffer.push({ + relativePath: entry.relativePath, + sizeBytes: entry.sizeBytes, + tokenCount: 0, + reason: "too-large", + }); try { - throwIfCancelled(input.knowledgeId); - const condensed = await analyseScannedFile(input.analyzer, entry, input.llmCallContext); - await saveCondensed(input.metaPaths, condensed); - if (input.archiveSink !== undefined) { - await input.archiveSink.push({ - knowledgeId: input.knowledgeId, - relativePath: filePath, - content: fileContent, - }); - } - smallFilesAnalysed += 1; + await saveCondensed(input.metaPaths, buildOversizedStub(entry.relativePath, entry.sizeBytes)); + oversizedStubs += 1; + reporter?.increment(1, { fileName: entry.relativePath }); } catch (cause: unknown) { - if (cause instanceof CancellationError) { - throw cause; - } failed += 1; - logger.warn(`phase1: analyse failed for ${entry.relativePath}: ${describe(cause)}`); + logger.warn(`phase1: oversized stub write failed for ${entry.relativePath}: ${describe(cause)}`); } - }), - ); - } + continue; + } - await Promise.all(pending); + const tokenCount = tokenLen(entry.content); + if (tokenCount > contextWindowLimit) { + 
bigFileBuffer.push({ + relativePath: entry.relativePath, + sizeBytes: entry.sizeBytes, + tokenCount, + reason: "context-window-exceeded", + }); + // Big files are accounted for here; phase 2 has its own reporter. + reporter?.increment(1, { fileName: entry.relativePath }); + continue; + } - await writeBigFiles(input.metaPaths, bigFileBuffer); + const fileContent = entry.content; + const filePath = entry.relativePath; + pending.push( + limit(async () => { + try { + throwIfCancelled(input.knowledgeId); + const condensed = await analyseScannedFile(input.analyzer, entry, input.llmCallContext); + await saveCondensed(input.metaPaths, condensed); + if (input.archiveSink !== undefined) { + await input.archiveSink.push({ + knowledgeId: input.knowledgeId, + relativePath: filePath, + content: fileContent, + }); + } + smallFilesAnalysed += 1; + reporter?.increment(1, { fileName: filePath }); + } catch (cause: unknown) { + if (cause instanceof CancellationError) { + throw cause; + } + failed += 1; + logger.warn(`phase1: analyse failed for ${entry.relativePath}: ${describe(cause)}`); + reporter?.increment(1, { fileName: filePath }); + } + }), + ); + } + + await Promise.all(pending); + + await writeBigFiles(input.metaPaths, bigFileBuffer); + } finally { + reporter?.stop(); + } logger.info( `phase1 done: smallFilesAnalysed=${smallFilesAnalysed} bigFilesQueued=${bigFileBuffer.filter((e) => e.reason === "context-window-exceeded").length} oversizedStubs=${oversizedStubs} failed=${failed}`, diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts b/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts index 26357af..92ae614 100644 --- a/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts +++ b/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts @@ -2,6 +2,7 @@ import { logger } from "@bb/logger"; import type { AskLlmOptions } from "@bb/llm"; import type { MetaPaths } from 
"src/types/meta-paths.ts"; import type { SourceReader } from "src/types/pipeline.ts"; +import type { ProgressContext } from "src/progress/types.ts"; import { throwIfCancelled, CancellationError } from "src/pipeline/cancellation.ts"; import { readBigFiles } from "src/strategies/flat-folder/big-file/detector.ts"; import { inspect } from "src/strategies/flat-folder/big-file/cache.ts"; @@ -12,6 +13,7 @@ export interface ProcessBigFilesInput { source: SourceReader; metaPaths: MetaPaths; llmCallContext?: AskLlmOptions; + progressContext?: ProgressContext; } export interface ProcessBigFilesResult { @@ -28,52 +30,69 @@ export async function processBigFilesQueue(input: ProcessBigFilesInput): Promise let failed = 0; let skippedOversized = 0; - for (const entry of entries) { - throwIfCancelled(input.knowledgeId); - if (entry.reason === "too-large") { - skippedOversized += 1; - continue; - } - const status = await inspect(input.metaPaths, entry.relativePath); - if (status === "complete") { - cached += 1; - continue; - } - let content: string; - try { - content = await input.source.readFile(entry.relativePath); - } catch (cause: unknown) { - failed += 1; - logger.warn(`phase2: read failed for ${entry.relativePath}: ${describe(cause)}`); - continue; - } - if (content.length === 0) { - failed += 1; - logger.warn(`phase2: empty content for ${entry.relativePath}; skipping`); - continue; - } - try { - await processBigFile({ - knowledgeId: input.knowledgeId, - metaPaths: input.metaPaths, - relativePath: entry.relativePath, - content, - sizeBytes: entry.sizeBytes, - ...(input.llmCallContext !== undefined ? 
{ llmCallContext: input.llmCallContext } : {}), - }); - processed += 1; - } catch (cause: unknown) { - if (cause instanceof CancellationError) { - throw cause; + const reporter = input.progressContext?.reporter({ + phase: "file_analysis", + subPhase: "big_files_queue", + total: { kind: "fixed", total: entries.length }, + }); + await reporter?.start(); + + try { + for (const entry of entries) { + throwIfCancelled(input.knowledgeId); + if (entry.reason === "too-large") { + skippedOversized += 1; + reporter?.increment(1, { fileName: entry.relativePath }); + continue; + } + const status = await inspect(input.metaPaths, entry.relativePath); + if (status === "complete") { + cached += 1; + reporter?.increment(1, { fileName: entry.relativePath }); + continue; + } + let content: string; + try { + content = await input.source.readFile(entry.relativePath); + } catch (cause: unknown) { + failed += 1; + logger.warn(`phase2: read failed for ${entry.relativePath}: ${describe(cause)}`); + reporter?.increment(1, { fileName: entry.relativePath }); + continue; + } + if (content.length === 0) { + failed += 1; + logger.warn(`phase2: empty content for ${entry.relativePath}; skipping`); + reporter?.increment(1, { fileName: entry.relativePath }); + continue; + } + try { + await processBigFile({ + knowledgeId: input.knowledgeId, + metaPaths: input.metaPaths, + relativePath: entry.relativePath, + content, + sizeBytes: entry.sizeBytes, + ...(input.llmCallContext !== undefined ? { llmCallContext: input.llmCallContext } : {}), + ...(input.progressContext !== undefined ? 
{ progressContext: input.progressContext } : {}), + }); + processed += 1; + } catch (cause: unknown) { + if (cause instanceof CancellationError) { + throw cause; + } + failed += 1; + logger.warn(`phase2: processBigFile failed for ${entry.relativePath}: ${describe(cause)}`); } - failed += 1; - logger.warn(`phase2: processBigFile failed for ${entry.relativePath}: ${describe(cause)}`); + reporter?.increment(1, { fileName: entry.relativePath }); } + logger.info( + `phase2 done: processed=${processed} cached=${cached} failed=${failed} skippedOversized=${skippedOversized}`, + ); + return { processed, cached, failed, skippedOversized }; + } finally { + reporter?.stop(); } - logger.info( - `phase2 done: processed=${processed} cached=${cached} failed=${failed} skippedOversized=${skippedOversized}`, - ); - return { processed, cached, failed, skippedOversized }; } function describe(cause: unknown): string { diff --git a/packages/ingest-github/types/index.d.ts b/packages/ingest-github/types/index.d.ts index ac23419..6abc22d 100644 --- a/packages/ingest-github/types/index.d.ts +++ b/packages/ingest-github/types/index.d.ts @@ -1,8 +1,39 @@ export interface RegisterGithubWorkersDeps { sourceFactory?: SourceFactory; pullFactory?: PullFactory; + progressContextFactory?: ProgressContextFactory; } +export type ProgressPhase = "file_analysis" | "folder_analysis" | "indexing"; + +export type ProgressTotalMode = { kind: "fixed"; total: number } | { kind: "growing"; initialTotal?: number }; + +export interface ProgressReporterInput { + readonly phase: ProgressPhase; + readonly subPhase?: string; + readonly total: ProgressTotalMode; + readonly resolveInitialProcessed?: () => Promise | number; +} + +export interface ProgressReporter { + start(): Promise; + increment(delta?: number, meta?: { fileName?: string }): void; + incrementSeen(delta?: number): void; + setTotal(total: number): void; + stop(): void; +} + +export interface ProgressContext { + reporter(input: ProgressReporterInput): 
ProgressReporter; + phaseChanged(phase: ProgressPhase): void; + completed(message?: string): void; + failed(error: string, phase?: ProgressPhase): void; +} + +export type ProgressContextFactory = (knowledgeId: string) => ProgressContext; + +export declare const nullProgressContextFactory: ProgressContextFactory; + export declare function registerGithubWorkers(deps?: RegisterGithubWorkersDeps): void; export declare function registerLocalIngestWorker(): void; From ea7b3f0ef77666dbff3ccb242fb3bb76db7b895b Mon Sep 17 00:00:00 2001 From: Dead-Bytes <143434285+Dead-Bytes@users.noreply.github.com> Date: Thu, 14 May 2026 23:29:59 +0530 Subject: [PATCH 14/34] chore: format fix --- packages/ingest-github/src/progress/types.ts | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/packages/ingest-github/src/progress/types.ts b/packages/ingest-github/src/progress/types.ts index 698dad9..0f7d65c 100644 --- a/packages/ingest-github/src/progress/types.ts +++ b/packages/ingest-github/src/progress/types.ts @@ -9,9 +9,7 @@ export type ProgressPhase = "file_analysis" | "folder_analysis" | "indexing"; -export type ProgressTotalMode = - | { kind: "fixed"; total: number } - | { kind: "growing"; initialTotal?: number }; +export type ProgressTotalMode = { kind: "fixed"; total: number } | { kind: "growing"; initialTotal?: number }; export interface ProgressReporterInput { readonly phase: ProgressPhase; From 7aa70775ef32d160f3f4c9e99f56c65f1b19ce7f Mon Sep 17 00:00:00 2001 From: lovanshu garg Date: Fri, 15 May 2026 05:39:37 +0530 Subject: [PATCH 15/34] feat(token): tokens usage by llm --- bun.lock | 1 + .../src/adapters/llm-file-analyzer.ts | 5 +- .../src/pipeline/pull-helpers.ts | 14 ++++-- packages/ingest-github/src/pipeline/pull.ts | 21 +++++--- packages/ingest-github/src/pipeline/run.ts | 20 ++++++-- .../strategies/flat-folder/analyse-changed.ts | 8 +++ .../strategies/flat-folder/analyse-file.ts | 3 +- .../flat-folder/big-file/chunk-analyzer.ts | 1 + 
.../flat-folder/big-file/condenser.ts | Bin 8689 -> 9435 bytes .../strategies/flat-folder/big-file/index.ts | 7 +++ .../flat-folder/folder-summary-selective.ts | 9 +++- .../strategies/flat-folder/folder-summary.ts | 24 ++++++--- .../src/strategies/flat-folder/index.ts | 15 +++++- .../phases/classify-and-analyse-small.ts | 8 +++ .../flat-folder/phases/process-big-files.ts | 17 ++++++- .../strategies/flat-folder/repo-summary.ts | 46 ++++++++++++++---- packages/ingest-github/src/types/big-file.ts | 1 + .../src/types/condensed-file-analysis.ts | 1 + packages/ingest-github/src/types/pipeline.ts | 2 + packages/ingest-github/src/types/strategy.ts | 1 + packages/llm/src/jsonClient.ts | 20 ++++++-- packages/llm/src/ollama.ts | 8 ++- packages/llm/src/openrouter.ts | 9 +++- packages/mongo/src/knowledge.ts | 11 ++++- packages/mongo/src/processingStats.ts | 5 +- packages/types/src/knowledge.ts | 8 ++- 26 files changed, 215 insertions(+), 50 deletions(-) diff --git a/bun.lock b/bun.lock index 083df85..b3cff8d 100644 --- a/bun.lock +++ b/bun.lock @@ -75,6 +75,7 @@ "dependencies": { "@bb/config": "workspace:*", "@bb/errors": "workspace:*", + "@bb/logger": "workspace:*", "@bb/mongo": "workspace:*", "@bb/types": "workspace:*", "tiktoken": "^1.0.22", diff --git a/packages/ingest-github/src/adapters/llm-file-analyzer.ts b/packages/ingest-github/src/adapters/llm-file-analyzer.ts index 88d0b17..b006fbd 100644 --- a/packages/ingest-github/src/adapters/llm-file-analyzer.ts +++ b/packages/ingest-github/src/adapters/llm-file-analyzer.ts @@ -42,9 +42,11 @@ export function createLlmFileAnalyzer(deps: LlmFileAnalyzerDeps): FileAnalyzer { const userPrompt = deps.buildUserPrompt(input); const t0 = performance.now(); let raw: RawAnalysisJson | null = null; + let usage: { inputTokens: number; outputTokens: number } | undefined; try { const response = await askJsonLLM(systemPrompt, userPrompt, input.llmCallContext ?? 
{}); raw = response.result; + usage = { inputTokens: response.usage.inputTokens, outputTokens: response.usage.outputTokens }; if (raw === null) { logger.warn(`llm-file-analyzer: ${input.relativePath} returned unparseable JSON`); } @@ -53,9 +55,10 @@ export function createLlmFileAnalyzer(deps: LlmFileAnalyzerDeps): FileAnalyzer { logger.warn(`llm-file-analyzer: ${input.relativePath} askJsonLLM failed: ${msg}`); } if (raw === null) { - return { language: FALLBACK_LANGUAGE, analysis: emptyFileAnalysis() }; + return { language: FALLBACK_LANGUAGE, analysis: emptyFileAnalysis(), tokenUsage: usage }; } const shaped = shapeAnalysis(raw); + shaped.tokenUsage = usage; logger.info( `llm-file-analyzer: ✓ ${input.relativePath} (${Math.round(performance.now() - t0)}ms, lang=${shaped.language})`, ); diff --git a/packages/ingest-github/src/pipeline/pull-helpers.ts b/packages/ingest-github/src/pipeline/pull-helpers.ts index 0330a03..d5afb5b 100644 --- a/packages/ingest-github/src/pipeline/pull-helpers.ts +++ b/packages/ingest-github/src/pipeline/pull-helpers.ts @@ -8,15 +8,23 @@ export interface PersistPullStatsInput { filesAnalyzed: number; foldersSummarised: number; processingTimeMs: number; + tokenUsage: { inputTokens: number; outputTokens: number }; } -export async function persistPullStats(input: PersistPullStatsInput): Promise { +export async function persistPullStats( + input: PersistPullStatsInput, +): Promise<{ inputTokens: number; outputTokens: number }> { const estimatedCost = await estimateCostFromBreakdown({}); - await recordProcessingStats({ + return await recordProcessingStats({ knowledgeId: input.knowledgeId, repoName: input.repoName, commitHash: input.commitHash, - modelTokens: {}, + modelTokens: { + total: { + inputTokens: input.tokenUsage.inputTokens, + outputTokens: input.tokenUsage.outputTokens, + }, + }, estimatedCost, totalBatches: 1, totalFiles: input.filesAnalyzed, diff --git a/packages/ingest-github/src/pipeline/pull.ts 
b/packages/ingest-github/src/pipeline/pull.ts index ff5d0aa..176a9f5 100644 --- a/packages/ingest-github/src/pipeline/pull.ts +++ b/packages/ingest-github/src/pipeline/pull.ts @@ -184,7 +184,9 @@ export async function runPull(msg: JobMessage, pullFactory?: if (archiveSink !== undefined) { analyseChangedInput.archiveSink = archiveSink; } - await analyseChangedFiles(analyseChangedInput); + const phase1 = await analyseChangedFiles(analyseChangedInput); + let totalInputTokens = phase1.tokenUsage.inputTokens; + let totalOutputTokens = phase1.tokenUsage.outputTokens; logger.info(`pull: phase process big files starting`); throwIfCancelled(knowledgeId); @@ -192,7 +194,9 @@ export async function runPull(msg: JobMessage, pullFactory?: if (llmCallContext !== undefined) { processBigFilesInput.llmCallContext = llmCallContext; } - await processBigFilesQueue(processBigFilesInput); + const phase2 = await processBigFilesQueue(processBigFilesInput); + totalInputTokens += phase2.tokenUsage.inputTokens; + totalOutputTokens += phase2.tokenUsage.outputTokens; logger.info(`pull: phase backfill fields starting`); throwIfCancelled(knowledgeId); @@ -216,13 +220,17 @@ export async function runPull(msg: JobMessage, pullFactory?: if (llmCallContext !== undefined) { selectiveInput.llmCallContext = llmCallContext; } - await runSelectiveFolderSummary(selectiveInput); + const phase5 = await runSelectiveFolderSummary(selectiveInput); + totalInputTokens += phase5.tokenUsage.inputTokens; + totalOutputTokens += phase5.tokenUsage.outputTokens; logger.info(`pull: phase repo summary starting`); throwIfCancelled(knowledgeId); const orgId = resolveOrgId({ ...(knowledge.source.kind === "github" ? 
{} : {}) }); const scope: NodeScope = { orgId, knowledgeId, repoId: knowledgeId }; - const repoSummary = await summariseRepo(knowledgeId, metaPaths, llmCallContext); + const { summary: repoSummary, tokenUsage: repoUsage } = await summariseRepo(knowledgeId, metaPaths, llmCallContext); + totalInputTokens += repoUsage.inputTokens; + totalOutputTokens += repoUsage.outputTokens; if (repoSummary !== null) { await persistRepoSummary(metaPaths, makeRepoSummaryEnvelope(knowledgeId, orgId, repoSummary)); } @@ -238,15 +246,16 @@ export async function runPull(msg: JobMessage, pullFactory?: affectedFolders, }); - await persistPullStats({ + const stats = await persistPullStats({ knowledgeId, repoName: repoNameFromUrl(repoUrl), commitHash: targetCommit, filesAnalyzed: stored.filesUpserted, foldersSummarised: stored.foldersUpserted, processingTimeMs: Date.now() - startedAt, + tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens }, }); - await setKnowledgeCommit(knowledgeId, targetCommit); + await setKnowledgeCommit(knowledgeId, targetCommit, String(stats.inputTokens), String(stats.outputTokens)); await transitionState(knowledgeId, KnowledgeState.Processed); logger.info( `pull: ${knowledgeId} ${currentCommit.slice(0, 12)} -> ${targetCommit.slice(0, 12)} done (filesUpserted=${stored.filesUpserted} filesDeleted=${stored.filesDeleted} foldersUpserted=${stored.foldersUpserted})`, diff --git a/packages/ingest-github/src/pipeline/run.ts b/packages/ingest-github/src/pipeline/run.ts index f62e870..cced59e 100644 --- a/packages/ingest-github/src/pipeline/run.ts +++ b/packages/ingest-github/src/pipeline/run.ts @@ -131,15 +131,16 @@ async function runGithub( } const result = await strategy.execute(strategyInput); - await persistStats({ + const stats = await persistStats({ knowledgeId, repoName: repoNameFromUrl(payload.repoUrl), commitHash, filesAnalyzed: result.filesAnalyzed, foldersSummarised: result.foldersSummarised, processingTimeMs: Date.now() - startedAt, + 
tokenUsage: result.tokenUsage, }); - await setKnowledgeCommit(knowledgeId, commitHash); + await setKnowledgeCommit(knowledgeId, commitHash, String(stats.inputTokens), String(stats.outputTokens)); await transitionState(knowledgeId, KnowledgeState.Processed); const totalMs = Date.now() - startedAt; @@ -153,6 +154,7 @@ async function runGithub( repoSummarised: result.repoSummarised, graphNodesWritten: result.graphNodesWritten, commitHash, + tokenUsage: result.tokenUsage, }; } catch (cause: unknown) { if (cause instanceof CancellationError) { @@ -193,6 +195,7 @@ async function runLocal(strategy: IngestStrategy, payload: LocalIngestPayload): filesAnalyzed: result.filesAnalyzed, foldersSummarised: result.foldersSummarised, processingTimeMs: Date.now() - startedAt, + tokenUsage: result.tokenUsage, }); await transitionState(knowledgeId, KnowledgeState.Processed); return { @@ -201,6 +204,7 @@ async function runLocal(strategy: IngestStrategy, payload: LocalIngestPayload): repoSummarised: result.repoSummarised, graphNodesWritten: result.graphNodesWritten, commitHash, + tokenUsage: result.tokenUsage, }; } catch (cause: unknown) { if (cause instanceof CancellationError) { @@ -224,15 +228,21 @@ interface PersistStatsInput { filesAnalyzed: number; foldersSummarised: number; processingTimeMs: number; + tokenUsage: { inputTokens: number; outputTokens: number }; } -async function persistStats(input: PersistStatsInput): Promise { +async function persistStats(input: PersistStatsInput): Promise<{ inputTokens: number; outputTokens: number }> { const estimatedCost = await estimateCostFromBreakdown({}); - await recordProcessingStats({ + return await recordProcessingStats({ knowledgeId: input.knowledgeId, repoName: input.repoName, commitHash: input.commitHash, - modelTokens: {}, + modelTokens: { + total: { + inputTokens: input.tokenUsage.inputTokens, + outputTokens: input.tokenUsage.outputTokens, + }, + }, estimatedCost, totalBatches: 1, totalFiles: input.filesAnalyzed, diff --git 
a/packages/ingest-github/src/strategies/flat-folder/analyse-changed.ts b/packages/ingest-github/src/strategies/flat-folder/analyse-changed.ts index 6db4a7a..e488ff8 100644 --- a/packages/ingest-github/src/strategies/flat-folder/analyse-changed.ts +++ b/packages/ingest-github/src/strategies/flat-folder/analyse-changed.ts @@ -38,6 +38,7 @@ export interface AnalyseChangedResult { oversizedStubs: number; skipped: number; failed: number; + tokenUsage: { inputTokens: number; outputTokens: number }; } /** @@ -82,6 +83,8 @@ export async function analyseChangedFiles(input: AnalyseChangedInput): Promise[] = []; @@ -179,6 +182,10 @@ export async function analyseChangedFiles(input: AnalyseChangedInput): Promisez=j={;xt{kA(8LS@4b8HkJGoa=V|A_10G$|@RH#v9YPca3-p&Lr=9Os zpF|P=em zYGj4rt%=xhr@;!DqH{;`(`c7It~}KZTqNPLZ)9bbh1nu3zS_IRQ+KC)?_F&zcf@8m zrW3iNbNf*GqmPDoiQJqomg}WRs0U#BykRP{vo=wtWU5GLB~=l(=%l!{@Ap>8-_0Y0 pir4M^T>A~R_xC^QG*cxq1A;l3fkVnq`=LLm(k<=r6d{s3+F6)*q* delta 50 zcmV-20L}l~N%2FlX9lx`2KfS$wgfMev `chunks/${encodeFolder(input.relativePath)}/chunk-${i}.json`); const totalTokenCount = chunks.reduce((acc, c) => acc + c.tokenCount, 0); + + const chunkInputTokens = results.reduce((acc, r) => acc + (r.tokenUsage?.inputTokens ?? 0), 0); + const chunkOutputTokens = results.reduce((acc, r) => acc + (r.tokenUsage?.outputTokens ?? 0), 0); + const totalInputTokens = chunkInputTokens + (merged.tokenUsage?.inputTokens ?? 0); + const totalOutputTokens = chunkOutputTokens + (merged.tokenUsage?.outputTokens ?? 
0); + const manifest: HugeFileManifest = { relativePath: input.relativePath, totalChunks: chunks.length, @@ -83,6 +89,7 @@ export async function processBigFile(input: ProcessBigFileInput): Promise[] = []; for (const [folderPath, files] of groups.entries()) { if (!input.affectedFolders.has(folderPath)) { @@ -48,7 +51,9 @@ export async function runSelectiveFolderSummary( limit(async () => { try { throwIfCancelled(input.knowledgeId); - const summary = await summariseFolder(folderPath, files, input.llmCallContext); + const { summary, tokenUsage } = await summariseFolder(folderPath, files, input.llmCallContext); + totalInputTokens += tokenUsage.inputTokens; + totalOutputTokens += tokenUsage.outputTokens; if (summary !== null) { await persistFolderSummary(input.metaPaths, summary); succeeded += 1; @@ -67,5 +72,5 @@ export async function runSelectiveFolderSummary( } await Promise.all(tasks); logger.info(`pull-folder-summary done: succeeded=${succeeded} failed=${failed} skipped=${skipped}`); - return { succeeded, failed, skipped }; + return { succeeded, failed, skipped, tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens } }; } diff --git a/packages/ingest-github/src/strategies/flat-folder/folder-summary.ts b/packages/ingest-github/src/strategies/flat-folder/folder-summary.ts index 7aa3e07..f76fe14 100644 --- a/packages/ingest-github/src/strategies/flat-folder/folder-summary.ts +++ b/packages/ingest-github/src/strategies/flat-folder/folder-summary.ts @@ -40,7 +40,7 @@ export async function summariseFolder( folderPath: string, files: CondensedFileAnalysis[], llmCallContext?: AskLlmOptions, -): Promise { +): Promise<{ summary: FolderSummary | null; tokenUsage: { inputTokens: number; outputTokens: number } }> { const userPrompt = folderAnalysisUserPrompt(folderPath, files); try { const response = await askJsonLLM( @@ -50,13 +50,19 @@ export async function summariseFolder( ); if (response.result === null) { logger.warn(`summariseFolder: ${folderPath || 
""} returned unparseable JSON`); - return null; + return { + summary: null, + tokenUsage: { inputTokens: response.usage.inputTokens, outputTokens: response.usage.outputTokens }, + }; } - return shapeFolderSummary(folderPath, response.result); + return { + summary: shapeFolderSummary(folderPath, response.result), + tokenUsage: { inputTokens: response.usage.inputTokens, outputTokens: response.usage.outputTokens }, + }; } catch (cause: unknown) { const msg = cause instanceof Error ? cause.message : String(cause); logger.warn(`summariseFolder: ${folderPath || ""} askJsonLLM failed: ${msg}`); - return null; + return { summary: null, tokenUsage: { inputTokens: 0, outputTokens: 0 } }; } } @@ -92,19 +98,23 @@ export async function runFolderSummaryPhase( knowledgeId: string, metaPaths: MetaPaths, llmCallContext?: AskLlmOptions, -): Promise<{ succeeded: number; failed: number }> { +): Promise<{ succeeded: number; failed: number; tokenUsage: { inputTokens: number; outputTokens: number } }> { const concurrentWorkers = getConfigValue(Config.ConcurrentWorkers); const limit = withConcurrency(concurrentWorkers); const groups = await groupByDirectFolder(metaPaths); let succeeded = 0; let failed = 0; + let totalInputTokens = 0; + let totalOutputTokens = 0; const tasks: Promise[] = []; for (const [folderPath, files] of groups.entries()) { tasks.push( limit(async () => { try { throwIfCancelled(knowledgeId); - const summary = await summariseFolder(folderPath, files, llmCallContext); + const { summary, tokenUsage } = await summariseFolder(folderPath, files, llmCallContext); + totalInputTokens += tokenUsage.inputTokens; + totalOutputTokens += tokenUsage.outputTokens; if (summary !== null) { await persistFolderSummary(metaPaths, summary); succeeded += 1; @@ -123,7 +133,7 @@ export async function runFolderSummaryPhase( } await Promise.all(tasks); logger.info(`phase5 done: foldersSummarised=${succeeded} failed=${failed}`); - return { succeeded, failed }; + return { succeeded, failed, 
tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens } }; } function shapeFolderSummary(folderPath: string, raw: FolderSummaryJson): FolderSummary { diff --git a/packages/ingest-github/src/strategies/flat-folder/index.ts b/packages/ingest-github/src/strategies/flat-folder/index.ts index 5304318..8e0a9c6 100644 --- a/packages/ingest-github/src/strategies/flat-folder/index.ts +++ b/packages/ingest-github/src/strategies/flat-folder/index.ts @@ -36,6 +36,8 @@ export function createFlatFolderStrategy(deps: FlatFolderStrategyDeps): IngestSt phase1Input.llmCallContext = llmCallContext; } const phase1 = await classifyAndAnalyseSmall(phase1Input); + let totalInputTokens = phase1.tokenUsage.inputTokens; + let totalOutputTokens = phase1.tokenUsage.outputTokens; logger.info(`flat-folder: phase2 (process big files) starting`); throwIfCancelled(knowledgeId); @@ -44,6 +46,8 @@ export function createFlatFolderStrategy(deps: FlatFolderStrategyDeps): IngestSt phase2Input.llmCallContext = llmCallContext; } const phase2 = await processBigFilesQueue(phase2Input); + totalInputTokens += phase2.tokenUsage.inputTokens; + totalOutputTokens += phase2.tokenUsage.outputTokens; logger.info(`flat-folder: phase3 (backfill missing fields) starting`); throwIfCancelled(knowledgeId); @@ -60,10 +64,18 @@ export function createFlatFolderStrategy(deps: FlatFolderStrategyDeps): IngestSt logger.info(`flat-folder: phase5 (folder summaries) starting`); throwIfCancelled(knowledgeId); const phase5 = await runFolderSummaryPhase(knowledgeId, metaPaths, llmCallContext); + totalInputTokens += phase5.tokenUsage.inputTokens; + totalOutputTokens += phase5.tokenUsage.outputTokens; logger.info(`flat-folder: phase6 (repo summary) starting`); throwIfCancelled(knowledgeId); - const repoSummary = await summariseRepo(knowledgeId, metaPaths, llmCallContext); + const { summary: repoSummary, tokenUsage: repoUsage } = await summariseRepo( + knowledgeId, + metaPaths, + llmCallContext, + ); + 
totalInputTokens += repoUsage.inputTokens; + totalOutputTokens += repoUsage.outputTokens; let repoSummarised = false; if (repoSummary !== null) { await persistRepoSummary(metaPaths, makeRepoSummaryEnvelope(knowledgeId, orgId, repoSummary)); @@ -84,6 +96,7 @@ export function createFlatFolderStrategy(deps: FlatFolderStrategyDeps): IngestSt foldersSummarised: phase5.succeeded, repoSummarised, graphNodesWritten: phase7.nodesWritten, + tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens }, }; }, }; diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/classify-and-analyse-small.ts b/packages/ingest-github/src/strategies/flat-folder/phases/classify-and-analyse-small.ts index 3306d23..c0bd990 100644 --- a/packages/ingest-github/src/strategies/flat-folder/phases/classify-and-analyse-small.ts +++ b/packages/ingest-github/src/strategies/flat-folder/phases/classify-and-analyse-small.ts @@ -28,6 +28,7 @@ export interface ClassifyPhaseResult { bigFilesQueued: number; oversizedStubs: number; failed: number; + tokenUsage: { inputTokens: number; outputTokens: number }; } export async function classifyAndAnalyseSmall(input: ClassifyPhaseInput): Promise { @@ -38,6 +39,8 @@ export async function classifyAndAnalyseSmall(input: ClassifyPhaseInput): Promis let smallFilesAnalysed = 0; let oversizedStubs = 0; let failed = 0; + let totalInputTokens = 0; + let totalOutputTokens = 0; const repositoryHint = input.source.localRepoDir.length > 0 ? 
path.basename(input.source.localRepoDir) : input.knowledgeId; @@ -96,6 +99,10 @@ export async function classifyAndAnalyseSmall(input: ClassifyPhaseInput): Promis }); } smallFilesAnalysed += 1; + if (condensed.tokenUsage) { + totalInputTokens += condensed.tokenUsage.inputTokens; + totalOutputTokens += condensed.tokenUsage.outputTokens; + } } catch (cause: unknown) { if (cause instanceof CancellationError) { throw cause; @@ -119,6 +126,7 @@ export async function classifyAndAnalyseSmall(input: ClassifyPhaseInput): Promis bigFilesQueued: bigFileBuffer.filter((e) => e.reason === "context-window-exceeded").length, oversizedStubs, failed, + tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens }, }; } diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts b/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts index 26357af..58bcfe0 100644 --- a/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts +++ b/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts @@ -19,6 +19,7 @@ export interface ProcessBigFilesResult { cached: number; failed: number; skippedOversized: number; + tokenUsage: { inputTokens: number; outputTokens: number }; } export async function processBigFilesQueue(input: ProcessBigFilesInput): Promise { @@ -27,6 +28,8 @@ export async function processBigFilesQueue(input: ProcessBigFilesInput): Promise let cached = 0; let failed = 0; let skippedOversized = 0; + let totalInputTokens = 0; + let totalOutputTokens = 0; for (const entry of entries) { throwIfCancelled(input.knowledgeId); @@ -53,7 +56,7 @@ export async function processBigFilesQueue(input: ProcessBigFilesInput): Promise continue; } try { - await processBigFile({ + const condensed = await processBigFile({ knowledgeId: input.knowledgeId, metaPaths: input.metaPaths, relativePath: entry.relativePath, @@ -62,6 +65,10 @@ export async function processBigFilesQueue(input: 
ProcessBigFilesInput): Promise ...(input.llmCallContext !== undefined ? { llmCallContext: input.llmCallContext } : {}), }); processed += 1; + if (condensed.tokenUsage) { + totalInputTokens += condensed.tokenUsage.inputTokens; + totalOutputTokens += condensed.tokenUsage.outputTokens; + } } catch (cause: unknown) { if (cause instanceof CancellationError) { throw cause; @@ -73,7 +80,13 @@ export async function processBigFilesQueue(input: ProcessBigFilesInput): Promise logger.info( `phase2 done: processed=${processed} cached=${cached} failed=${failed} skippedOversized=${skippedOversized}`, ); - return { processed, cached, failed, skippedOversized }; + return { + processed, + cached, + failed, + skippedOversized, + tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens }, + }; } function describe(cause: unknown): string { diff --git a/packages/ingest-github/src/strategies/flat-folder/repo-summary.ts b/packages/ingest-github/src/strategies/flat-folder/repo-summary.ts index 2fe6c06..a2af39b 100644 --- a/packages/ingest-github/src/strategies/flat-folder/repo-summary.ts +++ b/packages/ingest-github/src/strategies/flat-folder/repo-summary.ts @@ -29,14 +29,16 @@ export async function summariseRepo( knowledgeId: string, metaPaths: MetaPaths, llmCallContext?: AskLlmOptions, -): Promise { +): Promise<{ summary: RepoSummary | null; tokenUsage: { inputTokens: number; outputTokens: number } }> { const folders: FolderSummary[] = []; for await (const f of iterateFolderSummaries(metaPaths)) { folders.push(f); } + let totalInputTokens = 0; + let totalOutputTokens = 0; if (folders.length === 0) { logger.warn(`phase6: no folder summaries on disk; skipping repo summary`); - return null; + return { summary: null, tokenUsage: { inputTokens: 0, outputTokens: 0 } }; } folders.sort((a, b) => a.folderPath.split("/").length - b.folderPath.split("/").length); const infos = repoFolderInfosFrom(folders); @@ -54,32 +56,56 @@ export async function summariseRepo( const partials: 
string[] = []; for (const batch of batches) { throwIfCancelled(knowledgeId); - const partial = await callRepoSummary(buildRepoPromptFromFolders(batch), llmCallContext); + const { summary: partial, tokenUsage } = await callRepoSummary(buildRepoPromptFromFolders(batch), llmCallContext); + totalInputTokens += tokenUsage.inputTokens; + totalOutputTokens += tokenUsage.outputTokens; if (partial !== null) { partials.push(JSON.stringify(partial)); } } if (partials.length === 0) { - return null; + return { summary: null, tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens } }; } if (partials.length === 1) { - return JSON.parse(partials[0] ?? "null") as RepoSummary | null; + return { + summary: JSON.parse(partials[0] ?? "null") as RepoSummary | null, + tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens }, + }; } throwIfCancelled(knowledgeId); - return await callRepoSummary(buildRepoMergePrompt(partials), llmCallContext); + const { summary: final, tokenUsage: finalUsage } = await callRepoSummary( + buildRepoMergePrompt(partials), + llmCallContext, + ); + return { + summary: final, + tokenUsage: { + inputTokens: totalInputTokens + finalUsage.inputTokens, + outputTokens: totalOutputTokens + finalUsage.outputTokens, + }, + }; } -async function callRepoSummary(userPrompt: string, llmCallContext?: AskLlmOptions): Promise { +async function callRepoSummary( + userPrompt: string, + llmCallContext?: AskLlmOptions, +): Promise<{ summary: RepoSummary | null; tokenUsage: { inputTokens: number; outputTokens: number } }> { try { const response = await askJsonLLM(REPO_SUMMARY_SYSTEM_PROMPT, userPrompt, llmCallContext ?? 
{}); if (response.result === null) { - return null; + return { + summary: null, + tokenUsage: { inputTokens: response.usage.inputTokens, outputTokens: response.usage.outputTokens }, + }; } - return shapeRepoSummary(response.result); + return { + summary: shapeRepoSummary(response.result), + tokenUsage: { inputTokens: response.usage.inputTokens, outputTokens: response.usage.outputTokens }, + }; } catch (cause: unknown) { const msg = cause instanceof Error ? cause.message : String(cause); logger.warn(`callRepoSummary: askJsonLLM failed: ${msg}`); - return null; + return { summary: null, tokenUsage: { inputTokens: 0, outputTokens: 0 } }; } } diff --git a/packages/ingest-github/src/types/big-file.ts b/packages/ingest-github/src/types/big-file.ts index b681670..d3b28e8 100644 --- a/packages/ingest-github/src/types/big-file.ts +++ b/packages/ingest-github/src/types/big-file.ts @@ -27,6 +27,7 @@ export interface ChunkAnalysisResult { endLine: number; language: string; analysis: FileAnalysis; + tokenUsage?: { inputTokens: number; outputTokens: number } | undefined; } export interface HugeFileManifest { diff --git a/packages/ingest-github/src/types/condensed-file-analysis.ts b/packages/ingest-github/src/types/condensed-file-analysis.ts index 337555b..c19d9f6 100644 --- a/packages/ingest-github/src/types/condensed-file-analysis.ts +++ b/packages/ingest-github/src/types/condensed-file-analysis.ts @@ -11,4 +11,5 @@ export interface CondensedFileAnalysis { totalTokenCount: number; analysedAt: string; analysis: FileAnalysis; + tokenUsage?: { inputTokens: number; outputTokens: number } | undefined; } diff --git a/packages/ingest-github/src/types/pipeline.ts b/packages/ingest-github/src/types/pipeline.ts index 196900e..dd8e898 100644 --- a/packages/ingest-github/src/types/pipeline.ts +++ b/packages/ingest-github/src/types/pipeline.ts @@ -23,6 +23,7 @@ export type ScanEntry = ScannedFile | OversizedFile; export interface AnalyzedFileResult { language: string; analysis: 
FileAnalysis; + tokenUsage?: { inputTokens: number; outputTokens: number } | undefined; } export interface FileAnalyzer { @@ -44,6 +45,7 @@ export interface PipelineSummary { repoSummarised: boolean; graphNodesWritten: number; commitHash: string; + tokenUsage: { inputTokens: number; outputTokens: number }; } export interface PipelineDeps { diff --git a/packages/ingest-github/src/types/strategy.ts b/packages/ingest-github/src/types/strategy.ts index 2f079b0..32e4b11 100644 --- a/packages/ingest-github/src/types/strategy.ts +++ b/packages/ingest-github/src/types/strategy.ts @@ -30,6 +30,7 @@ export interface StrategyResult { foldersSummarised: number; repoSummarised: boolean; graphNodesWritten: number; + tokenUsage: { inputTokens: number; outputTokens: number }; } export interface IngestStrategy { diff --git a/packages/llm/src/jsonClient.ts b/packages/llm/src/jsonClient.ts index 55f0034..43308f0 100644 --- a/packages/llm/src/jsonClient.ts +++ b/packages/llm/src/jsonClient.ts @@ -73,18 +73,30 @@ export async function askJsonLLM( const maxRetries = opts.maxRetries ?? 
1; const baseOpts: AskLlmOptions = { ...opts, systemPrompt }; - let lastUsage: AskLlmUsage = { model: "", inputTokens: 0, outputTokens: 0 }; + let totalInputTokens = 0; + let totalOutputTokens = 0; + let lastModel = ""; let lastRaw = ""; for (let attempt = 0; attempt <= maxRetries; attempt += 1) { const { content, usage } = await askLLM(userPrompt, baseOpts); - lastUsage = usage; + totalInputTokens += usage.inputTokens; + totalOutputTokens += usage.outputTokens; + lastModel = usage.model; lastRaw = content; const parsed = tryParseJson(content); if (parsed !== null) { - return { result: parsed, usage, raw: content }; + return { + result: parsed, + usage: { model: usage.model, inputTokens: totalInputTokens, outputTokens: totalOutputTokens }, + raw: content, + }; } } - return { result: null, usage: lastUsage, raw: lastRaw }; + return { + result: null, + usage: { model: lastModel, inputTokens: totalInputTokens, outputTokens: totalOutputTokens }, + raw: lastRaw, + }; } diff --git a/packages/llm/src/ollama.ts b/packages/llm/src/ollama.ts index 2f131f2..444b29a 100644 --- a/packages/llm/src/ollama.ts +++ b/packages/llm/src/ollama.ts @@ -2,6 +2,7 @@ import { getConfigValue } from "@bb/config"; import { Config } from "@bb/types"; import { LlmConfigError, LlmError } from "@bb/errors"; +import { tokenLen } from "./tokenizer.ts"; import type { AskLlmOptions, AskLlmResult } from "./client.ts"; interface OllamaMessage { @@ -85,8 +86,11 @@ export async function callOllama(prompt: string, opts: AskLlmOptions, timeoutMs: content, usage: { model: typeof json.model === "string" && json.model.length > 0 ? json.model : model, - inputTokens: typeof json.prompt_eval_count === "number" ? json.prompt_eval_count : 0, - outputTokens: typeof json.eval_count === "number" ? json.eval_count : 0, + inputTokens: + typeof json.prompt_eval_count === "number" + ? json.prompt_eval_count + : tokenLen((opts.systemPrompt ?? "") + prompt), + outputTokens: typeof json.eval_count === "number" ? 
json.eval_count : tokenLen(content), }, }; } diff --git a/packages/llm/src/openrouter.ts b/packages/llm/src/openrouter.ts index 6f90228..8410f52 100644 --- a/packages/llm/src/openrouter.ts +++ b/packages/llm/src/openrouter.ts @@ -2,6 +2,7 @@ import { getConfigValue } from "@bb/config"; import { Config } from "@bb/types"; import { LlmConfigError, LlmError } from "@bb/errors"; +import { tokenLen } from "./tokenizer.ts"; import type { AskLlmOptions, AskLlmResult } from "./client.ts"; const OPENROUTER_URL = "https://openrouter.ai/api/v1/chat/completions"; @@ -94,8 +95,12 @@ export async function callOpenRouter(prompt: string, opts: AskLlmOptions, timeou content, usage: { model: typeof json.model === "string" && json.model.length > 0 ? json.model : model, - inputTokens: typeof json.usage?.prompt_tokens === "number" ? json.usage.prompt_tokens : 0, - outputTokens: typeof json.usage?.completion_tokens === "number" ? json.usage.completion_tokens : 0, + inputTokens: + typeof json.usage?.prompt_tokens === "number" + ? json.usage.prompt_tokens + : tokenLen((opts.systemPrompt ?? "") + prompt), + outputTokens: + typeof json.usage?.completion_tokens === "number" ? json.usage.completion_tokens : tokenLen(content), }, }; } diff --git a/packages/mongo/src/knowledge.ts b/packages/mongo/src/knowledge.ts index c1bfb15..e3e042c 100644 --- a/packages/mongo/src/knowledge.ts +++ b/packages/mongo/src/knowledge.ts @@ -26,14 +26,21 @@ export async function setKnowledgeState(knowledgeId: string, state: KnowledgeSta * * Throws `KnowledgeNotFoundError` if the document doesn't exist. 
*/ -export async function setKnowledgeCommit(knowledgeId: string, commitHash: string): Promise { +export async function setKnowledgeCommit( + knowledgeId: string, + commitHash: string, + inputTokens: string = "", + outputTokens: string = "", +): Promise { const result = await _getDb() .collection(Collections.Knowledge) .updateOne( { knowledgeId }, { $set: { "source.commitId": commitHash, updatedAt: new Date() }, - $addToSet: { "source.commitHashes": commitHash }, + $addToSet: { + "source.commitHashes": { hash: commitHash, inputTokens, outputTokens }, + }, }, ); if (result.matchedCount === 0) { diff --git a/packages/mongo/src/processingStats.ts b/packages/mongo/src/processingStats.ts index b7034e6..fc320b0 100644 --- a/packages/mongo/src/processingStats.ts +++ b/packages/mongo/src/processingStats.ts @@ -25,7 +25,9 @@ export interface RecordProcessingStatsInput { const COST_UNKNOWN = -1; -export async function recordProcessingStats(input: RecordProcessingStatsInput): Promise { +export async function recordProcessingStats( + input: RecordProcessingStatsInput, +): Promise<{ inputTokens: number; outputTokens: number }> { const now = new Date(); const totals = sumModelTokens(input.modelTokens); await _getDb() @@ -54,6 +56,7 @@ export async function recordProcessingStats(input: RecordProcessingStatsInput): }, { upsert: true }, ); + return totals; } export async function aggregateStats(): Promise { diff --git a/packages/types/src/knowledge.ts b/packages/types/src/knowledge.ts index 6537ba5..6c922fd 100644 --- a/packages/types/src/knowledge.ts +++ b/packages/types/src/knowledge.ts @@ -7,12 +7,18 @@ export enum KnowledgeState { Failed = "FAILED", } +export interface CommitHashRecord { + hash: string; + inputTokens: string; + outputTokens: string; +} + export interface GithubKnowledgeSource { kind: "github"; /** Current head pointer — the most recently indexed commit. */ commitId?: string; /** Every commit this knowledge has been indexed at, oldest → newest. 
Pull appends to this list. */ - commitHashes?: string[]; + commitHashes?: (string | CommitHashRecord)[]; } export interface LocalKnowledgeSource { From 949a839b350e393947331b42ca6137e093fa8ea7 Mon Sep 17 00:00:00 2001 From: Dead-Bytes <143434285+Dead-Bytes@users.noreply.github.com> Date: Fri, 15 May 2026 10:51:36 +0530 Subject: [PATCH 16/34] fix(docker-compose): update port mappings for mongo, neo4j, and redis services --- infra/docker/docker-compose.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/infra/docker/docker-compose.yml b/infra/docker/docker-compose.yml index c73be99..3fdc22d 100644 --- a/infra/docker/docker-compose.yml +++ b/infra/docker/docker-compose.yml @@ -4,7 +4,7 @@ services: container_name: bytebell-mongo restart: unless-stopped ports: - - "127.0.0.1:27017:27017" + - "127.0.0.1:27117:27017" volumes: - mongo_data:/data/db environment: @@ -26,7 +26,7 @@ services: restart: unless-stopped ports: - "127.0.0.1:7474:7474" - - "127.0.0.1:7687:7687" + - "127.0.0.1:7787:7687" volumes: - neo4j_data:/data environment: @@ -47,7 +47,7 @@ services: container_name: bytebell-redis restart: unless-stopped ports: - - "127.0.0.1:6379:6379" + - "127.0.0.1:6479:6379" volumes: - redis_data:/data command: ["redis-server", "--appendonly", "yes"] From e1e005c73f1c3690a3e9bb031896e8d0e0427b30 Mon Sep 17 00:00:00 2001 From: Dead-Bytes <143434285+Dead-Bytes@users.noreply.github.com> Date: Fri, 15 May 2026 11:21:43 +0530 Subject: [PATCH 17/34] feat(progress): enhance progress reporting in ingestion pipeline with new phases and context --- packages/ingest-github/src/index.ts | 6 +- packages/ingest-github/src/pipeline/README.md | 12 ++- packages/ingest-github/src/pipeline/run.ts | 20 +++- packages/ingest-github/src/progress/README.md | 2 +- packages/ingest-github/src/progress/types.ts | 2 +- .../src/strategies/flat-folder/README.md | 10 +- .../strategies/flat-folder/folder-summary.ts | 55 +++++++---- .../src/strategies/flat-folder/index.ts | 3 +- 
.../flat-folder/phases/store-flat-analysis.ts | 96 ++++++++++++------- 9 files changed, 141 insertions(+), 65 deletions(-) diff --git a/packages/ingest-github/src/index.ts b/packages/ingest-github/src/index.ts index e777df6..a00945e 100644 --- a/packages/ingest-github/src/index.ts +++ b/packages/ingest-github/src/index.ts @@ -35,7 +35,11 @@ function buildRunner( buildUserPrompt: buildFileAnalysisUserPrompt, }); const strategy = createFlatFolderStrategy({ fileAnalyzer, progressContextFactory }); - const runnerDeps: Parameters[0] = { reposRootDir: reposRoot(), strategy }; + const runnerDeps: Parameters[0] = { + reposRootDir: reposRoot(), + strategy, + progressContextFactory, + }; if (sourceFactory !== undefined) { runnerDeps.sourceFactory = sourceFactory; } diff --git a/packages/ingest-github/src/pipeline/README.md b/packages/ingest-github/src/pipeline/README.md index 97b7192..d39eeb6 100644 --- a/packages/ingest-github/src/pipeline/README.md +++ b/packages/ingest-github/src/pipeline/README.md @@ -43,7 +43,7 @@ true` (default). Consumed by `scan.ts` via the optional `skipDecider` unknown-extension gate uses per-job credentials. `readScannedFile` re-reads a file by absolute path for the big-file phase which streams content lazily. -- `run.ts` — `createPipelineRunner({ reposRootDir, strategy, sourceFactory? })` +- `run.ts` — `createPipelineRunner({ reposRootDir, strategy, sourceFactory?, progressContextFactory? })` builds an `IngestRunnerDeps`. GitHub payloads run: branch resolve, source-reader construction, strategy execute, commit persistence. Local payloads skip the clone. The source reader is chosen by the optional @@ -61,7 +61,15 @@ true` (default). Consumed by `scan.ts` via the optional `skipDecider` llmCallContext`, which every LLM call site downstream consumes. State transitions (`CREATED → QUEUED → INGESTED → …`) are persisted to Mongo - Neo4j via `transitionState`, and `CancellationError` is re-thrown - without flipping to FAILED. 
+ without flipping to FAILED. The optional `progressContextFactory` + is the runner's own `ProgressContext` source: `runGithub` emits + `phaseChanged("clone")` before `syncRepository` (or before the + `sourceFactory` call) and `phaseChanged("scan")` before invoking + `strategy.execute`, so SSE clients see liveness during the + network/disk-bound prelude. On a non-`CancellationError` throw the + runner emits `failed(message)` only when the strategy has not yet + started — once `strategy.execute` is reached, the strategy owns + terminal emission and the runner stays silent to avoid double-FAILED. - `pull.ts` — `runPull(msg, pullFactory?, progressContextFactory?)` orchestrates the pull job. Reads `repoUrl` and `branch` directly off `knowledge.info.*` (loaded via `@bb/mongo.getKnowledge`). The `KnowledgeSource` discriminator (`kind`) is diff --git a/packages/ingest-github/src/pipeline/run.ts b/packages/ingest-github/src/pipeline/run.ts index f62e870..eb23021 100644 --- a/packages/ingest-github/src/pipeline/run.ts +++ b/packages/ingest-github/src/pipeline/run.ts @@ -8,6 +8,8 @@ import { logger } from "@bb/logger"; import type { IngestRunnerDeps, IngestRunnerInput } from "src/types/ingest-runner.ts"; import type { IngestStrategy } from "src/types/strategy.ts"; import type { ArchiveSink, PipelineSummary, SourceFactory, SourceReader } from "src/types/pipeline.ts"; +import type { ProgressContextFactory } from "src/progress/types.ts"; +import { nullProgressContextFactory } from "src/progress/NullProgressReporter.ts"; import { ensureMetaDirs, ensureReposRoot, metaPathsFor, repoCloneDir } from "./paths.ts"; import { readHeadCommitHash, syncRepository } from "./source.ts"; import { resolveBranch } from "./branch.ts"; @@ -49,16 +51,23 @@ export interface CreatePipelineRunnerDeps { * supplies one. */ sourceFactory?: SourceFactory; + /** + * Optional progress context factory. 
When provided, the runner emits + * pre-strategy phase changes (`clone`, `scan`) so SSE clients see liveness + * during the network/disk-bound prelude. Defaults to a no-op. + */ + progressContextFactory?: ProgressContextFactory; } export function createPipelineRunner(deps: CreatePipelineRunnerDeps): IngestRunnerDeps { + const progressContextFactory = deps.progressContextFactory ?? nullProgressContextFactory; return { reposRootDir: deps.reposRootDir, strategy: deps.strategy, run: async (input: IngestRunnerInput): Promise => { const payload = input.payload; if (isGithubPayload(payload)) { - return await runGithub(deps.strategy, payload, deps.sourceFactory); + return await runGithub(deps.strategy, payload, deps.sourceFactory, progressContextFactory); } return await runLocal(deps.strategy, payload); }, @@ -69,11 +78,14 @@ async function runGithub( strategy: IngestStrategy, payload: GithubIndexPayload, sourceFactory: SourceFactory | undefined, + progressContextFactory: ProgressContextFactory, ): Promise { const { knowledgeId } = payload; clearCancellation(knowledgeId); const startedAt = Date.now(); await transitionState(knowledgeId, KnowledgeState.Processing); + const progressContext = progressContextFactory(knowledgeId); + let strategyStarted = false; try { throwIfCancelled(knowledgeId); const branch = resolveBranch(knowledgeId, payload); @@ -82,6 +94,7 @@ async function runGithub( let archiveSink: ArchiveSink | undefined; let commitHash: string; + progressContext.phaseChanged("clone"); if (sourceFactory !== undefined) { const factoryResult = await sourceFactory({ knowledgeId, payload, branch }); source = factoryResult.source; @@ -107,6 +120,7 @@ async function runGithub( source = createDiskSourceReader({ repoDir, commitHash }); } + progressContext.phaseChanged("scan"); const metaPaths = metaPathsFor(knowledgeId); await ensureMetaDirs(metaPaths); @@ -129,6 +143,7 @@ async function runGithub( if (archiveSink !== undefined) { strategyInput.archiveSink = archiveSink; } + 
strategyStarted = true; const result = await strategy.execute(strategyInput); await persistStats({ @@ -161,6 +176,9 @@ async function runGithub( throw cause; } await transitionState(knowledgeId, KnowledgeState.Failed).catch(() => undefined); + if (!strategyStarted) { + progressContext.failed(describe(cause)); + } throw new IngestError(knowledgeId, `github_index pipeline failed: ${describe(cause)}`, cause); } } diff --git a/packages/ingest-github/src/progress/README.md b/packages/ingest-github/src/progress/README.md index cfed8d0..e6d1013 100644 --- a/packages/ingest-github/src/progress/README.md +++ b/packages/ingest-github/src/progress/README.md @@ -15,7 +15,7 @@ A host binary supplies a `ProgressContextFactory(knowledgeId)`. `@bb/server` doe ## Public API -- `ProgressPhase` — `"file_analysis" | "folder_analysis" | "indexing"` +- `ProgressPhase` — `"clone" | "scan" | "file_analysis" | "folder_analysis" | "indexing"`. `clone` and `scan` are emitted by `runGithub` (the runner) before the strategy starts, so SSE clients see liveness during the network/disk-bound prelude. `file_analysis`, `folder_analysis`, and `indexing` are emitted by the strategy. - `ProgressTotalMode` — `{ kind: "fixed"; total }` or `{ kind: "growing"; initialTotal? }` - `ProgressReporterInput` — phase + sub-phase + total mode + optional restart-seed hook - `ProgressReporter` — `start / increment / incrementSeen / setTotal / stop` diff --git a/packages/ingest-github/src/progress/types.ts b/packages/ingest-github/src/progress/types.ts index 0f7d65c..99744eb 100644 --- a/packages/ingest-github/src/progress/types.ts +++ b/packages/ingest-github/src/progress/types.ts @@ -7,7 +7,7 @@ * no-outbound-calls posture. 
*/ -export type ProgressPhase = "file_analysis" | "folder_analysis" | "indexing"; +export type ProgressPhase = "clone" | "scan" | "file_analysis" | "folder_analysis" | "indexing"; export type ProgressTotalMode = { kind: "fixed"; total: number } | { kind: "growing"; initialTotal?: number }; diff --git a/packages/ingest-github/src/strategies/flat-folder/README.md b/packages/ingest-github/src/strategies/flat-folder/README.md index 624534e..8d26d9d 100644 --- a/packages/ingest-github/src/strategies/flat-folder/README.md +++ b/packages/ingest-github/src/strategies/flat-folder/README.md @@ -37,8 +37,11 @@ The strategy emits progress through the `ProgressContext` port defined in `progressContextFactory`; absent → `nullProgressContextFactory` (no-op, OSS default). -- **Boundary events** are emitted by `index.ts`: - - `phaseChanged("file_analysis")` before phase 1 +- **Boundary events** are split between the runner and the strategy: + - `phaseChanged("clone")` and `phaseChanged("scan")` are emitted by + `pipeline/run.ts` (the runner) before `strategy.execute` is called, + so the SSE stream stays alive during the network/disk-bound prelude. + - `phaseChanged("file_analysis")` is emitted by `index.ts` before phase 1 - `phaseChanged("folder_analysis")` before phase 5 - `phaseChanged("indexing")` before phase 6 (which feeds phase 7) - `completed()` after phase 7 returns @@ -49,6 +52,9 @@ The strategy emits progress through the `ProgressContext` port defined in - phase 2 → `big_files_queue`; inner `processBigFile` adds `big_file:` for chunk pulses - phase 3 → `backfill`; phase 4 → `backfill:big_files` + - phase 5 → no sub-phase, fixed total = directly-grouped folder count + - phase 7 → `folders` then `files`, both `growing` (drained from + on-disk async generators) - **Total mode**: phase 1, phase 3, and any other streaming-iterator loop use `total: { kind: "growing" }` (denominator grows as `source.scan` yields). 
Phases 2 and 4, plus the big-file chunk pool, know their size diff --git a/packages/ingest-github/src/strategies/flat-folder/folder-summary.ts b/packages/ingest-github/src/strategies/flat-folder/folder-summary.ts index 7aa3e07..66472d2 100644 --- a/packages/ingest-github/src/strategies/flat-folder/folder-summary.ts +++ b/packages/ingest-github/src/strategies/flat-folder/folder-summary.ts @@ -9,6 +9,7 @@ import type { MetaPaths } from "src/types/meta-paths.ts"; import { encodeMetaPath } from "src/pipeline/paths.ts"; import { withConcurrency } from "src/pipeline/concurrency.ts"; import { throwIfCancelled, CancellationError } from "src/pipeline/cancellation.ts"; +import type { ProgressContext } from "src/progress/types.ts"; import { iterateCondensed } from "./big-file/storage.ts"; import { directFolderOf } from "./folder-path.ts"; import { FOLDER_ANALYSIS_SYSTEM_PROMPT, folderAnalysisUserPrompt } from "./prompts/folder-summary.ts"; @@ -92,36 +93,48 @@ export async function runFolderSummaryPhase( knowledgeId: string, metaPaths: MetaPaths, llmCallContext?: AskLlmOptions, + progressContext?: ProgressContext, ): Promise<{ succeeded: number; failed: number }> { const concurrentWorkers = getConfigValue(Config.ConcurrentWorkers); const limit = withConcurrency(concurrentWorkers); const groups = await groupByDirectFolder(metaPaths); let succeeded = 0; let failed = 0; - const tasks: Promise[] = []; - for (const [folderPath, files] of groups.entries()) { - tasks.push( - limit(async () => { - try { - throwIfCancelled(knowledgeId); - const summary = await summariseFolder(folderPath, files, llmCallContext); - if (summary !== null) { - await persistFolderSummary(metaPaths, summary); - succeeded += 1; - } else { + const reporter = progressContext?.reporter({ + phase: "folder_analysis", + total: { kind: "fixed", total: groups.size }, + }); + await reporter?.start(); + try { + const tasks: Promise[] = []; + for (const [folderPath, files] of groups.entries()) { + tasks.push( + 
limit(async () => { + try { + throwIfCancelled(knowledgeId); + const summary = await summariseFolder(folderPath, files, llmCallContext); + if (summary !== null) { + await persistFolderSummary(metaPaths, summary); + succeeded += 1; + } else { + failed += 1; + } + } catch (cause: unknown) { + if (cause instanceof CancellationError) { + throw cause; + } failed += 1; + logger.warn(`phase5: folder summary failed for ${folderPath || ""}`); + } finally { + reporter?.increment(1, { fileName: folderPath || "" }); } - } catch (cause: unknown) { - if (cause instanceof CancellationError) { - throw cause; - } - failed += 1; - logger.warn(`phase5: folder summary failed for ${folderPath || ""}`); - } - }), - ); + }), + ); + } + await Promise.all(tasks); + } finally { + reporter?.stop(); } - await Promise.all(tasks); logger.info(`phase5 done: foldersSummarised=${succeeded} failed=${failed}`); return { succeeded, failed }; } diff --git a/packages/ingest-github/src/strategies/flat-folder/index.ts b/packages/ingest-github/src/strategies/flat-folder/index.ts index dd47955..d82ea49 100644 --- a/packages/ingest-github/src/strategies/flat-folder/index.ts +++ b/packages/ingest-github/src/strategies/flat-folder/index.ts @@ -79,7 +79,7 @@ export function createFlatFolderStrategy(deps: FlatFolderStrategyDeps): IngestSt progressContext.phaseChanged("folder_analysis"); logger.info(`flat-folder: phase5 (folder summaries) starting`); throwIfCancelled(knowledgeId); - const phase5 = await runFolderSummaryPhase(knowledgeId, metaPaths, llmCallContext); + const phase5 = await runFolderSummaryPhase(knowledgeId, metaPaths, llmCallContext, progressContext); progressContext.phaseChanged("indexing"); logger.info(`flat-folder: phase6 (repo summary) starting`); @@ -98,6 +98,7 @@ export function createFlatFolderStrategy(deps: FlatFolderStrategyDeps): IngestSt payload, branch, metaPaths, + progressContext, }); progressContext.completed(); diff --git 
a/packages/ingest-github/src/strategies/flat-folder/phases/store-flat-analysis.ts b/packages/ingest-github/src/strategies/flat-folder/phases/store-flat-analysis.ts index d6e14b7..b700986 100644 --- a/packages/ingest-github/src/strategies/flat-folder/phases/store-flat-analysis.ts +++ b/packages/ingest-github/src/strategies/flat-folder/phases/store-flat-analysis.ts @@ -8,6 +8,7 @@ import { iterateCondensed } from "src/strategies/flat-folder/big-file/storage.ts import { iterateFolderSummaries } from "src/strategies/flat-folder/folder-summary.ts"; import { directFolderOf } from "src/strategies/flat-folder/folder-path.ts"; import { languageFromPath } from "src/adapters/llm-file-analyzer.ts"; +import type { ProgressContext } from "src/progress/types.ts"; import type { FolderSummary, RepoSummary, RepoSummaryEnvelope } from "src/strategies/flat-folder/types.ts"; export interface StoreFlatAnalysisInput { @@ -15,6 +16,7 @@ export interface StoreFlatAnalysisInput { payload: GithubIndexPayload; branch: string; metaPaths: MetaPaths; + progressContext?: ProgressContext; } export interface StoreFlatAnalysisResult { @@ -59,48 +61,72 @@ export async function storeFlatAnalysis(input: StoreFlatAnalysisInput): Promise< nodesWritten += 1; } + const folderReporter = input.progressContext?.reporter({ + phase: "indexing", + subPhase: "folders", + total: { kind: "growing" }, + }); + await folderReporter?.start(); const folderPaths = new Set(); - for await (const folder of iterateFolderSummaries(input.metaPaths)) { - throwIfCancelled(input.scope.knowledgeId); - await upsertFolderNode({ - scope: input.scope, - folderPath: folder.folderPath, - summary: shapeFolderPayload(folder), - }); - folderPaths.add(folder.folderPath); - foldersWritten += 1; - nodesWritten += 1; - } - - for await (const file of iterateCondensed(input.metaPaths)) { - throwIfCancelled(input.scope.knowledgeId); - const folderPath = directFolderOf(file.relativePath); - if (!folderPaths.has(folderPath)) { + try { + for await 
(const folder of iterateFolderSummaries(input.metaPaths)) { + throwIfCancelled(input.scope.knowledgeId); + folderReporter?.incrementSeen(); await upsertFolderNode({ scope: input.scope, - folderPath, - summary: emptyFolderPayload(), + folderPath: folder.folderPath, + summary: shapeFolderPayload(folder), }); - folderPaths.add(folderPath); + folderPaths.add(folder.folderPath); foldersWritten += 1; nodesWritten += 1; + folderReporter?.increment(1, { fileName: folder.folderPath || "" }); } - await upsertFileNode({ - orgId: input.scope.orgId, - knowledgeId: input.scope.knowledgeId, - repoId: input.scope.repoId, - relativePath: file.relativePath, - folderPath, - language: file.language.length > 0 ? file.language : languageFromPath(file.relativePath), - sha: file.sha256, - sizeBytes: file.sizeBytes, - analysis: file.analysis, - isBigFile: file.isBigFile, - totalChunks: file.totalChunks, - totalTokenCount: file.totalTokenCount, - }); - filesWritten += 1; - nodesWritten += 1; + } finally { + folderReporter?.stop(); + } + + const fileReporter = input.progressContext?.reporter({ + phase: "indexing", + subPhase: "files", + total: { kind: "growing" }, + }); + await fileReporter?.start(); + try { + for await (const file of iterateCondensed(input.metaPaths)) { + throwIfCancelled(input.scope.knowledgeId); + fileReporter?.incrementSeen(); + const folderPath = directFolderOf(file.relativePath); + if (!folderPaths.has(folderPath)) { + await upsertFolderNode({ + scope: input.scope, + folderPath, + summary: emptyFolderPayload(), + }); + folderPaths.add(folderPath); + foldersWritten += 1; + nodesWritten += 1; + } + await upsertFileNode({ + orgId: input.scope.orgId, + knowledgeId: input.scope.knowledgeId, + repoId: input.scope.repoId, + relativePath: file.relativePath, + folderPath, + language: file.language.length > 0 ? 
file.language : languageFromPath(file.relativePath), + sha: file.sha256, + sizeBytes: file.sizeBytes, + analysis: file.analysis, + isBigFile: file.isBigFile, + totalChunks: file.totalChunks, + totalTokenCount: file.totalTokenCount, + }); + filesWritten += 1; + nodesWritten += 1; + fileReporter?.increment(1, { fileName: file.relativePath }); + } + } finally { + fileReporter?.stop(); } logger.info(`phase7 done: nodesWritten=${nodesWritten} folders=${foldersWritten} files=${filesWritten}`); From aa0c347e4e3e03568caa6618a552d4548ea186de Mon Sep 17 00:00:00 2001 From: Lovanshu garg Date: Fri, 15 May 2026 12:39:18 +0530 Subject: [PATCH 18/34] Update interactive option logic in LsCommand --- packages/cli/src/LsCommand.ts | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/packages/cli/src/LsCommand.ts b/packages/cli/src/LsCommand.ts index c58f9ee..5da25cf 100644 --- a/packages/cli/src/LsCommand.ts +++ b/packages/cli/src/LsCommand.ts @@ -15,7 +15,8 @@ export function buildLsCommand(): Command { const cmd = new Command("ls"); cmd .description("List indexed knowledge entries.") - .option("-i, --interactive", "Use interactive selector to browse entries.") + .option("-i, --interactive", "Use interactive selector to browse entries.", true) + .option("--no-interactive", "Display a plain table instead of the interactive selector.") .action(runLs); return cmd; } @@ -42,7 +43,7 @@ async function runLs(options: { interactive?: boolean }): Promise { return; } - if (options.interactive === true) { + if (options.interactive !== false) { await promptLsInteractive(repos); return; } From 4430efc7a8bfe622ed04b8904e5ab2468d4c5ae8 Mon Sep 17 00:00:00 2001 From: lovanshu garg Date: Mon, 18 May 2026 12:31:37 +0530 Subject: [PATCH 19/34] feat(ls): interactive mode by default --- bun.lock | 1 + packages/ingest-github/types/index.d.ts | 50 ++++++++++++++++++++++--- 2 files changed, 45 insertions(+), 6 deletions(-) diff --git a/bun.lock b/bun.lock index 5b75687..4b0f46d 100644 
--- a/bun.lock +++ b/bun.lock @@ -76,6 +76,7 @@ "dependencies": { "@bb/config": "workspace:*", "@bb/errors": "workspace:*", + "@bb/logger": "workspace:*", "@bb/mongo": "workspace:*", "@bb/types": "workspace:*", "tiktoken": "^1.0.22", diff --git a/packages/ingest-github/types/index.d.ts b/packages/ingest-github/types/index.d.ts index ac23419..2c360af 100644 --- a/packages/ingest-github/types/index.d.ts +++ b/packages/ingest-github/types/index.d.ts @@ -14,9 +14,24 @@ export declare const createGithubIngestHandler: (...args: any[]) => any; export declare const createLocalIngestHandler: (...args: any[]) => any; export declare const runPull: (...args: any[]) => any; export declare const reposRoot: (...args: any[]) => string; -export declare const fetchLatestCommitHash: (...args: any[]) => any; -export declare const fetchRecentCommits: (...args: any[]) => any; -export declare const parseGithubRepo: (...args: any[]) => any; +export declare function fetchLatestCommitHash( + repoUrl: string, + branch: string, + gitToken?: string, +): Promise; +export declare function fetchRecentCommits( + repoUrl: string, + branch: string, + limit?: number, + gitToken?: string, +): Promise; +export declare function fetchDefaultBranch(repoUrl: string, gitToken?: string): Promise; +export declare function fetchBranches( + repoUrl: string, + gitToken?: string, + limit?: number, +): Promise<{ status: "ok"; branches: string[] } | { status: "error"; message: string }>; +export declare function parseGithubRepo(repoUrl: string): ParsedRepo | null; export interface BootstrapRuntimeOptions { config: unknown; @@ -53,6 +68,29 @@ export type PullFactoryResult = any; export type DiffResult = any; export type RenamedFile = any; export type CondensedFileAnalysis = any; -export type CommitEntry = any; -export type FetchCommitsResult = any; -export type ParsedRepo = any; +export interface CommitEntry { + sha: string; + message: string; + author: string; + timestamp: string; +} + +export type 
FetchCommitsResult = + | { status: "ok"; commits: CommitEntry[] } + | { status: "not_found" } + | { status: "unauthorized" } + | { status: "rate_limited" } + | { status: "error"; message: string }; + +export interface ParsedRepo { + owner: string; + repo: string; + branch?: string; +} + +export type DefaultBranchResult = + | { status: "ok"; branch: string } + | { status: "not_found" } + | { status: "unauthorized" } + | { status: "rate_limited" } + | { status: "error"; message: string }; From 47ff55b5b5612966d2373cbddf3e953ce6f7a37f Mon Sep 17 00:00:00 2001 From: Dead-Bytes <143434285+Dead-Bytes@users.noreply.github.com> Date: Mon, 18 May 2026 17:28:07 +0530 Subject: [PATCH 20/34] feat(config): add setBytebellHomeResolver for dynamic home directory resolution --- packages/config/README.md | 12 ++++++++++-- packages/config/src/README.md | 8 +++++--- packages/config/src/index.ts | 8 +++++++- packages/config/src/paths.ts | 18 ++++++++++++++++++ 4 files changed, 40 insertions(+), 6 deletions(-) diff --git a/packages/config/README.md b/packages/config/README.md index 950de59..97e74bc 100644 --- a/packages/config/README.md +++ b/packages/config/README.md @@ -44,10 +44,17 @@ function seedConfig(value: unknown): BytebellConfig function __isSeeded(): boolean class ConfigSeededError extends Error +function setBytebellHomeResolver(fn: (() => string | null) | null): void + function __resetSeedForTests(): void // test-only function __setBytebellHomeForTests(home: string | null): void // test-only ``` +`setBytebellHomeResolver` registers an override function invoked on every +`getBytebellHome()` call (no caching). The resolver returns the home directory +to use for the current invocation, or `null` to fall through to the +`~/.bytebell` default. Pass `null` to clear. + `seedConfig` injects a pre-parsed config object into the in-memory cache, validated through `configSchema.parse`. 
When seeded, `loadConfig()` returns the seeded values and **does not** call `ensureBytebellHome()` or read @@ -89,8 +96,9 @@ This package does **not** own: 1. **No env var reads.** Source files contain no `process.env` references. Enforced at lint time ([eslint.config.mjs:71-94](../../eslint.config.mjs#L71-L94)). -2. **No `.env` / `dotenv` / `BYTEBELL_HOME`.** The only test seam is the - programmatic `__setBytebellHomeForTests`. +2. **No `.env` / `dotenv` / `BYTEBELL_HOME`.** Programmatic override seams + are `__setBytebellHomeForTests` (test-only, static) and + `setBytebellHomeResolver` (per-call function). 3. **Strict schema.** Unknown keys in `config.json` cause `loadConfig()` to throw — typo defense. 4. **Defaults always present.** `loadConfig()` never returns a partial config; diff --git a/packages/config/src/README.md b/packages/config/src/README.md index 6ab129e..221e1e2 100644 --- a/packages/config/src/README.md +++ b/packages/config/src/README.md @@ -8,9 +8,11 @@ package-level contract; this file documents how the source tree is split. - **[index.ts](index.ts)** — public re-exports. The only entry point other packages may import. Anything not re-exported here is internal. - **[paths.ts](paths.ts)** — `getBytebellHome`, `getConfigPath`, and the - cache-invalidator registry. Holds the `testHomeOverride` state used by - `__setBytebellHomeForTests`. Pure: imports nothing from the rest of the - package. + cache-invalidator registry. Holds the `testHomeOverride` slot set by + `__setBytebellHomeForTests` and the `homeResolver` slot set by + `setBytebellHomeResolver`. `getBytebellHome` consults the test override + first, then the resolver (if set and returning non-null), then falls back + to `~/.bytebell`. Pure: imports nothing from the rest of the package. 
- **[schema.ts](schema.ts)** — Zod `configSchema`, `BytebellConfig` type, `ConfigValueMap`, `DEFAULT_CONFIG`, `REQUIRED_KEYS` (infra-always), `requiredKeysFor(provider)` (combines infra + provider-specific keys diff --git a/packages/config/src/index.ts b/packages/config/src/index.ts index c60e1aa..b4cefb3 100644 --- a/packages/config/src/index.ts +++ b/packages/config/src/index.ts @@ -6,4 +6,10 @@ export type { ConfigCompletenessResult } from "./loader.ts"; export { setConfigValue, ensureBytebellHome, ConfigSeededError } from "./writer.ts"; -export { getBytebellHome, getConfigPath, isDevMode, __setBytebellHomeForTests } from "./paths.ts"; +export { + getBytebellHome, + getConfigPath, + isDevMode, + setBytebellHomeResolver, + __setBytebellHomeForTests, +} from "./paths.ts"; diff --git a/packages/config/src/paths.ts b/packages/config/src/paths.ts index b93876e..b9d02f8 100644 --- a/packages/config/src/paths.ts +++ b/packages/config/src/paths.ts @@ -2,15 +2,33 @@ import os from "node:os"; import path from "node:path"; let testHomeOverride: string | null = null; +let homeResolver: (() => string | null) | null = null; const cacheInvalidators: Array<() => void> = []; export function getBytebellHome(): string { if (testHomeOverride !== null) { return testHomeOverride; } + if (homeResolver !== null) { + const resolved = homeResolver(); + if (resolved !== null) { + return resolved; + } + } return path.join(os.homedir(), ".bytebell"); } +/** + * Register an override resolver for `getBytebellHome()`. The resolver runs on + * every call (no caching) so it may return different values across invocations. + * Returning `null` falls through to the `~/.bytebell` default. Pass `null` to + * clear the resolver. 
+ */ +export function setBytebellHomeResolver(fn: (() => string | null) | null): void { + homeResolver = fn; + __notifyConfigChanged(); +} + export function getConfigPath(): string { return path.join(getBytebellHome(), "config.json"); } From 524e0e9b45aef61e6262771a4a37f43c6ec2fcc8 Mon Sep 17 00:00:00 2001 From: Dead-Bytes <143434285+Dead-Bytes@users.noreply.github.com> Date: Mon, 18 May 2026 18:53:28 +0530 Subject: [PATCH 21/34] refactor(code): resolved type issues clearly --- packages/cli/package.json | 3 + packages/cli/tsconfig.json | 16 +- packages/config/package.json | 3 + packages/config/tsconfig.json | 11 +- packages/errors/package.json | 3 + packages/errors/tsconfig.json | 9 +- packages/ingest-business-context/README.md | 54 +++++ packages/ingest-business-context/package.json | 24 +++ .../ingest-business-context/src/README.md | 37 ++++ .../src/disk/README.md | 17 ++ .../src/disk/load-cached.ts | 41 ++++ .../src/disk/sanitize-title.ts | 25 +++ .../src/disk/save-analysis.ts | 48 +++++ .../src/disk/save-original.ts | 25 +++ .../ingest-business-context/src/errors.ts | 33 +++ .../ingest-business-context/src/field-defs.ts | 203 ++++++++++++++++++ packages/ingest-business-context/src/index.ts | 33 +++ .../ingest-business-context/src/llm/README.md | 16 ++ .../src/llm/analyze-parallel.ts | 111 ++++++++++ .../src/llm/call-builder.ts | 46 ++++ .../src/llm/enrichment-format.ts | 88 ++++++++ .../src/llm/enrichment-reader.ts | 147 +++++++++++++ .../ingest-business-context/src/llm/merge.ts | 40 ++++ .../ingest-business-context/src/llm/title.ts | 50 +++++ .../src/neo4j/README.md | 33 +++ .../src/neo4j/indexes.ts | 24 +++ .../src/neo4j/relationship-types.ts | 19 ++ .../src/neo4j/serialize.ts | 8 + .../src/neo4j/write-keywords.ts | 60 ++++++ .../src/neo4j/write-node.ts | 53 +++++ .../src/neo4j/write-version.ts | 71 ++++++ .../src/prompt/README.md | 11 + .../src/prompt/analysis-prompt.ts | 48 +++++ .../src/prompt/title-prompt.ts | 22 ++ .../src/prompt/user-message.ts | 18 ++ 
.../src/strategy/README.md | 13 ++ .../src/strategy/commit-validator.ts | 46 ++++ .../src/strategy/execute.ts | 96 +++++++++ .../src/strategy/store-graph.ts | 63 ++++++ packages/ingest-business-context/src/types.ts | 117 ++++++++++ .../src/worker/README.md | 12 ++ .../src/worker/handler.ts | 73 +++++++ .../src/worker/register.ts | 13 ++ .../ingest-business-context/tsconfig.json | 4 + packages/ingest-github/README.md | 6 + packages/ingest-github/package.json | 10 +- .../src/adapters/llm-file-analyzer.ts | 4 +- .../ingest-github/src/handlers/ingest-job.ts | 4 +- packages/ingest-github/src/index.ts | 10 +- packages/ingest-github/src/pipeline/branch.ts | 2 +- .../ingest-github/src/pipeline/context.ts | 2 - .../src/pipeline/disk-source-reader.ts | 2 +- packages/ingest-github/src/pipeline/paths.ts | 37 +++- packages/ingest-github/src/pipeline/pull.ts | 28 +-- packages/ingest-github/src/pipeline/run.ts | 10 +- packages/ingest-github/src/pipeline/scan.ts | 2 +- .../src/pipeline/skip-decisions/decider.ts | 2 +- packages/ingest-github/src/pipeline/stats.ts | 2 - .../src/progress/NullProgressReporter.ts | 2 +- .../BasicFileAnalysisStrategy.ts.archived | 1 - .../strategies/flat-folder/analyse-changed.ts | 22 +- .../strategies/flat-folder/analyse-file.ts | 4 +- .../flat-folder/backfill/big-files.ts | 12 +- .../strategies/flat-folder/backfill/fields.ts | 10 +- .../strategies/flat-folder/big-file/cache.ts | 2 +- .../flat-folder/big-file/chunk-analyzer.ts | 8 +- .../flat-folder/big-file/chunker.ts | 2 +- .../flat-folder/big-file/condenser.ts | Bin 9435 -> 9440 bytes .../flat-folder/big-file/detector.ts | 4 +- .../strategies/flat-folder/big-file/index.ts | 10 +- .../flat-folder/big-file/storage.ts | 8 +- .../flat-folder/folder-summary-selective.ts | 8 +- .../strategies/flat-folder/folder-summary.ts | 12 +- .../src/strategies/flat-folder/index.ts | 10 +- .../phases/classify-and-analyse-small.ts | 20 +- .../flat-folder/phases/process-big-files.ts | 14 +- 
.../flat-folder/phases/store-flat-analysis.ts | 16 +- .../flat-folder/prompts/folder-summary.ts | 2 +- .../strategies/flat-folder/repo-summary.ts | 4 +- .../src/strategies/flat-folder/store-pull.ts | 18 +- .../src/strategies/flat-folder/types.ts | 2 +- packages/ingest-github/src/types/pipeline.ts | 2 +- packages/ingest-github/tsconfig.json | 25 +-- packages/ingest-github/types/index.d.ts | 6 + packages/llm/package.json | 3 + packages/llm/src/cache.ts | 1 - packages/llm/src/client.ts | 1 - packages/llm/src/ollama.ts | 1 - packages/llm/src/openrouter.ts | 1 - packages/llm/tsconfig.json | 9 +- packages/logger/package.json | 3 + packages/logger/tsconfig.json | 11 +- packages/mcp/package.json | 3 + packages/mcp/tsconfig.json | 15 +- packages/mongo/package.json | 3 + packages/mongo/tsconfig.json | 9 +- packages/neo4j/package.json | 3 + packages/neo4j/tsconfig.json | 9 +- packages/queue/package.json | 3 + packages/queue/src/workers.ts | 1 + packages/queue/tsconfig.json | 15 +- packages/redis/package.json | 3 + packages/redis/tsconfig.json | 9 +- packages/server/package.json | 3 + packages/server/tsconfig.json | 19 +- packages/types/package.json | 3 + packages/types/src/index.ts | 1 + packages/types/src/job.ts | 32 ++- packages/types/tsconfig.json | 8 +- tsconfig.json | 1 + 110 files changed, 2124 insertions(+), 278 deletions(-) create mode 100644 packages/ingest-business-context/README.md create mode 100644 packages/ingest-business-context/package.json create mode 100644 packages/ingest-business-context/src/README.md create mode 100644 packages/ingest-business-context/src/disk/README.md create mode 100644 packages/ingest-business-context/src/disk/load-cached.ts create mode 100644 packages/ingest-business-context/src/disk/sanitize-title.ts create mode 100644 packages/ingest-business-context/src/disk/save-analysis.ts create mode 100644 packages/ingest-business-context/src/disk/save-original.ts create mode 100644 packages/ingest-business-context/src/errors.ts create mode 
100644 packages/ingest-business-context/src/field-defs.ts create mode 100644 packages/ingest-business-context/src/index.ts create mode 100644 packages/ingest-business-context/src/llm/README.md create mode 100644 packages/ingest-business-context/src/llm/analyze-parallel.ts create mode 100644 packages/ingest-business-context/src/llm/call-builder.ts create mode 100644 packages/ingest-business-context/src/llm/enrichment-format.ts create mode 100644 packages/ingest-business-context/src/llm/enrichment-reader.ts create mode 100644 packages/ingest-business-context/src/llm/merge.ts create mode 100644 packages/ingest-business-context/src/llm/title.ts create mode 100644 packages/ingest-business-context/src/neo4j/README.md create mode 100644 packages/ingest-business-context/src/neo4j/indexes.ts create mode 100644 packages/ingest-business-context/src/neo4j/relationship-types.ts create mode 100644 packages/ingest-business-context/src/neo4j/serialize.ts create mode 100644 packages/ingest-business-context/src/neo4j/write-keywords.ts create mode 100644 packages/ingest-business-context/src/neo4j/write-node.ts create mode 100644 packages/ingest-business-context/src/neo4j/write-version.ts create mode 100644 packages/ingest-business-context/src/prompt/README.md create mode 100644 packages/ingest-business-context/src/prompt/analysis-prompt.ts create mode 100644 packages/ingest-business-context/src/prompt/title-prompt.ts create mode 100644 packages/ingest-business-context/src/prompt/user-message.ts create mode 100644 packages/ingest-business-context/src/strategy/README.md create mode 100644 packages/ingest-business-context/src/strategy/commit-validator.ts create mode 100644 packages/ingest-business-context/src/strategy/execute.ts create mode 100644 packages/ingest-business-context/src/strategy/store-graph.ts create mode 100644 packages/ingest-business-context/src/types.ts create mode 100644 packages/ingest-business-context/src/worker/README.md create mode 100644 
packages/ingest-business-context/src/worker/handler.ts create mode 100644 packages/ingest-business-context/src/worker/register.ts create mode 100644 packages/ingest-business-context/tsconfig.json diff --git a/packages/cli/package.json b/packages/cli/package.json index 0cb0936..17414e6 100644 --- a/packages/cli/package.json +++ b/packages/cli/package.json @@ -8,6 +8,9 @@ "exports": { ".": "./src/index.ts" }, + "imports": { + "#src/*": "./src/*" + }, "bin": { "bytebell": "./src/index.ts" }, diff --git a/packages/cli/tsconfig.json b/packages/cli/tsconfig.json index 19ed7d8..d8a16a7 100644 --- a/packages/cli/tsconfig.json +++ b/packages/cli/tsconfig.json @@ -1,16 +1,4 @@ { - "extends": "../../tsconfig.base.json", - "compilerOptions": { - "rootDir": "./src", - "outDir": "./dist", - "jsx": "react-jsx" - }, - "include": ["src/**/*"], - "references": [ - { "path": "../config" }, - { "path": "../errors" }, - { "path": "../ingest-github" }, - { "path": "../logger" }, - { "path": "../types" } - ] + "extends": "../../../../tsconfig.base.json", + "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/config/package.json b/packages/config/package.json index a38c0a2..d31ebe4 100644 --- a/packages/config/package.json +++ b/packages/config/package.json @@ -8,6 +8,9 @@ "exports": { ".": "./src/index.ts" }, + "imports": { + "#src/*": "./src/*" + }, "dependencies": { "@bb/types": "workspace:*", "zod": "^4.3.6" diff --git a/packages/config/tsconfig.json b/packages/config/tsconfig.json index fd6c909..d8a16a7 100644 --- a/packages/config/tsconfig.json +++ b/packages/config/tsconfig.json @@ -1,11 +1,4 @@ { - "extends": "../../tsconfig.base.json", - "compilerOptions": { - "rootDir": "./src", - "outDir": "./dist", - "noEmit": false, - "emitDeclarationOnly": true - }, - "include": ["src/**/*"], - "references": [{ "path": "../types" }] + "extends": "../../../../tsconfig.base.json", + "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git 
a/packages/errors/package.json b/packages/errors/package.json index c8d42db..bf2da88 100644 --- a/packages/errors/package.json +++ b/packages/errors/package.json @@ -8,6 +8,9 @@ "exports": { ".": "./src/index.ts" }, + "imports": { + "#src/*": "./src/*" + }, "dependencies": { "@bb/types": "workspace:*" } diff --git a/packages/errors/tsconfig.json b/packages/errors/tsconfig.json index 2d2ce73..d8a16a7 100644 --- a/packages/errors/tsconfig.json +++ b/packages/errors/tsconfig.json @@ -1,9 +1,4 @@ { - "extends": "../../tsconfig.base.json", - "compilerOptions": { - "rootDir": "./src", - "outDir": "./dist" - }, - "include": ["src/**/*"], - "references": [{ "path": "../types" }] + "extends": "../../../../tsconfig.base.json", + "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/ingest-business-context/README.md b/packages/ingest-business-context/README.md new file mode 100644 index 0000000..566f444 --- /dev/null +++ b/packages/ingest-business-context/README.md @@ -0,0 +1,54 @@ +# `@bb/ingest-business-context` — context + +## Tier + +Domain. Depends on Kernel (`@bb/types`, `@bb/errors`), Infrastructure (`@bb/config`, `@bb/neo4j`), +Cross-cutting (`@bb/llm`), and Strategy (`@bb/queue`). One horizontal Domain→Domain dependency on +`@bb/ingest-github` (read-only path helpers + the on-disk layout it owns). May be imported by +Binaries (`@bb/server` calls `registerBusinessContextWorker()` once at boot). Never by `@bb/cli`. + +## Responsibility + +Attaches human-authored business-context notes to a specific indexed commit of a GitHub knowledge. +The package consumes `JobType.BusinessContextProcessing` jobs. For each job it: + +1. Validates the commit is indexed (Neo4j contains either `:File {knowledgeId}` or + `:FileVersion {knowledgeId, commitHash}`). +2. Reads optional enrichment from disk (`metaRoot/repo-summary.json`, `metaRoot/org//*.json`). +3. 
Runs one LLM call to generate a concise title, then three parallel LLM calls covering + product fields, technical fields, and the shared overview. +4. Persists the result to disk at + `metaRoot/commits//business-context//{original.txt,analysis.json}`. +5. Projects the analysis into Neo4j as a `:BusinessContext` node plus a `:BusinessContextVersion` + snapshot keyed by `(knowledgeId, commitHash)`. The version node `:DESCRIBES` every + `:FileVersion {knowledgeId, commitHash}` that exists for the same commit; if none exist yet + (BC authored before the commit was snapshot), zero edges are created and a later run will + backfill them via the same idempotent MERGE. +6. Creates `:OrgKeyword` nodes for each array field (10 typed relationship classes such as + `HAS_DOMAIN_KEYWORD`, `HAS_STAKEHOLDER`, `HAS_AFFECTED_MODULE`) connected to the parent + `:BusinessContext` via `:APPEARS_IN_BUSINESS_CONTEXT`. + +## Public exports + +- `registerBusinessContextWorker(deps?)` — boots the worker. Called by the deployable at startup. +- `executeBusinessContextStrategy(input)` — the disk pipeline (validate → enrichment → title → + analysis → save). Returns the resolved storage paths and the title. Safe to call directly from + HTTP for synchronous flows. +- `storeBusinessContextToNeo4j(input, analysis, sanitizedTitle)` — graph persistence. Separated + so callers can run it inline or defer it. +- `BUSINESS_CONTEXT_FIELD_DEFS` — single source of truth for the 16-field LLM analysis schema. +- Types: `BusinessContextInput`, `BusinessContextAnalysis`, `BusinessContextStorageResult`, + `BusinessContextNeo4jResult`, `BusinessContextAnalysisMetadata`, `CommitNotIndexedError`. + +## Invariants + +- Single LLM call surface — never bypass `@bb/llm`. Outputs are validated against the field-defs + schema before persistence. +- `:BusinessContext` and `:BusinessContextVersion` are addressed by `(knowledgeId, nodeId)` / + `(knowledgeId, nodeId, commitHash)`; all MERGEs are idempotent and re-runnable. 
+- `nodeId` is the sanitized title (kebab-case, ≤80 chars). Two BC submissions that LLM-title to the + same string will MERGE onto the same node — by design. +- No outbound calls. No GitHub-API lookups. The strategy never clones or pulls — it operates on + the meta-output already produced by `@bb/ingest-github` for the indexed commit. +- All disk writes scoped under `metaRootFor(knowledgeId)/commits//business-context/` + via the `@bb/ingest-github` path helpers — this package never invents its own layout. diff --git a/packages/ingest-business-context/package.json b/packages/ingest-business-context/package.json new file mode 100644 index 0000000..da74346 --- /dev/null +++ b/packages/ingest-business-context/package.json @@ -0,0 +1,24 @@ +{ + "name": "@bb/ingest-business-context", + "version": "0.0.0", + "private": true, + "type": "module", + "main": "./src/index.ts", + "types": "./src/index.ts", + "exports": { + ".": "./src/index.ts" + }, + "imports": { + "#src/*": "./src/*" + }, + "dependencies": { + "@bb/config": "workspace:*", + "@bb/errors": "workspace:*", + "@bb/ingest-github": "workspace:*", + "@bb/llm": "workspace:*", + "@bb/logger": "workspace:*", + "@bb/neo4j": "workspace:*", + "@bb/queue": "workspace:*", + "@bb/types": "workspace:*" + } +} diff --git a/packages/ingest-business-context/src/README.md b/packages/ingest-business-context/src/README.md new file mode 100644 index 0000000..02cc971 --- /dev/null +++ b/packages/ingest-business-context/src/README.md @@ -0,0 +1,37 @@ +# `@bb/ingest-business-context/src` — implementation map + +See [../README.md](../README.md) for the package contract. 
+ +## Layout + +``` +src/ + README.md + index.ts Public barrel + field-defs.ts 16-field analysis schema (single source of truth) + types.ts Input / output / metadata interfaces + errors.ts CommitNotIndexedError, BusinessContextAnalysisFailedError + + prompt/ System + user prompt builders (title, analysis, user-message) + disk/ Disk persistence (sanitize-title, save-original, save-analysis, load-cached) + llm/ Enrichment-reader, enrichment-format, call-builder, merge, title, analyze-parallel + neo4j/ Indexes, relationship-types, serialize, write-node, write-version, write-keywords + strategy/ commit-validator, execute, store-graph + worker/ handler, register +``` + +## Import rules + +- Cross-folder within the package → `src/folder/file.ts`. +- Sibling within the same folder → `./file.ts`. +- Cross-package → `@bb/foo`. +- **Never** `../` parent traversal. + +## Module-graph rules + +- `disk/**` depends only on `node:fs`, `@bb/ingest-github` (paths), `@bb/logger`, and `src/types.ts`. +- `llm/**` depends only on `@bb/llm`, `@bb/logger`, `@bb/ingest-github` (paths), and `src/prompt/`, `src/field-defs.ts`, `src/types.ts`. +- `neo4j/**` depends only on `@bb/neo4j`, `@bb/logger`, and `src/types.ts`. +- `strategy/**` depends on `disk/`, `llm/`, `neo4j/`, `src/errors.ts`, `@bb/ingest-github` (paths), `@bb/logger`, `@bb/neo4j`. +- `worker/**` depends on `strategy/`, `@bb/queue`, `@bb/types`, `@bb/config`, `@bb/logger`. +- No layer skips another. The public API (`index.ts`) re-exports from each layer. diff --git a/packages/ingest-business-context/src/disk/README.md b/packages/ingest-business-context/src/disk/README.md new file mode 100644 index 0000000..dca6384 --- /dev/null +++ b/packages/ingest-business-context/src/disk/README.md @@ -0,0 +1,17 @@ +# `disk/` — context + +Persists business-context artefacts under +`metaRoot/commits//business-context//`. Paths come +from `@bb/ingest-github`'s `businessContextDir()` — this folder never builds +its own paths. 
+ +| File | Responsibility | +| ------------------- | ------------------------------------------------------------------------------- | +| `sanitize-title.ts` | LLM title → kebab-case filesystem-safe slug (≤80 chars). Also the Neo4j nodeId. | +| `save-original.ts` | Writes `original.txt` (raw user-authored text, mode 0600). | +| `save-analysis.ts` | Wraps the analysis in a metadata envelope and writes `analysis.json`. | +| `load-cached.ts` | Reads back a saved envelope; tolerant of missing / malformed files. | + +Cache key is the sanitized title alone. Two BC submissions whose LLM titles +sanitise to the same slug share the same cached analysis (intentional — same +idea, same node). diff --git a/packages/ingest-business-context/src/disk/load-cached.ts b/packages/ingest-business-context/src/disk/load-cached.ts new file mode 100644 index 0000000..a93f68b --- /dev/null +++ b/packages/ingest-business-context/src/disk/load-cached.ts @@ -0,0 +1,41 @@ +import { readFile } from "node:fs/promises"; +import path from "node:path"; +import { businessContextDir } from "@bb/ingest-github"; +import { logger } from "@bb/logger"; +import type { BusinessContextAnalysisMetadata } from "#src/types.ts"; + +/** + * Returns a previously-saved analysis envelope if one exists for this title, + * otherwise `null`. The cache key is the sanitized title — same title across + * re-runs returns the same envelope and skips a fresh LLM call. + * + * Tolerant of missing or malformed files: the strategy treats `null` as a + * cache miss and proceeds with a full LLM run. We never crash on stale JSON. 
+ */
+export async function loadCachedAnalysis(
+  knowledgeId: string,
+  commitHash: string,
+  sanitizedTitle: string,
+): Promise<BusinessContextAnalysisMetadata | null> {
+  const filePath = path.join(businessContextDir(knowledgeId, commitHash, sanitizedTitle), "analysis.json");
+  let content: string;
+  try {
+    content = await readFile(filePath, "utf-8");
+  } catch {
+    return null; // missing or unreadable file is a plain cache miss
+  }
+  try {
+    const parsed = JSON.parse(content) as BusinessContextAnalysisMetadata | null;
+    if (parsed?.analysis === undefined || parsed.analysis === null) {
+      logger.warn(`business-context: cached envelope at ${filePath} has no analysis field; ignoring`);
+      return null;
+    }
+    logger.info(
+      `business-context: cache HIT at ${filePath} (generated ${parsed.generatedAt}, model ${parsed.modelName})`,
+    );
+    return parsed;
+  } catch (err) {
+    logger.warn(`business-context: failed to parse cached analysis ${filePath}: ${(err as Error).message}`);
+    return null;
+  }
+}
diff --git a/packages/ingest-business-context/src/disk/sanitize-title.ts b/packages/ingest-business-context/src/disk/sanitize-title.ts
new file mode 100644
index 0000000..a953abf
--- /dev/null
+++ b/packages/ingest-business-context/src/disk/sanitize-title.ts
@@ -0,0 +1,25 @@
+const NON_ALNUM_DASH = /[^a-z0-9\s-]/gu;
+const WHITESPACE_RUN = /\s+/gu;
+const DASH_RUN = /-{2,}/gu;
+const LEADING_OR_TRAILING_DASH = /^-|-$/gu;
+
+/**
+ * Converts an LLM-generated title into a filesystem-safe, URL-safe slug.
+ *
+ * Lowercase. Characters other than a-z, 0-9, whitespace and "-" are dropped;
+ * whitespace runs become single hyphens. Capped at 80 chars so the resulting
+ * directory name is comfortably under filesystem limits on every platform.
+ * Used as both the on-disk directory name and the Neo4j `nodeId` — two BC
+ * submissions whose LLM titles sanitise to the same slug MERGE onto the same
+ * `:BusinessContext` node (by design — same idea, same node).
+ */
+export function sanitizeTitle(title: string): string {
+  return title
+    .toLowerCase()
+    .replace(NON_ALNUM_DASH, "")
+    .replace(WHITESPACE_RUN, "-")
+    .replace(DASH_RUN, "-")
+    .replace(LEADING_OR_TRAILING_DASH, "")
+    .slice(0, 80)
+    .replace(/-$/u, ""); // the 80-char cut can expose a new trailing dash
+}
diff --git a/packages/ingest-business-context/src/disk/save-analysis.ts b/packages/ingest-business-context/src/disk/save-analysis.ts
new file mode 100644
index 0000000..56a4422
--- /dev/null
+++ b/packages/ingest-business-context/src/disk/save-analysis.ts
@@ -0,0 +1,48 @@
+import { mkdir, writeFile } from "node:fs/promises";
+import path from "node:path";
+import { businessContextDir } from "@bb/ingest-github";
+import { logger } from "@bb/logger";
+import type { BusinessContextAnalysis, BusinessContextAnalysisMetadata } from "#src/types.ts";
+
+const DIR_MODE = 0o700;
+
+export interface SaveAnalysisMetadata {
+  commitHash: string;
+  modelName: string;
+  inputTokens: number;
+  outputTokens: number;
+  description?: string;
+}
+
+/**
+ * Wraps the LLM analysis in a metadata envelope (provenance: model, tokens,
+ * timestamp) and writes it as `analysis.json` next to `original.txt`. The
+ * sanitized title is the cache key — loadCachedAnalysis() reads this envelope
+ * back on the next run with the same sanitized title. Returns the file path.
+ */
+export async function saveAnalysis(
+  knowledgeId: string,
+  commitHash: string,
+  sanitizedTitle: string,
+  analysis: BusinessContextAnalysis,
+  meta: SaveAnalysisMetadata,
+): Promise<string> {
+  const envelope: BusinessContextAnalysisMetadata = {
+    generatedAt: new Date().toISOString(),
+    commitHash: meta.commitHash,
+    modelName: meta.modelName,
+    inputTokens: meta.inputTokens,
+    outputTokens: meta.outputTokens,
+    ...(meta.description !== undefined ?
{ description: meta.description } : {}), + analysis, + }; + + const dir = businessContextDir(knowledgeId, commitHash, sanitizedTitle); + await mkdir(dir, { recursive: true, mode: DIR_MODE }); + const filePath = path.join(dir, "analysis.json"); + await writeFile(filePath, JSON.stringify(envelope, null, 2), { encoding: "utf-8", mode: 0o600 }); + logger.info( + `business-context: saved analysis at ${filePath} (model=${meta.modelName}, ${meta.inputTokens} in / ${meta.outputTokens} out)`, + ); + return filePath; +} diff --git a/packages/ingest-business-context/src/disk/save-original.ts b/packages/ingest-business-context/src/disk/save-original.ts new file mode 100644 index 0000000..60a0073 --- /dev/null +++ b/packages/ingest-business-context/src/disk/save-original.ts @@ -0,0 +1,25 @@ +import { mkdir, writeFile } from "node:fs/promises"; +import path from "node:path"; +import { businessContextDir } from "@bb/ingest-github"; +import { logger } from "@bb/logger"; + +const DIR_MODE = 0o700; + +/** + * Persists the raw user-authored text. Mirror copy of the input — used for + * audit (proving what was analysed) and for re-running the analysis later + * against an updated field-defs schema without re-prompting the user. 
+ */
+export async function saveOriginalText(
+  knowledgeId: string,
+  commitHash: string,
+  sanitizedTitle: string,
+  text: string,
+): Promise<string> {
+  const dir = businessContextDir(knowledgeId, commitHash, sanitizedTitle);
+  await mkdir(dir, { recursive: true, mode: DIR_MODE });
+  const filePath = path.join(dir, "original.txt");
+  await writeFile(filePath, text, { encoding: "utf-8", mode: 0o600 });
+  logger.info(`business-context: saved original text at ${filePath} (${text.length} chars)`);
+  return filePath;
+}
diff --git a/packages/ingest-business-context/src/errors.ts b/packages/ingest-business-context/src/errors.ts
new file mode 100644
index 0000000..4af4339
--- /dev/null
+++ b/packages/ingest-business-context/src/errors.ts
@@ -0,0 +1,33 @@
+/**
+ * Thrown when the worker is asked to attach a business context to a commit
+ * whose files have not been indexed. The HTTP layer maps this to a 409.
+ */
+export class CommitNotIndexedError extends Error {
+  readonly knowledgeId: string;
+  readonly commitHash: string;
+
+  constructor(knowledgeId: string, commitHash: string) {
+    super(`Commit ${commitHash.substring(0, 12)} is not indexed for knowledge ${knowledgeId}`);
+    this.name = "CommitNotIndexedError";
+    this.knowledgeId = knowledgeId;
+    this.commitHash = commitHash;
+  }
+}
+
+/**
+ * Thrown when every LLM analysis call returns null (no usable JSON). Distinct
+ * from upstream LLM errors (rate limits, transport) which propagate as-is.
+ */ +export class BusinessContextAnalysisFailedError extends Error { + readonly knowledgeId: string; + readonly commitHash: string; + + constructor(knowledgeId: string, commitHash: string) { + super( + `All parallel LLM analysis calls returned null for knowledge ${knowledgeId} @ ${commitHash.substring(0, 12)}`, + ); + this.name = "BusinessContextAnalysisFailedError"; + this.knowledgeId = knowledgeId; + this.commitHash = commitHash; + } +} diff --git a/packages/ingest-business-context/src/field-defs.ts b/packages/ingest-business-context/src/field-defs.ts new file mode 100644 index 0000000..d3b7bec --- /dev/null +++ b/packages/ingest-business-context/src/field-defs.ts @@ -0,0 +1,203 @@ +/** + * Single source of truth for the LLM analysis schema. Each entry defines a + * field's expected type, the human-readable description shown to the LLM, + * special instructions that constrain output, whether the field is requested + * from the LLM (vs. populated by the pipeline), and an example value that + * appears in the prompt template. + * + * Changing any value here propagates to the prompt builders and the validation + * paths; nothing else needs to update. + */ +export interface BusinessContextFieldDef { + type: string; + description: string; + special_instructions: string; + requestedFromLLM: boolean; + example: string; +} + +const _FIELD_DEFS = { + // ── Product People Fields ───────────────────────────────────────────────── + + title: { + type: "string", + description: "Concise, descriptive title for this business context entry", + special_instructions: + "Max 50 words. Should be immediately recognizable to a product manager scanning a list. No technical jargon.", + requestedFromLLM: true, + example: '"Stripe Payment Processing Integration"', + }, + product_area: { + type: "string", + description: "Which product domain or area this context describes", + special_instructions: + "One or two words identifying the product area. Use standard product terminology. 
Empty string if unclear.", + requestedFromLLM: true, + example: '"Payments"', + }, + user_stories: { + type: "string[]", + description: 'User needs this context addresses, each in "As a [role], I want [goal]" format', + special_instructions: + "Max 5 stories. Each must follow the As a / I want pattern. Derive from the text, do not invent needs not mentioned.", + requestedFromLLM: true, + example: + '["As a customer, I want to pay with my saved card so checkout is faster", "As a finance team member, I want transaction reconciliation reports"]', + }, + business_value: { + type: "string", + description: "What measurable value this provides to the business", + special_instructions: + "2-3 sentences max. Focus on revenue, cost, risk, or user satisfaction impact. No technical implementation details.", + requestedFromLLM: true, + example: + '"Reduces checkout abandonment by 15% through one-click payments. Directly impacts monthly recurring revenue and customer retention metrics."', + }, + stakeholders: { + type: "string[]", + description: "Roles or teams who care about this context", + special_instructions: + "Max 6 entries. Use role titles not individual names. Include both business and technical stakeholders mentioned or implied.", + requestedFromLLM: true, + example: '["Product Manager", "Payments Team", "Finance", "Customer Support"]', + }, + success_metrics: { + type: "string[]", + description: "How success is measured for this business context", + special_instructions: + "Max 5 metrics. Each should be a measurable outcome, not a vague goal. Derive from text, infer reasonable metrics if not stated explicitly.", + requestedFromLLM: true, + example: + '["Checkout conversion rate > 85%", "Payment processing latency < 2s", "Zero failed transactions due to integration errors"]', + }, + user_impact: { + type: "string", + description: "How end users are affected, in plain language", + special_instructions: "2-3 sentences. Describe the before/after for the end user. 
No technical jargon.", + requestedFromLLM: true, + example: + '"Users can now complete purchases in under 30 seconds with saved payment methods. Previously, re-entering card details on every purchase caused significant drop-off."', + }, + domain_keywords: { + type: "string[]", + description: "Business domain search terms for cross-repo discoverability", + special_instructions: + "Max 10 keywords. Business language only — no code identifiers. Think: what would a product person search for?", + requestedFromLLM: true, + example: '["payments", "checkout", "subscription", "billing", "revenue", "PCI compliance"]', + }, + + // ── Developer Fields ────────────────────────────────────────────────────── + + technical_summary: { + type: "string", + description: "What the code actually does at a technical level", + special_instructions: + "3-5 sentences. Include architecture pattern, key technologies, and data stores involved. This is for senior engineers.", + requestedFromLLM: true, + example: + '"Implements a Stripe webhook handler using Express middleware that processes payment_intent events. Uses idempotency keys stored in Redis to prevent duplicate processing. Failed webhooks are retried via a BullMQ dead-letter queue with exponential backoff."', + }, + affected_modules: { + type: "string[]", + description: "Which parts of the codebase are involved (folder paths or module names)", + special_instructions: + "Max 10 entries. Use folder-level paths (e.g., src/payments/) or module names. Derive from context, do not guess paths not mentioned.", + requestedFromLLM: true, + example: '["src/payments/", "src/webhooks/stripe/", "src/queue/workers/payment-processor"]', + }, + architecture_decisions: { + type: "string[]", + description: 'Key technical choices, each as "Decision: X — Rationale: Y"', + special_instructions: + "Max 5 entries. 
Focus on decisions that would surprise a new developer or that have non-obvious rationale.", + requestedFromLLM: true, + example: + '["Decision: Use webhook-based flow instead of polling — Rationale: Stripe recommends webhooks for reliability", "Decision: Redis idempotency keys with 24h TTL — Rationale: Stripe may retry webhooks for up to 24 hours"]', + }, + dependencies: { + type: "string[]", + description: "Systems, services, or libraries this relies on", + special_instructions: + "Max 8 entries. Include both internal services and external dependencies. Format: 'name (type)' e.g., 'Stripe API (external)', 'Redis (cache)'.", + requestedFromLLM: true, + example: '["Stripe API (external)", "Redis (cache/idempotency)", "BullMQ (queue)", "PostgreSQL (transactions)"]', + }, + risk_areas: { + type: "string[]", + description: "What could go wrong — known fragilities, operational concerns", + special_instructions: "Max 5 entries. Be specific about failure modes. Include both technical and business risks.", + requestedFromLLM: true, + example: + '["Stripe webhook signing secret rotation requires coordinated deploy", "Redis downtime causes duplicate payment processing"]', + }, + data_flow: { + type: "string", + description: "How data moves through the system for this business context", + special_instructions: + "Describe the flow in plain English with arrow notation. Max 3-4 sentences. Include entry points, transforms, and storage.", + requestedFromLLM: true, + example: + '"User submits payment → Stripe processes charge → Webhook hits /api/webhooks/stripe → Handler validates signature → Event queued in BullMQ → Worker updates order status in PostgreSQL → Confirmation email sent via SendGrid."', + }, + api_surface: { + type: "string[]", + description: "APIs exposed or consumed", + special_instructions: + 'Max 8 entries. Format exposed as "METHOD /path — description". 
Format consumed as "Consumes: service.endpoint — purpose".', + requestedFromLLM: true, + example: + '["POST /api/webhooks/stripe — Receives Stripe webhook events", "GET /api/payments/:id — Retrieve payment status", "Consumes: Stripe PaymentIntents API — Create and confirm charges"]', + }, + + // ── Shared Fields ───────────────────────────────────────────────────────── + + summary: { + type: "string", + description: "2-3 sentence overview combining both business and technical perspectives", + special_instructions: + "First sentence: business context. Second sentence: technical approach. Optional third: key constraint or trade-off. Max 100 tokens.", + requestedFromLLM: true, + example: + '"Enables one-click checkout by integrating Stripe payment processing with saved card tokens. Implemented as an event-driven pipeline using webhooks and BullMQ for reliable async processing. Designed for PCI compliance with zero card data touching our servers."', + }, + keywords: { + type: "string[]", + description: "Searchable terms covering both business and technical vocabulary", + special_instructions: + "Max 15 keywords. Mix of business terms (from domain_keywords) and technical terms. 
No duplicates across domain_keywords and keywords.",
+    requestedFromLLM: true,
+    example: '["stripe", "webhook", "payment-intent", "idempotency", "BullMQ", "checkout", "PCI", "async-processing"]',
+  },
+} as const;
+
+export const BUSINESS_CONTEXT_FIELD_DEFS: Record<string, BusinessContextFieldDef> = _FIELD_DEFS;
+
+export const PRODUCT_FIELDS: readonly string[] = [
+  "title",
+  "product_area",
+  "user_stories",
+  "business_value",
+  "stakeholders",
+  "success_metrics",
+  "user_impact",
+  "domain_keywords",
+];
+
+export const TECHNICAL_FIELDS: readonly string[] = [
+  "technical_summary",
+  "affected_modules",
+  "architecture_decisions",
+  "dependencies",
+  "risk_areas",
+  "data_flow",
+  "api_surface",
+];
+
+export const SHARED_FIELDS: readonly string[] = ["summary", "keywords"];
+
+export const LLM_FIELD_NAMES: readonly string[] = Object.entries(_FIELD_DEFS)
+  .filter(([, def]) => def.requestedFromLLM)
+  .map(([name]) => name);
+
+export const LLM_FIELD_NAME_SET: ReadonlySet<string> = new Set(LLM_FIELD_NAMES);
diff --git a/packages/ingest-business-context/src/index.ts b/packages/ingest-business-context/src/index.ts
new file mode 100644
index 0000000..79c6422
--- /dev/null
+++ b/packages/ingest-business-context/src/index.ts
@@ -0,0 +1,33 @@
+// Public API for @bb/ingest-business-context.
+ +export { registerBusinessContextWorker } from "./worker/register.ts"; +export { handleBusinessContextProcessing } from "./worker/handler.ts"; + +export { executeBusinessContextStrategy } from "./strategy/execute.ts"; +export type { ExecuteOptions } from "./strategy/execute.ts"; +export { storeBusinessContextToNeo4j } from "./strategy/store-graph.ts"; +export type { StoreGraphInput } from "./strategy/store-graph.ts"; +export { assertCommitIndexed, checkCommitIndexed } from "./strategy/commit-validator.ts"; +export type { CommitIndexStatus } from "./strategy/commit-validator.ts"; + +export { BUSINESS_CONTEXT_FIELD_DEFS, LLM_FIELD_NAMES, LLM_FIELD_NAME_SET } from "./field-defs.ts"; +export type { BusinessContextFieldDef } from "./field-defs.ts"; + +export { BUSINESS_CONTEXT_KEYWORD_TYPES } from "./neo4j/relationship-types.ts"; +export { ensureBusinessContextIndexes } from "./neo4j/indexes.ts"; + +export { sanitizeTitle } from "./disk/sanitize-title.ts"; +export { loadCachedAnalysis } from "./disk/load-cached.ts"; + +export { CommitNotIndexedError, BusinessContextAnalysisFailedError } from "./errors.ts"; + +export type { + BusinessContextAnalysis, + BusinessContextAnalysisMetadata, + BusinessContextInput, + BusinessContextLlmOptions, + BusinessContextNeo4jResult, + BusinessContextStorageResult, + TitleGenerationResult, + AnalysisResult, +} from "./types.ts"; diff --git a/packages/ingest-business-context/src/llm/README.md b/packages/ingest-business-context/src/llm/README.md new file mode 100644 index 0000000..5cd3f94 --- /dev/null +++ b/packages/ingest-business-context/src/llm/README.md @@ -0,0 +1,16 @@ +# `llm/` — context + +LLM-driven analysis. All calls flow through `@bb/llm` (`askJsonLLM`). Per-job +overrides (apiKey, provider, model) come in via the worker payload and are +applied here. 
+
+| File                   | Responsibility                                                                             |
+| ---------------------- | ------------------------------------------------------------------------------------------ |
+| `enrichment-reader.ts` | Reads optional org-level registries and repo-summary from disk. Tolerant of missing files. |
+| `enrichment-format.ts` | Renders enrichment data into a per-focus prompt section with a token cap.                  |
+| `call-builder.ts`      | Composes one analysis call (system+user) and trims enrichment if over budget.              |
+| `merge.ts`             | Merges three partial blobs into one fully-populated `BusinessContextAnalysis`.             |
+| `title.ts`             | Title-generation call. Returns the fallback "Untitled Business Context" on null.           |
+| `analyze-parallel.ts`  | Runs the 3 analysis calls concurrently and merges results.                                 |
+
+The package never imports OpenAI / Anthropic SDKs. Only `@bb/llm`.
diff --git a/packages/ingest-business-context/src/llm/analyze-parallel.ts b/packages/ingest-business-context/src/llm/analyze-parallel.ts
new file mode 100644
index 0000000..cfdb60c
--- /dev/null
+++ b/packages/ingest-business-context/src/llm/analyze-parallel.ts
@@ -0,0 +1,111 @@
+import { askJsonLLM, type AskJsonLlmOptions, type LlmProviderName, tokenLen } from "@bb/llm";
+import { logger } from "@bb/logger";
+import { PRODUCT_FIELDS, SHARED_FIELDS, TECHNICAL_FIELDS } from "#src/field-defs.ts";
+import { buildAnalysisPromptForCall } from "#src/llm/call-builder.ts";
+import type { EnrichmentData } from "#src/llm/enrichment-reader.ts";
+import type { EnrichmentFocus } from "#src/llm/enrichment-format.ts";
+import { mergeAnalysisFields } from "#src/llm/merge.ts";
+import type { AnalysisResult, BusinessContextAnalysis, BusinessContextLlmOptions } from "#src/types.ts";
+
+const MAX_CONTEXT_WINDOW = 50_000;
+const KNOWN_PROVIDERS: ReadonlySet<string> = new Set(["openrouter", "ollama"]);
+
+interface AnalysisCall {
+  name: string;
+  fields: readonly string[];
+  focus: EnrichmentFocus;
+}
+
+const CALLS: readonly AnalysisCall[] = [
+  { name: "product", fields:
PRODUCT_FIELDS, focus: "product" },
+  { name: "technical", fields: TECHNICAL_FIELDS, focus: "technical" },
+  { name: "shared", fields: SHARED_FIELDS, focus: "shared" },
+];
+
+function buildLlmOpts(options: BusinessContextLlmOptions): AskJsonLlmOptions {
+  const opts: AskJsonLlmOptions = { maxRetries: 3 };
+  if (options.apiKey !== undefined) {
+    opts.apiKey = options.apiKey;
+  }
+  if (options.model !== undefined) {
+    opts.model = options.model;
+  }
+  if (options.provider !== undefined && KNOWN_PROVIDERS.has(options.provider)) {
+    opts.provider = options.provider as LlmProviderName; // unknown provider names are dropped, not forwarded
+  }
+  return opts;
+}
+
+async function runOneCall(
+  call: AnalysisCall,
+  text: string,
+  title: string,
+  enrichment: EnrichmentData,
+  baseOpts: AskJsonLlmOptions,
+): Promise<{
+  result: Partial<BusinessContextAnalysis> | null;
+  model: string;
+  inputTokens: number;
+  outputTokens: number;
+}> {
+  const { systemPrompt, userMessage } = buildAnalysisPromptForCall(call, text, title, enrichment, MAX_CONTEXT_WINDOW);
+  const promptTokens = tokenLen(systemPrompt) + tokenLen(userMessage);
+  logger.info(`business-context: call "${call.name}" ~${promptTokens} tokens, ${call.fields.length} fields`);
+
+  const r = await askJsonLLM<Partial<BusinessContextAnalysis>>(systemPrompt, userMessage, baseOpts);
+  return {
+    result: r.result,
+    model: r.usage.model,
+    inputTokens: r.usage.inputTokens,
+    outputTokens: r.usage.outputTokens,
+  };
+}
+
+/**
+ * Runs the three analysis LLM calls in parallel (product, technical, shared)
+ * and merges the partial results into a single `BusinessContextAnalysis`.
+ * Returns `analysis: null` only when every call returned null — caller treats
+ * that as a fatal failure.
+ */
+export async function analyzeBusinessContextParallel(
+  text: string,
+  title: string,
+  enrichment: EnrichmentData,
+  options: BusinessContextLlmOptions,
+): Promise<AnalysisResult> {
+  const baseOpts = buildLlmOpts(options);
+  const calls = await Promise.all(CALLS.map((c) => runOneCall(c, text, title, enrichment, baseOpts)));
+
+  let totalInputTokens = 0;
+  let totalOutputTokens = 0;
+  let modelName = "";
+  let nonNullResults = 0;
+  const merged: Record<string, unknown> = {};
+
+  for (let i = 0; i < calls.length; i += 1) {
+    const r = calls[i];
+    const callName = CALLS[i]?.name ?? "?";
+    if (r === undefined) {
+      continue;
+    }
+    totalInputTokens += r.inputTokens;
+    totalOutputTokens += r.outputTokens;
+    if (modelName.length === 0 && r.model.length > 0) {
+      modelName = r.model; // first non-empty model name reported by any call wins
+    }
+    if (r.result !== null) {
+      nonNullResults += 1;
+      Object.assign(merged, r.result);
+      logger.info(`business-context: call "${callName}" done (${r.inputTokens} in / ${r.outputTokens} out)`);
+    } else {
+      logger.warn(`business-context: call "${callName}" returned null — fields will use defaults`);
+    }
+  }
+
+  if (nonNullResults === 0) {
+    return { analysis: null, inputTokens: totalInputTokens, outputTokens: totalOutputTokens, modelName };
+  }
+
+  const analysis = mergeAnalysisFields(merged, title);
+  return { analysis, inputTokens: totalInputTokens, outputTokens: totalOutputTokens, modelName };
+}
diff --git a/packages/ingest-business-context/src/llm/call-builder.ts b/packages/ingest-business-context/src/llm/call-builder.ts
new file mode 100644
index 0000000..a90d83b
--- /dev/null
+++ b/packages/ingest-business-context/src/llm/call-builder.ts
@@ -0,0 +1,46 @@
+import { tokenLen } from "@bb/llm";
+import { logger } from "@bb/logger";
+import { buildPartialAnalysisPrompt } from "#src/prompt/analysis-prompt.ts";
+import { buildEnrichedUserMessage } from "#src/prompt/user-message.ts";
+import { buildEnrichmentSection, type EnrichmentFocus } from "#src/llm/enrichment-format.ts";
+import type { EnrichmentData } from
"#src/llm/enrichment-reader.ts"; + +export interface AnalysisCallShape { + name: string; + fields: readonly string[]; + focus: EnrichmentFocus; +} + +export interface BuiltCall { + systemPrompt: string; + userMessage: string; +} + +/** + * Builds the prompt pair for a single analysis call. If the combined + * system+user token estimate exceeds the budget, the enrichment section is + * trimmed proportionally and the user message is rebuilt — we never let the + * prompt drift past the budget silently. + */ +export function buildAnalysisPromptForCall( + call: AnalysisCallShape, + text: string, + title: string, + enrichment: EnrichmentData, + maxContextWindow: number, +): BuiltCall { + const systemPrompt = buildPartialAnalysisPrompt(call.fields); + let enrichmentSection = buildEnrichmentSection(enrichment, call.focus); + let userMessage = buildEnrichedUserMessage(text, title, enrichmentSection); + let totalTokens = tokenLen(systemPrompt) + tokenLen(userMessage); + + if (totalTokens > maxContextWindow && enrichmentSection.length > 0) { + const ratio = (maxContextWindow / totalTokens) * 0.8; + enrichmentSection = enrichmentSection.slice(0, Math.floor(enrichmentSection.length * ratio)); + userMessage = buildEnrichedUserMessage(text, title, enrichmentSection); + totalTokens = tokenLen(systemPrompt) + tokenLen(userMessage); + logger.warn(`business-context: call "${call.name}" trimmed enrichment to ~${totalTokens} tokens`); + } + + return { systemPrompt, userMessage }; +} diff --git a/packages/ingest-business-context/src/llm/enrichment-format.ts b/packages/ingest-business-context/src/llm/enrichment-format.ts new file mode 100644 index 0000000..6ece2b8 --- /dev/null +++ b/packages/ingest-business-context/src/llm/enrichment-format.ts @@ -0,0 +1,88 @@ +import { tokenLen } from "@bb/llm"; +import { logger } from "@bb/logger"; +import type { EnrichmentData, KeywordCount } from "#src/llm/enrichment-reader.ts"; + +const MAX_ENRICHMENT_TOKENS = 15_000; + +export type EnrichmentFocus 
= "product" | "technical" | "shared"; + +function formatEntries(entries: readonly KeywordCount[]): string { + return entries.map((e) => ` ${e.keyword} (${e.count})`).join("\n"); +} + +function appendProductSection(enrichment: EnrichmentData, sections: string[]): void { + if (enrichment.topKeywords.length > 0) { + sections.push(`TOP REPOSITORY KEYWORDS (by frequency):\n${formatEntries(enrichment.topKeywords)}`); + } + if (enrichment.topBusinessEntities.length > 0) { + sections.push(`TOP BUSINESS ENTITIES:\n${formatEntries(enrichment.topBusinessEntities)}`); + } + if (enrichment.topOntologyConcepts.length > 0) { + sections.push(`TOP ONTOLOGY CONCEPTS:\n${formatEntries(enrichment.topOntologyConcepts)}`); + } + if (enrichment.majorSubsystems.length > 0) { + const lines = enrichment.majorSubsystems.map((s) => ` ${s.name}: ${s.responsibility}`).join("\n"); + sections.push(`MAJOR SUBSYSTEMS:\n${lines}`); + } +} + +function appendTechnicalSection(enrichment: EnrichmentData, sections: string[]): void { + if (enrichment.repoArchitecture.length > 0) { + sections.push(`REPOSITORY ARCHITECTURE:\n${enrichment.repoArchitecture}`); + } + if (enrichment.repoDataFlow.length > 0) { + sections.push(`DATA FLOW:\n${enrichment.repoDataFlow}`); + } + if (enrichment.repoKeyPatterns.length > 0) { + sections.push(`KEY PATTERNS:\n ${enrichment.repoKeyPatterns.join(", ")}`); + } + if (enrichment.integrationSurface.length > 0) { + sections.push(`INTEGRATION SURFACE:\n${formatEntries(enrichment.integrationSurface)}`); + } + if (enrichment.contractsProvided.length > 0) { + sections.push(`CONTRACTS PROVIDED:\n${formatEntries(enrichment.contractsProvided)}`); + } + if (enrichment.contractsConsumed.length > 0) { + sections.push(`CONTRACTS CONSUMED:\n${formatEntries(enrichment.contractsConsumed)}`); + } + if (enrichment.sideEffects.length > 0) { + sections.push(`SIDE EFFECTS:\n${formatEntries(enrichment.sideEffects)}`); + } + if (enrichment.configDependencies.length > 0) { + sections.push(`CONFIG 
DEPENDENCIES:\n${formatEntries(enrichment.configDependencies)}`); + } + if (enrichment.topSystemCapabilities.length > 0) { + sections.push(`SYSTEM CAPABILITIES:\n${formatEntries(enrichment.topSystemCapabilities)}`); + } +} + +/** + * Renders the enrichment data into a string targeted at a specific LLM call. + * Product call sees business entities and concepts; technical call sees + * architecture, contracts, side effects; shared call sees both. + * + * Output is capped at `MAX_ENRICHMENT_TOKENS` — over budget, truncated + * proportionally. Empty enrichment returns an empty string and the user-message + * composer elides the section entirely. + */ +export function buildEnrichmentSection(enrichment: EnrichmentData, focus: EnrichmentFocus): string { + const sections: string[] = []; + if (focus === "product" || focus === "shared") { + appendProductSection(enrichment, sections); + } + if (focus === "technical" || focus === "shared") { + appendTechnicalSection(enrichment, sections); + } + const full = sections.join("\n\n"); + if (full.length === 0) { + return ""; + } + + const tokens = tokenLen(full); + if (tokens > MAX_ENRICHMENT_TOKENS) { + logger.info(`business-context: enrichment (${focus}) ${tokens} tokens > cap ${MAX_ENRICHMENT_TOKENS}; truncating`); + const ratio = MAX_ENRICHMENT_TOKENS / tokens; + return full.slice(0, Math.floor(full.length * ratio)); + } + return full; +} diff --git a/packages/ingest-business-context/src/llm/enrichment-reader.ts b/packages/ingest-business-context/src/llm/enrichment-reader.ts new file mode 100644 index 0000000..dd082f6 --- /dev/null +++ b/packages/ingest-business-context/src/llm/enrichment-reader.ts @@ -0,0 +1,147 @@ +import { readFile } from "node:fs/promises"; +import path from "node:path"; +import { metaRootFor, orgRegistryDir } from "@bb/ingest-github"; +import { logger } from "@bb/logger"; + +const TOP_N = 50; + +/** + * Org-level keyword registry files the reader probes for. 
None of these are
+ * produced by OSS today; downstream multi-tenant deployments may produce them
+ * by aggregating across all knowledges in an org. Missing files are normal
+ * and degrade silently to empty data.
+ */
+type OrgRegistryFile =
+  | "keywords.json"
+  | "business-entities.json"
+  | "ontology-concepts.json"
+  | "system-capabilities.json"
+  | "integration-surface.json"
+  | "contracts-provided.json"
+  | "contracts-consumed.json"
+  | "side-effects.json"
+  | "config-dependencies.json";
+
+export interface KeywordCount {
+  keyword: string;
+  count: number;
+}
+
+export interface EnrichmentData {
+  topKeywords: KeywordCount[];
+  topBusinessEntities: KeywordCount[];
+  topOntologyConcepts: KeywordCount[];
+  topSystemCapabilities: KeywordCount[];
+  integrationSurface: KeywordCount[];
+  contractsProvided: KeywordCount[];
+  contractsConsumed: KeywordCount[];
+  sideEffects: KeywordCount[];
+  configDependencies: KeywordCount[];
+  repoArchitecture: string;
+  repoDataFlow: string;
+  repoKeyPatterns: string[];
+  majorSubsystems: Array<{ name: string; responsibility: string }>;
+}
+
+export function emptyEnrichment(): EnrichmentData {
+  return {
+    topKeywords: [],
+    topBusinessEntities: [],
+    topOntologyConcepts: [],
+    topSystemCapabilities: [],
+    integrationSurface: [],
+    contractsProvided: [],
+    contractsConsumed: [],
+    sideEffects: [],
+    configDependencies: [],
+    repoArchitecture: "",
+    repoDataFlow: "",
+    repoKeyPatterns: [],
+    majorSubsystems: [],
+  };
+}
+
+async function readJsonSafe(filePath: string): Promise<unknown> {
+  try {
+    const content = await readFile(filePath, "utf-8");
+    return JSON.parse(content) as unknown;
+  } catch {
+    return null; // missing, unreadable and malformed files all collapse to null
+  }
+}
+
+async function readOrgRegistry(dir: string, file: OrgRegistryFile): Promise<KeywordCount[]> {
+  const data = await readJsonSafe(path.join(dir, file));
+  if (data === null || typeof data !== "object" || Array.isArray(data)) {
+    return []; // only a plain keyword→count object is a registry; arrays would yield index "keywords"
+  }
+  const entries: KeywordCount[] = [];
+  for (const [keyword, count] of Object.entries(data as Record<string, unknown>)) {
+    if
(typeof count === "number") {
+      entries.push({ keyword, count });
+    }
+  }
+  entries.sort((a, b) => b.count - a.count);
+  return entries.slice(0, TOP_N);
+}
+
+interface RepoSummaryShape {
+  architecture?: string;
+  dataFlow?: string;
+  keyPatterns?: unknown;
+  majorSubsystems?: unknown;
+}
+
+async function readRepoSummary(knowledgeId: string, enrichment: EnrichmentData): Promise<void> {
+  const repoSummaryJson = path.join(metaRootFor(knowledgeId), "repo-summary.json");
+  const data = await readJsonSafe(repoSummaryJson);
+  if (data === null || typeof data !== "object") {
+    return;
+  }
+  const rs = ((data as { repoSummary?: RepoSummaryShape }).repoSummary ?? data) as RepoSummaryShape;
+  if (typeof rs.architecture === "string") {
+    enrichment.repoArchitecture = rs.architecture;
+  }
+  if (typeof rs.dataFlow === "string") {
+    enrichment.repoDataFlow = rs.dataFlow;
+  }
+  if (Array.isArray(rs.keyPatterns)) {
+    enrichment.repoKeyPatterns = rs.keyPatterns.filter((p): p is string => typeof p === "string");
+  }
+  if (Array.isArray(rs.majorSubsystems)) {
+    enrichment.majorSubsystems = rs.majorSubsystems
+      .filter((s): s is { name?: unknown; responsibility?: unknown } => typeof s === "object" && s !== null)
+      .map((s) => ({
+        name: typeof s.name === "string" ? s.name : "",
+        responsibility: typeof s.responsibility === "string" ? s.responsibility : "",
+      }))
+      .filter((s) => s.name.length > 0);
+  }
+}
+
+/**
+ * Reads enrichment data from disk. Never throws — every missing file degrades
+ * silently to empty data. The strategy proceeds with whatever it finds; the
+ * LLM is robust to empty enrichment sections.
/** Returns `value` when it is a string, otherwise `fallback`. */
function takeString(value: unknown, fallback = ""): string {
  return typeof value === "string" ? value : fallback;
}

/** Returns the string members of `value`; anything that is not an array yields `[]`. */
function takeStringArray(value: unknown): string[] {
  if (!Array.isArray(value)) {
    return [];
  }
  return value.filter((v): v is string => typeof v === "string");
}

/**
 * Merges three partial analysis blobs (product, technical, shared) into a
 * single fully-populated `BusinessContextAnalysis`. Missing or wrongly-typed
 * fields default to `""` / `[]`.
 *
 * The pre-generated title is the final fallback for `title`. It is used when
 * the merged blob carries no title, a non-string title, or a title that is
 * empty after trimming. The empty case matters because the analysis prompt
 * asks the model to emit an empty string for any field it cannot derive.
 * A usable title is returned trimmed.
 *
 * @param merged - Union of the raw key/value pairs returned by the analysis calls.
 * @param fallbackTitle - Title produced by the earlier title-generation call.
 * @returns An analysis object with every field present.
 */
export function mergeAnalysisFields(merged: Record<string, unknown>, fallbackTitle: string): BusinessContextAnalysis {
  const emittedTitle = takeString(merged["title"]).trim();
  return {
    title: emittedTitle.length > 0 ? emittedTitle : fallbackTitle,
    product_area: takeString(merged["product_area"]),
    user_stories: takeStringArray(merged["user_stories"]),
    business_value: takeString(merged["business_value"]),
    stakeholders: takeStringArray(merged["stakeholders"]),
    success_metrics: takeStringArray(merged["success_metrics"]),
    user_impact: takeString(merged["user_impact"]),
    domain_keywords: takeStringArray(merged["domain_keywords"]),
    technical_summary: takeString(merged["technical_summary"]),
    affected_modules: takeStringArray(merged["affected_modules"]),
    architecture_decisions: takeStringArray(merged["architecture_decisions"]),
    dependencies: takeStringArray(merged["dependencies"]),
    risk_areas: takeStringArray(merged["risk_areas"]),
    data_flow: takeString(merged["data_flow"]),
    api_surface: takeStringArray(merged["api_surface"]),
    summary: takeString(merged["summary"]),
    keywords: takeStringArray(merged["keywords"]),
  };
}
"#src/prompt/title-prompt.ts"; +import type { BusinessContextLlmOptions, TitleGenerationResult } from "#src/types.ts"; + +const FALLBACK_TITLE = "Untitled Business Context"; + +const KNOWN_PROVIDERS: ReadonlySet = new Set(["openrouter", "ollama"]); + +function buildLlmOpts(options: BusinessContextLlmOptions): AskJsonLlmOptions { + const opts: AskJsonLlmOptions = { maxRetries: 2 }; + if (options.apiKey !== undefined) { + opts.apiKey = options.apiKey; + } + if (options.model !== undefined) { + opts.model = options.model; + } + if (options.provider !== undefined && KNOWN_PROVIDERS.has(options.provider)) { + opts.provider = options.provider as LlmProviderName; + } + return opts; +} + +/** + * Runs the title-generation LLM call. Returns `FALLBACK_TITLE` if the LLM + * returns nothing parseable — the rest of the pipeline still completes. + */ +export async function generateBusinessContextTitle( + text: string, + options: BusinessContextLlmOptions, +): Promise { + const systemPrompt = buildTitleGenerationPrompt(); + const result = await askJsonLLM<{ title?: unknown }>(systemPrompt, text, buildLlmOpts(options)); + + const title = + result.result !== null && typeof result.result.title === "string" && result.result.title.trim().length > 0 + ? result.result.title.trim() + : FALLBACK_TITLE; + + logger.info( + `business-context: title generated — "${title}" (model=${result.usage.model}, ${result.usage.inputTokens} in / ${result.usage.outputTokens} out)`, + ); + + return { + title, + inputTokens: result.usage.inputTokens, + outputTokens: result.usage.outputTokens, + modelName: result.usage.model, + }; +} diff --git a/packages/ingest-business-context/src/neo4j/README.md b/packages/ingest-business-context/src/neo4j/README.md new file mode 100644 index 0000000..e3033ec --- /dev/null +++ b/packages/ingest-business-context/src/neo4j/README.md @@ -0,0 +1,33 @@ +# `neo4j/` — context + +Owns every Cypher statement the package issues. 
All writes go through +`@bb/neo4j`'s `runCypher` — no driver imports here. + +## Schema + +``` +(:Knowledge {knowledgeId}) + -[:HAS_BUSINESS_CONTEXT]-> + (:BusinessContext {nodeId, knowledgeId, orgId, title, productArea, summary, + businessValue, technicalSummary, userImpact, + keywordsText, domainKeywordsText, updatedAt}) + -[:HAS_VERSION]-> + (:BusinessContextVersion {knowledgeId, nodeId, commitHash, orgId, + analysisJson, updatedAt}) + -[:DESCRIBES]-> (:FileVersion {knowledgeId, commitHash, …}) [zero or more] + +(:OrgKeyword {orgId, keyword, type}) + -[:APPEARS_IN_BUSINESS_CONTEXT]-> + (:BusinessContext) +``` + +| File | Responsibility | +| ----------------------- | ---------------------------------------------------------------------------- | +| `indexes.ts` | `ensureBusinessContextIndexes()` — 7 `IF NOT EXISTS` indexes. | +| `relationship-types.ts` | Field → relationship-class map (10 typed classes on `:OrgKeyword`). | +| `serialize.ts` | `string[] → "a \| b \| c"` for fulltext-friendly properties. | +| `write-node.ts` | Merges the parent `:BusinessContext` and links it from `:Knowledge`. | +| `write-version.ts` | Merges the per-commit `:BusinessContextVersion` and links to `:FileVersion`. | +| `write-keywords.ts` | Merges `:OrgKeyword` nodes and `:APPEARS_IN_BUSINESS_CONTEXT` edges. | + +Every MERGE is keyed so re-runs are no-ops (idempotency is the contract).
diff --git a/packages/ingest-business-context/src/neo4j/indexes.ts b/packages/ingest-business-context/src/neo4j/indexes.ts new file mode 100644 index 0000000..63dd22d --- /dev/null +++ b/packages/ingest-business-context/src/neo4j/indexes.ts @@ -0,0 +1,24 @@ +import { runCypher } from "@bb/neo4j"; +import { logger } from "@bb/logger"; + +const INDEX_DEFINITIONS: readonly string[] = [ + "CREATE INDEX business_context_by_knowledge IF NOT EXISTS FOR (bc:BusinessContext) ON (bc.knowledgeId)", + "CREATE INDEX business_context_by_node_id IF NOT EXISTS FOR (bc:BusinessContext) ON (bc.nodeId)", + "CREATE INDEX business_context_by_org IF NOT EXISTS FOR (bc:BusinessContext) ON (bc.orgId)", + "CREATE INDEX business_context_version_by_knowledge_commit IF NOT EXISTS FOR (bv:BusinessContextVersion) ON (bv.knowledgeId, bv.commitHash)", + "CREATE INDEX business_context_version_by_node_commit IF NOT EXISTS FOR (bv:BusinessContextVersion) ON (bv.nodeId, bv.commitHash)", + "CREATE INDEX org_keyword_by_org_keyword IF NOT EXISTS FOR (k:OrgKeyword) ON (k.orgId, k.keyword)", + "CREATE INDEX org_keyword_by_type IF NOT EXISTS FOR (k:OrgKeyword) ON (k.type)", +]; + +/** + * Creates the indexes the business-context queries rely on. Safe to call + * repeatedly — every statement uses `IF NOT EXISTS`. The worker invokes + * this once before each Neo4j write. 
/**
 * Creates the indexes the business-context queries rely on. Safe to call
 * repeatedly — every statement uses `IF NOT EXISTS`. The worker invokes
 * this once before each Neo4j write.
 *
 * The statements in `INDEX_DEFINITIONS` are issued one at a time, in array
 * order, through `runCypher`. A rejection from `runCypher` is not caught
 * here, so it propagates to the caller and the remaining statements are
 * not attempted.
 *
 * @returns Resolves after every statement has been awaited and the
 *   completion message has been logged.
 */
export async function ensureBusinessContextIndexes(): Promise<void> {
  for (const ddl of INDEX_DEFINITIONS) {
    // Awaited sequentially: each DDL statement finishes before the next starts.
    await runCypher(ddl);
  }
  logger.info("business-context: indexes ensured");
}
/**
 * Joins an array into a single `" | "`-delimited string for storage on a
 * Neo4j property that should be full-text indexable.
 *
 * Each value is trimmed before joining, so the stored text uses the same
 * normalised spelling as the trimmed keywords written by `write-keywords.ts`.
 * Values that are empty after trimming are skipped, as are non-string values
 * that slipped past the type system at runtime.
 *
 * @param values - Strings to serialise.
 * @returns The joined string, or `""` when nothing survives the filter.
 */
export function serializeArrayForNeo4j(values: readonly string[]): string {
  return values
    .filter((v): v is string => typeof v === "string")
    .map((v) => v.trim())
    .filter((v) => v.length > 0)
    .join(" | ");
}
/**
 * Creates `:OrgKeyword` nodes for every populated array field and connects
 * them to the parent `:BusinessContext`. One query per relationship class
 * keeps the writes batched and idempotent.
 *
 * Keywords are de-duplicated per field before the query runs. The query
 * returns `count(*)` over the unwound keyword list, so a repeated keyword
 * would otherwise be counted once per occurrence although it maps to a
 * single node and edge.
 *
 * @param identity - Knowledge and org the keywords belong to.
 * @param analysis - Completed analysis whose array fields supply the keywords.
 * @param sanitizedTitle - Slug used as the `nodeId` of the parent `:BusinessContext`.
 * @returns Total number of keyword edges (created or pre-existing) reported
 *   by the queries across all relationship classes.
 */
export async function createBusinessContextKeywords(
  identity: BusinessContextKeywordIdentity,
  analysis: BusinessContextAnalysis,
  sanitizedTitle: string,
): Promise<number> {
  let total = 0;
  for (const [field, relType] of Object.entries(BUSINESS_CONTEXT_KEYWORD_TYPES)) {
    // A Set keeps first-seen order, so the keyword order sent to Neo4j is stable.
    const words = [...new Set(pickArrayField(analysis, field))];
    if (words.length === 0) {
      continue;
    }

    const rows = await runCypher<{ count: number }>(MERGE_KEYWORDS, {
      keywords: words.map((w) => ({ word: w })),
      relType,
      orgId: identity.orgId,
      nodeId: sanitizedTitle,
      knowledgeId: identity.knowledgeId,
    });
    // No rows, or a row without a count, contributes zero.
    total += Number(rows[0]?.count ?? 0);
  }
  return total;
}
/**
 * Upserts the parent `:BusinessContext` node and links it from the owning
 * `:Knowledge`. The node is merged on `(nodeId, knowledgeId)`, so submitting
 * the same business context again updates the existing node.
 *
 * @param identity - Knowledge and org that own the node.
 * @param analysis - Completed analysis whose scalar fields are copied onto the node.
 * @param sanitizedTitle - Slug stored as the node's `nodeId`.
 * @returns The `count` reported by the query, or 0 when it returns no rows.
 */
export async function createBusinessContextNode(
  identity: BusinessContextNodeIdentity,
  analysis: BusinessContextAnalysis,
  sanitizedTitle: string,
): Promise<number> {
  const params = {
    nodeId: sanitizedTitle,
    knowledgeId: identity.knowledgeId,
    orgId: identity.orgId,
    title: analysis.title,
    productArea: analysis.product_area,
    summary: analysis.summary,
    businessValue: analysis.business_value,
    technicalSummary: analysis.technical_summary,
    userImpact: analysis.user_impact,
    keywordsText: serializeArrayForNeo4j(analysis.keywords),
    domainKeywordsText: serializeArrayForNeo4j(analysis.domain_keywords),
    updatedAt: new Date().toISOString(),
  };
  const result = await runCypher<{ count: number }>(MERGE_BUSINESS_CONTEXT, params);
  // An empty result and a row without a count both map to 0.
  return Number(result[0]?.count ?? 0);
}
/**
 * Upserts the `:BusinessContextVersion` snapshot for one commit and attaches
 * it to its parent `:BusinessContext`. The whole analysis is stored as a JSON
 * string on the version node so it can be reconstructed without reading disk.
 *
 * @param identity - Knowledge, org and commit the snapshot belongs to.
 * @param analysis - Completed analysis, serialised into `analysisJson`.
 * @param sanitizedTitle - Slug used as the `nodeId` of both nodes.
 * @returns The `count` reported by the query, or 0 when it returns no rows.
 */
export async function createBusinessContextVersionNode(
  identity: BusinessContextVersionIdentity,
  analysis: BusinessContextAnalysis,
  sanitizedTitle: string,
): Promise<number> {
  const params = {
    nodeId: sanitizedTitle,
    knowledgeId: identity.knowledgeId,
    orgId: identity.orgId,
    commitHash: identity.commitHash,
    analysisJson: JSON.stringify(analysis),
    updatedAt: new Date().toISOString(),
  };
  const result = await runCypher<{ count: number }>(MERGE_VERSION, params);
  return Number(result[0]?.count ?? 0);
}

/**
 * Connects the `:BusinessContextVersion` to every `:FileVersion` that shares
 * its `(knowledgeId, commitHash)`. No matching file versions means no edges;
 * because the edges are merged, a later re-run adds only the missing ones.
 *
 * @param identity - Knowledge and commit to match; `orgId` is not sent to the query.
 * @param sanitizedTitle - Slug identifying the version node.
 * @returns The `count` reported by the query, or 0 when it returns no rows.
 */
export async function linkVersionToFileVersions(
  identity: BusinessContextVersionIdentity,
  sanitizedTitle: string,
): Promise<number> {
  const params = {
    nodeId: sanitizedTitle,
    knowledgeId: identity.knowledgeId,
    commitHash: identity.commitHash,
  };
  const result = await runCypher<{ count: number }>(LINK_TO_FILE_VERSIONS, params);
  return Number(result[0]?.count ?? 0);
}
Returns `{ "title": "…" }`. | +| `analysis-prompt.ts` | System prompt for partial-fields analysis. Builds the JSON template from field-defs. | +| `user-message.ts` | Composes the user message (text + title + enrichment) for analysis calls. | + +All prompt content stays here. Nothing else in the package builds prompts. diff --git a/packages/ingest-business-context/src/prompt/analysis-prompt.ts b/packages/ingest-business-context/src/prompt/analysis-prompt.ts new file mode 100644 index 0000000..b93b2b6 --- /dev/null +++ b/packages/ingest-business-context/src/prompt/analysis-prompt.ts @@ -0,0 +1,48 @@ +import { BUSINESS_CONTEXT_FIELD_DEFS } from "#src/field-defs.ts"; + +/** + * Builds a system prompt asking the LLM to fill exactly the requested field + * subset (a slice of the full 16-field schema). Each call in the parallel + * pipeline targets one subset (product, technical, shared) so total context + * stays under budget and the JSON outputs are small enough to parse reliably. + * + * The prompt emits a JSON template that lists only the requested fields with + * their descriptions, special instructions, and an example value drawn from + * `BUSINESS_CONTEXT_FIELD_DEFS`. The LLM is asked to populate every key. + */ +export function buildPartialAnalysisPrompt(requestedFields: readonly string[]): string { + const fieldBlocks: string[] = []; + for (const name of requestedFields) { + const def = BUSINESS_CONTEXT_FIELD_DEFS[name]; + if (!def) { + continue; + } + fieldBlocks.push( + ` "${name}": ${def.example}\n // type: ${def.type}\n // description: ${def.description}\n // instructions: ${def.special_instructions}`, + ); + } + + return `You are an analyst combining business context with technical understanding of an indexed codebase. + +The user provides: + 1. Raw business-context text describing why a commit exists. + 2. A pre-generated title for context. + 3. 
(Optional) Aggregated enrichment data sampled from the repository (top keywords, architecture + summary, file tree, integration surface). Use these as evidence to ground your output; do not + invent claims that conflict with them. + +Your task: extract the following fields, populating EVERY key. If a field cannot be derived from +the text or enrichment, output an empty string or empty array — never null or undefined. + +Output format (strict JSON, no markdown fences, no commentary): + +{ +${fieldBlocks.join(",\n")} +} + +Rules: +- Honour every "instructions" line literally — they cap list lengths and dictate tone. +- Do not echo the field descriptions or instructions in your output. +- Do not introduce extra top-level keys beyond those listed. +- Output ONE JSON object. Nothing else.`; +} diff --git a/packages/ingest-business-context/src/prompt/title-prompt.ts b/packages/ingest-business-context/src/prompt/title-prompt.ts new file mode 100644 index 0000000..b1e60dd --- /dev/null +++ b/packages/ingest-business-context/src/prompt/title-prompt.ts @@ -0,0 +1,22 @@ +/** + * System prompt for the title-generation LLM call. Asks the model to read the + * raw business-context text and return a single JSON object with one key, + * `title`, holding a concise product-recognisable string. + */ +export function buildTitleGenerationPrompt(): string { + return `You are a senior product manager generating a concise title for a business-context entry. + +The user will provide raw text describing a piece of business context attached to a code commit. +Your task: produce ONE short, descriptive title that a product manager would recognise instantly +when scanning a list of business contexts. + +Requirements: +- Maximum 12 words. +- No technical jargon. No code identifiers (no camelCase, no file paths, no function names). +- Product-domain language. Capture the *what* and the *audience*, not the *how*. 
/**
 * Builds the user-side message for an analysis LLM call.
 *
 * The message always carries the pre-generated title and the raw
 * business-context text. The repository enrichment section is appended only
 * when it contains something other than whitespace, so the call still works
 * before any repo summary exists.
 *
 * @param text - Raw business-context text.
 * @param title - Title produced by the title-generation call.
 * @param enrichmentSection - Pre-rendered enrichment block; may be empty.
 * @returns The sections joined with newlines.
 */
export function buildEnrichedUserMessage(text: string, title: string, enrichmentSection: string): string {
  const baseLines = ["TITLE (pre-generated):", title, "", "BUSINESS CONTEXT TEXT (authored by a human):", text];
  const hasEnrichment = enrichmentSection.trim() !== "";
  const lines = hasEnrichment
    ? [...baseLines, "", "REPOSITORY ENRICHMENT (sampled from the indexed codebase):", enrichmentSection]
    : baseLines;
  return lines.join("\n");
}
/**
 * Reports whether anything is indexed for the given knowledge and commit.
 *
 * Two counts are read in one query: `:FileVersion` rows for exactly
 * `(knowledgeId, commitHash)`, and `:File` rows for the knowledge at any
 * commit. Either one being positive counts as indexed, because the newest
 * commit may not have a snapshot yet.
 *
 * @param knowledgeId - Knowledge to look up.
 * @param commitHash - Commit whose file versions are counted.
 * @returns Both counts plus the combined `indexed` flag.
 */
export async function checkCommitIndexed(knowledgeId: string, commitHash: string): Promise<CommitIndexStatus> {
  const result = await runCypher<{ versions: number; files: number }>(CHECK_INDEXED, { knowledgeId, commitHash });
  // No row at all is treated the same as a row of zeros.
  const { versions, files } = result[0] ?? { versions: 0, files: 0 };
  const fileVersions = Number(versions ?? 0);
  const liveFiles = Number(files ?? 0);
  const indexed = fileVersions > 0 || liveFiles > 0;
  return { fileVersions, liveFiles, indexed };
}

/**
 * Same lookup as `checkCommitIndexed`, but rejects when nothing is indexed.
 *
 * @returns The index status when at least one count is positive.
 * @throws CommitNotIndexedError when neither file versions nor live files exist.
 */
export async function assertCommitIndexed(knowledgeId: string, commitHash: string): Promise<CommitIndexStatus> {
  const status = await checkCommitIndexed(knowledgeId, commitHash);
  if (status.indexed) {
    return status;
  }
  throw new CommitNotIndexedError(knowledgeId, commitHash);
}
/**
 * Main entry point for the BusinessContext disk pipeline. Validates the
 * commit is indexed, generates a title, checks the on-disk cache, and on a
 * miss reads enrichment, runs the parallel analysis, and persists both the
 * original text and the analysis envelope to disk. Neo4j persistence is
 * intentionally separate (`store-graph.ts`) so callers can defer it.
 *
 * The title call runs before the cache lookup, because the cache is keyed by
 * the slug derived from the generated title. A cache hit therefore still
 * costs one LLM call, and nothing is written to disk on that path.
 * NOTE(review): a re-run that produces a different title presumably misses
 * the cache — confirm against `sanitizeTitle` / `loadCachedAnalysis`.
 *
 * @param input - Knowledge, org, commit, raw text and optional description.
 * @param options - Carries the LLM options forwarded to the title and analysis calls.
 * @returns Paths of the persisted files plus the final title, commit hash and slug.
 * @throws CommitNotIndexedError (via `assertCommitIndexed`) when nothing is indexed.
 * @throws BusinessContextAnalysisFailedError when the analysis call yields `null`.
 */
export async function executeBusinessContextStrategy(
  input: BusinessContextInput,
  options: ExecuteOptions,
): Promise<BusinessContextStorageResult> {
  logger.info(
    `business-context: executing — knowledge=${input.knowledgeId}, commit=${input.commitHash.substring(0, 12)}, text=${input.text.length} chars`,
  );

  // 1. Validate the commit (or knowledge) is indexed.
  await assertCommitIndexed(input.knowledgeId, input.commitHash);

  // 2. Generate the title.
  const titleResult = await generateBusinessContextTitle(input.text, options.llmOptions);
  const sanitizedTitle = sanitizeTitle(titleResult.title);
  if (sanitizedTitle.length === 0) {
    // Defensive: an empty slug would collide on every BC. Bail with a stable fallback.
    logger.warn(`business-context: sanitized title was empty for "${titleResult.title}" — using fallback slug`);
  }
  // The slug names the on-disk directory and is returned as `sanitizedTitle`.
  const effectiveSlug = sanitizedTitle.length > 0 ? sanitizedTitle : "untitled-business-context";

  // 3. Cache hit? Skip the analysis call and return the existing paths.
  const cached = await loadCachedAnalysis(input.knowledgeId, input.commitHash, effectiveSlug);
  if (cached !== null) {
    const dir = businessContextDir(input.knowledgeId, input.commitHash, effectiveSlug);
    return {
      analysisPath: path.join(dir, "analysis.json"),
      originalTextPath: path.join(dir, "original.txt"),
      // On a hit the cached analysis title is reported, not the freshly generated one.
      title: cached.analysis.title,
      commitHash: input.commitHash,
      sanitizedTitle: effectiveSlug,
    };
  }

  // 4. Collect enrichment + run the parallel analysis.
  const enrichment = await collectEnrichmentData(input.knowledgeId, input.orgId);
  const analysisResult = await analyzeBusinessContextParallel(
    input.text,
    titleResult.title,
    enrichment,
    options.llmOptions,
  );
  if (analysisResult.analysis === null) {
    throw new BusinessContextAnalysisFailedError(input.knowledgeId, input.commitHash);
  }

  // 5. Persist to disk in parallel.
  // Token totals cover the title call plus the analysis calls.
  const totalInputTokens = titleResult.inputTokens + analysisResult.inputTokens;
  const totalOutputTokens = titleResult.outputTokens + analysisResult.outputTokens;
  const [originalTextPath, analysisPath] = await Promise.all([
    saveOriginalText(input.knowledgeId, input.commitHash, effectiveSlug, input.text),
    saveAnalysis(input.knowledgeId, input.commitHash, effectiveSlug, analysisResult.analysis, {
      commitHash: input.commitHash,
      modelName: analysisResult.modelName,
      inputTokens: totalInputTokens,
      outputTokens: totalOutputTokens,
      // Only include `description` when the caller supplied one.
      ...(input.description !== undefined ? { description: input.description } : {}),
    }),
  ]);

  logger.info(
    `business-context: strategy complete — title="${analysisResult.analysis.title}", commit=${input.commitHash.substring(0, 12)}`,
  );

  return {
    analysisPath,
    originalTextPath,
    title: analysisResult.analysis.title,
    commitHash: input.commitHash,
    sanitizedTitle: effectiveSlug,
  };
}
type { BusinessContextAnalysis, BusinessContextNeo4jResult } from "#src/types.ts"; + +export interface StoreGraphInput { + knowledgeId: string; + orgId: string; + commitHash: string; +} + +/** + * Persists a completed `BusinessContextAnalysis` to Neo4j. Four steps: + * + * 1. Ensure indexes exist (idempotent). + * 2. Merge the parent `:BusinessContext` and link from `:Knowledge`. + * 3. Merge the per-commit `:BusinessContextVersion`, then MERGE `:DESCRIBES` + * edges to every `:FileVersion {knowledgeId, commitHash}` that exists. + * 4. Merge `:OrgKeyword` nodes and `:APPEARS_IN_BUSINESS_CONTEXT` edges. + */ +export async function storeBusinessContextToNeo4j( + input: StoreGraphInput, + analysis: BusinessContextAnalysis, + sanitizedTitle: string, +): Promise { + await ensureBusinessContextIndexes(); + + const nodeCount = await createBusinessContextNode( + { knowledgeId: input.knowledgeId, orgId: input.orgId }, + analysis, + sanitizedTitle, + ); + + const versionCount = await createBusinessContextVersionNode( + { knowledgeId: input.knowledgeId, orgId: input.orgId, commitHash: input.commitHash }, + analysis, + sanitizedTitle, + ); + + const fileVersionEdges = await linkVersionToFileVersions( + { knowledgeId: input.knowledgeId, orgId: input.orgId, commitHash: input.commitHash }, + sanitizedTitle, + ); + + const keywordEdges = await createBusinessContextKeywords( + { knowledgeId: input.knowledgeId, orgId: input.orgId }, + analysis, + sanitizedTitle, + ); + + logger.info( + `business-context: graph stored — node=${nodeCount > 0}, version=${versionCount > 0}, fileVersion=${fileVersionEdges}, keywords=${keywordEdges}`, + ); + + return { + businessContextNodeCreated: nodeCount > 0, + versionNodeCreated: versionCount > 0, + keywordRelationships: keywordEdges, + fileVersionRelationships: fileVersionEdges, + }; +} diff --git a/packages/ingest-business-context/src/types.ts b/packages/ingest-business-context/src/types.ts new file mode 100644 index 0000000..ec3c6b7 --- /dev/null 
+++ b/packages/ingest-business-context/src/types.ts @@ -0,0 +1,117 @@ +/** + * The structured analysis produced by LLM from user-authored business-context text. + * Two audiences are served by one document: product people (title, user stories, + * stakeholders, business value) and engineers (technical summary, affected modules, + * architecture decisions, dependencies, data flow). + */ +export interface BusinessContextAnalysis { + // Product fields + title: string; + product_area: string; + user_stories: string[]; + business_value: string; + stakeholders: string[]; + success_metrics: string[]; + user_impact: string; + domain_keywords: string[]; + + // Technical fields + technical_summary: string; + affected_modules: string[]; + architecture_decisions: string[]; + dependencies: string[]; + risk_areas: string[]; + data_flow: string; + api_surface: string[]; + + // Shared fields + summary: string; + keywords: string[]; +} + +/** + * Input to the BusinessContext strategy. `orgId` is single-tenant (`"local"`) in + * OSS; downstream multi-tenant deployments stamp it from the request. + */ +export interface BusinessContextInput { + /** Raw business-context text authored by a human. */ + text: string; + /** Knowledge entity UUID. */ + knowledgeId: string; + /** 40-char hex SHA. Must reference an indexed commit. */ + commitHash: string; + /** Tenant binding. */ + orgId: string; + /** Optional human-supplied description, persisted alongside the analysis envelope. */ + description?: string; +} + +/** Result of the disk-side pipeline (validation → enrichment → LLM → write). */ +export interface BusinessContextStorageResult { + /** Absolute path to the saved `analysis.json`. */ + analysisPath: string; + /** Absolute path to the saved `original.txt`. */ + originalTextPath: string; + /** The LLM-generated title. */ + title: string; + /** The commit hash the analysis is anchored to. */ + commitHash: string; + /** Sanitized title used as the node_id and the on-disk directory name. 
*/ + sanitizedTitle: string; +} + +/** Result returned after persisting to Neo4j. */ +export interface BusinessContextNeo4jResult { + /** Whether the main `:BusinessContext` node was created or merged (true on the first run and on any later MERGE). */ + businessContextNodeCreated: boolean; + /** Whether the per-commit `:BusinessContextVersion` was created or merged. */ + versionNodeCreated: boolean; + /** Total number of `:OrgKeyword` relationships created. */ + keywordRelationships: number; + /** Count of `[:DESCRIBES]` edges from the version node to file-version nodes for this commit. */ + fileVersionRelationships: number; +} + +/** Metadata envelope wrapping the analysis when persisted to disk. */ +export interface BusinessContextAnalysisMetadata { + /** ISO timestamp of when the analysis was generated. */ + generatedAt: string; + /** The commit hash this analysis is stored under. */ + commitHash: string; + /** LLM model name used. */ + modelName: string; + /** Total input tokens consumed (title + analysis calls combined). */ + inputTokens: number; + /** Total output tokens consumed (title + analysis calls combined). */ + outputTokens: number; + /** Optional human-supplied description carried through from the input. */ + description?: string; + /** The full analysis object. */ + analysis: BusinessContextAnalysis; +} + +/** Result of the title-generation LLM call. */ +export interface TitleGenerationResult { + title: string; + inputTokens: number; + outputTokens: number; + modelName: string; +} + +/** Result of the parallel analysis LLM calls. */ +export interface AnalysisResult { + analysis: BusinessContextAnalysis | null; + inputTokens: number; + outputTokens: number; + modelName: string; +} + +/** Options forwarded to the LLM layer (per-job credential overrides, etc.). */ +export interface BusinessContextLlmOptions { + /** Optional per-job LLM API key override. */ + apiKey?: string; + /** Optional per-job LLM provider override (`"openrouter"` or `"ollama"` in OSS).
*/ + provider?: string; + /** Optional per-job LLM model override. */ + model?: string; +} diff --git a/packages/ingest-business-context/src/worker/README.md b/packages/ingest-business-context/src/worker/README.md new file mode 100644 index 0000000..d74e95f --- /dev/null +++ b/packages/ingest-business-context/src/worker/README.md @@ -0,0 +1,12 @@ +# `worker/` — context + +BullMQ worker registration. + +| File | Responsibility | +| ------------- | ---------------------------------------------------------------------- | +| `handler.ts` | Runs `execute → store-graph` for each `BusinessContextProcessing` job. | +| `register.ts` | `registerBusinessContextWorker()` — called once by the deployable. | + +The handler re-reads the persisted analysis from disk between the disk and +graph phases so a future split into two queue jobs produces the same result +as the current inline flow. diff --git a/packages/ingest-business-context/src/worker/handler.ts b/packages/ingest-business-context/src/worker/handler.ts new file mode 100644 index 0000000..5be25de --- /dev/null +++ b/packages/ingest-business-context/src/worker/handler.ts @@ -0,0 +1,73 @@ +import { getConfigValue } from "@bb/config"; +import { Config, type BusinessContextProcessingPayload, type JobMessage, JobType } from "@bb/types"; +import { logger } from "@bb/logger"; +import type { JobHandler } from "@bb/queue"; +import { executeBusinessContextStrategy } from "#src/strategy/execute.ts"; +import { storeBusinessContextToNeo4j } from "#src/strategy/store-graph.ts"; +import type { BusinessContextAnalysis, BusinessContextLlmOptions } from "#src/types.ts"; +import { readFile } from "node:fs/promises"; + +const DEFAULT_ORG_ID = "local"; + +function buildLlmOptions(payload: BusinessContextProcessingPayload): BusinessContextLlmOptions { + const opts: BusinessContextLlmOptions = {}; + if (payload.llmApiKey !== undefined) { + opts.apiKey = payload.llmApiKey; + } + if (payload.llmProvider !== undefined) { + opts.provider = 
payload.llmProvider; + } + if (payload.llmModel !== undefined) { + opts.model = payload.llmModel; + } + return opts; +} + +function resolveOrgId(payload: BusinessContextProcessingPayload): string { + if (payload.orgId !== undefined && payload.orgId.length > 0) { + return payload.orgId; + } + const configured = getConfigValue(Config.OrgId); + return configured.length > 0 ? configured : DEFAULT_ORG_ID; +} + +/** + * BullMQ job handler for `JobType.BusinessContextProcessing`. Runs the disk + * strategy then the graph store. Re-reads the persisted analysis from disk + * before the graph step so a deferred / split execution path produces the same + * result as the inline path. + */ +export const handleBusinessContextProcessing: JobHandler = async ( + msg: JobMessage, +): Promise => { + const { payload } = msg; + const orgId = resolveOrgId(payload); + const input = { + text: payload.customText, + knowledgeId: payload.knowledgeId, + commitHash: payload.commitHash, + orgId, + ...(payload.description !== undefined ? { description: payload.description } : {}), + }; + + logger.info( + `business-context.handler: starting job=${msg.id} knowledge=${input.knowledgeId} commit=${input.commitHash.substring(0, 12)}`, + ); + + const storage = await executeBusinessContextStrategy(input, { llmOptions: buildLlmOptions(payload) }); + + // Re-load the persisted analysis to feed the graph step. Keeps the contract + // identical whether the graph step runs inline or is deferred to a follow-up + // job: in both cases the source of truth is what's on disk. 
+ const envelope = JSON.parse(await readFile(storage.analysisPath, "utf-8")) as { + analysis: BusinessContextAnalysis; + }; + + await storeBusinessContextToNeo4j( + { knowledgeId: input.knowledgeId, orgId, commitHash: input.commitHash }, + envelope.analysis, + storage.sanitizedTitle, + ); + + logger.info(`business-context.handler: completed job=${msg.id}`); +}; diff --git a/packages/ingest-business-context/src/worker/register.ts b/packages/ingest-business-context/src/worker/register.ts new file mode 100644 index 0000000..80a9686 --- /dev/null +++ b/packages/ingest-business-context/src/worker/register.ts @@ -0,0 +1,13 @@ +import { JobType } from "@bb/types"; +import { registerWorker, type WorkerRegistrationOptions } from "@bb/queue"; +import { handleBusinessContextProcessing } from "#src/worker/handler.ts"; + +/** + * Registers the BusinessContext worker against `JobType.BusinessContextProcessing`. + * Called once by the deployable at boot. The default concurrency is sourced + * from `Config.ConcurrencyGithub` (shared with other CPU/LLM-heavy workers); + * callers may override via `opts.concurrency`. 
+ */ +export function registerBusinessContextWorker(opts: WorkerRegistrationOptions = {}): void { + registerWorker(JobType.BusinessContextProcessing, handleBusinessContextProcessing, opts); +} diff --git a/packages/ingest-business-context/tsconfig.json b/packages/ingest-business-context/tsconfig.json new file mode 100644 index 0000000..d8a16a7 --- /dev/null +++ b/packages/ingest-business-context/tsconfig.json @@ -0,0 +1,4 @@ +{ + "extends": "../../../../tsconfig.base.json", + "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] +} diff --git a/packages/ingest-github/README.md b/packages/ingest-github/README.md index bb624cf..b442726 100644 --- a/packages/ingest-github/README.md +++ b/packages/ingest-github/README.md @@ -80,6 +80,12 @@ function createGithubIngestHandler(deps: IngestJobHandlerDeps): (msg) => Promise function createLocalIngestHandler(deps: IngestJobHandlerDeps): (msg) => Promise; function runPull(msg: JobMessage, pullFactory?: PullFactory): Promise; function reposRoot(): string; +function repoCloneDir(knowledgeId: string): string; +function metaRootFor(knowledgeId: string): string; +function metaPathsFor(knowledgeId: string): MetaPaths; +function commitMetaDir(knowledgeId: string, commitHash: string): string; +function businessContextDir(knowledgeId: string, commitHash: string, sanitizedTitle: string): string; +function orgRegistryDir(knowledgeId: string, orgId: string): string; function createFlatFolderStrategy(deps): IngestStrategy; function createLlmFileAnalyzer(deps): FileAnalyzer; diff --git a/packages/ingest-github/package.json b/packages/ingest-github/package.json index ad516c4..936da74 100644 --- a/packages/ingest-github/package.json +++ b/packages/ingest-github/package.json @@ -4,12 +4,12 @@ "private": true, "type": "module", "main": "./src/index.ts", - "types": "./types/index.d.ts", + "types": "./src/index.ts", "exports": { - ".": { - "types": "./types/index.d.ts", - "default": "./src/index.ts" - } + ".": "./src/index.ts" + }, + 
"imports": { + "#src/*": "./src/*" }, "dependencies": { "@bb/config": "workspace:*", diff --git a/packages/ingest-github/src/adapters/llm-file-analyzer.ts b/packages/ingest-github/src/adapters/llm-file-analyzer.ts index b006fbd..8aafa43 100644 --- a/packages/ingest-github/src/adapters/llm-file-analyzer.ts +++ b/packages/ingest-github/src/adapters/llm-file-analyzer.ts @@ -1,8 +1,8 @@ import { askJsonLLM, type AskLlmOptions } from "@bb/llm"; import { logger } from "@bb/logger"; import type { FileAnalysis, FileAnalysisSection } from "@bb/mongo"; -import { FALLBACK_LANGUAGE, emptyFileAnalysis } from "src/types/file-analysis.ts"; -import type { AnalyzedFileResult, FileAnalyzer } from "src/types/pipeline.ts"; +import { FALLBACK_LANGUAGE, emptyFileAnalysis } from "#src/types/file-analysis.ts"; +import type { AnalyzedFileResult, FileAnalyzer } from "#src/types/pipeline.ts"; export interface LlmFileAnalyzerDeps { buildSystemPrompt: () => string; diff --git a/packages/ingest-github/src/handlers/ingest-job.ts b/packages/ingest-github/src/handlers/ingest-job.ts index 39580af..e9eb790 100644 --- a/packages/ingest-github/src/handlers/ingest-job.ts +++ b/packages/ingest-github/src/handlers/ingest-job.ts @@ -1,7 +1,7 @@ import type { GithubIndexPayload, JobMessage, LocalIngestPayload } from "@bb/types"; import { IngestError } from "@bb/errors"; -import { isEnvelopeCoherent, narrowGithubIngest, narrowLocalIngest } from "src/payload/narrow.ts"; -import type { IngestRunnerDeps } from "src/types/ingest-runner.ts"; +import { isEnvelopeCoherent, narrowGithubIngest, narrowLocalIngest } from "#src/payload/narrow.ts"; +import type { IngestRunnerDeps } from "#src/types/ingest-runner.ts"; export interface IngestJobHandlerDeps { runner: IngestRunnerDeps; diff --git a/packages/ingest-github/src/index.ts b/packages/ingest-github/src/index.ts index 27371ac..7a6ea16 100644 --- a/packages/ingest-github/src/index.ts +++ b/packages/ingest-github/src/index.ts @@ -67,7 +67,15 @@ export type { 
CreatePipelineRunnerDeps } from "./pipeline/run.ts"; export { createGithubIngestHandler, createLocalIngestHandler } from "./handlers/ingest-job.ts"; export type { IngestJobHandlerDeps } from "./handlers/ingest-job.ts"; export { runPull } from "./pipeline/pull.ts"; -export { reposRoot } from "./pipeline/paths.ts"; +export { + reposRoot, + repoCloneDir, + metaRootFor, + metaPathsFor, + commitMetaDir, + businessContextDir, + orgRegistryDir, +} from "./pipeline/paths.ts"; export type { IngestRunnerDeps, IngestRunnerInput } from "./types/ingest-runner.ts"; export type { IngestStrategy, StrategyInput, StrategyResult, StrategyContext } from "./types/strategy.ts"; export type { diff --git a/packages/ingest-github/src/pipeline/branch.ts b/packages/ingest-github/src/pipeline/branch.ts index b048b49..70186ab 100644 --- a/packages/ingest-github/src/pipeline/branch.ts +++ b/packages/ingest-github/src/pipeline/branch.ts @@ -1,6 +1,6 @@ import type { GithubIndexPayload } from "@bb/types"; import { IngestError } from "@bb/errors"; -import { fetchDefaultBranch } from "src/githubApi.ts"; +import { fetchDefaultBranch } from "#src/githubApi.ts"; const DEFAULT_BRANCH = "main"; diff --git a/packages/ingest-github/src/pipeline/context.ts b/packages/ingest-github/src/pipeline/context.ts index 4624414..c392291 100644 --- a/packages/ingest-github/src/pipeline/context.ts +++ b/packages/ingest-github/src/pipeline/context.ts @@ -1,5 +1,3 @@ -// SPDX-License-Identifier: AGPL-3.0-only WITH non-commercial-clause - import { Config } from "@bb/types"; import { getConfigValue } from "@bb/config"; import type { AskLlmOptions } from "@bb/llm"; diff --git a/packages/ingest-github/src/pipeline/disk-source-reader.ts b/packages/ingest-github/src/pipeline/disk-source-reader.ts index 9e404be..43dfd3f 100644 --- a/packages/ingest-github/src/pipeline/disk-source-reader.ts +++ b/packages/ingest-github/src/pipeline/disk-source-reader.ts @@ -1,6 +1,6 @@ import path from "node:path"; import { readFile } from
"node:fs/promises"; -import type { ScanDeps, ScanEntry, SourceReader } from "src/types/pipeline.ts"; +import type { ScanDeps, ScanEntry, SourceReader } from "#src/types/pipeline.ts"; import { scanRepository } from "./scan.ts"; export interface DiskSourceReaderDeps { diff --git a/packages/ingest-github/src/pipeline/paths.ts b/packages/ingest-github/src/pipeline/paths.ts index 21db948..cdddc2f 100644 --- a/packages/ingest-github/src/pipeline/paths.ts +++ b/packages/ingest-github/src/pipeline/paths.ts @@ -1,7 +1,7 @@ import { mkdir } from "node:fs/promises"; import path from "node:path"; import { getBytebellHome } from "@bb/config"; -import type { MetaPaths } from "src/types/meta-paths.ts"; +import type { MetaPaths } from "#src/types/meta-paths.ts"; const DIR_MODE = 0o700; @@ -17,8 +17,12 @@ export async function ensureReposRoot(): Promise { await mkdir(reposRoot(), { recursive: true, mode: DIR_MODE }); } +export function metaRootFor(knowledgeId: string): string { + return path.join(reposRoot(), ".meta", knowledgeId); +} + export function metaPathsFor(knowledgeId: string): MetaPaths { - const metaRoot = path.join(reposRoot(), ".meta", knowledgeId); + const metaRoot = metaRootFor(knowledgeId); return { metaRoot, fileAnalysisDir: path.join(metaRoot, "file-analysis"), @@ -30,6 +34,35 @@ export function metaPathsFor(knowledgeId: string): MetaPaths { }; } +/** + * Per-commit meta directory for content scoped to a specific indexed commit. + * Sits under the knowledge's `metaRoot/commits/<commitHash>/` so it survives + * subsequent pulls that overwrite the live `:File` set. + */ +export function commitMetaDir(knowledgeId: string, commitHash: string): string { + return path.join(metaRootFor(knowledgeId), "commits", commitHash); +} + +/** + * Directory for business-context analyses authored against a specific commit.
+ * Each business context lives at `business-context/<sanitizedTitle>/` and contains + * `original.txt` (the raw user-authored text) and `analysis.json` (the LLM + * analysis wrapped in its metadata envelope). + */ +export function businessContextDir(knowledgeId: string, commitHash: string, sanitizedTitle: string): string { + return path.join(commitMetaDir(knowledgeId, commitHash), "business-context", sanitizedTitle); +} + +/** + * Org-level keyword registry directory. In single-tenant OSS this resolves to + * `metaRoot/org/<orgId>/` (orgId defaults to `"local"`); downstream multi-tenant + * deployments may aggregate registries across multiple knowledges into the same + * directory. The business-context enrichment reader tolerates missing files. + */ +export function orgRegistryDir(knowledgeId: string, orgId: string): string { + return path.join(metaRootFor(knowledgeId), "org", orgId); +} + export async function ensureMetaDirs(paths: MetaPaths): Promise { await mkdir(paths.fileAnalysisDir, { recursive: true, mode: DIR_MODE }); await mkdir(paths.folderSummariesDir, { recursive: true, mode: DIR_MODE }); diff --git a/packages/ingest-github/src/pipeline/pull.ts b/packages/ingest-github/src/pipeline/pull.ts index c5ec3ff..1776b77 100644 --- a/packages/ingest-github/src/pipeline/pull.ts +++ b/packages/ingest-github/src/pipeline/pull.ts @@ -12,21 +12,25 @@ import { assertReachableFromBranch, checkoutCommit, type DiffResult } from "./gi import { computePullDiff, materialiseEndpoints } from "./pull-diff-resolver.ts"; import { affectedFoldersFromDiff } from "./affected-folders.ts"; import { createDiskSourceReader } from "./disk-source-reader.ts"; -import type { PullFactory, SourceReader, ArchiveSink } from "src/types/pipeline.ts"; -import type { ProgressContextFactory } from "src/progress/types.ts"; -import { nullProgressContextFactory } from "src/progress/NullProgressReporter.ts"; -import { analyseChangedFiles } from "src/strategies/flat-folder/analyse-changed.ts"; -import { processBigFilesQueue
} from "src/strategies/flat-folder/phases/process-big-files.ts"; -import { backfillMissingFields } from "src/strategies/flat-folder/backfill/fields.ts"; -import { backfillBigFiles } from "src/strategies/flat-folder/backfill/big-files.ts"; -import { runSelectiveFolderSummary } from "src/strategies/flat-folder/folder-summary-selective.ts"; -import { makeRepoSummaryEnvelope, persistRepoSummary, summariseRepo } from "src/strategies/flat-folder/repo-summary.ts"; -import { storePullAnalysis } from "src/strategies/flat-folder/store-pull.ts"; -import { createLlmFileAnalyzer } from "src/adapters/llm-file-analyzer.ts"; +import type { PullFactory, SourceReader, ArchiveSink } from "#src/types/pipeline.ts"; +import type { ProgressContextFactory } from "#src/progress/types.ts"; +import { nullProgressContextFactory } from "#src/progress/NullProgressReporter.ts"; +import { analyseChangedFiles } from "#src/strategies/flat-folder/analyse-changed.ts"; +import { processBigFilesQueue } from "#src/strategies/flat-folder/phases/process-big-files.ts"; +import { backfillMissingFields } from "#src/strategies/flat-folder/backfill/fields.ts"; +import { backfillBigFiles } from "#src/strategies/flat-folder/backfill/big-files.ts"; +import { runSelectiveFolderSummary } from "#src/strategies/flat-folder/folder-summary-selective.ts"; +import { + makeRepoSummaryEnvelope, + persistRepoSummary, + summariseRepo, +} from "#src/strategies/flat-folder/repo-summary.ts"; +import { storePullAnalysis } from "#src/strategies/flat-folder/store-pull.ts"; +import { createLlmFileAnalyzer } from "#src/adapters/llm-file-analyzer.ts"; import { COMBINED_CODE_ANALYSIS_SYSTEM_PROMPT, buildFileAnalysisUserPrompt, -} from "src/strategies/flat-folder/prompts/file-analysis.ts"; +} from "#src/strategies/flat-folder/prompts/file-analysis.ts"; const COMMIT_HASH_RE = /^[0-9a-f]{40}$/u; diff --git a/packages/ingest-github/src/pipeline/run.ts b/packages/ingest-github/src/pipeline/run.ts index ff52cb3..5cf1d15 100644 --- 
a/packages/ingest-github/src/pipeline/run.ts +++ b/packages/ingest-github/src/pipeline/run.ts @@ -3,11 +3,11 @@ import { setKnowledgeBranch, setKnowledgeCommit, setKnowledgeState } from "@bb/m import { setKnowledgeBranchInGraph, setKnowledgeStateInGraph } from "@bb/neo4j"; import { IngestError } from "@bb/errors"; import { logger } from "@bb/logger"; -import type { IngestRunnerDeps, IngestRunnerInput } from "src/types/ingest-runner.ts"; -import type { IngestStrategy } from "src/types/strategy.ts"; -import type { ArchiveSink, PipelineSummary, SourceFactory, SourceReader } from "src/types/pipeline.ts"; -import type { ProgressContextFactory } from "src/progress/types.ts"; -import { nullProgressContextFactory } from "src/progress/NullProgressReporter.ts"; +import type { IngestRunnerDeps, IngestRunnerInput } from "#src/types/ingest-runner.ts"; +import type { IngestStrategy } from "#src/types/strategy.ts"; +import type { ArchiveSink, PipelineSummary, SourceFactory, SourceReader } from "#src/types/pipeline.ts"; +import type { ProgressContextFactory } from "#src/progress/types.ts"; +import { nullProgressContextFactory } from "#src/progress/NullProgressReporter.ts"; import { ensureMetaDirs, ensureReposRoot, metaPathsFor, repoCloneDir } from "./paths.ts"; import { readHeadCommitHash, syncRepository } from "./source.ts"; import { resolveBranch } from "./branch.ts"; diff --git a/packages/ingest-github/src/pipeline/scan.ts b/packages/ingest-github/src/pipeline/scan.ts index a6b03ef..02d17ea 100644 --- a/packages/ingest-github/src/pipeline/scan.ts +++ b/packages/ingest-github/src/pipeline/scan.ts @@ -5,7 +5,7 @@ import { getConfigValue } from "@bb/config"; import type { AskLlmOptions } from "@bb/llm"; import { logger } from "@bb/logger"; import { SKIP_DIRS, looksBinary, passesPathFilters } from "./filters.ts"; -import type { ScanEntry, SkipDecider } from "src/types/pipeline.ts"; +import type { ScanEntry, SkipDecider } from "#src/types/pipeline.ts"; interface ScanLimits { 
absoluteCap: number; diff --git a/packages/ingest-github/src/pipeline/skip-decisions/decider.ts b/packages/ingest-github/src/pipeline/skip-decisions/decider.ts index 9c5d3cd..455f633 100644 --- a/packages/ingest-github/src/pipeline/skip-decisions/decider.ts +++ b/packages/ingest-github/src/pipeline/skip-decisions/decider.ts @@ -4,7 +4,7 @@ import { Config } from "@bb/types"; import { getConfigValue } from "@bb/config"; import { askYesNoLLM, type AskLlmOptions } from "@bb/llm"; import { logger } from "@bb/logger"; -import type { SkipDecider, SkipDeciderInput, SkipDecision } from "src/types/pipeline.ts"; +import type { SkipDecider, SkipDeciderInput, SkipDecision } from "#src/types/pipeline.ts"; import { defaultCachePath, emptyCache, diff --git a/packages/ingest-github/src/pipeline/stats.ts b/packages/ingest-github/src/pipeline/stats.ts index b9ff4c9..84cf9cb 100644 --- a/packages/ingest-github/src/pipeline/stats.ts +++ b/packages/ingest-github/src/pipeline/stats.ts @@ -1,5 +1,3 @@ -// SPDX-License-Identifier: AGPL-3.0-only WITH non-commercial-clause - import { recordProcessingStats } from "@bb/mongo"; import { estimateCostFromBreakdown } from "@bb/llm"; diff --git a/packages/ingest-github/src/progress/NullProgressReporter.ts b/packages/ingest-github/src/progress/NullProgressReporter.ts index 8c3d394..fd59574 100644 --- a/packages/ingest-github/src/progress/NullProgressReporter.ts +++ b/packages/ingest-github/src/progress/NullProgressReporter.ts @@ -4,7 +4,7 @@ import type { ProgressPhase, ProgressReporter, ProgressReporterInput, -} from "src/progress/types.ts"; +} from "#src/progress/types.ts"; class NullProgressReporter implements ProgressReporter { async start(): Promise { diff --git a/packages/ingest-github/src/strategies/basic-file-analysis/BasicFileAnalysisStrategy.ts.archived b/packages/ingest-github/src/strategies/basic-file-analysis/BasicFileAnalysisStrategy.ts.archived index 3eeaead..0c8696e 100644 --- 
a/packages/ingest-github/src/strategies/basic-file-analysis/BasicFileAnalysisStrategy.ts.archived +++ b/packages/ingest-github/src/strategies/basic-file-analysis/BasicFileAnalysisStrategy.ts.archived @@ -1,4 +1,3 @@ -// SPDX-License-Identifier: AGPL-3.0-only WITH non-commercial-clause // // ===================================================================== // ARCHIVED — v1 strategy. Superseded by `strategies/flat-folder/`. diff --git a/packages/ingest-github/src/strategies/flat-folder/analyse-changed.ts b/packages/ingest-github/src/strategies/flat-folder/analyse-changed.ts index ddfe9c1..6b55754 100644 --- a/packages/ingest-github/src/strategies/flat-folder/analyse-changed.ts +++ b/packages/ingest-github/src/strategies/flat-folder/analyse-changed.ts @@ -3,17 +3,17 @@ import { tokenLen, type AskLlmOptions } from "@bb/llm"; import { logger } from "@bb/logger"; import { Config } from "@bb/types"; import { getConfigValue } from "@bb/config"; -import type { ArchiveSink, FileAnalyzer, ScannedFile, SourceReader } from "src/types/pipeline.ts"; -import type { MetaPaths } from "src/types/meta-paths.ts"; -import type { BigFileEntry } from "src/types/big-file.ts"; -import type { ProgressContext } from "src/progress/types.ts"; -import { looksBinary, passesPathFilters } from "src/pipeline/filters.ts"; -import { withConcurrency } from "src/pipeline/concurrency.ts"; -import { throwIfCancelled, CancellationError } from "src/pipeline/cancellation.ts"; -import type { DiffResult } from "src/pipeline/git-diff.ts"; -import { analyseScannedFile, buildOversizedStub } from "src/strategies/flat-folder/analyse-file.ts"; -import { saveCondensed } from "src/strategies/flat-folder/big-file/storage.ts"; -import { readBigFiles, writeBigFiles } from "src/strategies/flat-folder/big-file/detector.ts"; +import type { ArchiveSink, FileAnalyzer, ScannedFile, SourceReader } from "#src/types/pipeline.ts"; +import type { MetaPaths } from "#src/types/meta-paths.ts"; +import type { BigFileEntry } from 
"#src/types/big-file.ts"; +import type { ProgressContext } from "#src/progress/types.ts"; +import { looksBinary, passesPathFilters } from "#src/pipeline/filters.ts"; +import { withConcurrency } from "#src/pipeline/concurrency.ts"; +import { throwIfCancelled, CancellationError } from "#src/pipeline/cancellation.ts"; +import type { DiffResult } from "#src/pipeline/git-diff.ts"; +import { analyseScannedFile, buildOversizedStub } from "#src/strategies/flat-folder/analyse-file.ts"; +import { saveCondensed } from "#src/strategies/flat-folder/big-file/storage.ts"; +import { readBigFiles, writeBigFiles } from "#src/strategies/flat-folder/big-file/detector.ts"; export interface AnalyseChangedInput { knowledgeId: string; diff --git a/packages/ingest-github/src/strategies/flat-folder/analyse-file.ts b/packages/ingest-github/src/strategies/flat-folder/analyse-file.ts index d7e8c71..2a41165 100644 --- a/packages/ingest-github/src/strategies/flat-folder/analyse-file.ts +++ b/packages/ingest-github/src/strategies/flat-folder/analyse-file.ts @@ -1,7 +1,7 @@ import { createHash } from "node:crypto"; import { tokenLen, type AskLlmOptions } from "@bb/llm"; -import type { CondensedFileAnalysis } from "src/types/condensed-file-analysis.ts"; -import type { FileAnalyzer, ScannedFile } from "src/types/pipeline.ts"; +import type { CondensedFileAnalysis } from "#src/types/condensed-file-analysis.ts"; +import type { FileAnalyzer, ScannedFile } from "#src/types/pipeline.ts"; export async function analyseScannedFile( analyzer: FileAnalyzer, diff --git a/packages/ingest-github/src/strategies/flat-folder/backfill/big-files.ts b/packages/ingest-github/src/strategies/flat-folder/backfill/big-files.ts index 0925eb1..2c8e201 100644 --- a/packages/ingest-github/src/strategies/flat-folder/backfill/big-files.ts +++ b/packages/ingest-github/src/strategies/flat-folder/backfill/big-files.ts @@ -1,11 +1,11 @@ import { logger } from "@bb/logger"; import type { AskLlmOptions } from "@bb/llm"; -import type { 
MetaPaths } from "src/types/meta-paths.ts"; -import type { SourceReader } from "src/types/pipeline.ts"; -import type { ProgressContext } from "src/progress/types.ts"; -import { readBigFiles } from "src/strategies/flat-folder/big-file/detector.ts"; -import { inspect } from "src/strategies/flat-folder/big-file/cache.ts"; -import { processBigFile } from "src/strategies/flat-folder/big-file/index.ts"; +import type { MetaPaths } from "#src/types/meta-paths.ts"; +import type { SourceReader } from "#src/types/pipeline.ts"; +import type { ProgressContext } from "#src/progress/types.ts"; +import { readBigFiles } from "#src/strategies/flat-folder/big-file/detector.ts"; +import { inspect } from "#src/strategies/flat-folder/big-file/cache.ts"; +import { processBigFile } from "#src/strategies/flat-folder/big-file/index.ts"; export interface BackfillBigFilesInput { knowledgeId: string; diff --git a/packages/ingest-github/src/strategies/flat-folder/backfill/fields.ts b/packages/ingest-github/src/strategies/flat-folder/backfill/fields.ts index 758b7ae..aaf206b 100644 --- a/packages/ingest-github/src/strategies/flat-folder/backfill/fields.ts +++ b/packages/ingest-github/src/strategies/flat-folder/backfill/fields.ts @@ -1,11 +1,11 @@ import { askJsonLLM, type AskLlmOptions } from "@bb/llm"; import { logger } from "@bb/logger"; import type { FileAnalysis, FileAnalysisSection } from "@bb/mongo"; -import type { MetaPaths } from "src/types/meta-paths.ts"; -import type { ProgressContext } from "src/progress/types.ts"; -import { iterateCondensed } from "src/strategies/flat-folder/big-file/storage.ts"; -import { saveCondensed } from "src/strategies/flat-folder/big-file/storage.ts"; -import { BACKFILL_SYSTEM_PROMPT, buildBackfillUserPrompt } from "src/strategies/flat-folder/prompts/backfill.ts"; +import type { MetaPaths } from "#src/types/meta-paths.ts"; +import type { ProgressContext } from "#src/progress/types.ts"; +import { iterateCondensed } from 
"#src/strategies/flat-folder/big-file/storage.ts"; +import { saveCondensed } from "#src/strategies/flat-folder/big-file/storage.ts"; +import { BACKFILL_SYSTEM_PROMPT, buildBackfillUserPrompt } from "#src/strategies/flat-folder/prompts/backfill.ts"; const EXTENDED_ARRAY_KEYS = [ "ontologyConcepts", diff --git a/packages/ingest-github/src/strategies/flat-folder/big-file/cache.ts b/packages/ingest-github/src/strategies/flat-folder/big-file/cache.ts index 86bd85a..5f62fe9 100644 --- a/packages/ingest-github/src/strategies/flat-folder/big-file/cache.ts +++ b/packages/ingest-github/src/strategies/flat-folder/big-file/cache.ts @@ -1,5 +1,5 @@ import { readManifestIfPresent, readCondensed } from "./storage.ts"; -import type { MetaPaths } from "src/types/meta-paths.ts"; +import type { MetaPaths } from "#src/types/meta-paths.ts"; export type BigFileCacheStatus = "complete" | "stale-condensed" | "missing"; diff --git a/packages/ingest-github/src/strategies/flat-folder/big-file/chunk-analyzer.ts b/packages/ingest-github/src/strategies/flat-folder/big-file/chunk-analyzer.ts index 32c6691..1d9d830 100644 --- a/packages/ingest-github/src/strategies/flat-folder/big-file/chunk-analyzer.ts +++ b/packages/ingest-github/src/strategies/flat-folder/big-file/chunk-analyzer.ts @@ -1,9 +1,9 @@ import { askJsonLLM, type AskLlmOptions } from "@bb/llm"; import { logger } from "@bb/logger"; -import type { ChunkAnalysisResult, FileChunk } from "src/types/big-file.ts"; -import { FALLBACK_LANGUAGE, emptyFileAnalysis } from "src/types/file-analysis.ts"; -import { shapeAnalysis } from "src/adapters/llm-file-analyzer.ts"; -import { CHUNK_ANALYSIS_SYSTEM_PROMPT, buildChunkUserPrompt } from "src/strategies/flat-folder/prompts/chunk.ts"; +import type { ChunkAnalysisResult, FileChunk } from "#src/types/big-file.ts"; +import { FALLBACK_LANGUAGE, emptyFileAnalysis } from "#src/types/file-analysis.ts"; +import { shapeAnalysis } from "#src/adapters/llm-file-analyzer.ts"; +import { 
CHUNK_ANALYSIS_SYSTEM_PROMPT, buildChunkUserPrompt } from "#src/strategies/flat-folder/prompts/chunk.ts"; export async function analyzeChunk(chunk: FileChunk, llmCallContext?: AskLlmOptions): Promise { const systemPrompt = CHUNK_ANALYSIS_SYSTEM_PROMPT; diff --git a/packages/ingest-github/src/strategies/flat-folder/big-file/chunker.ts b/packages/ingest-github/src/strategies/flat-folder/big-file/chunker.ts index f65e51a..1f284ee 100644 --- a/packages/ingest-github/src/strategies/flat-folder/big-file/chunker.ts +++ b/packages/ingest-github/src/strategies/flat-folder/big-file/chunker.ts @@ -1,5 +1,5 @@ import { tokenLen } from "@bb/llm"; -import type { FileChunk } from "src/types/big-file.ts"; +import type { FileChunk } from "#src/types/big-file.ts"; export function splitFileIntoChunks(relativePath: string, content: string, maxTokensPerChunk: number): FileChunk[] { const lines = content.split("\n"); diff --git a/packages/ingest-github/src/strategies/flat-folder/big-file/condenser.ts b/packages/ingest-github/src/strategies/flat-folder/big-file/condenser.ts index 4a5a577350627bd551645325e5f5de3b16ca4794..f1f366442f370a8b7c5eacfcaf196802dda85c33 100644 GIT binary patch delta 38 qcmccZ`M`672qUBNWD!P3AQ{T&2P79W+5pM>jH!&un_ZZuD**uC6bj@3 delta 28 kcmaFhdE0Y?2;*c?M#stFjDC}sG1^Ri%$T~_lWDpV0Gu%jKL7v# diff --git a/packages/ingest-github/src/strategies/flat-folder/big-file/detector.ts b/packages/ingest-github/src/strategies/flat-folder/big-file/detector.ts index 86a6412..0ea7f0c 100644 --- a/packages/ingest-github/src/strategies/flat-folder/big-file/detector.ts +++ b/packages/ingest-github/src/strategies/flat-folder/big-file/detector.ts @@ -1,8 +1,8 @@ import { readFile, writeFile } from "node:fs/promises"; import { tokenLen } from "@bb/llm"; import { logger } from "@bb/logger"; -import type { BigFileEntry, BigFileReason } from "src/types/big-file.ts"; -import type { MetaPaths } from "src/types/meta-paths.ts"; +import type { BigFileEntry, BigFileReason } from 
"#src/types/big-file.ts"; +import type { MetaPaths } from "#src/types/meta-paths.ts"; export function classifyByTokens( content: string, diff --git a/packages/ingest-github/src/strategies/flat-folder/big-file/index.ts b/packages/ingest-github/src/strategies/flat-folder/big-file/index.ts index 73f7895..e100b1d 100644 --- a/packages/ingest-github/src/strategies/flat-folder/big-file/index.ts +++ b/packages/ingest-github/src/strategies/flat-folder/big-file/index.ts @@ -3,11 +3,11 @@ import { Config } from "@bb/types"; import { getConfigValue } from "@bb/config"; import type { AskLlmOptions } from "@bb/llm"; import { logger } from "@bb/logger"; -import type { ChunkAnalysisResult, HugeFileManifest } from "src/types/big-file.ts"; -import type { CondensedFileAnalysis } from "src/types/condensed-file-analysis.ts"; -import type { MetaPaths } from "src/types/meta-paths.ts"; -import type { ProgressContext } from "src/progress/types.ts"; -import { throwIfCancelled } from "src/pipeline/cancellation.ts"; +import type { ChunkAnalysisResult, HugeFileManifest } from "#src/types/big-file.ts"; +import type { CondensedFileAnalysis } from "#src/types/condensed-file-analysis.ts"; +import type { MetaPaths } from "#src/types/meta-paths.ts"; +import type { ProgressContext } from "#src/progress/types.ts"; +import { throwIfCancelled } from "#src/pipeline/cancellation.ts"; import { splitFileIntoChunks } from "./chunker.ts"; import { analyzeChunk } from "./chunk-analyzer.ts"; import { condenseChunks } from "./condenser.ts"; diff --git a/packages/ingest-github/src/strategies/flat-folder/big-file/storage.ts b/packages/ingest-github/src/strategies/flat-folder/big-file/storage.ts index a607531..edc38df 100644 --- a/packages/ingest-github/src/strategies/flat-folder/big-file/storage.ts +++ b/packages/ingest-github/src/strategies/flat-folder/big-file/storage.ts @@ -1,9 +1,9 @@ import { mkdir, readFile, readdir, writeFile } from "node:fs/promises"; import path from "node:path"; -import { encodeMetaPath 
} from "src/pipeline/paths.ts"; -import type { CondensedFileAnalysis } from "src/types/condensed-file-analysis.ts"; -import type { ChunkAnalysisResult, HugeFileManifest } from "src/types/big-file.ts"; -import type { MetaPaths } from "src/types/meta-paths.ts"; +import { encodeMetaPath } from "#src/pipeline/paths.ts"; +import type { CondensedFileAnalysis } from "#src/types/condensed-file-analysis.ts"; +import type { ChunkAnalysisResult, HugeFileManifest } from "#src/types/big-file.ts"; +import type { MetaPaths } from "#src/types/meta-paths.ts"; const DIR_MODE = 0o700; diff --git a/packages/ingest-github/src/strategies/flat-folder/folder-summary-selective.ts b/packages/ingest-github/src/strategies/flat-folder/folder-summary-selective.ts index 000767d..87c23ed 100644 --- a/packages/ingest-github/src/strategies/flat-folder/folder-summary-selective.ts +++ b/packages/ingest-github/src/strategies/flat-folder/folder-summary-selective.ts @@ -2,14 +2,14 @@ import { logger } from "@bb/logger"; import { Config } from "@bb/types"; import { getConfigValue } from "@bb/config"; import type { AskLlmOptions } from "@bb/llm"; -import type { MetaPaths } from "src/types/meta-paths.ts"; -import { withConcurrency } from "src/pipeline/concurrency.ts"; -import { throwIfCancelled, CancellationError } from "src/pipeline/cancellation.ts"; +import type { MetaPaths } from "#src/types/meta-paths.ts"; +import { withConcurrency } from "#src/pipeline/concurrency.ts"; +import { throwIfCancelled, CancellationError } from "#src/pipeline/cancellation.ts"; import { groupByDirectFolder, persistFolderSummary, summariseFolder, -} from "src/strategies/flat-folder/folder-summary.ts"; +} from "#src/strategies/flat-folder/folder-summary.ts"; export interface SelectiveFolderSummaryInput { knowledgeId: string; diff --git a/packages/ingest-github/src/strategies/flat-folder/folder-summary.ts b/packages/ingest-github/src/strategies/flat-folder/folder-summary.ts index c3b29fc..10b895c 100644 --- 
a/packages/ingest-github/src/strategies/flat-folder/folder-summary.ts +++ b/packages/ingest-github/src/strategies/flat-folder/folder-summary.ts @@ -4,12 +4,12 @@ import { askJsonLLM, type AskLlmOptions } from "@bb/llm"; import { logger } from "@bb/logger"; import { Config } from "@bb/types"; import { getConfigValue } from "@bb/config"; -import type { CondensedFileAnalysis } from "src/types/condensed-file-analysis.ts"; -import type { MetaPaths } from "src/types/meta-paths.ts"; -import { encodeMetaPath } from "src/pipeline/paths.ts"; -import { withConcurrency } from "src/pipeline/concurrency.ts"; -import { throwIfCancelled, CancellationError } from "src/pipeline/cancellation.ts"; -import type { ProgressContext } from "src/progress/types.ts"; +import type { CondensedFileAnalysis } from "#src/types/condensed-file-analysis.ts"; +import type { MetaPaths } from "#src/types/meta-paths.ts"; +import { encodeMetaPath } from "#src/pipeline/paths.ts"; +import { withConcurrency } from "#src/pipeline/concurrency.ts"; +import { throwIfCancelled, CancellationError } from "#src/pipeline/cancellation.ts"; +import type { ProgressContext } from "#src/progress/types.ts"; import { iterateCondensed } from "./big-file/storage.ts"; import { directFolderOf } from "./folder-path.ts"; import { FOLDER_ANALYSIS_SYSTEM_PROMPT, folderAnalysisUserPrompt } from "./prompts/folder-summary.ts"; diff --git a/packages/ingest-github/src/strategies/flat-folder/index.ts b/packages/ingest-github/src/strategies/flat-folder/index.ts index 70bee12..72f2640 100644 --- a/packages/ingest-github/src/strategies/flat-folder/index.ts +++ b/packages/ingest-github/src/strategies/flat-folder/index.ts @@ -1,7 +1,7 @@ import { logger } from "@bb/logger"; -import type { FileAnalyzer } from "src/types/pipeline.ts"; -import type { IngestStrategy, StrategyInput, StrategyResult } from "src/types/strategy.ts"; -import { throwIfCancelled } from "src/pipeline/cancellation.ts"; +import type { FileAnalyzer } from 
"#src/types/pipeline.ts"; +import type { IngestStrategy, StrategyInput, StrategyResult } from "#src/types/strategy.ts"; +import { throwIfCancelled } from "#src/pipeline/cancellation.ts"; import { classifyAndAnalyseSmall } from "./phases/classify-and-analyse-small.ts"; import { processBigFilesQueue } from "./phases/process-big-files.ts"; import { backfillMissingFields } from "./backfill/fields.ts"; @@ -9,8 +9,8 @@ import { backfillBigFiles } from "./backfill/big-files.ts"; import { runFolderSummaryPhase } from "./folder-summary.ts"; import { makeRepoSummaryEnvelope, persistRepoSummary, summariseRepo } from "./repo-summary.ts"; import { storeFlatAnalysis } from "./phases/store-flat-analysis.ts"; -import type { ProgressContext, ProgressContextFactory } from "src/progress/types.ts"; -import { nullProgressContextFactory } from "src/progress/NullProgressReporter.ts"; +import type { ProgressContext, ProgressContextFactory } from "#src/progress/types.ts"; +import { nullProgressContextFactory } from "#src/progress/NullProgressReporter.ts"; export interface FlatFolderStrategyDeps { fileAnalyzer: FileAnalyzer; diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/classify-and-analyse-small.ts b/packages/ingest-github/src/strategies/flat-folder/phases/classify-and-analyse-small.ts index 6b82ca6..a3922fd 100644 --- a/packages/ingest-github/src/strategies/flat-folder/phases/classify-and-analyse-small.ts +++ b/packages/ingest-github/src/strategies/flat-folder/phases/classify-and-analyse-small.ts @@ -3,16 +3,16 @@ import { tokenLen, type AskLlmOptions } from "@bb/llm"; import { logger } from "@bb/logger"; import { Config } from "@bb/types"; import { getConfigValue } from "@bb/config"; -import type { ArchiveSink, FileAnalyzer, SkipDecider, SourceReader } from "src/types/pipeline.ts"; -import type { MetaPaths } from "src/types/meta-paths.ts"; -import type { BigFileEntry } from "src/types/big-file.ts"; -import type { ProgressContext } from "src/progress/types.ts"; 
-import { withConcurrency } from "src/pipeline/concurrency.ts"; -import { throwIfCancelled, CancellationError } from "src/pipeline/cancellation.ts"; -import { makeSkipDecider } from "src/pipeline/skip-decisions/index.ts"; -import { analyseScannedFile, buildOversizedStub } from "src/strategies/flat-folder/analyse-file.ts"; -import { saveCondensed } from "src/strategies/flat-folder/big-file/storage.ts"; -import { writeBigFiles } from "src/strategies/flat-folder/big-file/detector.ts"; +import type { ArchiveSink, FileAnalyzer, SkipDecider, SourceReader } from "#src/types/pipeline.ts"; +import type { MetaPaths } from "#src/types/meta-paths.ts"; +import type { BigFileEntry } from "#src/types/big-file.ts"; +import type { ProgressContext } from "#src/progress/types.ts"; +import { withConcurrency } from "#src/pipeline/concurrency.ts"; +import { throwIfCancelled, CancellationError } from "#src/pipeline/cancellation.ts"; +import { makeSkipDecider } from "#src/pipeline/skip-decisions/index.ts"; +import { analyseScannedFile, buildOversizedStub } from "#src/strategies/flat-folder/analyse-file.ts"; +import { saveCondensed } from "#src/strategies/flat-folder/big-file/storage.ts"; +import { writeBigFiles } from "#src/strategies/flat-folder/big-file/detector.ts"; export interface ClassifyPhaseInput { knowledgeId: string; diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts b/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts index 6ada634..c0563b2 100644 --- a/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts +++ b/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts @@ -1,12 +1,12 @@ import { logger } from "@bb/logger"; import type { AskLlmOptions } from "@bb/llm"; -import type { MetaPaths } from "src/types/meta-paths.ts"; -import type { SourceReader } from "src/types/pipeline.ts"; -import type { ProgressContext } from "src/progress/types.ts"; -import { 
throwIfCancelled, CancellationError } from "src/pipeline/cancellation.ts"; -import { readBigFiles } from "src/strategies/flat-folder/big-file/detector.ts"; -import { inspect } from "src/strategies/flat-folder/big-file/cache.ts"; -import { processBigFile } from "src/strategies/flat-folder/big-file/index.ts"; +import type { MetaPaths } from "#src/types/meta-paths.ts"; +import type { SourceReader } from "#src/types/pipeline.ts"; +import type { ProgressContext } from "#src/progress/types.ts"; +import { throwIfCancelled, CancellationError } from "#src/pipeline/cancellation.ts"; +import { readBigFiles } from "#src/strategies/flat-folder/big-file/detector.ts"; +import { inspect } from "#src/strategies/flat-folder/big-file/cache.ts"; +import { processBigFile } from "#src/strategies/flat-folder/big-file/index.ts"; export interface ProcessBigFilesInput { knowledgeId: string; diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/store-flat-analysis.ts b/packages/ingest-github/src/strategies/flat-folder/phases/store-flat-analysis.ts index b700986..dbcbb30 100644 --- a/packages/ingest-github/src/strategies/flat-folder/phases/store-flat-analysis.ts +++ b/packages/ingest-github/src/strategies/flat-folder/phases/store-flat-analysis.ts @@ -2,14 +2,14 @@ import { readFile } from "node:fs/promises"; import { logger } from "@bb/logger"; import { ensureFlatFolderIndexes, upsertFileNode, upsertFolderNode, upsertRepoNode, type NodeScope } from "@bb/neo4j"; import type { GithubIndexPayload } from "@bb/types"; -import type { MetaPaths } from "src/types/meta-paths.ts"; -import { throwIfCancelled } from "src/pipeline/cancellation.ts"; -import { iterateCondensed } from "src/strategies/flat-folder/big-file/storage.ts"; -import { iterateFolderSummaries } from "src/strategies/flat-folder/folder-summary.ts"; -import { directFolderOf } from "src/strategies/flat-folder/folder-path.ts"; -import { languageFromPath } from "src/adapters/llm-file-analyzer.ts"; -import type { 
ProgressContext } from "src/progress/types.ts"; -import type { FolderSummary, RepoSummary, RepoSummaryEnvelope } from "src/strategies/flat-folder/types.ts"; +import type { MetaPaths } from "#src/types/meta-paths.ts"; +import { throwIfCancelled } from "#src/pipeline/cancellation.ts"; +import { iterateCondensed } from "#src/strategies/flat-folder/big-file/storage.ts"; +import { iterateFolderSummaries } from "#src/strategies/flat-folder/folder-summary.ts"; +import { directFolderOf } from "#src/strategies/flat-folder/folder-path.ts"; +import { languageFromPath } from "#src/adapters/llm-file-analyzer.ts"; +import type { ProgressContext } from "#src/progress/types.ts"; +import type { FolderSummary, RepoSummary, RepoSummaryEnvelope } from "#src/strategies/flat-folder/types.ts"; export interface StoreFlatAnalysisInput { scope: NodeScope; diff --git a/packages/ingest-github/src/strategies/flat-folder/prompts/folder-summary.ts b/packages/ingest-github/src/strategies/flat-folder/prompts/folder-summary.ts index 465c9dc..10276a8 100644 --- a/packages/ingest-github/src/strategies/flat-folder/prompts/folder-summary.ts +++ b/packages/ingest-github/src/strategies/flat-folder/prompts/folder-summary.ts @@ -1,4 +1,4 @@ -import type { CondensedFileAnalysis } from "src/types/condensed-file-analysis.ts"; +import type { CondensedFileAnalysis } from "#src/types/condensed-file-analysis.ts"; export const FOLDER_ANALYSIS_SYSTEM_PROMPT = `You are summarising a single FOLDER of a source repository. The user will provide the per-file analyses of the files DIRECTLY inside that folder (subfolders are summarised separately and are NOT in your input). 
diff --git a/packages/ingest-github/src/strategies/flat-folder/repo-summary.ts b/packages/ingest-github/src/strategies/flat-folder/repo-summary.ts index a2af39b..0eaec5c 100644 --- a/packages/ingest-github/src/strategies/flat-folder/repo-summary.ts +++ b/packages/ingest-github/src/strategies/flat-folder/repo-summary.ts @@ -3,8 +3,8 @@ import { askJsonLLM, tokenLen, type AskLlmOptions } from "@bb/llm"; import { logger } from "@bb/logger"; import { Config } from "@bb/types"; import { getConfigValue } from "@bb/config"; -import type { MetaPaths } from "src/types/meta-paths.ts"; -import { throwIfCancelled } from "src/pipeline/cancellation.ts"; +import type { MetaPaths } from "#src/types/meta-paths.ts"; +import { throwIfCancelled } from "#src/pipeline/cancellation.ts"; import { iterateFolderSummaries } from "./folder-summary.ts"; import { REPO_SUMMARY_SYSTEM_PROMPT, diff --git a/packages/ingest-github/src/strategies/flat-folder/store-pull.ts b/packages/ingest-github/src/strategies/flat-folder/store-pull.ts index 9b74fad..d070c42 100644 --- a/packages/ingest-github/src/strategies/flat-folder/store-pull.ts +++ b/packages/ingest-github/src/strategies/flat-folder/store-pull.ts @@ -10,15 +10,15 @@ import { } from "@bb/neo4j"; import { deleteRawFiles } from "@bb/mongo"; import type { GithubIndexPayload } from "@bb/types"; -import type { MetaPaths } from "src/types/meta-paths.ts"; -import type { CondensedFileAnalysis } from "src/types/condensed-file-analysis.ts"; -import { throwIfCancelled } from "src/pipeline/cancellation.ts"; -import type { DiffResult } from "src/pipeline/git-diff.ts"; -import { readCondensed } from "src/strategies/flat-folder/big-file/storage.ts"; -import { iterateFolderSummaries } from "src/strategies/flat-folder/folder-summary.ts"; -import { directFolderOf } from "src/strategies/flat-folder/folder-path.ts"; -import { languageFromPath } from "src/adapters/llm-file-analyzer.ts"; -import type { FolderSummary, RepoSummary, RepoSummaryEnvelope } from 
"src/strategies/flat-folder/types.ts"; +import type { MetaPaths } from "#src/types/meta-paths.ts"; +import type { CondensedFileAnalysis } from "#src/types/condensed-file-analysis.ts"; +import { throwIfCancelled } from "#src/pipeline/cancellation.ts"; +import type { DiffResult } from "#src/pipeline/git-diff.ts"; +import { readCondensed } from "#src/strategies/flat-folder/big-file/storage.ts"; +import { iterateFolderSummaries } from "#src/strategies/flat-folder/folder-summary.ts"; +import { directFolderOf } from "#src/strategies/flat-folder/folder-path.ts"; +import { languageFromPath } from "#src/adapters/llm-file-analyzer.ts"; +import type { FolderSummary, RepoSummary, RepoSummaryEnvelope } from "#src/strategies/flat-folder/types.ts"; export interface StorePullInput { scope: NodeScope; diff --git a/packages/ingest-github/src/strategies/flat-folder/types.ts b/packages/ingest-github/src/strategies/flat-folder/types.ts index 9d33168..15ac29f 100644 --- a/packages/ingest-github/src/strategies/flat-folder/types.ts +++ b/packages/ingest-github/src/strategies/flat-folder/types.ts @@ -1,4 +1,4 @@ -import type { CondensedFileAnalysis } from "src/types/condensed-file-analysis.ts"; +import type { CondensedFileAnalysis } from "#src/types/condensed-file-analysis.ts"; export interface AnalyzedFileEntry { relativePath: string; diff --git a/packages/ingest-github/src/types/pipeline.ts b/packages/ingest-github/src/types/pipeline.ts index dd8e898..382cd16 100644 --- a/packages/ingest-github/src/types/pipeline.ts +++ b/packages/ingest-github/src/types/pipeline.ts @@ -1,7 +1,7 @@ import type { GithubIndexPayload, GithubPullPayload } from "@bb/types"; import type { AskLlmOptions } from "@bb/llm"; import type { FileAnalysis } from "@bb/mongo"; -import type { DiffResult } from "src/pipeline/git-diff.ts"; +import type { DiffResult } from "#src/pipeline/git-diff.ts"; export interface ScannedFile { kind: "file"; diff --git a/packages/ingest-github/tsconfig.json 
b/packages/ingest-github/tsconfig.json index 07da1a4..d8a16a7 100644 --- a/packages/ingest-github/tsconfig.json +++ b/packages/ingest-github/tsconfig.json @@ -1,25 +1,4 @@ { - "extends": "../../tsconfig.base.json", - "compilerOptions": { - "rootDir": "./src", - "outDir": "./dist", - "baseUrl": ".", - "paths": { - "src/*": ["./src/*"] - }, - "ignoreDeprecations": "5.0", - "noEmit": false, - "emitDeclarationOnly": true - }, - "include": ["src/**/*", "src/**/*.json"], - "references": [ - { "path": "../config" }, - { "path": "../errors" }, - { "path": "../llm" }, - { "path": "../logger" }, - { "path": "../mongo" }, - { "path": "../neo4j" }, - { "path": "../queue" }, - { "path": "../types" } - ] + "extends": "../../../../tsconfig.base.json", + "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/ingest-github/types/index.d.ts b/packages/ingest-github/types/index.d.ts index 69c8433..98445ad 100644 --- a/packages/ingest-github/types/index.d.ts +++ b/packages/ingest-github/types/index.d.ts @@ -49,6 +49,12 @@ export declare const createGithubIngestHandler: (...args: any[]) => any; export declare const createLocalIngestHandler: (...args: any[]) => any; export declare const runPull: (...args: any[]) => any; export declare const reposRoot: (...args: any[]) => string; +export declare const repoCloneDir: (knowledgeId: string) => string; +export declare const metaRootFor: (knowledgeId: string) => string; +export declare const metaPathsFor: (knowledgeId: string) => unknown; +export declare const commitMetaDir: (knowledgeId: string, commitHash: string) => string; +export declare const businessContextDir: (knowledgeId: string, commitHash: string, sanitizedTitle: string) => string; +export declare const orgRegistryDir: (knowledgeId: string, orgId: string) => string; export declare function fetchLatestCommitHash( repoUrl: string, branch: string, diff --git a/packages/llm/package.json b/packages/llm/package.json index a972e7a..32be323 100644 --- 
a/packages/llm/package.json +++ b/packages/llm/package.json @@ -8,6 +8,9 @@ "exports": { ".": "./src/index.ts" }, + "imports": { + "#src/*": "./src/*" + }, "dependencies": { "@bb/config": "workspace:*", "@bb/errors": "workspace:*", diff --git a/packages/llm/src/cache.ts b/packages/llm/src/cache.ts index a98f3e3..a6fea64 100644 --- a/packages/llm/src/cache.ts +++ b/packages/llm/src/cache.ts @@ -1,4 +1,3 @@ -// SPDX-License-Identifier: AGPL-3.0-only WITH non-commercial-clause import { createHash } from "node:crypto"; import fs from "node:fs/promises"; import path from "node:path"; diff --git a/packages/llm/src/client.ts b/packages/llm/src/client.ts index 77c3329..3bbd268 100644 --- a/packages/llm/src/client.ts +++ b/packages/llm/src/client.ts @@ -1,4 +1,3 @@ -// SPDX-License-Identifier: AGPL-3.0-only WITH non-commercial-clause import { getConfigValue } from "@bb/config"; import { logger } from "@bb/logger"; import { Config } from "@bb/types"; diff --git a/packages/llm/src/ollama.ts b/packages/llm/src/ollama.ts index 444b29a..ea9fecd 100644 --- a/packages/llm/src/ollama.ts +++ b/packages/llm/src/ollama.ts @@ -1,4 +1,3 @@ -// SPDX-License-Identifier: AGPL-3.0-only WITH non-commercial-clause import { getConfigValue } from "@bb/config"; import { Config } from "@bb/types"; import { LlmConfigError, LlmError } from "@bb/errors"; diff --git a/packages/llm/src/openrouter.ts b/packages/llm/src/openrouter.ts index 8410f52..fb17150 100644 --- a/packages/llm/src/openrouter.ts +++ b/packages/llm/src/openrouter.ts @@ -1,4 +1,3 @@ -// SPDX-License-Identifier: AGPL-3.0-only WITH non-commercial-clause import { getConfigValue } from "@bb/config"; import { Config } from "@bb/types"; import { LlmConfigError, LlmError } from "@bb/errors"; diff --git a/packages/llm/tsconfig.json b/packages/llm/tsconfig.json index c69f55e..d8a16a7 100644 --- a/packages/llm/tsconfig.json +++ b/packages/llm/tsconfig.json @@ -1,9 +1,4 @@ { - "extends": "../../tsconfig.base.json", - "compilerOptions": { - 
"rootDir": "./src", - "outDir": "./dist" - }, - "include": ["src/**/*"], - "references": [{ "path": "../config" }, { "path": "../errors" }, { "path": "../mongo" }, { "path": "../types" }] + "extends": "../../../../tsconfig.base.json", + "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/logger/package.json b/packages/logger/package.json index 3d25c86..f7a1152 100644 --- a/packages/logger/package.json +++ b/packages/logger/package.json @@ -8,6 +8,9 @@ "exports": { ".": "./src/index.ts" }, + "imports": { + "#src/*": "./src/*" + }, "dependencies": { "@bb/config": "workspace:*", "@bb/types": "workspace:*", diff --git a/packages/logger/tsconfig.json b/packages/logger/tsconfig.json index c705055..d8a16a7 100644 --- a/packages/logger/tsconfig.json +++ b/packages/logger/tsconfig.json @@ -1,11 +1,4 @@ { - "extends": "../../tsconfig.base.json", - "compilerOptions": { - "rootDir": "./src", - "outDir": "./dist", - "noEmit": false, - "emitDeclarationOnly": true - }, - "include": ["src/**/*"], - "references": [{ "path": "../config" }, { "path": "../types" }] + "extends": "../../../../tsconfig.base.json", + "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/mcp/package.json b/packages/mcp/package.json index 7d58faf..98fc5a5 100644 --- a/packages/mcp/package.json +++ b/packages/mcp/package.json @@ -8,6 +8,9 @@ "exports": { ".": "./src/index.ts" }, + "imports": { + "#src/*": "./src/*" + }, "dependencies": { "@bb/config": "workspace:*", "@bb/logger": "workspace:*", diff --git a/packages/mcp/tsconfig.json b/packages/mcp/tsconfig.json index e3950ee..d8a16a7 100644 --- a/packages/mcp/tsconfig.json +++ b/packages/mcp/tsconfig.json @@ -1,15 +1,4 @@ { - "extends": "../../tsconfig.base.json", - "compilerOptions": { - "rootDir": "./src", - "outDir": "./dist" - }, - "include": ["src/**/*"], - "references": [ - { "path": "../config" }, - { "path": "../logger" }, - { "path": "../llm" }, - { "path": "../neo4j" }, - { "path": 
"../types" } - ] + "extends": "../../../../tsconfig.base.json", + "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/mongo/package.json b/packages/mongo/package.json index c32bc30..54f058a 100644 --- a/packages/mongo/package.json +++ b/packages/mongo/package.json @@ -8,6 +8,9 @@ "exports": { ".": "./src/index.ts" }, + "imports": { + "#src/*": "./src/*" + }, "dependencies": { "@bb/config": "workspace:*", "@bb/errors": "workspace:*", diff --git a/packages/mongo/tsconfig.json b/packages/mongo/tsconfig.json index df37b8d..d8a16a7 100644 --- a/packages/mongo/tsconfig.json +++ b/packages/mongo/tsconfig.json @@ -1,9 +1,4 @@ { - "extends": "../../tsconfig.base.json", - "compilerOptions": { - "rootDir": "./src", - "outDir": "./dist" - }, - "include": ["src/**/*"], - "references": [{ "path": "../config" }, { "path": "../errors" }, { "path": "../types" }] + "extends": "../../../../tsconfig.base.json", + "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/neo4j/package.json b/packages/neo4j/package.json index 6335062..7e16617 100644 --- a/packages/neo4j/package.json +++ b/packages/neo4j/package.json @@ -8,6 +8,9 @@ "exports": { ".": "./src/index.ts" }, + "imports": { + "#src/*": "./src/*" + }, "dependencies": { "@bb/config": "workspace:*", "@bb/errors": "workspace:*", diff --git a/packages/neo4j/tsconfig.json b/packages/neo4j/tsconfig.json index 31a7e4d..d8a16a7 100644 --- a/packages/neo4j/tsconfig.json +++ b/packages/neo4j/tsconfig.json @@ -1,9 +1,4 @@ { - "extends": "../../tsconfig.base.json", - "compilerOptions": { - "rootDir": "./src", - "outDir": "./dist" - }, - "include": ["src/**/*"], - "references": [{ "path": "../config" }, { "path": "../errors" }, { "path": "../types" }, { "path": "../mongo" }] + "extends": "../../../../tsconfig.base.json", + "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/queue/package.json b/packages/queue/package.json index 308c1eb..99230be 100644 
--- a/packages/queue/package.json +++ b/packages/queue/package.json @@ -8,6 +8,9 @@ "exports": { ".": "./src/index.ts" }, + "imports": { + "#src/*": "./src/*" + }, "dependencies": { "@bb/config": "workspace:*", "@bb/errors": "workspace:*", diff --git a/packages/queue/src/workers.ts b/packages/queue/src/workers.ts index 5ef8e24..3bfe659 100644 --- a/packages/queue/src/workers.ts +++ b/packages/queue/src/workers.ts @@ -40,6 +40,7 @@ function defaultConcurrencyFor(type: JobType): number { case JobType.GithubIndex: case JobType.GithubPull: case JobType.LocalIngest: + case JobType.BusinessContextProcessing: return getConfigValue(Config.ConcurrencyGithub); } } diff --git a/packages/queue/tsconfig.json b/packages/queue/tsconfig.json index 89d8570..d8a16a7 100644 --- a/packages/queue/tsconfig.json +++ b/packages/queue/tsconfig.json @@ -1,15 +1,4 @@ { - "extends": "../../tsconfig.base.json", - "compilerOptions": { - "rootDir": "./src", - "outDir": "./dist" - }, - "include": ["src/**/*"], - "references": [ - { "path": "../config" }, - { "path": "../errors" }, - { "path": "../mongo" }, - { "path": "../redis" }, - { "path": "../types" } - ] + "extends": "../../../../tsconfig.base.json", + "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/redis/package.json b/packages/redis/package.json index 400bfc7..3aeccf2 100644 --- a/packages/redis/package.json +++ b/packages/redis/package.json @@ -8,6 +8,9 @@ "exports": { ".": "./src/index.ts" }, + "imports": { + "#src/*": "./src/*" + }, "dependencies": { "@bb/config": "workspace:*", "@bb/errors": "workspace:*", diff --git a/packages/redis/tsconfig.json b/packages/redis/tsconfig.json index df37b8d..d8a16a7 100644 --- a/packages/redis/tsconfig.json +++ b/packages/redis/tsconfig.json @@ -1,9 +1,4 @@ { - "extends": "../../tsconfig.base.json", - "compilerOptions": { - "rootDir": "./src", - "outDir": "./dist" - }, - "include": ["src/**/*"], - "references": [{ "path": "../config" }, { "path": "../errors" }, { 
"path": "../types" }] + "extends": "../../../../tsconfig.base.json", + "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/server/package.json b/packages/server/package.json index 2a3915c..aa828af 100644 --- a/packages/server/package.json +++ b/packages/server/package.json @@ -8,6 +8,9 @@ "exports": { ".": "./src/index.ts" }, + "imports": { + "#src/*": "./src/*" + }, "bin": { "bytebell-server": "./src/index.ts" }, diff --git a/packages/server/tsconfig.json b/packages/server/tsconfig.json index b195e2f..d8a16a7 100644 --- a/packages/server/tsconfig.json +++ b/packages/server/tsconfig.json @@ -1,19 +1,4 @@ { - "extends": "../../tsconfig.base.json", - "compilerOptions": { - "rootDir": "./src", - "outDir": "./dist" - }, - "include": ["src/**/*"], - "references": [ - { "path": "../config" }, - { "path": "../errors" }, - { "path": "../ingest-github" }, - { "path": "../mcp" }, - { "path": "../mongo" }, - { "path": "../neo4j" }, - { "path": "../queue" }, - { "path": "../redis" }, - { "path": "../types" } - ] + "extends": "../../../../tsconfig.base.json", + "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/types/package.json b/packages/types/package.json index 3fc568f..e29793f 100644 --- a/packages/types/package.json +++ b/packages/types/package.json @@ -8,5 +8,8 @@ "exports": { ".": "./src/index.ts" }, + "imports": { + "#src/*": "./src/*" + }, "dependencies": {} } diff --git a/packages/types/src/index.ts b/packages/types/src/index.ts index 2871e7b..fc21d19 100644 --- a/packages/types/src/index.ts +++ b/packages/types/src/index.ts @@ -4,6 +4,7 @@ export type { GithubIndexPayload, GithubPullPayload, LocalIngestPayload, + BusinessContextProcessingPayload, JobMessage, PayloadFor, PayloadLlmOverrides, diff --git a/packages/types/src/job.ts b/packages/types/src/job.ts index befc5df..7d42db6 100644 --- a/packages/types/src/job.ts +++ b/packages/types/src/job.ts @@ -2,6 +2,7 @@ export enum JobType { GithubIndex = 
"github_index", GithubPull = "github_pull", LocalIngest = "local_ingest", + BusinessContextProcessing = "CUSTOM_CONTEXT_PROCESSING", } export enum JobPriority { @@ -65,6 +66,33 @@ export interface LocalIngestPayload { orgId?: string; } +/** + * Payload for the BusinessContext processing job. A BusinessContext is a free-text + * note authored by a human against a specific indexed commit of a GitHub knowledge. + * The worker analyses the text into structured product/technical fields, persists + * it to the per-commit meta tree on disk, and projects it into Neo4j as a + * `:BusinessContext` node plus a `:BusinessContextVersion` snapshot keyed by + * `(knowledgeId, commitHash)`. + * + * `orgId` is single-tenant (`"local"`) in OSS; downstream multi-tenant deployments + * stamp it from the request so org-scoped keyword nodes stay isolated. + */ +export interface BusinessContextProcessingPayload extends PayloadLlmOverrides { + knowledgeId: string; + /** 40-char hex SHA of the commit this business context applies to. */ + commitHash: string; + /** Raw, user-authored business-context text. */ + customText: string; + /** Optional human-supplied description for the job-tracking record. */ + description?: string; + /** Optional repo URL (carried for audit; ingestion does not re-clone). */ + repoUrl?: string; + /** Optional branch (carried for audit). */ + branch?: string; + /** Tenant binding. OSS standalone leaves this unset (defaults to `"local"`). */ + orgId?: string; +} + export interface JobMessage

{ id: string; type: JobType; @@ -81,4 +109,6 @@ export type PayloadFor = T extends JobType.GithubIndex ? GithubPullPayload : T extends JobType.LocalIngest ? LocalIngestPayload - : never; + : T extends JobType.BusinessContextProcessing + ? BusinessContextProcessingPayload + : never; diff --git a/packages/types/tsconfig.json b/packages/types/tsconfig.json index c2104f6..d8a16a7 100644 --- a/packages/types/tsconfig.json +++ b/packages/types/tsconfig.json @@ -1,8 +1,4 @@ { - "extends": "../../tsconfig.base.json", - "compilerOptions": { - "rootDir": "./src", - "outDir": "./dist" - }, - "include": ["src/**/*"] + "extends": "../../../../tsconfig.base.json", + "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/tsconfig.json b/tsconfig.json index 97edcbe..4f4863d 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -17,6 +17,7 @@ { "path": "packages/queue" }, { "path": "packages/llm" }, { "path": "packages/ingest-github" }, + { "path": "packages/ingest-business-context" }, { "path": "packages/cli" }, { "path": "packages/server" }, { "path": "packages/neo4j" }, From 4d26790dc7c4e4f6ec3a388c0daa4a0f3448208e Mon Sep 17 00:00:00 2001 From: Dead-Bytes <143434285+Dead-Bytes@users.noreply.github.com> Date: Tue, 19 May 2026 01:42:11 +0530 Subject: [PATCH 22/34] refactor: enhance LLM usage reporting and remove deprecated pricing logic --- packages/errors/src/README.md | 8 +- packages/errors/src/llm-errors.ts | 12 +- .../src/adapters/llm-file-analyzer.ts | 14 +- packages/ingest-github/src/handlers/README.md | 7 +- .../ingest-github/src/handlers/ingest-job.ts | 13 +- packages/ingest-github/src/index.ts | 19 +- packages/ingest-github/src/payload/narrow.ts | 30 ++- packages/ingest-github/src/pipeline/README.md | 31 ++- .../src/pipeline/failure-classifier.ts | 77 +++++++ packages/ingest-github/src/pipeline/pull.ts | 58 ++++-- packages/ingest-github/src/pipeline/run.ts | 64 +++--- packages/ingest-github/src/pipeline/stats.ts | 34 --- 
.../src/progress/NullProgressReporter.ts | 2 +- packages/ingest-github/src/progress/types.ts | 10 +- .../strategies/flat-folder/analyse-changed.ts | 10 +- .../flat-folder/backfill/big-files.ts | 4 + .../strategies/flat-folder/backfill/fields.ts | 4 + .../flat-folder/big-file/chunk-analyzer.ts | 10 +- .../flat-folder/big-file/condenser.ts | Bin 9440 -> 9844 bytes .../strategies/flat-folder/big-file/index.ts | 4 +- .../flat-folder/folder-summary-selective.ts | 11 +- .../strategies/flat-folder/folder-summary.ts | 37 +++- .../src/strategies/flat-folder/index.ts | 11 +- .../phases/classify-and-analyse-small.ts | 12 +- .../flat-folder/phases/process-big-files.ts | 10 +- .../strategies/flat-folder/repo-summary.ts | 40 +++- packages/ingest-github/src/types/big-file.ts | 2 +- .../src/types/condensed-file-analysis.ts | 2 +- packages/ingest-github/src/types/pipeline.ts | 4 +- packages/ingest-github/src/types/strategy.ts | 2 +- packages/llm/README.md | 35 ++-- packages/llm/src/cache.ts | 4 + packages/llm/src/client.ts | 6 + packages/llm/src/index.ts | 1 - packages/llm/src/jsonClient.ts | 13 +- packages/llm/src/ollama.ts | 1 + packages/llm/src/openrouter.ts | 22 +- packages/llm/src/pricing.ts | 137 ------------- packages/mongo/README.md | 29 +-- packages/mongo/src/README.md | 25 ++- packages/mongo/src/aggregateStats.ts | 155 ++++++++++++++ packages/mongo/src/collections.ts | 1 - packages/mongo/src/index.ts | 4 +- packages/mongo/src/knowledge.ts | 52 ++++- packages/mongo/src/processingStats.ts | 194 ------------------ packages/server/src/deleteRoute.ts | 1 - packages/types/src/README.md | 17 +- packages/types/src/index.ts | 12 +- packages/types/src/knowledge.ts | 37 ++++ packages/types/src/stats.ts | 24 --- 50 files changed, 757 insertions(+), 555 deletions(-) create mode 100644 packages/ingest-github/src/pipeline/failure-classifier.ts delete mode 100644 packages/llm/src/pricing.ts create mode 100644 packages/mongo/src/aggregateStats.ts delete mode 100644 
packages/mongo/src/processingStats.ts diff --git a/packages/errors/src/README.md b/packages/errors/src/README.md index c2752d6..4edfc65 100644 --- a/packages/errors/src/README.md +++ b/packages/errors/src/README.md @@ -36,7 +36,13 @@ package-level contract; this file documents how the source tree is split. - **[llm-errors.ts](llm-errors.ts)** — errors thrown by `@bb/llm`. Today: `LlmConfigError` (missing OpenRouter API key; carries the `bytebell keys set` hint), `LlmError` (HTTP non-2xx, timeout, empty - completion; accepts an optional `cause`). + completion; accepts an optional `cause` plus an optional + `{ status?: number; detail?: string }` options bag — `status` is the + provider HTTP status when the failure originated from a non-OK response, + `detail` is the raw response body capped to 4000 chars. Downstream + classifiers like `@bb/ingest-github/src/pipeline/failure-classifier.ts` + map `status` → `KnowledgeFailureCategory` so operators see the right + remediation hint). - **[ingest-errors.ts](ingest-errors.ts)** — errors thrown by `@bb/ingest-*` workers and `@bb/cli`'s ingest command. Today: `GitCloneError` (git binary failed; redacts userinfo in the repo URL diff --git a/packages/errors/src/llm-errors.ts b/packages/errors/src/llm-errors.ts index 67e2a70..bc5c00c 100644 --- a/packages/errors/src/llm-errors.ts +++ b/packages/errors/src/llm-errors.ts @@ -10,11 +10,21 @@ export class LlmConfigError extends Error { export class LlmError extends Error { override readonly name = "LlmError"; + /** HTTP status code from the provider when the failure originated from a non-OK response. */ + readonly status?: number; + /** Raw provider response body (or other structured detail), capped to a sane size by the thrower. 
*/ + readonly detail?: string; - constructor(message: string, cause?: unknown) { + constructor(message: string, cause?: unknown, options?: { status?: number; detail?: string }) { super(message); if (cause !== undefined) { this.cause = cause; } + if (options?.status !== undefined) { + this.status = options.status; + } + if (options?.detail !== undefined) { + this.detail = options.detail; + } } } diff --git a/packages/ingest-github/src/adapters/llm-file-analyzer.ts b/packages/ingest-github/src/adapters/llm-file-analyzer.ts index 8aafa43..8e42d74 100644 --- a/packages/ingest-github/src/adapters/llm-file-analyzer.ts +++ b/packages/ingest-github/src/adapters/llm-file-analyzer.ts @@ -1,4 +1,5 @@ import { askJsonLLM, type AskLlmOptions } from "@bb/llm"; +import { LlmConfigError, LlmError } from "@bb/errors"; import { logger } from "@bb/logger"; import type { FileAnalysis, FileAnalysisSection } from "@bb/mongo"; import { FALLBACK_LANGUAGE, emptyFileAnalysis } from "#src/types/file-analysis.ts"; @@ -42,15 +43,24 @@ export function createLlmFileAnalyzer(deps: LlmFileAnalyzerDeps): FileAnalyzer { const userPrompt = deps.buildUserPrompt(input); const t0 = performance.now(); let raw: RawAnalysisJson | null = null; - let usage: { inputTokens: number; outputTokens: number } | undefined; + let usage: { inputTokens: number; outputTokens: number; costUsd: number } | undefined; try { const response = await askJsonLLM(systemPrompt, userPrompt, input.llmCallContext ?? 
{}); raw = response.result; - usage = { inputTokens: response.usage.inputTokens, outputTokens: response.usage.outputTokens }; + usage = { + inputTokens: response.usage.inputTokens, + outputTokens: response.usage.outputTokens, + costUsd: response.usage.costUsd, + }; if (raw === null) { logger.warn(`llm-file-analyzer: ${input.relativePath} returned unparseable JSON`); } } catch (cause: unknown) { + if (cause instanceof LlmConfigError || cause instanceof LlmError) { + // LLM is unreachable / misconfigured — bubble up so the runner can + // mark the knowledge FAILED with a structured reason. + throw cause; + } const msg = cause instanceof Error ? cause.message : String(cause); logger.warn(`llm-file-analyzer: ${input.relativePath} askJsonLLM failed: ${msg}`); } diff --git a/packages/ingest-github/src/handlers/README.md b/packages/ingest-github/src/handlers/README.md index 934ca3a..edfb36c 100644 --- a/packages/ingest-github/src/handlers/README.md +++ b/packages/ingest-github/src/handlers/README.md @@ -8,8 +8,11 @@ no clone — those belong in `pipeline/run.ts`. - `ingest-job.ts` — `createGithubIngestHandler(deps)` and `createLocalIngestHandler(deps)` both return BullMQ-shaped - `(msg) => Promise` callbacks. They throw `IngestError` on validation - failures; everything else propagates to BullMQ as the worker's failure path. + `(msg) => Promise` callbacks (the summary carries + per-commit `tokenUsage` including `costUsd` so the enterprise wrapper + can mirror it to the knowledge record without a `processing_stats` + round-trip). They throw `IngestError` on validation failures; + everything else propagates to BullMQ as the worker's failure path. - `README.md` — this file. 
## Invariants diff --git a/packages/ingest-github/src/handlers/ingest-job.ts b/packages/ingest-github/src/handlers/ingest-job.ts index e9eb790..0853a5f 100644 --- a/packages/ingest-github/src/handlers/ingest-job.ts +++ b/packages/ingest-github/src/handlers/ingest-job.ts @@ -2,6 +2,7 @@ import type { GithubIndexPayload, JobMessage, LocalIngestPayload } from "@bb/typ import { IngestError } from "@bb/errors"; import { isEnvelopeCoherent, narrowGithubIngest, narrowLocalIngest } from "#src/payload/narrow.ts"; import type { IngestRunnerDeps } from "#src/types/ingest-runner.ts"; +import type { PipelineSummary } from "#src/types/pipeline.ts"; export interface IngestJobHandlerDeps { runner: IngestRunnerDeps; @@ -9,8 +10,8 @@ export interface IngestJobHandlerDeps { export function createGithubIngestHandler( deps: IngestJobHandlerDeps, -): (msg: JobMessage) => Promise { - return async function handleGithubIngest(msg: JobMessage): Promise { +): (msg: JobMessage) => Promise { + return async function handleGithubIngest(msg: JobMessage): Promise { const payload = narrowGithubIngest(msg.knowledgeId, msg.payload); if (!isEnvelopeCoherent(msg.knowledgeId, payload.knowledgeId)) { throw new IngestError( @@ -18,14 +19,14 @@ export function createGithubIngestHandler( `envelope mismatch: job.knowledgeId=${msg.knowledgeId} payload.knowledgeId=${payload.knowledgeId}`, ); } - await deps.runner.run({ job: msg, payload }); + return await deps.runner.run({ job: msg, payload }); }; } export function createLocalIngestHandler( deps: IngestJobHandlerDeps, -): (msg: JobMessage) => Promise { - return async function handleLocalIngest(msg: JobMessage): Promise { +): (msg: JobMessage) => Promise { + return async function handleLocalIngest(msg: JobMessage): Promise { const payload = narrowLocalIngest(msg.knowledgeId, msg.payload); if (!isEnvelopeCoherent(msg.knowledgeId, payload.knowledgeId)) { throw new IngestError( @@ -33,6 +34,6 @@ export function createLocalIngestHandler( `envelope mismatch: 
job.knowledgeId=${msg.knowledgeId} payload.knowledgeId=${payload.knowledgeId}`, ); } - await deps.runner.run({ job: msg, payload }); + return await deps.runner.run({ job: msg, payload }); }; } diff --git a/packages/ingest-github/src/index.ts b/packages/ingest-github/src/index.ts index 7a6ea16..efd5348 100644 --- a/packages/ingest-github/src/index.ts +++ b/packages/ingest-github/src/index.ts @@ -49,14 +49,27 @@ function buildRunner( export function registerGithubWorkers(deps: RegisterGithubWorkersDeps = {}): void { const progressContextFactory = deps.progressContextFactory ?? nullProgressContextFactory; const runner = buildRunner(deps.sourceFactory, progressContextFactory); - registerWorker(JobType.GithubIndex, createGithubIngestHandler({ runner })); + // `registerWorker` expects `Promise`; the handler now returns + // `Promise` so the enterprise queue bridge can mirror + // per-commit tokens + cost into the knowledge record. The OSS in-process + // worker discards the summary — local stats are read off + // `source.commitHashes[]` via `bytebell stats` instead. 
+ const indexHandler = createGithubIngestHandler({ runner }); + registerWorker(JobType.GithubIndex, async (msg) => { + await indexHandler(msg); + }); const pullFactory = deps.pullFactory; - registerWorker(JobType.GithubPull, (msg) => runPull(msg, pullFactory, progressContextFactory)); + registerWorker(JobType.GithubPull, async (msg) => { + await runPull(msg, pullFactory, progressContextFactory); + }); } export function registerLocalIngestWorker(): void { const runner = buildRunner(undefined, nullProgressContextFactory); - registerWorker(JobType.LocalIngest, createLocalIngestHandler({ runner })); + const localHandler = createLocalIngestHandler({ runner }); + registerWorker(JobType.LocalIngest, async (msg) => { + await localHandler(msg); + }); } export { createFlatFolderStrategy } from "./strategies/flat-folder/index.ts"; diff --git a/packages/ingest-github/src/payload/narrow.ts b/packages/ingest-github/src/payload/narrow.ts index e6a5b36..c7f818b 100644 --- a/packages/ingest-github/src/payload/narrow.ts +++ b/packages/ingest-github/src/payload/narrow.ts @@ -1,6 +1,32 @@ -import type { GithubIndexPayload, LocalIngestPayload } from "@bb/types"; +import type { GithubIndexPayload, LocalIngestPayload, PayloadLlmOverrides } from "@bb/types"; import { IngestError } from "@bb/errors"; +/** + * Copies optional LLM credential / model overrides from a payload record onto + * a typed payload. Enterprise wrappers resolve per-org credentials at the + * enqueue boundary and stamp them on the BullMQ payload; without this passthrough + * the worker would always fall back to global config (and the resolver work is + * wasted). OSS standalone leaves all four unset, so nothing happens here. 
+ */ +function attachLlmOverrides(rec: Record, target: PayloadLlmOverrides): void { + const apiKey = rec["llmApiKey"]; + if (typeof apiKey === "string" && apiKey.length > 0) { + target.llmApiKey = apiKey; + } + const provider = rec["llmProvider"]; + if (typeof provider === "string" && provider.length > 0) { + target.llmProvider = provider; + } + const model = rec["llmModel"]; + if (typeof model === "string" && model.length > 0) { + target.llmModel = model; + } + const keyId = rec["llmKeyId"]; + if (typeof keyId === "string" && keyId.length > 0) { + target.llmKeyId = keyId; + } +} + export function narrowGithubIngest(knowledgeId: string, payload: unknown): GithubIndexPayload { if (typeof payload !== "object" || payload === null) { throw new IngestError(knowledgeId, "github_index payload must be an object"); @@ -31,6 +57,7 @@ export function narrowGithubIngest(knowledgeId: string, payload: unknown): Githu if (typeof orgId === "string" && orgId.length > 0) { out.orgId = orgId; } + attachLlmOverrides(rec, out); return out; } @@ -52,6 +79,7 @@ export function narrowLocalIngest(knowledgeId: string, payload: unknown): LocalI if (typeof orgId === "string" && orgId.length > 0) { out.orgId = orgId; } + attachLlmOverrides(rec, out as PayloadLlmOverrides); return out; } diff --git a/packages/ingest-github/src/pipeline/README.md b/packages/ingest-github/src/pipeline/README.md index 0f4bd3b..0c57d78 100644 --- a/packages/ingest-github/src/pipeline/README.md +++ b/packages/ingest-github/src/pipeline/README.md @@ -64,7 +64,7 @@ true` (default). Consumed by `scan.ts` via the optional `skipDecider` runner emits `failed(message)` only when the strategy has not yet started — once `strategy.execute` is reached, the strategy owns terminal emission and the runner stays silent to avoid double-FAILED. -- `pull.ts` — `runPull(msg, pullFactory?, progressContextFactory?)` orchestrates the pull job. +- `pull.ts` — `runPull(msg, pullFactory?, progressContextFactory?)` orchestrates the pull job. 
Returns `Promise` (was `Promise`); the returned `tokenUsage` carries `inputTokens`, `outputTokens`, and `costUsd` summed across the pull phases for callers (e.g. the enterprise queue bridge) that need to mirror the run into a knowledge record. Reads `repoUrl` and `branch` directly off `knowledge.info.*` (loaded via `@bb/mongo.getKnowledge`). The `KnowledgeSource` discriminator (`kind`) is still read off `knowledge.source` along with `commitId`/`commitHashes`, but @@ -85,12 +85,25 @@ archiveSink?}` and `runPull` skips `syncRepository` + `materialiseEndpoints` the context into every phase that takes a `progressContext?` field, and finishes with `completed()` on success or `failed(message)` on a non-`CancellationError` throw. -- `stats.ts` — shared helpers for handling all ingestion processing statistics, - repository name resolutions, and error-string descriptions: `persistStats` - writes the per-commit row into `processing_stats`, `repoNameFromUrl` parses - an owner/repo display name out of a GitHub URL with a graceful fallback, and - `describe` flattens an `unknown` cause to a short string for `IngestError` - messages. +- `stats.ts` — small shared helpers: `repoNameFromUrl` parses an owner/repo + display name out of a GitHub URL with a graceful fallback, `localRepoName` + derives a name from a local path, and `describe` flattens an `unknown` + cause to a short string for `IngestError` messages. The previous + `persistStats` write into the `processing_stats` collection has been + removed — per-commit token and cost data now lives on the knowledge + document's `source.commitHashes[]` (set by `setKnowledgeCommit` from + `@bb/mongo`), with the per-call `costUsd` sourced directly from + OpenRouter's `response.usage.cost`. +- `failure-classifier.ts` — `classifyFailure(cause)` returns + `{ reason, category, detail? }` for any thrown ingestion error. + `LlmConfigError` → `llm_config`. 
`LlmError` is subdivided by its + `status` field: `401`/`403` → `llm_auth`, `402` → `llm_quota`, `429` → + `llm_rate_limit`, `5xx`/no-status → `llm_unreachable`. Anything else → + `internal`. Each category produces a single short operator-readable + `reason` sentence; the raw provider response body lives in `detail`. + Used by `run.ts`/`pull.ts` catch blocks (Mongo persistence via + `markKnowledgeFailed`) and `strategies/flat-folder/index.ts` (SSE event + via `progressContext.failed`) so both paths share one classification. - `context.ts` — shared helpers to resolve pipeline organization IDs and parse optional LLM context parameter overrides from payload messages: `resolveOrgId(payload)` returns `payload.orgId ?? getConfigValue(Config.OrgId)` @@ -117,8 +130,8 @@ archiveSink?}` and `runPull` skips `syncRepository` + `materialiseEndpoints` - Up: `@bb/config`, `@bb/types`, `@bb/errors`, `@bb/logger`, `node:*`. - `run.ts` and `pull.ts` additionally import `@bb/mongo` and `@bb/neo4j` for state transitions and graph state writes respectively. -- `stats.ts` imports `@bb/mongo` and `@bb/llm` for persisting stats and - estimating cost respectively. +- `stats.ts` has no cross-package imports — it carries only pure helpers + (`repoNameFromUrl`, `localRepoName`, `describe`). - Forbidden: importing from `../strategies`, `../adapters`, `../handlers`. ## Invariants diff --git a/packages/ingest-github/src/pipeline/failure-classifier.ts b/packages/ingest-github/src/pipeline/failure-classifier.ts new file mode 100644 index 0000000..38e9baa --- /dev/null +++ b/packages/ingest-github/src/pipeline/failure-classifier.ts @@ -0,0 +1,77 @@ +import { LlmConfigError, LlmError } from "@bb/errors"; +import type { KnowledgeFailureCategory } from "@bb/types"; +import { describe } from "./stats.ts"; + +export interface ClassifiedFailure { + /** Operator-readable single-sentence summary. UI surfaces this directly. 
*/ + reason: string; + category: KnowledgeFailureCategory; + /** Raw provider response or structured debug payload. Optional. */ + detail?: string; +} + +/** + * Translates a thrown ingestion error into the structured `(reason, category, + * detail)` triple persisted on `KnowledgeDoc.failure` and stamped on the SSE + * FAILED event. + * + * For LLM transport errors, the provider's HTTP status drives the category so + * operators can distinguish "wrong key" (401/403) from "out of credits" (402) + * from "throttled" (429) from "infra down" (5xx). Each path produces a short + * sentence; the raw response body lands in `detail` for the disclosure UI. + */ +export function classifyFailure(cause: unknown): ClassifiedFailure { + if (cause instanceof LlmConfigError) { + return { + category: "llm_config", + reason: "LLM provider is not configured. Set the API key and retry.", + detail: cause.message, + }; + } + if (cause instanceof LlmError) { + return classifyLlmTransport(cause); + } + return { category: "internal", reason: describe(cause) }; +} + +function classifyLlmTransport(cause: LlmError): ClassifiedFailure { + const status = cause.status; + const detail = cause.detail ?? cause.message; + if (status === 401 || status === 403) { + return { + category: "llm_auth", + reason: "LLM provider rejected the API key. Update the key and retry.", + detail, + }; + } + if (status === 402) { + return { + category: "llm_quota", + reason: "LLM provider is out of credits or over its spend limit. Top up and retry.", + detail, + }; + } + if (status === 429) { + return { + category: "llm_rate_limit", + reason: "LLM provider rate-limited the request. Wait and retry.", + detail, + }; + } + if (status !== undefined && status >= 500 && status < 600) { + return { + category: "llm_unreachable", + reason: `LLM provider responded with HTTP ${String(status)}. Provider is temporarily unavailable.`, + detail, + }; + } + // Network/timeout (no status) or any other non-OK status. 
+ return { + category: "llm_unreachable", + reason: + status === undefined + ? "LLM provider is unreachable (network error or timeout)." + : `LLM provider responded with HTTP ${String(status)}.`, + detail, + }; +} diff --git a/packages/ingest-github/src/pipeline/pull.ts b/packages/ingest-github/src/pipeline/pull.ts index 1776b77..930b7be 100644 --- a/packages/ingest-github/src/pipeline/pull.ts +++ b/packages/ingest-github/src/pipeline/pull.ts @@ -1,9 +1,10 @@ import { KnowledgeState, type GithubPullPayload, type JobMessage } from "@bb/types"; -import { getKnowledge, setKnowledgeCommit, setKnowledgeState } from "@bb/mongo"; +import { getKnowledge, markKnowledgeFailed, setKnowledgeCommit, setKnowledgeState } from "@bb/mongo"; import { setKnowledgeStateInGraph, snapshotFilesToVersion, type NodeScope } from "@bb/neo4j"; -import { describe, persistStats, repoNameFromUrl } from "./stats.ts"; +import type { PipelineSummary } from "#src/types/pipeline.ts"; import { resolveOrgId, llmCallContextFromPayload } from "./context.ts"; import { IngestError, KnowledgeNotFoundError } from "@bb/errors"; +import { classifyFailure } from "./failure-classifier.ts"; import { logger } from "@bb/logger"; import { ensureMetaDirs, metaPathsFor, repoCloneDir, ensureReposRoot } from "./paths.ts"; import { readHeadCommitHash, syncRepository } from "./source.ts"; @@ -38,7 +39,7 @@ export async function runPull( msg: JobMessage, pullFactory?: PullFactory, progressContextFactory: ProgressContextFactory = nullProgressContextFactory, -): Promise { +): Promise { const { knowledgeId } = msg.payload; if (msg.payload.targetCommitHash !== undefined && !COMMIT_HASH_RE.test(msg.payload.targetCommitHash)) { throw new IngestError( @@ -70,7 +71,6 @@ export async function runPull( const gitToken = msg.payload.gitToken; clearCancellation(knowledgeId); - const startedAt = Date.now(); await transitionState(knowledgeId, KnowledgeState.Processing); const progressContext = progressContextFactory(knowledgeId); @@ 
-92,7 +92,7 @@ export async function runPull( if (targetCommit === currentCommit) { logger.info(`pull: ${knowledgeId} already at ${targetCommit.slice(0, 12)}; no-op`); await transitionState(knowledgeId, KnowledgeState.Processed); - return; + return emptyPullSummary(targetCommit); } } else { await ensureReposRoot(); @@ -116,7 +116,7 @@ export async function runPull( if (targetCommit === currentCommit) { logger.info(`pull: ${knowledgeId} already at ${targetCommit.slice(0, 12)}; no-op`); await transitionState(knowledgeId, KnowledgeState.Processed); - return; + return emptyPullSummary(targetCommit); } // Deepen the shallow clone first so historical commits selected via the @@ -174,6 +174,7 @@ export async function runPull( const phase1 = await analyseChangedFiles(analyseChangedInput); let totalInputTokens = phase1.tokenUsage.inputTokens; let totalOutputTokens = phase1.tokenUsage.outputTokens; + let totalCostUsd = phase1.tokenUsage.costUsd; logger.info(`pull: phase process big files starting`); throwIfCancelled(knowledgeId); @@ -189,6 +190,7 @@ export async function runPull( const phase2 = await processBigFilesQueue(processBigFilesInput); totalInputTokens += phase2.tokenUsage.inputTokens; totalOutputTokens += phase2.tokenUsage.outputTokens; + totalCostUsd += phase2.tokenUsage.costUsd; logger.info(`pull: phase backfill fields starting`); throwIfCancelled(knowledgeId); @@ -221,6 +223,7 @@ export async function runPull( const phase5 = await runSelectiveFolderSummary(selectiveInput); totalInputTokens += phase5.tokenUsage.inputTokens; totalOutputTokens += phase5.tokenUsage.outputTokens; + totalCostUsd += phase5.tokenUsage.costUsd; progressContext.phaseChanged("indexing"); logger.info(`pull: phase repo summary starting`); @@ -230,6 +233,7 @@ export async function runPull( const { summary: repoSummary, tokenUsage: repoUsage } = await summariseRepo(knowledgeId, metaPaths, llmCallContext); totalInputTokens += repoUsage.inputTokens; totalOutputTokens += repoUsage.outputTokens; + 
totalCostUsd += repoUsage.costUsd; if (repoSummary !== null) { await persistRepoSummary(metaPaths, makeRepoSummaryEnvelope(knowledgeId, orgId, repoSummary)); } @@ -245,30 +249,37 @@ export async function runPull( affectedFolders, }); - const stats = await persistStats({ + await setKnowledgeCommit( knowledgeId, - repoName: repoNameFromUrl(repoUrl), - commitHash: targetCommit, - filesAnalyzed: stored.filesUpserted, - foldersSummarised: stored.foldersUpserted, - processingTimeMs: Date.now() - startedAt, - tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens }, - }); - await setKnowledgeCommit(knowledgeId, targetCommit, String(stats.inputTokens), String(stats.outputTokens)); + targetCommit, + String(totalInputTokens), + String(totalOutputTokens), + String(totalCostUsd), + ); await transitionState(knowledgeId, KnowledgeState.Processed); progressContext.completed("github_pull complete"); logger.info( `pull: ${knowledgeId} ${currentCommit.slice(0, 12)} -> ${targetCommit.slice(0, 12)} done (filesUpserted=${stored.filesUpserted} filesDeleted=${stored.filesDeleted} foldersUpserted=${stored.foldersUpserted})`, ); + return { + filesAnalyzed: stored.filesUpserted, + foldersSummarised: stored.foldersUpserted, + repoSummarised: repoSummary !== null, + graphNodesWritten: stored.filesUpserted + stored.foldersUpserted, + commitHash: targetCommit, + tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens, costUsd: totalCostUsd }, + }; } catch (cause: unknown) { if (cause instanceof CancellationError) { clearCancellation(knowledgeId); logger.info(`pull: cancelled for ${knowledgeId}`); throw cause; } - await transitionState(knowledgeId, KnowledgeState.Failed).catch(() => undefined); - progressContext.failed(describe(cause)); - throw new IngestError(knowledgeId, `github_pull failed: ${describe(cause)}`, cause); + const { category, reason, detail } = classifyFailure(cause); + await markKnowledgeFailed(knowledgeId, reason, category, 
detail).catch(() => undefined); + await setKnowledgeStateInGraph(knowledgeId, KnowledgeState.Failed).catch(() => undefined); + progressContext.failed(reason, undefined, category, detail); + throw new IngestError(knowledgeId, `github_pull failed: ${reason}`, cause); } } @@ -276,3 +287,14 @@ async function transitionState(knowledgeId: string, state: KnowledgeState): Prom await setKnowledgeState(knowledgeId, state); await setKnowledgeStateInGraph(knowledgeId, state).catch(() => undefined); } + +function emptyPullSummary(commitHash: string): PipelineSummary { + return { + filesAnalyzed: 0, + foldersSummarised: 0, + repoSummarised: false, + graphNodesWritten: 0, + commitHash, + tokenUsage: { inputTokens: 0, outputTokens: 0, costUsd: 0 }, + }; +} diff --git a/packages/ingest-github/src/pipeline/run.ts b/packages/ingest-github/src/pipeline/run.ts index 5cf1d15..eca725e 100644 --- a/packages/ingest-github/src/pipeline/run.ts +++ b/packages/ingest-github/src/pipeline/run.ts @@ -1,8 +1,14 @@ -import { KnowledgeState, type GithubIndexPayload, type LocalIngestPayload } from "@bb/types"; -import { setKnowledgeBranch, setKnowledgeCommit, setKnowledgeState } from "@bb/mongo"; +import { + KnowledgeState, + type GithubIndexPayload, + type KnowledgeFailureCategory, + type LocalIngestPayload, +} from "@bb/types"; +import { markKnowledgeFailed, setKnowledgeBranch, setKnowledgeCommit, setKnowledgeState } from "@bb/mongo"; import { setKnowledgeBranchInGraph, setKnowledgeStateInGraph } from "@bb/neo4j"; import { IngestError } from "@bb/errors"; import { logger } from "@bb/logger"; +import { classifyFailure } from "./failure-classifier.ts"; import type { IngestRunnerDeps, IngestRunnerInput } from "#src/types/ingest-runner.ts"; import type { IngestStrategy } from "#src/types/strategy.ts"; import type { ArchiveSink, PipelineSummary, SourceFactory, SourceReader } from "#src/types/pipeline.ts"; @@ -14,7 +20,7 @@ import { resolveBranch } from "./branch.ts"; import { CancellationError, 
clearCancellation, throwIfCancelled } from "./cancellation.ts"; import { createDiskSourceReader } from "./disk-source-reader.ts"; import { resolveOrgId, llmCallContextFromPayload } from "./context.ts"; -import { describe, persistStats, repoNameFromUrl, localRepoName } from "./stats.ts"; +import { localRepoName } from "./stats.ts"; export interface CreatePipelineRunnerDeps { reposRootDir: string; @@ -123,16 +129,13 @@ async function runGithub( strategyStarted = true; const result = await strategy.execute(strategyInput); - const stats = await persistStats({ + await setKnowledgeCommit( knowledgeId, - repoName: repoNameFromUrl(payload.repoUrl), commitHash, - filesAnalyzed: result.filesAnalyzed, - foldersSummarised: result.foldersSummarised, - processingTimeMs: Date.now() - startedAt, - tokenUsage: result.tokenUsage, - }); - await setKnowledgeCommit(knowledgeId, commitHash, String(stats.inputTokens), String(stats.outputTokens)); + String(result.tokenUsage.inputTokens), + String(result.tokenUsage.outputTokens), + String(result.tokenUsage.costUsd), + ); await transitionState(knowledgeId, KnowledgeState.Processed); const totalMs = Date.now() - startedAt; @@ -154,11 +157,12 @@ async function runGithub( logger.info(`pipeline/run: ingestion cancelled for ${knowledgeId}`); throw cause; } - await transitionState(knowledgeId, KnowledgeState.Failed).catch(() => undefined); + const { category, reason, detail } = classifyFailure(cause); + await persistFailure(knowledgeId, category, reason, detail); if (!strategyStarted) { - progressContext.failed(describe(cause)); + progressContext.failed(reason, undefined, category, detail); } - throw new IngestError(knowledgeId, `github_index pipeline failed: ${describe(cause)}`, cause); + throw new IngestError(knowledgeId, `github_index pipeline failed: ${reason}`, cause); } } @@ -183,15 +187,9 @@ async function runLocal(strategy: IngestStrategy, payload: LocalIngestPayload): }); const commitHash = `local-${startedAt}`; - await persistStats({ - 
knowledgeId, - repoName: localRepoName(rootDir), - commitHash, - filesAnalyzed: result.filesAnalyzed, - foldersSummarised: result.foldersSummarised, - processingTimeMs: Date.now() - startedAt, - tokenUsage: result.tokenUsage, - }); + logger.info( + `pipeline/run: ✓ local_ingest complete (knowledgeId=${knowledgeId}, repo=${localRepoName(rootDir)}, files=${result.filesAnalyzed}, in=${result.tokenUsage.inputTokens}, out=${result.tokenUsage.outputTokens}, cost=$${result.tokenUsage.costUsd})`, + ); await transitionState(knowledgeId, KnowledgeState.Processed); return { filesAnalyzed: result.filesAnalyzed, @@ -206,8 +204,9 @@ async function runLocal(strategy: IngestStrategy, payload: LocalIngestPayload): clearCancellation(knowledgeId); throw cause; } - await transitionState(knowledgeId, KnowledgeState.Failed).catch(() => undefined); - throw new IngestError(knowledgeId, `local_ingest pipeline failed: ${describe(cause)}`, cause); + const { category, reason, detail } = classifyFailure(cause); + await persistFailure(knowledgeId, category, reason, detail); + throw new IngestError(knowledgeId, `local_ingest pipeline failed: ${reason}`, cause); } } @@ -216,6 +215,21 @@ async function transitionState(knowledgeId: string, state: KnowledgeState): Prom await setKnowledgeStateInGraph(knowledgeId, state).catch(() => undefined); } +/** + * Persists the FAILED state + structured failure reason to Mongo, then + * mirrors the state into Neo4j on a best-effort basis. Errors from both + * sides are swallowed so the throw path is preserved. 
+ */ +async function persistFailure( + knowledgeId: string, + category: KnowledgeFailureCategory, + reason: string, + detail?: string, +): Promise { + await markKnowledgeFailed(knowledgeId, reason, category, detail).catch(() => undefined); + await setKnowledgeStateInGraph(knowledgeId, KnowledgeState.Failed).catch(() => undefined); +} + function isGithubPayload(payload: GithubIndexPayload | LocalIngestPayload): payload is GithubIndexPayload { return (payload as GithubIndexPayload).repoUrl !== undefined; } diff --git a/packages/ingest-github/src/pipeline/stats.ts b/packages/ingest-github/src/pipeline/stats.ts index 84cf9cb..e7682e1 100644 --- a/packages/ingest-github/src/pipeline/stats.ts +++ b/packages/ingest-github/src/pipeline/stats.ts @@ -1,37 +1,3 @@ -import { recordProcessingStats } from "@bb/mongo"; -import { estimateCostFromBreakdown } from "@bb/llm"; - -export interface PersistStatsInput { - knowledgeId: string; - repoName: string; - commitHash: string; - filesAnalyzed: number; - foldersSummarised: number; - processingTimeMs: number; - tokenUsage: { inputTokens: number; outputTokens: number }; -} - -export async function persistStats(input: PersistStatsInput): Promise<{ inputTokens: number; outputTokens: number }> { - const estimatedCost = await estimateCostFromBreakdown({}); - return await recordProcessingStats({ - knowledgeId: input.knowledgeId, - repoName: input.repoName, - commitHash: input.commitHash, - modelTokens: { - total: { - inputTokens: input.tokenUsage.inputTokens, - outputTokens: input.tokenUsage.outputTokens, - }, - }, - estimatedCost, - totalBatches: 1, - totalFiles: input.filesAnalyzed, - totalFolders: input.foldersSummarised, - filesAnalyzed: input.filesAnalyzed, - processingTimeMs: input.processingTimeMs, - }); -} - export function repoNameFromUrl(repoUrl: string): string { try { const segments = new URL(repoUrl).pathname diff --git a/packages/ingest-github/src/progress/NullProgressReporter.ts 
b/packages/ingest-github/src/progress/NullProgressReporter.ts index fd59574..a35f74f 100644 --- a/packages/ingest-github/src/progress/NullProgressReporter.ts +++ b/packages/ingest-github/src/progress/NullProgressReporter.ts @@ -34,7 +34,7 @@ class NullProgressContext implements ProgressContext { completed(_message?: string): void { /* no-op */ } - failed(_error: string, _phase?: ProgressPhase): void { + failed(_error: string, _phase?: ProgressPhase, _category?: string, _detail?: string): void { /* no-op */ } } diff --git a/packages/ingest-github/src/progress/types.ts b/packages/ingest-github/src/progress/types.ts index 99744eb..5d2f1ab 100644 --- a/packages/ingest-github/src/progress/types.ts +++ b/packages/ingest-github/src/progress/types.ts @@ -40,7 +40,15 @@ export interface ProgressContext { reporter(input: ProgressReporterInput): ProgressReporter; phaseChanged(phase: ProgressPhase): void; completed(message?: string): void; - failed(error: string, phase?: ProgressPhase): void; + /** + * Emit a terminal FAILED event. `error` is a short operator-readable + * sentence (e.g. "OpenRouter is out of credits"). `category` is the + * classification taxonomy (`"llm_config" | "llm_auth" | "llm_quota" | + * "llm_rate_limit" | "llm_unreachable" | "cancelled" | "internal"`). + * `detail` is the optional raw provider response or structured debug + * payload — UIs typically hide it behind a disclosure. 
+ */ + failed(error: string, phase?: ProgressPhase, category?: string, detail?: string): void; } export type ProgressContextFactory = (knowledgeId: string) => ProgressContext; diff --git a/packages/ingest-github/src/strategies/flat-folder/analyse-changed.ts b/packages/ingest-github/src/strategies/flat-folder/analyse-changed.ts index 6b55754..982d0a7 100644 --- a/packages/ingest-github/src/strategies/flat-folder/analyse-changed.ts +++ b/packages/ingest-github/src/strategies/flat-folder/analyse-changed.ts @@ -1,5 +1,6 @@ import path from "node:path"; import { tokenLen, type AskLlmOptions } from "@bb/llm"; +import { LlmConfigError, LlmError } from "@bb/errors"; import { logger } from "@bb/logger"; import { Config } from "@bb/types"; import { getConfigValue } from "@bb/config"; @@ -33,7 +34,7 @@ export interface AnalyseChangedResult { oversizedStubs: number; skipped: number; failed: number; - tokenUsage: { inputTokens: number; outputTokens: number }; + tokenUsage: { inputTokens: number; outputTokens: number; costUsd: number }; } /** @@ -80,6 +81,7 @@ export async function analyseChangedFiles(input: AnalyseChangedInput): Promise[] = []; @@ -196,12 +198,16 @@ export async function analyseChangedFiles(input: AnalyseChangedInput): PromisemxLClSSchXC+k5u3fc-91$rg<*{OM<#fj;u_HYS%dj$hc9WDhRsD;wiV4i|P zX5Qp?Tym5BxCAE0bBRs<$)#cnQHW$mQEG8%P6j$180BGZWR+g8aPVRJ~G=V_|NEDxBQOdzFf&ZRFF5$xKtwNKPy*P6hh8xFj(z pIW<2G6w*i`uTWE?fK3TFGBg!nVOEk+lwS@}XblO}%{TZs!~nrZi#Y%Q delta 62 zcmez3^T2b0<;H?CF3ws_9WDhRsGe-VZ8SN9+j#Q@Zb_!eA9*z=ckydYuHm~1XFuWB K*<2{VECv9!1{J3O diff --git a/packages/ingest-github/src/strategies/flat-folder/big-file/index.ts b/packages/ingest-github/src/strategies/flat-folder/big-file/index.ts index e100b1d..255be0b 100644 --- a/packages/ingest-github/src/strategies/flat-folder/big-file/index.ts +++ b/packages/ingest-github/src/strategies/flat-folder/big-file/index.ts @@ -81,8 +81,10 @@ export async function processBigFile(input: ProcessBigFileInput): Promise acc + 
(r.tokenUsage?.inputTokens ?? 0), 0); const chunkOutputTokens = results.reduce((acc, r) => acc + (r.tokenUsage?.outputTokens ?? 0), 0); + const chunkCostUsd = results.reduce((acc, r) => acc + (r.tokenUsage?.costUsd ?? 0), 0); const totalInputTokens = chunkInputTokens + (merged.tokenUsage?.inputTokens ?? 0); const totalOutputTokens = chunkOutputTokens + (merged.tokenUsage?.outputTokens ?? 0); + const totalCostUsd = chunkCostUsd + (merged.tokenUsage?.costUsd ?? 0); const manifest: HugeFileManifest = { relativePath: input.relativePath, @@ -104,7 +106,7 @@ export async function processBigFile(input: ProcessBigFileInput): Promise[] = []; for (const [folderPath, files] of groups.entries()) { if (!input.affectedFolders.has(folderPath)) { @@ -54,6 +55,7 @@ export async function runSelectiveFolderSummary( const { summary, tokenUsage } = await summariseFolder(folderPath, files, input.llmCallContext); totalInputTokens += tokenUsage.inputTokens; totalOutputTokens += tokenUsage.outputTokens; + totalCostUsd += tokenUsage.costUsd; if (summary !== null) { await persistFolderSummary(input.metaPaths, summary); succeeded += 1; @@ -72,5 +74,10 @@ export async function runSelectiveFolderSummary( } await Promise.all(tasks); logger.info(`pull-folder-summary done: succeeded=${succeeded} failed=${failed} skipped=${skipped}`); - return { succeeded, failed, skipped, tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens } }; + return { + succeeded, + failed, + skipped, + tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens, costUsd: totalCostUsd }, + }; } diff --git a/packages/ingest-github/src/strategies/flat-folder/folder-summary.ts b/packages/ingest-github/src/strategies/flat-folder/folder-summary.ts index 10b895c..4fa175b 100644 --- a/packages/ingest-github/src/strategies/flat-folder/folder-summary.ts +++ b/packages/ingest-github/src/strategies/flat-folder/folder-summary.ts @@ -1,6 +1,7 @@ import { readFile, readdir, writeFile } from 
"node:fs/promises"; import path from "node:path"; import { askJsonLLM, type AskLlmOptions } from "@bb/llm"; +import { LlmConfigError, LlmError } from "@bb/errors"; import { logger } from "@bb/logger"; import { Config } from "@bb/types"; import { getConfigValue } from "@bb/config"; @@ -41,7 +42,10 @@ export async function summariseFolder( folderPath: string, files: CondensedFileAnalysis[], llmCallContext?: AskLlmOptions, -): Promise<{ summary: FolderSummary | null; tokenUsage: { inputTokens: number; outputTokens: number } }> { +): Promise<{ + summary: FolderSummary | null; + tokenUsage: { inputTokens: number; outputTokens: number; costUsd: number }; +}> { const userPrompt = folderAnalysisUserPrompt(folderPath, files); try { const response = await askJsonLLM( @@ -53,17 +57,28 @@ export async function summariseFolder( logger.warn(`summariseFolder: ${folderPath || ""} returned unparseable JSON`); return { summary: null, - tokenUsage: { inputTokens: response.usage.inputTokens, outputTokens: response.usage.outputTokens }, + tokenUsage: { + inputTokens: response.usage.inputTokens, + outputTokens: response.usage.outputTokens, + costUsd: response.usage.costUsd, + }, }; } return { summary: shapeFolderSummary(folderPath, response.result), - tokenUsage: { inputTokens: response.usage.inputTokens, outputTokens: response.usage.outputTokens }, + tokenUsage: { + inputTokens: response.usage.inputTokens, + outputTokens: response.usage.outputTokens, + costUsd: response.usage.costUsd, + }, }; } catch (cause: unknown) { + if (cause instanceof LlmConfigError || cause instanceof LlmError) { + throw cause; + } const msg = cause instanceof Error ? 
cause.message : String(cause); logger.warn(`summariseFolder: ${folderPath || ""} askJsonLLM failed: ${msg}`); - return { summary: null, tokenUsage: { inputTokens: 0, outputTokens: 0 } }; + return { summary: null, tokenUsage: { inputTokens: 0, outputTokens: 0, costUsd: 0 } }; } } @@ -100,7 +115,11 @@ export async function runFolderSummaryPhase( metaPaths: MetaPaths, llmCallContext?: AskLlmOptions, progressContext?: ProgressContext, -): Promise<{ succeeded: number; failed: number; tokenUsage: { inputTokens: number; outputTokens: number } }> { +): Promise<{ + succeeded: number; + failed: number; + tokenUsage: { inputTokens: number; outputTokens: number; costUsd: number }; +}> { const concurrentWorkers = getConfigValue(Config.ConcurrentWorkers); const limit = withConcurrency(concurrentWorkers); const groups = await groupByDirectFolder(metaPaths); @@ -108,6 +127,7 @@ export async function runFolderSummaryPhase( let failed = 0; let totalInputTokens = 0; let totalOutputTokens = 0; + let totalCostUsd = 0; const reporter = progressContext?.reporter({ phase: "folder_analysis", total: { kind: "fixed", total: groups.size }, @@ -123,6 +143,7 @@ export async function runFolderSummaryPhase( const { summary, tokenUsage } = await summariseFolder(folderPath, files, llmCallContext); totalInputTokens += tokenUsage.inputTokens; totalOutputTokens += tokenUsage.outputTokens; + totalCostUsd += tokenUsage.costUsd; if (summary !== null) { await persistFolderSummary(metaPaths, summary); succeeded += 1; @@ -146,7 +167,11 @@ export async function runFolderSummaryPhase( reporter?.stop(); } logger.info(`phase5 done: foldersSummarised=${succeeded} failed=${failed}`); - return { succeeded, failed, tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens } }; + return { + succeeded, + failed, + tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens, costUsd: totalCostUsd }, + }; } function shapeFolderSummary(folderPath: string, raw: FolderSummaryJson): 
FolderSummary { diff --git a/packages/ingest-github/src/strategies/flat-folder/index.ts b/packages/ingest-github/src/strategies/flat-folder/index.ts index 72f2640..09c03c6 100644 --- a/packages/ingest-github/src/strategies/flat-folder/index.ts +++ b/packages/ingest-github/src/strategies/flat-folder/index.ts @@ -2,6 +2,7 @@ import { logger } from "@bb/logger"; import type { FileAnalyzer } from "#src/types/pipeline.ts"; import type { IngestStrategy, StrategyInput, StrategyResult } from "#src/types/strategy.ts"; import { throwIfCancelled } from "#src/pipeline/cancellation.ts"; +import { classifyFailure } from "#src/pipeline/failure-classifier.ts"; import { classifyAndAnalyseSmall } from "./phases/classify-and-analyse-small.ts"; import { processBigFilesQueue } from "./phases/process-big-files.ts"; import { backfillMissingFields } from "./backfill/fields.ts"; @@ -47,6 +48,7 @@ export function createFlatFolderStrategy(deps: FlatFolderStrategyDeps): IngestSt const phase1 = await classifyAndAnalyseSmall(phase1Input); let totalInputTokens = phase1.tokenUsage.inputTokens; let totalOutputTokens = phase1.tokenUsage.outputTokens; + let totalCostUsd = phase1.tokenUsage.costUsd; logger.info(`flat-folder: phase2 (process big files) starting`); throwIfCancelled(knowledgeId); @@ -62,6 +64,7 @@ export function createFlatFolderStrategy(deps: FlatFolderStrategyDeps): IngestSt const phase2 = await processBigFilesQueue(phase2Input); totalInputTokens += phase2.tokenUsage.inputTokens; totalOutputTokens += phase2.tokenUsage.outputTokens; + totalCostUsd += phase2.tokenUsage.costUsd; logger.info(`flat-folder: phase3 (backfill missing fields) starting`); throwIfCancelled(knowledgeId); @@ -86,6 +89,7 @@ export function createFlatFolderStrategy(deps: FlatFolderStrategyDeps): IngestSt const phase5 = await runFolderSummaryPhase(knowledgeId, metaPaths, llmCallContext, progressContext); totalInputTokens += phase5.tokenUsage.inputTokens; totalOutputTokens += phase5.tokenUsage.outputTokens; + 
totalCostUsd += phase5.tokenUsage.costUsd; progressContext.phaseChanged("indexing"); logger.info(`flat-folder: phase6 (repo summary) starting`); @@ -97,6 +101,7 @@ export function createFlatFolderStrategy(deps: FlatFolderStrategyDeps): IngestSt ); totalInputTokens += repoUsage.inputTokens; totalOutputTokens += repoUsage.outputTokens; + totalCostUsd += repoUsage.costUsd; let repoSummarised = false; if (repoSummary !== null) { await persistRepoSummary(metaPaths, makeRepoSummaryEnvelope(knowledgeId, orgId, repoSummary)); @@ -120,11 +125,11 @@ export function createFlatFolderStrategy(deps: FlatFolderStrategyDeps): IngestSt foldersSummarised: phase5.succeeded, repoSummarised, graphNodesWritten: phase7.nodesWritten, - tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens }, + tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens, costUsd: totalCostUsd }, }; } catch (cause: unknown) { - const message = cause instanceof Error ? cause.message : String(cause); - progressContext.failed(message); + const { category, reason, detail } = classifyFailure(cause); + progressContext.failed(reason, undefined, category, detail); throw cause; } }, diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/classify-and-analyse-small.ts b/packages/ingest-github/src/strategies/flat-folder/phases/classify-and-analyse-small.ts index a3922fd..a9ad59a 100644 --- a/packages/ingest-github/src/strategies/flat-folder/phases/classify-and-analyse-small.ts +++ b/packages/ingest-github/src/strategies/flat-folder/phases/classify-and-analyse-small.ts @@ -1,5 +1,6 @@ import path from "node:path"; import { tokenLen, type AskLlmOptions } from "@bb/llm"; +import { LlmConfigError, LlmError } from "@bb/errors"; import { logger } from "@bb/logger"; import { Config } from "@bb/types"; import { getConfigValue } from "@bb/config"; @@ -30,7 +31,7 @@ export interface ClassifyPhaseResult { bigFilesQueued: number; oversizedStubs: number; failed: number; - 
tokenUsage: { inputTokens: number; outputTokens: number }; + tokenUsage: { inputTokens: number; outputTokens: number; costUsd: number }; } export async function classifyAndAnalyseSmall(input: ClassifyPhaseInput): Promise { @@ -43,6 +44,7 @@ export async function classifyAndAnalyseSmall(input: ClassifyPhaseInput): Promis let failed = 0; let totalInputTokens = 0; let totalOutputTokens = 0; + let totalCostUsd = 0; const repositoryHint = input.source.localRepoDir.length > 0 ? path.basename(input.source.localRepoDir) : input.knowledgeId; @@ -114,6 +116,7 @@ export async function classifyAndAnalyseSmall(input: ClassifyPhaseInput): Promis if (condensed.tokenUsage) { totalInputTokens += condensed.tokenUsage.inputTokens; totalOutputTokens += condensed.tokenUsage.outputTokens; + totalCostUsd += condensed.tokenUsage.costUsd; } smallFilesAnalysed += 1; reporter?.increment(1, { fileName: filePath }); @@ -121,6 +124,11 @@ export async function classifyAndAnalyseSmall(input: ClassifyPhaseInput): Promis if (cause instanceof CancellationError) { throw cause; } + if (cause instanceof LlmConfigError || cause instanceof LlmError) { + // LLM unreachable — bail the whole job, don't keep iterating + // over the rest of the files producing the same failure. 
+ throw cause; + } failed += 1; logger.warn(`phase1: analyse failed for ${entry.relativePath}: ${describe(cause)}`); reporter?.increment(1, { fileName: filePath }); @@ -144,7 +152,7 @@ export async function classifyAndAnalyseSmall(input: ClassifyPhaseInput): Promis bigFilesQueued: bigFileBuffer.filter((e) => e.reason === "context-window-exceeded").length, oversizedStubs, failed, - tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens }, + tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens, costUsd: totalCostUsd }, }; } diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts b/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts index c0563b2..1197753 100644 --- a/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts +++ b/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts @@ -1,5 +1,6 @@ import { logger } from "@bb/logger"; import type { AskLlmOptions } from "@bb/llm"; +import { LlmConfigError, LlmError } from "@bb/errors"; import type { MetaPaths } from "#src/types/meta-paths.ts"; import type { SourceReader } from "#src/types/pipeline.ts"; import type { ProgressContext } from "#src/progress/types.ts"; @@ -21,7 +22,7 @@ export interface ProcessBigFilesResult { cached: number; failed: number; skippedOversized: number; - tokenUsage: { inputTokens: number; outputTokens: number }; + tokenUsage: { inputTokens: number; outputTokens: number; costUsd: number }; } export async function processBigFilesQueue(input: ProcessBigFilesInput): Promise { @@ -32,6 +33,7 @@ export async function processBigFilesQueue(input: ProcessBigFilesInput): Promise let skippedOversized = 0; let totalInputTokens = 0; let totalOutputTokens = 0; + let totalCostUsd = 0; const reporter = input.progressContext?.reporter({ phase: "file_analysis", @@ -83,11 +85,15 @@ export async function processBigFilesQueue(input: ProcessBigFilesInput): Promise 
if (condensed.tokenUsage) { totalInputTokens += condensed.tokenUsage.inputTokens; totalOutputTokens += condensed.tokenUsage.outputTokens; + totalCostUsd += condensed.tokenUsage.costUsd; } } catch (cause: unknown) { if (cause instanceof CancellationError) { throw cause; } + if (cause instanceof LlmConfigError || cause instanceof LlmError) { + throw cause; + } failed += 1; logger.warn(`phase2: processBigFile failed for ${entry.relativePath}: ${describe(cause)}`); } @@ -101,7 +107,7 @@ export async function processBigFilesQueue(input: ProcessBigFilesInput): Promise cached, failed, skippedOversized, - tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens }, + tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens, costUsd: totalCostUsd }, }; } finally { reporter?.stop(); diff --git a/packages/ingest-github/src/strategies/flat-folder/repo-summary.ts b/packages/ingest-github/src/strategies/flat-folder/repo-summary.ts index 0eaec5c..13d54fb 100644 --- a/packages/ingest-github/src/strategies/flat-folder/repo-summary.ts +++ b/packages/ingest-github/src/strategies/flat-folder/repo-summary.ts @@ -1,5 +1,6 @@ import { writeFile } from "node:fs/promises"; import { askJsonLLM, tokenLen, type AskLlmOptions } from "@bb/llm"; +import { LlmConfigError, LlmError } from "@bb/errors"; import { logger } from "@bb/logger"; import { Config } from "@bb/types"; import { getConfigValue } from "@bb/config"; @@ -29,16 +30,20 @@ export async function summariseRepo( knowledgeId: string, metaPaths: MetaPaths, llmCallContext?: AskLlmOptions, -): Promise<{ summary: RepoSummary | null; tokenUsage: { inputTokens: number; outputTokens: number } }> { +): Promise<{ + summary: RepoSummary | null; + tokenUsage: { inputTokens: number; outputTokens: number; costUsd: number }; +}> { const folders: FolderSummary[] = []; for await (const f of iterateFolderSummaries(metaPaths)) { folders.push(f); } let totalInputTokens = 0; let totalOutputTokens = 0; + let 
totalCostUsd = 0; if (folders.length === 0) { logger.warn(`phase6: no folder summaries on disk; skipping repo summary`); - return { summary: null, tokenUsage: { inputTokens: 0, outputTokens: 0 } }; + return { summary: null, tokenUsage: { inputTokens: 0, outputTokens: 0, costUsd: 0 } }; } folders.sort((a, b) => a.folderPath.split("/").length - b.folderPath.split("/").length); const infos = repoFolderInfosFrom(folders); @@ -59,17 +64,21 @@ export async function summariseRepo( const { summary: partial, tokenUsage } = await callRepoSummary(buildRepoPromptFromFolders(batch), llmCallContext); totalInputTokens += tokenUsage.inputTokens; totalOutputTokens += tokenUsage.outputTokens; + totalCostUsd += tokenUsage.costUsd; if (partial !== null) { partials.push(JSON.stringify(partial)); } } if (partials.length === 0) { - return { summary: null, tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens } }; + return { + summary: null, + tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens, costUsd: totalCostUsd }, + }; } if (partials.length === 1) { return { summary: JSON.parse(partials[0] ?? 
"null") as RepoSummary | null, - tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens }, + tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens, costUsd: totalCostUsd }, }; } throwIfCancelled(knowledgeId); @@ -82,6 +91,7 @@ export async function summariseRepo( tokenUsage: { inputTokens: totalInputTokens + finalUsage.inputTokens, outputTokens: totalOutputTokens + finalUsage.outputTokens, + costUsd: totalCostUsd + finalUsage.costUsd, }, }; } @@ -89,23 +99,37 @@ export async function summariseRepo( async function callRepoSummary( userPrompt: string, llmCallContext?: AskLlmOptions, -): Promise<{ summary: RepoSummary | null; tokenUsage: { inputTokens: number; outputTokens: number } }> { +): Promise<{ + summary: RepoSummary | null; + tokenUsage: { inputTokens: number; outputTokens: number; costUsd: number }; +}> { try { const response = await askJsonLLM(REPO_SUMMARY_SYSTEM_PROMPT, userPrompt, llmCallContext ?? {}); if (response.result === null) { return { summary: null, - tokenUsage: { inputTokens: response.usage.inputTokens, outputTokens: response.usage.outputTokens }, + tokenUsage: { + inputTokens: response.usage.inputTokens, + outputTokens: response.usage.outputTokens, + costUsd: response.usage.costUsd, + }, }; } return { summary: shapeRepoSummary(response.result), - tokenUsage: { inputTokens: response.usage.inputTokens, outputTokens: response.usage.outputTokens }, + tokenUsage: { + inputTokens: response.usage.inputTokens, + outputTokens: response.usage.outputTokens, + costUsd: response.usage.costUsd, + }, }; } catch (cause: unknown) { + if (cause instanceof LlmConfigError || cause instanceof LlmError) { + throw cause; + } const msg = cause instanceof Error ? 
cause.message : String(cause); logger.warn(`callRepoSummary: askJsonLLM failed: ${msg}`); - return { summary: null, tokenUsage: { inputTokens: 0, outputTokens: 0 } }; + return { summary: null, tokenUsage: { inputTokens: 0, outputTokens: 0, costUsd: 0 } }; } } diff --git a/packages/ingest-github/src/types/big-file.ts b/packages/ingest-github/src/types/big-file.ts index d3b28e8..4d73838 100644 --- a/packages/ingest-github/src/types/big-file.ts +++ b/packages/ingest-github/src/types/big-file.ts @@ -27,7 +27,7 @@ export interface ChunkAnalysisResult { endLine: number; language: string; analysis: FileAnalysis; - tokenUsage?: { inputTokens: number; outputTokens: number } | undefined; + tokenUsage?: { inputTokens: number; outputTokens: number; costUsd: number } | undefined; } export interface HugeFileManifest { diff --git a/packages/ingest-github/src/types/condensed-file-analysis.ts b/packages/ingest-github/src/types/condensed-file-analysis.ts index c19d9f6..eeee56d 100644 --- a/packages/ingest-github/src/types/condensed-file-analysis.ts +++ b/packages/ingest-github/src/types/condensed-file-analysis.ts @@ -11,5 +11,5 @@ export interface CondensedFileAnalysis { totalTokenCount: number; analysedAt: string; analysis: FileAnalysis; - tokenUsage?: { inputTokens: number; outputTokens: number } | undefined; + tokenUsage?: { inputTokens: number; outputTokens: number; costUsd: number } | undefined; } diff --git a/packages/ingest-github/src/types/pipeline.ts b/packages/ingest-github/src/types/pipeline.ts index 382cd16..9f5c0be 100644 --- a/packages/ingest-github/src/types/pipeline.ts +++ b/packages/ingest-github/src/types/pipeline.ts @@ -23,7 +23,7 @@ export type ScanEntry = ScannedFile | OversizedFile; export interface AnalyzedFileResult { language: string; analysis: FileAnalysis; - tokenUsage?: { inputTokens: number; outputTokens: number } | undefined; + tokenUsage?: { inputTokens: number; outputTokens: number; costUsd: number } | undefined; } export interface FileAnalyzer { @@ 
-45,7 +45,7 @@ export interface PipelineSummary { repoSummarised: boolean; graphNodesWritten: number; commitHash: string; - tokenUsage: { inputTokens: number; outputTokens: number }; + tokenUsage: { inputTokens: number; outputTokens: number; costUsd: number }; } export interface PipelineDeps { diff --git a/packages/ingest-github/src/types/strategy.ts b/packages/ingest-github/src/types/strategy.ts index 32e4b11..a9e6bee 100644 --- a/packages/ingest-github/src/types/strategy.ts +++ b/packages/ingest-github/src/types/strategy.ts @@ -30,7 +30,7 @@ export interface StrategyResult { foldersSummarised: number; repoSummarised: boolean; graphNodesWritten: number; - tokenUsage: { inputTokens: number; outputTokens: number }; + tokenUsage: { inputTokens: number; outputTokens: number; costUsd: number }; } export interface IngestStrategy { diff --git a/packages/llm/README.md b/packages/llm/README.md index dbddd0b..5d659d0 100644 --- a/packages/llm/README.md +++ b/packages/llm/README.md @@ -18,30 +18,29 @@ selected by `Config.LlmProvider` (`"openrouter"` default, or - `askLLM(prompt, opts?)` — dispatches to either `src/openrouter.ts` or `src/ollama.ts` depending on `Config.LlmProvider`. Returns - `{ content, usage: { model, inputTokens, outputTokens } }`. Caller - never sees the provider; the result shape is identical across - backends. + `{ content, usage: { model, inputTokens, outputTokens, costUsd } }`. + Caller never sees the provider; the result shape is identical across + backends. `costUsd` is the provider-reported USD cost for that single + call — taken straight from the provider's response, never computed + client-side. - **OpenRouter mode** — POST to OpenRouter's chat-completions endpoint using `Config.OpenrouterApiKey` + `Config.OpenrouterModel` as the primary model, plus `Config.OpenrouterFallbackModel1..4` as the fallback chain. 
The request body includes a `models: [...]` array - when the deduplicated chain has ≥2 non-empty entries; OpenRouter - routes among them and bills only the responder. `usage.model` is the - actual model the gateway picked. Tokens come straight from - OpenRouter's `usage.prompt_tokens` / `usage.completion_tokens`. + when the deduplicated chain has ≥2 non-empty entries and always sends + `usage: { include: true }` so OpenRouter populates `usage.cost` in + the response. `usage.model` is the actual model the gateway picked. + Tokens come straight from OpenRouter's `usage.prompt_tokens` / + `usage.completion_tokens`; `costUsd` from `usage.cost` (defaults to + `0` when the provider omits it — common for `:free` models). - **Ollama mode** — POST to `${Config.OllamaUrl}/api/chat` with `{ model: Config.OllamaModel, messages, stream: false }`. Single model per request — no fallback chain (Ollama does not have a multi-model fan-out). The model string is free-form: any model the user has pulled into their Ollama daemon works (`llama3.1`, `qwen2.5-coder:7b`, custom Modelfile names — we do not validate). - `inputTokens` ← `prompt_eval_count`, `outputTokens` ← `eval_count`. - Cost is reported as `$0` (see `estimateCostUsd` short-circuit). -- `estimateCostUsd(model, inputTokens, outputTokens)` and - `estimateCostFromBreakdown(modelTokens)` — async cost helpers backed - by a one-shot fetch of OpenRouter's `/api/v1/models` (cached in module - scope for the process lifetime). Returns `-1` when the model has no - published pricing. + `inputTokens` ← `prompt_eval_count`, `outputTokens` ← `eval_count`, + `costUsd` ← `0` (Ollama is keyless / local). 
- AbortController-based timeout (default 90s, matches the kube-package reference `askLLM` shape) - Typed errors via `@bb/errors`: `LlmConfigError` (missing key) and @@ -56,8 +55,6 @@ selected by `Config.LlmProvider` (`"openrouter"` default, or ```ts function askLLM(prompt: string, opts?: AskLlmOptions): Promise; -function estimateCostUsd(model: string, inputTokens: number, outputTokens: number): Promise; -function estimateCostFromBreakdown(modelTokens: ModelTokenBreakdown): Promise; function tokenLen(text: string): number; function encodeTokens(text: string): number[]; function decodeTokens(tokens: number[]): string; @@ -73,10 +70,14 @@ interface AskLlmOptions { } interface AskLlmResult { content: string; - usage: { model: string; inputTokens: number; outputTokens: number }; + usage: { model: string; inputTokens: number; outputTokens: number; costUsd: number }; } ``` +Local-pricing helpers (`estimateCostUsd`, `estimateCostFromBreakdown`) +have been removed — cost is now sourced directly from +`response.usage.cost` returned by OpenRouter. + The package has no module-scoped HTTP client. Each `askLLM` call constructs its own `fetch` request. 
diff --git a/packages/llm/src/cache.ts b/packages/llm/src/cache.ts index a6fea64..c1e13a1 100644 --- a/packages/llm/src/cache.ts +++ b/packages/llm/src/cache.ts @@ -57,6 +57,10 @@ export async function getCachedDecision(key: string): Promise( let totalInputTokens = 0; let totalOutputTokens = 0; + let totalCostUsd = 0; let lastModel = ""; let lastRaw = ""; @@ -82,13 +83,19 @@ export async function askJsonLLM( const { content, usage } = await askLLM(userPrompt, baseOpts); totalInputTokens += usage.inputTokens; totalOutputTokens += usage.outputTokens; + totalCostUsd += usage.costUsd; lastModel = usage.model; lastRaw = content; const parsed = tryParseJson(content); if (parsed !== null) { return { result: parsed, - usage: { model: usage.model, inputTokens: totalInputTokens, outputTokens: totalOutputTokens }, + usage: { + model: usage.model, + inputTokens: totalInputTokens, + outputTokens: totalOutputTokens, + costUsd: totalCostUsd, + }, raw: content, }; } @@ -96,7 +103,7 @@ export async function askJsonLLM( return { result: null, - usage: { model: lastModel, inputTokens: totalInputTokens, outputTokens: totalOutputTokens }, + usage: { model: lastModel, inputTokens: totalInputTokens, outputTokens: totalOutputTokens, costUsd: totalCostUsd }, raw: lastRaw, }; } diff --git a/packages/llm/src/ollama.ts b/packages/llm/src/ollama.ts index ea9fecd..096b7ac 100644 --- a/packages/llm/src/ollama.ts +++ b/packages/llm/src/ollama.ts @@ -90,6 +90,7 @@ export async function callOllama(prompt: string, opts: AskLlmOptions, timeoutMs: ? json.prompt_eval_count : tokenLen((opts.systemPrompt ?? "") + prompt), outputTokens: typeof json.eval_count === "number" ? 
json.eval_count : tokenLen(content), + costUsd: 0, }, }; } diff --git a/packages/llm/src/openrouter.ts b/packages/llm/src/openrouter.ts index fb17150..53b48b4 100644 --- a/packages/llm/src/openrouter.ts +++ b/packages/llm/src/openrouter.ts @@ -11,10 +11,20 @@ interface OpenRouterMessage { content: string; } +interface OpenRouterUsageAccounting { + /** + * Opt-in flag that asks OpenRouter to populate `usage.cost` in the + * response with the authoritative billed cost (in USD credits). Without + * this, OpenRouter omits the cost field. + */ + include: true; +} + interface OpenRouterRequest { model: string; models?: string[]; messages: OpenRouterMessage[]; + usage: OpenRouterUsageAccounting; } interface OpenRouterResponse { @@ -23,6 +33,7 @@ interface OpenRouterResponse { usage?: { prompt_tokens?: number; completion_tokens?: number; + cost?: number; }; } @@ -55,8 +66,11 @@ export async function callOpenRouter(prompt: string, opts: AskLlmOptions, timeou } messages.push({ role: "user", content: prompt }); + const usageAccounting: OpenRouterUsageAccounting = { include: true }; const body: OpenRouterRequest = - cappedChain.length > 1 ? { model, models: cappedChain, messages } : { model, messages }; + cappedChain.length > 1 + ? 
{ model, models: cappedChain, messages, usage: usageAccounting } + : { model, messages, usage: usageAccounting }; const controller = new AbortController(); const timer = setTimeout(() => controller.abort(), timeoutMs); @@ -82,7 +96,10 @@ export async function callOpenRouter(prompt: string, opts: AskLlmOptions, timeou if (!response.ok) { const text = await response.text().catch(() => ""); - throw new LlmError(`OpenRouter HTTP ${response.status}: ${text.slice(0, 500)}`); + throw new LlmError(`OpenRouter HTTP ${response.status}`, undefined, { + status: response.status, + detail: text.slice(0, 4000), + }); } const json = (await response.json()) as OpenRouterResponse; @@ -100,6 +117,7 @@ export async function callOpenRouter(prompt: string, opts: AskLlmOptions, timeou : tokenLen((opts.systemPrompt ?? "") + prompt), outputTokens: typeof json.usage?.completion_tokens === "number" ? json.usage.completion_tokens : tokenLen(content), + costUsd: typeof json.usage?.cost === "number" ? json.usage.cost : 0, }, }; } diff --git a/packages/llm/src/pricing.ts b/packages/llm/src/pricing.ts deleted file mode 100644 index 78da4d7..0000000 --- a/packages/llm/src/pricing.ts +++ /dev/null @@ -1,137 +0,0 @@ -import { getConfigValue } from "@bb/config"; -import { Config, type ModelTokenBreakdown } from "@bb/types"; - -const OPENROUTER_MODELS_URL = "https://openrouter.ai/api/v1/models"; -const PRICING_TIMEOUT_MS = 8_000; -const COST_UNKNOWN = -1; - -interface OpenRouterPricing { - prompt?: string; - completion?: string; -} - -interface OpenRouterModel { - id?: string; - pricing?: OpenRouterPricing; -} - -interface OpenRouterModelsResponse { - data?: OpenRouterModel[]; -} - -interface ModelPrice { - inputUsdPerToken: number; - outputUsdPerToken: number; -} - -let pricingCache: Map | null = null; -let pricingPromise: Promise> | null = null; - -async function fetchPricing(): Promise> { - const map = new Map(); - let response: Response; - try { - response = await fetch(OPENROUTER_MODELS_URL, { - 
signal: AbortSignal.timeout(PRICING_TIMEOUT_MS), - }); - } catch { - return map; - } - if (!response.ok) { - return map; - } - const json = (await response.json().catch(() => null)) as OpenRouterModelsResponse | null; - if (json === null || !Array.isArray(json.data)) { - return map; - } - for (const entry of json.data) { - if (typeof entry.id !== "string" || entry.id.length === 0) { - continue; - } - const promptStr = entry.pricing?.prompt; - const completionStr = entry.pricing?.completion; - const inputPrice = typeof promptStr === "string" ? Number.parseFloat(promptStr) : Number.NaN; - const outputPrice = typeof completionStr === "string" ? Number.parseFloat(completionStr) : Number.NaN; - if (!Number.isFinite(inputPrice) || !Number.isFinite(outputPrice)) { - continue; - } - map.set(entry.id, { inputUsdPerToken: inputPrice, outputUsdPerToken: outputPrice }); - } - return map; -} - -async function getPricing(): Promise> { - if (pricingCache !== null) { - return pricingCache; - } - if (pricingPromise === null) { - pricingPromise = fetchPricing().then((map) => { - pricingCache = map; - return map; - }); - } - return pricingPromise; -} - -function resolvePrice(prices: Map, model: string): ModelPrice | undefined { - const direct = prices.get(model); - if (direct !== undefined) { - return direct; - } - for (const [id, price] of prices.entries()) { - if (id.endsWith(`/${model}`) || model.endsWith(`/${id}`)) { - return price; - } - } - return undefined; -} - -function isOllamaProvider(): boolean { - try { - return getConfigValue(Config.LlmProvider) === "ollama"; - } catch { - return false; - } -} - -export async function estimateCostUsd(model: string, inputTokens: number, outputTokens: number): Promise { - if (isOllamaProvider()) { - return 0; - } - const prices = await getPricing(); - if (prices.size === 0) { - return COST_UNKNOWN; - } - const price = resolvePrice(prices, model); - if (price === undefined) { - return COST_UNKNOWN; - } - const cost = inputTokens * 
price.inputUsdPerToken + outputTokens * price.outputUsdPerToken; - return Math.round(cost * 1_000_000) / 1_000_000; -} - -export async function estimateCostFromBreakdown(modelTokens: ModelTokenBreakdown): Promise { - const entries = Object.entries(modelTokens); - if (entries.length === 0) { - return 0; - } - let total = 0; - let anyKnown = false; - for (const [model, usage] of entries) { - const cost = await estimateCostUsd(model, usage.inputTokens, usage.outputTokens); - if (cost === COST_UNKNOWN) { - continue; - } - anyKnown = true; - total += cost; - } - if (!anyKnown) { - return COST_UNKNOWN; - } - return Math.round(total * 1_000_000) / 1_000_000; -} - -export function _resetPricingForTests(): void { - pricingCache = null; - pricingPromise = null; -} diff --git a/packages/mongo/README.md b/packages/mongo/README.md index 73d1b72..028a513 100644 --- a/packages/mongo/README.md +++ b/packages/mongo/README.md @@ -22,22 +22,27 @@ The package owns: - `upsertKnowledge` / `listKnowledge` — knowledge-doc upsert and list (with file count joined from `raw`). Used by the github / local index routes and by `@bb/cli`'s `ls` and `delete` flows. - - `deleteKnowledge` — hard delete: removes the `knowledge` doc, every - `raw` row tagged with that `knowledgeId`, and every - `processing_stats` commit row tagged with that `knowledgeId`. - Called by the server's `DELETE /api/v1/repos/:knowledgeId` route. + - `deleteKnowledge` — hard delete: removes the `knowledge` doc and + every `raw` row tagged with that `knowledgeId`. Called by the + server's `DELETE /api/v1/repos/:knowledgeId` route. - `upsertRawFile` — per-file Raw doc writer (compound key `{ knowledgeId, relativePath }`). Called by `@bb/ingest-github`'s worker for every scanned file. - - `recordProcessingStats` — upsert one `processing_stats` row keyed - on `{ knowledgeId, commitHash }`. Called by `@bb/ingest-github`'s - worker once per ingest run with the per-model token totals, - estimated cost, and processing time. 
- - `aggregateStats` — read every `knowledge` + `processing_stats` doc - and assemble the kube-shaped `StatsResponse` (totals, repos, - commitStats). Called by the server's `GET /api/v1/stats` route. + - `setKnowledgeCommit(knowledgeId, commitHash, inputTokens, outputTokens, costUsd)` + — appends `{ hash, inputTokens, outputTokens, costUsd }` to + `source.commitHashes[]` and sets `source.commitId`. `costUsd` is the + OpenRouter-reported USD cost (`response.usage.cost`) summed across + the pipeline phases for this commit — never computed client-side. + - `aggregateStats` — read every `knowledge` doc and assemble the + kube-shaped `StatsResponse` (totals, repos, commitStats) by summing + `source.commitHashes[].{inputTokens,outputTokens,costUsd}`. Called by + the server's `GET /api/v1/stats` route. The deprecated + `processing_stats` collection is no longer queried. - A central registry of collection name strings (`Collections` enum): - `knowledge`, `raw`, `processing_stats`. + `knowledge`, `raw`, `mcp_usage`, `mcp_activity`. The + `processing_stats` collection has been removed — per-commit token + + cost data lives on the knowledge document's `source.commitHashes[]` + instead. The package does **not** own: diff --git a/packages/mongo/src/README.md b/packages/mongo/src/README.md index f89f382..6ae0c4e 100644 --- a/packages/mongo/src/README.md +++ b/packages/mongo/src/README.md @@ -7,9 +7,9 @@ package-level contract; this file documents how the source tree is split. - **[index.ts](index.ts)** — public re-exports. The only entry point other packages may import. Exposes `connectMongo`, `closeMongo`, `pingMongo`, - `setKnowledgeState`, `upsertRawFile`, and the `PingResult` / - `FileAnalysis` / `RawFileDoc` types. Anything not re-exported here is - internal. + `setKnowledgeState`, `markKnowledgeFailed`, `upsertRawFile`, and the + `PingResult` / `FileAnalysis` / `RawFileDoc` types. Anything not + re-exported here is internal. 
- **[client.ts](client.ts)** — module-scoped `MongoClient` singleton plus the lifecycle (`connectMongo`, `closeMongo`), the health probe (`pingMongo`), and the **internal** `_getDb()` accessor. Reads the URI via @@ -22,12 +22,19 @@ package-level contract; this file documents how the source tree is split. `Collections.Knowledge = "knowledge"`, `Collections.Raw = "raw"`. `Nodes` and `Jobs` join when their helpers land. **Internal** — not re-exported from `index.ts`; consumed only by helpers in this folder. -- **[knowledge.ts](knowledge.ts)** — domain CRUD helper: - `setKnowledgeState(knowledgeId, state)`. Uses `_getDb()` to access - `Collections.Knowledge`, runs `updateOne({ knowledgeId }, { $set: { -"status.state": state, updatedAt: } })`, and throws - `KnowledgeNotFoundError` on `matchedCount === 0`. Called by `@bb/queue` - publishers on enqueue. +- **[knowledge.ts](knowledge.ts)** — domain CRUD helpers: + - `setKnowledgeState(knowledgeId, state)` runs + `updateOne({ knowledgeId }, { $set: { "status.state": state, updatedAt }, $unset: { failure: "" } })`. + The `$unset` of `failure` makes the next successful transition out of + FAILED automatically clear any stale failure metadata. Throws + `KnowledgeNotFoundError` on `matchedCount === 0`. Called by `@bb/queue` + publishers on enqueue and by the runner on terminal success. + - `markKnowledgeFailed(knowledgeId, reason, category, detail?)` writes + the structured `failure: { reason, category, at, detail? }` subdoc + alongside `status.state = "FAILED"`. `reason` is short and + operator-readable; `detail` is the optional raw provider response. + Called by `@bb/ingest-github/src/pipeline/run.ts` (and `pull.ts`) + catch blocks via the shared `classifyFailure` helper. - **[raw.ts](raw.ts)** — domain CRUD helpers for `Collections.Raw`. Defines the `FileAnalysis` and `RawFileDoc` interfaces (package-local until promotion to `@bb/types`). 
Exports: diff --git a/packages/mongo/src/aggregateStats.ts b/packages/mongo/src/aggregateStats.ts new file mode 100644 index 0000000..0cfa6a8 --- /dev/null +++ b/packages/mongo/src/aggregateStats.ts @@ -0,0 +1,155 @@ +import type { + KnowledgeDoc, + StatsCommitEntry, + StatsRepoEntry, + StatsResponse, + StatsTotals, +} from "@bb/types"; +import { _getDb } from "./client.ts"; +import { Collections } from "./collections.ts"; + +interface CommitHashRecord { + hash: string; + inputTokens: string; + outputTokens: string; + costUsd: string; +} + +/** + * Aggregates token + cost stats over the `knowledge` collection. Replaces the + * previous read against the deleted `processing_stats` collection — the + * authoritative per-commit numbers now live on the knowledge document's + * `source.commitHashes[]` (populated by `setKnowledgeCommit`). + * + * Fields that the old `processing_stats` row carried but the knowledge doc + * does not (per-commit `processingTimeMs`, `totalBatches`, `totalFolders`, + * `filesAnalyzed`, `createdAt`/`updatedAt`) are reported as 0 / empty — + * the `bytebell stats` UI tolerates that. + */ +export async function aggregateStats(): Promise { + const db = _getDb(); + const knowledgeDocs = (await db + .collection(Collections.Knowledge) + .find({}) + .sort({ updatedAt: -1 }) + .toArray()) as unknown as KnowledgeDoc[]; + + const repos: StatsRepoEntry[] = []; + const commitStats: StatsCommitEntry[] = []; + let totalInputTokens = 0; + let totalOutputTokens = 0; + let totalCost = 0; + let totalFiles = 0; + + for (const doc of knowledgeDocs) { + const commits = pickCommits(doc); + const fileCount = await db.collection(Collections.Raw).countDocuments({ knowledgeId: doc.knowledgeId }); + const repoName = deriveRepoName(doc); + const type = doc.source.kind === "github" ? 
("GITHUB" as const) : ("LOCAL" as const); + + let repoIn = 0; + let repoOut = 0; + let repoCost = 0; + for (const c of commits) { + const inT = parseNumber(c.inputTokens); + const outT = parseNumber(c.outputTokens); + const cost = parseNumber(c.costUsd); + repoIn += inT; + repoOut += outT; + repoCost += cost; + commitStats.push({ + knowledgeId: doc.knowledgeId, + repoName, + commitHash: c.hash, + inputTokens: inT, + outputTokens: outT, + estimatedCost: cost, + totalBatches: 0, + processingTimeMs: 0, + totalFiles: fileCount, + totalFolders: 0, + filesAnalyzed: fileCount, + createdAt: "", + updatedAt: "", + }); + } + + repos.push({ + knowledgeId: doc.knowledgeId, + repoName, + type, + fileCount, + folderCount: 0, + inputTokens: repoIn, + outputTokens: repoOut, + estimatedCost: repoCost, + }); + + totalInputTokens += repoIn; + totalOutputTokens += repoOut; + totalCost += repoCost; + totalFiles += fileCount; + } + + const totals: StatsTotals = { + totalRepos: knowledgeDocs.length, + totalFiles, + totalFolders: 0, + totalInputTokens, + totalOutputTokens, + totalEstimatedCost: Math.round(totalCost * 1_000_000) / 1_000_000, + }; + + return { totals, repos, commitStats }; +} + +function pickCommits(doc: KnowledgeDoc): CommitHashRecord[] { + const source = (doc as unknown as { source?: { commitHashes?: unknown } }).source; + const raw = source?.commitHashes; + if (!Array.isArray(raw)) { + return []; + } + const out: CommitHashRecord[] = []; + for (const entry of raw) { + if (typeof entry !== "object" || entry === null) { + continue; + } + const rec = entry as Partial; + if (typeof rec.hash !== "string") { + continue; + } + out.push({ + hash: rec.hash, + inputTokens: typeof rec.inputTokens === "string" ? rec.inputTokens : "0", + outputTokens: typeof rec.outputTokens === "string" ? rec.outputTokens : "0", + costUsd: typeof rec.costUsd === "string" ? 
rec.costUsd : "0", + }); + } + return out; +} + +function parseNumber(value: string): number { + const n = Number.parseFloat(value); + return Number.isFinite(n) ? n : 0; +} + +function deriveRepoName(doc: KnowledgeDoc): string { + if (doc.source.kind === "local") { + const segments = doc.source.sourcePath.split("/").filter((s) => s.length > 0); + return segments.at(-1) ?? doc.source.sourcePath; + } + try { + const segments = new URL(doc.info.repoUrl ?? "").pathname + .split("/") + .map((s) => s.trim()) + .filter((s) => s.length > 0); + const repo = segments.at(-1)?.replace(/\.git$/u, ""); + const owner = segments.at(-2); + if (owner !== undefined && repo !== undefined) { + return `${owner}/${repo}`; + } + } catch { + // fall through + } + return doc.info.repoUrl ?? ""; +} diff --git a/packages/mongo/src/collections.ts b/packages/mongo/src/collections.ts index 8a0714d..7737836 100644 --- a/packages/mongo/src/collections.ts +++ b/packages/mongo/src/collections.ts @@ -1,7 +1,6 @@ export enum Collections { Knowledge = "knowledge", Raw = "raw", - ProcessingStats = "processing_stats", Usage = "mcp_usage", Activity = "mcp_activity", } diff --git a/packages/mongo/src/index.ts b/packages/mongo/src/index.ts index 9f7780a..bcee0b0 100644 --- a/packages/mongo/src/index.ts +++ b/packages/mongo/src/index.ts @@ -5,6 +5,7 @@ export { getKnowledge, setKnowledgeCommit, setKnowledgeState, + markKnowledgeFailed, setKnowledgeBranch, updateKnowledgeProgress, upsertKnowledge, @@ -16,8 +17,7 @@ export type { KnowledgeListEntry, DeleteKnowledgeResult } from "./knowledge.ts"; export { upsertRawFile, listRawFileShas, deleteRawFiles } from "./raw.ts"; export type { FileAnalysis, FileAnalysisSection, RawFileDoc } from "./raw.ts"; -export { recordProcessingStats, aggregateStats } from "./processingStats.ts"; -export type { RecordProcessingStatsInput } from "./processingStats.ts"; +export { aggregateStats } from "./aggregateStats.ts"; export { incrementUsage, getMonthlyUsage, getGlobalUsage } 
from "./usage.ts"; export { recordActivity } from "./activity.ts"; diff --git a/packages/mongo/src/knowledge.ts b/packages/mongo/src/knowledge.ts index a04f163..a83e30e 100644 --- a/packages/mongo/src/knowledge.ts +++ b/packages/mongo/src/knowledge.ts @@ -1,4 +1,4 @@ -import type { KnowledgeDoc, KnowledgeState } from "@bb/types"; +import type { KnowledgeDoc, KnowledgeFailureCategory, KnowledgeState } from "@bb/types"; import { KnowledgeNotFoundError } from "@bb/errors"; import { _getDb } from "./client.ts"; import { Collections } from "./collections.ts"; @@ -10,9 +10,51 @@ export interface KnowledgeListEntry extends KnowledgeDoc { } export async function setKnowledgeState(knowledgeId: string, state: KnowledgeState): Promise { + const update: Record = { "status.state": state, updatedAt: new Date() }; const result = await _getDb() .collection(Collections.Knowledge) - .updateOne({ knowledgeId }, { $set: { "status.state": state, updatedAt: new Date() } }); + .updateOne({ knowledgeId }, { $set: update, $unset: { failure: "" } }); + if (result.matchedCount === 0) { + throw new KnowledgeNotFoundError(knowledgeId); + } +} + +/** + * Marks a knowledge as FAILED and records the structured failure reason on + * the top-level `failure` subdoc. The next successful transition out of + * FAILED automatically clears it (see `setKnowledgeState`'s `$unset`). + * + * `reason` is a short operator-readable sentence (UI surfaces it directly). + * `detail` is the raw provider response or structured debug payload (UI may + * hide behind a disclosure). 
+ */ +export async function markKnowledgeFailed( + knowledgeId: string, + reason: string, + category: KnowledgeFailureCategory, + detail?: string, +): Promise { + const now = new Date(); + const failure: { reason: string; category: KnowledgeFailureCategory; at: Date; detail?: string } = { + reason, + category, + at: now, + }; + if (detail !== undefined && detail.length > 0) { + failure.detail = detail; + } + const result = await _getDb() + .collection(Collections.Knowledge) + .updateOne( + { knowledgeId }, + { + $set: { + "status.state": "FAILED", + failure, + updatedAt: now, + }, + }, + ); if (result.matchedCount === 0) { throw new KnowledgeNotFoundError(knowledgeId); } @@ -31,6 +73,7 @@ export async function setKnowledgeCommit( commitHash: string, inputTokens: string = "", outputTokens: string = "", + costUsd: string = "0", ): Promise { const result = await _getDb() .collection(Collections.Knowledge) @@ -39,7 +82,7 @@ export async function setKnowledgeCommit( { $set: { "source.commitId": commitHash, updatedAt: new Date() }, $addToSet: { - "source.commitHashes": { hash: commitHash, inputTokens, outputTokens }, + "source.commitHashes": { hash: commitHash, inputTokens, outputTokens, costUsd }, }, }, ); @@ -102,7 +145,6 @@ export async function upsertKnowledge(doc: Omit & { u export interface DeleteKnowledgeResult { knowledgeDeleted: number; rawDeleted: number; - statsDeleted: number; } export async function deleteKnowledge(knowledgeId: string): Promise { @@ -112,11 +154,9 @@ export async function deleteKnowledge(knowledgeId: string): Promise { - const now = new Date(); - const totals = sumModelTokens(input.modelTokens); - await _getDb() - .collection(Collections.ProcessingStats) - .updateOne( - { knowledgeId: input.knowledgeId, commitHash: input.commitHash }, - { - $set: { - repoName: input.repoName, - modelTokens: input.modelTokens, - inputTokens: totals.inputTokens, - outputTokens: totals.outputTokens, - estimatedCost: input.estimatedCost, - totalBatches: 
input.totalBatches, - totalFiles: input.totalFiles, - totalFolders: input.totalFolders, - filesAnalyzed: input.filesAnalyzed, - processingTimeMs: input.processingTimeMs, - updatedAt: now, - }, - $setOnInsert: { - knowledgeId: input.knowledgeId, - commitHash: input.commitHash, - createdAt: now, - }, - }, - { upsert: true }, - ); - return totals; -} - -export async function aggregateStats(): Promise { - const db = _getDb(); - const knowledgeDocs = (await db - .collection(Collections.Knowledge) - .find({}) - .sort({ updatedAt: -1 }) - .toArray()) as unknown as KnowledgeDoc[]; - - const statsDocs = (await db - .collection(Collections.ProcessingStats) - .find({}) - .sort({ updatedAt: -1 }) - .toArray()) as unknown as ProcessingStatsDoc[]; - - const repos: StatsRepoEntry[] = []; - for (const doc of knowledgeDocs) { - const matchedStats = statsDocs.filter((s) => s.knowledgeId === doc.knowledgeId); - const aggregate = aggregateRepoTokens(matchedStats); - const fileCount = await db.collection(Collections.Raw).countDocuments({ knowledgeId: doc.knowledgeId }); - repos.push({ - knowledgeId: doc.knowledgeId, - repoName: matchedStats[0]?.repoName ?? deriveRepoName(doc), - type: doc.source.kind === "github" ? 
"GITHUB" : "LOCAL", - fileCount, - folderCount: 0, - inputTokens: aggregate.inputTokens, - outputTokens: aggregate.outputTokens, - estimatedCost: aggregate.estimatedCost, - }); - } - - const commitStats: StatsCommitEntry[] = statsDocs.map((s) => ({ - knowledgeId: s.knowledgeId, - repoName: s.repoName, - commitHash: s.commitHash, - inputTokens: s.inputTokens, - outputTokens: s.outputTokens, - estimatedCost: s.estimatedCost, - totalBatches: s.totalBatches, - processingTimeMs: s.processingTimeMs, - totalFiles: s.totalFiles, - totalFolders: s.totalFolders, - filesAnalyzed: s.filesAnalyzed, - createdAt: toIso(s.createdAt), - updatedAt: toIso(s.updatedAt), - })); - - const totals: StatsTotals = { - totalRepos: knowledgeDocs.length, - totalFiles: repos.reduce((sum, r) => sum + r.fileCount, 0), - totalFolders: 0, - totalInputTokens: statsDocs.reduce((sum, s) => sum + (s.inputTokens ?? 0), 0), - totalOutputTokens: statsDocs.reduce((sum, s) => sum + (s.outputTokens ?? 0), 0), - totalEstimatedCost: sumCost(statsDocs.map((s) => s.estimatedCost)), - }; - - return { totals, repos, commitStats }; -} - -function sumModelTokens(modelTokens: ModelTokenBreakdown): { inputTokens: number; outputTokens: number } { - let inputTokens = 0; - let outputTokens = 0; - for (const usage of Object.values(modelTokens)) { - inputTokens += usage.inputTokens; - outputTokens += usage.outputTokens; - } - return { inputTokens, outputTokens }; -} - -function aggregateRepoTokens(stats: ProcessingStatsDoc[]): { - inputTokens: number; - outputTokens: number; - estimatedCost: number; -} { - let inputTokens = 0; - let outputTokens = 0; - for (const s of stats) { - inputTokens += s.inputTokens ?? 0; - outputTokens += s.outputTokens ?? 
0; - } - return { - inputTokens, - outputTokens, - estimatedCost: sumCost(stats.map((s) => s.estimatedCost)), - }; -} - -function sumCost(values: number[]): number { - let total = 0; - let anyKnown = false; - for (const v of values) { - if (typeof v !== "number" || v === COST_UNKNOWN) { - continue; - } - anyKnown = true; - total += v; - } - if (!anyKnown) { - return values.length === 0 ? 0 : COST_UNKNOWN; - } - return Math.round(total * 1_000_000) / 1_000_000; -} - -function deriveRepoName(doc: KnowledgeDoc): string { - if (doc.source.kind === "local") { - const segments = doc.source.sourcePath.split("/").filter((s) => s.length > 0); - return segments.at(-1) ?? doc.source.sourcePath; - } - try { - const segments = new URL(doc.info.repoUrl ?? "").pathname - .split("/") - .map((s) => s.trim()) - .filter((s) => s.length > 0); - const repo = segments.at(-1)?.replace(/\.git$/u, ""); - const owner = segments.at(-2); - if (owner !== undefined && repo !== undefined) { - return `${owner}/${repo}`; - } - } catch { - // fall through - } - return doc.info.repoUrl ?? 
""; -} - -function toIso(value: Date | string | undefined): string { - if (value === undefined) { - return new Date(0).toISOString(); - } - if (value instanceof Date) { - return value.toISOString(); - } - return new Date(value).toISOString(); -} diff --git a/packages/server/src/deleteRoute.ts b/packages/server/src/deleteRoute.ts index 72788eb..a35a63f 100644 --- a/packages/server/src/deleteRoute.ts +++ b/packages/server/src/deleteRoute.ts @@ -40,7 +40,6 @@ export function buildDeleteRoute(): Router { jobsRemoved: removedJobs.removed, mongoDeleted: mongoResult.knowledgeDeleted, rawDeleted: mongoResult.rawDeleted, - statsDeleted: mongoResult.statsDeleted, }); }); return router; diff --git a/packages/types/src/README.md b/packages/types/src/README.md index bcb5ce3..d9c4fa0 100644 --- a/packages/types/src/README.md +++ b/packages/types/src/README.md @@ -48,8 +48,23 @@ package-level contract; this file documents how the source tree is split. downstream consumers can stash extra fields without forcing schema changes here. The pull pipeline reads `knowledge.info.repoUrl` / `knowledge.info.branch` directly — that's the single source of truth for the URL/branch, no fallback. + - `KnowledgeFailureCategory` is a closed union covering the operator-facing + failure taxonomy: `"llm_config"` (no key), `"llm_auth"` (401/403), + `"llm_quota"` (402), `"llm_rate_limit"` (429), `"llm_unreachable"` + (5xx / network / timeout), `"cancelled"`, `"internal"`. The + HTTP-status → category mapping lives in + `@bb/ingest-github/src/pipeline/failure-classifier.ts`. + - `KnowledgeFailure` is the structured failure record: + `{ reason: string; category: KnowledgeFailureCategory; at: Date; detail?: string }`. + `reason` is a single short operator-readable sentence (UI surfaces it + directly), `detail` is the raw provider response body (UI hides it + behind a disclosure). - `KnowledgeDoc` carries both: `source` for upstream-type + indexed-commit - state, `info` for repo coordinates. 
Both are required on every doc. + state, `info` for repo coordinates. Both are required on every doc. The + optional `failure?: KnowledgeFailure` field is populated when + `status.state === FAILED` and cleared automatically by the next + `setKnowledgeState` call (the function `$unset`s it on transitions out + of FAILED). ## Module dependency graph diff --git a/packages/types/src/index.ts b/packages/types/src/index.ts index fc21d19..b5171f8 100644 --- a/packages/types/src/index.ts +++ b/packages/types/src/index.ts @@ -13,17 +13,11 @@ export { KnowledgeState } from "./knowledge.ts"; export type { GithubKnowledgeSource, KnowledgeDoc, + KnowledgeFailure, + KnowledgeFailureCategory, KnowledgeInfo, KnowledgeSource, LocalKnowledgeSource, } from "./knowledge.ts"; -export type { - ModelTokenBreakdown, - ModelTokenUsage, - ProcessingStatsDoc, - StatsCommitEntry, - StatsRepoEntry, - StatsResponse, - StatsTotals, -} from "./stats.ts"; +export type { StatsCommitEntry, StatsRepoEntry, StatsResponse, StatsTotals } from "./stats.ts"; export type { UsageDoc, ActivityDoc, UsageIncrement, ActivityInput } from "./usage.ts"; diff --git a/packages/types/src/knowledge.ts b/packages/types/src/knowledge.ts index 6c922fd..aa6f77b 100644 --- a/packages/types/src/knowledge.ts +++ b/packages/types/src/knowledge.ts @@ -11,6 +11,8 @@ export interface CommitHashRecord { hash: string; inputTokens: string; outputTokens: string; + /** Authoritative provider-reported cost in USD (OpenRouter `usage.cost`). "0" for Ollama or when omitted by provider. */ + costUsd: string; } export interface GithubKnowledgeSource { @@ -36,6 +38,36 @@ export interface KnowledgeInfo { [key: string]: unknown; } +/** + * Categorises why a knowledge ingestion failed. Drives operator triage and + * downstream UI hints. 
+ * + * - `llm_config` — missing or empty API key (operator action required) + * - `llm_auth` — 401/403 from provider, key invalid/expired (operator action) + * - `llm_quota` — 402, credit/billing exhausted (operator action) + * - `llm_rate_limit` — 429, transient — could be retried later by operator + * - `llm_unreachable` — 5xx / network / timeout (transient infra issue) + * - `cancelled` — operator-initiated cancellation + * - `internal` — anything else (bug, infra, unexpected exception) + */ +export type KnowledgeFailureCategory = + | "llm_config" + | "llm_auth" + | "llm_quota" + | "llm_rate_limit" + | "llm_unreachable" + | "cancelled" + | "internal"; + +export interface KnowledgeFailure { + /** Short, operator-readable sentence. UI can render this directly. */ + reason: string; + category: KnowledgeFailureCategory; + at: Date; + /** Raw provider response or structured detail for debugging. May be long. */ + detail?: string; +} + export interface KnowledgeDoc { knowledgeId: string; source: KnowledgeSource; @@ -43,4 +75,9 @@ export interface KnowledgeDoc { createdAt: Date; updatedAt: Date; info: KnowledgeInfo; + /** + * Populated when `status.state === KnowledgeState.Failed`. Cleared + * automatically on the next successful transition out of FAILED. 
+ */ + failure?: KnowledgeFailure; } diff --git a/packages/types/src/stats.ts b/packages/types/src/stats.ts index cdfee5b..5e1e1e7 100644 --- a/packages/types/src/stats.ts +++ b/packages/types/src/stats.ts @@ -1,27 +1,3 @@ -export interface ModelTokenUsage { - inputTokens: number; - outputTokens: number; -} - -export type ModelTokenBreakdown = Record; - -export interface ProcessingStatsDoc { - knowledgeId: string; - repoName: string; - commitHash: string; - modelTokens: ModelTokenBreakdown; - inputTokens: number; - outputTokens: number; - estimatedCost: number; - totalBatches: number; - totalFiles: number; - totalFolders: number; - filesAnalyzed: number; - processingTimeMs: number; - createdAt: Date; - updatedAt: Date; -} - export interface StatsTotals { totalRepos: number; totalFiles: number; From 064ebf27fad655a5ed85a986f0d82f0618e26766 Mon Sep 17 00:00:00 2001 From: Dead-Bytes <143434285+Dead-Bytes@users.noreply.github.com> Date: Fri, 22 May 2026 13:01:56 +0530 Subject: [PATCH 23/34] refactor: update LLM credential handling in big-file processing and condensing --- .../src/strategies/flat-folder/README.md | 13 +++++----- .../strategies/flat-folder/big-file/README.md | 23 ++++++++++++------ .../flat-folder/big-file/condenser.ts | Bin 9844 -> 10066 bytes .../strategies/flat-folder/big-file/index.ts | 2 +- 4 files changed, 23 insertions(+), 15 deletions(-) diff --git a/packages/ingest-github/src/strategies/flat-folder/README.md b/packages/ingest-github/src/strategies/flat-folder/README.md index 8d26d9d..a454303 100644 --- a/packages/ingest-github/src/strategies/flat-folder/README.md +++ b/packages/ingest-github/src/strategies/flat-folder/README.md @@ -103,14 +103,15 @@ The strategy emits progress through the `ProgressContext` port defined in after `saveCondensed`; failures inside the sink are logged WARN and do not interrupt the analyse loop. The open-source binary never wires a sink — `archiveSink` is undefined and the call is skipped entirely. 
-- **Per-job LLM credentials thread through every phase.** The orchestrator +- **Per-call LLM credentials thread through every phase.** The orchestrator reads `context.llmCallContext` (an optional `AskLlmOptions` built by the runner from `GithubIndexPayload.{llmApiKey, llmProvider, llmModel}`) and forwards it into every phase that issues LLM calls: phase 1 via `classifyAndAnalyseSmall`'s `llmCallContext`, phase 2 via - `processBigFilesQueue`, phase 3 via `backfillMissingFields`, phase 4 via - `backfillBigFiles`, phase 5 via `runFolderSummaryPhase`, phase 6 via - `summariseRepo`. The phases pass the same option object through to - `askJsonLLM` so per-org overrides reach `@bb/llm` unchanged. OSS - standalone leaves `llmCallContext` undefined and falls back to + `processBigFilesQueue` (which threads it into **both** the chunk + analyzer and `condenseChunks`), phase 3 via `backfillMissingFields`, + phase 4 via `backfillBigFiles`, phase 5 via `runFolderSummaryPhase`, + phase 6 via `summariseRepo`. The phases pass the same option object + through to `askJsonLLM` so the per-call override reaches `@bb/llm` + unchanged. When `llmCallContext` is undefined the call falls back to `Config.OpenrouterApiKey` + `Config.LlmProvider`. diff --git a/packages/ingest-github/src/strategies/flat-folder/big-file/README.md b/packages/ingest-github/src/strategies/flat-folder/big-file/README.md index b1c974a..ba5e5f8 100644 --- a/packages/ingest-github/src/strategies/flat-folder/big-file/README.md +++ b/packages/ingest-github/src/strategies/flat-folder/big-file/README.md @@ -15,10 +15,15 @@ depending on chunk count and prompt budget. `askJsonLLM` with the chunk prompt; tolerates failures by returning an empty analysis. `llmCallContext` forwards per-job LLM credentials threaded through from `StrategyContext`. 
-- `condenser.ts` — `condenseChunks(relativePath, chunks)`: +- `condenser.ts` — `condenseChunks(relativePath, chunks, llmCallContext?)`: ≤ `SmallFileDedupThreshold` → deterministic merge (no LLM); - above → recursive map-reduce. Per-condense LLM failure falls back to - deterministic dedup so recursion always terminates. + above → recursive map-reduce. `llmCallContext` is threaded through + `condenseRecursively` and `condenseOne` to every `askJsonLLM` call so + the same per-call credential bag the chunk analyser uses also reaches + the condense step — without it, callers that rely on per-call overrides + instead of `Config.OpenrouterApiKey` would hit `LlmConfigError` here. + Per-condense LLM failure falls back to deterministic dedup so recursion + always terminates. - `storage.ts` — on-disk cache (chunk JSON, manifest, condensed analysis) + `iterateCondensed(metaPaths)` async iterator used by Phase 5. - `cache.ts` — `inspect(metaPaths, relativePath)` returns `complete`, @@ -28,11 +33,13 @@ depending on chunk count and prompt budget. sizeBytes, llmCallContext?, progressContext?})`. Sequential per file (chunk-level concurrency inside). Persists every intermediate artifact, so a restart resumes from the next unfinished chunk. `llmCallContext` - is forwarded to every chunk analyzer call so per-job LLM credentials - reach `@bb/llm`. When `progressContext` is present, the chunk pool runs - under a fixed-total reporter - (`subPhase: "big_file:"`, `total = chunks.length`) so - long single-file analyses surface as live `PHASE_TICK` envelopes + is forwarded to **both** sides of the big-file pipeline — every + `analyzeChunk` call inside the worker loop **and** the final + `condenseChunks(...)` call — so per-call LLM credentials reach + `@bb/llm` consistently across chunk analysis and condense. 
When + `progressContext` is present, the chunk pool runs under a fixed-total + reporter (`subPhase: "big_file:"`, `total = chunks.length`) + so long single-file analyses surface as live `PHASE_TICK` envelopes carrying per-chunk progress instead of looking frozen. ## Invariants diff --git a/packages/ingest-github/src/strategies/flat-folder/big-file/condenser.ts b/packages/ingest-github/src/strategies/flat-folder/big-file/condenser.ts index a7b927655c05f013c4257bbd221f0150187bcf68..fdde9b835a295bd1137c8c84fd809dc1a180b219 100644 GIT binary patch delta 305 zcmez3bIEUltcZ?6No7H*f@5*EPfo6XK}lwQUhziRT})D33JOK3If*5iWvKy)B^g!< z#U(|VdFeV_3X|V3$teM8g`AvR=fs>G=lr~q)QS>&D^yb^&t_GfT*t$<*_in|qks;Y z>d9KH#R`Os*gTuHk_kiWW)Y6rSPWdxnU1Mcj7yklaxkm498qrlB8Kiv1$%pi>e|gk HyiAe+=QC=i delta 110 zcmccQ_r+&|>_)>~OwvWEIf*5iWvKy)B^g!<#U(|VdFeU|$r+`2*^?cGq$l&UDQwPS z{>wPIoV9qeGOzMxeYOy$&67E1GfuYSO5ePdOPFc0G#3vSLhocpX?c*g$!@$~HV5!A GNdf?c9waRQ diff --git a/packages/ingest-github/src/strategies/flat-folder/big-file/index.ts b/packages/ingest-github/src/strategies/flat-folder/big-file/index.ts index 255be0b..c35b234 100644 --- a/packages/ingest-github/src/strategies/flat-folder/big-file/index.ts +++ b/packages/ingest-github/src/strategies/flat-folder/big-file/index.ts @@ -74,7 +74,7 @@ export async function processBigFile(input: ProcessBigFileInput): Promise `chunks/${encodeFolder(input.relativePath)}/chunk-${i}.json`); const totalTokenCount = chunks.reduce((acc, c) => acc + c.tokenCount, 0); From f9949f640be03ca7e245b1a9a45be02054507efa Mon Sep 17 00:00:00 2001 From: Dead-Bytes <143434285+Dead-Bytes@users.noreply.github.com> Date: Fri, 22 May 2026 13:02:35 +0530 Subject: [PATCH 24/34] refactor: enhance OpenRouter provider routing to prevent fallback on slow calls --- .../seed-data/ignorePatterns.json | 3 ++- packages/llm/README.md | 22 +++++++++++++++---- packages/llm/src/README.md | 11 ++++++---- packages/llm/src/openrouter.ts | 13 +++++++++-- 4 files changed, 38 insertions(+), 11 deletions(-) 
diff --git a/packages/ingest-github/src/pipeline/skip-decisions/seed-data/ignorePatterns.json b/packages/ingest-github/src/pipeline/skip-decisions/seed-data/ignorePatterns.json index f7991f1..96de6e3 100644 --- a/packages/ingest-github/src/pipeline/skip-decisions/seed-data/ignorePatterns.json +++ b/packages/ingest-github/src/pipeline/skip-decisions/seed-data/ignorePatterns.json @@ -305,7 +305,8 @@ { "type": "exact", "pattern": "CODE_OF_CONDUCT.txt" }, { "type": "exact", "pattern": "FAQ.md" }, { "type": "exact", "pattern": "TROUBLESHOOTING.md" }, - { "type": "exact", "pattern": "UPGRADING.md" } + { "type": "exact", "pattern": "UPGRADING.md" }, + { "type": "extension", "pattern": ".md" } ], "logFiles": [ { "type": "extension", "pattern": ".log" }, diff --git a/packages/llm/README.md b/packages/llm/README.md index 5d659d0..2deb951 100644 --- a/packages/llm/README.md +++ b/packages/llm/README.md @@ -29,10 +29,15 @@ selected by `Config.LlmProvider` (`"openrouter"` default, or fallback chain. The request body includes a `models: [...]` array when the deduplicated chain has ≥2 non-empty entries and always sends `usage: { include: true }` so OpenRouter populates `usage.cost` in - the response. `usage.model` is the actual model the gateway picked. - Tokens come straight from OpenRouter's `usage.prompt_tokens` / - `usage.completion_tokens`; `costUsd` from `usage.cost` (defaults to - `0` when the provider omits it — common for `:free` models). + the response. The body also pins `provider: { allow_fallbacks: false }` + so OpenRouter does not silently cycle across upstream providers of the + same model — a slow or sick provider surfaces a real error to us + instead of consuming the wall-clock budget. Model-level fallback + through the `models` chain is unaffected. `usage.model` is the actual + model the gateway picked. 
Tokens come straight from OpenRouter's + `usage.prompt_tokens` / `usage.completion_tokens`; `costUsd` from + `usage.cost` (defaults to `0` when the provider omits it — common for + `:free` models). - **Ollama mode** — POST to `${Config.OllamaUrl}/api/chat` with `{ model: Config.OllamaModel, messages, stream: false }`. Single model per request — no fallback chain (Ollama does not have a @@ -151,6 +156,15 @@ it. The cost ledger described in [docs/arch.md](../../docs/arch.md) is sees a single `AskLlmResult`. BullMQ's `attempts: 3` wraps the whole call — retries walk the chain again, useful when a transient OpenRouter outage clears between retries. +4a. **No upstream-provider fallback.** Every request carries + `provider: { allow_fallbacks: false }`. This is orthogonal to the + `models` chain in invariant 4 — `models` controls *which model* the + gateway tries; `allow_fallbacks` controls whether OpenRouter routes + to a different upstream backend serving the same model when the first + one stalls. We disable the latter so a slow provider cannot eat the + wall-clock without ever producing tokens; the surfaced error becomes + actionable (specific provider, specific status) instead of a generic + timeout. 5. **Errors are typed, not strings.** `LlmConfigError` carries the exact `bytebell keys set` hint; `LlmError` carries `cause`. 6. **Timeout is enforced.** AbortController fires at `timeoutMs`; the diff --git a/packages/llm/src/README.md b/packages/llm/src/README.md index 61d122a..1b3bba7 100644 --- a/packages/llm/src/README.md +++ b/packages/llm/src/README.md @@ -21,10 +21,13 @@ package-level contract; this file documents how the source tree is split. or `Config.OpenrouterModel` + four fallback slots), caps the chain at 3 entries (OpenRouter's hard limit), POSTs to the chat-completions endpoint with an AbortController timeout, parses the typed - `OpenRouterResponse`, returns the first choice's content. `usage.model` - reflects which model OpenRouter actually routed to. 
Throws - `LlmConfigError` if the API key resolves to empty, `LlmError` on - timeout / HTTP non-2xx / empty completion. + `OpenRouterResponse`, returns the first choice's content. The body + always carries `provider: { allow_fallbacks: false }` so OpenRouter + cannot silently route across upstream providers of the same model; + see `OpenRouterProviderRouting` in this file and invariant 4a in the + package README. `usage.model` reflects which model OpenRouter actually + routed to. Throws `LlmConfigError` if the API key resolves to empty, + `LlmError` on timeout / HTTP non-2xx / empty completion. - **[ollama.ts](ollama.ts)** — `callOllama` and `resolveOllamaChain`. Single-model per request (Ollama has no fan-out). Reads model from `opts.model ?? Config.OllamaModel`. Ignores `opts.apiKey` (Ollama is diff --git a/packages/llm/src/openrouter.ts b/packages/llm/src/openrouter.ts index 53b48b4..a4f99e7 100644 --- a/packages/llm/src/openrouter.ts +++ b/packages/llm/src/openrouter.ts @@ -20,11 +20,19 @@ interface OpenRouterUsageAccounting { include: true; } +interface OpenRouterProviderRouting { + // Pin OpenRouter to the first viable upstream provider. Without this, + // OpenRouter silently cycles across providers on slow/failed calls and + // we lose the per-call wall-clock budget before a real error surfaces. + allow_fallbacks: boolean; +} + interface OpenRouterRequest { model: string; models?: string[]; messages: OpenRouterMessage[]; usage: OpenRouterUsageAccounting; + provider: OpenRouterProviderRouting; } interface OpenRouterResponse { @@ -67,10 +75,11 @@ export async function callOpenRouter(prompt: string, opts: AskLlmOptions, timeou messages.push({ role: "user", content: prompt }); const usageAccounting: OpenRouterUsageAccounting = { include: true }; + const providerRouting: OpenRouterProviderRouting = { allow_fallbacks: false }; const body: OpenRouterRequest = cappedChain.length > 1 - ? 
{ model, models: cappedChain, messages, usage: usageAccounting } - : { model, messages, usage: usageAccounting }; + ? { model, models: cappedChain, messages, usage: usageAccounting, provider: providerRouting } + : { model, messages, usage: usageAccounting, provider: providerRouting }; const controller = new AbortController(); const timer = setTimeout(() => controller.abort(), timeoutMs); From 665c4d124f027cb109bb4d42ec593214a1f0810b Mon Sep 17 00:00:00 2001 From: Dead-Bytes <143434285+Dead-Bytes@users.noreply.github.com> Date: Fri, 22 May 2026 14:07:37 +0530 Subject: [PATCH 25/34] refactor: restructure flat-folder phases for improved clarity and performance --- packages/config/src/schema.ts | 7 + packages/ingest-github/README.md | 20 +- packages/ingest-github/src/pipeline/paths.ts | 1 + .../strategies/flat-folder/analyse-changed.ts | 2 +- .../strategies/flat-folder/big-file/README.md | 31 ++- .../src/strategies/flat-folder/index.ts | 64 +++-- .../strategies/flat-folder/phases/README.md | 143 ++++++---- .../flat-folder/phases/analyse-small.ts | 133 +++++++++ .../phases/classify-and-analyse-small.ts | 161 ----------- .../flat-folder/phases/process-big-files.ts | 257 +++++++++++++++++- .../flat-folder/phases/scan-and-classify.ts | 131 +++++++++ .../strategies/flat-folder/scan-manifest.ts | 61 +++++ .../ingest-github/src/types/meta-paths.ts | 1 + packages/types/src/config.ts | 1 + 14 files changed, 763 insertions(+), 250 deletions(-) create mode 100644 packages/ingest-github/src/strategies/flat-folder/phases/analyse-small.ts delete mode 100644 packages/ingest-github/src/strategies/flat-folder/phases/classify-and-analyse-small.ts create mode 100644 packages/ingest-github/src/strategies/flat-folder/phases/scan-and-classify.ts create mode 100644 packages/ingest-github/src/strategies/flat-folder/scan-manifest.ts diff --git a/packages/config/src/schema.ts b/packages/config/src/schema.ts index 63a65d4..77a7468 100644 --- a/packages/config/src/schema.ts +++ 
b/packages/config/src/schema.ts @@ -41,6 +41,7 @@ export const configSchema = z "big.file.concurrency": z.number().int().positive().default(25), "absolute.file.size.cap": z.number().int().positive().default(52428800), "concurrent.workers": z.number().int().positive().default(4), + "llm.concurrency": z.number().int().positive().default(29), "condense.context.limit": z.number().int().positive().default(12000), "condense.prompt.overhead": z.number().int().nonnegative().default(1500), "small.file.dedup.threshold": z.number().int().positive().default(3), @@ -81,6 +82,7 @@ export type ConfigValueMap = { [Config.BigFileConcurrency]: number; [Config.AbsoluteFileSizeCap]: number; [Config.ConcurrentWorkers]: number; + [Config.LlmConcurrency]: number; [Config.CondenseContextLimit]: number; [Config.CondensePromptOverhead]: number; [Config.SmallFileDedupThreshold]: number; @@ -135,6 +137,7 @@ export const HINTS: Readonly> = { [Config.BigFileConcurrency]: "bytebell set big.file.concurrency ", [Config.AbsoluteFileSizeCap]: "bytebell set absolute.file.size.cap ", [Config.ConcurrentWorkers]: "bytebell set concurrent.workers ", + [Config.LlmConcurrency]: "bytebell set llm.concurrency ", [Config.CondenseContextLimit]: "bytebell set condense.context.limit ", [Config.CondensePromptOverhead]: "bytebell set condense.prompt.overhead ", [Config.SmallFileDedupThreshold]: "bytebell set small.file.dedup.threshold ", @@ -195,6 +198,8 @@ export function readField(cfg: BytebellConfig, key: K): Config return cfg["absolute.file.size.cap"] as ConfigValue; case Config.ConcurrentWorkers: return cfg["concurrent.workers"] as ConfigValue; + case Config.LlmConcurrency: + return cfg["llm.concurrency"] as ConfigValue; case Config.CondenseContextLimit: return cfg["condense.context.limit"] as ConfigValue; case Config.CondensePromptOverhead: @@ -264,6 +269,8 @@ export function writeField(cfg: BytebellConfig, key: K, value: return { ...cfg, "absolute.file.size.cap": value as number }; case 
Config.ConcurrentWorkers: return { ...cfg, "concurrent.workers": value as number }; + case Config.LlmConcurrency: + return { ...cfg, "llm.concurrency": value as number }; case Config.CondenseContextLimit: return { ...cfg, "condense.context.limit": value as number }; case Config.CondensePromptOverhead: diff --git a/packages/ingest-github/README.md b/packages/ingest-github/README.md index b442726..6073339 100644 --- a/packages/ingest-github/README.md +++ b/packages/ingest-github/README.md @@ -132,14 +132,23 @@ worker hardcodes a single `IngestionStrategy` instance (currently - `:File` graph nodes + `:HAS_FILE` / `:HAS_KEYWORD` / `:HAS_CLASS` / `:HAS_FUNCTION` / `:HAS_IMPORT_INTERNAL` / `:HAS_IMPORT_EXTERNAL` relationships — written via `upsertFileNode` from `@bb/neo4j`. +- `meta-output/scan-manifest.json` — the canonical small/big/oversized + classification produced by Phase 1 (`scanAndClassify`). Per-file entries + carry `tokenCount`, `kind`, and (for big files) `estimatedChunks`. + Phases 2a (small) and 2b (big) consume the manifest in parallel. +- `meta-output/bigFiles.json` — legacy view written alongside the manifest + for the pull-path and backfill phases. The main strategy no longer + consumes it directly. ## Invariants -1. **Sequential per-file processing.** Intentionally degraded; one - `upsertRawFile` per file. The small-file path issues one `askLLM`; - the big-file path issues N (one per chunk) plus condensation calls, - all sequential — no `Promise.all`, no concurrency cap. Revisit when - the latency profile demands it. +1. **Shared LLM concurrency limiter.** The flat-folder strategy + constructs one `withConcurrency(Config.LlmConcurrency)` instance at + entry (default 29). The small-file phase, the big-file chunk phase, + and per-file condense calls all check out from this single pool, so + total in-flight LLM calls is bounded by one knob. 
The legacy + `processBigFile` driver used by the pull-path still uses its own + per-file pool sized by `Config.BigFileConcurrency`. 2. **Clone idempotent.** Re-runs (BullMQ retries) call `git fetch` + `git reset --hard` in the existing dir rather than re-cloning. Tokens are re-injected into the remote URL each time. @@ -179,7 +188,6 @@ worker hardcodes a single `IngestionStrategy` instance (currently - GitHub API streaming mode (always shell-clone) - Default-branch auto-detection (caller supplies `branch`; defaults to `"main"`) -- Concurrency control / parallel file processing - Folder-level summaries / `repoSummary.json` / `flat-folder` strategy - Semantic chunking (`SemanticChunker`) - Per-chunk persistence (we persist only the merged file-level diff --git a/packages/ingest-github/src/pipeline/paths.ts b/packages/ingest-github/src/pipeline/paths.ts index cdddc2f..ac52215 100644 --- a/packages/ingest-github/src/pipeline/paths.ts +++ b/packages/ingest-github/src/pipeline/paths.ts @@ -30,6 +30,7 @@ export function metaPathsFor(knowledgeId: string): MetaPaths { bigFileAnalysisDir: path.join(metaRoot, "big-file-analysis"), bigFileChunksDir: path.join(metaRoot, "big-file-analysis", "chunks"), bigFilesJson: path.join(metaRoot, "bigFiles.json"), + scanManifestJson: path.join(metaRoot, "scan-manifest.json"), repoSummaryJson: path.join(metaRoot, "repo-summary.json"), }; } diff --git a/packages/ingest-github/src/strategies/flat-folder/analyse-changed.ts b/packages/ingest-github/src/strategies/flat-folder/analyse-changed.ts index 982d0a7..17f0125 100644 --- a/packages/ingest-github/src/strategies/flat-folder/analyse-changed.ts +++ b/packages/ingest-github/src/strategies/flat-folder/analyse-changed.ts @@ -39,7 +39,7 @@ export interface AnalyseChangedResult { /** * Pull-time per-file dispatcher. 
Iterates the changed file set from the - * diff and runs the same per-file work as `classifyAndAnalyseSmall`, but + * diff and runs the same per-file work as `analyseSmallFiles`, but * targeted at known paths rather than a tree walk. * * Reads file content through `input.source` (a `SourceReader`) so the diff --git a/packages/ingest-github/src/strategies/flat-folder/big-file/README.md b/packages/ingest-github/src/strategies/flat-folder/big-file/README.md index ba5e5f8..3e4e6ef 100644 --- a/packages/ingest-github/src/strategies/flat-folder/big-file/README.md +++ b/packages/ingest-github/src/strategies/flat-folder/big-file/README.md @@ -42,11 +42,32 @@ sizeBytes, llmCallContext?, progressContext?})`. Sequential per file so long single-file analyses surface as live `PHASE_TICK` envelopes carrying per-chunk progress instead of looking frozen. +## Two callers + +These leaf helpers (`splitFileIntoChunks`, `analyzeChunk`, `condenseChunks`, +the storage / cache primitives) are consumed by **two** drivers: + +- `processBigFile` (`index.ts`) — legacy serial driver. One big file at a + time, chunks-within-file parallel under `Config.BigFileConcurrency`, + followed by a blocking condense. Used today by the pull-path + (`pipeline/pull.ts`) via `processBigFilesQueue` and by the Phase 4 + backfill. +- `analyseBigFiles` (`phases/process-big-files.ts`) — manifest-driven + chunk-task queue used by the main strategy entry. Every chunk of every + big file is an independent task scheduled through a strategy-wide + shared `ConcurrencyLimiter`. As soon as a file's last chunk lands, + that file's `condenseChunks` is scheduled through the same limiter — + multiple condenses run in parallel with chunks of slower files. + Reuses `splitFileIntoChunks`, `analyzeChunk`, `condenseChunks`, and + the storage helpers without modification. + ## Invariants -- One big file at a time. Concurrency lives at the chunk level inside - `processBigFile`, never across files, to bound peak memory. 
- Every artifact is durable on disk before the next step. The chunk cache - short-circuits on re-runs; the manifest plus condensed JSON are the - Phase 7 graph-store inputs. -- Cancellation is checked between chunks (`throwIfCancelled(knowledgeId)`). + short-circuits on re-runs (per-chunk granularity, not per-file); the + manifest plus condensed JSON are the Phase 7 graph-store inputs. +- Cancellation is checked between chunks and before each condense + dispatch (`throwIfCancelled(knowledgeId)`). +- `bigFiles.json` is now a derived view written by `scanAndClassify`. + The main strategy reads it indirectly via the manifest; the legacy + drivers (pull-path + backfill) continue to read it directly. diff --git a/packages/ingest-github/src/strategies/flat-folder/index.ts b/packages/ingest-github/src/strategies/flat-folder/index.ts index 09c03c6..924b26f 100644 --- a/packages/ingest-github/src/strategies/flat-folder/index.ts +++ b/packages/ingest-github/src/strategies/flat-folder/index.ts @@ -1,10 +1,14 @@ +import { Config } from "@bb/types"; +import { getConfigValue } from "@bb/config"; import { logger } from "@bb/logger"; import type { FileAnalyzer } from "#src/types/pipeline.ts"; import type { IngestStrategy, StrategyInput, StrategyResult } from "#src/types/strategy.ts"; import { throwIfCancelled } from "#src/pipeline/cancellation.ts"; import { classifyFailure } from "#src/pipeline/failure-classifier.ts"; -import { classifyAndAnalyseSmall } from "./phases/classify-and-analyse-small.ts"; -import { processBigFilesQueue } from "./phases/process-big-files.ts"; +import { withConcurrency } from "#src/pipeline/concurrency.ts"; +import { scanAndClassify } from "./phases/scan-and-classify.ts"; +import { analyseSmallFiles } from "./phases/analyse-small.ts"; +import { analyseBigFiles } from "./phases/process-big-files.ts"; import { backfillMissingFields } from "./backfill/fields.ts"; import { backfillBigFiles } from "./backfill/big-files.ts"; import { runFolderSummaryPhase } 
from "./folder-summary.ts"; @@ -28,43 +32,60 @@ export function createFlatFolderStrategy(deps: FlatFolderStrategyDeps): IngestSt const progressContext: ProgressContext = progressContextFactory(knowledgeId); try { - progressContext.phaseChanged("file_analysis"); + // Shared LLM limiter — small-file analyses, big-file chunk analyses, + // and per-file condense calls all check out from this single pool. + const llmConcurrency = getConfigValue(Config.LlmConcurrency); + const limiter = withConcurrency(llmConcurrency); + + progressContext.phaseChanged("scan"); + logger.info(`flat-folder: phase1 (scan + classify) starting for ${knowledgeId} limit=${llmConcurrency}`); + throwIfCancelled(knowledgeId); + const scanInput: Parameters<typeof scanAndClassify>[0] = { + knowledgeId, + source, + metaPaths, + progressContext, + }; + if (llmCallContext !== undefined) { + scanInput.llmCallContext = llmCallContext; + } + const { manifest } = await scanAndClassify(scanInput); - logger.info(`flat-folder: phase1 (classify + analyse small) starting for ${knowledgeId}`); + progressContext.phaseChanged("file_analysis"); + logger.info( + `flat-folder: phase2 (analyse small ${manifest.summary.smallCount} + big ${manifest.summary.bigCount}) starting in parallel`, + ); throwIfCancelled(knowledgeId); - const phase1Input: Parameters<typeof classifyAndAnalyseSmall>[0] = { + const smallInput: Parameters<typeof analyseSmallFiles>[0] = { knowledgeId, + manifest, source, metaPaths, analyzer: deps.fileAnalyzer, + limiter, progressContext, }; if (archiveSink !== undefined) { - phase1Input.archiveSink = archiveSink; + smallInput.archiveSink = archiveSink; } if (llmCallContext !== undefined) { - phase1Input.llmCallContext = llmCallContext; + smallInput.llmCallContext = llmCallContext; } - const phase1 = await classifyAndAnalyseSmall(phase1Input); - let totalInputTokens = phase1.tokenUsage.inputTokens; - let totalOutputTokens = phase1.tokenUsage.outputTokens; - let totalCostUsd = phase1.tokenUsage.costUsd; - - logger.info(`flat-folder: phase2 (process big files) starting`); - 
throwIfCancelled(knowledgeId); - const phase2Input: Parameters<typeof processBigFilesQueue>[0] = { + const bigInput: Parameters<typeof analyseBigFiles>[0] = { knowledgeId, + manifest, source, metaPaths, + limiter, progressContext, }; if (llmCallContext !== undefined) { - phase2Input.llmCallContext = llmCallContext; + bigInput.llmCallContext = llmCallContext; } - const phase2 = await processBigFilesQueue(phase2Input); - totalInputTokens += phase2.tokenUsage.inputTokens; - totalOutputTokens += phase2.tokenUsage.outputTokens; - totalCostUsd += phase2.tokenUsage.costUsd; + const [smallResult, bigResult] = await Promise.all([analyseSmallFiles(smallInput), analyseBigFiles(bigInput)]); + let totalInputTokens = smallResult.tokenUsage.inputTokens + bigResult.tokenUsage.inputTokens; + let totalOutputTokens = smallResult.tokenUsage.outputTokens + bigResult.tokenUsage.outputTokens; + let totalCostUsd = smallResult.tokenUsage.costUsd + bigResult.tokenUsage.costUsd; logger.info(`flat-folder: phase3 (backfill missing fields) starting`); throwIfCancelled(knowledgeId); @@ -121,7 +142,8 @@ export function createFlatFolderStrategy(deps: FlatFolderStrategyDeps): IngestSt progressContext.completed(); return { - filesAnalyzed: phase1.smallFilesAnalysed + phase2.processed + phase2.cached + phase1.oversizedStubs, + filesAnalyzed: + smallResult.smallFilesAnalysed + smallResult.oversizedStubs + bigResult.processed + bigResult.cached, foldersSummarised: phase5.succeeded, repoSummarised, graphNodesWritten: phase7.nodesWritten, diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/README.md b/packages/ingest-github/src/strategies/flat-folder/phases/README.md index f0701a7..e2d218a 100644 --- a/packages/ingest-github/src/strategies/flat-folder/phases/README.md +++ b/packages/ingest-github/src/strategies/flat-folder/phases/README.md @@ -6,35 +6,50 @@ Backfill (Phases 3 and 4) lives in the sibling `backfill/` folder; folder and repo summarisation (Phases 5 and 6) live as `folder-summary.ts` and `repo-summary.ts` at the strategy
root. +The strategy constructs a **shared LLM limiter** (`withConcurrency(Config.LlmConcurrency)`, +default 29) once at entry. Every LLM call across the small-file phase, +the big-file chunk phase, and per-file condense calls checks out from +the same pool — the single tunable for total in-flight LLM calls. + ## Files -- `classify-and-analyse-small.ts` — Phase 1. - `classifyAndAnalyseSmall({knowledgeId, source, metaPaths, analyzer, -skipDecider?, archiveSink?, llmCallContext?, progressContext?})` walks - `source.scan({ skipDecider, llmCallContext })` and per entry: - - `kind === "oversized"` → write a stub via `buildOversizedStub` + - `saveCondensed`, and append a `too-large` row to `bigFiles.json`. - - token count > `Config.ContextWindowLimit` → buffer a - `context-window-exceeded` row for Phase 2. - - otherwise → run `analyseScannedFile(analyzer, entry)` and persist via - `saveCondensed`, under a `withConcurrency(Config.ConcurrentWorkers)` - limiter so analyses run in parallel. - Cancellation is checked at scan boundaries and inside each task; the - buffered big-file list is flushed via `writeBigFiles` after all tasks - drain. -- `process-big-files.ts` — Phase 2. - `processBigFilesQueue({knowledgeId, source, metaPaths, llmCallContext?, progressContext?})` - reads `bigFiles.json`, skips `too-large` entries (counted as - `skippedOversized`), short-circuits when `inspect` returns `complete` - (counted as `cached`), reads the file via `source.readFile`, and - dispatches `processBigFile` sequentially per file with the per-job - `llmCallContext` threaded through. When `progressContext` is present - this phase opens a fixed-total reporter (`subPhase: "big_files_queue"`, - `total = entries.length`) and increments per entry — including - skipped/cached/failed paths so the percentage never stalls. The same - `progressContext` is forwarded into `processBigFile` so each big file - gets its own per-chunk sub-phase. 
Cancellation re-throws past the - phase; other errors are logged per file and counted as `failed`. +- `scan-and-classify.ts` — Phase 1. `scanAndClassify({knowledgeId, source, +metaPaths, skipDecider?, llmCallContext?, progressContext?})` walks + `source.scan({ skipDecider, llmCallContext })` exactly once, counts + tokens for every eligible entry, classifies each as `"small"`, + `"big"` (token count > `Config.ContextWindowLimit`), or `"oversized"` + (yielded as `kind === "oversized"` by `scanRepository`), and writes + `meta-output/scan-manifest.json` plus the legacy `bigFiles.json` (for + pull-path and backfill consumers that have not migrated). Big entries + get a cheap `estimatedChunks = ceil(tokenCount / Config.MaxTokensPerChunk)` + used by Phase 2's progress reporter. No LLM calls. No file analysis. +- `analyse-small.ts` — Phase 2a. `analyseSmallFiles({knowledgeId, manifest, +source, metaPaths, analyzer, limiter, archiveSink?, llmCallContext?, +progressContext?})` filters the manifest to `kind === "small"` entries, + re-reads each file via `source.readFile`, runs the LLM file analyser, + and persists via `saveCondensed`. Oversized entries also flow through + here as stub writes (no LLM). Every LLM dispatch goes through the + shared `limiter`. Progress is a fixed total — `smallCount + oversizedCount`. +- `process-big-files.ts` — Phase 2b plus the legacy queue. Exports two + functions: + - `analyseBigFiles({knowledgeId, manifest, source, metaPaths, limiter, +llmCallContext?, progressContext?})` — manifest-driven chunk-task + queue. Skips files already complete (manifest + condensed on disk). + For each remaining big file: read content, split into chunks + via `splitFileIntoChunks`, register a per-file `pendingChunks` + counter. Every chunk becomes an independent task scheduled through + the shared limiter: cache-check via `loadChunkIfPresent`, otherwise + `analyzeChunk` + `saveChunk`. 
When a file's last chunk lands, that + file's condense is **immediately** scheduled through the same + limiter — condenses across multiple files run in parallel with + chunks of slower files. Two fixed-total progress sub-phases: + `"big_files_chunks"` (sum of `estimatedChunks`) and + `"big_files_condense"` (`bigCount`). + - `processBigFilesQueue({knowledgeId, source, metaPaths, llmCallContext?, +progressContext?})` — legacy serial driver kept for the pull-path + (`pipeline/pull.ts`) and any caller that has not migrated to + `analyseBigFiles(manifest, …)`. Reads `bigFiles.json`, dispatches + `processBigFile` once per file in a `for` loop. - `store-flat-analysis.ts` — Phase 7. `storeFlatAnalysis({scope, payload, branch, metaPaths})` ensures `flat-folder` Neo4j indexes, upserts `:Repo` (from `repo-summary.json` @@ -45,45 +60,69 @@ skipDecider?, archiveSink?, llmCallContext?, progressContext?})` walks `:Folder` so the `CONTAINS` edge always lands. `languageFromPath` fills `language` when the analysis left it blank. +## Execution order + +``` +scanAndClassify + ↓ (manifest in-memory + on disk) +┌── analyseSmallFiles ──┐ +│ │ (Promise.all, share one limiter) +└── analyseBigFiles ────┘ + ↓ +backfillMissingFields → backfillBigFiles → folderSummary → repoSummary → storeFlatAnalysis +``` + ## Public interfaces -- `classifyAndAnalyseSmall(input): Promise` — - `{ smallFilesAnalysed, bigFilesQueued, oversizedStubs, failed }`. - `input.progressContext?` opens a growing-total reporter - (`source.scan` size is not known up front); `incrementSeen()` fires per - scan yield and `increment()` fires per persisted entry. -- `processBigFilesQueue(input): Promise` — - `{ processed, cached, failed, skippedOversized }`. `input.progressContext?` - opens a fixed-total reporter sized by `bigFiles.json` and forwards - itself into the per-file `processBigFile` call. +- `scanAndClassify(input): Promise` — + `{ manifest }`. 
The manifest contains every eligible file plus a + `summary` with `totalFiles`, `smallCount`, `bigCount`, `oversizedCount`, + `totalTokens`, `estimatedBigChunks`. +- `analyseSmallFiles(input): Promise` — + `{ smallFilesAnalysed, oversizedStubs, failed, tokenUsage }`. + Progress: fixed-total reporter sized by `smallCount + oversizedCount`. +- `analyseBigFiles(input): Promise` — + `{ processed, cached, failed, skippedOversized, tokenUsage }`. + Progress: two fixed-total reporters — one for chunks across all + big files, one for per-file condenses. +- `processBigFilesQueue(input): Promise` — same + result shape; legacy driver used by the pull path. - `storeFlatAnalysis(input): Promise` — `{ nodesWritten, foldersWritten, filesWritten }`. -Each phase returns its own counter shape; the strategy aggregates them -into `FlatFolderResult`. - ## Data ownership -- Phase 1 writes condensed JSON (small files + oversized stubs) and - `bigFiles.json`. -- Phase 2 writes chunk artifacts, the chunk manifest, and condensed JSON - for big files via `processBigFile`. -- Phase 7 owns no disk artifacts. It reads the on-disk state produced by +- Phase 1 writes `scan-manifest.json` (canonical) and `bigFiles.json` + (legacy view for backfill + pull). It does not write per-file + analyses. +- Phase 2a writes condensed JSON for small files + oversized stubs. +- Phase 2b writes per-chunk JSON (`chunks//chunk-N.json`), + per-file chunk manifests (`.manifest.json`), and condensed JSON + for big files. +- Phase 7 owns no disk artifacts. It reads on-disk state produced by Phases 1–6 and writes Neo4j nodes (`:Repo`, `:Folder`, `:File`) plus the `CONTAINS` edge. ## Invariants - Disk is the inter-phase contract; nothing crosses a phase boundary in - memory. + memory (except the in-memory manifest object that scan returns directly + to the orchestrator, which is a convenience — the canonical copy on + disk is what later resume/backfill runs read). 
- `throwIfCancelled(knowledgeId)` runs at every scan boundary, every - big-file boundary, and before each Neo4j upsert in Phase 7. -- Per-file LLM or I/O failures are logged and counted; phases do not - abort on a single bad file. Only `CancellationError` propagates. + per-chunk and per-file dispatch boundary, and before each Neo4j + upsert in Phase 7. +- Per-file failures other than the propagating errors named below are logged and counted in `failed`; a failed chunk is logged without being counted. Phases + do not abort on a single bad file. Only `CancellationError`, + `LlmConfigError`, and `LlmError` propagate. +- The shared LLM limiter is the only place LLM concurrency is bounded + during the small/big phases. `Config.BigFileConcurrency` is no longer + consulted from the chunk-queue path (it is still consulted by the + legacy `processBigFile` used by the pull-path driver). +- Phase 1 respects `Config.ContextWindowLimit` and + `Config.MaxTokensPerChunk`; do not hardcode either. - Phase 7 always emits a `:Repo` node, even when `repo-summary.json` is absent (logged as a `phase7` warning). -- Phase 1 respects `Config.ContextWindowLimit` and - `Config.ConcurrentWorkers`; do not hardcode either. ## External dependencies @@ -92,8 +131,8 @@ into `FlatFolderResult`. `upsertRepoNode`, `upsertFolderNode`, `upsertFileNode`, `NodeScope`), `pipeline/scan.ts`, `pipeline/concurrency.ts`, `pipeline/cancellation.ts`, and the sibling `flat-folder/{analyse-file, big-file, folder-summary, -folder-path}` modules plus `adapters/llm-file-analyzer.ts` -(`languageFromPath`). +folder-path, scan-manifest}` modules plus +`adapters/llm-file-analyzer.ts` (`languageFromPath`).
## Tier diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/analyse-small.ts b/packages/ingest-github/src/strategies/flat-folder/phases/analyse-small.ts new file mode 100644 index 0000000..5176f7f --- /dev/null +++ b/packages/ingest-github/src/strategies/flat-folder/phases/analyse-small.ts @@ -0,0 +1,133 @@ +import { logger } from "@bb/logger"; +import type { AskLlmOptions } from "@bb/llm"; +import { LlmConfigError, LlmError } from "@bb/errors"; +import type { ArchiveSink, FileAnalyzer, ScannedFile, SourceReader } from "#src/types/pipeline.ts"; +import type { MetaPaths } from "#src/types/meta-paths.ts"; +import type { ProgressContext } from "#src/progress/types.ts"; +import type { ConcurrencyLimiter } from "#src/pipeline/concurrency.ts"; +import { throwIfCancelled, CancellationError } from "#src/pipeline/cancellation.ts"; +import { analyseScannedFile, buildOversizedStub } from "#src/strategies/flat-folder/analyse-file.ts"; +import { saveCondensed } from "#src/strategies/flat-folder/big-file/storage.ts"; +import type { ScanManifest } from "#src/strategies/flat-folder/scan-manifest.ts"; + +export interface AnalyseSmallInput { + knowledgeId: string; + manifest: ScanManifest; + source: SourceReader; + metaPaths: MetaPaths; + analyzer: FileAnalyzer; + limiter: ConcurrencyLimiter; + archiveSink?: ArchiveSink; + llmCallContext?: AskLlmOptions; + progressContext?: ProgressContext; +} + +export interface AnalyseSmallResult { + smallFilesAnalysed: number; + oversizedStubs: number; + failed: number; + tokenUsage: { inputTokens: number; outputTokens: number; costUsd: number }; +} + +/** + * Consumes the `scan-manifest.json` produced by `scanAndClassify` and + * analyses every `kind: "small"` entry through the shared LLM limiter. + * + * Oversized stubs are also written here (they don't go through the LLM but + * still need a placeholder analysis row on disk so downstream phases see a + * complete file set). 
+ */ +export async function analyseSmallFiles(input: AnalyseSmallInput): Promise { + const smallEntries = input.manifest.entries.filter((e) => e.kind === "small"); + const oversizedEntries = input.manifest.entries.filter((e) => e.kind === "oversized"); + + let smallFilesAnalysed = 0; + let oversizedStubs = 0; + let failed = 0; + let totalInputTokens = 0; + let totalOutputTokens = 0; + let totalCostUsd = 0; + + const reporter = input.progressContext?.reporter({ + phase: "file_analysis", + subPhase: "analyse_small", + total: { kind: "fixed", total: smallEntries.length + oversizedEntries.length }, + }); + await reporter?.start(); + + try { + for (const entry of oversizedEntries) { + throwIfCancelled(input.knowledgeId); + try { + await saveCondensed(input.metaPaths, buildOversizedStub(entry.relativePath, entry.sizeBytes)); + oversizedStubs += 1; + } catch (cause: unknown) { + failed += 1; + logger.warn(`analyse-small: oversized stub write failed for ${entry.relativePath}: ${describe(cause)}`); + } + reporter?.increment(1, { fileName: entry.relativePath }); + } + + const pending: Promise[] = []; + for (const entry of smallEntries) { + pending.push( + input.limiter(async () => { + throwIfCancelled(input.knowledgeId); + try { + const content = await input.source.readFile(entry.relativePath); + const scanned: ScannedFile = { + kind: "file", + relativePath: entry.relativePath, + absolutePath: entry.absolutePath, + sizeBytes: entry.sizeBytes, + content, + }; + const condensed = await analyseScannedFile(input.analyzer, scanned, input.llmCallContext); + await saveCondensed(input.metaPaths, condensed); + if (input.archiveSink !== undefined) { + await input.archiveSink.push({ + knowledgeId: input.knowledgeId, + relativePath: entry.relativePath, + content, + }); + } + if (condensed.tokenUsage) { + totalInputTokens += condensed.tokenUsage.inputTokens; + totalOutputTokens += condensed.tokenUsage.outputTokens; + totalCostUsd += condensed.tokenUsage.costUsd; + } + smallFilesAnalysed 
+= 1; + reporter?.increment(1, { fileName: entry.relativePath }); + } catch (cause: unknown) { + if (cause instanceof CancellationError) { + throw cause; + } + if (cause instanceof LlmConfigError || cause instanceof LlmError) { + throw cause; + } + failed += 1; + logger.warn(`analyse-small: analyse failed for ${entry.relativePath}: ${describe(cause)}`); + reporter?.increment(1, { fileName: entry.relativePath }); + } + }), + ); + } + await Promise.all(pending); + } finally { + reporter?.stop(); + } + + logger.info( + `analyse-small done: smallFilesAnalysed=${smallFilesAnalysed} oversizedStubs=${oversizedStubs} failed=${failed}`, + ); + return { + smallFilesAnalysed, + oversizedStubs, + failed, + tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens, costUsd: totalCostUsd }, + }; +} + +function describe(cause: unknown): string { + return cause instanceof Error ? cause.message : String(cause); +} diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/classify-and-analyse-small.ts b/packages/ingest-github/src/strategies/flat-folder/phases/classify-and-analyse-small.ts deleted file mode 100644 index a9ad59a..0000000 --- a/packages/ingest-github/src/strategies/flat-folder/phases/classify-and-analyse-small.ts +++ /dev/null @@ -1,161 +0,0 @@ -import path from "node:path"; -import { tokenLen, type AskLlmOptions } from "@bb/llm"; -import { LlmConfigError, LlmError } from "@bb/errors"; -import { logger } from "@bb/logger"; -import { Config } from "@bb/types"; -import { getConfigValue } from "@bb/config"; -import type { ArchiveSink, FileAnalyzer, SkipDecider, SourceReader } from "#src/types/pipeline.ts"; -import type { MetaPaths } from "#src/types/meta-paths.ts"; -import type { BigFileEntry } from "#src/types/big-file.ts"; -import type { ProgressContext } from "#src/progress/types.ts"; -import { withConcurrency } from "#src/pipeline/concurrency.ts"; -import { throwIfCancelled, CancellationError } from "#src/pipeline/cancellation.ts"; 
-import { makeSkipDecider } from "#src/pipeline/skip-decisions/index.ts"; -import { analyseScannedFile, buildOversizedStub } from "#src/strategies/flat-folder/analyse-file.ts"; -import { saveCondensed } from "#src/strategies/flat-folder/big-file/storage.ts"; -import { writeBigFiles } from "#src/strategies/flat-folder/big-file/detector.ts"; - -export interface ClassifyPhaseInput { - knowledgeId: string; - source: SourceReader; - metaPaths: MetaPaths; - analyzer: FileAnalyzer; - skipDecider?: SkipDecider; - archiveSink?: ArchiveSink; - llmCallContext?: AskLlmOptions; - progressContext?: ProgressContext; -} - -export interface ClassifyPhaseResult { - smallFilesAnalysed: number; - bigFilesQueued: number; - oversizedStubs: number; - failed: number; - tokenUsage: { inputTokens: number; outputTokens: number; costUsd: number }; -} - -export async function classifyAndAnalyseSmall(input: ClassifyPhaseInput): Promise { - const contextWindowLimit = getConfigValue(Config.ContextWindowLimit); - const concurrentWorkers = getConfigValue(Config.ConcurrentWorkers); - const limit = withConcurrency(concurrentWorkers); - const bigFileBuffer: BigFileEntry[] = []; - let smallFilesAnalysed = 0; - let oversizedStubs = 0; - let failed = 0; - let totalInputTokens = 0; - let totalOutputTokens = 0; - let totalCostUsd = 0; - - const repositoryHint = - input.source.localRepoDir.length > 0 ? path.basename(input.source.localRepoDir) : input.knowledgeId; - const skipDecider = input.skipDecider ?? 
makeSkipDecider({ repositoryName: repositoryHint }); - - const pending: Promise[] = []; - - const reporter = input.progressContext?.reporter({ - phase: "file_analysis", - total: { kind: "growing" }, - }); - await reporter?.start(); - - try { - const scanDeps: Parameters[0] = { skipDecider }; - if (input.llmCallContext !== undefined) { - scanDeps.llmCallContext = input.llmCallContext; - } - for await (const entry of input.source.scan(scanDeps)) { - throwIfCancelled(input.knowledgeId); - reporter?.incrementSeen(); - - if (entry.kind === "oversized") { - bigFileBuffer.push({ - relativePath: entry.relativePath, - sizeBytes: entry.sizeBytes, - tokenCount: 0, - reason: "too-large", - }); - try { - await saveCondensed(input.metaPaths, buildOversizedStub(entry.relativePath, entry.sizeBytes)); - oversizedStubs += 1; - reporter?.increment(1, { fileName: entry.relativePath }); - } catch (cause: unknown) { - failed += 1; - logger.warn(`phase1: oversized stub write failed for ${entry.relativePath}: ${describe(cause)}`); - } - continue; - } - - const tokenCount = tokenLen(entry.content); - if (tokenCount > contextWindowLimit) { - bigFileBuffer.push({ - relativePath: entry.relativePath, - sizeBytes: entry.sizeBytes, - tokenCount, - reason: "context-window-exceeded", - }); - // Big files are accounted for here; phase 2 has its own reporter. 
- reporter?.increment(1, { fileName: entry.relativePath }); - continue; - } - - const fileContent = entry.content; - const filePath = entry.relativePath; - pending.push( - limit(async () => { - try { - throwIfCancelled(input.knowledgeId); - const condensed = await analyseScannedFile(input.analyzer, entry, input.llmCallContext); - await saveCondensed(input.metaPaths, condensed); - if (input.archiveSink !== undefined) { - await input.archiveSink.push({ - knowledgeId: input.knowledgeId, - relativePath: filePath, - content: fileContent, - }); - } - if (condensed.tokenUsage) { - totalInputTokens += condensed.tokenUsage.inputTokens; - totalOutputTokens += condensed.tokenUsage.outputTokens; - totalCostUsd += condensed.tokenUsage.costUsd; - } - smallFilesAnalysed += 1; - reporter?.increment(1, { fileName: filePath }); - } catch (cause: unknown) { - if (cause instanceof CancellationError) { - throw cause; - } - if (cause instanceof LlmConfigError || cause instanceof LlmError) { - // LLM unreachable — bail the whole job, don't keep iterating - // over the rest of the files producing the same failure. - throw cause; - } - failed += 1; - logger.warn(`phase1: analyse failed for ${entry.relativePath}: ${describe(cause)}`); - reporter?.increment(1, { fileName: filePath }); - } - }), - ); - } - - await Promise.all(pending); - - await writeBigFiles(input.metaPaths, bigFileBuffer); - } finally { - reporter?.stop(); - } - - logger.info( - `phase1 done: smallFilesAnalysed=${smallFilesAnalysed} bigFilesQueued=${bigFileBuffer.filter((e) => e.reason === "context-window-exceeded").length} oversizedStubs=${oversizedStubs} failed=${failed}`, - ); - return { - smallFilesAnalysed, - bigFilesQueued: bigFileBuffer.filter((e) => e.reason === "context-window-exceeded").length, - oversizedStubs, - failed, - tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens, costUsd: totalCostUsd }, - }; -} - -function describe(cause: unknown): string { - return cause instanceof Error ? 
cause.message : String(cause); -} diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts b/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts index 1197753..70d5102 100644 --- a/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts +++ b/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts @@ -1,13 +1,24 @@ +import { createHash } from "node:crypto"; import { logger } from "@bb/logger"; +import { Config } from "@bb/types"; +import { getConfigValue } from "@bb/config"; import type { AskLlmOptions } from "@bb/llm"; import { LlmConfigError, LlmError } from "@bb/errors"; import type { MetaPaths } from "#src/types/meta-paths.ts"; import type { SourceReader } from "#src/types/pipeline.ts"; import type { ProgressContext } from "#src/progress/types.ts"; +import type { ConcurrencyLimiter } from "#src/pipeline/concurrency.ts"; +import type { ChunkAnalysisResult, FileChunk, HugeFileManifest } from "#src/types/big-file.ts"; +import type { CondensedFileAnalysis } from "#src/types/condensed-file-analysis.ts"; import { throwIfCancelled, CancellationError } from "#src/pipeline/cancellation.ts"; import { readBigFiles } from "#src/strategies/flat-folder/big-file/detector.ts"; import { inspect } from "#src/strategies/flat-folder/big-file/cache.ts"; +import { splitFileIntoChunks } from "#src/strategies/flat-folder/big-file/chunker.ts"; +import { analyzeChunk } from "#src/strategies/flat-folder/big-file/chunk-analyzer.ts"; +import { condenseChunks } from "#src/strategies/flat-folder/big-file/condenser.ts"; +import { loadChunkIfPresent, saveChunk, saveCondensed, saveManifest } from "#src/strategies/flat-folder/big-file/storage.ts"; import { processBigFile } from "#src/strategies/flat-folder/big-file/index.ts"; +import type { ScanManifest, ScanManifestEntry } from "#src/strategies/flat-folder/scan-manifest.ts"; export interface ProcessBigFilesInput { knowledgeId: string; @@ 
-25,6 +36,12 @@ export interface ProcessBigFilesResult { tokenUsage: { inputTokens: number; outputTokens: number; costUsd: number }; } +/** + * Legacy big-file driver. Reads the deprecated `bigFiles.json`, processes + * each entry serially via `processBigFile` (which internally does + * chunk-then-condense). Kept for the pull-path (`pipeline/pull.ts`) and any + * caller that has not migrated to `analyseBigFiles(manifest, …)` yet. + */ export async function processBigFilesQueue(input: ProcessBigFilesInput): Promise { const entries = await readBigFiles(input.metaPaths); let processed = 0; @@ -61,13 +78,13 @@ export async function processBigFilesQueue(input: ProcessBigFilesInput): Promise content = await input.source.readFile(entry.relativePath); } catch (cause: unknown) { failed += 1; - logger.warn(`phase2: read failed for ${entry.relativePath}: ${describe(cause)}`); + logger.warn(`big-files-queue: read failed for ${entry.relativePath}: ${describe(cause)}`); reporter?.increment(1, { fileName: entry.relativePath }); continue; } if (content.length === 0) { failed += 1; - logger.warn(`phase2: empty content for ${entry.relativePath}; skipping`); + logger.warn(`big-files-queue: empty content for ${entry.relativePath}; skipping`); reporter?.increment(1, { fileName: entry.relativePath }); continue; } @@ -95,12 +112,12 @@ export async function processBigFilesQueue(input: ProcessBigFilesInput): Promise throw cause; } failed += 1; - logger.warn(`phase2: processBigFile failed for ${entry.relativePath}: ${describe(cause)}`); + logger.warn(`big-files-queue: processBigFile failed for ${entry.relativePath}: ${describe(cause)}`); } reporter?.increment(1, { fileName: entry.relativePath }); } logger.info( - `phase2 done: processed=${processed} cached=${cached} failed=${failed} skippedOversized=${skippedOversized}`, + `big-files-queue done: processed=${processed} cached=${cached} failed=${failed} skippedOversized=${skippedOversized}`, ); return { processed, @@ -114,6 +131,238 @@ export 
async function processBigFilesQueue(input: ProcessBigFilesInput): Promise } } +// --------------------------------------------------------------------------- +// Chunk-queue model (manifest-driven) +// --------------------------------------------------------------------------- + +export interface AnalyseBigFilesInput { + knowledgeId: string; + manifest: ScanManifest; + source: SourceReader; + metaPaths: MetaPaths; + limiter: ConcurrencyLimiter; + llmCallContext?: AskLlmOptions; + progressContext?: ProgressContext; +} + +interface BigFileState { + entry: ScanManifestEntry; + content: string; + chunks: FileChunk[]; + results: (ChunkAnalysisResult | undefined)[]; + pendingChunks: number; + fatal: boolean; +} + +/** + * Manifest-driven big-file phase. Every chunk of every big file is an + * independent task scheduled through the shared LLM limiter. As soon as the + * last chunk of a given file lands, that file's condense is scheduled — + * multiple condenses run in parallel with the still-pending chunks of slower + * files. All LLM calls (chunk + condense) check out from the same limiter. + * + * Files already fully processed (manifest + condensed on disk) are skipped. + */ +export async function analyseBigFiles(input: AnalyseBigFilesInput): Promise { + const maxTokensPerChunk = getConfigValue(Config.MaxTokensPerChunk); + const bigEntries = input.manifest.entries.filter((e) => e.kind === "big"); + + let cached = 0; + let skippedOversized = 0; + let failed = 0; + let processed = 0; + let totalInputTokens = 0; + let totalOutputTokens = 0; + let totalCostUsd = 0; + + // Per-file preparation: read content, chunk, record state. Sequential and + // cheap — no LLM calls here. 
+ const states: BigFileState[] = []; + for (const entry of bigEntries) { + throwIfCancelled(input.knowledgeId); + const status = await inspect(input.metaPaths, entry.relativePath); + if (status === "complete") { + cached += 1; + continue; + } + let content: string; + try { + content = await input.source.readFile(entry.relativePath); + } catch (cause: unknown) { + failed += 1; + logger.warn(`analyse-big: read failed for ${entry.relativePath}: ${describe(cause)}`); + continue; + } + if (content.length === 0) { + failed += 1; + logger.warn(`analyse-big: empty content for ${entry.relativePath}; skipping`); + continue; + } + const chunks = splitFileIntoChunks(entry.relativePath, content, maxTokensPerChunk); + states.push({ + entry, + content, + chunks, + results: new Array(chunks.length), + pendingChunks: chunks.length, + fatal: false, + }); + logger.info(`analyse-big: ${entry.relativePath} split into ${chunks.length} chunks`); + } + + const totalChunks = states.reduce((acc, s) => acc + s.chunks.length, 0); + const chunkReporter = input.progressContext?.reporter({ + phase: "file_analysis", + subPhase: "big_files_chunks", + total: { kind: "fixed", total: totalChunks }, + }); + await chunkReporter?.start(); + const condenseReporter = input.progressContext?.reporter({ + phase: "file_analysis", + subPhase: "big_files_condense", + total: { kind: "fixed", total: states.length }, + }); + await condenseReporter?.start(); + + // For oversized entries the legacy phase counted them; we accept the manifest + // already accounted for them via the small phase (which writes the stub). + // Surfaced here for parity with the legacy result shape. 
+ skippedOversized = input.manifest.entries.filter((e) => e.kind === "oversized").length; + + const condensePromises: Promise[] = []; + + function maybeScheduleCondense(state: BigFileState): void { + if (state.pendingChunks > 0 || state.fatal) { + return; + } + const definedResults = state.results.filter((r): r is ChunkAnalysisResult => r !== undefined); + condensePromises.push( + input.limiter(async () => { + throwIfCancelled(input.knowledgeId); + try { + const merged = await condenseChunks(state.entry.relativePath, definedResults, input.llmCallContext); + + const chunkInputTokens = definedResults.reduce((acc, r) => acc + (r.tokenUsage?.inputTokens ?? 0), 0); + const chunkOutputTokens = definedResults.reduce((acc, r) => acc + (r.tokenUsage?.outputTokens ?? 0), 0); + const chunkCostUsd = definedResults.reduce((acc, r) => acc + (r.tokenUsage?.costUsd ?? 0), 0); + const totalTokenCount = state.chunks.reduce((acc, c) => acc + c.tokenCount, 0); + const totalIn = chunkInputTokens + (merged.tokenUsage?.inputTokens ?? 0); + const totalOut = chunkOutputTokens + (merged.tokenUsage?.outputTokens ?? 0); + const totalCost = chunkCostUsd + (merged.tokenUsage?.costUsd ?? 
0); + + const manifest: HugeFileManifest = { + relativePath: state.entry.relativePath, + totalChunks: state.chunks.length, + totalTokenCount, + chunkPaths: state.chunks.map((_, i) => `chunks/${encodeFolder(state.entry.relativePath)}/chunk-${i}.json`), + generatedAt: new Date().toISOString(), + }; + await saveManifest(input.metaPaths, manifest); + + const condensed: CondensedFileAnalysis = { + relativePath: state.entry.relativePath, + language: merged.language, + sha256: sha256(state.content), + sizeBytes: state.entry.sizeBytes, + tokenCount: totalTokenCount, + isBigFile: true, + totalChunks: state.chunks.length, + totalTokenCount, + analysedAt: new Date().toISOString(), + analysis: merged.analysis, + tokenUsage: { inputTokens: totalIn, outputTokens: totalOut, costUsd: totalCost }, + }; + await saveCondensed(input.metaPaths, condensed); + + totalInputTokens += totalIn; + totalOutputTokens += totalOut; + totalCostUsd += totalCost; + processed += 1; + } catch (cause: unknown) { + if (cause instanceof CancellationError) { + throw cause; + } + if (cause instanceof LlmConfigError || cause instanceof LlmError) { + throw cause; + } + failed += 1; + logger.warn(`analyse-big: condense failed for ${state.entry.relativePath}: ${describe(cause)}`); + } finally { + condenseReporter?.increment(1, { fileName: state.entry.relativePath }); + } + }), + ); + } + + const chunkPromises: Promise[] = []; + for (const state of states) { + for (let i = 0; i < state.chunks.length; i += 1) { + const idx = i; + const chunk = state.chunks[idx]; + if (chunk === undefined) { + continue; + } + chunkPromises.push( + input.limiter(async () => { + throwIfCancelled(input.knowledgeId); + try { + const cachedChunk = await loadChunkIfPresent(input.metaPaths, state.entry.relativePath, idx); + if (cachedChunk !== null) { + state.results[idx] = cachedChunk; + } else { + const analyzed = await analyzeChunk(chunk, input.llmCallContext); + await saveChunk(input.metaPaths, analyzed); + state.results[idx] = 
analyzed; + } + } catch (cause: unknown) { + if (cause instanceof CancellationError) { + throw cause; + } + if (cause instanceof LlmConfigError || cause instanceof LlmError) { + state.fatal = true; + throw cause; + } + logger.warn( + `analyse-big: chunk ${idx + 1}/${state.chunks.length} failed for ${state.entry.relativePath}: ${describe(cause)}`, + ); + } finally { + state.pendingChunks -= 1; + chunkReporter?.increment(1, { fileName: `${state.entry.relativePath}#chunk-${String(idx)}` }); + maybeScheduleCondense(state); + } + }), + ); + } + } + + try { + await Promise.all(chunkPromises); + await Promise.all(condensePromises); + } finally { + chunkReporter?.stop(); + condenseReporter?.stop(); + } + + logger.info( + `analyse-big done: processed=${processed} cached=${cached} failed=${failed} skippedOversized=${skippedOversized}`, + ); + return { + processed, + cached, + failed, + skippedOversized, + tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens, costUsd: totalCostUsd }, + }; +} + +function sha256(content: string): string { + return createHash("sha256").update(content).digest("hex"); +} + +function encodeFolder(relativePath: string): string { + return relativePath.replace(/\//gu, "__SL__").replace(/\\/gu, "__BS__"); +} + function describe(cause: unknown): string { return cause instanceof Error ? 
cause.message : String(cause); } diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/scan-and-classify.ts b/packages/ingest-github/src/strategies/flat-folder/phases/scan-and-classify.ts new file mode 100644 index 0000000..786c9b0 --- /dev/null +++ b/packages/ingest-github/src/strategies/flat-folder/phases/scan-and-classify.ts @@ -0,0 +1,131 @@ +import path from "node:path"; +import { Config } from "@bb/types"; +import { getConfigValue } from "@bb/config"; +import { logger } from "@bb/logger"; +import type { AskLlmOptions } from "@bb/llm"; +import type { MetaPaths } from "#src/types/meta-paths.ts"; +import type { BigFileEntry } from "#src/types/big-file.ts"; +import type { SkipDecider, SourceReader } from "#src/types/pipeline.ts"; +import type { ProgressContext } from "#src/progress/types.ts"; +import { throwIfCancelled } from "#src/pipeline/cancellation.ts"; +import { makeSkipDecider } from "#src/pipeline/skip-decisions/index.ts"; +import { classifyByTokens, writeBigFiles } from "#src/strategies/flat-folder/big-file/detector.ts"; +import { + emptyManifest, + writeScanManifest, + type ScanManifest, + type ScanManifestEntry, +} from "#src/strategies/flat-folder/scan-manifest.ts"; + +export interface ScanAndClassifyInput { + knowledgeId: string; + source: SourceReader; + metaPaths: MetaPaths; + skipDecider?: SkipDecider; + llmCallContext?: AskLlmOptions; + progressContext?: ProgressContext; +} + +export interface ScanAndClassifyResult { + manifest: ScanManifest; +} + +/** + * Walks the repo once, classifies every eligible file as small / big / + * oversized by token count, and writes `scan-manifest.json`. The downstream + * small-file and big-file phases consume the manifest instead of re-walking. + * + * Also writes the legacy `bigFiles.json` so the pull-path and backfill phases + * (which still read it directly) keep working without migration. 
+ */ +export async function scanAndClassify(input: ScanAndClassifyInput): Promise { + const contextWindowLimit = getConfigValue(Config.ContextWindowLimit); + const maxTokensPerChunk = getConfigValue(Config.MaxTokensPerChunk); + const manifest = emptyManifest(); + const bigFileEntries: BigFileEntry[] = []; + + const repositoryHint = + input.source.localRepoDir.length > 0 ? path.basename(input.source.localRepoDir) : input.knowledgeId; + const skipDecider = input.skipDecider ?? makeSkipDecider({ repositoryName: repositoryHint }); + + const reporter = input.progressContext?.reporter({ + phase: "scan", + total: { kind: "growing" }, + }); + await reporter?.start(); + + try { + const scanDeps: Parameters[0] = { skipDecider }; + if (input.llmCallContext !== undefined) { + scanDeps.llmCallContext = input.llmCallContext; + } + + for await (const entry of input.source.scan(scanDeps)) { + throwIfCancelled(input.knowledgeId); + reporter?.incrementSeen(); + + if (entry.kind === "oversized") { + const manifestEntry: ScanManifestEntry = { + relativePath: entry.relativePath, + absolutePath: entry.absolutePath, + sizeBytes: entry.sizeBytes, + tokenCount: 0, + kind: "oversized", + }; + manifest.entries.push(manifestEntry); + manifest.summary.oversizedCount += 1; + manifest.summary.totalFiles += 1; + bigFileEntries.push({ + relativePath: entry.relativePath, + sizeBytes: entry.sizeBytes, + tokenCount: 0, + reason: "too-large", + }); + reporter?.increment(1, { fileName: entry.relativePath }); + continue; + } + + const { tokenCount, isBigFile } = classifyByTokens(entry.content, contextWindowLimit); + manifest.summary.totalFiles += 1; + manifest.summary.totalTokens += tokenCount; + if (isBigFile) { + const estimatedChunks = Math.max(1, Math.ceil(tokenCount / maxTokensPerChunk)); + manifest.entries.push({ + relativePath: entry.relativePath, + absolutePath: entry.absolutePath, + sizeBytes: entry.sizeBytes, + tokenCount, + kind: "big", + estimatedChunks, + }); + manifest.summary.bigCount += 
1; + manifest.summary.estimatedBigChunks += estimatedChunks; + bigFileEntries.push({ + relativePath: entry.relativePath, + sizeBytes: entry.sizeBytes, + tokenCount, + reason: "context-window-exceeded", + }); + } else { + manifest.entries.push({ + relativePath: entry.relativePath, + absolutePath: entry.absolutePath, + sizeBytes: entry.sizeBytes, + tokenCount, + kind: "small", + }); + manifest.summary.smallCount += 1; + } + reporter?.increment(1, { fileName: entry.relativePath }); + } + } finally { + reporter?.stop(); + } + + await writeScanManifest(input.metaPaths, manifest); + await writeBigFiles(input.metaPaths, bigFileEntries); + logger.info( + `scan-and-classify done: total=${manifest.summary.totalFiles} small=${manifest.summary.smallCount} big=${manifest.summary.bigCount} oversized=${manifest.summary.oversizedCount} totalTokens=${manifest.summary.totalTokens} estimatedBigChunks=${manifest.summary.estimatedBigChunks}`, + ); + return { manifest }; +} diff --git a/packages/ingest-github/src/strategies/flat-folder/scan-manifest.ts b/packages/ingest-github/src/strategies/flat-folder/scan-manifest.ts new file mode 100644 index 0000000..5caee3b --- /dev/null +++ b/packages/ingest-github/src/strategies/flat-folder/scan-manifest.ts @@ -0,0 +1,61 @@ +import { readFile, writeFile } from "node:fs/promises"; +import type { MetaPaths } from "#src/types/meta-paths.ts"; + +export type ScanEntryKind = "small" | "big" | "oversized"; + +export interface ScanManifestEntry { + relativePath: string; + absolutePath: string; + sizeBytes: number; + tokenCount: number; + kind: ScanEntryKind; + estimatedChunks?: number; +} + +export interface ScanManifestSummary { + totalFiles: number; + smallCount: number; + bigCount: number; + oversizedCount: number; + totalTokens: number; + estimatedBigChunks: number; +} + +export interface ScanManifest { + generatedAt: string; + summary: ScanManifestSummary; + entries: ScanManifestEntry[]; +} + +export function emptyManifest(): ScanManifest { + 
return { + generatedAt: new Date().toISOString(), + summary: { totalFiles: 0, smallCount: 0, bigCount: 0, oversizedCount: 0, totalTokens: 0, estimatedBigChunks: 0 }, + entries: [], + }; +} + +export async function writeScanManifest(metaPaths: MetaPaths, manifest: ScanManifest): Promise { + await writeFile(metaPaths.scanManifestJson, JSON.stringify(manifest, null, 2), "utf8"); +} + +export async function readScanManifest(metaPaths: MetaPaths): Promise { + try { + const raw = await readFile(metaPaths.scanManifestJson, "utf8"); + const parsed: unknown = JSON.parse(raw); + if (!isManifest(parsed)) { + return null; + } + return parsed; + } catch { + return null; + } +} + +function isManifest(value: unknown): value is ScanManifest { + if (typeof value !== "object" || value === null) { + return false; + } + const rec = value as Record; + return Array.isArray(rec["entries"]) && typeof rec["summary"] === "object" && typeof rec["generatedAt"] === "string"; +} diff --git a/packages/ingest-github/src/types/meta-paths.ts b/packages/ingest-github/src/types/meta-paths.ts index 8898df3..5da4f89 100644 --- a/packages/ingest-github/src/types/meta-paths.ts +++ b/packages/ingest-github/src/types/meta-paths.ts @@ -5,5 +5,6 @@ export interface MetaPaths { bigFileAnalysisDir: string; bigFileChunksDir: string; bigFilesJson: string; + scanManifestJson: string; repoSummaryJson: string; } diff --git a/packages/types/src/config.ts b/packages/types/src/config.ts index 882381a..950cb81 100644 --- a/packages/types/src/config.ts +++ b/packages/types/src/config.ts @@ -23,6 +23,7 @@ export enum Config { BigFileConcurrency = "big.file.concurrency", AbsoluteFileSizeCap = "absolute.file.size.cap", ConcurrentWorkers = "concurrent.workers", + LlmConcurrency = "llm.concurrency", CondenseContextLimit = "condense.context.limit", CondensePromptOverhead = "condense.prompt.overhead", SmallFileDedupThreshold = "small.file.dedup.threshold", From b6311ba7d58e37ec5842a3a988f9214c3484f61b Mon Sep 17 00:00:00 2001 
From: Dead-Bytes <143434285+Dead-Bytes@users.noreply.github.com> Date: Fri, 22 May 2026 14:26:40 +0530 Subject: [PATCH 26/34] refactor: implement FileAnalysisCache for improved performance in file analysis phases --- packages/ingest-github/README.md | 8 ++ packages/ingest-github/src/pipeline/pull.ts | 8 +- .../strategies/flat-folder/backfill/fields.ts | 9 +- .../flat-folder/file-analysis-cache.ts | 91 +++++++++++++++++++ .../flat-folder/folder-summary-selective.ts | 4 +- .../strategies/flat-folder/folder-summary.ts | 9 +- .../src/strategies/flat-folder/index.ts | 16 +++- .../strategies/flat-folder/phases/README.md | 11 +++ .../flat-folder/phases/store-flat-analysis.ts | 8 +- 9 files changed, 148 insertions(+), 16 deletions(-) create mode 100644 packages/ingest-github/src/strategies/flat-folder/file-analysis-cache.ts diff --git a/packages/ingest-github/README.md b/packages/ingest-github/README.md index 6073339..c9e1ca2 100644 --- a/packages/ingest-github/README.md +++ b/packages/ingest-github/README.md @@ -139,6 +139,14 @@ worker hardcodes a single `IngestionStrategy` instance (currently - `meta-output/bigFiles.json` — legacy view written alongside the manifest for the pull-path and backfill phases. The main strategy no longer consumes it directly. +- `FileAnalysisCache` (in-memory only, not persisted) — single + `Map` loaded once between the + analyse and backfill phases via parallel `readdir + readFile`. Replaces + three sequential `iterateCondensed` walks (phases 3, 5, 7) with one + parallel preload + three in-memory iterations. The pull workflow loads + its own cache instance; only one strategy run owns a given + `metaPaths` directory at a time. For repos beyond ~50k analysed files + consider a streaming-mode fallback (not implemented today). 
## Invariants diff --git a/packages/ingest-github/src/pipeline/pull.ts b/packages/ingest-github/src/pipeline/pull.ts index 930b7be..6ffab0b 100644 --- a/packages/ingest-github/src/pipeline/pull.ts +++ b/packages/ingest-github/src/pipeline/pull.ts @@ -20,6 +20,7 @@ import { analyseChangedFiles } from "#src/strategies/flat-folder/analyse-changed import { processBigFilesQueue } from "#src/strategies/flat-folder/phases/process-big-files.ts"; import { backfillMissingFields } from "#src/strategies/flat-folder/backfill/fields.ts"; import { backfillBigFiles } from "#src/strategies/flat-folder/backfill/big-files.ts"; +import { FileAnalysisCache } from "#src/strategies/flat-folder/file-analysis-cache.ts"; import { runSelectiveFolderSummary } from "#src/strategies/flat-folder/folder-summary-selective.ts"; import { makeRepoSummaryEnvelope, @@ -192,9 +193,13 @@ export async function runPull( totalOutputTokens += phase2.tokenUsage.outputTokens; totalCostUsd += phase2.tokenUsage.costUsd; + logger.info(`pull: loading file-analysis cache`); + throwIfCancelled(knowledgeId); + const fileAnalysisCache = await FileAnalysisCache.loadAll(metaPaths); + logger.info(`pull: phase backfill fields starting`); throwIfCancelled(knowledgeId); - await backfillMissingFields(metaPaths, llmCallContext, progressContext); + await backfillMissingFields(metaPaths, fileAnalysisCache, llmCallContext, progressContext); logger.info(`pull: phase backfill big-files starting`); throwIfCancelled(knowledgeId); @@ -215,6 +220,7 @@ export async function runPull( const selectiveInput: Parameters[0] = { knowledgeId, metaPaths, + cache: fileAnalysisCache, affectedFolders, }; if (llmCallContext !== undefined) { diff --git a/packages/ingest-github/src/strategies/flat-folder/backfill/fields.ts b/packages/ingest-github/src/strategies/flat-folder/backfill/fields.ts index b6db25e..7836520 100644 --- a/packages/ingest-github/src/strategies/flat-folder/backfill/fields.ts +++ 
b/packages/ingest-github/src/strategies/flat-folder/backfill/fields.ts @@ -4,8 +4,8 @@ import { logger } from "@bb/logger"; import type { FileAnalysis, FileAnalysisSection } from "@bb/mongo"; import type { MetaPaths } from "#src/types/meta-paths.ts"; import type { ProgressContext } from "#src/progress/types.ts"; -import { iterateCondensed } from "#src/strategies/flat-folder/big-file/storage.ts"; import { saveCondensed } from "#src/strategies/flat-folder/big-file/storage.ts"; +import type { FileAnalysisCache } from "#src/strategies/flat-folder/file-analysis-cache.ts"; import { BACKFILL_SYSTEM_PROMPT, buildBackfillUserPrompt } from "#src/strategies/flat-folder/prompts/backfill.ts"; const EXTENDED_ARRAY_KEYS = [ @@ -44,6 +44,7 @@ interface NeededFlags { export async function backfillMissingFields( metaPaths: MetaPaths, + cache: FileAnalysisCache, llmCallContext?: AskLlmOptions, progressContext?: ProgressContext, ): Promise<{ updated: number; failed: number }> { @@ -52,12 +53,11 @@ export async function backfillMissingFields( const reporter = progressContext?.reporter({ phase: "file_analysis", subPhase: "backfill", - total: { kind: "growing" }, + total: { kind: "fixed", total: cache.size }, }); await reporter?.start(); try { - for await (const entry of iterateCondensed(metaPaths)) { - reporter?.incrementSeen(); + for (const entry of cache.values()) { const a = entry.analysis; const needed = computeNeeded(a); if (!hasAnyMissing(needed)) { @@ -74,6 +74,7 @@ export async function backfillMissingFields( } applyBackfill(a, result, needed); await saveCondensed(metaPaths, entry); + cache.set(entry); updated += 1; } catch (cause: unknown) { if (cause instanceof LlmConfigError || cause instanceof LlmError) { diff --git a/packages/ingest-github/src/strategies/flat-folder/file-analysis-cache.ts b/packages/ingest-github/src/strategies/flat-folder/file-analysis-cache.ts new file mode 100644 index 0000000..4405682 --- /dev/null +++ 
b/packages/ingest-github/src/strategies/flat-folder/file-analysis-cache.ts @@ -0,0 +1,91 @@ +import { readdir, readFile } from "node:fs/promises"; +import path from "node:path"; +import { logger } from "@bb/logger"; +import type { CondensedFileAnalysis } from "#src/types/condensed-file-analysis.ts"; +import type { MetaPaths } from "#src/types/meta-paths.ts"; +import { withConcurrency } from "#src/pipeline/concurrency.ts"; + +const LOAD_CONCURRENCY = 20; + +/** + * In-memory snapshot of every `CondensedFileAnalysis` JSON under + * `metaPaths.fileAnalysisDir`. Loaded once per strategy run between the + * analyse phases (2a/2b) and the backfill / folder-summary / graph-store + * phases. The downstream consumers iterate `.values()` (full sweeps) or + * `.get(relativePath)` (random-access); Phase 3 also calls `.set(...)` + * to keep the map in sync with disk writes. + * + * Replaces three sequential `iterateCondensed` walks (one per consumer) + * with one parallel preload + three in-memory iterations. 
+ */ +export class FileAnalysisCache { + private readonly map: Map; + + private constructor(map: Map) { + this.map = map; + } + + static async loadAll(metaPaths: MetaPaths): Promise { + const startedAt = Date.now(); + let filenames: string[]; + try { + filenames = await readdir(metaPaths.fileAnalysisDir); + } catch (cause: unknown) { + logger.warn(`file-analysis-cache: readdir failed for ${metaPaths.fileAnalysisDir}: ${describe(cause)}`); + return new FileAnalysisCache(new Map()); + } + const jsonFiles = filenames.filter((n) => n.endsWith(".json")); + const map = new Map(); + const limit = withConcurrency(LOAD_CONCURRENCY); + const tasks: Promise[] = []; + for (const name of jsonFiles) { + tasks.push( + limit(async () => { + const full = path.join(metaPaths.fileAnalysisDir, name); + try { + const raw = await readFile(full, "utf8"); + const parsed: unknown = JSON.parse(raw); + if (typeof parsed !== "object" || parsed === null) { + return; + } + const entry = parsed as CondensedFileAnalysis; + if (typeof entry.relativePath !== "string" || entry.relativePath.length === 0) { + return; + } + map.set(entry.relativePath, entry); + } catch (cause: unknown) { + logger.warn(`file-analysis-cache: failed to read ${name}: ${describe(cause)}`); + } + }), + ); + } + await Promise.all(tasks); + const elapsedMs = Date.now() - startedAt; + logger.info(`file-analysis-cache: loaded ${map.size} entries in ${elapsedMs} ms`); + return new FileAnalysisCache(map); + } + + get(relativePath: string): CondensedFileAnalysis | undefined { + return this.map.get(relativePath); + } + + set(entry: CondensedFileAnalysis): void { + this.map.set(entry.relativePath, entry); + } + + values(): IterableIterator { + return this.map.values(); + } + + entries(): IterableIterator<[string, CondensedFileAnalysis]> { + return this.map.entries(); + } + + get size(): number { + return this.map.size; + } +} + +function describe(cause: unknown): string { + return cause instanceof Error ? 
cause.message : String(cause); +} diff --git a/packages/ingest-github/src/strategies/flat-folder/folder-summary-selective.ts b/packages/ingest-github/src/strategies/flat-folder/folder-summary-selective.ts index d053d82..9b4e71c 100644 --- a/packages/ingest-github/src/strategies/flat-folder/folder-summary-selective.ts +++ b/packages/ingest-github/src/strategies/flat-folder/folder-summary-selective.ts @@ -5,6 +5,7 @@ import type { AskLlmOptions } from "@bb/llm"; import type { MetaPaths } from "#src/types/meta-paths.ts"; import { withConcurrency } from "#src/pipeline/concurrency.ts"; import { throwIfCancelled, CancellationError } from "#src/pipeline/cancellation.ts"; +import type { FileAnalysisCache } from "#src/strategies/flat-folder/file-analysis-cache.ts"; import { groupByDirectFolder, persistFolderSummary, @@ -14,6 +15,7 @@ import { export interface SelectiveFolderSummaryInput { knowledgeId: string; metaPaths: MetaPaths; + cache: FileAnalysisCache; affectedFolders: Set; llmCallContext?: AskLlmOptions; } @@ -35,7 +37,7 @@ export async function runSelectiveFolderSummary( ): Promise { const concurrentWorkers = getConfigValue(Config.ConcurrentWorkers); const limit = withConcurrency(concurrentWorkers); - const groups = await groupByDirectFolder(input.metaPaths); + const groups = groupByDirectFolder(input.cache); let succeeded = 0; let failed = 0; let skipped = 0; diff --git a/packages/ingest-github/src/strategies/flat-folder/folder-summary.ts b/packages/ingest-github/src/strategies/flat-folder/folder-summary.ts index 4fa175b..805eae6 100644 --- a/packages/ingest-github/src/strategies/flat-folder/folder-summary.ts +++ b/packages/ingest-github/src/strategies/flat-folder/folder-summary.ts @@ -11,14 +11,14 @@ import { encodeMetaPath } from "#src/pipeline/paths.ts"; import { withConcurrency } from "#src/pipeline/concurrency.ts"; import { throwIfCancelled, CancellationError } from "#src/pipeline/cancellation.ts"; import type { ProgressContext } from "#src/progress/types.ts"; 
-import { iterateCondensed } from "./big-file/storage.ts"; +import type { FileAnalysisCache } from "./file-analysis-cache.ts"; import { directFolderOf } from "./folder-path.ts"; import { FOLDER_ANALYSIS_SYSTEM_PROMPT, folderAnalysisUserPrompt } from "./prompts/folder-summary.ts"; import type { FolderSummary } from "./types.ts"; -export async function groupByDirectFolder(metaPaths: MetaPaths): Promise> { +export function groupByDirectFolder(cache: FileAnalysisCache): Map { const groups = new Map(); - for await (const entry of iterateCondensed(metaPaths)) { + for (const entry of cache.values()) { const folder = directFolderOf(entry.relativePath); const bucket = groups.get(folder) ?? []; bucket.push(entry); @@ -113,6 +113,7 @@ export async function* iterateFolderSummaries(metaPaths: MetaPaths): AsyncGenera export async function runFolderSummaryPhase( knowledgeId: string, metaPaths: MetaPaths, + cache: FileAnalysisCache, llmCallContext?: AskLlmOptions, progressContext?: ProgressContext, ): Promise<{ @@ -122,7 +123,7 @@ export async function runFolderSummaryPhase( }> { const concurrentWorkers = getConfigValue(Config.ConcurrentWorkers); const limit = withConcurrency(concurrentWorkers); - const groups = await groupByDirectFolder(metaPaths); + const groups = groupByDirectFolder(cache); let succeeded = 0; let failed = 0; let totalInputTokens = 0; diff --git a/packages/ingest-github/src/strategies/flat-folder/index.ts b/packages/ingest-github/src/strategies/flat-folder/index.ts index 924b26f..de9211b 100644 --- a/packages/ingest-github/src/strategies/flat-folder/index.ts +++ b/packages/ingest-github/src/strategies/flat-folder/index.ts @@ -11,6 +11,7 @@ import { analyseSmallFiles } from "./phases/analyse-small.ts"; import { analyseBigFiles } from "./phases/process-big-files.ts"; import { backfillMissingFields } from "./backfill/fields.ts"; import { backfillBigFiles } from "./backfill/big-files.ts"; +import { FileAnalysisCache } from "./file-analysis-cache.ts"; import { 
runFolderSummaryPhase } from "./folder-summary.ts"; import { makeRepoSummaryEnvelope, persistRepoSummary, summariseRepo } from "./repo-summary.ts"; import { storeFlatAnalysis } from "./phases/store-flat-analysis.ts"; @@ -87,9 +88,13 @@ export function createFlatFolderStrategy(deps: FlatFolderStrategyDeps): IngestSt let totalOutputTokens = smallResult.tokenUsage.outputTokens + bigResult.tokenUsage.outputTokens; let totalCostUsd = smallResult.tokenUsage.costUsd + bigResult.tokenUsage.costUsd; + logger.info(`flat-folder: loading file-analysis cache`); + throwIfCancelled(knowledgeId); + const fileAnalysisCache = await FileAnalysisCache.loadAll(metaPaths); + logger.info(`flat-folder: phase3 (backfill missing fields) starting`); throwIfCancelled(knowledgeId); - await backfillMissingFields(metaPaths, llmCallContext, progressContext); + await backfillMissingFields(metaPaths, fileAnalysisCache, llmCallContext, progressContext); logger.info(`flat-folder: phase4 (backfill big files) starting`); throwIfCancelled(knowledgeId); @@ -107,7 +112,13 @@ export function createFlatFolderStrategy(deps: FlatFolderStrategyDeps): IngestSt progressContext.phaseChanged("folder_analysis"); logger.info(`flat-folder: phase5 (folder summaries) starting`); throwIfCancelled(knowledgeId); - const phase5 = await runFolderSummaryPhase(knowledgeId, metaPaths, llmCallContext, progressContext); + const phase5 = await runFolderSummaryPhase( + knowledgeId, + metaPaths, + fileAnalysisCache, + llmCallContext, + progressContext, + ); totalInputTokens += phase5.tokenUsage.inputTokens; totalOutputTokens += phase5.tokenUsage.outputTokens; totalCostUsd += phase5.tokenUsage.costUsd; @@ -136,6 +147,7 @@ export function createFlatFolderStrategy(deps: FlatFolderStrategyDeps): IngestSt payload, branch, metaPaths, + cache: fileAnalysisCache, progressContext, }); diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/README.md b/packages/ingest-github/src/strategies/flat-folder/phases/README.md index 
e2d218a..05ee606 100644 --- a/packages/ingest-github/src/strategies/flat-folder/phases/README.md +++ b/packages/ingest-github/src/strategies/flat-folder/phases/README.md @@ -69,9 +69,17 @@ scanAndClassify │ │ (Promise.all, share one limiter) └── analyseBigFiles ────┘ ↓ +FileAnalysisCache.loadAll (one parallel readdir+readFile pass) + ↓ backfillMissingFields → backfillBigFiles → folderSummary → repoSummary → storeFlatAnalysis + (cache read+write) (no cache) (cache read) (cache read) ``` +`FileAnalysisCache` is a `Map` loaded +once between phase 2 and phase 3. Phases 3, 5, 7 all consume the same +instance — phase 3 also calls `cache.set(...)` after each backfill write +so phases 5 and 7 see the updated entries without re-reading disk. + ## Public interfaces - `scanAndClassify(input): Promise` — @@ -99,6 +107,9 @@ backfillMissingFields → backfillBigFiles → folderSummary → repoSummary → - Phase 2b writes per-chunk JSON (`chunks//chunk-N.json`), per-file chunk manifests (`.manifest.json`), and condensed JSON for big files. +- `FileAnalysisCache` is an in-memory artifact owned by the strategy + run (not persisted). It loads from `fileAnalysisDir` once and is + passed by reference to phases 3, 5, and 7. - Phase 7 owns no disk artifacts. It reads on-disk state produced by Phases 1–6 and writes Neo4j nodes (`:Repo`, `:Folder`, `:File`) plus the `CONTAINS` edge. 
diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/store-flat-analysis.ts b/packages/ingest-github/src/strategies/flat-folder/phases/store-flat-analysis.ts index dbcbb30..adeb0a6 100644 --- a/packages/ingest-github/src/strategies/flat-folder/phases/store-flat-analysis.ts +++ b/packages/ingest-github/src/strategies/flat-folder/phases/store-flat-analysis.ts @@ -4,7 +4,7 @@ import { ensureFlatFolderIndexes, upsertFileNode, upsertFolderNode, upsertRepoNo import type { GithubIndexPayload } from "@bb/types"; import type { MetaPaths } from "#src/types/meta-paths.ts"; import { throwIfCancelled } from "#src/pipeline/cancellation.ts"; -import { iterateCondensed } from "#src/strategies/flat-folder/big-file/storage.ts"; +import type { FileAnalysisCache } from "#src/strategies/flat-folder/file-analysis-cache.ts"; import { iterateFolderSummaries } from "#src/strategies/flat-folder/folder-summary.ts"; import { directFolderOf } from "#src/strategies/flat-folder/folder-path.ts"; import { languageFromPath } from "#src/adapters/llm-file-analyzer.ts"; @@ -16,6 +16,7 @@ export interface StoreFlatAnalysisInput { payload: GithubIndexPayload; branch: string; metaPaths: MetaPaths; + cache: FileAnalysisCache; progressContext?: ProgressContext; } @@ -89,13 +90,12 @@ export async function storeFlatAnalysis(input: StoreFlatAnalysisInput): Promise< const fileReporter = input.progressContext?.reporter({ phase: "indexing", subPhase: "files", - total: { kind: "growing" }, + total: { kind: "fixed", total: input.cache.size }, }); await fileReporter?.start(); try { - for await (const file of iterateCondensed(input.metaPaths)) { + for (const file of input.cache.values()) { throwIfCancelled(input.scope.knowledgeId); - fileReporter?.incrementSeen(); const folderPath = directFolderOf(file.relativePath); if (!folderPaths.has(folderPath)) { await upsertFolderNode({ From 13970c7dd0c0b2e07ea20abf4d31b7339cc1ee26 Mon Sep 17 00:00:00 2001 From: Dead-Bytes 
<143434285+Dead-Bytes@users.noreply.github.com> Date: Fri, 22 May 2026 14:49:57 +0530 Subject: [PATCH 27/34] refactor: add folder summary batching configuration and enhance folder summary processing --- packages/config/src/schema.ts | 14 + packages/ingest-github/README.md | 17 +- packages/ingest-github/src/pipeline/pull.ts | 6 +- .../flat-folder/folder-summary-selective.ts | 76 ++--- .../strategies/flat-folder/folder-summary.ts | 292 +++++++++++++++--- .../src/strategies/flat-folder/index.ts | 1 + .../strategies/flat-folder/phases/README.md | 14 +- .../flat-folder/prompts/folder-summary.ts | 54 ++++ packages/types/src/config.ts | 2 + 9 files changed, 378 insertions(+), 98 deletions(-) diff --git a/packages/config/src/schema.ts b/packages/config/src/schema.ts index 77a7468..1f7a021 100644 --- a/packages/config/src/schema.ts +++ b/packages/config/src/schema.ts @@ -42,6 +42,8 @@ export const configSchema = z "absolute.file.size.cap": z.number().int().positive().default(52428800), "concurrent.workers": z.number().int().positive().default(4), "llm.concurrency": z.number().int().positive().default(29), + "folder.summary.batch.size": z.number().int().positive().default(10), + "folder.summary.batch.max.files": z.number().int().positive().default(15), "condense.context.limit": z.number().int().positive().default(12000), "condense.prompt.overhead": z.number().int().nonnegative().default(1500), "small.file.dedup.threshold": z.number().int().positive().default(3), @@ -83,6 +85,8 @@ export type ConfigValueMap = { [Config.AbsoluteFileSizeCap]: number; [Config.ConcurrentWorkers]: number; [Config.LlmConcurrency]: number; + [Config.FolderSummaryBatchSize]: number; + [Config.FolderSummaryBatchMaxFiles]: number; [Config.CondenseContextLimit]: number; [Config.CondensePromptOverhead]: number; [Config.SmallFileDedupThreshold]: number; @@ -138,6 +142,8 @@ export const HINTS: Readonly> = { [Config.AbsoluteFileSizeCap]: "bytebell set absolute.file.size.cap ", [Config.ConcurrentWorkers]: 
"bytebell set concurrent.workers ", [Config.LlmConcurrency]: "bytebell set llm.concurrency ", + [Config.FolderSummaryBatchSize]: "bytebell set folder.summary.batch.size ", + [Config.FolderSummaryBatchMaxFiles]: "bytebell set folder.summary.batch.max.files ", [Config.CondenseContextLimit]: "bytebell set condense.context.limit ", [Config.CondensePromptOverhead]: "bytebell set condense.prompt.overhead ", [Config.SmallFileDedupThreshold]: "bytebell set small.file.dedup.threshold ", @@ -200,6 +206,10 @@ export function readField(cfg: BytebellConfig, key: K): Config return cfg["concurrent.workers"] as ConfigValue; case Config.LlmConcurrency: return cfg["llm.concurrency"] as ConfigValue; + case Config.FolderSummaryBatchSize: + return cfg["folder.summary.batch.size"] as ConfigValue; + case Config.FolderSummaryBatchMaxFiles: + return cfg["folder.summary.batch.max.files"] as ConfigValue; case Config.CondenseContextLimit: return cfg["condense.context.limit"] as ConfigValue; case Config.CondensePromptOverhead: @@ -271,6 +281,10 @@ export function writeField(cfg: BytebellConfig, key: K, value: return { ...cfg, "concurrent.workers": value as number }; case Config.LlmConcurrency: return { ...cfg, "llm.concurrency": value as number }; + case Config.FolderSummaryBatchSize: + return { ...cfg, "folder.summary.batch.size": value as number }; + case Config.FolderSummaryBatchMaxFiles: + return { ...cfg, "folder.summary.batch.max.files": value as number }; case Config.CondenseContextLimit: return { ...cfg, "condense.context.limit": value as number }; case Config.CondensePromptOverhead: diff --git a/packages/ingest-github/README.md b/packages/ingest-github/README.md index c9e1ca2..9d28387 100644 --- a/packages/ingest-github/README.md +++ b/packages/ingest-github/README.md @@ -153,10 +153,19 @@ worker hardcodes a single `IngestionStrategy` instance (currently 1. 
**Shared LLM concurrency limiter.** The flat-folder strategy constructs one `withConcurrency(Config.LlmConcurrency)` instance at entry (default 29). The small-file phase, the big-file chunk phase, - and per-file condense calls all check out from this single pool, so - total in-flight LLM calls is bounded by one knob. The legacy - `processBigFile` driver used by the pull-path still uses its own - per-file pool sized by `Config.BigFileConcurrency`. + per-file condense calls, **and the folder-summary phase** all check + out from this single pool, so total in-flight LLM calls is bounded + by one knob. The pull-path constructs its own shared limiter at + `runPull` entry and threads it into the selective folder-summary + phase. The legacy `processBigFile` driver used by the pull-path + still uses its own per-file pool sized by `Config.BigFileConcurrency`. +2. **Folder-summary batching by default.** Phase 5 groups small folders + (`≤ Config.FolderSummaryBatchMaxFiles`, default 15) into batches of + up to `Config.FolderSummaryBatchSize` (default 10) and asks the LLM + for one JSON object keyed by integer label that returns one summary + per folder. Bigger folders take the individual single-folder path. + Roll back to one LLM call per folder via + `bytebell set folder.summary.batch.size 1`. 2. **Clone idempotent.** Re-runs (BullMQ retries) call `git fetch` + `git reset --hard` in the existing dir rather than re-cloning. Tokens are re-injected into the remote URL each time. 
diff --git a/packages/ingest-github/src/pipeline/pull.ts b/packages/ingest-github/src/pipeline/pull.ts index 6ffab0b..8a4c706 100644 --- a/packages/ingest-github/src/pipeline/pull.ts +++ b/packages/ingest-github/src/pipeline/pull.ts @@ -1,4 +1,6 @@ -import { KnowledgeState, type GithubPullPayload, type JobMessage } from "@bb/types"; +import { Config, KnowledgeState, type GithubPullPayload, type JobMessage } from "@bb/types"; +import { getConfigValue } from "@bb/config"; +import { withConcurrency } from "./concurrency.ts"; import { getKnowledge, markKnowledgeFailed, setKnowledgeCommit, setKnowledgeState } from "@bb/mongo"; import { setKnowledgeStateInGraph, snapshotFilesToVersion, type NodeScope } from "@bb/neo4j"; import type { PipelineSummary } from "#src/types/pipeline.ts"; @@ -196,6 +198,7 @@ export async function runPull( logger.info(`pull: loading file-analysis cache`); throwIfCancelled(knowledgeId); const fileAnalysisCache = await FileAnalysisCache.loadAll(metaPaths); + const limiter = withConcurrency(getConfigValue(Config.LlmConcurrency)); logger.info(`pull: phase backfill fields starting`); throwIfCancelled(knowledgeId); @@ -221,6 +224,7 @@ export async function runPull( knowledgeId, metaPaths, cache: fileAnalysisCache, + limiter, affectedFolders, }; if (llmCallContext !== undefined) { diff --git a/packages/ingest-github/src/strategies/flat-folder/folder-summary-selective.ts b/packages/ingest-github/src/strategies/flat-folder/folder-summary-selective.ts index 9b4e71c..17ac699 100644 --- a/packages/ingest-github/src/strategies/flat-folder/folder-summary-selective.ts +++ b/packages/ingest-github/src/strategies/flat-folder/folder-summary-selective.ts @@ -1,21 +1,19 @@ import { logger } from "@bb/logger"; -import { Config } from "@bb/types"; -import { getConfigValue } from "@bb/config"; import type { AskLlmOptions } from "@bb/llm"; +import type { CondensedFileAnalysis } from "#src/types/condensed-file-analysis.ts"; import type { MetaPaths } from 
"#src/types/meta-paths.ts"; -import { withConcurrency } from "#src/pipeline/concurrency.ts"; -import { throwIfCancelled, CancellationError } from "#src/pipeline/cancellation.ts"; +import type { ConcurrencyLimiter } from "#src/pipeline/concurrency.ts"; import type { FileAnalysisCache } from "#src/strategies/flat-folder/file-analysis-cache.ts"; import { + dispatchFolderSummaries, groupByDirectFolder, - persistFolderSummary, - summariseFolder, } from "#src/strategies/flat-folder/folder-summary.ts"; export interface SelectiveFolderSummaryInput { knowledgeId: string; metaPaths: MetaPaths; cache: FileAnalysisCache; + limiter: ConcurrencyLimiter; affectedFolders: Set; llmCallContext?: AskLlmOptions; } @@ -29,57 +27,39 @@ export interface SelectiveFolderSummaryResult { /** * Pull-time folder summary. Same machinery as `runFolderSummaryPhase` but - * only regenerates folders the caller flagged as affected. Reads condensed - * file analyses from disk; the dispatcher must have populated them already. + * only regenerates folders the caller flagged as affected. Filters by + * `affectedFolders` BEFORE batching so skipped folders never enter a batch. 
*/ export async function runSelectiveFolderSummary( input: SelectiveFolderSummaryInput, ): Promise { - const concurrentWorkers = getConfigValue(Config.ConcurrentWorkers); - const limit = withConcurrency(concurrentWorkers); - const groups = groupByDirectFolder(input.cache); - let succeeded = 0; - let failed = 0; + const allGroups = groupByDirectFolder(input.cache); + const affectedGroups = new Map(); let skipped = 0; - let totalInputTokens = 0; - let totalOutputTokens = 0; - let totalCostUsd = 0; - const tasks: Promise[] = []; - for (const [folderPath, files] of groups.entries()) { - if (!input.affectedFolders.has(folderPath)) { + for (const [folderPath, files] of allGroups.entries()) { + if (input.affectedFolders.has(folderPath)) { + affectedGroups.set(folderPath, files); + } else { skipped += 1; - continue; } - tasks.push( - limit(async () => { - try { - throwIfCancelled(input.knowledgeId); - const { summary, tokenUsage } = await summariseFolder(folderPath, files, input.llmCallContext); - totalInputTokens += tokenUsage.inputTokens; - totalOutputTokens += tokenUsage.outputTokens; - totalCostUsd += tokenUsage.costUsd; - if (summary !== null) { - await persistFolderSummary(input.metaPaths, summary); - succeeded += 1; - } else { - failed += 1; - } - } catch (cause: unknown) { - if (cause instanceof CancellationError) { - throw cause; - } - failed += 1; - logger.warn(`pull-folder-summary: failed for ${folderPath || ""}`); - } - }), - ); } - await Promise.all(tasks); - logger.info(`pull-folder-summary done: succeeded=${succeeded} failed=${failed} skipped=${skipped}`); + + const totals = await dispatchFolderSummaries( + affectedGroups, + input.metaPaths, + input.limiter, + input.llmCallContext, + undefined, + input.knowledgeId, + "pull-folder-summary", + ); + logger.info( + `pull-folder-summary done: succeeded=${totals.succeeded} failed=${totals.failed} skipped=${skipped}`, + ); return { - succeeded, - failed, + succeeded: totals.succeeded, + failed: totals.failed, 
skipped, - tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens, costUsd: totalCostUsd }, + tokenUsage: { inputTokens: totals.inputTokens, outputTokens: totals.outputTokens, costUsd: totals.costUsd }, }; } diff --git a/packages/ingest-github/src/strategies/flat-folder/folder-summary.ts b/packages/ingest-github/src/strategies/flat-folder/folder-summary.ts index 805eae6..a5d95a3 100644 --- a/packages/ingest-github/src/strategies/flat-folder/folder-summary.ts +++ b/packages/ingest-github/src/strategies/flat-folder/folder-summary.ts @@ -8,12 +8,18 @@ import { getConfigValue } from "@bb/config"; import type { CondensedFileAnalysis } from "#src/types/condensed-file-analysis.ts"; import type { MetaPaths } from "#src/types/meta-paths.ts"; import { encodeMetaPath } from "#src/pipeline/paths.ts"; -import { withConcurrency } from "#src/pipeline/concurrency.ts"; +import type { ConcurrencyLimiter } from "#src/pipeline/concurrency.ts"; import { throwIfCancelled, CancellationError } from "#src/pipeline/cancellation.ts"; import type { ProgressContext } from "#src/progress/types.ts"; import type { FileAnalysisCache } from "./file-analysis-cache.ts"; import { directFolderOf } from "./folder-path.ts"; -import { FOLDER_ANALYSIS_SYSTEM_PROMPT, folderAnalysisUserPrompt } from "./prompts/folder-summary.ts"; +import { + FOLDER_ANALYSIS_SYSTEM_PROMPT, + FOLDER_BATCH_SYSTEM_PROMPT, + folderAnalysisUserPrompt, + folderBatchUserPrompt, + type BatchedFolderInput, +} from "./prompts/folder-summary.ts"; import type { FolderSummary } from "./types.ts"; export function groupByDirectFolder(cache: FileAnalysisCache): Map { @@ -38,6 +44,52 @@ interface FolderSummaryJson { dependencyGraph?: unknown; } +export interface FolderBucket { + folderPath: string; + files: CondensedFileAnalysis[]; +} + +/** + * Splits the folder groups into "individual" (one LLM call per folder, used + * for big folders or when batching is disabled) and "batches" (N small + * folders summarised in one 
LLM call). Driven by `Config.FolderSummaryBatchSize` + * (set to 1 to disable batching entirely) and `Config.FolderSummaryBatchMaxFiles` + * (folders exceeding this file count always take the individual path). + * + * Folders are sorted by path so that two runs of the same repo produce the + * same batch composition — helpful when A/B-comparing outputs. + */ +export function groupFoldersForBatching(groups: Map): { + individual: FolderBucket[]; + batches: FolderBucket[][]; +} { + const batchSize = getConfigValue(Config.FolderSummaryBatchSize); + const maxFiles = getConfigValue(Config.FolderSummaryBatchMaxFiles); + const sorted: FolderBucket[] = [...groups.entries()] + .map(([folderPath, files]) => ({ folderPath, files })) + .sort((a, b) => a.folderPath.localeCompare(b.folderPath)); + + if (batchSize <= 1) { + return { individual: sorted, batches: [] }; + } + + const individual: FolderBucket[] = []; + const batchable: FolderBucket[] = []; + for (const bucket of sorted) { + if (bucket.files.length > maxFiles) { + individual.push(bucket); + } else { + batchable.push(bucket); + } + } + + const batches: FolderBucket[][] = []; + for (let i = 0; i < batchable.length; i += batchSize) { + batches.push(batchable.slice(i, i + batchSize)); + } + return { individual, batches }; +} + export async function summariseFolder( folderPath: string, files: CondensedFileAnalysis[], @@ -82,6 +134,72 @@ export async function summariseFolder( } } +/** + * Multi-folder summary. Builds a label-indexed prompt, parses the keyed JSON + * response, returns one `FolderSummary | null` per folder. Folders missing + * from the response (or whose entry fails shape validation) are surfaced as + * `null` with a warn log; the caller counts those as failed. 
+ */ +export async function summariseFolderBatch( + batch: FolderBucket[], + llmCallContext?: AskLlmOptions, +): Promise<{ + summaries: Map; + tokenUsage: { inputTokens: number; outputTokens: number; costUsd: number }; +}> { + const labeled: BatchedFolderInput[] = batch.map((b, i) => ({ label: i, folderPath: b.folderPath, files: b.files })); + const userPrompt = folderBatchUserPrompt(labeled); + const summaries = new Map(); + try { + const response = await askJsonLLM>( + FOLDER_BATCH_SYSTEM_PROMPT, + userPrompt, + llmCallContext ?? {}, + ); + if (response.result === null) { + logger.warn(`summariseFolderBatch: batch of ${batch.length} returned unparseable JSON`); + for (const b of batch) { + summaries.set(b.folderPath, null); + } + return { + summaries, + tokenUsage: { + inputTokens: response.usage.inputTokens, + outputTokens: response.usage.outputTokens, + costUsd: response.usage.costUsd, + }, + }; + } + for (const b of labeled) { + const raw = response.result[String(b.label)]; + if (raw === undefined || typeof raw !== "object" || raw === null) { + logger.warn(`summariseFolderBatch: missing/invalid entry for label ${b.label} (${b.folderPath || ""})`); + summaries.set(b.folderPath, null); + continue; + } + summaries.set(b.folderPath, shapeFolderSummary(b.folderPath, raw)); + } + return { + summaries, + tokenUsage: { + inputTokens: response.usage.inputTokens, + outputTokens: response.usage.outputTokens, + costUsd: response.usage.costUsd, + }, + }; + } catch (cause: unknown) { + if (cause instanceof LlmConfigError || cause instanceof LlmError) { + throw cause; + } + const msg = cause instanceof Error ? 
cause.message : String(cause); + logger.warn(`summariseFolderBatch: batch of ${batch.length} askJsonLLM failed: ${msg}`); + for (const b of batch) { + summaries.set(b.folderPath, null); + } + return { summaries, tokenUsage: { inputTokens: 0, outputTokens: 0, costUsd: 0 } }; + } +} + export async function persistFolderSummary(metaPaths: MetaPaths, summary: FolderSummary): Promise { const file = path.join(metaPaths.folderSummariesDir, `${encodeMetaPath(summary.folderPath || "__ROOT__")}.json`); await writeFile(file, JSON.stringify(summary, null, 2), "utf8"); @@ -110,10 +228,134 @@ export async function* iterateFolderSummaries(metaPaths: MetaPaths): AsyncGenera } } +interface FolderSummaryTotals { + succeeded: number; + failed: number; + inputTokens: number; + outputTokens: number; + costUsd: number; +} + +/** + * Dispatches a single folder through `summariseFolder` and persists the + * result. Shared between `runFolderSummaryPhase` and `runSelectiveFolderSummary`. + */ +async function dispatchIndividual( + bucket: FolderBucket, + metaPaths: MetaPaths, + totals: FolderSummaryTotals, + llmCallContext: AskLlmOptions | undefined, + reporter: ReturnType> | undefined, + knowledgeId: string, + phaseLabel: string, +): Promise { + try { + throwIfCancelled(knowledgeId); + const { summary, tokenUsage } = await summariseFolder(bucket.folderPath, bucket.files, llmCallContext); + totals.inputTokens += tokenUsage.inputTokens; + totals.outputTokens += tokenUsage.outputTokens; + totals.costUsd += tokenUsage.costUsd; + if (summary !== null) { + await persistFolderSummary(metaPaths, summary); + totals.succeeded += 1; + } else { + totals.failed += 1; + } + } catch (cause: unknown) { + if (cause instanceof CancellationError) { + throw cause; + } + totals.failed += 1; + logger.warn(`${phaseLabel}: folder summary failed for ${bucket.folderPath || ""}`); + } finally { + reporter?.increment(1, { fileName: bucket.folderPath || "" }); + } +} + +/** + * Dispatches a multi-folder batch through 
`summariseFolderBatch`. Each + * non-null per-folder summary is persisted; missing/null entries count + * toward `failed`. Progress increments once per folder. + */ +async function dispatchBatch( + batch: FolderBucket[], + metaPaths: MetaPaths, + totals: FolderSummaryTotals, + llmCallContext: AskLlmOptions | undefined, + reporter: ReturnType> | undefined, + knowledgeId: string, + phaseLabel: string, +): Promise { + try { + throwIfCancelled(knowledgeId); + const { summaries, tokenUsage } = await summariseFolderBatch(batch, llmCallContext); + totals.inputTokens += tokenUsage.inputTokens; + totals.outputTokens += tokenUsage.outputTokens; + totals.costUsd += tokenUsage.costUsd; + for (const bucket of batch) { + const summary = summaries.get(bucket.folderPath) ?? null; + if (summary !== null) { + try { + await persistFolderSummary(metaPaths, summary); + totals.succeeded += 1; + } catch (cause: unknown) { + totals.failed += 1; + logger.warn( + `${phaseLabel}: persist failed for ${bucket.folderPath || ""}: ${cause instanceof Error ? cause.message : String(cause)}`, + ); + } + } else { + totals.failed += 1; + } + reporter?.increment(1, { fileName: bucket.folderPath || "" }); + } + } catch (cause: unknown) { + if (cause instanceof CancellationError) { + throw cause; + } + totals.failed += batch.length; + for (const bucket of batch) { + reporter?.increment(1, { fileName: bucket.folderPath || "" }); + } + logger.warn( + `${phaseLabel}: batch summary failed for ${batch.length} folders: ${cause instanceof Error ? cause.message : String(cause)}`, + ); + } +} + +/** + * Dispatch helper used by both `runFolderSummaryPhase` and + * `runSelectiveFolderSummary`. Splits `groups` into individual + batched + * buckets, schedules every task through the shared `limiter`, awaits all, + * and returns the aggregated totals. 
+ */ +export async function dispatchFolderSummaries( + groups: Map, + metaPaths: MetaPaths, + limiter: ConcurrencyLimiter, + llmCallContext: AskLlmOptions | undefined, + reporter: ReturnType> | undefined, + knowledgeId: string, + phaseLabel: string, +): Promise { + const totals: FolderSummaryTotals = { succeeded: 0, failed: 0, inputTokens: 0, outputTokens: 0, costUsd: 0 }; + const { individual, batches } = groupFoldersForBatching(groups); + const tasks: Promise[] = []; + for (const bucket of individual) { + tasks.push(limiter(() => dispatchIndividual(bucket, metaPaths, totals, llmCallContext, reporter, knowledgeId, phaseLabel))); + } + for (const batch of batches) { + tasks.push(limiter(() => dispatchBatch(batch, metaPaths, totals, llmCallContext, reporter, knowledgeId, phaseLabel))); + } + await Promise.all(tasks); + return totals; +} + export async function runFolderSummaryPhase( knowledgeId: string, metaPaths: MetaPaths, cache: FileAnalysisCache, + limiter: ConcurrencyLimiter, llmCallContext?: AskLlmOptions, progressContext?: ProgressContext, ): Promise<{ @@ -121,57 +363,23 @@ export async function runFolderSummaryPhase( failed: number; tokenUsage: { inputTokens: number; outputTokens: number; costUsd: number }; }> { - const concurrentWorkers = getConfigValue(Config.ConcurrentWorkers); - const limit = withConcurrency(concurrentWorkers); const groups = groupByDirectFolder(cache); - let succeeded = 0; - let failed = 0; - let totalInputTokens = 0; - let totalOutputTokens = 0; - let totalCostUsd = 0; const reporter = progressContext?.reporter({ phase: "folder_analysis", total: { kind: "fixed", total: groups.size }, }); await reporter?.start(); + let totals: FolderSummaryTotals; try { - const tasks: Promise[] = []; - for (const [folderPath, files] of groups.entries()) { - tasks.push( - limit(async () => { - try { - throwIfCancelled(knowledgeId); - const { summary, tokenUsage } = await summariseFolder(folderPath, files, llmCallContext); - totalInputTokens += 
tokenUsage.inputTokens; - totalOutputTokens += tokenUsage.outputTokens; - totalCostUsd += tokenUsage.costUsd; - if (summary !== null) { - await persistFolderSummary(metaPaths, summary); - succeeded += 1; - } else { - failed += 1; - } - } catch (cause: unknown) { - if (cause instanceof CancellationError) { - throw cause; - } - failed += 1; - logger.warn(`phase5: folder summary failed for ${folderPath || ""}`); - } finally { - reporter?.increment(1, { fileName: folderPath || "" }); - } - }), - ); - } - await Promise.all(tasks); + totals = await dispatchFolderSummaries(groups, metaPaths, limiter, llmCallContext, reporter, knowledgeId, "phase5"); } finally { reporter?.stop(); } - logger.info(`phase5 done: foldersSummarised=${succeeded} failed=${failed}`); + logger.info(`phase5 done: foldersSummarised=${totals.succeeded} failed=${totals.failed}`); return { - succeeded, - failed, - tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens, costUsd: totalCostUsd }, + succeeded: totals.succeeded, + failed: totals.failed, + tokenUsage: { inputTokens: totals.inputTokens, outputTokens: totals.outputTokens, costUsd: totals.costUsd }, }; } diff --git a/packages/ingest-github/src/strategies/flat-folder/index.ts b/packages/ingest-github/src/strategies/flat-folder/index.ts index de9211b..c23f42c 100644 --- a/packages/ingest-github/src/strategies/flat-folder/index.ts +++ b/packages/ingest-github/src/strategies/flat-folder/index.ts @@ -116,6 +116,7 @@ export function createFlatFolderStrategy(deps: FlatFolderStrategyDeps): IngestSt knowledgeId, metaPaths, fileAnalysisCache, + limiter, llmCallContext, progressContext, ); diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/README.md b/packages/ingest-github/src/strategies/flat-folder/phases/README.md index 05ee606..cdcfddb 100644 --- a/packages/ingest-github/src/strategies/flat-folder/phases/README.md +++ b/packages/ingest-github/src/strategies/flat-folder/phases/README.md @@ -127,9 +127,17 @@ so 
phases 5 and 7 see the updated entries without re-reading disk. do not abort on a single bad file. Only `CancellationError`, `LlmConfigError`, and `LlmError` propagate. - The shared LLM limiter is the only place LLM concurrency is bounded - during the small/big phases. `Config.BigFileConcurrency` is no longer - consulted from the chunk-queue path (it is still consulted by the - legacy `processBigFile` used by the pull-path driver). + during the small/big phases **and the folder-summary phase**. + `Config.BigFileConcurrency` is no longer consulted from the chunk-queue + path (it is still consulted by the legacy `processBigFile` used by the + pull-path driver). `Config.ConcurrentWorkers` is no longer consulted + by the folder-summary phase. +- Phase 5 batches small folders by default. `Config.FolderSummaryBatchSize` + (default 10) controls batch size; set to 1 to disable and restore one + LLM call per folder. `Config.FolderSummaryBatchMaxFiles` (default 15) + is the per-folder file ceiling above which a folder always takes the + individual path so the LLM still sees the full per-file context. Large + folders run side-by-side with batches under the same shared limiter. - Phase 1 respects `Config.ContextWindowLimit` and `Config.MaxTokensPerChunk`; do not hardcode either. - Phase 7 always emits a `:Repo` node, even when `repo-summary.json` is diff --git a/packages/ingest-github/src/strategies/flat-folder/prompts/folder-summary.ts b/packages/ingest-github/src/strategies/flat-folder/prompts/folder-summary.ts index 10276a8..30e110b 100644 --- a/packages/ingest-github/src/strategies/flat-folder/prompts/folder-summary.ts +++ b/packages/ingest-github/src/strategies/flat-folder/prompts/folder-summary.ts @@ -40,3 +40,57 @@ Per-file analyses (direct children only): ${serialised}`; } + +export const FOLDER_BATCH_SYSTEM_PROMPT = `You are summarising MULTIPLE small folders of a source repository in one pass. 
The user will provide several folders, each labeled with an integer ID (0, 1, 2, ...). Each folder lists the files directly inside it (subfolders are summarised separately and are NOT in your input). + +Return ONLY a JSON object whose keys are the integer labels as strings ("0", "1", ...) and whose values are folder-summary objects with EXACTLY these keys: + +- purpose : string — one-paragraph explanation of what this folder is responsible for. +- summary : string — natural-language summary of how the files in this folder work together. Plain English, no key-value pairs. ≤ 300 tokens. +- keywords : string[] — up to 10 domain keywords describing this folder. +- classes : string[] — most important class/type entries, deduplicated. Format "Name: short purpose". Max 15 entries. +- functions : string[] — most important function/method entries, deduplicated. Format "name: short purpose". Max 15 entries. +- importsInternal : string[] — significant relative imports observed across the folder's files. Max 15 entries. +- importsExternal : string[] — significant external packages observed across the folder's files. Max 15 entries. +- dependencyGraph : string — Mermaid \`graph LR\` block (no triple-backtick fences) of inter-file dependencies. Empty string if not enough signal. + +You MUST return one entry per labeled folder, even if some fields are empty arrays. Do NOT invent files not listed. Do NOT speculate about subfolders. Do NOT add keys outside the integer-label set; do NOT add commentary outside the JSON object.`; + +export interface BatchedFolderInput { + label: number; + folderPath: string; + files: CondensedFileAnalysis[]; +} + +export function folderBatchUserPrompt(batch: BatchedFolderInput[]): string { + const sections = batch.map((b) => { + const folderLabel = b.folderPath.length === 0 ? 
"" : b.folderPath; + const fileLines = b.files.map((f) => `- ${f.relativePath}: ${f.analysis.purpose}`).join("\n"); + const aggregatedKeywords = aggregateKeywords(b.files, 10); + return `### Folder ${b.label} :: ${folderLabel} +Files: ${b.files.length} +${fileLines} +Aggregated keywords: ${JSON.stringify(aggregatedKeywords)}`; + }); + return `You are summarising ${batch.length} folder(s). Produce one folder-summary object per labeled folder. + +${sections.join("\n\n")}`; +} + +function aggregateKeywords(files: CondensedFileAnalysis[], cap: number): string[] { + const seen = new Set(); + const out: string[] = []; + for (const f of files) { + for (const k of f.analysis.keywords) { + if (typeof k !== "string" || k.length === 0 || seen.has(k)) { + continue; + } + seen.add(k); + out.push(k); + if (out.length >= cap) { + return out; + } + } + } + return out; +} diff --git a/packages/types/src/config.ts b/packages/types/src/config.ts index 950cb81..1e72f67 100644 --- a/packages/types/src/config.ts +++ b/packages/types/src/config.ts @@ -24,6 +24,8 @@ export enum Config { AbsoluteFileSizeCap = "absolute.file.size.cap", ConcurrentWorkers = "concurrent.workers", LlmConcurrency = "llm.concurrency", + FolderSummaryBatchSize = "folder.summary.batch.size", + FolderSummaryBatchMaxFiles = "folder.summary.batch.max.files", CondenseContextLimit = "condense.context.limit", CondensePromptOverhead = "condense.prompt.overhead", SmallFileDedupThreshold = "small.file.dedup.threshold", From d4b99b12bf5ab7f8c8a99520486cd9d99dcaaef5 Mon Sep 17 00:00:00 2001 From: Dead-Bytes <143434285+Dead-Bytes@users.noreply.github.com> Date: Fri, 22 May 2026 14:59:27 +0530 Subject: [PATCH 28/34] refactor: remove backfillBigFiles phase and update related documentation --- packages/ingest-github/src/pipeline/pull.ts | 14 ---- .../strategies/flat-folder/backfill/README.md | 77 +++++++++--------- .../flat-folder/backfill/big-files.ts | 78 ------------------- .../strategies/flat-folder/big-file/README.md | 21 
++--- .../src/strategies/flat-folder/index.ts | 14 ---- .../strategies/flat-folder/phases/README.md | 4 +- .../flat-folder/phases/process-big-files.ts | 48 ++++++++++-- 7 files changed, 93 insertions(+), 163 deletions(-) delete mode 100644 packages/ingest-github/src/strategies/flat-folder/backfill/big-files.ts diff --git a/packages/ingest-github/src/pipeline/pull.ts b/packages/ingest-github/src/pipeline/pull.ts index 8a4c706..fbc960a 100644 --- a/packages/ingest-github/src/pipeline/pull.ts +++ b/packages/ingest-github/src/pipeline/pull.ts @@ -21,7 +21,6 @@ import { nullProgressContextFactory } from "#src/progress/NullProgressReporter.t import { analyseChangedFiles } from "#src/strategies/flat-folder/analyse-changed.ts"; import { processBigFilesQueue } from "#src/strategies/flat-folder/phases/process-big-files.ts"; import { backfillMissingFields } from "#src/strategies/flat-folder/backfill/fields.ts"; -import { backfillBigFiles } from "#src/strategies/flat-folder/backfill/big-files.ts"; import { FileAnalysisCache } from "#src/strategies/flat-folder/file-analysis-cache.ts"; import { runSelectiveFolderSummary } from "#src/strategies/flat-folder/folder-summary-selective.ts"; import { @@ -204,19 +203,6 @@ export async function runPull( throwIfCancelled(knowledgeId); await backfillMissingFields(metaPaths, fileAnalysisCache, llmCallContext, progressContext); - logger.info(`pull: phase backfill big-files starting`); - throwIfCancelled(knowledgeId); - const backfillBigFilesInput: Parameters[0] = { - knowledgeId, - source, - metaPaths, - progressContext, - }; - if (llmCallContext !== undefined) { - backfillBigFilesInput.llmCallContext = llmCallContext; - } - await backfillBigFiles(backfillBigFilesInput); - progressContext.phaseChanged("folder_analysis"); logger.info(`pull: phase selective folder summary (${affectedFolders.size} folders) starting`); throwIfCancelled(knowledgeId); diff --git a/packages/ingest-github/src/strategies/flat-folder/backfill/README.md 
b/packages/ingest-github/src/strategies/flat-folder/backfill/README.md index dfa3d72..34f744d 100644 --- a/packages/ingest-github/src/strategies/flat-folder/backfill/README.md +++ b/packages/ingest-github/src/strategies/flat-folder/backfill/README.md @@ -1,66 +1,65 @@ # `@bb/ingest-github/src/strategies/flat-folder/backfill` -Post-analysis top-up phases. After Phases 1 and 2 have produced -`CondensedFileAnalysis` JSON on disk, the backfill phases sweep the cache -to fill gaps left by per-file LLM noise or by interrupted big-file runs. -Both are idempotent and skip entries that already look complete. +Post-analysis top-up. After Phases 1 and 2 have produced +`CondensedFileAnalysis` JSON on disk, this phase sweeps the in-memory +cache to fill extended-analysis fields the main per-file prompt left +empty. Idempotent — entries that already look complete are skipped +without an LLM call. + +The big-file backfill phase that used to live here was removed: the +new chunk-task-queue model in `phases/process-big-files.ts` handles +crash recovery directly via the per-chunk disk cache and `inspect()`, +and same-run condense failures are now attempted twice in-place (one retry) before +being marked failed. ## Files -- `fields.ts` — Phase 3. `backfillMissingFields(metaPaths, llmCallContext?, progressContext?)` - iterates every condensed entry via `iterateCondensed`, computes which - extended-analysis fields are missing (`keywords`, `ontologyConcepts`, - `businessEntities`, `systemCapabilities`, `sideEffects`, - `configDependencies`, `dataFlowDirection`, `integrationSurface`, - `contractsProvided`, `contractsConsumed`, `sectionMap`), and asks one - LLM call per file to fill only the missing slots. The response is - validated and normalised (`pickStringArray`, `pickSections`) before - being written back via `saveCondensed`. Entries with nothing missing - are skipped without an LLM call.
When `progressContext` is present - this phase opens a growing-total reporter (`subPhase: "backfill"`) - because `iterateCondensed`'s size is not known up front. -- `big-files.ts` — Phase 4. `backfillBigFiles({knowledgeId, repoDir, -metaPaths, llmCallContext?, progressContext?})` re-reads - `bigFiles.json`, skips `reason === "too-large"`, and for each - non-complete entry (per `inspect`) re-runs `processBigFile` against - the file on disk so the condensed JSON is rebuilt from cached chunks - where possible. When `progressContext` is present this phase opens a - fixed-total reporter (`subPhase: "backfill:big_files"`, sized by - `bigFiles.json`) and forwards itself into `processBigFile` so per-file - chunk pulses also surface. +- `fields.ts` — Phase 3. `backfillMissingFields(metaPaths, cache, llmCallContext?, progressContext?)` + iterates every condensed entry from the shared `FileAnalysisCache`, + computes which extended-analysis fields are missing (`keywords`, + `ontologyConcepts`, `businessEntities`, `systemCapabilities`, + `sideEffects`, `configDependencies`, `dataFlowDirection`, + `integrationSurface`, `contractsProvided`, `contractsConsumed`, + `sectionMap`), and asks one LLM call per file to fill only the + missing slots. The response is validated and normalised + (`pickStringArray`, `pickSections`) before being written back via + `saveCondensed` **and** mirrored into the cache via `cache.set(entry)` + so downstream phases (folder summary, graph store) see the updated + entry without re-reading disk. Entries with nothing missing are + skipped without an LLM call. Progress reporter is fixed-total sized + by `cache.size`. 
## Public interfaces -- `backfillMissingFields(metaPaths, llmCallContext?, progressContext?): Promise<{ updated, failed }>` -- `backfillBigFiles(input: BackfillBigFilesInput): Promise` - — `BackfillBigFilesInput` carries an optional `llmCallContext?: AskLlmOptions` that the inner `processBigFile` call uses to forward per-job LLM credentials, and an optional `progressContext?: ProgressContext` for the per-phase reporter described above. +- `backfillMissingFields(metaPaths, cache, llmCallContext?, progressContext?): Promise<{ updated, failed }>` -Both return phase-summary counters consumed by `createFlatFolderStrategy` +Returns phase-summary counters consumed by `createFlatFolderStrategy` to roll up into the strategy result. ## Data ownership -These phases own no new on-disk artifacts. They mutate existing condensed -JSON in place via `saveCondensed`, and (Phase 4) drive `processBigFile` to -refresh the chunk and condensed caches under `big-file/storage.ts`. +This phase owns no new on-disk artifacts. It mutates existing +condensed JSON in place via `saveCondensed` and mirrors the same +mutation into `FileAnalysisCache`. ## Invariants - Idempotent: a second run is a no-op once every entry passes the completeness check. - Per-file LLM failure is logged and counted, never thrown. The phase - continues to the next entry. -- LLM output is untrusted: missing slots are filled only when the response - yields a non-empty value of the expected shape; partial responses leave - unfilled slots for a future pass. -- Phase 4 never touches `reason === "too-large"` entries — those stay as - stubs forever. + continues to the next entry. Only `LlmConfigError` / `LlmError` + propagate (treated as job-fatal upstream). +- LLM output is untrusted: missing slots are filled only when the + response yields a non-empty value of the expected shape; partial + responses leave unfilled slots for a future pass. 
+- Cache and disk stay in lockstep — every `saveCondensed` is paired + with a `cache.set(entry)` in the same code path. ## External dependencies `@bb/llm` (`askJsonLLM`), `@bb/logger`, `@bb/mongo` (types only — `FileAnalysis`, `FileAnalysisSection`), the sibling -`flat-folder/big-file/` cache layer, and the prompts under +`flat-folder/file-analysis-cache.ts`, and the prompts under `flat-folder/prompts/backfill.ts`. ## Tier diff --git a/packages/ingest-github/src/strategies/flat-folder/backfill/big-files.ts b/packages/ingest-github/src/strategies/flat-folder/backfill/big-files.ts deleted file mode 100644 index 587808c..0000000 --- a/packages/ingest-github/src/strategies/flat-folder/backfill/big-files.ts +++ /dev/null @@ -1,78 +0,0 @@ -import { logger } from "@bb/logger"; -import type { AskLlmOptions } from "@bb/llm"; -import { LlmConfigError, LlmError } from "@bb/errors"; -import type { MetaPaths } from "#src/types/meta-paths.ts"; -import type { SourceReader } from "#src/types/pipeline.ts"; -import type { ProgressContext } from "#src/progress/types.ts"; -import { readBigFiles } from "#src/strategies/flat-folder/big-file/detector.ts"; -import { inspect } from "#src/strategies/flat-folder/big-file/cache.ts"; -import { processBigFile } from "#src/strategies/flat-folder/big-file/index.ts"; - -export interface BackfillBigFilesInput { - knowledgeId: string; - source: SourceReader; - metaPaths: MetaPaths; - llmCallContext?: AskLlmOptions; - progressContext?: ProgressContext; -} - -export interface BackfillBigFilesResult { - reCondensed: number; - failed: number; -} - -export async function backfillBigFiles(input: BackfillBigFilesInput): Promise { - const entries = await readBigFiles(input.metaPaths); - let reCondensed = 0; - let failed = 0; - const reporter = input.progressContext?.reporter({ - phase: "file_analysis", - subPhase: "backfill:big_files", - total: { kind: "fixed", total: entries.length }, - }); - await reporter?.start(); - try { - for (const entry of entries) 
{ - if (entry.reason === "too-large") { - reporter?.increment(1, { fileName: entry.relativePath }); - continue; - } - const status = await inspect(input.metaPaths, entry.relativePath); - if (status === "complete") { - reporter?.increment(1, { fileName: entry.relativePath }); - continue; - } - try { - const content = await input.source.readFile(entry.relativePath); - if (content.length === 0) { - failed += 1; - logger.warn(`phase4: empty content for ${entry.relativePath}; skipping`); - reporter?.increment(1, { fileName: entry.relativePath }); - continue; - } - await processBigFile({ - knowledgeId: input.knowledgeId, - metaPaths: input.metaPaths, - relativePath: entry.relativePath, - content, - sizeBytes: entry.sizeBytes, - ...(input.llmCallContext !== undefined ? { llmCallContext: input.llmCallContext } : {}), - ...(input.progressContext !== undefined ? { progressContext: input.progressContext } : {}), - }); - reCondensed += 1; - } catch (cause: unknown) { - if (cause instanceof LlmConfigError || cause instanceof LlmError) { - throw cause; - } - failed += 1; - const msg = cause instanceof Error ? cause.message : String(cause); - logger.warn(`phase4: re-condense failed for ${entry.relativePath}: ${msg}`); - } - reporter?.increment(1, { fileName: entry.relativePath }); - } - logger.info(`phase4 done: reCondensed=${reCondensed} failed=${failed}`); - return { reCondensed, failed }; - } finally { - reporter?.stop(); - } -} diff --git a/packages/ingest-github/src/strategies/flat-folder/big-file/README.md b/packages/ingest-github/src/strategies/flat-folder/big-file/README.md index 3e4e6ef..264d8ea 100644 --- a/packages/ingest-github/src/strategies/flat-folder/big-file/README.md +++ b/packages/ingest-github/src/strategies/flat-folder/big-file/README.md @@ -27,8 +27,11 @@ depending on chunk count and prompt budget. - `storage.ts` — on-disk cache (chunk JSON, manifest, condensed analysis) + `iterateCondensed(metaPaths)` async iterator used by Phase 5. 
- `cache.ts` — `inspect(metaPaths, relativePath)` returns `complete`, - `stale-condensed`, or `missing`. Used by Phase 2 to short-circuit and by - Phase 4 to find candidates for cheap re-condense. + `stale-condensed`, or `missing`. Used by Phase 2 to short-circuit + already-finished big files on resume. The chunk task queue then + re-uses cached chunks via `loadChunkIfPresent` and re-runs condense + to recover any `stale-condensed` files — this is the crash-recovery + pathway that replaced the deleted Phase 4 backfill. - `index.ts` — `processBigFile({knowledgeId, metaPaths, relativePath, content, sizeBytes, llmCallContext?, progressContext?})`. Sequential per file (chunk-level concurrency inside). Persists every intermediate artifact, @@ -49,17 +52,17 @@ the storage / cache primitives) are consumed by **two** drivers: - `processBigFile` (`index.ts`) — legacy serial driver. One big file at a time, chunks-within-file parallel under `Config.BigFileConcurrency`, - followed by a blocking condense. Used today by the pull-path - (`pipeline/pull.ts`) via `processBigFilesQueue` and by the Phase 4 - backfill. + followed by a blocking condense. Used today only by the pull-path + (`pipeline/pull.ts`) via `processBigFilesQueue`. - `analyseBigFiles` (`phases/process-big-files.ts`) — manifest-driven chunk-task queue used by the main strategy entry. Every chunk of every big file is an independent task scheduled through a strategy-wide shared `ConcurrencyLimiter`. As soon as a file's last chunk lands, - that file's `condenseChunks` is scheduled through the same limiter — - multiple condenses run in parallel with chunks of slower files. - Reuses `splitFileIntoChunks`, `analyzeChunk`, `condenseChunks`, and - the storage helpers without modification. + that file's `condenseChunks` is scheduled through the same limiter + (with one in-place retry on transient failure) — multiple condenses + run in parallel with chunks of slower files. 
Reuses + `splitFileIntoChunks`, `analyzeChunk`, `condenseChunks`, and the + storage helpers without modification. ## Invariants diff --git a/packages/ingest-github/src/strategies/flat-folder/index.ts b/packages/ingest-github/src/strategies/flat-folder/index.ts index c23f42c..e70396f 100644 --- a/packages/ingest-github/src/strategies/flat-folder/index.ts +++ b/packages/ingest-github/src/strategies/flat-folder/index.ts @@ -10,7 +10,6 @@ import { scanAndClassify } from "./phases/scan-and-classify.ts"; import { analyseSmallFiles } from "./phases/analyse-small.ts"; import { analyseBigFiles } from "./phases/process-big-files.ts"; import { backfillMissingFields } from "./backfill/fields.ts"; -import { backfillBigFiles } from "./backfill/big-files.ts"; import { FileAnalysisCache } from "./file-analysis-cache.ts"; import { runFolderSummaryPhase } from "./folder-summary.ts"; import { makeRepoSummaryEnvelope, persistRepoSummary, summariseRepo } from "./repo-summary.ts"; @@ -96,19 +95,6 @@ export function createFlatFolderStrategy(deps: FlatFolderStrategyDeps): IngestSt throwIfCancelled(knowledgeId); await backfillMissingFields(metaPaths, fileAnalysisCache, llmCallContext, progressContext); - logger.info(`flat-folder: phase4 (backfill big files) starting`); - throwIfCancelled(knowledgeId); - const phase4Input: Parameters[0] = { - knowledgeId, - source, - metaPaths, - progressContext, - }; - if (llmCallContext !== undefined) { - phase4Input.llmCallContext = llmCallContext; - } - await backfillBigFiles(phase4Input); - progressContext.phaseChanged("folder_analysis"); logger.info(`flat-folder: phase5 (folder summaries) starting`); throwIfCancelled(knowledgeId); diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/README.md b/packages/ingest-github/src/strategies/flat-folder/phases/README.md index cdcfddb..6301e38 100644 --- a/packages/ingest-github/src/strategies/flat-folder/phases/README.md +++ b/packages/ingest-github/src/strategies/flat-folder/phases/README.md 
@@ -71,8 +71,8 @@ scanAndClassify ↓ FileAnalysisCache.loadAll (one parallel readdir+readFile pass) ↓ -backfillMissingFields → backfillBigFiles → folderSummary → repoSummary → storeFlatAnalysis - (cache read+write) (no cache) (cache read) (cache read) +backfillMissingFields → folderSummary → repoSummary → storeFlatAnalysis + (cache read+write) (cache read) (cache read) ``` `FileAnalysisCache` is a `Map` loaded diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts b/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts index 70d5102..1577849 100644 --- a/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts +++ b/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts @@ -5,7 +5,7 @@ import { getConfigValue } from "@bb/config"; import type { AskLlmOptions } from "@bb/llm"; import { LlmConfigError, LlmError } from "@bb/errors"; import type { MetaPaths } from "#src/types/meta-paths.ts"; -import type { SourceReader } from "#src/types/pipeline.ts"; +import type { AnalyzedFileResult, SourceReader } from "#src/types/pipeline.ts"; import type { ProgressContext } from "#src/progress/types.ts"; import type { ConcurrencyLimiter } from "#src/pipeline/concurrency.ts"; import type { ChunkAnalysisResult, FileChunk, HugeFileManifest } from "#src/types/big-file.ts"; @@ -20,6 +20,9 @@ import { loadChunkIfPresent, saveChunk, saveCondensed, saveManifest } from "#src import { processBigFile } from "#src/strategies/flat-folder/big-file/index.ts"; import type { ScanManifest, ScanManifestEntry } from "#src/strategies/flat-folder/scan-manifest.ts"; +const CONDENSE_MAX_ATTEMPTS = 2; +const CONDENSE_RETRY_BACKOFF_MS = 2000; + export interface ProcessBigFilesInput { knowledgeId: string; source: SourceReader; @@ -239,9 +242,37 @@ export async function analyseBigFiles(input: AnalyseBigFilesInput): Promise { throwIfCancelled(input.knowledgeId); - try { - const merged = await 
condenseChunks(state.entry.relativePath, definedResults, input.llmCallContext); + let merged: AnalyzedFileResult | null = null; + for (let attempt = 1; attempt <= CONDENSE_MAX_ATTEMPTS; attempt += 1) { + try { + merged = await condenseChunks(state.entry.relativePath, definedResults, input.llmCallContext); + break; + } catch (cause: unknown) { + if (cause instanceof CancellationError) { + throw cause; + } + if (cause instanceof LlmConfigError || cause instanceof LlmError) { + throw cause; + } + if (attempt < CONDENSE_MAX_ATTEMPTS) { + logger.warn( + `analyse-big: condense attempt ${attempt}/${CONDENSE_MAX_ATTEMPTS} failed for ${state.entry.relativePath}; retrying: ${describe(cause)}`, + ); + await sleep(CONDENSE_RETRY_BACKOFF_MS); + continue; + } + failed += 1; + logger.warn( + `analyse-big: condense failed after ${CONDENSE_MAX_ATTEMPTS} attempts for ${state.entry.relativePath}: ${describe(cause)}`, + ); + } + } + if (merged === null) { + condenseReporter?.increment(1, { fileName: state.entry.relativePath }); + return; + } + try { const chunkInputTokens = definedResults.reduce((acc, r) => acc + (r.tokenUsage?.inputTokens ?? 0), 0); const chunkOutputTokens = definedResults.reduce((acc, r) => acc + (r.tokenUsage?.outputTokens ?? 0), 0); const chunkCostUsd = definedResults.reduce((acc, r) => acc + (r.tokenUsage?.costUsd ?? 
0), 0); @@ -282,11 +313,8 @@ export async function analyseBigFiles(input: AnalyseBigFilesInput): Promise { + return new Promise((resolve) => { + setTimeout(resolve, ms); + }); +} From 1afd5d68e0ff3792744df8f601baab1825251e53 Mon Sep 17 00:00:00 2001 From: Dead-Bytes <143434285+Dead-Bytes@users.noreply.github.com> Date: Fri, 22 May 2026 17:25:07 +0530 Subject: [PATCH 29/34] Refactor backfill process to use concurrency limiter and batch Neo4j upserts --- packages/config/src/schema.ts | 7 + packages/ingest-github/src/pipeline/README.md | 27 ++- packages/ingest-github/src/pipeline/pull.ts | 2 +- packages/ingest-github/src/pipeline/scan.ts | 197 ++++++++++++++-- .../src/pipeline/skip-decisions/README.md | 46 +++- .../src/pipeline/skip-decisions/decider.ts | 119 ++++++---- .../src/strategies/flat-folder/README.md | 160 ++++++++----- .../strategies/flat-folder/backfill/README.md | 27 ++- .../strategies/flat-folder/backfill/fields.ts | 52 +++-- .../src/strategies/flat-folder/index.ts | 3 +- .../strategies/flat-folder/phases/README.md | 23 +- .../flat-folder/phases/scan-and-classify.ts | 12 + .../flat-folder/phases/store-flat-analysis.ts | 121 ++++++---- packages/ingest-github/src/types/README.md | 16 +- packages/ingest-github/src/types/pipeline.ts | 35 +++ packages/neo4j/README.md | 36 +-- packages/neo4j/src/client.ts | 29 +++ packages/neo4j/src/files.ts | 221 +++++++++++++++++- packages/neo4j/src/folder.ts | 76 +++++- packages/neo4j/src/index.ts | 4 +- packages/types/src/config.ts | 1 + 21 files changed, 992 insertions(+), 222 deletions(-) diff --git a/packages/config/src/schema.ts b/packages/config/src/schema.ts index 1f7a021..d5bae9d 100644 --- a/packages/config/src/schema.ts +++ b/packages/config/src/schema.ts @@ -44,6 +44,7 @@ export const configSchema = z "llm.concurrency": z.number().int().positive().default(29), "folder.summary.batch.size": z.number().int().positive().default(10), "folder.summary.batch.max.files": z.number().int().positive().default(15), + 
"neo4j.batch.size": z.number().int().positive().default(50), "condense.context.limit": z.number().int().positive().default(12000), "condense.prompt.overhead": z.number().int().nonnegative().default(1500), "small.file.dedup.threshold": z.number().int().positive().default(3), @@ -87,6 +88,7 @@ export type ConfigValueMap = { [Config.LlmConcurrency]: number; [Config.FolderSummaryBatchSize]: number; [Config.FolderSummaryBatchMaxFiles]: number; + [Config.Neo4jBatchSize]: number; [Config.CondenseContextLimit]: number; [Config.CondensePromptOverhead]: number; [Config.SmallFileDedupThreshold]: number; @@ -144,6 +146,7 @@ export const HINTS: Readonly> = { [Config.LlmConcurrency]: "bytebell set llm.concurrency ", [Config.FolderSummaryBatchSize]: "bytebell set folder.summary.batch.size ", [Config.FolderSummaryBatchMaxFiles]: "bytebell set folder.summary.batch.max.files ", + [Config.Neo4jBatchSize]: "bytebell set neo4j.batch.size ", [Config.CondenseContextLimit]: "bytebell set condense.context.limit ", [Config.CondensePromptOverhead]: "bytebell set condense.prompt.overhead ", [Config.SmallFileDedupThreshold]: "bytebell set small.file.dedup.threshold ", @@ -210,6 +213,8 @@ export function readField(cfg: BytebellConfig, key: K): Config return cfg["folder.summary.batch.size"] as ConfigValue; case Config.FolderSummaryBatchMaxFiles: return cfg["folder.summary.batch.max.files"] as ConfigValue; + case Config.Neo4jBatchSize: + return cfg["neo4j.batch.size"] as ConfigValue; case Config.CondenseContextLimit: return cfg["condense.context.limit"] as ConfigValue; case Config.CondensePromptOverhead: @@ -285,6 +290,8 @@ export function writeField(cfg: BytebellConfig, key: K, value: return { ...cfg, "folder.summary.batch.size": value as number }; case Config.FolderSummaryBatchMaxFiles: return { ...cfg, "folder.summary.batch.max.files": value as number }; + case Config.Neo4jBatchSize: + return { ...cfg, "neo4j.batch.size": value as number }; case Config.CondenseContextLimit: return { ...cfg, 
"condense.context.limit": value as number }; case Config.CondensePromptOverhead: diff --git a/packages/ingest-github/src/pipeline/README.md b/packages/ingest-github/src/pipeline/README.md index 0c57d78..ae9da32 100644 --- a/packages/ingest-github/src/pipeline/README.md +++ b/packages/ingest-github/src/pipeline/README.md @@ -28,7 +28,7 @@ Domain (sub-folder of `@bb/ingest-github`). - `skip-decisions/` — LLM-backed unknown-extension gate. See `skip-decisions/README.md`. Active when `Config.SkipDecisionEnabled = true` (default). Consumed by `scan.ts` via the optional `skipDecider` - dep; built by `classifyAndAnalyseSmall` if not injected. + dep; built by `scanAndClassify` (Phase 1) if not injected. - `disk-source-reader.ts` — `createDiskSourceReader({ repoDir, commitHash })` returns a `SourceReader` that wraps `scanRepository` + `node:fs.readFile`. The default reader the open-source binary always uses, unless the caller @@ -40,9 +40,26 @@ true` (default). Consumed by `scan.ts` via the optional `skipDecider` enters the big-file phase). Both thresholds are config-driven — no magic numbers in this file. `deps.llmCallContext` (when present) is forwarded into every `SkipDeciderInput` so the LLM branch of the - unknown-extension gate uses per-job credentials. `readScannedFile` - re-reads a file by absolute path for the big-file phase which streams - content lazily. + unknown-extension gate uses per-job credentials. + + **Two scan modes:** + - **Two-pass (default for the flat-folder strategy)** — activated when + `deps.skipDecider` AND `deps.limiter` are both supplied. Pass 1 walks + the tree calling `decider.decideStatic(...)`; static-resolved files + yield immediately, "needs LLM" files go into a pending buffer with + their content. Pass 2 dedupes pending entries by `ext:` or + `filename:`, dispatches one `decider.decideAndDeferSave(...)` per + unique key through the shared limiter via `Promise.all`, then calls + `decider.persist()` exactly once. 
Pass 3 drains pending — every + `decideStatic` call is now a cache hit, so the drain is sync at the + decider boundary and yields each kept file with its buffered content. + - **Legacy inline (`walk()`)** — used when `deps.limiter` is omitted (e.g. + a custom `SourceFactory` consumer that didn't opt in). Inline `await +deps.skipDecider.decide(input)` per file. Same semantics as before this + refactor; preserved for backwards compatibility. + + `readScannedFile` re-reads a file by absolute path for the big-file phase + which streams content lazily. - `run.ts` — `createPipelineRunner({ reposRootDir, strategy, sourceFactory?, progressContextFactory? })` builds an `IngestRunnerDeps`. GitHub payloads run: branch resolve, source-reader construction, strategy execute, commit persistence. Local @@ -76,7 +93,7 @@ archiveSink?}` and `runPull` skips `syncRepository` + `materialiseEndpoints` (open-source default), the legacy git-based path runs. Either path produces the same downstream pipeline: snapshot prior version, `analyseChangedFiles` (now reading via `SourceReader`), - `processBigFilesQueue`, `backfillMissingFields`, `backfillBigFiles`, + `processBigFilesQueue`, `backfillMissingFields`, `runSelectiveFolderSummary`, `summariseRepo`, `storePullAnalysis`. 
Mirrors the index-side strategy orchestrator for progress: builds one `ProgressContext` per job from the optional `progressContextFactory` diff --git a/packages/ingest-github/src/pipeline/pull.ts b/packages/ingest-github/src/pipeline/pull.ts index fbc960a..be344a6 100644 --- a/packages/ingest-github/src/pipeline/pull.ts +++ b/packages/ingest-github/src/pipeline/pull.ts @@ -201,7 +201,7 @@ export async function runPull( logger.info(`pull: phase backfill fields starting`); throwIfCancelled(knowledgeId); - await backfillMissingFields(metaPaths, fileAnalysisCache, llmCallContext, progressContext); + await backfillMissingFields(metaPaths, fileAnalysisCache, limiter, llmCallContext, progressContext); progressContext.phaseChanged("folder_analysis"); logger.info(`pull: phase selective folder summary (${affectedFolders.size} folders) starting`); diff --git a/packages/ingest-github/src/pipeline/scan.ts b/packages/ingest-github/src/pipeline/scan.ts index 02d17ea..fda9236 100644 --- a/packages/ingest-github/src/pipeline/scan.ts +++ b/packages/ingest-github/src/pipeline/scan.ts @@ -5,7 +5,8 @@ import { getConfigValue } from "@bb/config"; import type { AskLlmOptions } from "@bb/llm"; import { logger } from "@bb/logger"; import { SKIP_DIRS, looksBinary, passesPathFilters } from "./filters.ts"; -import type { ScanEntry, SkipDecider } from "#src/types/pipeline.ts"; +import type { ConcurrencyLimiter } from "./concurrency.ts"; +import type { ScanEntry, SkipDecider, SkipDeciderInput } from "#src/types/pipeline.ts"; interface ScanLimits { absoluteCap: number; @@ -15,18 +16,7 @@ interface ScanLimits { export interface ScanRepositoryDeps { skipDecider?: SkipDecider; llmCallContext?: AskLlmOptions; -} - -export async function* scanRepository(rootDir: string, deps: ScanRepositoryDeps = {}): AsyncGenerator { - const limits: ScanLimits = { - absoluteCap: getConfigValue(Config.AbsoluteFileSizeCap), - bigFileLineThreshold: getConfigValue(Config.BigFileLineThreshold), - }; - const counts = { 
acceptStatic: 0, acceptLlm: 0, rejectStatic: 0, rejectLlm: 0, oversized: 0, binary: 0 }; - yield* walk(rootDir, rootDir, limits, deps, counts); - logger.info( - `scan: acceptStatic=${counts.acceptStatic} acceptLlm=${counts.acceptLlm} rejectStatic=${counts.rejectStatic} rejectLlm=${counts.rejectLlm} oversized=${counts.oversized} binary=${counts.binary}`, - ); + limiter?: ConcurrencyLimiter; } interface ScanCounts { @@ -38,6 +28,44 @@ interface ScanCounts { binary: number; } +interface PendingFile { + relativePath: string; + absolutePath: string; + sizeBytes: number; + content: string; + ext: string; + input: SkipDeciderInput; +} + +function newCounts(): ScanCounts { + return { acceptStatic: 0, acceptLlm: 0, rejectStatic: 0, rejectLlm: 0, oversized: 0, binary: 0 }; +} + +function logCounts(counts: ScanCounts): void { + logger.info( + `scan: acceptStatic=${counts.acceptStatic} acceptLlm=${counts.acceptLlm} rejectStatic=${counts.rejectStatic} rejectLlm=${counts.rejectLlm} oversized=${counts.oversized} binary=${counts.binary}`, + ); +} + +export async function* scanRepository(rootDir: string, deps: ScanRepositoryDeps = {}): AsyncGenerator { + const limits: ScanLimits = { + absoluteCap: getConfigValue(Config.AbsoluteFileSizeCap), + bigFileLineThreshold: getConfigValue(Config.BigFileLineThreshold), + }; + + // Two-pass parallel mode requires both a skip-decider AND a limiter so that + // pending LLM resolutions can be deduplicated and dispatched concurrently. + // Without either, fall back to the inline-await walk that's been here all along. 
+ if (deps.skipDecider !== undefined && deps.limiter !== undefined) { + yield* twoPassScan(rootDir, limits, deps.skipDecider, deps.limiter, deps); + return; + } + + const counts = newCounts(); + yield* walk(rootDir, rootDir, limits, deps, counts); + logCounts(counts); +} + async function* walk( rootDir: string, currentDir: string, @@ -82,7 +110,7 @@ async function* walk( continue; } if (deps.skipDecider !== undefined) { - const deciderInput: Parameters[0] = { relativePath, absolutePath: abs, ext }; + const deciderInput: SkipDeciderInput = { relativePath, absolutePath: abs, ext }; if (deps.llmCallContext !== undefined) { deciderInput.llmCallContext = deps.llmCallContext; } @@ -113,6 +141,147 @@ async function* walk( } } +async function* twoPassScan( + rootDir: string, + limits: ScanLimits, + decider: SkipDecider, + limiter: ConcurrencyLimiter, + deps: ScanRepositoryDeps, +): AsyncGenerator { + const counts = newCounts(); + const pending: PendingFile[] = []; + + // Pass 1: walk + categorize. Static-decided files yield immediately; + // "needs LLM" files go into `pending` for batch resolution. + yield* walkAndCategorize(rootDir, rootDir, limits, deps, decider, counts, pending); + + // Pass 2: dedupe pending by decision key (extension or filename), schedule + // one LLM call per unique key through the shared limiter, then persist the + // decider's cache once. + if (pending.length > 0) { + const unique = new Map(); + for (const p of pending) { + const key = decisionKey(p); + if (!unique.has(key)) { + unique.set(key, p.input); + } + } + logger.info(`scan: resolving ${unique.size} unique skip-decision keys for ${pending.length} pending files`); + await Promise.all( + Array.from(unique.values()).map((input) => limiter(() => decider.decideAndDeferSave(input))), + ); + decider.persist(); + } + + // Pass 3: drain pending. Every decideStatic call is now a cache hit. 
+ for (const p of pending) { + const decision = decider.decideStatic(p.input); + if (decision === "reject-static" || decision === null) { + counts.rejectStatic += 1; + continue; + } + if (decision === "reject-llm") { + counts.rejectLlm += 1; + continue; + } + if (decision === "accept-llm") { + counts.acceptLlm += 1; + } else { + counts.acceptStatic += 1; + } + yield { + kind: "file", + relativePath: p.relativePath, + absolutePath: p.absolutePath, + sizeBytes: p.sizeBytes, + content: p.content, + }; + } + + logCounts(counts); +} + +async function* walkAndCategorize( + rootDir: string, + currentDir: string, + limits: ScanLimits, + deps: ScanRepositoryDeps, + decider: SkipDecider, + counts: ScanCounts, + pending: PendingFile[], +): AsyncGenerator { + const dir = await opendir(currentDir); + for await (const entry of dir) { + const abs = path.join(currentDir, entry.name); + if (entry.isDirectory()) { + if (SKIP_DIRS.has(entry.name)) { + continue; + } + yield* walkAndCategorize(rootDir, abs, limits, deps, decider, counts, pending); + continue; + } + if (!entry.isFile()) { + continue; + } + if (!passesPathFilters(entry.name, path.extname(entry.name))) { + counts.rejectStatic += 1; + continue; + } + const sizeBytes = (await stat(abs)).size; + const relativePath = path.relative(rootDir, abs); + const ext = path.extname(entry.name).toLowerCase(); + if (sizeBytes > limits.absoluteCap) { + counts.oversized += 1; + yield { kind: "oversized", relativePath, absolutePath: abs, sizeBytes }; + continue; + } + const buf = await readFile(abs); + if (looksBinary(buf)) { + counts.binary += 1; + continue; + } + const content = buf.toString("utf8"); + if (countLines(content) > limits.bigFileLineThreshold) { + counts.oversized += 1; + yield { kind: "oversized", relativePath, absolutePath: abs, sizeBytes }; + continue; + } + const deciderInput: SkipDeciderInput = { relativePath, absolutePath: abs, ext }; + if (deps.llmCallContext !== undefined) { + deciderInput.llmCallContext = 
deps.llmCallContext; + } + const sync = decider.decideStatic(deciderInput); + if (sync === "reject-static") { + counts.rejectStatic += 1; + continue; + } + if (sync === "reject-llm") { + counts.rejectLlm += 1; + continue; + } + if (sync === "accept-llm") { + counts.acceptLlm += 1; + yield { kind: "file", relativePath, absolutePath: abs, sizeBytes, content }; + continue; + } + if (sync === "accept") { + counts.acceptStatic += 1; + yield { kind: "file", relativePath, absolutePath: abs, sizeBytes, content }; + continue; + } + // sync === null → needs LLM. Defer to pass 2. + pending.push({ relativePath, absolutePath: abs, sizeBytes, content, ext, input: deciderInput }); + } +} + +function decisionKey(p: PendingFile): string { + if (p.ext.length > 0) { + return `ext:${p.ext}`; + } + const segments = p.relativePath.split("/"); + return `filename:${segments[segments.length - 1] ?? p.relativePath}`; +} + function countLines(content: string): number { if (content.length === 0) { return 0; diff --git a/packages/ingest-github/src/pipeline/skip-decisions/README.md b/packages/ingest-github/src/pipeline/skip-decisions/README.md index f4e0273..4a6fa2f 100644 --- a/packages/ingest-github/src/pipeline/skip-decisions/README.md +++ b/packages/ingest-github/src/pipeline/skip-decisions/README.md @@ -17,6 +17,36 @@ single-tenant public layout. 8. Persist verdict to ~/.bytebell/llmDecisions.json. LLM failure → reject + cache the rejection. ``` +Steps 1-6 are pure CPU + cached lookup — they run synchronously via +`decideStatic`. Step 7 is the slow LLM branch; `decide` performs it +inline, while `decideAndDeferSave` performs it without flushing the +cache to disk so a batched caller can `persist()` once at the end of +its batch. 
+ +## Public methods (`SkipDecider`) + +```ts +interface SkipDecider { + decide(input): Promise; // legacy single-shot path + decideStatic(input): SkipDecision | null; // sync; null = needs LLM + decideAndDeferSave(input): Promise; // LLM call, no disk save + persist(): void; // flush cache to disk once +} +``` + +- `decide` — the original single-shot API. Calls `decideStatic`; if that + returns `null`, runs the LLM call and `persist()`s the cache. Used by + the legacy `walk()` in `scan.ts` when no shared limiter is passed + (e.g. custom `SourceFactory` consumers that don't opt into two-pass). +- `decideStatic` — synchronous. Returns the resolved `SkipDecision` for + steps 1-6; returns `null` to signal "would need an LLM call". Used by + the two-pass scan to categorise files without blocking the walk. +- `decideAndDeferSave` — runs the LLM call and mutates the in-memory + cache but does **not** flush to disk. Scan calls this concurrently + for unique extension/filename keys under a shared limiter; the disk + write happens once via `persist()` after the batch. +- `persist` — best-effort cache flush; swallows I/O errors. + ## Files - `seed.ts` — loads the four bundled JSON files (directory/filename/pattern/extension lists) @@ -36,7 +66,10 @@ single-tenant public layout. factory time; when disabled the decider degrades to "accept everything past the static blocklist". The LLM branch forwards `SkipDeciderInput.llmCallContext` (when set by the runner) into - `askYesNoLLM` so per-job credentials reach the decision call. + `askYesNoLLM` so per-job credentials reach the decision call. The four + methods (`decide`, `decideStatic`, `decideAndDeferSave`, `persist`) share + one internal `staticDecision()` helper so the seed-list + cache-lookup + branch is defined exactly once. - `seed-data/` — the five JSON files copied from kube's `shared/`: `directoryIgnore.json`, `filenameIgnore.json`, `ignorePatterns.json`, `extensions.json`, `llmDecisionsBase.json`. 
`llmDecisionsBase.json` is @@ -56,8 +89,15 @@ single-tenant public layout. beyond reading the cache file once at factory time. Only the LLM branch reads file content from disk, and even that is bounded by `Config.SkipDecisionMaxCharsForLlm`. -- Every LLM verdict is flushed to disk immediately so a crash mid-scan does - not lose decisions made earlier in the run. +- `decide` flushes to disk immediately after each LLM verdict — same + semantics as before this refactor, so crash mid-scan does not lose + decisions made earlier in the run when the legacy inline path is in use. +- `decideAndDeferSave` does **not** flush; the batched caller (two-pass + scan) is responsible for calling `persist()` exactly once after the + parallel batch resolves. This avoids racing tmp/rename writes when many + unique extensions resolve concurrently. Crash recovery in two-pass mode + is acceptable because the batch is short and re-running the scan + re-resolves the same decisions. - LLM failure defaults to reject and caches the rejection — matches kube's one-shot-rule behavior. Users can hand-edit the cache to revisit. - The decider is process-local: tests may construct one with `cachePath` diff --git a/packages/ingest-github/src/pipeline/skip-decisions/decider.ts b/packages/ingest-github/src/pipeline/skip-decisions/decider.ts index 455f633..50185e8 100644 --- a/packages/ingest-github/src/pipeline/skip-decisions/decider.ts +++ b/packages/ingest-github/src/pipeline/skip-decisions/decider.ts @@ -29,6 +29,11 @@ export interface SkipDeciderDeps { cachePath?: string; } +interface StaticDecisionContext { + filename: string; + segments: string[]; +} + export function makeSkipDecider(deps: SkipDeciderDeps = {}): SkipDecider { const enabled = getConfigValue(Config.SkipDecisionEnabled); const cachePath = deps.cachePath ?? 
defaultCachePath(); @@ -37,54 +42,90 @@ export function makeSkipDecider(deps: SkipDeciderDeps = {}): SkipDecider { logCacheSummary(cache); } - return { - async decide(input: SkipDeciderInput): Promise { - const segments = input.relativePath.split("/"); - const filename = segments[segments.length - 1] ?? input.relativePath; - for (const segment of segments.slice(0, -1)) { - if (SEED_DIRECTORIES.has(segment)) { - return "reject-static"; - } - } - if (SEED_FILENAMES.has(filename)) { - return "reject-static"; - } - if (input.ext.length > 0 && SEED_EXTENSIONS.has(input.ext)) { - return "reject-static"; - } - if (matchesAnyGlob(filename)) { + function contextFor(input: SkipDeciderInput): StaticDecisionContext { + const segments = input.relativePath.split("/"); + const filename = segments[segments.length - 1] ?? input.relativePath; + return { filename, segments }; + } + + function staticDecision(input: SkipDeciderInput): SkipDecision | null { + const { filename, segments } = contextFor(input); + for (const segment of segments.slice(0, -1)) { + if (SEED_DIRECTORIES.has(segment)) { return "reject-static"; } + } + if (SEED_FILENAMES.has(filename)) { + return "reject-static"; + } + if (input.ext.length > 0 && SEED_EXTENSIONS.has(input.ext)) { + return "reject-static"; + } + if (matchesAnyGlob(filename)) { + return "reject-static"; + } - if (input.ext.length > 0 && KNOWN_LANGUAGE_EXTENSIONS.has(input.ext)) { - return "accept"; - } + if (input.ext.length > 0 && KNOWN_LANGUAGE_EXTENSIONS.has(input.ext)) { + return "accept"; + } - if (!enabled) { - return "accept"; - } + if (!enabled) { + return "accept"; + } - const cacheKey = input.ext.length > 0 ? input.ext : filename; - const section = input.ext.length > 0 ? cache.extensions : cache.filenames; - const cached = section[cacheKey]; - if (cached !== undefined) { - return cached.ignore ? "reject-llm" : "accept-llm"; - } + const cacheKey = input.ext.length > 0 ? input.ext : filename; + const section = input.ext.length > 0 ? 
cache.extensions : cache.filenames; + const cached = section[cacheKey]; + if (cached !== undefined) { + return cached.ignore ? "reject-llm" : "accept-llm"; + } + return null; + } + + async function resolveLlm(input: SkipDeciderInput): Promise { + const { filename } = contextFor(input); + const decision = await askLlmDecision(input, deps.repositoryName, input.llmCallContext); + if (input.ext.length > 0) { + setExtensionDecision(cache, input.ext, !decision, "llm", deps.repositoryName, input.relativePath); + } else { + setFilenameDecision(cache, filename, !decision, "llm", deps.repositoryName, input.relativePath); + } + return decision ? "accept-llm" : "reject-llm"; + } + + function persist(): void { + if (!enabled) { + return; + } + try { + saveCache(cachePath, cache); + } catch (cause: unknown) { + const msg = cause instanceof Error ? cause.message : String(cause); + logger.warn(`skip-decisions: failed to save cache to ${cachePath}: ${msg}`); + } + } - const decision = await askLlmDecision(input, deps.repositoryName, input.llmCallContext); - if (input.ext.length > 0) { - setExtensionDecision(cache, input.ext, !decision, "llm", deps.repositoryName, input.relativePath); - } else { - setFilenameDecision(cache, filename, !decision, "llm", deps.repositoryName, input.relativePath); + return { + async decide(input: SkipDeciderInput): Promise { + const sync = staticDecision(input); + if (sync !== null) { + return sync; } - try { - saveCache(cachePath, cache); - } catch (cause: unknown) { - const msg = cause instanceof Error ? cause.message : String(cause); - logger.warn(`skip-decisions: failed to save cache to ${cachePath}: ${msg}`); + const result = await resolveLlm(input); + persist(); + return result; + }, + decideStatic(input: SkipDeciderInput): SkipDecision | null { + return staticDecision(input); + }, + async decideAndDeferSave(input: SkipDeciderInput): Promise { + const sync = staticDecision(input); + if (sync !== null) { + return sync; } - return decision ? 
"accept-llm" : "reject-llm"; + return await resolveLlm(input); }, + persist, }; } diff --git a/packages/ingest-github/src/strategies/flat-folder/README.md b/packages/ingest-github/src/strategies/flat-folder/README.md index a454303..5a725f4 100644 --- a/packages/ingest-github/src/strategies/flat-folder/README.md +++ b/packages/ingest-github/src/strategies/flat-folder/README.md @@ -1,32 +1,50 @@ # `@bb/ingest-github/src/strategies/flat-folder` -The v2 ingestion strategy: clone → scan → big-file split → per-file analyse → -folder summary → repo summary → graph store. Each phase persists artifacts on -disk before the next begins, so a crash resumes cleanly from the next -sub-phase boundary. +The v2 ingestion strategy: scan + classify → analyse small + big in parallel → +field backfill → folder summary → repo summary → graph store. Each phase +persists artifacts on disk before the next begins, so a crash resumes cleanly +from the next sub-phase boundary. + +The strategy constructs **one shared `ConcurrencyLimiter`** at entry (sized by +`Config.LlmConcurrency`, default 29). Every LLM call across small-file +analyses, big-file chunk analyses, per-file condense calls, the skip-decision +LLM gate (during scan), field backfill, and folder summaries checks out from +this single pool. One knob bounds total in-flight LLM concurrency. ## Phases -1. **classify-and-analyse-small** (`phases/classify-and-analyse-small.ts`) — - walks `source.scan({ skipDecider })`; small files → LLM file-analysis → - write `CondensedFileAnalysis` Oversized files → write a stub. Big-by-tokens - files → append to `bigFiles.json` for Phase 2. -2. **process-big-files** (`phases/process-big-files.ts`) — reads - `bigFiles.json`, calls `source.readFile(relativePath)` per entry, - dispatches `processBigFile` sequentially (chunk-level concurrency - inside). -3. 
**backfill-fields** (`backfill/fields.ts`) — top up `keywords`, - `sideEffects`, `configDependencies`, `dataFlowDirection` on condensed - entries that miss them. Idempotent. -4. **backfill-big-files** (`backfill/big-files.ts`) — re-condense entries - whose chunks exist but condensed JSON is stale or missing. -5. **summarise-folders** (`folder-summary.ts`) — group condensed entries by - `path.posix.dirname` (root = ""), one LLM call per folder, persist to - `folder-summaries/.json`. -6. **summarise-repo** (`repo-summary.ts`) — load folder summaries +1. **scan-and-classify** (`phases/scan-and-classify.ts`) — walks + `source.scan({ skipDecider, limiter })` once, tokenises each file, classifies + as `small` / `big` / `oversized`, and writes + `meta-output/scan-manifest.json` (canonical) plus the legacy + `bigFiles.json` (for the pull-path consumers). Scan internally uses a + **two-pass** strategy: walk + cache-only `decideStatic` first, then + parallel-deduplicated LLM resolution for unknown extensions/filenames + through the shared limiter, then drain. +2a. **analyse-small** (`phases/analyse-small.ts`) — reads the manifest's + `kind: "small"` entries, re-opens content, runs the LLM file-analyser + per file under the shared limiter, writes `CondensedFileAnalysis` JSON. + Also writes oversized stubs. +2b. **analyse-big-files** (`phases/process-big-files.ts` — + `analyseBigFiles`) — chunk-task queue across all big files. Every chunk + is an independent task on the shared limiter; per-file condense is + scheduled as soon as that file's last chunk lands (one in-place retry + on transient condense failures). Runs **concurrently with 2a**. +3. **backfill-fields** (`backfill/fields.ts`) — for each cached condensed + entry with missing extended fields (`keywords`, `sideEffects`, + `dataFlowDirection`, `sectionMap`, …) dispatches one LLM call through + the shared limiter to fill the gaps. Idempotent — no-op on a complete + entry. +4. 
**summarise-folders** (`folder-summary.ts`) — groups condensed entries + by direct parent folder. Small folders + (`≤ Config.FolderSummaryBatchMaxFiles`, default 15) are batched up to + `Config.FolderSummaryBatchSize` (default 10) per LLM call. Bigger + folders take the individual single-folder path. Both flows run through + the shared limiter. +5. **summarise-repo** (`repo-summary.ts`) — load folder summaries shallowest-first; one call if it fits `ContextWindowLimit`, batch + merge otherwise; persist `repo-summary.json` with the v2-flat envelope. -7. **store-flat-analysis** (`phases/store-flat-analysis.ts`) — ensure +6. **store-flat-analysis** (`phases/store-flat-analysis.ts`) — ensure flat-folder indexes, upsert `:Repo`, then every `:Folder`, then every `:File` with the extended analysis + Folder→File `CONTAINS` edge. @@ -38,50 +56,68 @@ The strategy emits progress through the `ProgressContext` port defined in (no-op, OSS default). - **Boundary events** are split between the runner and the strategy: - - `phaseChanged("clone")` and `phaseChanged("scan")` are emitted by - `pipeline/run.ts` (the runner) before `strategy.execute` is called, - so the SSE stream stays alive during the network/disk-bound prelude. - - `phaseChanged("file_analysis")` is emitted by `index.ts` before phase 1 - - `phaseChanged("folder_analysis")` before phase 5 - - `phaseChanged("indexing")` before phase 6 (which feeds phase 7) - - `completed()` after phase 7 returns - - `failed(message)` from a `try/catch` wrapping the whole `execute` -- **Intra-phase ticks** are emitted by each phase via per-phase reporters - created from `progressContext.reporter(...)`. 
Sub-phase labels: - - phase 1 → no sub-phase (the main file-analysis loop) - - phase 2 → `big_files_queue`; inner `processBigFile` adds - `big_file:` for chunk pulses - - phase 3 → `backfill`; phase 4 → `backfill:big_files` - - phase 5 → no sub-phase, fixed total = directly-grouped folder count - - phase 7 → `folders` then `files`, both `growing` (drained from - on-disk async generators) -- **Total mode**: phase 1, phase 3, and any other streaming-iterator loop - use `total: { kind: "growing" }` (denominator grows as `source.scan` - yields). Phases 2 and 4, plus the big-file chunk pool, know their size - up front and use `total: { kind: "fixed", total: N }`. + - `phaseChanged("clone")` is emitted by `pipeline/run.ts` (the runner) + before `syncRepository`, so the SSE stream stays alive during the + network/disk-bound prelude. + - `phaseChanged("scan")` is emitted by `index.ts` before phase 1. + - `phaseChanged("file_analysis")` before the parallel 2a/2b block. + - `phaseChanged("folder_analysis")` before phase 4 (folder summaries). + - `phaseChanged("indexing")` before phase 5 (which feeds phase 6). + - `completed()` after phase 6 returns. + - `failed(message)` from a `try/catch` wrapping the whole `execute`. +- **Intra-phase ticks** are emitted via per-phase reporters created from + `progressContext.reporter(...)`. Sub-phase labels: + - phase 1 (scan) → no sub-phase, growing total (driven by `incrementSeen`). + - phase 2a (analyse-small) → `analyse_small`, fixed total = + `smallCount + oversizedCount`. + - phase 2b (analyse-big) → two reporters: `big_files_chunks` (fixed total + = sum of estimated chunks across all big files) and `big_files_condense` + (fixed total = `bigCount`). + - phase 3 → `backfill`, fixed total = `cache.size`. + - phase 4 → no sub-phase, fixed total = directly-grouped folder count. + - phase 6 → `folders` (fixed total = summarised folders plus synthesised parent folders) then `files` (fixed total = `cache.size`).
+- **Pull-path-only sub-phases** (emitted by `pipeline/pull.ts` workflow, + not the main strategy): `big_files_queue` (legacy single-file driver), + `big_file:` (per-big-file chunk pulses inside the legacy + driver), `pull` (`analyse-changed.ts` selective file analysis). +- **Total mode**: scan is the only main-strategy phase that uses + `growing` mode. Everything else has fixed totals known up front from the + scan manifest, the file-analysis cache, or the folder grouping. - The cancellation path in `execute` lets `CancellationError` propagate past the orchestrator; `failed()` only fires for non-cancellation errors. ## Files -- `index.ts` — `createFlatFolderStrategy(deps)` orchestrates the 7 phases. +- `index.ts` — `createFlatFolderStrategy(deps)` orchestrates the phases. Accepts `{ fileAnalyzer, progressContextFactory? }`. Constructs one - `ProgressContext` per job and threads it into every phase that takes a - `progressContext?` field. + `ProgressContext` per job AND one shared `ConcurrencyLimiter` per job + (sized by `Config.LlmConcurrency`); threads both into every phase that + needs them. - `types.ts` — `AnalyzedFileEntry`, `FolderSummary`, `RepoSummary`, `RepoSummaryEnvelope`, `FlatFolderResult`. - `analyse-file.ts` — `analyseScannedFile(analyzer, file, llmCallContext?)` + `buildOversizedStub`. -- `analyse-changed.ts` — `analyseChangedFiles({knowledgeId, source, metaPaths, analyzer, diff, llmCallContext?, archiveSink?, progressContext?})`. Pull-time per-file dispatcher. Reads changed file content through `input.source` (a `SourceReader`) so it works with both the disk-backed reader (OSS default) and any HTTP-backed alternative supplied via the `pullFactory` hook. Mirrors `classifyAndAnalyseSmall`'s small-file path: filter → fetch → size cap → binary detect → line count → analyse → save + archive push. Does NOT invoke the skip-decision LLM gate. 
When `progressContext` is present it creates a fixed-total reporter (`subPhase: "pull"`, `total = dedupedPaths.length`) and increments per-path so the pull SSE stream stays live. +- `analyse-changed.ts` — `analyseChangedFiles({knowledgeId, source, metaPaths, analyzer, diff, llmCallContext?, archiveSink?, progressContext?})`. Pull-time per-file dispatcher. Reads changed file content through `input.source` (a `SourceReader`) so it works with both the disk-backed reader (OSS default) and any HTTP-backed alternative supplied via the `pullFactory` hook. Mirrors `analyseSmallFiles`'s per-file path: filter → fetch → size cap → binary detect → line count → analyse → save + archive push. Does NOT invoke the skip-decision LLM gate. When `progressContext` is present it creates a fixed-total reporter (`subPhase: "pull"`, `total = dedupedPaths.length`) and increments per-path so the pull SSE stream stays live. +- `file-analysis-cache.ts` — in-memory `Map` + loaded once between phase 2 and phase 3; read by phases 3, 4, and + 6; mutated only by phase 3 backfill via `cache.set(entry)` so downstream phases + see updated entries without re-reading disk. +- `scan-manifest.ts` — `ScanManifest` shape, `readScanManifest`, + `writeScanManifest`. The canonical handoff between phase 1 and phases 2a/2b. - `folder-path.ts` — `directFolderOf`, `affectedFolderPaths`. -- `folder-summary.ts` — group + summarise + persist + iterate folder summaries. +- `folder-summary.ts` — group + summarise (individual or batched) + persist + + iterate folder summaries; shared `dispatchFolderSummaries` used by both + the main strategy and the pull-path's selective folder phase. +- `folder-summary-selective.ts` — pull-time selective folder summary phase. - `repo-summary.ts` — single-shot or batched repo summary with envelope writer. -- `phases/classify-and-analyse-small.ts` — Phase 1. -- `phases/process-big-files.ts` — Phase 2. -- `phases/store-flat-analysis.ts` — Phase 7. -- `backfill/fields.ts` — Phase 3.
-- `backfill/big-files.ts` — Phase 4. -- `big-file/` — chunker, analyzer, condenser, storage, cache for Phase 2 & 4. +- `phases/scan-and-classify.ts` — Phase 1. +- `phases/analyse-small.ts` — Phase 2a. +- `phases/process-big-files.ts` — Phase 2b (`analyseBigFiles`, chunk-task + queue) plus the legacy `processBigFilesQueue` driver used by the pull-path. +- `phases/store-flat-analysis.ts` — Phase 6. +- `backfill/fields.ts` — Phase 3 (parallel via shared limiter). +- `big-file/` — chunker, analyzer, condenser, storage, cache used by both + big-file drivers. - `prompts/` — LLM prompts shared across the phases. ## Invariants @@ -107,11 +143,11 @@ The strategy emits progress through the `ProgressContext` port defined in reads `context.llmCallContext` (an optional `AskLlmOptions` built by the runner from `GithubIndexPayload.{llmApiKey, llmProvider, llmModel}`) and forwards it into every phase that issues LLM calls: phase 1 via - `classifyAndAnalyseSmall`'s `llmCallContext`, phase 2 via - `processBigFilesQueue` (which threads it into **both** the chunk - analyzer and `condenseChunks`), phase 3 via `backfillMissingFields`, - phase 4 via `backfillBigFiles`, phase 5 via `runFolderSummaryPhase`, - phase 6 via `summariseRepo`. The phases pass the same option object - through to `askJsonLLM` so the per-call override reaches `@bb/llm` - unchanged. When `llmCallContext` is undefined the call falls back to - `Config.OpenrouterApiKey` + `Config.LlmProvider`. + `scanAndClassify` (forwarded into `source.scan({ llmCallContext })` for + the skip-decision LLM gate), phase 2a via `analyseSmallFiles`, phase 2b + via `analyseBigFiles` (which threads it into **both** the chunk analyzer + and `condenseChunks`), phase 3 via `backfillMissingFields`, phase 4 via + `runFolderSummaryPhase`, phase 5 via `summariseRepo`. The phases pass + the same option object through to `askJsonLLM` so the per-call override + reaches `@bb/llm` unchanged. 
When `llmCallContext` is undefined the call + falls back to `Config.OpenrouterApiKey` + `Config.LlmProvider`. diff --git a/packages/ingest-github/src/strategies/flat-folder/backfill/README.md b/packages/ingest-github/src/strategies/flat-folder/backfill/README.md index 34f744d..f580f19 100644 --- a/packages/ingest-github/src/strategies/flat-folder/backfill/README.md +++ b/packages/ingest-github/src/strategies/flat-folder/backfill/README.md @@ -14,24 +14,27 @@ being marked failed. ## Files -- `fields.ts` — Phase 3. `backfillMissingFields(metaPaths, cache, llmCallContext?, progressContext?)` +- `fields.ts` — Phase 3. `backfillMissingFields(metaPaths, cache, limiter, llmCallContext?, progressContext?)` iterates every condensed entry from the shared `FileAnalysisCache`, computes which extended-analysis fields are missing (`keywords`, `ontologyConcepts`, `businessEntities`, `systemCapabilities`, `sideEffects`, `configDependencies`, `dataFlowDirection`, `integrationSurface`, `contractsProvided`, `contractsConsumed`, - `sectionMap`), and asks one LLM call per file to fill only the - missing slots. The response is validated and normalised - (`pickStringArray`, `pickSections`) before being written back via - `saveCondensed` **and** mirrored into the cache via `cache.set(entry)` - so downstream phases (folder summary, graph store) see the updated - entry without re-reading disk. Entries with nothing missing are - skipped without an LLM call. Progress reporter is fixed-total sized - by `cache.size`. + `sectionMap`), and dispatches one LLM call per file **through the shared + `ConcurrencyLimiter`** to fill only the missing slots. Tasks run + concurrently up to `Config.LlmConcurrency`; the loop builds the task + array and awaits `Promise.all` at the end. 
The response is validated and + normalised (`pickStringArray`, `pickSections`) before being written back + via `saveCondensed` **and** mirrored into the cache via `cache.set(entry)` + so downstream phases (folder summary, graph store) see the updated entry + without re-reading disk. Entries with nothing missing are skipped + without an LLM call. Progress reporter is fixed-total sized by + `cache.size`. Emits `phase3 dispatching N backfill tasks` once every task has been queued (just before awaiting them) so the + caller can see how many tasks went through the limiter. ## Public interfaces -- `backfillMissingFields(metaPaths, cache, llmCallContext?, progressContext?): Promise<{ updated, failed }>` +- `backfillMissingFields(metaPaths, cache, limiter, llmCallContext?, progressContext?): Promise<{ updated, failed }>` Returns phase-summary counters consumed by `createFlatFolderStrategy` to roll up into the strategy result. @@ -54,6 +57,10 @@ mutation into `FileAnalysisCache`. responses leave unfilled slots for a future pass. - Cache and disk stay in lockstep — every `saveCondensed` is paired with a `cache.set(entry)` in the same code path. +- Concurrency is bounded by the shared `ConcurrencyLimiter` (today's + `Config.LlmConcurrency`). Counters (`updated`, `failed`, token totals) + are mutated from inside the concurrent tasks — safe under JS's + single-threaded event loop, no locking needed.
## External dependencies diff --git a/packages/ingest-github/src/strategies/flat-folder/backfill/fields.ts b/packages/ingest-github/src/strategies/flat-folder/backfill/fields.ts index 7836520..9effedb 100644 --- a/packages/ingest-github/src/strategies/flat-folder/backfill/fields.ts +++ b/packages/ingest-github/src/strategies/flat-folder/backfill/fields.ts @@ -4,6 +4,7 @@ import { logger } from "@bb/logger"; import type { FileAnalysis, FileAnalysisSection } from "@bb/mongo"; import type { MetaPaths } from "#src/types/meta-paths.ts"; import type { ProgressContext } from "#src/progress/types.ts"; +import type { ConcurrencyLimiter } from "#src/pipeline/concurrency.ts"; import { saveCondensed } from "#src/strategies/flat-folder/big-file/storage.ts"; import type { FileAnalysisCache } from "#src/strategies/flat-folder/file-analysis-cache.ts"; import { BACKFILL_SYSTEM_PROMPT, buildBackfillUserPrompt } from "#src/strategies/flat-folder/prompts/backfill.ts"; @@ -45,11 +46,13 @@ interface NeededFlags { export async function backfillMissingFields( metaPaths: MetaPaths, cache: FileAnalysisCache, + limiter: ConcurrencyLimiter, llmCallContext?: AskLlmOptions, progressContext?: ProgressContext, ): Promise<{ updated: number; failed: number }> { let updated = 0; let failed = 0; + let dispatched = 0; const reporter = progressContext?.reporter({ phase: "file_analysis", subPhase: "backfill", @@ -57,6 +60,7 @@ export async function backfillMissingFields( }); await reporter?.start(); try { + const tasks: Promise[] = []; for (const entry of cache.values()) { const a = entry.analysis; const needed = computeNeeded(a); @@ -64,27 +68,35 @@ export async function backfillMissingFields( reporter?.increment(1, { fileName: entry.relativePath }); continue; } - const userPrompt = buildBackfillUserPrompt(entry.relativePath, entry.analysis); - try { - const response = await askJsonLLM(BACKFILL_SYSTEM_PROMPT, userPrompt, llmCallContext ?? 
{}); - const result = response.result; - if (result === null) { - reporter?.increment(1, { fileName: entry.relativePath }); - continue; - } - applyBackfill(a, result, needed); - await saveCondensed(metaPaths, entry); - cache.set(entry); - updated += 1; - } catch (cause: unknown) { - if (cause instanceof LlmConfigError || cause instanceof LlmError) { - throw cause; - } - failed += 1; - logger.warn(`phase3: backfill failed for ${entry.relativePath}: ${describe(cause)}`); - } - reporter?.increment(1, { fileName: entry.relativePath }); + dispatched += 1; + tasks.push( + limiter(async () => { + const userPrompt = buildBackfillUserPrompt(entry.relativePath, entry.analysis); + try { + const response = await askJsonLLM(BACKFILL_SYSTEM_PROMPT, userPrompt, llmCallContext ?? {}); + const result = response.result; + if (result === null) { + reporter?.increment(1, { fileName: entry.relativePath }); + return; + } + applyBackfill(a, result, needed); + await saveCondensed(metaPaths, entry); + cache.set(entry); + updated += 1; + } catch (cause: unknown) { + if (cause instanceof LlmConfigError || cause instanceof LlmError) { + throw cause; + } + failed += 1; + logger.warn(`phase3: backfill failed for ${entry.relativePath}: ${describe(cause)}`); + } finally { + reporter?.increment(1, { fileName: entry.relativePath }); + } + }), + ); } + logger.info(`phase3 dispatching ${dispatched} backfill tasks`); + await Promise.all(tasks); logger.info(`phase3 done: updated=${updated} failed=${failed}`); return { updated, failed }; } finally { diff --git a/packages/ingest-github/src/strategies/flat-folder/index.ts b/packages/ingest-github/src/strategies/flat-folder/index.ts index e70396f..5093568 100644 --- a/packages/ingest-github/src/strategies/flat-folder/index.ts +++ b/packages/ingest-github/src/strategies/flat-folder/index.ts @@ -44,6 +44,7 @@ export function createFlatFolderStrategy(deps: FlatFolderStrategyDeps): IngestSt knowledgeId, source, metaPaths, + limiter, progressContext, }; if 
(llmCallContext !== undefined) { @@ -93,7 +94,7 @@ export function createFlatFolderStrategy(deps: FlatFolderStrategyDeps): IngestSt logger.info(`flat-folder: phase3 (backfill missing fields) starting`); throwIfCancelled(knowledgeId); - await backfillMissingFields(metaPaths, fileAnalysisCache, llmCallContext, progressContext); + await backfillMissingFields(metaPaths, fileAnalysisCache, limiter, llmCallContext, progressContext); progressContext.phaseChanged("folder_analysis"); logger.info(`flat-folder: phase5 (folder summaries) starting`); diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/README.md b/packages/ingest-github/src/strategies/flat-folder/phases/README.md index 6301e38..64cfc96 100644 --- a/packages/ingest-github/src/strategies/flat-folder/phases/README.md +++ b/packages/ingest-github/src/strategies/flat-folder/phases/README.md @@ -51,14 +51,23 @@ progressContext?})` — legacy serial driver kept for the pull-path `analyseBigFiles(manifest, …)`. Reads `bigFiles.json`, dispatches `processBigFile` once per file in a `for` loop. - `store-flat-analysis.ts` — Phase 7. - `storeFlatAnalysis({scope, payload, branch, metaPaths})` ensures + `storeFlatAnalysis({scope, payload, branch, metaPaths, cache})` ensures `flat-folder` Neo4j indexes, upserts `:Repo` (from `repo-summary.json` - if present, empty payload otherwise), then iterates folder summaries - via `iterateFolderSummaries` to upsert `:Folder`, then iterates - condensed entries via `iterateCondensed` to upsert `:File`. Files whose - containing folder was not in the summaries set get a synthesised empty - `:Folder` so the `CONTAINS` edge always lands. `languageFromPath` - fills `language` when the analysis left it blank. + if present, empty payload otherwise), then **dispatches `:Folder` and + `:File` upserts in batches of `Config.Neo4jBatchSize` (default 50)** + via `upsertFolderNodesBatch` / `upsertFileNodesBatch` from `@bb/neo4j`. 
+ Each batch is one Neo4j write transaction containing the same 12 + Cyphers (1 MERGE + 1 folder-attach + 5 rel CLEARs + 5 rel ATTACHes via + UNWIND) that a single upsert used to issue — so a 1 000-file repo + collapses from ~12 000 round-trips to ~240. Files whose containing + folder was not in the summaries set get a synthesised empty `:Folder` + entry added to the folder batch list **up front** (before any batch + dispatches) so the `CONTAINS` edge always lands. + `languageFromPath` fills `language` when the analysis left it blank. + Both progress reporters (`folders`, `files`) open at phase entry with + their fixed totals so the indexing overall-progress aggregate sees + both denominators from the first tick — fixes the prior "leaps to 100 + then sits there" UX bug. ## Execution order diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/scan-and-classify.ts b/packages/ingest-github/src/strategies/flat-folder/phases/scan-and-classify.ts index 786c9b0..6dc92a7 100644 --- a/packages/ingest-github/src/strategies/flat-folder/phases/scan-and-classify.ts +++ b/packages/ingest-github/src/strategies/flat-folder/phases/scan-and-classify.ts @@ -7,6 +7,7 @@ import type { MetaPaths } from "#src/types/meta-paths.ts"; import type { BigFileEntry } from "#src/types/big-file.ts"; import type { SkipDecider, SourceReader } from "#src/types/pipeline.ts"; import type { ProgressContext } from "#src/progress/types.ts"; +import type { ConcurrencyLimiter } from "#src/pipeline/concurrency.ts"; import { throwIfCancelled } from "#src/pipeline/cancellation.ts"; import { makeSkipDecider } from "#src/pipeline/skip-decisions/index.ts"; import { classifyByTokens, writeBigFiles } from "#src/strategies/flat-folder/big-file/detector.ts"; @@ -24,6 +25,14 @@ export interface ScanAndClassifyInput { skipDecider?: SkipDecider; llmCallContext?: AskLlmOptions; progressContext?: ProgressContext; + /** + * Shared LLM-concurrency limiter. 
When supplied the underlying + * `scanRepository` runs its two-pass strategy: walk + cache-only decisions + * first, then parallel-deduplicated LLM resolution for unknown + * extensions/filenames under this limiter. Optional so the function + * still works standalone. + */ + limiter?: ConcurrencyLimiter; } export interface ScanAndClassifyResult { @@ -56,6 +65,9 @@ export async function scanAndClassify(input: ScanAndClassifyInput): Promise[0] = { skipDecider }; + if (input.limiter !== undefined) { + scanDeps.limiter = input.limiter; + } if (input.llmCallContext !== undefined) { scanDeps.llmCallContext = input.llmCallContext; } diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/store-flat-analysis.ts b/packages/ingest-github/src/strategies/flat-folder/phases/store-flat-analysis.ts index adeb0a6..7db4433 100644 --- a/packages/ingest-github/src/strategies/flat-folder/phases/store-flat-analysis.ts +++ b/packages/ingest-github/src/strategies/flat-folder/phases/store-flat-analysis.ts @@ -1,6 +1,16 @@ import { readFile } from "node:fs/promises"; +import { Config } from "@bb/types"; +import { getConfigValue } from "@bb/config"; import { logger } from "@bb/logger"; -import { ensureFlatFolderIndexes, upsertFileNode, upsertFolderNode, upsertRepoNode, type NodeScope } from "@bb/neo4j"; +import { + ensureFlatFolderIndexes, + upsertFileNodesBatch, + upsertFolderNodesBatch, + upsertRepoNode, + type NodeScope, + type UpsertFileNodeInput, + type UpsertFolderNodeInput, +} from "@bb/neo4j"; import type { GithubIndexPayload } from "@bb/types"; import type { MetaPaths } from "#src/types/meta-paths.ts"; import { throwIfCancelled } from "#src/pipeline/cancellation.ts"; @@ -30,10 +40,10 @@ export async function storeFlatAnalysis(input: StoreFlatAnalysisInput): Promise< throwIfCancelled(input.scope.knowledgeId); await ensureFlatFolderIndexes(); - let nodesWritten = 0; - let foldersWritten = 0; - let filesWritten = 0; + const batchSize = 
getConfigValue(Config.Neo4jBatchSize); + // 1. :Repo node — single upsert, not batched (one repo per knowledge). + let nodesWritten = 0; const repoSummary = await readRepoSummary(input.metaPaths); if (repoSummary !== null) { await upsertRepoNode({ @@ -50,7 +60,6 @@ export async function storeFlatAnalysis(input: StoreFlatAnalysisInput): Promise< keyPatterns: repoSummary.keyPatterns, }, }); - nodesWritten += 1; } else { logger.warn(`phase7: no repo summary on disk; writing :Repo with empty summary`); await upsertRepoNode({ @@ -59,60 +68,79 @@ export async function storeFlatAnalysis(input: StoreFlatAnalysisInput): Promise< branch: input.branch, summary: emptyRepoSummaryPayload(), }); - nodesWritten += 1; } + nodesWritten += 1; - const folderReporter = input.progressContext?.reporter({ - phase: "indexing", - subPhase: "folders", - total: { kind: "growing" }, - }); - await folderReporter?.start(); + // 2. Collect every folder we'll upsert: the on-disk folder summaries plus + // synthesised parents for any file whose folder didn't get a summary. Doing + // this up front gives both reporters real fixed totals so `overallProgress` + // doesn't leap to 100 the moment the folder loop completes (the previous + // UX bug where the file sub-phase registered too late to dilute the + // indexing aggregate). 
+ const folderInputs: UpsertFolderNodeInput[] = []; const folderPaths = new Set(); - try { - for await (const folder of iterateFolderSummaries(input.metaPaths)) { - throwIfCancelled(input.scope.knowledgeId); - folderReporter?.incrementSeen(); - await upsertFolderNode({ + for await (const folder of iterateFolderSummaries(input.metaPaths)) { + folderInputs.push({ + scope: input.scope, + folderPath: folder.folderPath, + summary: shapeFolderPayload(folder), + }); + folderPaths.add(folder.folderPath); + } + for (const file of input.cache.values()) { + const folderPath = directFolderOf(file.relativePath); + if (!folderPaths.has(folderPath)) { + folderInputs.push({ scope: input.scope, - folderPath: folder.folderPath, - summary: shapeFolderPayload(folder), + folderPath, + summary: emptyFolderPayload(), }); - folderPaths.add(folder.folderPath); - foldersWritten += 1; - nodesWritten += 1; - folderReporter?.increment(1, { fileName: folder.folderPath || "" }); + folderPaths.add(folderPath); } - } finally { - folderReporter?.stop(); } + // 3. Both reporters open at phase entry with their true totals so the + // overall-progress aggregate sees both denominators from the first tick. + const folderReporter = input.progressContext?.reporter({ + phase: "indexing", + subPhase: "folders", + total: { kind: "fixed", total: folderInputs.length }, + }); const fileReporter = input.progressContext?.reporter({ phase: "indexing", subPhase: "files", total: { kind: "fixed", total: input.cache.size }, }); + await folderReporter?.start(); await fileReporter?.start(); + + let foldersWritten = 0; + let filesWritten = 0; try { - for (const file of input.cache.values()) { + // 4. Batched folder upserts. 
+ logger.info( + `phase7: folder upsert dispatching ${Math.ceil(folderInputs.length / batchSize)} batches of up to ${batchSize} folders (total=${folderInputs.length})`, + ); + for (let i = 0; i < folderInputs.length; i += batchSize) { throwIfCancelled(input.scope.knowledgeId); - const folderPath = directFolderOf(file.relativePath); - if (!folderPaths.has(folderPath)) { - await upsertFolderNode({ - scope: input.scope, - folderPath, - summary: emptyFolderPayload(), - }); - folderPaths.add(folderPath); - foldersWritten += 1; - nodesWritten += 1; + const batch = folderInputs.slice(i, i + batchSize); + await upsertFolderNodesBatch(batch); + foldersWritten += batch.length; + nodesWritten += batch.length; + for (const item of batch) { + folderReporter?.increment(1, { fileName: item.folderPath || "" }); } - await upsertFileNode({ + } + + // 5. Batched file upserts. + const fileInputs: UpsertFileNodeInput[] = []; + for (const file of input.cache.values()) { + fileInputs.push({ orgId: input.scope.orgId, knowledgeId: input.scope.knowledgeId, repoId: input.scope.repoId, relativePath: file.relativePath, - folderPath, + folderPath: directFolderOf(file.relativePath), language: file.language.length > 0 ? 
file.language : languageFromPath(file.relativePath), sha: file.sha256, sizeBytes: file.sizeBytes, @@ -121,11 +149,22 @@ export async function storeFlatAnalysis(input: StoreFlatAnalysisInput): Promise< totalChunks: file.totalChunks, totalTokenCount: file.totalTokenCount, }); - filesWritten += 1; - nodesWritten += 1; - fileReporter?.increment(1, { fileName: file.relativePath }); + } + logger.info( + `phase7: file upsert dispatching ${Math.ceil(fileInputs.length / batchSize)} batches of up to ${batchSize} files (total=${fileInputs.length})`, + ); + for (let i = 0; i < fileInputs.length; i += batchSize) { + throwIfCancelled(input.scope.knowledgeId); + const batch = fileInputs.slice(i, i + batchSize); + await upsertFileNodesBatch(batch); + filesWritten += batch.length; + nodesWritten += batch.length; + for (const item of batch) { + fileReporter?.increment(1, { fileName: item.relativePath }); + } } } finally { + folderReporter?.stop(); fileReporter?.stop(); } diff --git a/packages/ingest-github/src/types/README.md b/packages/ingest-github/src/types/README.md index 87b2cea..1fd8479 100644 --- a/packages/ingest-github/src/types/README.md +++ b/packages/ingest-github/src/types/README.md @@ -19,9 +19,21 @@ llmCallContext? }`; `llmCallContext` is the optional `AskLlmOptions` - `pipeline.ts` — `ScannedFile`, `OversizedFile`, `ScanEntry`, `FileAnalyzer` port, `AnalyzedFileResult`, `PipelineDeps`, `PipelineSummary`, `SkipDecider` / `SkipDeciderInput` / `SkipDecision` (the unknown-extension - gate port; implementation lives under `pipeline/skip-decisions/`), + gate port; implementation lives under `pipeline/skip-decisions/`). The + `SkipDecider` interface exposes four methods: `decide` (legacy async + single-shot), `decideStatic` (synchronous; returns the resolved decision + or `null` to signal "needs an LLM call"), `decideAndDeferSave` (async LLM + call that mutates the in-memory cache without flushing to disk), and + `persist` (one-shot cache flush). 
The two-pass scan in `scan.ts` uses the + latter three so unknown-extension probes fan out under the shared LLM + limiter and the disk cache is written exactly once at the end of the + batch. `SourceReader` / `ScanDeps` (the repository-read abstraction; default - implementation in `pipeline/disk-source-reader.ts`), `ArchiveSink` / + implementation in `pipeline/disk-source-reader.ts`). `ScanDeps.limiter` + is the optional shared `ConcurrencyLimiter`; when supplied together with + `skipDecider`, `scanRepository` switches to its two-pass strategy + instead of the legacy inline-await walk. + `ArchiveSink` / `ArchiveSinkInput` (an optional non-fatal sink that the open-source binary never calls), `SourceFactory` / `SourceFactoryInput` / `SourceFactoryResult` (the optional index-side injection hook surfaced diff --git a/packages/ingest-github/src/types/pipeline.ts b/packages/ingest-github/src/types/pipeline.ts index 9f5c0be..aaf13a5 100644 --- a/packages/ingest-github/src/types/pipeline.ts +++ b/packages/ingest-github/src/types/pipeline.ts @@ -1,6 +1,7 @@ import type { GithubIndexPayload, GithubPullPayload } from "@bb/types"; import type { AskLlmOptions } from "@bb/llm"; import type { FileAnalysis } from "@bb/mongo"; +import type { ConcurrencyLimiter } from "#src/pipeline/concurrency.ts"; import type { DiffResult } from "#src/pipeline/git-diff.ts"; export interface ScannedFile { @@ -59,6 +60,14 @@ export interface ScanDeps { * invokes the LLM branch. Absent in OSS standalone runs. */ llmCallContext?: AskLlmOptions; + /** + * Shared LLM-concurrency limiter. When set, `scanRepository` uses a + * two-pass strategy: walk + cache-only decisions in pass 1, parallel + * deduplicated LLM resolution under this limiter in pass 2, drain the + * pending list in pass 3 (all cache-hits). When absent (e.g. legacy + * `SourceFactory` consumers), scan falls back to inline-await per file. 
+ */ + limiter?: ConcurrencyLimiter; } export interface SourceReader { @@ -152,5 +161,31 @@ export interface SkipDeciderInput { } export interface SkipDecider { + /** + * Single-shot decision: applies static filters, consults the in-memory + * + on-disk caches, and falls through to the LLM when neither resolves + * the decision. Persists the cache to disk after each LLM call. + * Kept for non-scan callers and the legacy inline-await path. + */ decide(input: SkipDeciderInput): Promise; + /** + * Synchronous static-only decision. Returns the resolved `SkipDecision` + * when static filters or cache hit resolves it; returns `null` to signal + * "this needs an LLM call to resolve". Used by `scanRepository` in its + * two-pass mode to collect pending entries without blocking the walk. + */ + decideStatic(input: SkipDeciderInput): SkipDecision | null; + /** + * Asynchronous LLM-resolution path that **mutates the in-memory cache** + * but does NOT persist to disk. The caller (typically `scanRepository`) + * batches these under a `ConcurrencyLimiter` and then calls `persist()` + * exactly once at the end of the batch, so concurrent `saveCache` calls + * don't race on the tmp/rename atomicity. + */ + decideAndDeferSave(input: SkipDeciderInput): Promise; + /** + * Persist the in-memory decision cache to disk. Best-effort: swallows + * I/O errors. Called once at the end of a `decideAndDeferSave` batch. + */ + persist(): void; } diff --git a/packages/neo4j/README.md b/packages/neo4j/README.md index e363877..f597e1b 100644 --- a/packages/neo4j/README.md +++ b/packages/neo4j/README.md @@ -40,20 +40,25 @@ The package owns: function / import edges), and one to remove the `:Knowledge` node itself. Called by the server's `DELETE /api/v1/repos/:knowledgeId` route. 
-- File-node CRUD (`upsertFileNode`) — composes the per-file relationships - (`:HAS_KEYWORD / :HAS_CLASS / :HAS_FUNCTION / :HAS_IMPORT_INTERNAL / -:HAS_IMPORT_EXTERNAL`), clearing stale relationships before - re-attaching for re-runs. The two-`:HAS_IMPORT_*` split mirrors - kube-package's distinction between relative imports and external - packages — downstream MCP queries can ask "which files import this - internal module" vs "which files import this external package" - cleanly +- File-node CRUD (`upsertFileNode`, `upsertFileNodesBatch`) — composes + the per-file relationships (`:HAS_KEYWORD / :HAS_CLASS / :HAS_FUNCTION + / :HAS_IMPORT_INTERNAL / :HAS_IMPORT_EXTERNAL`), clearing stale + relationships before re-attaching for re-runs. The two-`:HAS_IMPORT_*` + split mirrors kube-package's distinction between relative imports and + external packages — downstream MCP queries can ask "which files + import this internal module" vs "which files import this external + package" cleanly. The `*Batch` variant lands an arbitrary number of + files in **one transaction** via Cypher `UNWIND` — same Cypher shape, + wrapped with an outer UNWIND so 50+ files cost the same 12 Cyphers a + single file used to cost. +- Folder-node CRUD (`upsertFolderNode`, `upsertFolderNodesBatch`) — + same shape as file CRUD; batched variant for bulk indexing. The package does **not** own: - Read queries — defer to a future `@bb/graph` once `@bb/mcp` retrieval has a use case -- Telemetry, retry, or transaction batching — driver defaults apply +- Telemetry — driver defaults apply. 
- Migration tooling — the `IF NOT EXISTS` constraint creates handle schema drift; richer migrations land later @@ -69,6 +74,9 @@ function upsertKnowledgeNode(doc: KnowledgeDoc): Promise; function setKnowledgeStateInGraph(knowledgeId: string, state: KnowledgeState): Promise; function deleteKnowledgeGraph(knowledgeId: string): Promise; function upsertFileNode(input: UpsertFileNodeInput): Promise; +function upsertFileNodesBatch(inputs: readonly UpsertFileNodeInput[]): Promise; +function upsertFolderNode(input: UpsertFolderNodeInput): Promise; +function upsertFolderNodesBatch(inputs: readonly UpsertFolderNodeInput[]): Promise; function runCypher(query: string, params?: Record): Promise; @@ -160,9 +168,12 @@ Neo4jPassword`). Repo-wide ESLint rule blocks `process.env`. "already exists" errors (Neo4j refuses constraints when a matching plain index exists). Operators must drop conflicting indexes manually if uniqueness guarantees matter. -6. **`upsertFileNode` clears stale relationships before re-attaching.** - Re-runs of the same `(knowledgeId, relativePath)` produce a clean - relationship set rather than accumulating outdated keywords/imports. +6. **`upsertFileNode` and `upsertFileNodesBatch` clear stale relationships + before re-attaching.** Re-runs of the same `(knowledgeId, relativePath)` + produce a clean relationship set rather than accumulating outdated + keywords/imports. In the batched variant the clear-then-attach happens + atomically inside one transaction per batch — partial failures roll + back, so re-runs always start from a consistent state. 7. **No raw `Driver` leaks.** `_getDriver()` is not in `src/index.ts`. Higher tiers go through the typed helpers. @@ -174,7 +185,6 @@ Neo4jPassword`). Repo-wide ESLint rule blocks `process.env`. 
## What is intentionally out of scope (v0) - Read queries (defer to `@bb/graph`) -- Cypher transactions / batch writes (single-statement per call) - Schema migrations / drops / renames (only `IF NOT EXISTS` creates) - Multi-database support (we use the default `neo4j` db) - Pub/sub / change-data-capture diff --git a/packages/neo4j/src/client.ts b/packages/neo4j/src/client.ts index 56207d2..dac5fbb 100644 --- a/packages/neo4j/src/client.ts +++ b/packages/neo4j/src/client.ts @@ -81,6 +81,35 @@ export async function _runCypher(query: string, params: Record; +} + +/** + * Run multiple Cypher statements inside one write transaction. All-or-nothing: + * either every statement commits or none do. Used by the batched upsert APIs + * so a 50-file batch lands as one transaction instead of 12 × 50 sessions. + * + * Uses the driver's `executeWrite` which retries automatically on transient + * errors (deadlock, leader switch) up to a few attempts. + */ +export async function _runInTransaction(steps: readonly CypherStep[]): Promise { + if (steps.length === 0) { + return; + } + const session: Session = _getDriver().session(); + try { + await session.executeWrite(async (tx) => { + for (const step of steps) { + await tx.run(step.query, step.params); + } + }); + } finally { + await session.close(); + } +} + export function toNeo4jInt(value: number): Integer { return int(value); } diff --git a/packages/neo4j/src/files.ts b/packages/neo4j/src/files.ts index eaf4182..01695ea 100644 --- a/packages/neo4j/src/files.ts +++ b/packages/neo4j/src/files.ts @@ -1,5 +1,5 @@ import type { FileAnalysis } from "@bb/mongo"; -import { _runCypher } from "./client.ts"; +import { _runCypher, _runInTransaction, type CypherStep } from "./client.ts"; const UPSERT_FILE = ` MERGE (f:File {knowledgeId: $knowledgeId, relativePath: $relativePath}) @@ -133,6 +133,225 @@ export async function deleteFileNodes(knowledgeId: string, relativePaths: string await _runCypher(DELETE_FILES, { knowledgeId, relativePaths }); 
} +// ───────────────────────────────────────────────────────────────────────────── +// Batched upsert — used by the flat-folder indexing phase to land 50+ files in +// one transaction instead of 12 round-trips per file. Same Cypher shape as the +// single-shot path above; just wrapped with an outer UNWIND so one query +// services every file in the batch. The five rel types (HAS_KEYWORD / +// HAS_CLASS / HAS_FUNCTION / HAS_IMPORT_INTERNAL / HAS_IMPORT_EXTERNAL) each +// take two Cyphers: a batched DELETE that clears existing rels for every file +// in the batch by relativePath, then a batched UNWIND that attaches the new +// rels from flattened `(knowledgeId, relativePath, name)` triples. +// ───────────────────────────────────────────────────────────────────────────── + +const BATCH_UPSERT_FILES = ` +UNWIND $files AS f +MERGE (file:File {knowledgeId: f.knowledgeId, relativePath: f.relativePath}) +SET file.orgId = f.orgId, + file.repoId = f.repoId, + file.language = f.language, + file.sha = f.sha, + file.sizeBytes = f.sizeBytes, + file.purpose = f.purpose, + file.summary = f.summary, + file.businessContext = f.businessContext, + file.dataFlowDirection = f.dataFlowDirection, + file.ontologyConcepts = f.ontologyConcepts, + file.businessEntities = f.businessEntities, + file.systemCapabilities = f.systemCapabilities, + file.sideEffects = f.sideEffects, + file.configDependencies = f.configDependencies, + file.integrationSurface = f.integrationSurface, + file.contractsProvided = f.contractsProvided, + file.contractsConsumed = f.contractsConsumed, + file.sectionNames = f.sectionNames, + file.sectionDescriptions = f.sectionDescriptions, + file.isBigFile = f.isBigFile, + file.totalChunks = f.totalChunks, + file.totalTokenCount = f.totalTokenCount, + file.updatedAt = $updatedAt +WITH file, f +MATCH (k:Knowledge {knowledgeId: f.knowledgeId}) +MERGE (k)-[:HAS_FILE]->(file) +`; + +const BATCH_ATTACH_FILES_TO_FOLDERS = ` +UNWIND $pairs AS pair +MATCH (file:File {knowledgeId: 
pair.knowledgeId, relativePath: pair.relativePath}) +MATCH (folder:Folder {knowledgeId: pair.knowledgeId, folderPath: pair.folderPath}) +MERGE (folder)-[:CONTAINS]->(file) +`; + +const BATCH_CLEAR_RELS_BY_TYPE: Readonly> = { + HAS_KEYWORD: ` +UNWIND $files AS f +MATCH (file:File {knowledgeId: f.knowledgeId, relativePath: f.relativePath})-[r:HAS_KEYWORD]->() +DELETE r +`, + HAS_CLASS: ` +UNWIND $files AS f +MATCH (file:File {knowledgeId: f.knowledgeId, relativePath: f.relativePath})-[r:HAS_CLASS]->() +DELETE r +`, + HAS_FUNCTION: ` +UNWIND $files AS f +MATCH (file:File {knowledgeId: f.knowledgeId, relativePath: f.relativePath})-[r:HAS_FUNCTION]->() +DELETE r +`, + HAS_IMPORT_INTERNAL: ` +UNWIND $files AS f +MATCH (file:File {knowledgeId: f.knowledgeId, relativePath: f.relativePath})-[r:HAS_IMPORT_INTERNAL]->() +DELETE r +`, + HAS_IMPORT_EXTERNAL: ` +UNWIND $files AS f +MATCH (file:File {knowledgeId: f.knowledgeId, relativePath: f.relativePath})-[r:HAS_IMPORT_EXTERNAL]->() +DELETE r +`, +}; + +const BATCH_ATTACH_KEYWORDS = ` +UNWIND $pairs AS p +MATCH (file:File {knowledgeId: p.knowledgeId, relativePath: p.relativePath}) +MERGE (kw:Keyword {name: p.name}) +MERGE (file)-[:HAS_KEYWORD]->(kw) +`; + +const BATCH_ATTACH_CLASSES = ` +UNWIND $pairs AS p +MATCH (file:File {knowledgeId: p.knowledgeId, relativePath: p.relativePath}) +MERGE (c:Class {signature: p.signature}) +MERGE (file)-[:HAS_CLASS]->(c) +`; + +const BATCH_ATTACH_FUNCTIONS = ` +UNWIND $pairs AS p +MATCH (file:File {knowledgeId: p.knowledgeId, relativePath: p.relativePath}) +MERGE (fn:Function {signature: p.signature}) +MERGE (file)-[:HAS_FUNCTION]->(fn) +`; + +const BATCH_ATTACH_IMPORTS_INTERNAL = ` +UNWIND $pairs AS p +MATCH (file:File {knowledgeId: p.knowledgeId, relativePath: p.relativePath}) +MERGE (m:Module {name: p.name}) +MERGE (file)-[:HAS_IMPORT_INTERNAL]->(m) +`; + +const BATCH_ATTACH_IMPORTS_EXTERNAL = ` +UNWIND $pairs AS p +MATCH (file:File {knowledgeId: p.knowledgeId, relativePath: 
p.relativePath}) +MERGE (m:Module {name: p.name}) +MERGE (file)-[:HAS_IMPORT_EXTERNAL]->(m) +`; + +type RelType = "HAS_KEYWORD" | "HAS_CLASS" | "HAS_FUNCTION" | "HAS_IMPORT_INTERNAL" | "HAS_IMPORT_EXTERNAL"; + +interface FileRow { + knowledgeId: string; + relativePath: string; +} + +export async function upsertFileNodesBatch(inputs: readonly UpsertFileNodeInput[]): Promise { + if (inputs.length === 0) { + return; + } + const updatedAt = new Date().toISOString(); + const files = inputs.map((input) => fileRowFor(input)); + const fileKeys: FileRow[] = inputs.map((input) => ({ knowledgeId: input.knowledgeId, relativePath: input.relativePath })); + const folderPairs = inputs + .filter((input): input is UpsertFileNodeInput & { folderPath: string } => input.folderPath !== undefined) + .map((input) => ({ + knowledgeId: input.knowledgeId, + relativePath: input.relativePath, + folderPath: input.folderPath, + })); + + const keywordPairs = flattenPairs(inputs, "keywords", "name", (v) => v.toLowerCase()); + const classPairs = flattenPairs(inputs, "classes", "signature"); + const functionPairs = flattenPairs(inputs, "functions", "signature"); + const importsInternalPairs = flattenPairs(inputs, "importsInternal", "name"); + const importsExternalPairs = flattenPairs(inputs, "importsExternal", "name"); + + const steps: CypherStep[] = [ + { query: BATCH_UPSERT_FILES, params: { files, updatedAt } }, + ]; + if (folderPairs.length > 0) { + steps.push({ query: BATCH_ATTACH_FILES_TO_FOLDERS, params: { pairs: folderPairs } }); + } + // Clear existing rels of every type for every file in the batch. 
+ for (const relType of ["HAS_KEYWORD", "HAS_CLASS", "HAS_FUNCTION", "HAS_IMPORT_INTERNAL", "HAS_IMPORT_EXTERNAL"] as const) { + steps.push({ query: BATCH_CLEAR_RELS_BY_TYPE[relType], params: { files: fileKeys } }); + } + if (keywordPairs.length > 0) { + steps.push({ query: BATCH_ATTACH_KEYWORDS, params: { pairs: keywordPairs } }); + } + if (classPairs.length > 0) { + steps.push({ query: BATCH_ATTACH_CLASSES, params: { pairs: classPairs } }); + } + if (functionPairs.length > 0) { + steps.push({ query: BATCH_ATTACH_FUNCTIONS, params: { pairs: functionPairs } }); + } + if (importsInternalPairs.length > 0) { + steps.push({ query: BATCH_ATTACH_IMPORTS_INTERNAL, params: { pairs: importsInternalPairs } }); + } + if (importsExternalPairs.length > 0) { + steps.push({ query: BATCH_ATTACH_IMPORTS_EXTERNAL, params: { pairs: importsExternalPairs } }); + } + + await _runInTransaction(steps); +} + +function fileRowFor(input: UpsertFileNodeInput): Record { + const sectionMap = input.analysis.sectionMap ?? []; + return { + knowledgeId: input.knowledgeId, + relativePath: input.relativePath, + orgId: input.orgId ?? "local", + repoId: input.repoId ?? input.knowledgeId, + language: input.language, + sha: input.sha, + sizeBytes: input.sizeBytes, + purpose: input.analysis.purpose, + summary: input.analysis.summary, + businessContext: input.analysis.businessContext, + dataFlowDirection: input.analysis.dataFlowDirection ?? "", + ontologyConcepts: input.analysis.ontologyConcepts ?? [], + businessEntities: input.analysis.businessEntities ?? [], + systemCapabilities: input.analysis.systemCapabilities ?? [], + sideEffects: input.analysis.sideEffects ?? [], + configDependencies: input.analysis.configDependencies ?? [], + integrationSurface: input.analysis.integrationSurface ?? [], + contractsProvided: input.analysis.contractsProvided ?? [], + contractsConsumed: input.analysis.contractsConsumed ?? 
[], + sectionNames: sectionMap.map((s) => s.name), + sectionDescriptions: sectionMap.map((s) => s.description), + isBigFile: input.isBigFile ?? false, + totalChunks: input.totalChunks ?? 0, + totalTokenCount: input.totalTokenCount ?? 0, + }; +} + +function flattenPairs( + inputs: readonly UpsertFileNodeInput[], + field: "keywords" | "classes" | "functions" | "importsInternal" | "importsExternal", + valueKey: "name" | "signature", + normalize?: (v: string) => string, +): Array> { + const out: Array> = []; + for (const input of inputs) { + const values = input.analysis[field]; + if (!Array.isArray(values)) { + continue; + } + for (const raw of values) { + const value = normalize !== undefined ? normalize(raw) : raw; + out.push({ knowledgeId: input.knowledgeId, relativePath: input.relativePath, [valueKey]: value }); + } + } + return out; +} + export async function upsertFileNode(input: UpsertFileNodeInput): Promise { const params = { knowledgeId: input.knowledgeId, relativePath: input.relativePath }; const sectionMap = input.analysis.sectionMap ?? []; diff --git a/packages/neo4j/src/folder.ts b/packages/neo4j/src/folder.ts index e862c3e..f4c8ad8 100644 --- a/packages/neo4j/src/folder.ts +++ b/packages/neo4j/src/folder.ts @@ -1,4 +1,4 @@ -import { _runCypher } from "./client.ts"; +import { _runCypher, _runInTransaction, type CypherStep } from "./client.ts"; import type { NodeScope } from "./repo.ts"; export interface FolderSummaryPayload { @@ -41,6 +41,80 @@ MERGE (kw:Keyword {name: name}) MERGE (folder)-[:HAS_KEYWORD]->(kw) `; +// ───────────────────────────────────────────────────────────────────────────── +// Batched folder upsert. Same Cypher shape as the single-shot path; wrapped +// with an outer UNWIND so one transaction lands every folder in the batch. 
+// ───────────────────────────────────────────────────────────────────────────── + +const BATCH_UPSERT_FOLDERS = ` +UNWIND $folders AS fld +MERGE (folder:Folder {orgId: fld.orgId, knowledgeId: fld.knowledgeId, repoId: fld.repoId, folderPath: fld.folderPath}) +SET folder.purpose = fld.purpose, + folder.summary = fld.summary, + folder.dependencyGraph = fld.dependencyGraph, + folder.updatedAt = $updatedAt +WITH folder, fld +MATCH (r:Repo {orgId: fld.orgId, knowledgeId: fld.knowledgeId, repoId: fld.repoId}) +MERGE (r)-[:CONTAINS]->(folder) +`; + +const BATCH_CLEAR_FOLDER_KEYWORDS = ` +UNWIND $folders AS fld +MATCH (folder:Folder {orgId: fld.orgId, knowledgeId: fld.knowledgeId, repoId: fld.repoId, folderPath: fld.folderPath})-[rel:HAS_KEYWORD]->() +DELETE rel +`; + +const BATCH_ATTACH_FOLDER_KEYWORDS = ` +UNWIND $pairs AS p +MATCH (folder:Folder {orgId: p.orgId, knowledgeId: p.knowledgeId, repoId: p.repoId, folderPath: p.folderPath}) +MERGE (kw:Keyword {name: p.name}) +MERGE (folder)-[:HAS_KEYWORD]->(kw) +`; + +export async function upsertFolderNodesBatch(inputs: readonly UpsertFolderNodeInput[]): Promise { + if (inputs.length === 0) { + return; + } + const updatedAt = new Date().toISOString(); + const folders = inputs.map((input) => ({ + orgId: input.scope.orgId, + knowledgeId: input.scope.knowledgeId, + repoId: input.scope.repoId, + folderPath: input.folderPath, + purpose: input.summary.purpose, + summary: input.summary.summary, + dependencyGraph: input.summary.dependencyGraph, + })); + const folderKeys = inputs.map((input) => ({ + orgId: input.scope.orgId, + knowledgeId: input.scope.knowledgeId, + repoId: input.scope.repoId, + folderPath: input.folderPath, + })); + const keywordPairs: Array> = []; + for (const input of inputs) { + for (const raw of input.summary.keywords) { + keywordPairs.push({ + orgId: input.scope.orgId, + knowledgeId: input.scope.knowledgeId, + repoId: input.scope.repoId, + folderPath: input.folderPath, + name: raw.toLowerCase(), + }); + } + } + 
+ const steps: CypherStep[] = [ + { query: BATCH_UPSERT_FOLDERS, params: { folders, updatedAt } }, + { query: BATCH_CLEAR_FOLDER_KEYWORDS, params: { folders: folderKeys } }, + ]; + if (keywordPairs.length > 0) { + steps.push({ query: BATCH_ATTACH_FOLDER_KEYWORDS, params: { pairs: keywordPairs } }); + } + + await _runInTransaction(steps); +} + export async function upsertFolderNode(input: UpsertFolderNodeInput): Promise { const scope = input.scope; const params = { diff --git a/packages/neo4j/src/index.ts b/packages/neo4j/src/index.ts index 03b51c0..c581c80 100644 --- a/packages/neo4j/src/index.ts +++ b/packages/neo4j/src/index.ts @@ -12,13 +12,13 @@ export { deleteKnowledgeGraph, } from "./knowledge.ts"; -export { upsertFileNode, deleteFileNodes } from "./files.ts"; +export { upsertFileNode, upsertFileNodesBatch, deleteFileNodes } from "./files.ts"; export type { UpsertFileNodeInput } from "./files.ts"; export { upsertRepoNode } from "./repo.ts"; export type { NodeScope, RepoSummaryPayload, UpsertRepoNodeInput } from "./repo.ts"; -export { upsertFolderNode } from "./folder.ts"; +export { upsertFolderNode, upsertFolderNodesBatch } from "./folder.ts"; export type { FolderSummaryPayload, UpsertFolderNodeInput } from "./folder.ts"; export { snapshotFilesToVersion } from "./fileVersions.ts"; diff --git a/packages/types/src/config.ts b/packages/types/src/config.ts index 1e72f67..c878718 100644 --- a/packages/types/src/config.ts +++ b/packages/types/src/config.ts @@ -26,6 +26,7 @@ export enum Config { LlmConcurrency = "llm.concurrency", FolderSummaryBatchSize = "folder.summary.batch.size", FolderSummaryBatchMaxFiles = "folder.summary.batch.max.files", + Neo4jBatchSize = "neo4j.batch.size", CondenseContextLimit = "condense.context.limit", CondensePromptOverhead = "condense.prompt.overhead", SmallFileDedupThreshold = "small.file.dedup.threshold", From e45277dba6888422915ad378f4d86c3dbc12fd60 Mon Sep 17 00:00:00 2001 From: Dead-Bytes 
<143434285+Dead-Bytes@users.noreply.github.com> Date: Fri, 22 May 2026 18:11:30 +0530 Subject: [PATCH 30/34] chore(format): clean up README formatting and improve code readability across multiple files --- packages/ingest-github/README.md | 10 +++++----- packages/ingest-github/src/pipeline/README.md | 1 + packages/ingest-github/src/pipeline/scan.ts | 4 +--- .../src/pipeline/skip-decisions/README.md | 8 ++++---- .../src/strategies/flat-folder/README.md | 16 ++++++++-------- .../flat-folder/big-file/condenser.ts | Bin 10066 -> 10091 bytes .../flat-folder/folder-summary-selective.ts | 9 ++------- .../strategies/flat-folder/folder-summary.ts | 8 ++++++-- .../flat-folder/phases/process-big-files.ts | 7 ++++++- packages/llm/README.md | 4 ++-- packages/mongo/src/aggregateStats.ts | 8 +------- packages/neo4j/README.md | 2 +- packages/neo4j/src/files.ts | 17 ++++++++++++----- 13 files changed, 49 insertions(+), 45 deletions(-) diff --git a/packages/ingest-github/README.md b/packages/ingest-github/README.md index 9d28387..93d7786 100644 --- a/packages/ingest-github/README.md +++ b/packages/ingest-github/README.md @@ -166,17 +166,17 @@ worker hardcodes a single `IngestionStrategy` instance (currently per folder. Bigger folders take the individual single-folder path. Roll back to one LLM call per folder via `bytebell set folder.summary.batch.size 1`. -2. **Clone idempotent.** Re-runs (BullMQ retries) call `git fetch` + +3. **Clone idempotent.** Re-runs (BullMQ retries) call `git fetch` + `git reset --hard` in the existing dir rather than re-cloning. Tokens are re-injected into the remote URL each time. -3. **Token redaction.** `GitCloneError` carries the **redacted** repo +4. **Token redaction.** `GitCloneError` carries the **redacted** repo URL (`https://user:***@host`) — the raw `gitToken` never appears in error messages or logs. -4. **State transition order.** `Processing` is set _before_ any clone +5. **State transition order.** `Processing` is set _before_ any clone work. 
`Processed` is set _only_ after the entire scan + analyze loop completes. On any thrown error, the handler best-effort sets `Failed` then re-throws so BullMQ records the retry. -5. **Fail-soft analysis, fail-hard infra.** A single file's LLM call +6. **Fail-soft analysis, fail-hard infra.** A single file's LLM call failing falls back to an empty-analysis Raw doc and processing continues. In the big-file path, a single chunk failure contributes an empty analysis to the merge but does not stop the file; a @@ -184,7 +184,7 @@ worker hardcodes a single `IngestionStrategy` instance (currently `dedupAnalyses` so the merged result is always well-formed. A clone failure or Mongo write failure throws and propagates to BullMQ for retry under the queue's `attempts: 3`. -6. **Hardcoded filters only.** No LLM-based ignore decisions in v0. The +7. **Hardcoded filters only.** No LLM-based ignore decisions in v0. The directory / file / extension blocklists in `scan.ts` are the only way files get skipped. diff --git a/packages/ingest-github/src/pipeline/README.md b/packages/ingest-github/src/pipeline/README.md index ae9da32..7c7b0d6 100644 --- a/packages/ingest-github/src/pipeline/README.md +++ b/packages/ingest-github/src/pipeline/README.md @@ -60,6 +60,7 @@ deps.skipDecider.decide(input)` per file. Same semantics as before this `readScannedFile` re-reads a file by absolute path for the big-file phase which streams content lazily. + - `run.ts` — `createPipelineRunner({ reposRootDir, strategy, sourceFactory?, progressContextFactory? })` builds an `IngestRunnerDeps`. GitHub payloads run: branch resolve, source-reader construction, strategy execute, commit persistence. 
Local diff --git a/packages/ingest-github/src/pipeline/scan.ts b/packages/ingest-github/src/pipeline/scan.ts index fda9236..d7d9db6 100644 --- a/packages/ingest-github/src/pipeline/scan.ts +++ b/packages/ingest-github/src/pipeline/scan.ts @@ -167,9 +167,7 @@ async function* twoPassScan( } } logger.info(`scan: resolving ${unique.size} unique skip-decision keys for ${pending.length} pending files`); - await Promise.all( - Array.from(unique.values()).map((input) => limiter(() => decider.decideAndDeferSave(input))), - ); + await Promise.all(Array.from(unique.values()).map((input) => limiter(() => decider.decideAndDeferSave(input)))); decider.persist(); } diff --git a/packages/ingest-github/src/pipeline/skip-decisions/README.md b/packages/ingest-github/src/pipeline/skip-decisions/README.md index 4a6fa2f..18d80bb 100644 --- a/packages/ingest-github/src/pipeline/skip-decisions/README.md +++ b/packages/ingest-github/src/pipeline/skip-decisions/README.md @@ -27,10 +27,10 @@ its batch. ```ts interface SkipDecider { - decide(input): Promise; // legacy single-shot path - decideStatic(input): SkipDecision | null; // sync; null = needs LLM - decideAndDeferSave(input): Promise; // LLM call, no disk save - persist(): void; // flush cache to disk once + decide(input): Promise; // legacy single-shot path + decideStatic(input): SkipDecision | null; // sync; null = needs LLM + decideAndDeferSave(input): Promise; // LLM call, no disk save + persist(): void; // flush cache to disk once } ``` diff --git a/packages/ingest-github/src/strategies/flat-folder/README.md b/packages/ingest-github/src/strategies/flat-folder/README.md index 5a725f4..78d8acf 100644 --- a/packages/ingest-github/src/strategies/flat-folder/README.md +++ b/packages/ingest-github/src/strategies/flat-folder/README.md @@ -21,30 +21,30 @@ this single pool. One knob bounds total in-flight LLM concurrency. 
**two-pass** strategy: walk + cache-only `decideStatic` first, then parallel-deduplicated LLM resolution for unknown extensions/filenames through the shared limiter, then drain. -2a. **analyse-small** (`phases/analyse-small.ts`) — reads the manifest's + 2a. **analyse-small** (`phases/analyse-small.ts`) — reads the manifest's `kind: "small"` entries, re-opens content, runs the LLM file-analyser per file under the shared limiter, writes `CondensedFileAnalysis` JSON. Also writes oversized stubs. -2b. **analyse-big-files** (`phases/process-big-files.ts` — + 2b. **analyse-big-files** (`phases/process-big-files.ts` — `analyseBigFiles`) — chunk-task queue across all big files. Every chunk is an independent task on the shared limiter; per-file condense is scheduled as soon as that file's last chunk lands (one in-place retry on transient condense failures). Runs **concurrently with 2a**. -3. **backfill-fields** (`backfill/fields.ts`) — for each cached condensed +2. **backfill-fields** (`backfill/fields.ts`) — for each cached condensed entry with missing extended fields (`keywords`, `sideEffects`, `dataFlowDirection`, `sectionMap`, …) dispatches one LLM call through the shared limiter to fill the gaps. Idempotent — no-op on a complete entry. -4. **summarise-folders** (`folder-summary.ts`) — groups condensed entries +3. **summarise-folders** (`folder-summary.ts`) — groups condensed entries by direct parent folder. Small folders (`≤ Config.FolderSummaryBatchMaxFiles`, default 15) are batched up to `Config.FolderSummaryBatchSize` (default 10) per LLM call. Bigger folders take the individual single-folder path. Both flows run through the shared limiter. -5. **summarise-repo** (`repo-summary.ts`) — load folder summaries +4. **summarise-repo** (`repo-summary.ts`) — load folder summaries shallowest-first; one call if it fits `ContextWindowLimit`, batch + merge otherwise; persist `repo-summary.json` with the v2-flat envelope. -6. 
**store-flat-analysis** (`phases/store-flat-analysis.ts`) — ensure +5. **store-flat-analysis** (`phases/store-flat-analysis.ts`) — ensure flat-folder indexes, upsert `:Repo`, then every `:Folder`, then every `:File` with the extended analysis + Folder→File `CONTAINS` edge. @@ -106,8 +106,8 @@ The strategy emits progress through the `ProgressContext` port defined in `writeScanManifest`. The canonical handoff between phase 1 and phases 2a/2b. - `folder-path.ts` — `directFolderOf`, `affectedFolderPaths`. - `folder-summary.ts` — group + summarise (individual or batched) + persist - + iterate folder summaries; shared `dispatchFolderSummaries` used by both - the main strategy and the pull-path's selective folder phase. + - iterate folder summaries; shared `dispatchFolderSummaries` used by both + the main strategy and the pull-path's selective folder phase. - `folder-summary-selective.ts` — pull-time selective folder summary phase. - `repo-summary.ts` — single-shot or batched repo summary with envelope writer. - `phases/scan-and-classify.ts` — Phase 1. 
diff --git a/packages/ingest-github/src/strategies/flat-folder/big-file/condenser.ts b/packages/ingest-github/src/strategies/flat-folder/big-file/condenser.ts index fdde9b835a295bd1137c8c84fd809dc1a180b219..a4663bd8edfe7468d01368a7aeb5affc0a9bdc0e 100644 GIT binary patch delta 94 zcmccQ_u6lR1YaPR0vI^^`?IXTYxc_pb8B?|WT3e~j`xy=E5Op*W+{~C$_ delta 50 zcmaFucgb&q1m9$NK5<2b(&E&jfTH}|f)X8toSa[] = []; for (const bucket of individual) { - tasks.push(limiter(() => dispatchIndividual(bucket, metaPaths, totals, llmCallContext, reporter, knowledgeId, phaseLabel))); + tasks.push( + limiter(() => dispatchIndividual(bucket, metaPaths, totals, llmCallContext, reporter, knowledgeId, phaseLabel)), + ); } for (const batch of batches) { - tasks.push(limiter(() => dispatchBatch(batch, metaPaths, totals, llmCallContext, reporter, knowledgeId, phaseLabel))); + tasks.push( + limiter(() => dispatchBatch(batch, metaPaths, totals, llmCallContext, reporter, knowledgeId, phaseLabel)), + ); } await Promise.all(tasks); return totals; diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts b/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts index 1577849..2f8b7ba 100644 --- a/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts +++ b/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts @@ -16,7 +16,12 @@ import { inspect } from "#src/strategies/flat-folder/big-file/cache.ts"; import { splitFileIntoChunks } from "#src/strategies/flat-folder/big-file/chunker.ts"; import { analyzeChunk } from "#src/strategies/flat-folder/big-file/chunk-analyzer.ts"; import { condenseChunks } from "#src/strategies/flat-folder/big-file/condenser.ts"; -import { loadChunkIfPresent, saveChunk, saveCondensed, saveManifest } from "#src/strategies/flat-folder/big-file/storage.ts"; +import { + loadChunkIfPresent, + saveChunk, + saveCondensed, + saveManifest, +} from 
"#src/strategies/flat-folder/big-file/storage.ts"; import { processBigFile } from "#src/strategies/flat-folder/big-file/index.ts"; import type { ScanManifest, ScanManifestEntry } from "#src/strategies/flat-folder/scan-manifest.ts"; diff --git a/packages/llm/README.md b/packages/llm/README.md index 2deb951..64e6cef 100644 --- a/packages/llm/README.md +++ b/packages/llm/README.md @@ -156,9 +156,9 @@ it. The cost ledger described in [docs/arch.md](../../docs/arch.md) is sees a single `AskLlmResult`. BullMQ's `attempts: 3` wraps the whole call — retries walk the chain again, useful when a transient OpenRouter outage clears between retries. -4a. **No upstream-provider fallback.** Every request carries + 4a. **No upstream-provider fallback.** Every request carries `provider: { allow_fallbacks: false }`. This is orthogonal to the - `models` chain in invariant 4 — `models` controls *which model* the + `models` chain in invariant 4 — `models` controls _which model_ the gateway tries; `allow_fallbacks` controls whether OpenRouter routes to a different upstream backend serving the same model when the first one stalls. We disable the latter so a slow provider cannot eat the diff --git a/packages/mongo/src/aggregateStats.ts b/packages/mongo/src/aggregateStats.ts index 0cfa6a8..95f7d59 100644 --- a/packages/mongo/src/aggregateStats.ts +++ b/packages/mongo/src/aggregateStats.ts @@ -1,10 +1,4 @@ -import type { - KnowledgeDoc, - StatsCommitEntry, - StatsRepoEntry, - StatsResponse, - StatsTotals, -} from "@bb/types"; +import type { KnowledgeDoc, StatsCommitEntry, StatsRepoEntry, StatsResponse, StatsTotals } from "@bb/types"; import { _getDb } from "./client.ts"; import { Collections } from "./collections.ts"; diff --git a/packages/neo4j/README.md b/packages/neo4j/README.md index f597e1b..ba441b2 100644 --- a/packages/neo4j/README.md +++ b/packages/neo4j/README.md @@ -42,7 +42,7 @@ The package owns: route. 
- File-node CRUD (`upsertFileNode`, `upsertFileNodesBatch`) — composes the per-file relationships (`:HAS_KEYWORD / :HAS_CLASS / :HAS_FUNCTION - / :HAS_IMPORT_INTERNAL / :HAS_IMPORT_EXTERNAL`), clearing stale +/ :HAS_IMPORT_INTERNAL / :HAS_IMPORT_EXTERNAL`), clearing stale relationships before re-attaching for re-runs. The two-`:HAS_IMPORT_*` split mirrors kube-package's distinction between relative imports and external packages — downstream MCP queries can ask "which files diff --git a/packages/neo4j/src/files.ts b/packages/neo4j/src/files.ts index 01695ea..7d049e3 100644 --- a/packages/neo4j/src/files.ts +++ b/packages/neo4j/src/files.ts @@ -258,7 +258,10 @@ export async function upsertFileNodesBatch(inputs: readonly UpsertFileNodeInput[ } const updatedAt = new Date().toISOString(); const files = inputs.map((input) => fileRowFor(input)); - const fileKeys: FileRow[] = inputs.map((input) => ({ knowledgeId: input.knowledgeId, relativePath: input.relativePath })); + const fileKeys: FileRow[] = inputs.map((input) => ({ + knowledgeId: input.knowledgeId, + relativePath: input.relativePath, + })); const folderPairs = inputs .filter((input): input is UpsertFileNodeInput & { folderPath: string } => input.folderPath !== undefined) .map((input) => ({ @@ -273,14 +276,18 @@ export async function upsertFileNodesBatch(inputs: readonly UpsertFileNodeInput[ const importsInternalPairs = flattenPairs(inputs, "importsInternal", "name"); const importsExternalPairs = flattenPairs(inputs, "importsExternal", "name"); - const steps: CypherStep[] = [ - { query: BATCH_UPSERT_FILES, params: { files, updatedAt } }, - ]; + const steps: CypherStep[] = [{ query: BATCH_UPSERT_FILES, params: { files, updatedAt } }]; if (folderPairs.length > 0) { steps.push({ query: BATCH_ATTACH_FILES_TO_FOLDERS, params: { pairs: folderPairs } }); } // Clear existing rels of every type for every file in the batch. 
- for (const relType of ["HAS_KEYWORD", "HAS_CLASS", "HAS_FUNCTION", "HAS_IMPORT_INTERNAL", "HAS_IMPORT_EXTERNAL"] as const) { + for (const relType of [ + "HAS_KEYWORD", + "HAS_CLASS", + "HAS_FUNCTION", + "HAS_IMPORT_INTERNAL", + "HAS_IMPORT_EXTERNAL", + ] as const) { steps.push({ query: BATCH_CLEAR_RELS_BY_TYPE[relType], params: { files: fileKeys } }); } if (keywordPairs.length > 0) { From f5cdaa3beba9b61d94355ec1ade0094425ad7a97 Mon Sep 17 00:00:00 2001 From: Dead-Bytes <143434285+Dead-Bytes@users.noreply.github.com> Date: Fri, 22 May 2026 18:13:54 +0530 Subject: [PATCH 31/34] chore: update bun.lock to reflect dependency changes --- bun.lock | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/bun.lock b/bun.lock index 92fb911..0042f56 100644 --- a/bun.lock +++ b/bun.lock @@ -1,6 +1,5 @@ { "lockfileVersion": 1, - "configVersion": 0, "workspaces": { "": { "name": "bytebell-public", @@ -56,6 +55,20 @@ "@bb/types": "workspace:*", }, }, + "packages/ingest-business-context": { + "name": "@bb/ingest-business-context", + "version": "0.0.0", + "dependencies": { + "@bb/config": "workspace:*", + "@bb/errors": "workspace:*", + "@bb/ingest-github": "workspace:*", + "@bb/llm": "workspace:*", + "@bb/logger": "workspace:*", + "@bb/neo4j": "workspace:*", + "@bb/queue": "workspace:*", + "@bb/types": "workspace:*", + }, + }, "packages/ingest-github": { "name": "@bb/ingest-github", "version": "0.0.0", @@ -191,6 +204,8 @@ "@bb/errors": ["@bb/errors@workspace:packages/errors"], + "@bb/ingest-business-context": ["@bb/ingest-business-context@workspace:packages/ingest-business-context"], + "@bb/ingest-github": ["@bb/ingest-github@workspace:packages/ingest-github"], "@bb/llm": ["@bb/llm@workspace:packages/llm"], From 0a58a29d18c0b52302fe5e9c9fd4a3fad457511e Mon Sep 17 00:00:00 2001 From: Dead-Bytes <143434285+Dead-Bytes@users.noreply.github.com> Date: Fri, 22 May 2026 18:23:16 +0530 Subject: [PATCH 32/34] refactor: rename import for analyseBigFiles and 
remove legacy processBigFiles code --- .../src/strategies/flat-folder/index.ts | 2 +- .../flat-folder/phases/analyse-big-files.ts | 287 ++++++++++++++++++ .../flat-folder/phases/process-big-files.ts | 286 +---------------- 3 files changed, 290 insertions(+), 285 deletions(-) create mode 100644 packages/ingest-github/src/strategies/flat-folder/phases/analyse-big-files.ts diff --git a/packages/ingest-github/src/strategies/flat-folder/index.ts b/packages/ingest-github/src/strategies/flat-folder/index.ts index 5093568..86797a6 100644 --- a/packages/ingest-github/src/strategies/flat-folder/index.ts +++ b/packages/ingest-github/src/strategies/flat-folder/index.ts @@ -8,7 +8,7 @@ import { classifyFailure } from "#src/pipeline/failure-classifier.ts"; import { withConcurrency } from "#src/pipeline/concurrency.ts"; import { scanAndClassify } from "./phases/scan-and-classify.ts"; import { analyseSmallFiles } from "./phases/analyse-small.ts"; -import { analyseBigFiles } from "./phases/process-big-files.ts"; +import { analyseBigFiles } from "./phases/analyse-big-files.ts"; import { backfillMissingFields } from "./backfill/fields.ts"; import { FileAnalysisCache } from "./file-analysis-cache.ts"; import { runFolderSummaryPhase } from "./folder-summary.ts"; diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/analyse-big-files.ts b/packages/ingest-github/src/strategies/flat-folder/phases/analyse-big-files.ts new file mode 100644 index 0000000..33f6446 --- /dev/null +++ b/packages/ingest-github/src/strategies/flat-folder/phases/analyse-big-files.ts @@ -0,0 +1,287 @@ +import { createHash } from "node:crypto"; +import { logger } from "@bb/logger"; +import { Config } from "@bb/types"; +import { getConfigValue } from "@bb/config"; +import type { AskLlmOptions } from "@bb/llm"; +import { LlmConfigError, LlmError } from "@bb/errors"; +import type { MetaPaths } from "#src/types/meta-paths.ts"; +import type { AnalyzedFileResult, SourceReader } from 
"#src/types/pipeline.ts"; +import type { ProgressContext } from "#src/progress/types.ts"; +import type { ConcurrencyLimiter } from "#src/pipeline/concurrency.ts"; +import type { ChunkAnalysisResult, FileChunk, HugeFileManifest } from "#src/types/big-file.ts"; +import type { CondensedFileAnalysis } from "#src/types/condensed-file-analysis.ts"; +import { throwIfCancelled, CancellationError } from "#src/pipeline/cancellation.ts"; +import { inspect } from "#src/strategies/flat-folder/big-file/cache.ts"; +import { splitFileIntoChunks } from "#src/strategies/flat-folder/big-file/chunker.ts"; +import { analyzeChunk } from "#src/strategies/flat-folder/big-file/chunk-analyzer.ts"; +import { condenseChunks } from "#src/strategies/flat-folder/big-file/condenser.ts"; +import { + loadChunkIfPresent, + saveChunk, + saveCondensed, + saveManifest, +} from "#src/strategies/flat-folder/big-file/storage.ts"; +import type { ScanManifest, ScanManifestEntry } from "#src/strategies/flat-folder/scan-manifest.ts"; +import type { ProcessBigFilesResult } from "#src/strategies/flat-folder/phases/process-big-files.ts"; +import { describe } from "#src/strategies/flat-folder/phases/process-big-files.ts"; + +const CONDENSE_MAX_ATTEMPTS = 2; +const CONDENSE_RETRY_BACKOFF_MS = 2000; + +export interface AnalyseBigFilesInput { + knowledgeId: string; + manifest: ScanManifest; + source: SourceReader; + metaPaths: MetaPaths; + limiter: ConcurrencyLimiter; + llmCallContext?: AskLlmOptions; + progressContext?: ProgressContext; +} + +interface BigFileState { + entry: ScanManifestEntry; + content: string; + chunks: FileChunk[]; + results: (ChunkAnalysisResult | undefined)[]; + pendingChunks: number; + fatal: boolean; +} + +/** + * Manifest-driven big-file phase. Every chunk of every big file is an + * independent task scheduled through the shared LLM limiter. 
As soon as the + * last chunk of a given file lands, that file's condense is scheduled — + * multiple condenses run in parallel with the still-pending chunks of slower + * files. All LLM calls (chunk + condense) check out from the same limiter. + * + * Files already fully processed (manifest + condensed on disk) are skipped. + */ +export async function analyseBigFiles(input: AnalyseBigFilesInput): Promise<ProcessBigFilesResult> { + const maxTokensPerChunk = getConfigValue(Config.MaxTokensPerChunk); + const bigEntries = input.manifest.entries.filter((e) => e.kind === "big"); + + let cached = 0; + let failed = 0; + let processed = 0; + let totalInputTokens = 0; + let totalOutputTokens = 0; + let totalCostUsd = 0; + + // Per-file preparation: read content, chunk, record state. Sequential and + // cheap — no LLM calls here. + const states: BigFileState[] = []; + for (const entry of bigEntries) { + throwIfCancelled(input.knowledgeId); + const status = await inspect(input.metaPaths, entry.relativePath); + if (status === "complete") { + cached += 1; + continue; + } + let content: string; + try { + content = await input.source.readFile(entry.relativePath); + } catch (cause: unknown) { + failed += 1; + logger.warn(`analyse-big: read failed for ${entry.relativePath}: ${describe(cause)}`); + continue; + } + if (content.length === 0) { + failed += 1; + logger.warn(`analyse-big: empty content for ${entry.relativePath}; skipping`); + continue; + } + const chunks = splitFileIntoChunks(entry.relativePath, content, maxTokensPerChunk); + states.push({ + entry, + content, + chunks, + results: new Array(chunks.length), + pendingChunks: chunks.length, + fatal: false, + }); + logger.info(`analyse-big: ${entry.relativePath} split into ${chunks.length} chunks`); + } + + const totalChunks = states.reduce((acc, s) => acc + s.chunks.length, 0); + const chunkReporter = input.progressContext?.reporter({ + phase: "file_analysis", + subPhase: "big_files_chunks", + total: { kind: "fixed", total: totalChunks }, + }); +
await chunkReporter?.start(); + const condenseReporter = input.progressContext?.reporter({ + phase: "file_analysis", + subPhase: "big_files_condense", + total: { kind: "fixed", total: states.length }, + }); + await condenseReporter?.start(); + + // For oversized entries the legacy phase counted them; we accept the manifest + // already accounted for them via the small phase (which writes the stub). + // Surfaced here for parity with the legacy result shape. + const skippedOversized = input.manifest.entries.filter((e) => e.kind === "oversized").length; + + const condensePromises: Promise<void>[] = []; + + function maybeScheduleCondense(state: BigFileState): void { + if (state.pendingChunks > 0 || state.fatal) { + return; + } + const definedResults = state.results.filter((r): r is ChunkAnalysisResult => r !== undefined); + condensePromises.push( + input.limiter(async () => { + throwIfCancelled(input.knowledgeId); + let merged: AnalyzedFileResult | null = null; + for (let attempt = 1; attempt <= CONDENSE_MAX_ATTEMPTS; attempt += 1) { + try { + merged = await condenseChunks(state.entry.relativePath, definedResults, input.llmCallContext); + break; + } catch (cause: unknown) { + if (cause instanceof CancellationError) { + throw cause; + } + if (cause instanceof LlmConfigError || cause instanceof LlmError) { + throw cause; + } + if (attempt < CONDENSE_MAX_ATTEMPTS) { + logger.warn( + `analyse-big: condense attempt ${attempt}/${CONDENSE_MAX_ATTEMPTS} failed for ${state.entry.relativePath}; retrying: ${describe(cause)}`, + ); + await sleep(CONDENSE_RETRY_BACKOFF_MS); + continue; + } + failed += 1; + logger.warn( + `analyse-big: condense failed after ${CONDENSE_MAX_ATTEMPTS} attempts for ${state.entry.relativePath}: ${describe(cause)}`, + ); + } + } + if (merged === null) { + condenseReporter?.increment(1, { fileName: state.entry.relativePath }); + return; + } + + try { + const chunkInputTokens = definedResults.reduce((acc, r) => acc + (r.tokenUsage?.inputTokens ??
0), 0); + const chunkOutputTokens = definedResults.reduce((acc, r) => acc + (r.tokenUsage?.outputTokens ?? 0), 0); + const chunkCostUsd = definedResults.reduce((acc, r) => acc + (r.tokenUsage?.costUsd ?? 0), 0); + const totalTokenCount = state.chunks.reduce((acc, c) => acc + c.tokenCount, 0); + const totalIn = chunkInputTokens + (merged.tokenUsage?.inputTokens ?? 0); + const totalOut = chunkOutputTokens + (merged.tokenUsage?.outputTokens ?? 0); + const totalCost = chunkCostUsd + (merged.tokenUsage?.costUsd ?? 0); + + const manifest: HugeFileManifest = { + relativePath: state.entry.relativePath, + totalChunks: state.chunks.length, + totalTokenCount, + chunkPaths: state.chunks.map((_, i) => `chunks/${encodeFolder(state.entry.relativePath)}/chunk-${i}.json`), + generatedAt: new Date().toISOString(), + }; + await saveManifest(input.metaPaths, manifest); + + const condensed: CondensedFileAnalysis = { + relativePath: state.entry.relativePath, + language: merged.language, + sha256: sha256(state.content), + sizeBytes: state.entry.sizeBytes, + tokenCount: totalTokenCount, + isBigFile: true, + totalChunks: state.chunks.length, + totalTokenCount, + analysedAt: new Date().toISOString(), + analysis: merged.analysis, + tokenUsage: { inputTokens: totalIn, outputTokens: totalOut, costUsd: totalCost }, + }; + await saveCondensed(input.metaPaths, condensed); + + totalInputTokens += totalIn; + totalOutputTokens += totalOut; + totalCostUsd += totalCost; + processed += 1; + } catch (cause: unknown) { + if (cause instanceof CancellationError) { + throw cause; + } + failed += 1; + logger.warn(`analyse-big: persist failed for ${state.entry.relativePath}: ${describe(cause)}`); + } finally { + condenseReporter?.increment(1, { fileName: state.entry.relativePath }); + } + }), + ); + } + + const chunkPromises: Promise<void>[] = []; + for (const state of states) { + for (let i = 0; i < state.chunks.length; i += 1) { + const idx = i; + const chunk = state.chunks[idx]; + if (chunk === undefined) { +
continue; + } + chunkPromises.push( + input.limiter(async () => { + throwIfCancelled(input.knowledgeId); + try { + const cachedChunk = await loadChunkIfPresent(input.metaPaths, state.entry.relativePath, idx); + if (cachedChunk !== null) { + state.results[idx] = cachedChunk; + } else { + const analyzed = await analyzeChunk(chunk, input.llmCallContext); + await saveChunk(input.metaPaths, analyzed); + state.results[idx] = analyzed; + } + } catch (cause: unknown) { + if (cause instanceof CancellationError) { + throw cause; + } + if (cause instanceof LlmConfigError || cause instanceof LlmError) { + state.fatal = true; + throw cause; + } + logger.warn( + `analyse-big: chunk ${idx + 1}/${state.chunks.length} failed for ${state.entry.relativePath}: ${describe(cause)}`, + ); + } finally { + state.pendingChunks -= 1; + chunkReporter?.increment(1, { fileName: `${state.entry.relativePath}#chunk-${String(idx)}` }); + maybeScheduleCondense(state); + } + }), + ); + } + } + + try { + await Promise.all(chunkPromises); + await Promise.all(condensePromises); + } finally { + chunkReporter?.stop(); + condenseReporter?.stop(); + } + + logger.info( + `analyse-big done: processed=${processed} cached=${cached} failed=${failed} skippedOversized=${skippedOversized}`, + ); + return { + processed, + cached, + failed, + skippedOversized, + tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens, costUsd: totalCostUsd }, + }; +} + +function sha256(content: string): string { + return createHash("sha256").update(content).digest("hex"); +} + +function encodeFolder(relativePath: string): string { + return relativePath.replace(/\//gu, "__SL__").replace(/\\/gu, "__BS__"); +} + +function sleep(ms: number): Promise<void> { + return new Promise((resolve) => { + setTimeout(resolve, ms); + }); +} diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts b/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts index 2f8b7ba..951b10e
100644 --- a/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts +++ b/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts @@ -1,32 +1,13 @@ -import { createHash } from "node:crypto"; import { logger } from "@bb/logger"; -import { Config } from "@bb/types"; -import { getConfigValue } from "@bb/config"; import type { AskLlmOptions } from "@bb/llm"; import { LlmConfigError, LlmError } from "@bb/errors"; import type { MetaPaths } from "#src/types/meta-paths.ts"; -import type { AnalyzedFileResult, SourceReader } from "#src/types/pipeline.ts"; +import type { SourceReader } from "#src/types/pipeline.ts"; import type { ProgressContext } from "#src/progress/types.ts"; -import type { ConcurrencyLimiter } from "#src/pipeline/concurrency.ts"; -import type { ChunkAnalysisResult, FileChunk, HugeFileManifest } from "#src/types/big-file.ts"; -import type { CondensedFileAnalysis } from "#src/types/condensed-file-analysis.ts"; import { throwIfCancelled, CancellationError } from "#src/pipeline/cancellation.ts"; import { readBigFiles } from "#src/strategies/flat-folder/big-file/detector.ts"; import { inspect } from "#src/strategies/flat-folder/big-file/cache.ts"; -import { splitFileIntoChunks } from "#src/strategies/flat-folder/big-file/chunker.ts"; -import { analyzeChunk } from "#src/strategies/flat-folder/big-file/chunk-analyzer.ts"; -import { condenseChunks } from "#src/strategies/flat-folder/big-file/condenser.ts"; -import { - loadChunkIfPresent, - saveChunk, - saveCondensed, - saveManifest, -} from "#src/strategies/flat-folder/big-file/storage.ts"; import { processBigFile } from "#src/strategies/flat-folder/big-file/index.ts"; -import type { ScanManifest, ScanManifestEntry } from "#src/strategies/flat-folder/scan-manifest.ts"; - -const CONDENSE_MAX_ATTEMPTS = 2; -const CONDENSE_RETRY_BACKOFF_MS = 2000; export interface ProcessBigFilesInput { knowledgeId: string; @@ -139,269 +120,6 @@ export async function 
processBigFilesQueue(input: ProcessBigFilesInput): Promise } } -// --------------------------------------------------------------------------- -// Chunk-queue model (manifest-driven) -// --------------------------------------------------------------------------- - -export interface AnalyseBigFilesInput { - knowledgeId: string; - manifest: ScanManifest; - source: SourceReader; - metaPaths: MetaPaths; - limiter: ConcurrencyLimiter; - llmCallContext?: AskLlmOptions; - progressContext?: ProgressContext; -} - -interface BigFileState { - entry: ScanManifestEntry; - content: string; - chunks: FileChunk[]; - results: (ChunkAnalysisResult | undefined)[]; - pendingChunks: number; - fatal: boolean; -} - -/** - * Manifest-driven big-file phase. Every chunk of every big file is an - * independent task scheduled through the shared LLM limiter. As soon as the - * last chunk of a given file lands, that file's condense is scheduled — - * multiple condenses run in parallel with the still-pending chunks of slower - * files. All LLM calls (chunk + condense) check out from the same limiter. - * - * Files already fully processed (manifest + condensed on disk) are skipped. - */ -export async function analyseBigFiles(input: AnalyseBigFilesInput): Promise<ProcessBigFilesResult> { - const maxTokensPerChunk = getConfigValue(Config.MaxTokensPerChunk); - const bigEntries = input.manifest.entries.filter((e) => e.kind === "big"); - - let cached = 0; - let skippedOversized = 0; - let failed = 0; - let processed = 0; - let totalInputTokens = 0; - let totalOutputTokens = 0; - let totalCostUsd = 0; - - // Per-file preparation: read content, chunk, record state. Sequential and - // cheap — no LLM calls here.
- const states: BigFileState[] = []; - for (const entry of bigEntries) { - throwIfCancelled(input.knowledgeId); - const status = await inspect(input.metaPaths, entry.relativePath); - if (status === "complete") { - cached += 1; - continue; - } - let content: string; - try { - content = await input.source.readFile(entry.relativePath); - } catch (cause: unknown) { - failed += 1; - logger.warn(`analyse-big: read failed for ${entry.relativePath}: ${describe(cause)}`); - continue; - } - if (content.length === 0) { - failed += 1; - logger.warn(`analyse-big: empty content for ${entry.relativePath}; skipping`); - continue; - } - const chunks = splitFileIntoChunks(entry.relativePath, content, maxTokensPerChunk); - states.push({ - entry, - content, - chunks, - results: new Array(chunks.length), - pendingChunks: chunks.length, - fatal: false, - }); - logger.info(`analyse-big: ${entry.relativePath} split into ${chunks.length} chunks`); - } - - const totalChunks = states.reduce((acc, s) => acc + s.chunks.length, 0); - const chunkReporter = input.progressContext?.reporter({ - phase: "file_analysis", - subPhase: "big_files_chunks", - total: { kind: "fixed", total: totalChunks }, - }); - await chunkReporter?.start(); - const condenseReporter = input.progressContext?.reporter({ - phase: "file_analysis", - subPhase: "big_files_condense", - total: { kind: "fixed", total: states.length }, - }); - await condenseReporter?.start(); - - // For oversized entries the legacy phase counted them; we accept the manifest - // already accounted for them via the small phase (which writes the stub). - // Surfaced here for parity with the legacy result shape. 
- skippedOversized = input.manifest.entries.filter((e) => e.kind === "oversized").length; - - const condensePromises: Promise<void>[] = []; - - function maybeScheduleCondense(state: BigFileState): void { - if (state.pendingChunks > 0 || state.fatal) { - return; - } - const definedResults = state.results.filter((r): r is ChunkAnalysisResult => r !== undefined); - condensePromises.push( - input.limiter(async () => { - throwIfCancelled(input.knowledgeId); - let merged: AnalyzedFileResult | null = null; - for (let attempt = 1; attempt <= CONDENSE_MAX_ATTEMPTS; attempt += 1) { - try { - merged = await condenseChunks(state.entry.relativePath, definedResults, input.llmCallContext); - break; - } catch (cause: unknown) { - if (cause instanceof CancellationError) { - throw cause; - } - if (cause instanceof LlmConfigError || cause instanceof LlmError) { - throw cause; - } - if (attempt < CONDENSE_MAX_ATTEMPTS) { - logger.warn( - `analyse-big: condense attempt ${attempt}/${CONDENSE_MAX_ATTEMPTS} failed for ${state.entry.relativePath}; retrying: ${describe(cause)}`, - ); - await sleep(CONDENSE_RETRY_BACKOFF_MS); - continue; - } - failed += 1; - logger.warn( - `analyse-big: condense failed after ${CONDENSE_MAX_ATTEMPTS} attempts for ${state.entry.relativePath}: ${describe(cause)}`, - ); - } - } - if (merged === null) { - condenseReporter?.increment(1, { fileName: state.entry.relativePath }); - return; - } - - try { - const chunkInputTokens = definedResults.reduce((acc, r) => acc + (r.tokenUsage?.inputTokens ?? 0), 0); - const chunkOutputTokens = definedResults.reduce((acc, r) => acc + (r.tokenUsage?.outputTokens ?? 0), 0); - const chunkCostUsd = definedResults.reduce((acc, r) => acc + (r.tokenUsage?.costUsd ?? 0), 0); - const totalTokenCount = state.chunks.reduce((acc, c) => acc + c.tokenCount, 0); - const totalIn = chunkInputTokens + (merged.tokenUsage?.inputTokens ?? 0); - const totalOut = chunkOutputTokens + (merged.tokenUsage?.outputTokens ??
0); - const totalCost = chunkCostUsd + (merged.tokenUsage?.costUsd ?? 0); - - const manifest: HugeFileManifest = { - relativePath: state.entry.relativePath, - totalChunks: state.chunks.length, - totalTokenCount, - chunkPaths: state.chunks.map((_, i) => `chunks/${encodeFolder(state.entry.relativePath)}/chunk-${i}.json`), - generatedAt: new Date().toISOString(), - }; - await saveManifest(input.metaPaths, manifest); - - const condensed: CondensedFileAnalysis = { - relativePath: state.entry.relativePath, - language: merged.language, - sha256: sha256(state.content), - sizeBytes: state.entry.sizeBytes, - tokenCount: totalTokenCount, - isBigFile: true, - totalChunks: state.chunks.length, - totalTokenCount, - analysedAt: new Date().toISOString(), - analysis: merged.analysis, - tokenUsage: { inputTokens: totalIn, outputTokens: totalOut, costUsd: totalCost }, - }; - await saveCondensed(input.metaPaths, condensed); - - totalInputTokens += totalIn; - totalOutputTokens += totalOut; - totalCostUsd += totalCost; - processed += 1; - } catch (cause: unknown) { - if (cause instanceof CancellationError) { - throw cause; - } - failed += 1; - logger.warn(`analyse-big: persist failed for ${state.entry.relativePath}: ${describe(cause)}`); - } finally { - condenseReporter?.increment(1, { fileName: state.entry.relativePath }); - } - }), - ); - } - - const chunkPromises: Promise<void>[] = []; - for (const state of states) { - for (let i = 0; i < state.chunks.length; i += 1) { - const idx = i; - const chunk = state.chunks[idx]; - if (chunk === undefined) { - continue; - } - chunkPromises.push( - input.limiter(async () => { - throwIfCancelled(input.knowledgeId); - try { - const cachedChunk = await loadChunkIfPresent(input.metaPaths, state.entry.relativePath, idx); - if (cachedChunk !== null) { - state.results[idx] = cachedChunk; - } else { - const analyzed = await analyzeChunk(chunk, input.llmCallContext); - await saveChunk(input.metaPaths, analyzed); - state.results[idx] = analyzed; - } - } catch
(cause: unknown) { - if (cause instanceof CancellationError) { - throw cause; - } - if (cause instanceof LlmConfigError || cause instanceof LlmError) { - state.fatal = true; - throw cause; - } - logger.warn( - `analyse-big: chunk ${idx + 1}/${state.chunks.length} failed for ${state.entry.relativePath}: ${describe(cause)}`, - ); - } finally { - state.pendingChunks -= 1; - chunkReporter?.increment(1, { fileName: `${state.entry.relativePath}#chunk-${String(idx)}` }); - maybeScheduleCondense(state); - } - }), - ); - } - } - - try { - await Promise.all(chunkPromises); - await Promise.all(condensePromises); - } finally { - chunkReporter?.stop(); - condenseReporter?.stop(); - } - - logger.info( - `analyse-big done: processed=${processed} cached=${cached} failed=${failed} skippedOversized=${skippedOversized}`, - ); - return { - processed, - cached, - failed, - skippedOversized, - tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens, costUsd: totalCostUsd }, - }; -} - -function sha256(content: string): string { - return createHash("sha256").update(content).digest("hex"); -} - -function encodeFolder(relativePath: string): string { - return relativePath.replace(/\//gu, "__SL__").replace(/\\/gu, "__BS__"); -} - -function describe(cause: unknown): string { +export function describe(cause: unknown): string { return cause instanceof Error ? 
cause.message : String(cause); } - -function sleep(ms: number): Promise { - return new Promise((resolve) => { - setTimeout(resolve, ms); - }); -} From 4dfe43ac18acdb612ce015e369954776ce15a9bf Mon Sep 17 00:00:00 2001 From: Dead-Bytes <143434285+Dead-Bytes@users.noreply.github.com> Date: Fri, 22 May 2026 18:28:37 +0530 Subject: [PATCH 33/34] refactor: streamline type imports in aggregateStats.ts --- .../flat-folder/big-file/condenser.ts | Bin 10066 -> 10091 bytes packages/mongo/src/aggregateStats.ts | 8 +------- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/packages/ingest-github/src/strategies/flat-folder/big-file/condenser.ts b/packages/ingest-github/src/strategies/flat-folder/big-file/condenser.ts index fdde9b835a295bd1137c8c84fd809dc1a180b219..a4663bd8edfe7468d01368a7aeb5affc0a9bdc0e 100644 GIT binary patch delta 94 zcmccQ_u6lR1YaPR0vI^^`?IXTYxc_pb8B?|WT3e~j`xy=E5Op*W+{~C$_ delta 50 zcmaFucgb&q1m9$NK5<2b(&E&jfTH}|f)X8toSa Date: Fri, 22 May 2026 18:35:55 +0530 Subject: [PATCH 34/34] chore(ts-cleanup): tsconfig files cleared --- packages/cli/src/output.d.ts | 16 -- packages/cli/tsconfig.json | 2 +- packages/config/tsconfig.json | 2 +- packages/errors/tsconfig.json | 2 +- .../ingest-business-context/tsconfig.json | 2 +- packages/ingest-github/tsconfig.json | 2 +- packages/ingest-github/types/index.d.ts | 137 ------------------ packages/llm/tsconfig.json | 2 +- packages/logger/tsconfig.json | 2 +- packages/mcp/tsconfig.json | 2 +- packages/mongo/tsconfig.json | 2 +- packages/neo4j/tsconfig.json | 2 +- packages/queue/tsconfig.json | 2 +- packages/redis/tsconfig.json | 2 +- packages/server/tsconfig.json | 2 +- packages/types/tsconfig.json | 2 +- tsconfig.base.json | 12 +- tsconfig.json | 22 +-- 18 files changed, 21 insertions(+), 194 deletions(-) delete mode 100644 packages/cli/src/output.d.ts delete mode 100644 packages/ingest-github/types/index.d.ts diff --git a/packages/cli/src/output.d.ts b/packages/cli/src/output.d.ts deleted file mode 100644 index 
e20f44b..0000000 --- a/packages/cli/src/output.d.ts +++ /dev/null @@ -1,16 +0,0 @@ -export declare function success(line: string): void; -export declare function error(line: string, hint?: string): void; -export declare function list(label: string, items: readonly string[]): void; -export interface Spinner { - update(text: string): void; - stop(success: boolean, finalMsg?: string): void; -} -export declare function createSpinner(initialText: string): Spinner; -export interface ProgressBar { - update(current: number, total: number, text?: string): void; - stop(success: boolean, finalMsg?: string): void; -} -export declare function createProgressBar(initialText: string): ProgressBar; -export declare function table(headers: string[], rows: string[][]): void; -export declare function info(line: string): void; -//# sourceMappingURL=output.d.ts.map diff --git a/packages/cli/tsconfig.json b/packages/cli/tsconfig.json index d8a16a7..4ed0786 100644 --- a/packages/cli/tsconfig.json +++ b/packages/cli/tsconfig.json @@ -1,4 +1,4 @@ { - "extends": "../../../../tsconfig.base.json", + "extends": "../../tsconfig.base.json", "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/config/tsconfig.json b/packages/config/tsconfig.json index d8a16a7..4ed0786 100644 --- a/packages/config/tsconfig.json +++ b/packages/config/tsconfig.json @@ -1,4 +1,4 @@ { - "extends": "../../../../tsconfig.base.json", + "extends": "../../tsconfig.base.json", "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/errors/tsconfig.json b/packages/errors/tsconfig.json index d8a16a7..4ed0786 100644 --- a/packages/errors/tsconfig.json +++ b/packages/errors/tsconfig.json @@ -1,4 +1,4 @@ { - "extends": "../../../../tsconfig.base.json", + "extends": "../../tsconfig.base.json", "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/ingest-business-context/tsconfig.json b/packages/ingest-business-context/tsconfig.json index 
d8a16a7..4ed0786 100644 --- a/packages/ingest-business-context/tsconfig.json +++ b/packages/ingest-business-context/tsconfig.json @@ -1,4 +1,4 @@ { - "extends": "../../../../tsconfig.base.json", + "extends": "../../tsconfig.base.json", "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/ingest-github/tsconfig.json b/packages/ingest-github/tsconfig.json index d8a16a7..4ed0786 100644 --- a/packages/ingest-github/tsconfig.json +++ b/packages/ingest-github/tsconfig.json @@ -1,4 +1,4 @@ { - "extends": "../../../../tsconfig.base.json", + "extends": "../../tsconfig.base.json", "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/ingest-github/types/index.d.ts b/packages/ingest-github/types/index.d.ts deleted file mode 100644 index 98445ad..0000000 --- a/packages/ingest-github/types/index.d.ts +++ /dev/null @@ -1,137 +0,0 @@ -export interface RegisterGithubWorkersDeps { - sourceFactory?: SourceFactory; - pullFactory?: PullFactory; - progressContextFactory?: ProgressContextFactory; -} - -export type ProgressPhase = "file_analysis" | "folder_analysis" | "indexing"; - -export type ProgressTotalMode = { kind: "fixed"; total: number } | { kind: "growing"; initialTotal?: number }; - -export interface ProgressReporterInput { - readonly phase: ProgressPhase; - readonly subPhase?: string; - readonly total: ProgressTotalMode; - readonly resolveInitialProcessed?: () => Promise | number; -} - -export interface ProgressReporter { - start(): Promise; - increment(delta?: number, meta?: { fileName?: string }): void; - incrementSeen(delta?: number): void; - setTotal(total: number): void; - stop(): void; -} - -export interface ProgressContext { - reporter(input: ProgressReporterInput): ProgressReporter; - phaseChanged(phase: ProgressPhase): void; - completed(message?: string): void; - failed(error: string, phase?: ProgressPhase): void; -} - -export type ProgressContextFactory = (knowledgeId: string) => ProgressContext; - -export 
declare const nullProgressContextFactory: ProgressContextFactory; - -export declare function registerGithubWorkers(deps?: RegisterGithubWorkersDeps): void; -export declare function registerLocalIngestWorker(): void; - -export interface FlatFolderStrategyDeps { - fileAnalyzer: FileAnalyzer; - progressContextFactory?: ProgressContextFactory; -} -export declare function createFlatFolderStrategy(deps: FlatFolderStrategyDeps): IngestStrategy; -export declare const createLlmFileAnalyzer: (...args: any[]) => any; -export declare const createDiskSourceReader: (...args: any[]) => any; -export declare const createPipelineRunner: (...args: any[]) => any; -export declare const createGithubIngestHandler: (...args: any[]) => any; -export declare const createLocalIngestHandler: (...args: any[]) => any; -export declare const runPull: (...args: any[]) => any; -export declare const reposRoot: (...args: any[]) => string; -export declare const repoCloneDir: (knowledgeId: string) => string; -export declare const metaRootFor: (knowledgeId: string) => string; -export declare const metaPathsFor: (knowledgeId: string) => unknown; -export declare const commitMetaDir: (knowledgeId: string, commitHash: string) => string; -export declare const businessContextDir: (knowledgeId: string, commitHash: string, sanitizedTitle: string) => string; -export declare const orgRegistryDir: (knowledgeId: string, orgId: string) => string; -export declare function fetchLatestCommitHash( - repoUrl: string, - branch: string, - gitToken?: string, -): Promise; -export declare function fetchRecentCommits( - repoUrl: string, - branch: string, - limit?: number, - gitToken?: string, -): Promise; -export declare function fetchDefaultBranch(repoUrl: string, gitToken?: string): Promise; -export declare function fetchBranches( - repoUrl: string, - gitToken?: string, - limit?: number, -): Promise<{ status: "ok"; branches: string[] } | { status: "error"; message: string }>; -export declare function parseGithubRepo(repoUrl: 
string): ParsedRepo | null; - -export interface BootstrapRuntimeOptions { - config: unknown; - loggerFactory: (scope: string) => unknown; -} -export declare function bootstrapRuntime(opts: BootstrapRuntimeOptions): Promise; - -export declare const COMBINED_CODE_ANALYSIS_SYSTEM_PROMPT: string; -export declare function buildFileAnalysisUserPrompt(input: { relativePath: string; content: string }): string; - -export type CreatePipelineRunnerDeps = any; -export type IngestJobHandlerDeps = any; -export type IngestRunnerDeps = any; -export type IngestRunnerInput = any; -export type IngestStrategy = any; -export type StrategyInput = any; -export type StrategyResult = any; -export type StrategyContext = any; -export type FileAnalyzer = any; -export type AnalyzedFileResult = any; -export type ScanEntry = any; -export type ScannedFile = any; -export type OversizedFile = any; -export type ScanDeps = any; -export type SourceReader = any; -export type ArchiveSink = any; -export type ArchiveSinkInput = any; -export type SourceFactory = any; -export type SourceFactoryInput = any; -export type SourceFactoryResult = any; -export type PullFactory = any; -export type PullFactoryInput = any; -export type PullFactoryResult = any; -export type DiffResult = any; -export type RenamedFile = any; -export type CondensedFileAnalysis = any; -export interface CommitEntry { - sha: string; - message: string; - author: string; - timestamp: string; -} - -export type FetchCommitsResult = - | { status: "ok"; commits: CommitEntry[] } - | { status: "not_found" } - | { status: "unauthorized" } - | { status: "rate_limited" } - | { status: "error"; message: string }; - -export interface ParsedRepo { - owner: string; - repo: string; - branch?: string; -} - -export type DefaultBranchResult = - | { status: "ok"; branch: string } - | { status: "not_found" } - | { status: "unauthorized" } - | { status: "rate_limited" } - | { status: "error"; message: string }; diff --git a/packages/llm/tsconfig.json 
b/packages/llm/tsconfig.json index d8a16a7..4ed0786 100644 --- a/packages/llm/tsconfig.json +++ b/packages/llm/tsconfig.json @@ -1,4 +1,4 @@ { - "extends": "../../../../tsconfig.base.json", + "extends": "../../tsconfig.base.json", "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/logger/tsconfig.json b/packages/logger/tsconfig.json index d8a16a7..4ed0786 100644 --- a/packages/logger/tsconfig.json +++ b/packages/logger/tsconfig.json @@ -1,4 +1,4 @@ { - "extends": "../../../../tsconfig.base.json", + "extends": "../../tsconfig.base.json", "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/mcp/tsconfig.json b/packages/mcp/tsconfig.json index d8a16a7..4ed0786 100644 --- a/packages/mcp/tsconfig.json +++ b/packages/mcp/tsconfig.json @@ -1,4 +1,4 @@ { - "extends": "../../../../tsconfig.base.json", + "extends": "../../tsconfig.base.json", "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/mongo/tsconfig.json b/packages/mongo/tsconfig.json index d8a16a7..4ed0786 100644 --- a/packages/mongo/tsconfig.json +++ b/packages/mongo/tsconfig.json @@ -1,4 +1,4 @@ { - "extends": "../../../../tsconfig.base.json", + "extends": "../../tsconfig.base.json", "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/neo4j/tsconfig.json b/packages/neo4j/tsconfig.json index d8a16a7..4ed0786 100644 --- a/packages/neo4j/tsconfig.json +++ b/packages/neo4j/tsconfig.json @@ -1,4 +1,4 @@ { - "extends": "../../../../tsconfig.base.json", + "extends": "../../tsconfig.base.json", "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/queue/tsconfig.json b/packages/queue/tsconfig.json index d8a16a7..4ed0786 100644 --- a/packages/queue/tsconfig.json +++ b/packages/queue/tsconfig.json @@ -1,4 +1,4 @@ { - "extends": "../../../../tsconfig.base.json", + "extends": "../../tsconfig.base.json", "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff 
--git a/packages/redis/tsconfig.json b/packages/redis/tsconfig.json index d8a16a7..4ed0786 100644 --- a/packages/redis/tsconfig.json +++ b/packages/redis/tsconfig.json @@ -1,4 +1,4 @@ { - "extends": "../../../../tsconfig.base.json", + "extends": "../../tsconfig.base.json", "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/server/tsconfig.json b/packages/server/tsconfig.json index d8a16a7..4ed0786 100644 --- a/packages/server/tsconfig.json +++ b/packages/server/tsconfig.json @@ -1,4 +1,4 @@ { - "extends": "../../../../tsconfig.base.json", + "extends": "../../tsconfig.base.json", "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/types/tsconfig.json b/packages/types/tsconfig.json index d8a16a7..4ed0786 100644 --- a/packages/types/tsconfig.json +++ b/packages/types/tsconfig.json @@ -1,4 +1,4 @@ { - "extends": "../../../../tsconfig.base.json", + "extends": "../../tsconfig.base.json", "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/tsconfig.base.json b/tsconfig.base.json index 6903d08..9226217 100644 --- a/tsconfig.base.json +++ b/tsconfig.base.json @@ -6,6 +6,7 @@ "module": "ESNext", "moduleResolution": "bundler", "moduleDetection": "force", + "jsx": "react-jsx", "allowImportingTsExtensions": true, "verbatimModuleSyntax": true, "isolatedModules": true, @@ -36,12 +37,9 @@ "types": ["bun"], - "composite": true, - "declaration": true, - "declarationMap": true, - "sourceMap": true, - "incremental": true, - "noEmit": false, - "emitDeclarationOnly": true + "composite": false, + "declaration": false, + "noEmit": true, + "incremental": true } } diff --git a/tsconfig.json b/tsconfig.json index 4f4863d..80c98f2 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -1,26 +1,8 @@ { "extends": "./tsconfig.base.json", "compilerOptions": { - "composite": false, - "declaration": false, - "declarationMap": false, "noEmit": true }, - "files": [], - "references": [ - { "path": "packages/types" }, - 
{ "path": "packages/errors" }, - { "path": "packages/config" }, - { "path": "packages/logger" }, - { "path": "packages/mongo" }, - { "path": "packages/redis" }, - { "path": "packages/queue" }, - { "path": "packages/llm" }, - { "path": "packages/ingest-github" }, - { "path": "packages/ingest-business-context" }, - { "path": "packages/cli" }, - { "path": "packages/server" }, - { "path": "packages/neo4j" }, - { "path": "packages/mcp" } - ] + "include": ["packages/*/src/**/*.ts", "packages/*/src/**/*.tsx", "packages/*/src/**/*.json"], + "exclude": ["**/node_modules", "**/dist", "**/*.d.ts"] }