diff --git a/bun.lock b/bun.lock index 92fb911..0042f56 100644 --- a/bun.lock +++ b/bun.lock @@ -1,6 +1,5 @@ { "lockfileVersion": 1, - "configVersion": 0, "workspaces": { "": { "name": "bytebell-public", @@ -56,6 +55,20 @@ "@bb/types": "workspace:*", }, }, + "packages/ingest-business-context": { + "name": "@bb/ingest-business-context", + "version": "0.0.0", + "dependencies": { + "@bb/config": "workspace:*", + "@bb/errors": "workspace:*", + "@bb/ingest-github": "workspace:*", + "@bb/llm": "workspace:*", + "@bb/logger": "workspace:*", + "@bb/neo4j": "workspace:*", + "@bb/queue": "workspace:*", + "@bb/types": "workspace:*", + }, + }, "packages/ingest-github": { "name": "@bb/ingest-github", "version": "0.0.0", @@ -191,6 +204,8 @@ "@bb/errors": ["@bb/errors@workspace:packages/errors"], + "@bb/ingest-business-context": ["@bb/ingest-business-context@workspace:packages/ingest-business-context"], + "@bb/ingest-github": ["@bb/ingest-github@workspace:packages/ingest-github"], "@bb/llm": ["@bb/llm@workspace:packages/llm"], diff --git a/packages/cli/src/output.d.ts b/packages/cli/src/output.d.ts deleted file mode 100644 index e20f44b..0000000 --- a/packages/cli/src/output.d.ts +++ /dev/null @@ -1,16 +0,0 @@ -export declare function success(line: string): void; -export declare function error(line: string, hint?: string): void; -export declare function list(label: string, items: readonly string[]): void; -export interface Spinner { - update(text: string): void; - stop(success: boolean, finalMsg?: string): void; -} -export declare function createSpinner(initialText: string): Spinner; -export interface ProgressBar { - update(current: number, total: number, text?: string): void; - stop(success: boolean, finalMsg?: string): void; -} -export declare function createProgressBar(initialText: string): ProgressBar; -export declare function table(headers: string[], rows: string[][]): void; -export declare function info(line: string): void; -//# sourceMappingURL=output.d.ts.map diff --git 
a/packages/cli/tsconfig.json b/packages/cli/tsconfig.json index d8a16a7..4ed0786 100644 --- a/packages/cli/tsconfig.json +++ b/packages/cli/tsconfig.json @@ -1,4 +1,4 @@ { - "extends": "../../../../tsconfig.base.json", + "extends": "../../tsconfig.base.json", "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/config/src/schema.ts b/packages/config/src/schema.ts index 63a65d4..d5bae9d 100644 --- a/packages/config/src/schema.ts +++ b/packages/config/src/schema.ts @@ -41,6 +41,10 @@ export const configSchema = z "big.file.concurrency": z.number().int().positive().default(25), "absolute.file.size.cap": z.number().int().positive().default(52428800), "concurrent.workers": z.number().int().positive().default(4), + "llm.concurrency": z.number().int().positive().default(29), + "folder.summary.batch.size": z.number().int().positive().default(10), + "folder.summary.batch.max.files": z.number().int().positive().default(15), + "neo4j.batch.size": z.number().int().positive().default(50), "condense.context.limit": z.number().int().positive().default(12000), "condense.prompt.overhead": z.number().int().nonnegative().default(1500), "small.file.dedup.threshold": z.number().int().positive().default(3), @@ -81,6 +85,10 @@ export type ConfigValueMap = { [Config.BigFileConcurrency]: number; [Config.AbsoluteFileSizeCap]: number; [Config.ConcurrentWorkers]: number; + [Config.LlmConcurrency]: number; + [Config.FolderSummaryBatchSize]: number; + [Config.FolderSummaryBatchMaxFiles]: number; + [Config.Neo4jBatchSize]: number; [Config.CondenseContextLimit]: number; [Config.CondensePromptOverhead]: number; [Config.SmallFileDedupThreshold]: number; @@ -135,6 +143,10 @@ export const HINTS: Readonly> = { [Config.BigFileConcurrency]: "bytebell set big.file.concurrency ", [Config.AbsoluteFileSizeCap]: "bytebell set absolute.file.size.cap ", [Config.ConcurrentWorkers]: "bytebell set concurrent.workers ", + [Config.LlmConcurrency]: "bytebell set llm.concurrency ", + 
[Config.FolderSummaryBatchSize]: "bytebell set folder.summary.batch.size ", + [Config.FolderSummaryBatchMaxFiles]: "bytebell set folder.summary.batch.max.files ", + [Config.Neo4jBatchSize]: "bytebell set neo4j.batch.size ", [Config.CondenseContextLimit]: "bytebell set condense.context.limit ", [Config.CondensePromptOverhead]: "bytebell set condense.prompt.overhead ", [Config.SmallFileDedupThreshold]: "bytebell set small.file.dedup.threshold ", @@ -195,6 +207,14 @@ export function readField(cfg: BytebellConfig, key: K): Config return cfg["absolute.file.size.cap"] as ConfigValue; case Config.ConcurrentWorkers: return cfg["concurrent.workers"] as ConfigValue; + case Config.LlmConcurrency: + return cfg["llm.concurrency"] as ConfigValue; + case Config.FolderSummaryBatchSize: + return cfg["folder.summary.batch.size"] as ConfigValue; + case Config.FolderSummaryBatchMaxFiles: + return cfg["folder.summary.batch.max.files"] as ConfigValue; + case Config.Neo4jBatchSize: + return cfg["neo4j.batch.size"] as ConfigValue; case Config.CondenseContextLimit: return cfg["condense.context.limit"] as ConfigValue; case Config.CondensePromptOverhead: @@ -264,6 +284,14 @@ export function writeField(cfg: BytebellConfig, key: K, value: return { ...cfg, "absolute.file.size.cap": value as number }; case Config.ConcurrentWorkers: return { ...cfg, "concurrent.workers": value as number }; + case Config.LlmConcurrency: + return { ...cfg, "llm.concurrency": value as number }; + case Config.FolderSummaryBatchSize: + return { ...cfg, "folder.summary.batch.size": value as number }; + case Config.FolderSummaryBatchMaxFiles: + return { ...cfg, "folder.summary.batch.max.files": value as number }; + case Config.Neo4jBatchSize: + return { ...cfg, "neo4j.batch.size": value as number }; case Config.CondenseContextLimit: return { ...cfg, "condense.context.limit": value as number }; case Config.CondensePromptOverhead: diff --git a/packages/config/tsconfig.json b/packages/config/tsconfig.json index 
d8a16a7..4ed0786 100644 --- a/packages/config/tsconfig.json +++ b/packages/config/tsconfig.json @@ -1,4 +1,4 @@ { - "extends": "../../../../tsconfig.base.json", + "extends": "../../tsconfig.base.json", "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/errors/tsconfig.json b/packages/errors/tsconfig.json index d8a16a7..4ed0786 100644 --- a/packages/errors/tsconfig.json +++ b/packages/errors/tsconfig.json @@ -1,4 +1,4 @@ { - "extends": "../../../../tsconfig.base.json", + "extends": "../../tsconfig.base.json", "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/ingest-business-context/tsconfig.json b/packages/ingest-business-context/tsconfig.json index d8a16a7..4ed0786 100644 --- a/packages/ingest-business-context/tsconfig.json +++ b/packages/ingest-business-context/tsconfig.json @@ -1,4 +1,4 @@ { - "extends": "../../../../tsconfig.base.json", + "extends": "../../tsconfig.base.json", "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/ingest-github/README.md b/packages/ingest-github/README.md index b442726..93d7786 100644 --- a/packages/ingest-github/README.md +++ b/packages/ingest-github/README.md @@ -132,25 +132,51 @@ worker hardcodes a single `IngestionStrategy` instance (currently - `:File` graph nodes + `:HAS_FILE` / `:HAS_KEYWORD` / `:HAS_CLASS` / `:HAS_FUNCTION` / `:HAS_IMPORT_INTERNAL` / `:HAS_IMPORT_EXTERNAL` relationships — written via `upsertFileNode` from `@bb/neo4j`. +- `meta-output/scan-manifest.json` — the canonical small/big/oversized + classification produced by Phase 1 (`scanAndClassify`). Per-file entries + carry `tokenCount`, `kind`, and (for big files) `estimatedChunks`. + Phases 2a (small) and 2b (big) consume the manifest in parallel. +- `meta-output/bigFiles.json` — legacy view written alongside the manifest + for the pull-path and backfill phases. The main strategy no longer + consumes it directly. 
+- `FileAnalysisCache` (in-memory only, not persisted) — single + `Map` loaded once between the + analyse and backfill phases via parallel `readdir + readFile`. Replaces + three sequential `iterateCondensed` walks (phases 3, 5, 7) with one + parallel preload + three in-memory iterations. The pull workflow loads + its own cache instance; only one strategy run owns a given + `metaPaths` directory at a time. For repos beyond ~50k analysed files + consider a streaming-mode fallback (not implemented today). ## Invariants -1. **Sequential per-file processing.** Intentionally degraded; one - `upsertRawFile` per file. The small-file path issues one `askLLM`; - the big-file path issues N (one per chunk) plus condensation calls, - all sequential — no `Promise.all`, no concurrency cap. Revisit when - the latency profile demands it. -2. **Clone idempotent.** Re-runs (BullMQ retries) call `git fetch` + +1. **Shared LLM concurrency limiter.** The flat-folder strategy + constructs one `withConcurrency(Config.LlmConcurrency)` instance at + entry (default 29). The small-file phase, the big-file chunk phase, + per-file condense calls, **and the folder-summary phase** all check + out from this single pool, so total in-flight LLM calls is bounded + by one knob. The pull-path constructs its own shared limiter at + `runPull` entry and threads it into the selective folder-summary + phase. The legacy `processBigFile` driver used by the pull-path + still uses its own per-file pool sized by `Config.BigFileConcurrency`. +2. **Folder-summary batching by default.** Phase 5 groups small folders + (`≤ Config.FolderSummaryBatchMaxFiles`, default 15) into batches of + up to `Config.FolderSummaryBatchSize` (default 10) and asks the LLM + for one JSON object keyed by integer label that returns one summary + per folder. Bigger folders take the individual single-folder path. + Roll back to one LLM call per folder via + `bytebell set folder.summary.batch.size 1`. +3. 
**Clone idempotent.** Re-runs (BullMQ retries) call `git fetch` + `git reset --hard` in the existing dir rather than re-cloning. Tokens are re-injected into the remote URL each time. -3. **Token redaction.** `GitCloneError` carries the **redacted** repo +4. **Token redaction.** `GitCloneError` carries the **redacted** repo URL (`https://user:***@host`) — the raw `gitToken` never appears in error messages or logs. -4. **State transition order.** `Processing` is set _before_ any clone +5. **State transition order.** `Processing` is set _before_ any clone work. `Processed` is set _only_ after the entire scan + analyze loop completes. On any thrown error, the handler best-effort sets `Failed` then re-throws so BullMQ records the retry. -5. **Fail-soft analysis, fail-hard infra.** A single file's LLM call +6. **Fail-soft analysis, fail-hard infra.** A single file's LLM call failing falls back to an empty-analysis Raw doc and processing continues. In the big-file path, a single chunk failure contributes an empty analysis to the merge but does not stop the file; a @@ -158,7 +184,7 @@ worker hardcodes a single `IngestionStrategy` instance (currently `dedupAnalyses` so the merged result is always well-formed. A clone failure or Mongo write failure throws and propagates to BullMQ for retry under the queue's `attempts: 3`. -6. **Hardcoded filters only.** No LLM-based ignore decisions in v0. The +7. **Hardcoded filters only.** No LLM-based ignore decisions in v0. The directory / file / extension blocklists in `scan.ts` are the only way files get skipped. 
@@ -179,7 +205,6 @@ worker hardcodes a single `IngestionStrategy` instance (currently - GitHub API streaming mode (always shell-clone) - Default-branch auto-detection (caller supplies `branch`; defaults to `"main"`) -- Concurrency control / parallel file processing - Folder-level summaries / `repoSummary.json` / `flat-folder` strategy - Semantic chunking (`SemanticChunker`) - Per-chunk persistence (we persist only the merged file-level diff --git a/packages/ingest-github/src/pipeline/README.md b/packages/ingest-github/src/pipeline/README.md index 0c57d78..7c7b0d6 100644 --- a/packages/ingest-github/src/pipeline/README.md +++ b/packages/ingest-github/src/pipeline/README.md @@ -28,7 +28,7 @@ Domain (sub-folder of `@bb/ingest-github`). - `skip-decisions/` — LLM-backed unknown-extension gate. See `skip-decisions/README.md`. Active when `Config.SkipDecisionEnabled = true` (default). Consumed by `scan.ts` via the optional `skipDecider` - dep; built by `classifyAndAnalyseSmall` if not injected. + dep; built by `scanAndClassify` (Phase 1) if not injected. - `disk-source-reader.ts` — `createDiskSourceReader({ repoDir, commitHash })` returns a `SourceReader` that wraps `scanRepository` + `node:fs.readFile`. The default reader the open-source binary always uses, unless the caller @@ -40,9 +40,27 @@ true` (default). Consumed by `scan.ts` via the optional `skipDecider` enters the big-file phase). Both thresholds are config-driven — no magic numbers in this file. `deps.llmCallContext` (when present) is forwarded into every `SkipDeciderInput` so the LLM branch of the - unknown-extension gate uses per-job credentials. `readScannedFile` - re-reads a file by absolute path for the big-file phase which streams - content lazily. + unknown-extension gate uses per-job credentials. + + **Two scan modes:** + - **Two-pass (default for the flat-folder strategy)** — activated when + `deps.skipDecider` AND `deps.limiter` are both supplied. 
Pass 1 walks + the tree calling `decider.decideStatic(...)`; static-resolved files + yield immediately, "needs LLM" files go into a pending buffer with + their content. Pass 2 dedupes pending entries by `ext:` or + `filename:`, dispatches one `decider.decideAndDeferSave(...)` per + unique key through the shared limiter via `Promise.all`, then calls + `decider.persist()` exactly once. Pass 3 drains pending — every + `decideStatic` call is now a cache hit, so the drain is sync at the + decider boundary and yields each kept file with its buffered content. + - **Legacy inline (`walk()`)** — used when `deps.limiter` is omitted (e.g. + a custom `SourceFactory` consumer that didn't opt in). Inline `await +deps.skipDecider.decide(input)` per file. Same semantics as before this + refactor; preserved for backwards compatibility. + + `readScannedFile` re-reads a file by absolute path for the big-file phase + which streams content lazily. + - `run.ts` — `createPipelineRunner({ reposRootDir, strategy, sourceFactory?, progressContextFactory? })` builds an `IngestRunnerDeps`. GitHub payloads run: branch resolve, source-reader construction, strategy execute, commit persistence. Local @@ -76,7 +94,7 @@ archiveSink?}` and `runPull` skips `syncRepository` + `materialiseEndpoints` (open-source default), the legacy git-based path runs. Either path produces the same downstream pipeline: snapshot prior version, `analyseChangedFiles` (now reading via `SourceReader`), - `processBigFilesQueue`, `backfillMissingFields`, `backfillBigFiles`, + `processBigFilesQueue`, `backfillMissingFields`, `runSelectiveFolderSummary`, `summariseRepo`, `storePullAnalysis`. 
Mirrors the index-side strategy orchestrator for progress: builds one `ProgressContext` per job from the optional `progressContextFactory` diff --git a/packages/ingest-github/src/pipeline/paths.ts b/packages/ingest-github/src/pipeline/paths.ts index cdddc2f..ac52215 100644 --- a/packages/ingest-github/src/pipeline/paths.ts +++ b/packages/ingest-github/src/pipeline/paths.ts @@ -30,6 +30,7 @@ export function metaPathsFor(knowledgeId: string): MetaPaths { bigFileAnalysisDir: path.join(metaRoot, "big-file-analysis"), bigFileChunksDir: path.join(metaRoot, "big-file-analysis", "chunks"), bigFilesJson: path.join(metaRoot, "bigFiles.json"), + scanManifestJson: path.join(metaRoot, "scan-manifest.json"), repoSummaryJson: path.join(metaRoot, "repo-summary.json"), }; } diff --git a/packages/ingest-github/src/pipeline/pull.ts b/packages/ingest-github/src/pipeline/pull.ts index 930b7be..be344a6 100644 --- a/packages/ingest-github/src/pipeline/pull.ts +++ b/packages/ingest-github/src/pipeline/pull.ts @@ -1,4 +1,6 @@ -import { KnowledgeState, type GithubPullPayload, type JobMessage } from "@bb/types"; +import { Config, KnowledgeState, type GithubPullPayload, type JobMessage } from "@bb/types"; +import { getConfigValue } from "@bb/config"; +import { withConcurrency } from "./concurrency.ts"; import { getKnowledge, markKnowledgeFailed, setKnowledgeCommit, setKnowledgeState } from "@bb/mongo"; import { setKnowledgeStateInGraph, snapshotFilesToVersion, type NodeScope } from "@bb/neo4j"; import type { PipelineSummary } from "#src/types/pipeline.ts"; @@ -19,7 +21,7 @@ import { nullProgressContextFactory } from "#src/progress/NullProgressReporter.t import { analyseChangedFiles } from "#src/strategies/flat-folder/analyse-changed.ts"; import { processBigFilesQueue } from "#src/strategies/flat-folder/phases/process-big-files.ts"; import { backfillMissingFields } from "#src/strategies/flat-folder/backfill/fields.ts"; -import { backfillBigFiles } from 
"#src/strategies/flat-folder/backfill/big-files.ts"; +import { FileAnalysisCache } from "#src/strategies/flat-folder/file-analysis-cache.ts"; import { runSelectiveFolderSummary } from "#src/strategies/flat-folder/folder-summary-selective.ts"; import { makeRepoSummaryEnvelope, @@ -192,22 +194,14 @@ export async function runPull( totalOutputTokens += phase2.tokenUsage.outputTokens; totalCostUsd += phase2.tokenUsage.costUsd; - logger.info(`pull: phase backfill fields starting`); + logger.info(`pull: loading file-analysis cache`); throwIfCancelled(knowledgeId); - await backfillMissingFields(metaPaths, llmCallContext, progressContext); + const fileAnalysisCache = await FileAnalysisCache.loadAll(metaPaths); + const limiter = withConcurrency(getConfigValue(Config.LlmConcurrency)); - logger.info(`pull: phase backfill big-files starting`); + logger.info(`pull: phase backfill fields starting`); throwIfCancelled(knowledgeId); - const backfillBigFilesInput: Parameters[0] = { - knowledgeId, - source, - metaPaths, - progressContext, - }; - if (llmCallContext !== undefined) { - backfillBigFilesInput.llmCallContext = llmCallContext; - } - await backfillBigFiles(backfillBigFilesInput); + await backfillMissingFields(metaPaths, fileAnalysisCache, limiter, llmCallContext, progressContext); progressContext.phaseChanged("folder_analysis"); logger.info(`pull: phase selective folder summary (${affectedFolders.size} folders) starting`); @@ -215,6 +209,8 @@ export async function runPull( const selectiveInput: Parameters[0] = { knowledgeId, metaPaths, + cache: fileAnalysisCache, + limiter, affectedFolders, }; if (llmCallContext !== undefined) { diff --git a/packages/ingest-github/src/pipeline/scan.ts b/packages/ingest-github/src/pipeline/scan.ts index 02d17ea..d7d9db6 100644 --- a/packages/ingest-github/src/pipeline/scan.ts +++ b/packages/ingest-github/src/pipeline/scan.ts @@ -5,7 +5,8 @@ import { getConfigValue } from "@bb/config"; import type { AskLlmOptions } from "@bb/llm"; import { 
logger } from "@bb/logger"; import { SKIP_DIRS, looksBinary, passesPathFilters } from "./filters.ts"; -import type { ScanEntry, SkipDecider } from "#src/types/pipeline.ts"; +import type { ConcurrencyLimiter } from "./concurrency.ts"; +import type { ScanEntry, SkipDecider, SkipDeciderInput } from "#src/types/pipeline.ts"; interface ScanLimits { absoluteCap: number; @@ -15,18 +16,7 @@ interface ScanLimits { export interface ScanRepositoryDeps { skipDecider?: SkipDecider; llmCallContext?: AskLlmOptions; -} - -export async function* scanRepository(rootDir: string, deps: ScanRepositoryDeps = {}): AsyncGenerator { - const limits: ScanLimits = { - absoluteCap: getConfigValue(Config.AbsoluteFileSizeCap), - bigFileLineThreshold: getConfigValue(Config.BigFileLineThreshold), - }; - const counts = { acceptStatic: 0, acceptLlm: 0, rejectStatic: 0, rejectLlm: 0, oversized: 0, binary: 0 }; - yield* walk(rootDir, rootDir, limits, deps, counts); - logger.info( - `scan: acceptStatic=${counts.acceptStatic} acceptLlm=${counts.acceptLlm} rejectStatic=${counts.rejectStatic} rejectLlm=${counts.rejectLlm} oversized=${counts.oversized} binary=${counts.binary}`, - ); + limiter?: ConcurrencyLimiter; } interface ScanCounts { @@ -38,6 +28,44 @@ interface ScanCounts { binary: number; } +interface PendingFile { + relativePath: string; + absolutePath: string; + sizeBytes: number; + content: string; + ext: string; + input: SkipDeciderInput; +} + +function newCounts(): ScanCounts { + return { acceptStatic: 0, acceptLlm: 0, rejectStatic: 0, rejectLlm: 0, oversized: 0, binary: 0 }; +} + +function logCounts(counts: ScanCounts): void { + logger.info( + `scan: acceptStatic=${counts.acceptStatic} acceptLlm=${counts.acceptLlm} rejectStatic=${counts.rejectStatic} rejectLlm=${counts.rejectLlm} oversized=${counts.oversized} binary=${counts.binary}`, + ); +} + +export async function* scanRepository(rootDir: string, deps: ScanRepositoryDeps = {}): AsyncGenerator { + const limits: ScanLimits = { + absoluteCap: 
getConfigValue(Config.AbsoluteFileSizeCap), + bigFileLineThreshold: getConfigValue(Config.BigFileLineThreshold), + }; + + // Two-pass parallel mode requires both a skip-decider AND a limiter so that + // pending LLM resolutions can be deduplicated and dispatched concurrently. + // Without either, fall back to the inline-await walk that's been here all along. + if (deps.skipDecider !== undefined && deps.limiter !== undefined) { + yield* twoPassScan(rootDir, limits, deps.skipDecider, deps.limiter, deps); + return; + } + + const counts = newCounts(); + yield* walk(rootDir, rootDir, limits, deps, counts); + logCounts(counts); +} + async function* walk( rootDir: string, currentDir: string, @@ -82,7 +110,7 @@ async function* walk( continue; } if (deps.skipDecider !== undefined) { - const deciderInput: Parameters[0] = { relativePath, absolutePath: abs, ext }; + const deciderInput: SkipDeciderInput = { relativePath, absolutePath: abs, ext }; if (deps.llmCallContext !== undefined) { deciderInput.llmCallContext = deps.llmCallContext; } @@ -113,6 +141,145 @@ async function* walk( } } +async function* twoPassScan( + rootDir: string, + limits: ScanLimits, + decider: SkipDecider, + limiter: ConcurrencyLimiter, + deps: ScanRepositoryDeps, +): AsyncGenerator { + const counts = newCounts(); + const pending: PendingFile[] = []; + + // Pass 1: walk + categorize. Static-decided files yield immediately; + // "needs LLM" files go into `pending` for batch resolution. + yield* walkAndCategorize(rootDir, rootDir, limits, deps, decider, counts, pending); + + // Pass 2: dedupe pending by decision key (extension or filename), schedule + // one LLM call per unique key through the shared limiter, then persist the + // decider's cache once. 
+ if (pending.length > 0) { + const unique = new Map(); + for (const p of pending) { + const key = decisionKey(p); + if (!unique.has(key)) { + unique.set(key, p.input); + } + } + logger.info(`scan: resolving ${unique.size} unique skip-decision keys for ${pending.length} pending files`); + await Promise.all(Array.from(unique.values()).map((input) => limiter(() => decider.decideAndDeferSave(input)))); + decider.persist(); + } + + // Pass 3: drain pending. Every decideStatic call is now a cache hit. + for (const p of pending) { + const decision = decider.decideStatic(p.input); + if (decision === "reject-static" || decision === null) { + counts.rejectStatic += 1; + continue; + } + if (decision === "reject-llm") { + counts.rejectLlm += 1; + continue; + } + if (decision === "accept-llm") { + counts.acceptLlm += 1; + } else { + counts.acceptStatic += 1; + } + yield { + kind: "file", + relativePath: p.relativePath, + absolutePath: p.absolutePath, + sizeBytes: p.sizeBytes, + content: p.content, + }; + } + + logCounts(counts); +} + +async function* walkAndCategorize( + rootDir: string, + currentDir: string, + limits: ScanLimits, + deps: ScanRepositoryDeps, + decider: SkipDecider, + counts: ScanCounts, + pending: PendingFile[], +): AsyncGenerator { + const dir = await opendir(currentDir); + for await (const entry of dir) { + const abs = path.join(currentDir, entry.name); + if (entry.isDirectory()) { + if (SKIP_DIRS.has(entry.name)) { + continue; + } + yield* walkAndCategorize(rootDir, abs, limits, deps, decider, counts, pending); + continue; + } + if (!entry.isFile()) { + continue; + } + if (!passesPathFilters(entry.name, path.extname(entry.name))) { + counts.rejectStatic += 1; + continue; + } + const sizeBytes = (await stat(abs)).size; + const relativePath = path.relative(rootDir, abs); + const ext = path.extname(entry.name).toLowerCase(); + if (sizeBytes > limits.absoluteCap) { + counts.oversized += 1; + yield { kind: "oversized", relativePath, absolutePath: abs, sizeBytes 
}; + continue; + } + const buf = await readFile(abs); + if (looksBinary(buf)) { + counts.binary += 1; + continue; + } + const content = buf.toString("utf8"); + if (countLines(content) > limits.bigFileLineThreshold) { + counts.oversized += 1; + yield { kind: "oversized", relativePath, absolutePath: abs, sizeBytes }; + continue; + } + const deciderInput: SkipDeciderInput = { relativePath, absolutePath: abs, ext }; + if (deps.llmCallContext !== undefined) { + deciderInput.llmCallContext = deps.llmCallContext; + } + const sync = decider.decideStatic(deciderInput); + if (sync === "reject-static") { + counts.rejectStatic += 1; + continue; + } + if (sync === "reject-llm") { + counts.rejectLlm += 1; + continue; + } + if (sync === "accept-llm") { + counts.acceptLlm += 1; + yield { kind: "file", relativePath, absolutePath: abs, sizeBytes, content }; + continue; + } + if (sync === "accept") { + counts.acceptStatic += 1; + yield { kind: "file", relativePath, absolutePath: abs, sizeBytes, content }; + continue; + } + // sync === null → needs LLM. Defer to pass 2. + pending.push({ relativePath, absolutePath: abs, sizeBytes, content, ext, input: deciderInput }); + } +} + +function decisionKey(p: PendingFile): string { + if (p.ext.length > 0) { + return `ext:${p.ext}`; + } + const segments = p.relativePath.split("/"); + return `filename:${segments[segments.length - 1] ?? p.relativePath}`; +} + function countLines(content: string): number { if (content.length === 0) { return 0; diff --git a/packages/ingest-github/src/pipeline/skip-decisions/README.md b/packages/ingest-github/src/pipeline/skip-decisions/README.md index f4e0273..18d80bb 100644 --- a/packages/ingest-github/src/pipeline/skip-decisions/README.md +++ b/packages/ingest-github/src/pipeline/skip-decisions/README.md @@ -17,6 +17,36 @@ single-tenant public layout. 8. Persist verdict to ~/.bytebell/llmDecisions.json. LLM failure → reject + cache the rejection. 
``` +Steps 1-6 are pure CPU + cached lookup — they run synchronously via +`decideStatic`. Step 7 is the slow LLM branch; `decide` performs it +inline, while `decideAndDeferSave` performs it without flushing the +cache to disk so a batched caller can `persist()` once at the end of +its batch. + +## Public methods (`SkipDecider`) + +```ts +interface SkipDecider { + decide(input): Promise; // legacy single-shot path + decideStatic(input): SkipDecision | null; // sync; null = needs LLM + decideAndDeferSave(input): Promise; // LLM call, no disk save + persist(): void; // flush cache to disk once +} +``` + +- `decide` — the original single-shot API. Calls `decideStatic`; if that + returns `null`, runs the LLM call and `persist()`s the cache. Used by + the legacy `walk()` in `scan.ts` when no shared limiter is passed + (e.g. custom `SourceFactory` consumers that don't opt into two-pass). +- `decideStatic` — synchronous. Returns the resolved `SkipDecision` for + steps 1-6; returns `null` to signal "would need an LLM call". Used by + the two-pass scan to categorise files without blocking the walk. +- `decideAndDeferSave` — runs the LLM call and mutates the in-memory + cache but does **not** flush to disk. Scan calls this concurrently + for unique extension/filename keys under a shared limiter; the disk + write happens once via `persist()` after the batch. +- `persist` — best-effort cache flush; swallows I/O errors. + ## Files - `seed.ts` — loads the four bundled JSON files (directory/filename/pattern/extension lists) @@ -36,7 +66,10 @@ single-tenant public layout. factory time; when disabled the decider degrades to "accept everything past the static blocklist". The LLM branch forwards `SkipDeciderInput.llmCallContext` (when set by the runner) into - `askYesNoLLM` so per-job credentials reach the decision call. + `askYesNoLLM` so per-job credentials reach the decision call. 
The four + methods (`decide`, `decideStatic`, `decideAndDeferSave`, `persist`) share + one internal `staticDecision()` helper so the seed-list + cache-lookup + branch is defined exactly once. - `seed-data/` — the five JSON files copied from kube's `shared/`: `directoryIgnore.json`, `filenameIgnore.json`, `ignorePatterns.json`, `extensions.json`, `llmDecisionsBase.json`. `llmDecisionsBase.json` is @@ -56,8 +89,15 @@ single-tenant public layout. beyond reading the cache file once at factory time. Only the LLM branch reads file content from disk, and even that is bounded by `Config.SkipDecisionMaxCharsForLlm`. -- Every LLM verdict is flushed to disk immediately so a crash mid-scan does - not lose decisions made earlier in the run. +- `decide` flushes to disk immediately after each LLM verdict — same + semantics as before this refactor, so crash mid-scan does not lose + decisions made earlier in the run when the legacy inline path is in use. +- `decideAndDeferSave` does **not** flush; the batched caller (two-pass + scan) is responsible for calling `persist()` exactly once after the + parallel batch resolves. This avoids racing tmp/rename writes when many + unique extensions resolve concurrently. Crash recovery in two-pass mode + is acceptable because the batch is short and re-running the scan + re-resolves the same decisions. - LLM failure defaults to reject and caches the rejection — matches kube's one-shot-rule behavior. Users can hand-edit the cache to revisit. 
- The decider is process-local: tests may construct one with `cachePath` diff --git a/packages/ingest-github/src/pipeline/skip-decisions/decider.ts b/packages/ingest-github/src/pipeline/skip-decisions/decider.ts index 455f633..50185e8 100644 --- a/packages/ingest-github/src/pipeline/skip-decisions/decider.ts +++ b/packages/ingest-github/src/pipeline/skip-decisions/decider.ts @@ -29,6 +29,11 @@ export interface SkipDeciderDeps { cachePath?: string; } +interface StaticDecisionContext { + filename: string; + segments: string[]; +} + export function makeSkipDecider(deps: SkipDeciderDeps = {}): SkipDecider { const enabled = getConfigValue(Config.SkipDecisionEnabled); const cachePath = deps.cachePath ?? defaultCachePath(); @@ -37,54 +42,90 @@ export function makeSkipDecider(deps: SkipDeciderDeps = {}): SkipDecider { logCacheSummary(cache); } - return { - async decide(input: SkipDeciderInput): Promise { - const segments = input.relativePath.split("/"); - const filename = segments[segments.length - 1] ?? input.relativePath; - for (const segment of segments.slice(0, -1)) { - if (SEED_DIRECTORIES.has(segment)) { - return "reject-static"; - } - } - if (SEED_FILENAMES.has(filename)) { - return "reject-static"; - } - if (input.ext.length > 0 && SEED_EXTENSIONS.has(input.ext)) { - return "reject-static"; - } - if (matchesAnyGlob(filename)) { + function contextFor(input: SkipDeciderInput): StaticDecisionContext { + const segments = input.relativePath.split("/"); + const filename = segments[segments.length - 1] ?? 
input.relativePath; + return { filename, segments }; + } + + function staticDecision(input: SkipDeciderInput): SkipDecision | null { + const { filename, segments } = contextFor(input); + for (const segment of segments.slice(0, -1)) { + if (SEED_DIRECTORIES.has(segment)) { return "reject-static"; } + } + if (SEED_FILENAMES.has(filename)) { + return "reject-static"; + } + if (input.ext.length > 0 && SEED_EXTENSIONS.has(input.ext)) { + return "reject-static"; + } + if (matchesAnyGlob(filename)) { + return "reject-static"; + } - if (input.ext.length > 0 && KNOWN_LANGUAGE_EXTENSIONS.has(input.ext)) { - return "accept"; - } + if (input.ext.length > 0 && KNOWN_LANGUAGE_EXTENSIONS.has(input.ext)) { + return "accept"; + } - if (!enabled) { - return "accept"; - } + if (!enabled) { + return "accept"; + } - const cacheKey = input.ext.length > 0 ? input.ext : filename; - const section = input.ext.length > 0 ? cache.extensions : cache.filenames; - const cached = section[cacheKey]; - if (cached !== undefined) { - return cached.ignore ? "reject-llm" : "accept-llm"; - } + const cacheKey = input.ext.length > 0 ? input.ext : filename; + const section = input.ext.length > 0 ? cache.extensions : cache.filenames; + const cached = section[cacheKey]; + if (cached !== undefined) { + return cached.ignore ? "reject-llm" : "accept-llm"; + } + return null; + } + + async function resolveLlm(input: SkipDeciderInput): Promise { + const { filename } = contextFor(input); + const decision = await askLlmDecision(input, deps.repositoryName, input.llmCallContext); + if (input.ext.length > 0) { + setExtensionDecision(cache, input.ext, !decision, "llm", deps.repositoryName, input.relativePath); + } else { + setFilenameDecision(cache, filename, !decision, "llm", deps.repositoryName, input.relativePath); + } + return decision ? 
"accept-llm" : "reject-llm"; + } + + function persist(): void { + if (!enabled) { + return; + } + try { + saveCache(cachePath, cache); + } catch (cause: unknown) { + const msg = cause instanceof Error ? cause.message : String(cause); + logger.warn(`skip-decisions: failed to save cache to ${cachePath}: ${msg}`); + } + } - const decision = await askLlmDecision(input, deps.repositoryName, input.llmCallContext); - if (input.ext.length > 0) { - setExtensionDecision(cache, input.ext, !decision, "llm", deps.repositoryName, input.relativePath); - } else { - setFilenameDecision(cache, filename, !decision, "llm", deps.repositoryName, input.relativePath); + return { + async decide(input: SkipDeciderInput): Promise { + const sync = staticDecision(input); + if (sync !== null) { + return sync; } - try { - saveCache(cachePath, cache); - } catch (cause: unknown) { - const msg = cause instanceof Error ? cause.message : String(cause); - logger.warn(`skip-decisions: failed to save cache to ${cachePath}: ${msg}`); + const result = await resolveLlm(input); + persist(); + return result; + }, + decideStatic(input: SkipDeciderInput): SkipDecision | null { + return staticDecision(input); + }, + async decideAndDeferSave(input: SkipDeciderInput): Promise { + const sync = staticDecision(input); + if (sync !== null) { + return sync; } - return decision ? 
"accept-llm" : "reject-llm"; + return await resolveLlm(input); }, + persist, }; } diff --git a/packages/ingest-github/src/pipeline/skip-decisions/seed-data/ignorePatterns.json b/packages/ingest-github/src/pipeline/skip-decisions/seed-data/ignorePatterns.json index f7991f1..96de6e3 100644 --- a/packages/ingest-github/src/pipeline/skip-decisions/seed-data/ignorePatterns.json +++ b/packages/ingest-github/src/pipeline/skip-decisions/seed-data/ignorePatterns.json @@ -305,7 +305,8 @@ { "type": "exact", "pattern": "CODE_OF_CONDUCT.txt" }, { "type": "exact", "pattern": "FAQ.md" }, { "type": "exact", "pattern": "TROUBLESHOOTING.md" }, - { "type": "exact", "pattern": "UPGRADING.md" } + { "type": "exact", "pattern": "UPGRADING.md" }, + { "type": "extension", "pattern": ".md" } ], "logFiles": [ { "type": "extension", "pattern": ".log" }, diff --git a/packages/ingest-github/src/strategies/flat-folder/README.md b/packages/ingest-github/src/strategies/flat-folder/README.md index 8d26d9d..78d8acf 100644 --- a/packages/ingest-github/src/strategies/flat-folder/README.md +++ b/packages/ingest-github/src/strategies/flat-folder/README.md @@ -1,32 +1,50 @@ # `@bb/ingest-github/src/strategies/flat-folder` -The v2 ingestion strategy: clone → scan → big-file split → per-file analyse → -folder summary → repo summary → graph store. Each phase persists artifacts on -disk before the next begins, so a crash resumes cleanly from the next -sub-phase boundary. +The v2 ingestion strategy: scan + classify → analyse small + big in parallel → +field backfill → folder summary → repo summary → graph store. Each phase +persists artifacts on disk before the next begins, so a crash resumes cleanly +from the next sub-phase boundary. + +The strategy constructs **one shared `ConcurrencyLimiter`** at entry (sized by +`Config.LlmConcurrency`, default 29). 
Every LLM call across small-file +analyses, big-file chunk analyses, per-file condense calls, the skip-decision +LLM gate (during scan), field backfill, and folder summaries checks out from +this single pool. One knob bounds total in-flight LLM concurrency. ## Phases -1. **classify-and-analyse-small** (`phases/classify-and-analyse-small.ts`) — - walks `source.scan({ skipDecider })`; small files → LLM file-analysis → - write `CondensedFileAnalysis` Oversized files → write a stub. Big-by-tokens - files → append to `bigFiles.json` for Phase 2. -2. **process-big-files** (`phases/process-big-files.ts`) — reads - `bigFiles.json`, calls `source.readFile(relativePath)` per entry, - dispatches `processBigFile` sequentially (chunk-level concurrency - inside). -3. **backfill-fields** (`backfill/fields.ts`) — top up `keywords`, - `sideEffects`, `configDependencies`, `dataFlowDirection` on condensed - entries that miss them. Idempotent. -4. **backfill-big-files** (`backfill/big-files.ts`) — re-condense entries - whose chunks exist but condensed JSON is stale or missing. -5. **summarise-folders** (`folder-summary.ts`) — group condensed entries by - `path.posix.dirname` (root = ""), one LLM call per folder, persist to - `folder-summaries/.json`. -6. **summarise-repo** (`repo-summary.ts`) — load folder summaries +1. **scan-and-classify** (`phases/scan-and-classify.ts`) — walks + `source.scan({ skipDecider, limiter })` once, tokenises each file, classifies + as `small` / `big` / `oversized`, and writes + `meta-output/scan-manifest.json` (canonical) plus the legacy + `bigFiles.json` (for the pull-path consumers). Scan internally uses a + **two-pass** strategy: walk + cache-only `decideStatic` first, then + parallel-deduplicated LLM resolution for unknown extensions/filenames + through the shared limiter, then drain. + 2a. 
**analyse-small** (`phases/analyse-small.ts`) — reads the manifest's + `kind: "small"` entries, re-opens content, runs the LLM file-analyser + per file under the shared limiter, writes `CondensedFileAnalysis` JSON. + Also writes oversized stubs. + 2b. **analyse-big-files** (`phases/process-big-files.ts` — + `analyseBigFiles`) — chunk-task queue across all big files. Every chunk + is an independent task on the shared limiter; per-file condense is + scheduled as soon as that file's last chunk lands (one in-place retry + on transient condense failures). Runs **concurrently with 2a**. +3. **backfill-fields** (`backfill/fields.ts`) — for each cached condensed + entry with missing extended fields (`keywords`, `sideEffects`, + `dataFlowDirection`, `sectionMap`, …) dispatches one LLM call through + the shared limiter to fill the gaps. Idempotent — no-op on a complete + entry. +4. **summarise-folders** (`folder-summary.ts`) — groups condensed entries + by direct parent folder. Small folders + (`≤ Config.FolderSummaryBatchMaxFiles`, default 15) are batched up to + `Config.FolderSummaryBatchSize` (default 10) per LLM call. Bigger + folders take the individual single-folder path. Both flows run through + the shared limiter. +5. **summarise-repo** (`repo-summary.ts`) — load folder summaries shallowest-first; one call if it fits `ContextWindowLimit`, batch + merge otherwise; persist `repo-summary.json` with the v2-flat envelope. -7. **store-flat-analysis** (`phases/store-flat-analysis.ts`) — ensure +6. **store-flat-analysis** (`phases/store-flat-analysis.ts`) — ensure flat-folder indexes, upsert `:Repo`, then every `:Folder`, then every `:File` with the extended analysis + Folder→File `CONTAINS` edge. @@ -38,50 +56,68 @@ The strategy emits progress through the `ProgressContext` port defined in (no-op, OSS default).
- **Boundary events** are split between the runner and the strategy: - - `phaseChanged("clone")` and `phaseChanged("scan")` are emitted by - `pipeline/run.ts` (the runner) before `strategy.execute` is called, - so the SSE stream stays alive during the network/disk-bound prelude. - - `phaseChanged("file_analysis")` is emitted by `index.ts` before phase 1 - - `phaseChanged("folder_analysis")` before phase 5 - - `phaseChanged("indexing")` before phase 6 (which feeds phase 7) - - `completed()` after phase 7 returns - - `failed(message)` from a `try/catch` wrapping the whole `execute` -- **Intra-phase ticks** are emitted by each phase via per-phase reporters - created from `progressContext.reporter(...)`. Sub-phase labels: - - phase 1 → no sub-phase (the main file-analysis loop) - - phase 2 → `big_files_queue`; inner `processBigFile` adds - `big_file:` for chunk pulses - - phase 3 → `backfill`; phase 4 → `backfill:big_files` - - phase 5 → no sub-phase, fixed total = directly-grouped folder count - - phase 7 → `folders` then `files`, both `growing` (drained from - on-disk async generators) -- **Total mode**: phase 1, phase 3, and any other streaming-iterator loop - use `total: { kind: "growing" }` (denominator grows as `source.scan` - yields). Phases 2 and 4, plus the big-file chunk pool, know their size - up front and use `total: { kind: "fixed", total: N }`. + - `phaseChanged("clone")` is emitted by `pipeline/run.ts` (the runner) + before `syncRepository`, so the SSE stream stays alive during the + network/disk-bound prelude. + - `phaseChanged("scan")` is emitted by `index.ts` before phase 1. + - `phaseChanged("file_analysis")` before the parallel 2a/2b block. + - `phaseChanged("folder_analysis")` before phase 4 (folder summaries). + - `phaseChanged("indexing")` before phase 5 (which feeds phase 6). + - `completed()` after phase 6 returns. + - `failed(message)` from a `try/catch` wrapping the whole `execute`. 
+- **Intra-phase ticks** are emitted via per-phase reporters created from + `progressContext.reporter(...)`. Sub-phase labels: + - phase 1 (scan) → no sub-phase, growing total (driven by `incrementSeen`). + - phase 2a (analyse-small) → `analyse_small`, fixed total = + `smallCount + oversizedCount`. + - phase 2b (analyse-big) → two reporters: `big_files_chunks` (fixed total + = sum of estimated chunks across all big files) and `big_files_condense` + (fixed total = `bigCount`). + - phase 3 → `backfill`, fixed total = `cache.size`. + - phase 4 → no sub-phase, fixed total = directly-grouped folder count. + - phase 6 → `folders` (growing) then `files` (fixed total = `cache.size`). +- **Pull-path-only sub-phases** (emitted by `pipeline/pull.ts` workflow, + not the main strategy): `big_files_queue` (legacy single-file driver), + `big_file:` (per-big-file chunk pulses inside the legacy + driver), `pull` (`analyse-changed.ts` selective file analysis). +- **Total mode**: scan is the only main-strategy phase that uses + `growing` mode. Everything else has fixed totals known up front from the + scan manifest, the file-analysis cache, or the folder grouping. - The cancellation path in `execute` lets `CancellationError` propagate past the orchestrator; `failed()` only fires for non-cancellation errors. ## Files -- `index.ts` — `createFlatFolderStrategy(deps)` orchestrates the 7 phases. +- `index.ts` — `createFlatFolderStrategy(deps)` orchestrates the phases. Accepts `{ fileAnalyzer, progressContextFactory? }`. Constructs one - `ProgressContext` per job and threads it into every phase that takes a - `progressContext?` field. + `ProgressContext` per job AND one shared `ConcurrencyLimiter` per job + (sized by `Config.LlmConcurrency`); threads both into every phase that + needs them. - `types.ts` — `AnalyzedFileEntry`, `FolderSummary`, `RepoSummary`, `RepoSummaryEnvelope`, `FlatFolderResult`. 
- `analyse-file.ts` — `analyseScannedFile(analyzer, file, llmCallContext?)` + `buildOversizedStub`. -- `analyse-changed.ts` — `analyseChangedFiles({knowledgeId, source, metaPaths, analyzer, diff, llmCallContext?, archiveSink?, progressContext?})`. Pull-time per-file dispatcher. Reads changed file content through `input.source` (a `SourceReader`) so it works with both the disk-backed reader (OSS default) and any HTTP-backed alternative supplied via the `pullFactory` hook. Mirrors `classifyAndAnalyseSmall`'s small-file path: filter → fetch → size cap → binary detect → line count → analyse → save + archive push. Does NOT invoke the skip-decision LLM gate. When `progressContext` is present it creates a fixed-total reporter (`subPhase: "pull"`, `total = dedupedPaths.length`) and increments per-path so the pull SSE stream stays live. +- `analyse-changed.ts` — `analyseChangedFiles({knowledgeId, source, metaPaths, analyzer, diff, llmCallContext?, archiveSink?, progressContext?})`. Pull-time per-file dispatcher. Reads changed file content through `input.source` (a `SourceReader`) so it works with both the disk-backed reader (OSS default) and any HTTP-backed alternative supplied via the `pullFactory` hook. Mirrors `analyseSmallFiles`'s per-file path: filter → fetch → size cap → binary detect → line count → analyse → save + archive push. Does NOT invoke the skip-decision LLM gate. When `progressContext` is present it creates a fixed-total reporter (`subPhase: "pull"`, `total = dedupedPaths.length`) and increments per-path so the pull SSE stream stays live. +- `file-analysis-cache.ts` — in-memory `Map` + loaded once between phase 2 and phase 3; shared read-only by phases 3, 4, + 6; mutated by phase 3 backfill via `cache.set(entry)` so downstream phases + see updated entries without re-reading disk. +- `scan-manifest.ts` — `ScanManifest` shape, `readScanManifest`, + `writeScanManifest`. The canonical handoff between phase 1 and phases 2a/2b. 
- `folder-path.ts` — `directFolderOf`, `affectedFolderPaths`. -- `folder-summary.ts` — group + summarise + persist + iterate folder summaries. +- `folder-summary.ts` — group + summarise (individual or batched) + persist + - iterate folder summaries; shared `dispatchFolderSummaries` used by both + the main strategy and the pull-path's selective folder phase. +- `folder-summary-selective.ts` — pull-time selective folder summary phase. - `repo-summary.ts` — single-shot or batched repo summary with envelope writer. -- `phases/classify-and-analyse-small.ts` — Phase 1. -- `phases/process-big-files.ts` — Phase 2. -- `phases/store-flat-analysis.ts` — Phase 7. -- `backfill/fields.ts` — Phase 3. -- `backfill/big-files.ts` — Phase 4. -- `big-file/` — chunker, analyzer, condenser, storage, cache for Phase 2 & 4. +- `phases/scan-and-classify.ts` — Phase 1. +- `phases/analyse-small.ts` — Phase 2a. +- `phases/process-big-files.ts` — Phase 2b (`analyseBigFiles`, chunk-task + queue) plus the legacy `processBigFilesQueue` driver used by the pull-path. +- `phases/store-flat-analysis.ts` — Phase 6. +- `backfill/fields.ts` — Phase 3 (parallel via shared limiter). +- `big-file/` — chunker, analyzer, condenser, storage, cache used by both + big-file drivers. - `prompts/` — LLM prompts shared across the phases. ## Invariants @@ -103,14 +139,15 @@ The strategy emits progress through the `ProgressContext` port defined in after `saveCondensed`; failures inside the sink are logged WARN and do not interrupt the analyse loop. The open-source binary never wires a sink — `archiveSink` is undefined and the call is skipped entirely. 
-- **Per-job LLM credentials thread through every phase.** The orchestrator +- **Per-call LLM credentials thread through every phase.** The orchestrator reads `context.llmCallContext` (an optional `AskLlmOptions` built by the runner from `GithubIndexPayload.{llmApiKey, llmProvider, llmModel}`) and forwards it into every phase that issues LLM calls: phase 1 via - `classifyAndAnalyseSmall`'s `llmCallContext`, phase 2 via - `processBigFilesQueue`, phase 3 via `backfillMissingFields`, phase 4 via - `backfillBigFiles`, phase 5 via `runFolderSummaryPhase`, phase 6 via - `summariseRepo`. The phases pass the same option object through to - `askJsonLLM` so per-org overrides reach `@bb/llm` unchanged. OSS - standalone leaves `llmCallContext` undefined and falls back to - `Config.OpenrouterApiKey` + `Config.LlmProvider`. + `scanAndClassify` (forwarded into `source.scan({ llmCallContext })` for + the skip-decision LLM gate), phase 2a via `analyseSmallFiles`, phase 2b + via `analyseBigFiles` (which threads it into **both** the chunk analyzer + and `condenseChunks`), phase 3 via `backfillMissingFields`, phase 4 via + `runFolderSummaryPhase`, phase 5 via `summariseRepo`. The phases pass + the same option object through to `askJsonLLM` so the per-call override + reaches `@bb/llm` unchanged. When `llmCallContext` is undefined the call + falls back to `Config.OpenrouterApiKey` + `Config.LlmProvider`. diff --git a/packages/ingest-github/src/strategies/flat-folder/analyse-changed.ts b/packages/ingest-github/src/strategies/flat-folder/analyse-changed.ts index 982d0a7..17f0125 100644 --- a/packages/ingest-github/src/strategies/flat-folder/analyse-changed.ts +++ b/packages/ingest-github/src/strategies/flat-folder/analyse-changed.ts @@ -39,7 +39,7 @@ export interface AnalyseChangedResult { /** * Pull-time per-file dispatcher. 
Iterates the changed file set from the - diff and runs the same per-file work as `classifyAndAnalyseSmall`, but + * diff and runs the same per-file work as `analyseSmallFiles`, but * targeted at known paths rather than a tree walk. * * Reads file content through `input.source` (a `SourceReader`) so the diff --git a/packages/ingest-github/src/strategies/flat-folder/backfill/README.md b/packages/ingest-github/src/strategies/flat-folder/backfill/README.md index dfa3d72..f580f19 100644 --- a/packages/ingest-github/src/strategies/flat-folder/backfill/README.md +++ b/packages/ingest-github/src/strategies/flat-folder/backfill/README.md @@ -1,66 +1,72 @@ # `@bb/ingest-github/src/strategies/flat-folder/backfill` -Post-analysis top-up phases. After Phases 1 and 2 have produced -`CondensedFileAnalysis` JSON on disk, the backfill phases sweep the cache -to fill gaps left by per-file LLM noise or by interrupted big-file runs. -Both are idempotent and skip entries that already look complete. +Post-analysis top-up. After Phases 1 and 2 have produced +`CondensedFileAnalysis` JSON on disk, this phase sweeps the in-memory +cache to fill extended-analysis fields the main per-file prompt left +empty. Idempotent — entries that already look complete are skipped +without an LLM call. + +The big-file backfill phase that used to live here was removed: the +new chunk-task-queue model in `phases/process-big-files.ts` handles +crash recovery directly via the per-chunk disk cache and `inspect()`, +and same-run condense failures are now retried once in-place before +being marked failed. ## Files -- `fields.ts` — Phase 3.
`backfillMissingFields(metaPaths, llmCallContext?, progressContext?)` - iterates every condensed entry via `iterateCondensed`, computes which - extended-analysis fields are missing (`keywords`, `ontologyConcepts`, - `businessEntities`, `systemCapabilities`, `sideEffects`, - `configDependencies`, `dataFlowDirection`, `integrationSurface`, - `contractsProvided`, `contractsConsumed`, `sectionMap`), and asks one - LLM call per file to fill only the missing slots. The response is - validated and normalised (`pickStringArray`, `pickSections`) before - being written back via `saveCondensed`. Entries with nothing missing - are skipped without an LLM call. When `progressContext` is present - this phase opens a growing-total reporter (`subPhase: "backfill"`) - because `iterateCondensed`'s size is not known up front. -- `big-files.ts` — Phase 4. `backfillBigFiles({knowledgeId, repoDir, -metaPaths, llmCallContext?, progressContext?})` re-reads - `bigFiles.json`, skips `reason === "too-large"`, and for each - non-complete entry (per `inspect`) re-runs `processBigFile` against - the file on disk so the condensed JSON is rebuilt from cached chunks - where possible. When `progressContext` is present this phase opens a - fixed-total reporter (`subPhase: "backfill:big_files"`, sized by - `bigFiles.json`) and forwards itself into `processBigFile` so per-file - chunk pulses also surface. +- `fields.ts` — Phase 3. `backfillMissingFields(metaPaths, cache, limiter, llmCallContext?, progressContext?)` + iterates every condensed entry from the shared `FileAnalysisCache`, + computes which extended-analysis fields are missing (`keywords`, + `ontologyConcepts`, `businessEntities`, `systemCapabilities`, + `sideEffects`, `configDependencies`, `dataFlowDirection`, + `integrationSurface`, `contractsProvided`, `contractsConsumed`, + `sectionMap`), and dispatches one LLM call per file **through the shared + `ConcurrencyLimiter`** to fill only the missing slots. 
Tasks run + concurrently up to `Config.LlmConcurrency`; the loop builds the task + array and awaits `Promise.all` at the end. The response is validated and + normalised (`pickStringArray`, `pickSections`) before being written back + via `saveCondensed` **and** mirrored into the cache via `cache.set(entry)` + so downstream phases (folder summary, graph store) see the updated entry + without re-reading disk. Entries with nothing missing are skipped + without an LLM call. Progress reporter is fixed-total sized by + `cache.size`. Emits `phase3 dispatching N backfill tasks` at entry so the + caller can see how many tasks went through the limiter. ## Public interfaces -- `backfillMissingFields(metaPaths, llmCallContext?, progressContext?): Promise<{ updated, failed }>` -- `backfillBigFiles(input: BackfillBigFilesInput): Promise` - — `BackfillBigFilesInput` carries an optional `llmCallContext?: AskLlmOptions` that the inner `processBigFile` call uses to forward per-job LLM credentials, and an optional `progressContext?: ProgressContext` for the per-phase reporter described above. +- `backfillMissingFields(metaPaths, cache, limiter, llmCallContext?, progressContext?): Promise<{ updated, failed }>` -Both return phase-summary counters consumed by `createFlatFolderStrategy` +Returns phase-summary counters consumed by `createFlatFolderStrategy` to roll up into the strategy result. ## Data ownership -These phases own no new on-disk artifacts. They mutate existing condensed -JSON in place via `saveCondensed`, and (Phase 4) drive `processBigFile` to -refresh the chunk and condensed caches under `big-file/storage.ts`. +This phase owns no new on-disk artifacts. It mutates existing +condensed JSON in place via `saveCondensed` and mirrors the same +mutation into `FileAnalysisCache`. ## Invariants - Idempotent: a second run is a no-op once every entry passes the completeness check. - Per-file LLM failure is logged and counted, never thrown. The phase - continues to the next entry. 
-- LLM output is untrusted: missing slots are filled only when the response - yields a non-empty value of the expected shape; partial responses leave - unfilled slots for a future pass. -- Phase 4 never touches `reason === "too-large"` entries — those stay as - stubs forever. + continues to the next entry. Only `LlmConfigError` / `LlmError` + propagate (treated as job-fatal upstream). +- LLM output is untrusted: missing slots are filled only when the + response yields a non-empty value of the expected shape; partial + responses leave unfilled slots for a future pass. +- Cache and disk stay in lockstep — every `saveCondensed` is paired + with a `cache.set(entry)` in the same code path. +- Concurrency is bounded by the shared `ConcurrencyLimiter` (today's + `Config.LlmConcurrency`). Counters (`updated`, `failed`, token totals) + are mutated from inside the concurrent tasks — safe under JS's + single-threaded event loop, no locking needed. ## External dependencies `@bb/llm` (`askJsonLLM`), `@bb/logger`, `@bb/mongo` (types only — `FileAnalysis`, `FileAnalysisSection`), the sibling -`flat-folder/big-file/` cache layer, and the prompts under +`flat-folder/file-analysis-cache.ts`, and the prompts under `flat-folder/prompts/backfill.ts`. 
## Tier diff --git a/packages/ingest-github/src/strategies/flat-folder/backfill/big-files.ts b/packages/ingest-github/src/strategies/flat-folder/backfill/big-files.ts deleted file mode 100644 index 587808c..0000000 --- a/packages/ingest-github/src/strategies/flat-folder/backfill/big-files.ts +++ /dev/null @@ -1,78 +0,0 @@ -import { logger } from "@bb/logger"; -import type { AskLlmOptions } from "@bb/llm"; -import { LlmConfigError, LlmError } from "@bb/errors"; -import type { MetaPaths } from "#src/types/meta-paths.ts"; -import type { SourceReader } from "#src/types/pipeline.ts"; -import type { ProgressContext } from "#src/progress/types.ts"; -import { readBigFiles } from "#src/strategies/flat-folder/big-file/detector.ts"; -import { inspect } from "#src/strategies/flat-folder/big-file/cache.ts"; -import { processBigFile } from "#src/strategies/flat-folder/big-file/index.ts"; - -export interface BackfillBigFilesInput { - knowledgeId: string; - source: SourceReader; - metaPaths: MetaPaths; - llmCallContext?: AskLlmOptions; - progressContext?: ProgressContext; -} - -export interface BackfillBigFilesResult { - reCondensed: number; - failed: number; -} - -export async function backfillBigFiles(input: BackfillBigFilesInput): Promise { - const entries = await readBigFiles(input.metaPaths); - let reCondensed = 0; - let failed = 0; - const reporter = input.progressContext?.reporter({ - phase: "file_analysis", - subPhase: "backfill:big_files", - total: { kind: "fixed", total: entries.length }, - }); - await reporter?.start(); - try { - for (const entry of entries) { - if (entry.reason === "too-large") { - reporter?.increment(1, { fileName: entry.relativePath }); - continue; - } - const status = await inspect(input.metaPaths, entry.relativePath); - if (status === "complete") { - reporter?.increment(1, { fileName: entry.relativePath }); - continue; - } - try { - const content = await input.source.readFile(entry.relativePath); - if (content.length === 0) { - failed += 1; - 
logger.warn(`phase4: empty content for ${entry.relativePath}; skipping`); - reporter?.increment(1, { fileName: entry.relativePath }); - continue; - } - await processBigFile({ - knowledgeId: input.knowledgeId, - metaPaths: input.metaPaths, - relativePath: entry.relativePath, - content, - sizeBytes: entry.sizeBytes, - ...(input.llmCallContext !== undefined ? { llmCallContext: input.llmCallContext } : {}), - ...(input.progressContext !== undefined ? { progressContext: input.progressContext } : {}), - }); - reCondensed += 1; - } catch (cause: unknown) { - if (cause instanceof LlmConfigError || cause instanceof LlmError) { - throw cause; - } - failed += 1; - const msg = cause instanceof Error ? cause.message : String(cause); - logger.warn(`phase4: re-condense failed for ${entry.relativePath}: ${msg}`); - } - reporter?.increment(1, { fileName: entry.relativePath }); - } - logger.info(`phase4 done: reCondensed=${reCondensed} failed=${failed}`); - return { reCondensed, failed }; - } finally { - reporter?.stop(); - } -} diff --git a/packages/ingest-github/src/strategies/flat-folder/backfill/fields.ts b/packages/ingest-github/src/strategies/flat-folder/backfill/fields.ts index b6db25e..9effedb 100644 --- a/packages/ingest-github/src/strategies/flat-folder/backfill/fields.ts +++ b/packages/ingest-github/src/strategies/flat-folder/backfill/fields.ts @@ -4,8 +4,9 @@ import { logger } from "@bb/logger"; import type { FileAnalysis, FileAnalysisSection } from "@bb/mongo"; import type { MetaPaths } from "#src/types/meta-paths.ts"; import type { ProgressContext } from "#src/progress/types.ts"; -import { iterateCondensed } from "#src/strategies/flat-folder/big-file/storage.ts"; +import type { ConcurrencyLimiter } from "#src/pipeline/concurrency.ts"; import { saveCondensed } from "#src/strategies/flat-folder/big-file/storage.ts"; +import type { FileAnalysisCache } from "#src/strategies/flat-folder/file-analysis-cache.ts"; import { BACKFILL_SYSTEM_PROMPT, buildBackfillUserPrompt } from 
"#src/strategies/flat-folder/prompts/backfill.ts"; const EXTENDED_ARRAY_KEYS = [ @@ -44,46 +45,58 @@ interface NeededFlags { export async function backfillMissingFields( metaPaths: MetaPaths, + cache: FileAnalysisCache, + limiter: ConcurrencyLimiter, llmCallContext?: AskLlmOptions, progressContext?: ProgressContext, ): Promise<{ updated: number; failed: number }> { let updated = 0; let failed = 0; + let dispatched = 0; const reporter = progressContext?.reporter({ phase: "file_analysis", subPhase: "backfill", - total: { kind: "growing" }, + total: { kind: "fixed", total: cache.size }, }); await reporter?.start(); try { - for await (const entry of iterateCondensed(metaPaths)) { - reporter?.incrementSeen(); + const tasks: Promise[] = []; + for (const entry of cache.values()) { const a = entry.analysis; const needed = computeNeeded(a); if (!hasAnyMissing(needed)) { reporter?.increment(1, { fileName: entry.relativePath }); continue; } - const userPrompt = buildBackfillUserPrompt(entry.relativePath, entry.analysis); - try { - const response = await askJsonLLM(BACKFILL_SYSTEM_PROMPT, userPrompt, llmCallContext ?? {}); - const result = response.result; - if (result === null) { - reporter?.increment(1, { fileName: entry.relativePath }); - continue; - } - applyBackfill(a, result, needed); - await saveCondensed(metaPaths, entry); - updated += 1; - } catch (cause: unknown) { - if (cause instanceof LlmConfigError || cause instanceof LlmError) { - throw cause; - } - failed += 1; - logger.warn(`phase3: backfill failed for ${entry.relativePath}: ${describe(cause)}`); - } - reporter?.increment(1, { fileName: entry.relativePath }); + dispatched += 1; + tasks.push( + limiter(async () => { + const userPrompt = buildBackfillUserPrompt(entry.relativePath, entry.analysis); + try { + const response = await askJsonLLM(BACKFILL_SYSTEM_PROMPT, userPrompt, llmCallContext ?? 
{}); + const result = response.result; + if (result === null) { + reporter?.increment(1, { fileName: entry.relativePath }); + return; + } + applyBackfill(a, result, needed); + await saveCondensed(metaPaths, entry); + cache.set(entry); + updated += 1; + } catch (cause: unknown) { + if (cause instanceof LlmConfigError || cause instanceof LlmError) { + throw cause; + } + failed += 1; + logger.warn(`phase3: backfill failed for ${entry.relativePath}: ${describe(cause)}`); + } finally { + reporter?.increment(1, { fileName: entry.relativePath }); + } + }), + ); } + logger.info(`phase3 dispatching ${dispatched} backfill tasks`); + await Promise.all(tasks); logger.info(`phase3 done: updated=${updated} failed=${failed}`); return { updated, failed }; } finally { diff --git a/packages/ingest-github/src/strategies/flat-folder/big-file/README.md b/packages/ingest-github/src/strategies/flat-folder/big-file/README.md index b1c974a..264d8ea 100644 --- a/packages/ingest-github/src/strategies/flat-folder/big-file/README.md +++ b/packages/ingest-github/src/strategies/flat-folder/big-file/README.md @@ -15,31 +15,62 @@ depending on chunk count and prompt budget. `askJsonLLM` with the chunk prompt; tolerates failures by returning an empty analysis. `llmCallContext` forwards per-job LLM credentials threaded through from `StrategyContext`. -- `condenser.ts` — `condenseChunks(relativePath, chunks)`: +- `condenser.ts` — `condenseChunks(relativePath, chunks, llmCallContext?)`: ≤ `SmallFileDedupThreshold` → deterministic merge (no LLM); - above → recursive map-reduce. Per-condense LLM failure falls back to - deterministic dedup so recursion always terminates. + above → recursive map-reduce. 
`llmCallContext` is threaded through + `condenseRecursively` and `condenseOne` to every `askJsonLLM` call so + the same per-call credential bag the chunk analyser uses also reaches + the condense step — without it, callers that rely on per-call overrides + instead of `Config.OpenrouterApiKey` would hit `LlmConfigError` here. + Per-condense LLM failure falls back to deterministic dedup so recursion + always terminates. - `storage.ts` — on-disk cache (chunk JSON, manifest, condensed analysis) + `iterateCondensed(metaPaths)` async iterator used by Phase 5. - `cache.ts` — `inspect(metaPaths, relativePath)` returns `complete`, - `stale-condensed`, or `missing`. Used by Phase 2 to short-circuit and by - Phase 4 to find candidates for cheap re-condense. + `stale-condensed`, or `missing`. Used by Phase 2 to short-circuit + already-finished big files on resume. The chunk task queue then + re-uses cached chunks via `loadChunkIfPresent` and re-runs condense + to recover any `stale-condensed` files — this is the crash-recovery + pathway that replaced the deleted Phase 4 backfill. - `index.ts` — `processBigFile({knowledgeId, metaPaths, relativePath, content, sizeBytes, llmCallContext?, progressContext?})`. Sequential per file (chunk-level concurrency inside). Persists every intermediate artifact, so a restart resumes from the next unfinished chunk. `llmCallContext` - is forwarded to every chunk analyzer call so per-job LLM credentials - reach `@bb/llm`. When `progressContext` is present, the chunk pool runs - under a fixed-total reporter - (`subPhase: "big_file:"`, `total = chunks.length`) so - long single-file analyses surface as live `PHASE_TICK` envelopes + is forwarded to **both** sides of the big-file pipeline — every + `analyzeChunk` call inside the worker loop **and** the final + `condenseChunks(...)` call — so per-call LLM credentials reach + `@bb/llm` consistently across chunk analysis and condense. 
When + `progressContext` is present, the chunk pool runs under a fixed-total + reporter (`subPhase: "big_file:"`, `total = chunks.length`) + so long single-file analyses surface as live `PHASE_TICK` envelopes carrying per-chunk progress instead of looking frozen. +## Two callers + +These leaf helpers (`splitFileIntoChunks`, `analyzeChunk`, `condenseChunks`, +the storage / cache primitives) are consumed by **two** drivers: + +- `processBigFile` (`index.ts`) — legacy serial driver. One big file at a + time, chunks-within-file parallel under `Config.BigFileConcurrency`, + followed by a blocking condense. Used today only by the pull-path + (`pipeline/pull.ts`) via `processBigFilesQueue`. +- `analyseBigFiles` (`phases/process-big-files.ts`) — manifest-driven + chunk-task queue used by the main strategy entry. Every chunk of every + big file is an independent task scheduled through a strategy-wide + shared `ConcurrencyLimiter`. As soon as a file's last chunk lands, + that file's `condenseChunks` is scheduled through the same limiter + (with one in-place retry on transient failure) — multiple condenses + run in parallel with chunks of slower files. Reuses + `splitFileIntoChunks`, `analyzeChunk`, `condenseChunks`, and the + storage helpers without modification. + ## Invariants -- One big file at a time. Concurrency lives at the chunk level inside - `processBigFile`, never across files, to bound peak memory. - Every artifact is durable on disk before the next step. The chunk cache - short-circuits on re-runs; the manifest plus condensed JSON are the - Phase 7 graph-store inputs. -- Cancellation is checked between chunks (`throwIfCancelled(knowledgeId)`). + short-circuits on re-runs (per-chunk granularity, not per-file); the + manifest plus condensed JSON are the Phase 7 graph-store inputs. +- Cancellation is checked between chunks and before each condense + dispatch (`throwIfCancelled(knowledgeId)`). +- `bigFiles.json` is now a derived view written by `scanAndClassify`. 
+ The main strategy reads it indirectly via the manifest; the legacy + drivers (pull-path + backfill) continue to read it directly. diff --git a/packages/ingest-github/src/strategies/flat-folder/big-file/condenser.ts b/packages/ingest-github/src/strategies/flat-folder/big-file/condenser.ts index a7b9276..a4663bd 100644 Binary files a/packages/ingest-github/src/strategies/flat-folder/big-file/condenser.ts and b/packages/ingest-github/src/strategies/flat-folder/big-file/condenser.ts differ diff --git a/packages/ingest-github/src/strategies/flat-folder/big-file/index.ts b/packages/ingest-github/src/strategies/flat-folder/big-file/index.ts index 255be0b..c35b234 100644 --- a/packages/ingest-github/src/strategies/flat-folder/big-file/index.ts +++ b/packages/ingest-github/src/strategies/flat-folder/big-file/index.ts @@ -74,7 +74,7 @@ export async function processBigFile(input: ProcessBigFileInput): Promise `chunks/${encodeFolder(input.relativePath)}/chunk-${i}.json`); const totalTokenCount = chunks.reduce((acc, c) => acc + c.tokenCount, 0); diff --git a/packages/ingest-github/src/strategies/flat-folder/file-analysis-cache.ts b/packages/ingest-github/src/strategies/flat-folder/file-analysis-cache.ts new file mode 100644 index 0000000..4405682 --- /dev/null +++ b/packages/ingest-github/src/strategies/flat-folder/file-analysis-cache.ts @@ -0,0 +1,91 @@ +import { readdir, readFile } from "node:fs/promises"; +import path from "node:path"; +import { logger } from "@bb/logger"; +import type { CondensedFileAnalysis } from "#src/types/condensed-file-analysis.ts"; +import type { MetaPaths } from "#src/types/meta-paths.ts"; +import { withConcurrency } from "#src/pipeline/concurrency.ts"; + +const LOAD_CONCURRENCY = 20; + +/** + * In-memory snapshot of every `CondensedFileAnalysis` JSON under + * `metaPaths.fileAnalysisDir`. Loaded once per strategy run between the + * analyse phases (2a/2b) and the backfill / folder-summary / graph-store + * phases. 
The downstream consumers iterate `.values()` (full sweeps) or + * `.get(relativePath)` (random-access); Phase 3 also calls `.set(...)` + * to keep the map in sync with disk writes. + * + * Replaces three sequential `iterateCondensed` walks (one per consumer) + * with one parallel preload + three in-memory iterations. + */ +export class FileAnalysisCache { + private readonly map: Map; + + private constructor(map: Map) { + this.map = map; + } + + static async loadAll(metaPaths: MetaPaths): Promise { + const startedAt = Date.now(); + let filenames: string[]; + try { + filenames = await readdir(metaPaths.fileAnalysisDir); + } catch (cause: unknown) { + logger.warn(`file-analysis-cache: readdir failed for ${metaPaths.fileAnalysisDir}: ${describe(cause)}`); + return new FileAnalysisCache(new Map()); + } + const jsonFiles = filenames.filter((n) => n.endsWith(".json")); + const map = new Map(); + const limit = withConcurrency(LOAD_CONCURRENCY); + const tasks: Promise[] = []; + for (const name of jsonFiles) { + tasks.push( + limit(async () => { + const full = path.join(metaPaths.fileAnalysisDir, name); + try { + const raw = await readFile(full, "utf8"); + const parsed: unknown = JSON.parse(raw); + if (typeof parsed !== "object" || parsed === null) { + return; + } + const entry = parsed as CondensedFileAnalysis; + if (typeof entry.relativePath !== "string" || entry.relativePath.length === 0) { + return; + } + map.set(entry.relativePath, entry); + } catch (cause: unknown) { + logger.warn(`file-analysis-cache: failed to read ${name}: ${describe(cause)}`); + } + }), + ); + } + await Promise.all(tasks); + const elapsedMs = Date.now() - startedAt; + logger.info(`file-analysis-cache: loaded ${map.size} entries in ${elapsedMs} ms`); + return new FileAnalysisCache(map); + } + + get(relativePath: string): CondensedFileAnalysis | undefined { + return this.map.get(relativePath); + } + + set(entry: CondensedFileAnalysis): void { + this.map.set(entry.relativePath, entry); + } + + 
values(): IterableIterator { + return this.map.values(); + } + + entries(): IterableIterator<[string, CondensedFileAnalysis]> { + return this.map.entries(); + } + + get size(): number { + return this.map.size; + } +} + +function describe(cause: unknown): string { + return cause instanceof Error ? cause.message : String(cause); +} diff --git a/packages/ingest-github/src/strategies/flat-folder/folder-summary-selective.ts b/packages/ingest-github/src/strategies/flat-folder/folder-summary-selective.ts index d053d82..a2d8791 100644 --- a/packages/ingest-github/src/strategies/flat-folder/folder-summary-selective.ts +++ b/packages/ingest-github/src/strategies/flat-folder/folder-summary-selective.ts @@ -1,19 +1,16 @@ import { logger } from "@bb/logger"; -import { Config } from "@bb/types"; -import { getConfigValue } from "@bb/config"; import type { AskLlmOptions } from "@bb/llm"; +import type { CondensedFileAnalysis } from "#src/types/condensed-file-analysis.ts"; import type { MetaPaths } from "#src/types/meta-paths.ts"; -import { withConcurrency } from "#src/pipeline/concurrency.ts"; -import { throwIfCancelled, CancellationError } from "#src/pipeline/cancellation.ts"; -import { - groupByDirectFolder, - persistFolderSummary, - summariseFolder, -} from "#src/strategies/flat-folder/folder-summary.ts"; +import type { ConcurrencyLimiter } from "#src/pipeline/concurrency.ts"; +import type { FileAnalysisCache } from "#src/strategies/flat-folder/file-analysis-cache.ts"; +import { dispatchFolderSummaries, groupByDirectFolder } from "#src/strategies/flat-folder/folder-summary.ts"; export interface SelectiveFolderSummaryInput { knowledgeId: string; metaPaths: MetaPaths; + cache: FileAnalysisCache; + limiter: ConcurrencyLimiter; affectedFolders: Set; llmCallContext?: AskLlmOptions; } @@ -27,57 +24,37 @@ export interface SelectiveFolderSummaryResult { /** * Pull-time folder summary. 
Same machinery as `runFolderSummaryPhase` but - * only regenerates folders the caller flagged as affected. Reads condensed - * file analyses from disk; the dispatcher must have populated them already. + * only regenerates folders the caller flagged as affected. Filters by + * `affectedFolders` BEFORE batching so skipped folders never enter a batch. */ export async function runSelectiveFolderSummary( input: SelectiveFolderSummaryInput, ): Promise { - const concurrentWorkers = getConfigValue(Config.ConcurrentWorkers); - const limit = withConcurrency(concurrentWorkers); - const groups = await groupByDirectFolder(input.metaPaths); - let succeeded = 0; - let failed = 0; + const allGroups = groupByDirectFolder(input.cache); + const affectedGroups = new Map(); let skipped = 0; - let totalInputTokens = 0; - let totalOutputTokens = 0; - let totalCostUsd = 0; - const tasks: Promise[] = []; - for (const [folderPath, files] of groups.entries()) { - if (!input.affectedFolders.has(folderPath)) { + for (const [folderPath, files] of allGroups.entries()) { + if (input.affectedFolders.has(folderPath)) { + affectedGroups.set(folderPath, files); + } else { skipped += 1; - continue; } - tasks.push( - limit(async () => { - try { - throwIfCancelled(input.knowledgeId); - const { summary, tokenUsage } = await summariseFolder(folderPath, files, input.llmCallContext); - totalInputTokens += tokenUsage.inputTokens; - totalOutputTokens += tokenUsage.outputTokens; - totalCostUsd += tokenUsage.costUsd; - if (summary !== null) { - await persistFolderSummary(input.metaPaths, summary); - succeeded += 1; - } else { - failed += 1; - } - } catch (cause: unknown) { - if (cause instanceof CancellationError) { - throw cause; - } - failed += 1; - logger.warn(`pull-folder-summary: failed for ${folderPath || ""}`); - } - }), - ); } - await Promise.all(tasks); - logger.info(`pull-folder-summary done: succeeded=${succeeded} failed=${failed} skipped=${skipped}`); + + const totals = await 
dispatchFolderSummaries( + affectedGroups, + input.metaPaths, + input.limiter, + input.llmCallContext, + undefined, + input.knowledgeId, + "pull-folder-summary", + ); + logger.info(`pull-folder-summary done: succeeded=${totals.succeeded} failed=${totals.failed} skipped=${skipped}`); return { - succeeded, - failed, + succeeded: totals.succeeded, + failed: totals.failed, skipped, - tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens, costUsd: totalCostUsd }, + tokenUsage: { inputTokens: totals.inputTokens, outputTokens: totals.outputTokens, costUsd: totals.costUsd }, }; } diff --git a/packages/ingest-github/src/strategies/flat-folder/folder-summary.ts b/packages/ingest-github/src/strategies/flat-folder/folder-summary.ts index 4fa175b..cdd9c5d 100644 --- a/packages/ingest-github/src/strategies/flat-folder/folder-summary.ts +++ b/packages/ingest-github/src/strategies/flat-folder/folder-summary.ts @@ -8,17 +8,23 @@ import { getConfigValue } from "@bb/config"; import type { CondensedFileAnalysis } from "#src/types/condensed-file-analysis.ts"; import type { MetaPaths } from "#src/types/meta-paths.ts"; import { encodeMetaPath } from "#src/pipeline/paths.ts"; -import { withConcurrency } from "#src/pipeline/concurrency.ts"; +import type { ConcurrencyLimiter } from "#src/pipeline/concurrency.ts"; import { throwIfCancelled, CancellationError } from "#src/pipeline/cancellation.ts"; import type { ProgressContext } from "#src/progress/types.ts"; -import { iterateCondensed } from "./big-file/storage.ts"; +import type { FileAnalysisCache } from "./file-analysis-cache.ts"; import { directFolderOf } from "./folder-path.ts"; -import { FOLDER_ANALYSIS_SYSTEM_PROMPT, folderAnalysisUserPrompt } from "./prompts/folder-summary.ts"; +import { + FOLDER_ANALYSIS_SYSTEM_PROMPT, + FOLDER_BATCH_SYSTEM_PROMPT, + folderAnalysisUserPrompt, + folderBatchUserPrompt, + type BatchedFolderInput, +} from "./prompts/folder-summary.ts"; import type { FolderSummary } from 
"./types.ts"; -export async function groupByDirectFolder(metaPaths: MetaPaths): Promise> { +export function groupByDirectFolder(cache: FileAnalysisCache): Map { const groups = new Map(); - for await (const entry of iterateCondensed(metaPaths)) { + for (const entry of cache.values()) { const folder = directFolderOf(entry.relativePath); const bucket = groups.get(folder) ?? []; bucket.push(entry); @@ -38,6 +44,52 @@ interface FolderSummaryJson { dependencyGraph?: unknown; } +export interface FolderBucket { + folderPath: string; + files: CondensedFileAnalysis[]; +} + +/** + * Splits the folder groups into "individual" (one LLM call per folder, used + * for big folders or when batching is disabled) and "batches" (N small + * folders summarised in one LLM call). Driven by `Config.FolderSummaryBatchSize` + * (set to 1 to disable batching entirely) and `Config.FolderSummaryBatchMaxFiles` + * (folders exceeding this file count always take the individual path). + * + * Folders are sorted by path so that two runs of the same repo produce the + * same batch composition — helpful when A/B-comparing outputs. 
+ */ +export function groupFoldersForBatching(groups: Map): { + individual: FolderBucket[]; + batches: FolderBucket[][]; +} { + const batchSize = getConfigValue(Config.FolderSummaryBatchSize); + const maxFiles = getConfigValue(Config.FolderSummaryBatchMaxFiles); + const sorted: FolderBucket[] = [...groups.entries()] + .map(([folderPath, files]) => ({ folderPath, files })) + .sort((a, b) => a.folderPath.localeCompare(b.folderPath)); + + if (batchSize <= 1) { + return { individual: sorted, batches: [] }; + } + + const individual: FolderBucket[] = []; + const batchable: FolderBucket[] = []; + for (const bucket of sorted) { + if (bucket.files.length > maxFiles) { + individual.push(bucket); + } else { + batchable.push(bucket); + } + } + + const batches: FolderBucket[][] = []; + for (let i = 0; i < batchable.length; i += batchSize) { + batches.push(batchable.slice(i, i + batchSize)); + } + return { individual, batches }; +} + export async function summariseFolder( folderPath: string, files: CondensedFileAnalysis[], @@ -82,6 +134,72 @@ export async function summariseFolder( } } +/** + * Multi-folder summary. Builds a label-indexed prompt, parses the keyed JSON + * response, returns one `FolderSummary | null` per folder. Folders missing + * from the response (or whose entry fails shape validation) are surfaced as + * `null` with a warn log; the caller counts those as failed. + */ +export async function summariseFolderBatch( + batch: FolderBucket[], + llmCallContext?: AskLlmOptions, +): Promise<{ + summaries: Map; + tokenUsage: { inputTokens: number; outputTokens: number; costUsd: number }; +}> { + const labeled: BatchedFolderInput[] = batch.map((b, i) => ({ label: i, folderPath: b.folderPath, files: b.files })); + const userPrompt = folderBatchUserPrompt(labeled); + const summaries = new Map(); + try { + const response = await askJsonLLM>( + FOLDER_BATCH_SYSTEM_PROMPT, + userPrompt, + llmCallContext ?? 
{}, + ); + if (response.result === null) { + logger.warn(`summariseFolderBatch: batch of ${batch.length} returned unparseable JSON`); + for (const b of batch) { + summaries.set(b.folderPath, null); + } + return { + summaries, + tokenUsage: { + inputTokens: response.usage.inputTokens, + outputTokens: response.usage.outputTokens, + costUsd: response.usage.costUsd, + }, + }; + } + for (const b of labeled) { + const raw = response.result[String(b.label)]; + if (raw === undefined || typeof raw !== "object" || raw === null) { + logger.warn(`summariseFolderBatch: missing/invalid entry for label ${b.label} (${b.folderPath || ""})`); + summaries.set(b.folderPath, null); + continue; + } + summaries.set(b.folderPath, shapeFolderSummary(b.folderPath, raw)); + } + return { + summaries, + tokenUsage: { + inputTokens: response.usage.inputTokens, + outputTokens: response.usage.outputTokens, + costUsd: response.usage.costUsd, + }, + }; + } catch (cause: unknown) { + if (cause instanceof LlmConfigError || cause instanceof LlmError) { + throw cause; + } + const msg = cause instanceof Error ? cause.message : String(cause); + logger.warn(`summariseFolderBatch: batch of ${batch.length} askJsonLLM failed: ${msg}`); + for (const b of batch) { + summaries.set(b.folderPath, null); + } + return { summaries, tokenUsage: { inputTokens: 0, outputTokens: 0, costUsd: 0 } }; + } +} + export async function persistFolderSummary(metaPaths: MetaPaths, summary: FolderSummary): Promise { const file = path.join(metaPaths.folderSummariesDir, `${encodeMetaPath(summary.folderPath || "__ROOT__")}.json`); await writeFile(file, JSON.stringify(summary, null, 2), "utf8"); @@ -110,9 +228,138 @@ export async function* iterateFolderSummaries(metaPaths: MetaPaths): AsyncGenera } } +interface FolderSummaryTotals { + succeeded: number; + failed: number; + inputTokens: number; + outputTokens: number; + costUsd: number; +} + +/** + * Dispatches a single folder through `summariseFolder` and persists the + * result. 
Shared between `runFolderSummaryPhase` and `runSelectiveFolderSummary`. + */ +async function dispatchIndividual( + bucket: FolderBucket, + metaPaths: MetaPaths, + totals: FolderSummaryTotals, + llmCallContext: AskLlmOptions | undefined, + reporter: ReturnType> | undefined, + knowledgeId: string, + phaseLabel: string, +): Promise { + try { + throwIfCancelled(knowledgeId); + const { summary, tokenUsage } = await summariseFolder(bucket.folderPath, bucket.files, llmCallContext); + totals.inputTokens += tokenUsage.inputTokens; + totals.outputTokens += tokenUsage.outputTokens; + totals.costUsd += tokenUsage.costUsd; + if (summary !== null) { + await persistFolderSummary(metaPaths, summary); + totals.succeeded += 1; + } else { + totals.failed += 1; + } + } catch (cause: unknown) { + if (cause instanceof CancellationError) { + throw cause; + } + totals.failed += 1; + logger.warn(`${phaseLabel}: folder summary failed for ${bucket.folderPath || ""}`); + } finally { + reporter?.increment(1, { fileName: bucket.folderPath || "" }); + } +} + +/** + * Dispatches a multi-folder batch through `summariseFolderBatch`. Each + * non-null per-folder summary is persisted; missing/null entries count + * toward `failed`. Progress increments once per folder. + */ +async function dispatchBatch( + batch: FolderBucket[], + metaPaths: MetaPaths, + totals: FolderSummaryTotals, + llmCallContext: AskLlmOptions | undefined, + reporter: ReturnType> | undefined, + knowledgeId: string, + phaseLabel: string, +): Promise { + try { + throwIfCancelled(knowledgeId); + const { summaries, tokenUsage } = await summariseFolderBatch(batch, llmCallContext); + totals.inputTokens += tokenUsage.inputTokens; + totals.outputTokens += tokenUsage.outputTokens; + totals.costUsd += tokenUsage.costUsd; + for (const bucket of batch) { + const summary = summaries.get(bucket.folderPath) ?? 
null; + if (summary !== null) { + try { + await persistFolderSummary(metaPaths, summary); + totals.succeeded += 1; + } catch (cause: unknown) { + totals.failed += 1; + logger.warn( + `${phaseLabel}: persist failed for ${bucket.folderPath || ""}: ${cause instanceof Error ? cause.message : String(cause)}`, + ); + } + } else { + totals.failed += 1; + } + reporter?.increment(1, { fileName: bucket.folderPath || "" }); + } + } catch (cause: unknown) { + if (cause instanceof CancellationError) { + throw cause; + } + totals.failed += batch.length; + for (const bucket of batch) { + reporter?.increment(1, { fileName: bucket.folderPath || "" }); + } + logger.warn( + `${phaseLabel}: batch summary failed for ${batch.length} folders: ${cause instanceof Error ? cause.message : String(cause)}`, + ); + } +} + +/** + * Dispatch helper used by both `runFolderSummaryPhase` and + * `runSelectiveFolderSummary`. Splits `groups` into individual + batched + * buckets, schedules every task through the shared `limiter`, awaits all, + * and returns the aggregated totals. 
+ */ +export async function dispatchFolderSummaries( + groups: Map, + metaPaths: MetaPaths, + limiter: ConcurrencyLimiter, + llmCallContext: AskLlmOptions | undefined, + reporter: ReturnType> | undefined, + knowledgeId: string, + phaseLabel: string, +): Promise { + const totals: FolderSummaryTotals = { succeeded: 0, failed: 0, inputTokens: 0, outputTokens: 0, costUsd: 0 }; + const { individual, batches } = groupFoldersForBatching(groups); + const tasks: Promise[] = []; + for (const bucket of individual) { + tasks.push( + limiter(() => dispatchIndividual(bucket, metaPaths, totals, llmCallContext, reporter, knowledgeId, phaseLabel)), + ); + } + for (const batch of batches) { + tasks.push( + limiter(() => dispatchBatch(batch, metaPaths, totals, llmCallContext, reporter, knowledgeId, phaseLabel)), + ); + } + await Promise.all(tasks); + return totals; +} + export async function runFolderSummaryPhase( knowledgeId: string, metaPaths: MetaPaths, + cache: FileAnalysisCache, + limiter: ConcurrencyLimiter, llmCallContext?: AskLlmOptions, progressContext?: ProgressContext, ): Promise<{ @@ -120,57 +367,23 @@ export async function runFolderSummaryPhase( failed: number; tokenUsage: { inputTokens: number; outputTokens: number; costUsd: number }; }> { - const concurrentWorkers = getConfigValue(Config.ConcurrentWorkers); - const limit = withConcurrency(concurrentWorkers); - const groups = await groupByDirectFolder(metaPaths); - let succeeded = 0; - let failed = 0; - let totalInputTokens = 0; - let totalOutputTokens = 0; - let totalCostUsd = 0; + const groups = groupByDirectFolder(cache); const reporter = progressContext?.reporter({ phase: "folder_analysis", total: { kind: "fixed", total: groups.size }, }); await reporter?.start(); + let totals: FolderSummaryTotals; try { - const tasks: Promise[] = []; - for (const [folderPath, files] of groups.entries()) { - tasks.push( - limit(async () => { - try { - throwIfCancelled(knowledgeId); - const { summary, tokenUsage } = await 
summariseFolder(folderPath, files, llmCallContext); - totalInputTokens += tokenUsage.inputTokens; - totalOutputTokens += tokenUsage.outputTokens; - totalCostUsd += tokenUsage.costUsd; - if (summary !== null) { - await persistFolderSummary(metaPaths, summary); - succeeded += 1; - } else { - failed += 1; - } - } catch (cause: unknown) { - if (cause instanceof CancellationError) { - throw cause; - } - failed += 1; - logger.warn(`phase5: folder summary failed for ${folderPath || ""}`); - } finally { - reporter?.increment(1, { fileName: folderPath || "" }); - } - }), - ); - } - await Promise.all(tasks); + totals = await dispatchFolderSummaries(groups, metaPaths, limiter, llmCallContext, reporter, knowledgeId, "phase5"); } finally { reporter?.stop(); } - logger.info(`phase5 done: foldersSummarised=${succeeded} failed=${failed}`); + logger.info(`phase5 done: foldersSummarised=${totals.succeeded} failed=${totals.failed}`); return { - succeeded, - failed, - tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens, costUsd: totalCostUsd }, + succeeded: totals.succeeded, + failed: totals.failed, + tokenUsage: { inputTokens: totals.inputTokens, outputTokens: totals.outputTokens, costUsd: totals.costUsd }, }; } diff --git a/packages/ingest-github/src/strategies/flat-folder/index.ts b/packages/ingest-github/src/strategies/flat-folder/index.ts index 09c03c6..86797a6 100644 --- a/packages/ingest-github/src/strategies/flat-folder/index.ts +++ b/packages/ingest-github/src/strategies/flat-folder/index.ts @@ -1,12 +1,16 @@ +import { Config } from "@bb/types"; +import { getConfigValue } from "@bb/config"; import { logger } from "@bb/logger"; import type { FileAnalyzer } from "#src/types/pipeline.ts"; import type { IngestStrategy, StrategyInput, StrategyResult } from "#src/types/strategy.ts"; import { throwIfCancelled } from "#src/pipeline/cancellation.ts"; import { classifyFailure } from "#src/pipeline/failure-classifier.ts"; -import { classifyAndAnalyseSmall } from 
"./phases/classify-and-analyse-small.ts"; -import { processBigFilesQueue } from "./phases/process-big-files.ts"; +import { withConcurrency } from "#src/pipeline/concurrency.ts"; +import { scanAndClassify } from "./phases/scan-and-classify.ts"; +import { analyseSmallFiles } from "./phases/analyse-small.ts"; +import { analyseBigFiles } from "./phases/analyse-big-files.ts"; import { backfillMissingFields } from "./backfill/fields.ts"; -import { backfillBigFiles } from "./backfill/big-files.ts"; +import { FileAnalysisCache } from "./file-analysis-cache.ts"; import { runFolderSummaryPhase } from "./folder-summary.ts"; import { makeRepoSummaryEnvelope, persistRepoSummary, summariseRepo } from "./repo-summary.ts"; import { storeFlatAnalysis } from "./phases/store-flat-analysis.ts"; @@ -28,65 +32,81 @@ export function createFlatFolderStrategy(deps: FlatFolderStrategyDeps): IngestSt const progressContext: ProgressContext = progressContextFactory(knowledgeId); try { - progressContext.phaseChanged("file_analysis"); + // Shared LLM limiter — small-file analyses, big-file chunk analyses, + // and per-file condense calls all check out from this single pool. 
+ const llmConcurrency = getConfigValue(Config.LlmConcurrency); + const limiter = withConcurrency(llmConcurrency); - logger.info(`flat-folder: phase1 (classify + analyse small) starting for ${knowledgeId}`); + progressContext.phaseChanged("scan"); + logger.info(`flat-folder: phase1 (scan + classify) starting for ${knowledgeId} limit=${llmConcurrency}`); throwIfCancelled(knowledgeId); - const phase1Input: Parameters[0] = { + const scanInput: Parameters[0] = { knowledgeId, source, metaPaths, - analyzer: deps.fileAnalyzer, + limiter, progressContext, }; - if (archiveSink !== undefined) { - phase1Input.archiveSink = archiveSink; - } if (llmCallContext !== undefined) { - phase1Input.llmCallContext = llmCallContext; + scanInput.llmCallContext = llmCallContext; } - const phase1 = await classifyAndAnalyseSmall(phase1Input); - let totalInputTokens = phase1.tokenUsage.inputTokens; - let totalOutputTokens = phase1.tokenUsage.outputTokens; - let totalCostUsd = phase1.tokenUsage.costUsd; + const { manifest } = await scanAndClassify(scanInput); - logger.info(`flat-folder: phase2 (process big files) starting`); + progressContext.phaseChanged("file_analysis"); + logger.info( + `flat-folder: phase2 (analyse small ${manifest.summary.smallCount} + big ${manifest.summary.bigCount}) starting in parallel`, + ); throwIfCancelled(knowledgeId); - const phase2Input: Parameters[0] = { + const smallInput: Parameters[0] = { knowledgeId, + manifest, source, metaPaths, + analyzer: deps.fileAnalyzer, + limiter, progressContext, }; + if (archiveSink !== undefined) { + smallInput.archiveSink = archiveSink; + } if (llmCallContext !== undefined) { - phase2Input.llmCallContext = llmCallContext; + smallInput.llmCallContext = llmCallContext; } - const phase2 = await processBigFilesQueue(phase2Input); - totalInputTokens += phase2.tokenUsage.inputTokens; - totalOutputTokens += phase2.tokenUsage.outputTokens; - totalCostUsd += phase2.tokenUsage.costUsd; - - logger.info(`flat-folder: phase3 (backfill 
missing fields) starting`); - throwIfCancelled(knowledgeId); - await backfillMissingFields(metaPaths, llmCallContext, progressContext); - - logger.info(`flat-folder: phase4 (backfill big files) starting`); - throwIfCancelled(knowledgeId); - const phase4Input: Parameters[0] = { + const bigInput: Parameters[0] = { knowledgeId, + manifest, source, metaPaths, + limiter, progressContext, }; if (llmCallContext !== undefined) { - phase4Input.llmCallContext = llmCallContext; + bigInput.llmCallContext = llmCallContext; } - await backfillBigFiles(phase4Input); + const [smallResult, bigResult] = await Promise.all([analyseSmallFiles(smallInput), analyseBigFiles(bigInput)]); + let totalInputTokens = smallResult.tokenUsage.inputTokens + bigResult.tokenUsage.inputTokens; + let totalOutputTokens = smallResult.tokenUsage.outputTokens + bigResult.tokenUsage.outputTokens; + let totalCostUsd = smallResult.tokenUsage.costUsd + bigResult.tokenUsage.costUsd; + + logger.info(`flat-folder: loading file-analysis cache`); + throwIfCancelled(knowledgeId); + const fileAnalysisCache = await FileAnalysisCache.loadAll(metaPaths); + + logger.info(`flat-folder: phase3 (backfill missing fields) starting`); + throwIfCancelled(knowledgeId); + await backfillMissingFields(metaPaths, fileAnalysisCache, limiter, llmCallContext, progressContext); progressContext.phaseChanged("folder_analysis"); logger.info(`flat-folder: phase5 (folder summaries) starting`); throwIfCancelled(knowledgeId); - const phase5 = await runFolderSummaryPhase(knowledgeId, metaPaths, llmCallContext, progressContext); + const phase5 = await runFolderSummaryPhase( + knowledgeId, + metaPaths, + fileAnalysisCache, + limiter, + llmCallContext, + progressContext, + ); totalInputTokens += phase5.tokenUsage.inputTokens; totalOutputTokens += phase5.tokenUsage.outputTokens; totalCostUsd += phase5.tokenUsage.costUsd; @@ -115,13 +135,15 @@ export function createFlatFolderStrategy(deps: FlatFolderStrategyDeps): IngestSt payload, branch, metaPaths, 
+ cache: fileAnalysisCache, progressContext, }); progressContext.completed(); return { - filesAnalyzed: phase1.smallFilesAnalysed + phase2.processed + phase2.cached + phase1.oversizedStubs, + filesAnalyzed: + smallResult.smallFilesAnalysed + smallResult.oversizedStubs + bigResult.processed + bigResult.cached, foldersSummarised: phase5.succeeded, repoSummarised, graphNodesWritten: phase7.nodesWritten, diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/README.md b/packages/ingest-github/src/strategies/flat-folder/phases/README.md index f0701a7..64cfc96 100644 --- a/packages/ingest-github/src/strategies/flat-folder/phases/README.md +++ b/packages/ingest-github/src/strategies/flat-folder/phases/README.md @@ -6,84 +6,151 @@ Backfill (Phases 3 and 4) lives in the sibling `backfill/` folder; folder and repo summarisation (Phases 5 and 6) live as `folder-summary.ts` and `repo-summary.ts` at the strategy root. +The strategy constructs a **shared LLM limiter** (`withConcurrency(Config.LlmConcurrency)`, +default 29) once at entry. Every LLM call across the small-file phase, +the big-file chunk phase, and per-file condense calls checks out from +the same pool — the single tunable for total in-flight LLM calls. + ## Files -- `classify-and-analyse-small.ts` — Phase 1. - `classifyAndAnalyseSmall({knowledgeId, source, metaPaths, analyzer, -skipDecider?, archiveSink?, llmCallContext?, progressContext?})` walks - `source.scan({ skipDecider, llmCallContext })` and per entry: - - `kind === "oversized"` → write a stub via `buildOversizedStub` + - `saveCondensed`, and append a `too-large` row to `bigFiles.json`. - - token count > `Config.ContextWindowLimit` → buffer a - `context-window-exceeded` row for Phase 2. - - otherwise → run `analyseScannedFile(analyzer, entry)` and persist via - `saveCondensed`, under a `withConcurrency(Config.ConcurrentWorkers)` - limiter so analyses run in parallel. 
- Cancellation is checked at scan boundaries and inside each task; the - buffered big-file list is flushed via `writeBigFiles` after all tasks - drain. -- `process-big-files.ts` — Phase 2. - `processBigFilesQueue({knowledgeId, source, metaPaths, llmCallContext?, progressContext?})` - reads `bigFiles.json`, skips `too-large` entries (counted as - `skippedOversized`), short-circuits when `inspect` returns `complete` - (counted as `cached`), reads the file via `source.readFile`, and - dispatches `processBigFile` sequentially per file with the per-job - `llmCallContext` threaded through. When `progressContext` is present - this phase opens a fixed-total reporter (`subPhase: "big_files_queue"`, - `total = entries.length`) and increments per entry — including - skipped/cached/failed paths so the percentage never stalls. The same - `progressContext` is forwarded into `processBigFile` so each big file - gets its own per-chunk sub-phase. Cancellation re-throws past the - phase; other errors are logged per file and counted as `failed`. +- `scan-and-classify.ts` — Phase 1. `scanAndClassify({knowledgeId, source, +metaPaths, skipDecider?, llmCallContext?, progressContext?})` walks + `source.scan({ skipDecider, llmCallContext })` exactly once, counts + tokens for every eligible entry, classifies each as `"small"`, + `"big"` (token count > `Config.ContextWindowLimit`), or `"oversized"` + (yielded as `kind === "oversized"` by `scanRepository`), and writes + `meta-output/scan-manifest.json` plus the legacy `bigFiles.json` (for + pull-path and backfill consumers that have not migrated). Big entries + get a cheap `estimatedChunks = ceil(tokenCount / Config.MaxTokensPerChunk)` + used by Phase 2's progress reporter. No LLM calls. No file analysis. +- `analyse-small.ts` — Phase 2a. 
`analyseSmallFiles({knowledgeId, manifest, +source, metaPaths, analyzer, limiter, archiveSink?, llmCallContext?, +progressContext?})` filters the manifest to `kind === "small"` entries, + re-reads each file via `source.readFile`, runs the LLM file analyser, + and persists via `saveCondensed`. Oversized entries also flow through + here as stub writes (no LLM). Every LLM dispatch goes through the + shared `limiter`. Progress is a fixed total — `smallCount + oversizedCount`. +- `analyse-big-files.ts` and `process-big-files.ts` — Phase 2b plus the legacy queue. One exported + driver each: + - `analyseBigFiles({knowledgeId, manifest, source, metaPaths, limiter, +llmCallContext?, progressContext?})` (exported from `analyse-big-files.ts`) — manifest-driven chunk-task + queue. Skips files already complete (manifest + condensed on disk). + For each remaining big file: read content, split into chunks + via `splitFileIntoChunks`, register a per-file `pendingChunks` + counter. Every chunk becomes an independent task scheduled through + the shared limiter: cache-check via `loadChunkIfPresent`, otherwise + `analyzeChunk` + `saveChunk`. When a file's last chunk lands, that + file's condense is **immediately** scheduled through the same + limiter — condenses across multiple files run in parallel with + chunks of slower files. Two fixed-total progress sub-phases: + `"big_files_chunks"` (actual chunk count across the files still to be processed) and + `"big_files_condense"` (number of those files). + - `processBigFilesQueue({knowledgeId, source, metaPaths, llmCallContext?, +progressContext?})` (exported from `process-big-files.ts`) — legacy serial driver kept for the pull-path + (`pipeline/pull.ts`) and any caller that has not migrated to + `analyseBigFiles(manifest, …)`. Reads `bigFiles.json`, dispatches + `processBigFile` once per file in a `for` loop. - `store-flat-analysis.ts` — Phase 7. 
- `storeFlatAnalysis({scope, payload, branch, metaPaths})` ensures + `storeFlatAnalysis({scope, payload, branch, metaPaths, cache})` ensures `flat-folder` Neo4j indexes, upserts `:Repo` (from `repo-summary.json` - if present, empty payload otherwise), then iterates folder summaries - via `iterateFolderSummaries` to upsert `:Folder`, then iterates - condensed entries via `iterateCondensed` to upsert `:File`. Files whose - containing folder was not in the summaries set get a synthesised empty - `:Folder` so the `CONTAINS` edge always lands. `languageFromPath` - fills `language` when the analysis left it blank. + if present, empty payload otherwise), then **dispatches `:Folder` and + `:File` upserts in batches of `Config.Neo4jBatchSize` (default 50)** + via `upsertFolderNodesBatch` / `upsertFileNodesBatch` from `@bb/neo4j`. + Each batch is one Neo4j write transaction containing the same 12 + Cyphers (1 MERGE + 1 folder-attach + 5 rel CLEARs + 5 rel ATTACHes via + UNWIND) that a single upsert used to issue — so a 1 000-file repo + collapses from ~12 000 round-trips to ~240. Files whose containing + folder was not in the summaries set get a synthesised empty `:Folder` + entry added to the folder batch list **up front** (before any batch + dispatches) so the `CONTAINS` edge always lands. + `languageFromPath` fills `language` when the analysis left it blank. + Both progress reporters (`folders`, `files`) open at phase entry with + their fixed totals so the indexing overall-progress aggregate sees + both denominators from the first tick — fixes the prior "leaps to 100 + then sits there" UX bug. 
+ +## Execution order + +``` +scanAndClassify + ↓ (manifest in-memory + on disk) +┌── analyseSmallFiles ──┐ +│ │ (Promise.all, share one limiter) +└── analyseBigFiles ────┘ + ↓ +FileAnalysisCache.loadAll (one parallel readdir+readFile pass) + ↓ +backfillMissingFields → folderSummary → repoSummary → storeFlatAnalysis + (cache read+write) (cache read) (cache read) +``` + +`FileAnalysisCache` is a `Map` loaded +once between phase 2 and phase 3. Phases 3, 5, 7 all consume the same +instance — phase 3 also calls `cache.set(...)` after each backfill write +so phases 5 and 7 see the updated entries without re-reading disk. ## Public interfaces -- `classifyAndAnalyseSmall(input): Promise<ClassifyPhaseResult>` — - `{ smallFilesAnalysed, bigFilesQueued, oversizedStubs, failed }`. - `input.progressContext?` opens a growing-total reporter - (`source.scan` size is not known up front); `incrementSeen()` fires per - scan yield and `increment()` fires per persisted entry. -- `processBigFilesQueue(input): Promise<ProcessBigFilesResult>` — - `{ processed, cached, failed, skippedOversized }`. `input.progressContext?` - opens a fixed-total reporter sized by `bigFiles.json` and forwards - itself into the per-file `processBigFile` call. +- `scanAndClassify(input): Promise<ScanAndClassifyResult>` — + `{ manifest }`. The manifest contains every eligible file plus a + `summary` with `totalFiles`, `smallCount`, `bigCount`, `oversizedCount`, + `totalTokens`, `estimatedBigChunks`. +- `analyseSmallFiles(input): Promise<AnalyseSmallResult>` — + `{ smallFilesAnalysed, oversizedStubs, failed, tokenUsage }`. + Progress: fixed-total reporter sized by `smallCount + oversizedCount`. +- `analyseBigFiles(input): Promise<ProcessBigFilesResult>` — + `{ processed, cached, failed, skippedOversized, tokenUsage }`. + Progress: two fixed-total reporters — one for chunks across all + big files, one for per-file condenses. +- `processBigFilesQueue(input): Promise<ProcessBigFilesResult>` — same + result shape; legacy driver used by the pull path. - `storeFlatAnalysis(input): Promise` — `{ nodesWritten, foldersWritten, filesWritten }`. 
-Each phase returns its own counter shape; the strategy aggregates them -into `FlatFolderResult`. - ## Data ownership -- Phase 1 writes condensed JSON (small files + oversized stubs) and - `bigFiles.json`. -- Phase 2 writes chunk artifacts, the chunk manifest, and condensed JSON - for big files via `processBigFile`. -- Phase 7 owns no disk artifacts. It reads the on-disk state produced by +- Phase 1 writes `scan-manifest.json` (canonical) and `bigFiles.json` + (legacy view for backfill + pull). It does not write per-file + analyses. +- Phase 2a writes condensed JSON for small files + oversized stubs. +- Phase 2b writes per-chunk JSON (`chunks/<encoded-relative-path>/chunk-N.json`), + per-file chunk manifests (`.manifest.json`), and condensed JSON + for big files. +- `FileAnalysisCache` is an in-memory artifact owned by the strategy + run (not persisted). It loads from `fileAnalysisDir` once and is + passed by reference to phases 3, 5, and 7. +- Phase 7 owns no disk artifacts. It reads on-disk state produced by Phases 1–6 and writes Neo4j nodes (`:Repo`, `:Folder`, `:File`) plus the `CONTAINS` edge. ## Invariants - Disk is the inter-phase contract; nothing crosses a phase boundary in - memory. + memory (except the in-memory manifest object that scan returns directly + to the orchestrator and the `FileAnalysisCache` instance handed to phases 3, 5, and 7, + which are conveniences — the canonical copies on + disk are what later resume/backfill runs read). - `throwIfCancelled(knowledgeId)` runs at every scan boundary, every - big-file boundary, and before each Neo4j upsert in Phase 7. -- Per-file LLM or I/O failures are logged and counted; phases do not - abort on a single bad file. Only `CancellationError` propagates. + per-chunk and per-file dispatch boundary, and before each Neo4j + upsert in Phase 7. +- Per-file or per-chunk LLM/I/O failures are logged and counted; phases + do not abort on a single bad file. Only `CancellationError`, + `LlmConfigError`, and `LlmError` propagate. 
+- The shared LLM limiter is the only place LLM concurrency is bounded + during the small/big phases **and the folder-summary phase**. + `Config.BigFileConcurrency` is no longer consulted from the chunk-queue + path (it is still consulted by the legacy `processBigFile` used by the + pull-path driver). `Config.ConcurrentWorkers` is no longer consulted + by the folder-summary phase. +- Phase 5 batches small folders by default. `Config.FolderSummaryBatchSize` + (default 10) controls batch size; set to 1 to disable and restore one + LLM call per folder. `Config.FolderSummaryBatchMaxFiles` (default 15) + is the per-folder file ceiling above which a folder always takes the + individual path so the LLM still sees the full per-file context. Large + folders run side-by-side with batches under the same shared limiter. +- Phase 1 respects `Config.ContextWindowLimit` and + `Config.MaxTokensPerChunk`; do not hardcode either. - Phase 7 always emits a `:Repo` node, even when `repo-summary.json` is absent (logged as a `phase7` warning). -- Phase 1 respects `Config.ContextWindowLimit` and - `Config.ConcurrentWorkers`; do not hardcode either. ## External dependencies @@ -92,8 +159,8 @@ into `FlatFolderResult`. `upsertRepoNode`, `upsertFolderNode`, `upsertFileNode`, `NodeScope`), `pipeline/scan.ts`, `pipeline/concurrency.ts`, `pipeline/cancellation.ts`, and the sibling `flat-folder/{analyse-file, big-file, folder-summary, -folder-path}` modules plus `adapters/llm-file-analyzer.ts` -(`languageFromPath`). +folder-path, scan-manifest}` modules plus +`adapters/llm-file-analyzer.ts` (`languageFromPath`). 
## Tier diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/analyse-big-files.ts b/packages/ingest-github/src/strategies/flat-folder/phases/analyse-big-files.ts new file mode 100644 index 0000000..33f6446 --- /dev/null +++ b/packages/ingest-github/src/strategies/flat-folder/phases/analyse-big-files.ts @@ -0,0 +1,287 @@ +import { createHash } from "node:crypto"; +import { logger } from "@bb/logger"; +import { Config } from "@bb/types"; +import { getConfigValue } from "@bb/config"; +import type { AskLlmOptions } from "@bb/llm"; +import { LlmConfigError, LlmError } from "@bb/errors"; +import type { MetaPaths } from "#src/types/meta-paths.ts"; +import type { AnalyzedFileResult, SourceReader } from "#src/types/pipeline.ts"; +import type { ProgressContext } from "#src/progress/types.ts"; +import type { ConcurrencyLimiter } from "#src/pipeline/concurrency.ts"; +import type { ChunkAnalysisResult, FileChunk, HugeFileManifest } from "#src/types/big-file.ts"; +import type { CondensedFileAnalysis } from "#src/types/condensed-file-analysis.ts"; +import { throwIfCancelled, CancellationError } from "#src/pipeline/cancellation.ts"; +import { inspect } from "#src/strategies/flat-folder/big-file/cache.ts"; +import { splitFileIntoChunks } from "#src/strategies/flat-folder/big-file/chunker.ts"; +import { analyzeChunk } from "#src/strategies/flat-folder/big-file/chunk-analyzer.ts"; +import { condenseChunks } from "#src/strategies/flat-folder/big-file/condenser.ts"; +import { + loadChunkIfPresent, + saveChunk, + saveCondensed, + saveManifest, +} from "#src/strategies/flat-folder/big-file/storage.ts"; +import type { ScanManifest, ScanManifestEntry } from "#src/strategies/flat-folder/scan-manifest.ts"; +import type { ProcessBigFilesResult } from "#src/strategies/flat-folder/phases/process-big-files.ts"; +import { describe } from "#src/strategies/flat-folder/phases/process-big-files.ts"; + +const CONDENSE_MAX_ATTEMPTS = 2; +const CONDENSE_RETRY_BACKOFF_MS = 2000; + 
+/** Everything `analyseBigFiles` needs for one run. */
+export interface AnalyseBigFilesInput {
+  /** Job identifier; handed to `throwIfCancelled` at every dispatch boundary. */
+  knowledgeId: string;
+  /** Scan manifest; only entries with `kind === "big"` are analysed here. */
+  manifest: ScanManifest;
+  /** Reader used to load each big file's content by relative path. */
+  source: SourceReader;
+  /** Artifact locations forwarded to the storage helpers. */
+  metaPaths: MetaPaths;
+  /** Shared limiter; every chunk task and every condense task is scheduled through it. */
+  limiter: ConcurrencyLimiter;
+  /** Forwarded unchanged to `analyzeChunk` and `condenseChunks`. */
+  llmCallContext?: AskLlmOptions;
+  /** When present, two fixed-total reporters are opened: chunks and condenses. */
+  progressContext?: ProgressContext;
+}
+
+/** Mutable per-file bookkeeping shared by a file's chunk tasks and its condense task. */
+interface BigFileState {
+  /** Manifest entry this state was built from. */
+  entry: ScanManifestEntry;
+  /** Full file content; hashed into the condensed record's `sha256`. */
+  content: string;
+  /** Chunks returned by `splitFileIntoChunks`, in order. */
+  chunks: FileChunk[];
+  /** One slot per chunk index; a slot stays `undefined` when that chunk failed. */
+  results: (ChunkAnalysisResult | undefined)[];
+  /** Chunk tasks that have not finished yet; the condense is scheduled when this hits 0. */
+  pendingChunks: number;
+  /** Set when a chunk task ended with a propagating error; suppresses the condense. */
+  fatal: boolean;
+}
+
+/**
+ * Manifest-driven big-file phase. Every chunk of every big file is an
+ * independent task scheduled through the shared LLM limiter. As soon as the
+ * last chunk of a given file lands, that file's condense is scheduled —
+ * multiple condenses run in parallel with the still-pending chunks of slower
+ * files. All LLM calls (chunk + condense) check out from the same limiter.
+ *
+ * Files already fully processed (`inspect` reports `"complete"`) are skipped
+ * and counted as `cached`. Files that cannot be read, are empty, yield no
+ * chunks, or end with no successful chunk result are counted as `failed`.
+ *
+ * @param input - see {@link AnalyseBigFilesInput}.
+ * @returns counters plus the summed token usage of the files persisted here.
+ * @throws CancellationError when the job is cancelled.
+ * @throws LlmConfigError | LlmError when the LLM is unusable. Every scheduled
+ *   task is drained before the error is rethrown, so no task is left running
+ *   (and no rejection left unobserved) once this function has settled.
+ */
+export async function analyseBigFiles(input: AnalyseBigFilesInput): Promise<ProcessBigFilesResult> {
+  const maxTokensPerChunk = getConfigValue(Config.MaxTokensPerChunk);
+  const bigEntries = input.manifest.entries.filter((e) => e.kind === "big");
+
+  let cached = 0;
+  let failed = 0;
+  let processed = 0;
+  let totalInputTokens = 0;
+  let totalOutputTokens = 0;
+  let totalCostUsd = 0;
+  // Flipped by the first LlmConfigError / LlmError. Tasks that are still
+  // queued on the limiter check it on entry and return without issuing
+  // further LLM calls that would only fail the same way.
+  let llmFailed = false;
+
+  // Per-file preparation: read content, chunk, record state. Sequential and
+  // cheap — no LLM calls here.
+  const states: BigFileState[] = [];
+  for (const entry of bigEntries) {
+    throwIfCancelled(input.knowledgeId);
+    const status = await inspect(input.metaPaths, entry.relativePath);
+    if (status === "complete") {
+      cached += 1;
+      continue;
+    }
+    let content: string;
+    try {
+      content = await input.source.readFile(entry.relativePath);
+    } catch (cause: unknown) {
+      failed += 1;
+      logger.warn(`analyse-big: read failed for ${entry.relativePath}: ${describe(cause)}`);
+      continue;
+    }
+    if (content.length === 0) {
+      failed += 1;
+      logger.warn(`analyse-big: empty content for ${entry.relativePath}; skipping`);
+      continue;
+    }
+    const chunks = splitFileIntoChunks(entry.relativePath, content, maxTokensPerChunk);
+    if (chunks.length === 0) {
+      // Without chunks no task would ever drive this file to a condense, so
+      // it would silently vanish from every counter. Account for it here.
+      failed += 1;
+      logger.warn(`analyse-big: chunker produced no chunks for ${entry.relativePath}; skipping`);
+      continue;
+    }
+    states.push({
+      entry,
+      content,
+      chunks,
+      results: new Array<ChunkAnalysisResult | undefined>(chunks.length),
+      pendingChunks: chunks.length,
+      fatal: false,
+    });
+    logger.info(`analyse-big: ${entry.relativePath} split into ${chunks.length} chunks`);
+  }
+
+  const totalChunks = states.reduce((acc, s) => acc + s.chunks.length, 0);
+  const chunkReporter = input.progressContext?.reporter({
+    phase: "file_analysis",
+    subPhase: "big_files_chunks",
+    total: { kind: "fixed", total: totalChunks },
+  });
+  await chunkReporter?.start();
+  const condenseReporter = input.progressContext?.reporter({
+    phase: "file_analysis",
+    subPhase: "big_files_condense",
+    total: { kind: "fixed", total: states.length },
+  });
+  await condenseReporter?.start();
+
+  // For oversized entries the legacy phase counted them; we accept the manifest
+  // already accounted for them via the small phase (which writes the stub).
+  // Surfaced here for parity with the legacy result shape.
+  const skippedOversized = input.manifest.entries.filter((e) => e.kind === "oversized").length;
+
+  const condensePromises: Promise<void>[] = [];
+
+  // Called from the `finally` of every chunk task. Schedules the file's
+  // condense once its last chunk has finished, unless the file is fatal.
+  function maybeScheduleCondense(state: BigFileState): void {
+    if (state.pendingChunks > 0 || state.fatal) {
+      return;
+    }
+    const definedResults = state.results.filter((r): r is ChunkAnalysisResult => r !== undefined);
+    if (definedResults.length === 0) {
+      // Every chunk failed: there is nothing to condense, and persisting a
+      // record now would mark the file complete with no analysis behind it.
+      failed += 1;
+      logger.warn(`analyse-big: no chunk produced a result for ${state.entry.relativePath}; skipping condense`);
+      condenseReporter?.increment(1, { fileName: state.entry.relativePath });
+      return;
+    }
+    if (definedResults.length < state.chunks.length) {
+      logger.warn(
+        `analyse-big: condensing ${state.entry.relativePath} from ${definedResults.length}/${state.chunks.length} chunks`,
+      );
+    }
+    condensePromises.push(
+      input.limiter(async () => {
+        if (llmFailed) {
+          return;
+        }
+        throwIfCancelled(input.knowledgeId);
+        let merged: AnalyzedFileResult | null = null;
+        for (let attempt = 1; attempt <= CONDENSE_MAX_ATTEMPTS; attempt += 1) {
+          try {
+            merged = await condenseChunks(state.entry.relativePath, definedResults, input.llmCallContext);
+            break;
+          } catch (cause: unknown) {
+            if (cause instanceof CancellationError) {
+              throw cause;
+            }
+            if (cause instanceof LlmConfigError || cause instanceof LlmError) {
+              llmFailed = true;
+              throw cause;
+            }
+            if (attempt < CONDENSE_MAX_ATTEMPTS) {
+              logger.warn(
+                `analyse-big: condense attempt ${attempt}/${CONDENSE_MAX_ATTEMPTS} failed for ${state.entry.relativePath}; retrying: ${describe(cause)}`,
+              );
+              await sleep(CONDENSE_RETRY_BACKOFF_MS);
+              continue;
+            }
+            failed += 1;
+            logger.warn(
+              `analyse-big: condense failed after ${CONDENSE_MAX_ATTEMPTS} attempts for ${state.entry.relativePath}: ${describe(cause)}`,
+            );
+          }
+        }
+        if (merged === null) {
+          condenseReporter?.increment(1, { fileName: state.entry.relativePath });
+          return;
+        }
+
+        try {
+          const chunkInputTokens = definedResults.reduce((acc, r) => acc + (r.tokenUsage?.inputTokens ?? 0), 0);
+          const chunkOutputTokens = definedResults.reduce((acc, r) => acc + (r.tokenUsage?.outputTokens ?? 0), 0);
+          const chunkCostUsd = definedResults.reduce((acc, r) => acc + (r.tokenUsage?.costUsd ?? 0), 0);
+          const totalTokenCount = state.chunks.reduce((acc, c) => acc + c.tokenCount, 0);
+          const totalIn = chunkInputTokens + (merged.tokenUsage?.inputTokens ?? 0);
+          const totalOut = chunkOutputTokens + (merged.tokenUsage?.outputTokens ?? 0);
+          const totalCost = chunkCostUsd + (merged.tokenUsage?.costUsd ?? 0);
+
+          const manifest: HugeFileManifest = {
+            relativePath: state.entry.relativePath,
+            totalChunks: state.chunks.length,
+            totalTokenCount,
+            chunkPaths: state.chunks.map((_, i) => `chunks/${encodeFolder(state.entry.relativePath)}/chunk-${i}.json`),
+            generatedAt: new Date().toISOString(),
+          };
+          await saveManifest(input.metaPaths, manifest);
+
+          const condensed: CondensedFileAnalysis = {
+            relativePath: state.entry.relativePath,
+            language: merged.language,
+            sha256: sha256(state.content),
+            sizeBytes: state.entry.sizeBytes,
+            tokenCount: totalTokenCount,
+            isBigFile: true,
+            totalChunks: state.chunks.length,
+            totalTokenCount,
+            analysedAt: new Date().toISOString(),
+            analysis: merged.analysis,
+            tokenUsage: { inputTokens: totalIn, outputTokens: totalOut, costUsd: totalCost },
+          };
+          await saveCondensed(input.metaPaths, condensed);
+
+          totalInputTokens += totalIn;
+          totalOutputTokens += totalOut;
+          totalCostUsd += totalCost;
+          processed += 1;
+        } catch (cause: unknown) {
+          if (cause instanceof CancellationError) {
+            throw cause;
+          }
+          failed += 1;
+          logger.warn(`analyse-big: persist failed for ${state.entry.relativePath}: ${describe(cause)}`);
+        } finally {
+          condenseReporter?.increment(1, { fileName: state.entry.relativePath });
+        }
+      }),
+    );
+  }
+
+  const chunkPromises: Promise<void>[] = [];
+  for (const state of states) {
+    for (const [idx, chunk] of state.chunks.entries()) {
+      chunkPromises.push(
+        input.limiter(async () => {
+          if (llmFailed) {
+            return;
+          }
+          throwIfCancelled(input.knowledgeId);
+          try {
+            const cachedChunk = await loadChunkIfPresent(input.metaPaths, state.entry.relativePath, idx);
+            if (cachedChunk !== null) {
+              state.results[idx] = cachedChunk;
+            } else {
+              const analyzed = await analyzeChunk(chunk, input.llmCallContext);
+              await saveChunk(input.metaPaths, analyzed);
+              state.results[idx] = analyzed;
+            }
+          } catch (cause: unknown) {
+            if (cause instanceof CancellationError) {
+              // Mark the file so the `finally` below does not queue a
+              // condense that could only throw the same cancellation.
+              state.fatal = true;
+              throw cause;
+            }
+            if (cause instanceof LlmConfigError || cause instanceof LlmError) {
+              state.fatal = true;
+              llmFailed = true;
+              throw cause;
+            }
+            logger.warn(
+              `analyse-big: chunk ${idx + 1}/${state.chunks.length} failed for ${state.entry.relativePath}: ${describe(cause)}`,
+            );
+          } finally {
+            state.pendingChunks -= 1;
+            chunkReporter?.increment(1, { fileName: `${state.entry.relativePath}#chunk-${String(idx)}` });
+            maybeScheduleCondense(state);
+          }
+        }),
+      );
+    }
+  }
+
+  // Drain everything before reporting or rethrowing. Condenses are queued
+  // synchronously from a chunk task's `finally`, so once all chunk tasks have
+  // settled `condensePromises` is complete. `allSettled` observes every
+  // rejection, which `Promise.all` on the chunks alone did not do for the
+  // condense promises when a chunk task rejected first.
+  const chunkOutcomes = await Promise.allSettled(chunkPromises);
+  const condenseOutcomes = await Promise.allSettled(condensePromises);
+  chunkReporter?.stop();
+  condenseReporter?.stop();
+
+  const rejected = [...chunkOutcomes, ...condenseOutcomes].find(
+    (outcome): outcome is PromiseRejectedResult => outcome.status === "rejected",
+  );
+  if (rejected !== undefined) {
+    throw rejected.reason;
+  }
+
+  logger.info(
+    `analyse-big done: processed=${processed} cached=${cached} failed=${failed} skippedOversized=${skippedOversized}`,
+  );
+  return {
+    processed,
+    cached,
+    failed,
+    skippedOversized,
+    tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens, costUsd: totalCostUsd },
+  };
+}
+
+/** Hex-encoded SHA-256 digest of `content`. */
+function sha256(content: string): string {
+  return createHash("sha256").update(content).digest("hex");
+}
+
+/** Replaces every `/` with `__SL__` and every `\` with `__BS__` to build the chunk-folder segment. */
+function encodeFolder(relativePath: string): string {
+  return relativePath.replace(/\//gu, "__SL__").replace(/\\/gu, "__BS__");
+}
+
+/** Resolves after `ms` milliseconds; used as the back-off between condense attempts. */
+function sleep(ms: number): Promise<void> {
+  return new Promise<void>((resolve) => {
+    setTimeout(resolve, ms);
+  });
+}
diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/analyse-small.ts b/packages/ingest-github/src/strategies/flat-folder/phases/analyse-small.ts
new file mode 100644
index 0000000..5176f7f
--- /dev/null
+++ b/packages/ingest-github/src/strategies/flat-folder/phases/analyse-small.ts
@@ -0,0 +1,133 @@
+import { logger } from "@bb/logger";
+import type { AskLlmOptions } from "@bb/llm";
+import { LlmConfigError, LlmError } from "@bb/errors";
+import type { ArchiveSink, FileAnalyzer, ScannedFile, SourceReader } from "#src/types/pipeline.ts";
+import type { MetaPaths } from "#src/types/meta-paths.ts";
+import type {
ProgressContext } from "#src/progress/types.ts";
+import type { ConcurrencyLimiter } from "#src/pipeline/concurrency.ts";
+import { throwIfCancelled, CancellationError } from "#src/pipeline/cancellation.ts";
+import { analyseScannedFile, buildOversizedStub } from "#src/strategies/flat-folder/analyse-file.ts";
+import { saveCondensed } from "#src/strategies/flat-folder/big-file/storage.ts";
+import type { ScanManifest } from "#src/strategies/flat-folder/scan-manifest.ts";
+
+/** Everything `analyseSmallFiles` needs for one run. */
+export interface AnalyseSmallInput {
+  /** Job identifier; handed to `throwIfCancelled` and stamped on archived files. */
+  knowledgeId: string;
+  /** Scan manifest; `"small"` entries are analysed, `"oversized"` entries get a stub. */
+  manifest: ScanManifest;
+  /** Reader used to re-load each small file's content by relative path. */
+  source: SourceReader;
+  /** Artifact locations forwarded to `saveCondensed`. */
+  metaPaths: MetaPaths;
+  /** Analyzer handed to `analyseScannedFile` for every small file. */
+  analyzer: FileAnalyzer;
+  /** Shared limiter; every small-file task is scheduled through it. */
+  limiter: ConcurrencyLimiter;
+  /** When present, receives each analysed file's content after it is persisted. */
+  archiveSink?: ArchiveSink;
+  /** Forwarded unchanged to `analyseScannedFile`. */
+  llmCallContext?: AskLlmOptions;
+  /** When present, one fixed-total reporter (small + oversized entries) is opened. */
+  progressContext?: ProgressContext;
+}
+
+/** Counters and summed token usage returned by `analyseSmallFiles`. */
+export interface AnalyseSmallResult {
+  /** Small files analysed and persisted. */
+  smallFilesAnalysed: number;
+  /** Oversized entries for which a stub was written. */
+  oversizedStubs: number;
+  /** Entries whose stub write or analysis failed with a non-propagating error. */
+  failed: number;
+  /** Token usage summed over the analyses that reported it. */
+  tokenUsage: { inputTokens: number; outputTokens: number; costUsd: number };
+}
+
+/**
+ * Consumes the `scan-manifest.json` produced by `scanAndClassify` and
+ * analyses every `kind: "small"` entry through the shared LLM limiter.
+ *
+ * Oversized stubs are also written here (they don't go through the LLM but
+ * still need a placeholder analysis row on disk so downstream phases see a
+ * complete file set).
+ *
+ * @param input - see {@link AnalyseSmallInput}.
+ * @returns counters plus summed token usage.
+ * @throws CancellationError when the job is cancelled.
+ * @throws LlmConfigError | LlmError when the LLM is unusable. Tasks still
+ *   queued at that point return without calling the LLM, and in-flight tasks
+ *   are drained before the error is rethrown and the reporter stopped.
+ */
+export async function analyseSmallFiles(input: AnalyseSmallInput): Promise<AnalyseSmallResult> {
+  const smallEntries = input.manifest.entries.filter((e) => e.kind === "small");
+  const oversizedEntries = input.manifest.entries.filter((e) => e.kind === "oversized");
+
+  let smallFilesAnalysed = 0;
+  let oversizedStubs = 0;
+  let failed = 0;
+  let totalInputTokens = 0;
+  let totalOutputTokens = 0;
+  let totalCostUsd = 0;
+  // Flipped by the first LlmConfigError / LlmError so that tasks still queued
+  // on the limiter bail out instead of repeating the same failing LLM call.
+  let llmFailed = false;
+
+  const reporter = input.progressContext?.reporter({
+    phase: "file_analysis",
+    subPhase: "analyse_small",
+    total: { kind: "fixed", total: smallEntries.length + oversizedEntries.length },
+  });
+  await reporter?.start();
+
+  let outcomes: PromiseSettledResult<void>[] = [];
+  try {
+    for (const entry of oversizedEntries) {
+      throwIfCancelled(input.knowledgeId);
+      try {
+        await saveCondensed(input.metaPaths, buildOversizedStub(entry.relativePath, entry.sizeBytes));
+        oversizedStubs += 1;
+      } catch (cause: unknown) {
+        failed += 1;
+        logger.warn(`analyse-small: oversized stub write failed for ${entry.relativePath}: ${describe(cause)}`);
+      }
+      reporter?.increment(1, { fileName: entry.relativePath });
+    }
+
+    const pending: Promise<void>[] = smallEntries.map((entry) =>
+      input.limiter(async () => {
+        if (llmFailed) {
+          return;
+        }
+        throwIfCancelled(input.knowledgeId);
+        try {
+          const content = await input.source.readFile(entry.relativePath);
+          const scanned: ScannedFile = {
+            kind: "file",
+            relativePath: entry.relativePath,
+            absolutePath: entry.absolutePath,
+            sizeBytes: entry.sizeBytes,
+            content,
+          };
+          const condensed = await analyseScannedFile(input.analyzer, scanned, input.llmCallContext);
+          await saveCondensed(input.metaPaths, condensed);
+          if (input.archiveSink !== undefined) {
+            await input.archiveSink.push({
+              knowledgeId: input.knowledgeId,
+              relativePath: entry.relativePath,
+              content,
+            });
+          }
+          if (condensed.tokenUsage) {
+            totalInputTokens += condensed.tokenUsage.inputTokens;
+            totalOutputTokens += condensed.tokenUsage.outputTokens;
+            totalCostUsd += condensed.tokenUsage.costUsd;
+          }
+          smallFilesAnalysed += 1;
+          reporter?.increment(1, { fileName: entry.relativePath });
+        } catch (cause: unknown) {
+          if (cause instanceof CancellationError) {
+            throw cause;
+          }
+          if (cause instanceof LlmConfigError || cause instanceof LlmError) {
+            llmFailed = true;
+            throw cause;
+          }
+          failed += 1;
+          logger.warn(`analyse-small: analyse failed for ${entry.relativePath}: ${describe(cause)}`);
+          reporter?.increment(1, { fileName: entry.relativePath });
+        }
+      }),
+    );
+    // Wait for every task, not just the first rejection, so nothing is still
+    // touching the counters or the reporter after this function has settled.
+    outcomes = await Promise.allSettled(pending);
+  } finally {
+    reporter?.stop();
+  }
+
+  const rejected = outcomes.find((outcome): outcome is PromiseRejectedResult => outcome.status === "rejected");
+  if (rejected !== undefined) {
+    throw rejected.reason;
+  }
+
+  logger.info(
+    `analyse-small done: smallFilesAnalysed=${smallFilesAnalysed} oversizedStubs=${oversizedStubs} failed=${failed}`,
+  );
+  return {
+    smallFilesAnalysed,
+    oversizedStubs,
+    failed,
+    tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens, costUsd: totalCostUsd },
+  };
+}
+
+/** Renders a caught value for a log line: the message of an `Error`, otherwise `String(cause)`. */
+function describe(cause: unknown): string {
+  return cause instanceof Error ? cause.message : String(cause);
+}
diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/classify-and-analyse-small.ts b/packages/ingest-github/src/strategies/flat-folder/phases/classify-and-analyse-small.ts
deleted file mode 100644
index a9ad59a..0000000
--- a/packages/ingest-github/src/strategies/flat-folder/phases/classify-and-analyse-small.ts
+++ /dev/null
@@ -1,161 +0,0 @@
-import path from "node:path";
-import { tokenLen, type AskLlmOptions } from "@bb/llm";
-import { LlmConfigError, LlmError } from "@bb/errors";
-import { logger } from "@bb/logger";
-import { Config } from "@bb/types";
-import { getConfigValue } from "@bb/config";
-import type { ArchiveSink, FileAnalyzer, SkipDecider, SourceReader } from "#src/types/pipeline.ts";
-import type { MetaPaths } from "#src/types/meta-paths.ts";
-import type { BigFileEntry } from "#src/types/big-file.ts";
-import type { ProgressContext } from "#src/progress/types.ts";
-import { withConcurrency } from "#src/pipeline/concurrency.ts";
-import { throwIfCancelled, CancellationError } from "#src/pipeline/cancellation.ts";
-import { makeSkipDecider } from "#src/pipeline/skip-decisions/index.ts"; -import { analyseScannedFile, buildOversizedStub } from "#src/strategies/flat-folder/analyse-file.ts"; -import { saveCondensed } from "#src/strategies/flat-folder/big-file/storage.ts"; -import { writeBigFiles } from "#src/strategies/flat-folder/big-file/detector.ts"; - -export interface ClassifyPhaseInput { - knowledgeId: string; - source: SourceReader; - metaPaths: MetaPaths; - analyzer: FileAnalyzer; - skipDecider?: SkipDecider; - archiveSink?: ArchiveSink; - llmCallContext?: AskLlmOptions; - progressContext?: ProgressContext; -} - -export interface ClassifyPhaseResult { - smallFilesAnalysed: number; - bigFilesQueued: number; - oversizedStubs: number; - failed: number; - tokenUsage: { inputTokens: number; outputTokens: number; costUsd: number }; -} - -export async function classifyAndAnalyseSmall(input: ClassifyPhaseInput): Promise { - const contextWindowLimit = getConfigValue(Config.ContextWindowLimit); - const concurrentWorkers = getConfigValue(Config.ConcurrentWorkers); - const limit = withConcurrency(concurrentWorkers); - const bigFileBuffer: BigFileEntry[] = []; - let smallFilesAnalysed = 0; - let oversizedStubs = 0; - let failed = 0; - let totalInputTokens = 0; - let totalOutputTokens = 0; - let totalCostUsd = 0; - - const repositoryHint = - input.source.localRepoDir.length > 0 ? path.basename(input.source.localRepoDir) : input.knowledgeId; - const skipDecider = input.skipDecider ?? 
makeSkipDecider({ repositoryName: repositoryHint }); - - const pending: Promise[] = []; - - const reporter = input.progressContext?.reporter({ - phase: "file_analysis", - total: { kind: "growing" }, - }); - await reporter?.start(); - - try { - const scanDeps: Parameters[0] = { skipDecider }; - if (input.llmCallContext !== undefined) { - scanDeps.llmCallContext = input.llmCallContext; - } - for await (const entry of input.source.scan(scanDeps)) { - throwIfCancelled(input.knowledgeId); - reporter?.incrementSeen(); - - if (entry.kind === "oversized") { - bigFileBuffer.push({ - relativePath: entry.relativePath, - sizeBytes: entry.sizeBytes, - tokenCount: 0, - reason: "too-large", - }); - try { - await saveCondensed(input.metaPaths, buildOversizedStub(entry.relativePath, entry.sizeBytes)); - oversizedStubs += 1; - reporter?.increment(1, { fileName: entry.relativePath }); - } catch (cause: unknown) { - failed += 1; - logger.warn(`phase1: oversized stub write failed for ${entry.relativePath}: ${describe(cause)}`); - } - continue; - } - - const tokenCount = tokenLen(entry.content); - if (tokenCount > contextWindowLimit) { - bigFileBuffer.push({ - relativePath: entry.relativePath, - sizeBytes: entry.sizeBytes, - tokenCount, - reason: "context-window-exceeded", - }); - // Big files are accounted for here; phase 2 has its own reporter. 
- reporter?.increment(1, { fileName: entry.relativePath }); - continue; - } - - const fileContent = entry.content; - const filePath = entry.relativePath; - pending.push( - limit(async () => { - try { - throwIfCancelled(input.knowledgeId); - const condensed = await analyseScannedFile(input.analyzer, entry, input.llmCallContext); - await saveCondensed(input.metaPaths, condensed); - if (input.archiveSink !== undefined) { - await input.archiveSink.push({ - knowledgeId: input.knowledgeId, - relativePath: filePath, - content: fileContent, - }); - } - if (condensed.tokenUsage) { - totalInputTokens += condensed.tokenUsage.inputTokens; - totalOutputTokens += condensed.tokenUsage.outputTokens; - totalCostUsd += condensed.tokenUsage.costUsd; - } - smallFilesAnalysed += 1; - reporter?.increment(1, { fileName: filePath }); - } catch (cause: unknown) { - if (cause instanceof CancellationError) { - throw cause; - } - if (cause instanceof LlmConfigError || cause instanceof LlmError) { - // LLM unreachable — bail the whole job, don't keep iterating - // over the rest of the files producing the same failure. - throw cause; - } - failed += 1; - logger.warn(`phase1: analyse failed for ${entry.relativePath}: ${describe(cause)}`); - reporter?.increment(1, { fileName: filePath }); - } - }), - ); - } - - await Promise.all(pending); - - await writeBigFiles(input.metaPaths, bigFileBuffer); - } finally { - reporter?.stop(); - } - - logger.info( - `phase1 done: smallFilesAnalysed=${smallFilesAnalysed} bigFilesQueued=${bigFileBuffer.filter((e) => e.reason === "context-window-exceeded").length} oversizedStubs=${oversizedStubs} failed=${failed}`, - ); - return { - smallFilesAnalysed, - bigFilesQueued: bigFileBuffer.filter((e) => e.reason === "context-window-exceeded").length, - oversizedStubs, - failed, - tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens, costUsd: totalCostUsd }, - }; -} - -function describe(cause: unknown): string { - return cause instanceof Error ? 
cause.message : String(cause); -} diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts b/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts index 1197753..951b10e 100644 --- a/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts +++ b/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts @@ -25,6 +25,12 @@ export interface ProcessBigFilesResult { tokenUsage: { inputTokens: number; outputTokens: number; costUsd: number }; } +/** + * Legacy big-file driver. Reads the deprecated `bigFiles.json`, processes + * each entry serially via `processBigFile` (which internally does + * chunk-then-condense). Kept for the pull-path (`pipeline/pull.ts`) and any + * caller that has not migrated to `analyseBigFiles(manifest, …)` yet. + */ export async function processBigFilesQueue(input: ProcessBigFilesInput): Promise { const entries = await readBigFiles(input.metaPaths); let processed = 0; @@ -61,13 +67,13 @@ export async function processBigFilesQueue(input: ProcessBigFilesInput): Promise content = await input.source.readFile(entry.relativePath); } catch (cause: unknown) { failed += 1; - logger.warn(`phase2: read failed for ${entry.relativePath}: ${describe(cause)}`); + logger.warn(`big-files-queue: read failed for ${entry.relativePath}: ${describe(cause)}`); reporter?.increment(1, { fileName: entry.relativePath }); continue; } if (content.length === 0) { failed += 1; - logger.warn(`phase2: empty content for ${entry.relativePath}; skipping`); + logger.warn(`big-files-queue: empty content for ${entry.relativePath}; skipping`); reporter?.increment(1, { fileName: entry.relativePath }); continue; } @@ -95,12 +101,12 @@ export async function processBigFilesQueue(input: ProcessBigFilesInput): Promise throw cause; } failed += 1; - logger.warn(`phase2: processBigFile failed for ${entry.relativePath}: ${describe(cause)}`); + logger.warn(`big-files-queue: processBigFile failed 
for ${entry.relativePath}: ${describe(cause)}`); } reporter?.increment(1, { fileName: entry.relativePath }); } logger.info( - `phase2 done: processed=${processed} cached=${cached} failed=${failed} skippedOversized=${skippedOversized}`, + `big-files-queue done: processed=${processed} cached=${cached} failed=${failed} skippedOversized=${skippedOversized}`, ); return { processed, @@ -114,6 +120,6 @@ export async function processBigFilesQueue(input: ProcessBigFilesInput): Promise } } -function describe(cause: unknown): string { +export function describe(cause: unknown): string { return cause instanceof Error ? cause.message : String(cause); } diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/scan-and-classify.ts b/packages/ingest-github/src/strategies/flat-folder/phases/scan-and-classify.ts new file mode 100644 index 0000000..6dc92a7 --- /dev/null +++ b/packages/ingest-github/src/strategies/flat-folder/phases/scan-and-classify.ts @@ -0,0 +1,143 @@ +import path from "node:path"; +import { Config } from "@bb/types"; +import { getConfigValue } from "@bb/config"; +import { logger } from "@bb/logger"; +import type { AskLlmOptions } from "@bb/llm"; +import type { MetaPaths } from "#src/types/meta-paths.ts"; +import type { BigFileEntry } from "#src/types/big-file.ts"; +import type { SkipDecider, SourceReader } from "#src/types/pipeline.ts"; +import type { ProgressContext } from "#src/progress/types.ts"; +import type { ConcurrencyLimiter } from "#src/pipeline/concurrency.ts"; +import { throwIfCancelled } from "#src/pipeline/cancellation.ts"; +import { makeSkipDecider } from "#src/pipeline/skip-decisions/index.ts"; +import { classifyByTokens, writeBigFiles } from "#src/strategies/flat-folder/big-file/detector.ts"; +import { + emptyManifest, + writeScanManifest, + type ScanManifest, + type ScanManifestEntry, +} from "#src/strategies/flat-folder/scan-manifest.ts"; + +export interface ScanAndClassifyInput { + knowledgeId: string; + source: SourceReader; + 
metaPaths: MetaPaths; + skipDecider?: SkipDecider; + llmCallContext?: AskLlmOptions; + progressContext?: ProgressContext; + /** + * Shared LLM-concurrency limiter. When supplied the underlying + * `scanRepository` runs its two-pass strategy: walk + cache-only decisions + * first, then parallel-deduplicated LLM resolution for unknown + * extensions/filenames under this limiter. Optional so the function + * still works standalone. + */ + limiter?: ConcurrencyLimiter; +} + +export interface ScanAndClassifyResult { + manifest: ScanManifest; +} + +/** + * Walks the repo once, classifies every eligible file as small / big / + * oversized by token count, and writes `scan-manifest.json`. The downstream + * small-file and big-file phases consume the manifest instead of re-walking. + * + * Also writes the legacy `bigFiles.json` so the pull-path and backfill phases + * (which still read it directly) keep working without migration. + */ +export async function scanAndClassify(input: ScanAndClassifyInput): Promise { + const contextWindowLimit = getConfigValue(Config.ContextWindowLimit); + const maxTokensPerChunk = getConfigValue(Config.MaxTokensPerChunk); + const manifest = emptyManifest(); + const bigFileEntries: BigFileEntry[] = []; + + const repositoryHint = + input.source.localRepoDir.length > 0 ? path.basename(input.source.localRepoDir) : input.knowledgeId; + const skipDecider = input.skipDecider ?? 
makeSkipDecider({ repositoryName: repositoryHint }); + + const reporter = input.progressContext?.reporter({ + phase: "scan", + total: { kind: "growing" }, + }); + await reporter?.start(); + + try { + const scanDeps: Parameters[0] = { skipDecider }; + if (input.limiter !== undefined) { + scanDeps.limiter = input.limiter; + } + if (input.llmCallContext !== undefined) { + scanDeps.llmCallContext = input.llmCallContext; + } + + for await (const entry of input.source.scan(scanDeps)) { + throwIfCancelled(input.knowledgeId); + reporter?.incrementSeen(); + + if (entry.kind === "oversized") { + const manifestEntry: ScanManifestEntry = { + relativePath: entry.relativePath, + absolutePath: entry.absolutePath, + sizeBytes: entry.sizeBytes, + tokenCount: 0, + kind: "oversized", + }; + manifest.entries.push(manifestEntry); + manifest.summary.oversizedCount += 1; + manifest.summary.totalFiles += 1; + bigFileEntries.push({ + relativePath: entry.relativePath, + sizeBytes: entry.sizeBytes, + tokenCount: 0, + reason: "too-large", + }); + reporter?.increment(1, { fileName: entry.relativePath }); + continue; + } + + const { tokenCount, isBigFile } = classifyByTokens(entry.content, contextWindowLimit); + manifest.summary.totalFiles += 1; + manifest.summary.totalTokens += tokenCount; + if (isBigFile) { + const estimatedChunks = Math.max(1, Math.ceil(tokenCount / maxTokensPerChunk)); + manifest.entries.push({ + relativePath: entry.relativePath, + absolutePath: entry.absolutePath, + sizeBytes: entry.sizeBytes, + tokenCount, + kind: "big", + estimatedChunks, + }); + manifest.summary.bigCount += 1; + manifest.summary.estimatedBigChunks += estimatedChunks; + bigFileEntries.push({ + relativePath: entry.relativePath, + sizeBytes: entry.sizeBytes, + tokenCount, + reason: "context-window-exceeded", + }); + } else { + manifest.entries.push({ + relativePath: entry.relativePath, + absolutePath: entry.absolutePath, + sizeBytes: entry.sizeBytes, + tokenCount, + kind: "small", + }); + 
manifest.summary.smallCount += 1; + } + reporter?.increment(1, { fileName: entry.relativePath }); + } + } finally { + reporter?.stop(); + } + + await writeScanManifest(input.metaPaths, manifest); + await writeBigFiles(input.metaPaths, bigFileEntries); + logger.info( + `scan-and-classify done: total=${manifest.summary.totalFiles} small=${manifest.summary.smallCount} big=${manifest.summary.bigCount} oversized=${manifest.summary.oversizedCount} totalTokens=${manifest.summary.totalTokens} estimatedBigChunks=${manifest.summary.estimatedBigChunks}`, + ); + return { manifest }; +} diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/store-flat-analysis.ts b/packages/ingest-github/src/strategies/flat-folder/phases/store-flat-analysis.ts index dbcbb30..7db4433 100644 --- a/packages/ingest-github/src/strategies/flat-folder/phases/store-flat-analysis.ts +++ b/packages/ingest-github/src/strategies/flat-folder/phases/store-flat-analysis.ts @@ -1,10 +1,20 @@ import { readFile } from "node:fs/promises"; +import { Config } from "@bb/types"; +import { getConfigValue } from "@bb/config"; import { logger } from "@bb/logger"; -import { ensureFlatFolderIndexes, upsertFileNode, upsertFolderNode, upsertRepoNode, type NodeScope } from "@bb/neo4j"; +import { + ensureFlatFolderIndexes, + upsertFileNodesBatch, + upsertFolderNodesBatch, + upsertRepoNode, + type NodeScope, + type UpsertFileNodeInput, + type UpsertFolderNodeInput, +} from "@bb/neo4j"; import type { GithubIndexPayload } from "@bb/types"; import type { MetaPaths } from "#src/types/meta-paths.ts"; import { throwIfCancelled } from "#src/pipeline/cancellation.ts"; -import { iterateCondensed } from "#src/strategies/flat-folder/big-file/storage.ts"; +import type { FileAnalysisCache } from "#src/strategies/flat-folder/file-analysis-cache.ts"; import { iterateFolderSummaries } from "#src/strategies/flat-folder/folder-summary.ts"; import { directFolderOf } from "#src/strategies/flat-folder/folder-path.ts"; import { 
languageFromPath } from "#src/adapters/llm-file-analyzer.ts"; @@ -16,6 +26,7 @@ export interface StoreFlatAnalysisInput { payload: GithubIndexPayload; branch: string; metaPaths: MetaPaths; + cache: FileAnalysisCache; progressContext?: ProgressContext; } @@ -29,10 +40,10 @@ export async function storeFlatAnalysis(input: StoreFlatAnalysisInput): Promise< throwIfCancelled(input.scope.knowledgeId); await ensureFlatFolderIndexes(); - let nodesWritten = 0; - let foldersWritten = 0; - let filesWritten = 0; + const batchSize = getConfigValue(Config.Neo4jBatchSize); + // 1. :Repo node — single upsert, not batched (one repo per knowledge). + let nodesWritten = 0; const repoSummary = await readRepoSummary(input.metaPaths); if (repoSummary !== null) { await upsertRepoNode({ @@ -49,7 +60,6 @@ export async function storeFlatAnalysis(input: StoreFlatAnalysisInput): Promise< keyPatterns: repoSummary.keyPatterns, }, }); - nodesWritten += 1; } else { logger.warn(`phase7: no repo summary on disk; writing :Repo with empty summary`); await upsertRepoNode({ @@ -58,61 +68,79 @@ export async function storeFlatAnalysis(input: StoreFlatAnalysisInput): Promise< branch: input.branch, summary: emptyRepoSummaryPayload(), }); - nodesWritten += 1; } + nodesWritten += 1; - const folderReporter = input.progressContext?.reporter({ - phase: "indexing", - subPhase: "folders", - total: { kind: "growing" }, - }); - await folderReporter?.start(); + // 2. Collect every folder we'll upsert: the on-disk folder summaries plus + // synthesised parents for any file whose folder didn't get a summary. Doing + // this up front gives both reporters real fixed totals so `overallProgress` + // doesn't leap to 100 the moment the folder loop completes (the previous + // UX bug where the file sub-phase registered too late to dilute the + // indexing aggregate). 
+ const folderInputs: UpsertFolderNodeInput[] = []; const folderPaths = new Set(); - try { - for await (const folder of iterateFolderSummaries(input.metaPaths)) { - throwIfCancelled(input.scope.knowledgeId); - folderReporter?.incrementSeen(); - await upsertFolderNode({ + for await (const folder of iterateFolderSummaries(input.metaPaths)) { + folderInputs.push({ + scope: input.scope, + folderPath: folder.folderPath, + summary: shapeFolderPayload(folder), + }); + folderPaths.add(folder.folderPath); + } + for (const file of input.cache.values()) { + const folderPath = directFolderOf(file.relativePath); + if (!folderPaths.has(folderPath)) { + folderInputs.push({ scope: input.scope, - folderPath: folder.folderPath, - summary: shapeFolderPayload(folder), + folderPath, + summary: emptyFolderPayload(), }); - folderPaths.add(folder.folderPath); - foldersWritten += 1; - nodesWritten += 1; - folderReporter?.increment(1, { fileName: folder.folderPath || "" }); + folderPaths.add(folderPath); } - } finally { - folderReporter?.stop(); } + // 3. Both reporters open at phase entry with their true totals so the + // overall-progress aggregate sees both denominators from the first tick. + const folderReporter = input.progressContext?.reporter({ + phase: "indexing", + subPhase: "folders", + total: { kind: "fixed", total: folderInputs.length }, + }); const fileReporter = input.progressContext?.reporter({ phase: "indexing", subPhase: "files", - total: { kind: "growing" }, + total: { kind: "fixed", total: input.cache.size }, }); + await folderReporter?.start(); await fileReporter?.start(); + + let foldersWritten = 0; + let filesWritten = 0; try { - for await (const file of iterateCondensed(input.metaPaths)) { + // 4. Batched folder upserts. 
+ logger.info( + `phase7: folder upsert dispatching ${Math.ceil(folderInputs.length / batchSize)} batches of up to ${batchSize} folders (total=${folderInputs.length})`, + ); + for (let i = 0; i < folderInputs.length; i += batchSize) { throwIfCancelled(input.scope.knowledgeId); - fileReporter?.incrementSeen(); - const folderPath = directFolderOf(file.relativePath); - if (!folderPaths.has(folderPath)) { - await upsertFolderNode({ - scope: input.scope, - folderPath, - summary: emptyFolderPayload(), - }); - folderPaths.add(folderPath); - foldersWritten += 1; - nodesWritten += 1; + const batch = folderInputs.slice(i, i + batchSize); + await upsertFolderNodesBatch(batch); + foldersWritten += batch.length; + nodesWritten += batch.length; + for (const item of batch) { + folderReporter?.increment(1, { fileName: item.folderPath || "" }); } - await upsertFileNode({ + } + + // 5. Batched file upserts. + const fileInputs: UpsertFileNodeInput[] = []; + for (const file of input.cache.values()) { + fileInputs.push({ orgId: input.scope.orgId, knowledgeId: input.scope.knowledgeId, repoId: input.scope.repoId, relativePath: file.relativePath, - folderPath, + folderPath: directFolderOf(file.relativePath), language: file.language.length > 0 ? 
file.language : languageFromPath(file.relativePath), sha: file.sha256, sizeBytes: file.sizeBytes, @@ -121,11 +149,22 @@ export async function storeFlatAnalysis(input: StoreFlatAnalysisInput): Promise< totalChunks: file.totalChunks, totalTokenCount: file.totalTokenCount, }); - filesWritten += 1; - nodesWritten += 1; - fileReporter?.increment(1, { fileName: file.relativePath }); + } + logger.info( + `phase7: file upsert dispatching ${Math.ceil(fileInputs.length / batchSize)} batches of up to ${batchSize} files (total=${fileInputs.length})`, + ); + for (let i = 0; i < fileInputs.length; i += batchSize) { + throwIfCancelled(input.scope.knowledgeId); + const batch = fileInputs.slice(i, i + batchSize); + await upsertFileNodesBatch(batch); + filesWritten += batch.length; + nodesWritten += batch.length; + for (const item of batch) { + fileReporter?.increment(1, { fileName: item.relativePath }); + } } } finally { + folderReporter?.stop(); fileReporter?.stop(); } diff --git a/packages/ingest-github/src/strategies/flat-folder/prompts/folder-summary.ts b/packages/ingest-github/src/strategies/flat-folder/prompts/folder-summary.ts index 10276a8..30e110b 100644 --- a/packages/ingest-github/src/strategies/flat-folder/prompts/folder-summary.ts +++ b/packages/ingest-github/src/strategies/flat-folder/prompts/folder-summary.ts @@ -40,3 +40,57 @@ Per-file analyses (direct children only): ${serialised}`; } + +export const FOLDER_BATCH_SYSTEM_PROMPT = `You are summarising MULTIPLE small folders of a source repository in one pass. The user will provide several folders, each labeled with an integer ID (0, 1, 2, ...). Each folder lists the files directly inside it (subfolders are summarised separately and are NOT in your input). + +Return ONLY a JSON object whose keys are the integer labels as strings ("0", "1", ...) and whose values are folder-summary objects with EXACTLY these keys: + +- purpose : string — one-paragraph explanation of what this folder is responsible for. 
+- summary : string — natural-language summary of how the files in this folder work together. Plain English, no key-value pairs. ≤ 300 tokens. +- keywords : string[] — up to 10 domain keywords describing this folder. +- classes : string[] — most important class/type entries, deduplicated. Format "Name: short purpose". Max 15 entries. +- functions : string[] — most important function/method entries, deduplicated. Format "name: short purpose". Max 15 entries. +- importsInternal : string[] — significant relative imports observed across the folder's files. Max 15 entries. +- importsExternal : string[] — significant external packages observed across the folder's files. Max 15 entries. +- dependencyGraph : string — Mermaid \`graph LR\` block (no triple-backtick fences) of inter-file dependencies. Empty string if not enough signal. + +You MUST return one entry per labeled folder, even if some fields are empty arrays. Do NOT invent files not listed. Do NOT speculate about subfolders. Do NOT add keys outside the integer-label set; do NOT add commentary outside the JSON object.`; + +export interface BatchedFolderInput { + label: number; + folderPath: string; + files: CondensedFileAnalysis[]; +} + +export function folderBatchUserPrompt(batch: BatchedFolderInput[]): string { + const sections = batch.map((b) => { + const folderLabel = b.folderPath.length === 0 ? "" : b.folderPath; + const fileLines = b.files.map((f) => `- ${f.relativePath}: ${f.analysis.purpose}`).join("\n"); + const aggregatedKeywords = aggregateKeywords(b.files, 10); + return `### Folder ${b.label} :: ${folderLabel} +Files: ${b.files.length} +${fileLines} +Aggregated keywords: ${JSON.stringify(aggregatedKeywords)}`; + }); + return `You are summarising ${batch.length} folder(s). Produce one folder-summary object per labeled folder. 
+ +${sections.join("\n\n")}`; +} + +function aggregateKeywords(files: CondensedFileAnalysis[], cap: number): string[] { + const seen = new Set(); + const out: string[] = []; + for (const f of files) { + for (const k of f.analysis.keywords) { + if (typeof k !== "string" || k.length === 0 || seen.has(k)) { + continue; + } + seen.add(k); + out.push(k); + if (out.length >= cap) { + return out; + } + } + } + return out; +} diff --git a/packages/ingest-github/src/strategies/flat-folder/scan-manifest.ts b/packages/ingest-github/src/strategies/flat-folder/scan-manifest.ts new file mode 100644 index 0000000..5caee3b --- /dev/null +++ b/packages/ingest-github/src/strategies/flat-folder/scan-manifest.ts @@ -0,0 +1,61 @@ +import { readFile, writeFile } from "node:fs/promises"; +import type { MetaPaths } from "#src/types/meta-paths.ts"; + +export type ScanEntryKind = "small" | "big" | "oversized"; + +export interface ScanManifestEntry { + relativePath: string; + absolutePath: string; + sizeBytes: number; + tokenCount: number; + kind: ScanEntryKind; + estimatedChunks?: number; +} + +export interface ScanManifestSummary { + totalFiles: number; + smallCount: number; + bigCount: number; + oversizedCount: number; + totalTokens: number; + estimatedBigChunks: number; +} + +export interface ScanManifest { + generatedAt: string; + summary: ScanManifestSummary; + entries: ScanManifestEntry[]; +} + +export function emptyManifest(): ScanManifest { + return { + generatedAt: new Date().toISOString(), + summary: { totalFiles: 0, smallCount: 0, bigCount: 0, oversizedCount: 0, totalTokens: 0, estimatedBigChunks: 0 }, + entries: [], + }; +} + +export async function writeScanManifest(metaPaths: MetaPaths, manifest: ScanManifest): Promise { + await writeFile(metaPaths.scanManifestJson, JSON.stringify(manifest, null, 2), "utf8"); +} + +export async function readScanManifest(metaPaths: MetaPaths): Promise { + try { + const raw = await readFile(metaPaths.scanManifestJson, "utf8"); + const parsed: 
unknown = JSON.parse(raw); + if (!isManifest(parsed)) { + return null; + } + return parsed; + } catch { + return null; + } +} + +function isManifest(value: unknown): value is ScanManifest { + if (typeof value !== "object" || value === null) { + return false; + } + const rec = value as Record; + return Array.isArray(rec["entries"]) && typeof rec["summary"] === "object" && typeof rec["generatedAt"] === "string"; +} diff --git a/packages/ingest-github/src/types/README.md b/packages/ingest-github/src/types/README.md index 87b2cea..1fd8479 100644 --- a/packages/ingest-github/src/types/README.md +++ b/packages/ingest-github/src/types/README.md @@ -19,9 +19,21 @@ llmCallContext? }`; `llmCallContext` is the optional `AskLlmOptions` - `pipeline.ts` — `ScannedFile`, `OversizedFile`, `ScanEntry`, `FileAnalyzer` port, `AnalyzedFileResult`, `PipelineDeps`, `PipelineSummary`, `SkipDecider` / `SkipDeciderInput` / `SkipDecision` (the unknown-extension - gate port; implementation lives under `pipeline/skip-decisions/`), + gate port; implementation lives under `pipeline/skip-decisions/`). The + `SkipDecider` interface exposes four methods: `decide` (legacy async + single-shot), `decideStatic` (synchronous; returns the resolved decision + or `null` to signal "needs an LLM call"), `decideAndDeferSave` (async LLM + call that mutates the in-memory cache without flushing to disk), and + `persist` (one-shot cache flush). The two-pass scan in `scan.ts` uses the + latter three so unknown-extension probes fan out under the shared LLM + limiter and the disk cache is written exactly once at the end of the + batch. `SourceReader` / `ScanDeps` (the repository-read abstraction; default - implementation in `pipeline/disk-source-reader.ts`), `ArchiveSink` / + implementation in `pipeline/disk-source-reader.ts`). 
`ScanDeps.limiter` + is the optional shared `ConcurrencyLimiter`; when supplied together with + `skipDecider`, `scanRepository` switches to its two-pass strategy + instead of the legacy inline-await walk. + `ArchiveSink` / `ArchiveSinkInput` (an optional non-fatal sink that the open-source binary never calls), `SourceFactory` / `SourceFactoryInput` / `SourceFactoryResult` (the optional index-side injection hook surfaced diff --git a/packages/ingest-github/src/types/meta-paths.ts b/packages/ingest-github/src/types/meta-paths.ts index 8898df3..5da4f89 100644 --- a/packages/ingest-github/src/types/meta-paths.ts +++ b/packages/ingest-github/src/types/meta-paths.ts @@ -5,5 +5,6 @@ export interface MetaPaths { bigFileAnalysisDir: string; bigFileChunksDir: string; bigFilesJson: string; + scanManifestJson: string; repoSummaryJson: string; } diff --git a/packages/ingest-github/src/types/pipeline.ts b/packages/ingest-github/src/types/pipeline.ts index 9f5c0be..aaf13a5 100644 --- a/packages/ingest-github/src/types/pipeline.ts +++ b/packages/ingest-github/src/types/pipeline.ts @@ -1,6 +1,7 @@ import type { GithubIndexPayload, GithubPullPayload } from "@bb/types"; import type { AskLlmOptions } from "@bb/llm"; import type { FileAnalysis } from "@bb/mongo"; +import type { ConcurrencyLimiter } from "#src/pipeline/concurrency.ts"; import type { DiffResult } from "#src/pipeline/git-diff.ts"; export interface ScannedFile { @@ -59,6 +60,14 @@ export interface ScanDeps { * invokes the LLM branch. Absent in OSS standalone runs. */ llmCallContext?: AskLlmOptions; + /** + * Shared LLM-concurrency limiter. When set, `scanRepository` uses a + * two-pass strategy: walk + cache-only decisions in pass 1, parallel + * deduplicated LLM resolution under this limiter in pass 2, drain the + * pending list in pass 3 (all cache-hits). When absent (e.g. legacy + * `SourceFactory` consumers), scan falls back to inline-await per file. 
+ */ + limiter?: ConcurrencyLimiter; } export interface SourceReader { @@ -152,5 +161,31 @@ export interface SkipDeciderInput { } export interface SkipDecider { + /** + * Single-shot decision: applies static filters, consults the in-memory + * + on-disk caches, and falls through to the LLM when neither resolves + * the decision. Persists the cache to disk after each LLM call. + * Kept for non-scan callers and the legacy inline-await path. + */ decide(input: SkipDeciderInput): Promise; + /** + * Synchronous static-only decision. Returns the resolved `SkipDecision` + * when static filters or cache hit resolves it; returns `null` to signal + * "this needs an LLM call to resolve". Used by `scanRepository` in its + * two-pass mode to collect pending entries without blocking the walk. + */ + decideStatic(input: SkipDeciderInput): SkipDecision | null; + /** + * Asynchronous LLM-resolution path that **mutates the in-memory cache** + * but does NOT persist to disk. The caller (typically `scanRepository`) + * batches these under a `ConcurrencyLimiter` and then calls `persist()` + * exactly once at the end of the batch, so concurrent `saveCache` calls + * don't race on the tmp/rename atomicity. + */ + decideAndDeferSave(input: SkipDeciderInput): Promise; + /** + * Persist the in-memory decision cache to disk. Best-effort: swallows + * I/O errors. Called once at the end of a `decideAndDeferSave` batch. 
+ */ + persist(): void; } diff --git a/packages/ingest-github/tsconfig.json b/packages/ingest-github/tsconfig.json index d8a16a7..4ed0786 100644 --- a/packages/ingest-github/tsconfig.json +++ b/packages/ingest-github/tsconfig.json @@ -1,4 +1,4 @@ { - "extends": "../../../../tsconfig.base.json", + "extends": "../../tsconfig.base.json", "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/ingest-github/types/index.d.ts b/packages/ingest-github/types/index.d.ts deleted file mode 100644 index 98445ad..0000000 --- a/packages/ingest-github/types/index.d.ts +++ /dev/null @@ -1,137 +0,0 @@ -export interface RegisterGithubWorkersDeps { - sourceFactory?: SourceFactory; - pullFactory?: PullFactory; - progressContextFactory?: ProgressContextFactory; -} - -export type ProgressPhase = "file_analysis" | "folder_analysis" | "indexing"; - -export type ProgressTotalMode = { kind: "fixed"; total: number } | { kind: "growing"; initialTotal?: number }; - -export interface ProgressReporterInput { - readonly phase: ProgressPhase; - readonly subPhase?: string; - readonly total: ProgressTotalMode; - readonly resolveInitialProcessed?: () => Promise | number; -} - -export interface ProgressReporter { - start(): Promise; - increment(delta?: number, meta?: { fileName?: string }): void; - incrementSeen(delta?: number): void; - setTotal(total: number): void; - stop(): void; -} - -export interface ProgressContext { - reporter(input: ProgressReporterInput): ProgressReporter; - phaseChanged(phase: ProgressPhase): void; - completed(message?: string): void; - failed(error: string, phase?: ProgressPhase): void; -} - -export type ProgressContextFactory = (knowledgeId: string) => ProgressContext; - -export declare const nullProgressContextFactory: ProgressContextFactory; - -export declare function registerGithubWorkers(deps?: RegisterGithubWorkersDeps): void; -export declare function registerLocalIngestWorker(): void; - -export interface FlatFolderStrategyDeps { - 
fileAnalyzer: FileAnalyzer; - progressContextFactory?: ProgressContextFactory; -} -export declare function createFlatFolderStrategy(deps: FlatFolderStrategyDeps): IngestStrategy; -export declare const createLlmFileAnalyzer: (...args: any[]) => any; -export declare const createDiskSourceReader: (...args: any[]) => any; -export declare const createPipelineRunner: (...args: any[]) => any; -export declare const createGithubIngestHandler: (...args: any[]) => any; -export declare const createLocalIngestHandler: (...args: any[]) => any; -export declare const runPull: (...args: any[]) => any; -export declare const reposRoot: (...args: any[]) => string; -export declare const repoCloneDir: (knowledgeId: string) => string; -export declare const metaRootFor: (knowledgeId: string) => string; -export declare const metaPathsFor: (knowledgeId: string) => unknown; -export declare const commitMetaDir: (knowledgeId: string, commitHash: string) => string; -export declare const businessContextDir: (knowledgeId: string, commitHash: string, sanitizedTitle: string) => string; -export declare const orgRegistryDir: (knowledgeId: string, orgId: string) => string; -export declare function fetchLatestCommitHash( - repoUrl: string, - branch: string, - gitToken?: string, -): Promise; -export declare function fetchRecentCommits( - repoUrl: string, - branch: string, - limit?: number, - gitToken?: string, -): Promise; -export declare function fetchDefaultBranch(repoUrl: string, gitToken?: string): Promise; -export declare function fetchBranches( - repoUrl: string, - gitToken?: string, - limit?: number, -): Promise<{ status: "ok"; branches: string[] } | { status: "error"; message: string }>; -export declare function parseGithubRepo(repoUrl: string): ParsedRepo | null; - -export interface BootstrapRuntimeOptions { - config: unknown; - loggerFactory: (scope: string) => unknown; -} -export declare function bootstrapRuntime(opts: BootstrapRuntimeOptions): Promise; - -export declare const 
COMBINED_CODE_ANALYSIS_SYSTEM_PROMPT: string; -export declare function buildFileAnalysisUserPrompt(input: { relativePath: string; content: string }): string; - -export type CreatePipelineRunnerDeps = any; -export type IngestJobHandlerDeps = any; -export type IngestRunnerDeps = any; -export type IngestRunnerInput = any; -export type IngestStrategy = any; -export type StrategyInput = any; -export type StrategyResult = any; -export type StrategyContext = any; -export type FileAnalyzer = any; -export type AnalyzedFileResult = any; -export type ScanEntry = any; -export type ScannedFile = any; -export type OversizedFile = any; -export type ScanDeps = any; -export type SourceReader = any; -export type ArchiveSink = any; -export type ArchiveSinkInput = any; -export type SourceFactory = any; -export type SourceFactoryInput = any; -export type SourceFactoryResult = any; -export type PullFactory = any; -export type PullFactoryInput = any; -export type PullFactoryResult = any; -export type DiffResult = any; -export type RenamedFile = any; -export type CondensedFileAnalysis = any; -export interface CommitEntry { - sha: string; - message: string; - author: string; - timestamp: string; -} - -export type FetchCommitsResult = - | { status: "ok"; commits: CommitEntry[] } - | { status: "not_found" } - | { status: "unauthorized" } - | { status: "rate_limited" } - | { status: "error"; message: string }; - -export interface ParsedRepo { - owner: string; - repo: string; - branch?: string; -} - -export type DefaultBranchResult = - | { status: "ok"; branch: string } - | { status: "not_found" } - | { status: "unauthorized" } - | { status: "rate_limited" } - | { status: "error"; message: string }; diff --git a/packages/llm/README.md b/packages/llm/README.md index 5d659d0..64e6cef 100644 --- a/packages/llm/README.md +++ b/packages/llm/README.md @@ -29,10 +29,15 @@ selected by `Config.LlmProvider` (`"openrouter"` default, or fallback chain. 
The request body includes a `models: [...]` array when the deduplicated chain has ≥2 non-empty entries and always sends `usage: { include: true }` so OpenRouter populates `usage.cost` in - the response. `usage.model` is the actual model the gateway picked. - Tokens come straight from OpenRouter's `usage.prompt_tokens` / - `usage.completion_tokens`; `costUsd` from `usage.cost` (defaults to - `0` when the provider omits it — common for `:free` models). + the response. The body also pins `provider: { allow_fallbacks: false }` + so OpenRouter does not silently cycle across upstream providers of the + same model — a slow or sick provider surfaces a real error to us + instead of consuming the wall-clock budget. Model-level fallback + through the `models` chain is unaffected. `usage.model` is the actual + model the gateway picked. Tokens come straight from OpenRouter's + `usage.prompt_tokens` / `usage.completion_tokens`; `costUsd` from + `usage.cost` (defaults to `0` when the provider omits it — common for + `:free` models). - **Ollama mode** — POST to `${Config.OllamaUrl}/api/chat` with `{ model: Config.OllamaModel, messages, stream: false }`. Single model per request — no fallback chain (Ollama does not have a @@ -151,6 +156,15 @@ it. The cost ledger described in [docs/arch.md](../../docs/arch.md) is sees a single `AskLlmResult`. BullMQ's `attempts: 3` wraps the whole call — retries walk the chain again, useful when a transient OpenRouter outage clears between retries. + 4a. **No upstream-provider fallback.** Every request carries + `provider: { allow_fallbacks: false }`. This is orthogonal to the + `models` chain in invariant 4 — `models` controls _which model_ the + gateway tries; `allow_fallbacks` controls whether OpenRouter routes + to a different upstream backend serving the same model when the first + one stalls. 
We disable the latter so a slow provider cannot eat the + wall-clock without ever producing tokens; the surfaced error becomes + actionable (specific provider, specific status) instead of a generic + timeout. 5. **Errors are typed, not strings.** `LlmConfigError` carries the exact `bytebell keys set` hint; `LlmError` carries `cause`. 6. **Timeout is enforced.** AbortController fires at `timeoutMs`; the diff --git a/packages/llm/src/README.md b/packages/llm/src/README.md index 61d122a..1b3bba7 100644 --- a/packages/llm/src/README.md +++ b/packages/llm/src/README.md @@ -21,10 +21,13 @@ package-level contract; this file documents how the source tree is split. or `Config.OpenrouterModel` + four fallback slots), caps the chain at 3 entries (OpenRouter's hard limit), POSTs to the chat-completions endpoint with an AbortController timeout, parses the typed - `OpenRouterResponse`, returns the first choice's content. `usage.model` - reflects which model OpenRouter actually routed to. Throws - `LlmConfigError` if the API key resolves to empty, `LlmError` on - timeout / HTTP non-2xx / empty completion. + `OpenRouterResponse`, returns the first choice's content. The body + always carries `provider: { allow_fallbacks: false }` so OpenRouter + cannot silently route across upstream providers of the same model; + see `OpenRouterProviderRouting` in this file and invariant 4a in the + package README. `usage.model` reflects which model OpenRouter actually + routed to. Throws `LlmConfigError` if the API key resolves to empty, + `LlmError` on timeout / HTTP non-2xx / empty completion. - **[ollama.ts](ollama.ts)** — `callOllama` and `resolveOllamaChain`. Single-model per request (Ollama has no fan-out). Reads model from `opts.model ?? Config.OllamaModel`. 
Ignores `opts.apiKey` (Ollama is diff --git a/packages/llm/src/openrouter.ts b/packages/llm/src/openrouter.ts index 53b48b4..a4f99e7 100644 --- a/packages/llm/src/openrouter.ts +++ b/packages/llm/src/openrouter.ts @@ -20,11 +20,19 @@ interface OpenRouterUsageAccounting { include: true; } +interface OpenRouterProviderRouting { + // Pin OpenRouter to the first viable upstream provider. Without this, + // OpenRouter silently cycles across providers on slow/failed calls and + // we lose the per-call wall-clock budget before a real error surfaces. + allow_fallbacks: boolean; +} + interface OpenRouterRequest { model: string; models?: string[]; messages: OpenRouterMessage[]; usage: OpenRouterUsageAccounting; + provider: OpenRouterProviderRouting; } interface OpenRouterResponse { @@ -67,10 +75,11 @@ export async function callOpenRouter(prompt: string, opts: AskLlmOptions, timeou messages.push({ role: "user", content: prompt }); const usageAccounting: OpenRouterUsageAccounting = { include: true }; + const providerRouting: OpenRouterProviderRouting = { allow_fallbacks: false }; const body: OpenRouterRequest = cappedChain.length > 1 - ? { model, models: cappedChain, messages, usage: usageAccounting } - : { model, messages, usage: usageAccounting }; + ? 
{ model, models: cappedChain, messages, usage: usageAccounting, provider: providerRouting } + : { model, messages, usage: usageAccounting, provider: providerRouting }; const controller = new AbortController(); const timer = setTimeout(() => controller.abort(), timeoutMs); diff --git a/packages/llm/tsconfig.json b/packages/llm/tsconfig.json index d8a16a7..4ed0786 100644 --- a/packages/llm/tsconfig.json +++ b/packages/llm/tsconfig.json @@ -1,4 +1,4 @@ { - "extends": "../../../../tsconfig.base.json", + "extends": "../../tsconfig.base.json", "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/logger/tsconfig.json b/packages/logger/tsconfig.json index d8a16a7..4ed0786 100644 --- a/packages/logger/tsconfig.json +++ b/packages/logger/tsconfig.json @@ -1,4 +1,4 @@ { - "extends": "../../../../tsconfig.base.json", + "extends": "../../tsconfig.base.json", "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/mcp/tsconfig.json b/packages/mcp/tsconfig.json index d8a16a7..4ed0786 100644 --- a/packages/mcp/tsconfig.json +++ b/packages/mcp/tsconfig.json @@ -1,4 +1,4 @@ { - "extends": "../../../../tsconfig.base.json", + "extends": "../../tsconfig.base.json", "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/mongo/src/aggregateStats.ts b/packages/mongo/src/aggregateStats.ts index 0cfa6a8..95f7d59 100644 --- a/packages/mongo/src/aggregateStats.ts +++ b/packages/mongo/src/aggregateStats.ts @@ -1,10 +1,4 @@ -import type { - KnowledgeDoc, - StatsCommitEntry, - StatsRepoEntry, - StatsResponse, - StatsTotals, -} from "@bb/types"; +import type { KnowledgeDoc, StatsCommitEntry, StatsRepoEntry, StatsResponse, StatsTotals } from "@bb/types"; import { _getDb } from "./client.ts"; import { Collections } from "./collections.ts"; diff --git a/packages/mongo/tsconfig.json b/packages/mongo/tsconfig.json index d8a16a7..4ed0786 100644 --- a/packages/mongo/tsconfig.json +++ b/packages/mongo/tsconfig.json @@ 
-1,4 +1,4 @@ { - "extends": "../../../../tsconfig.base.json", + "extends": "../../tsconfig.base.json", "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/neo4j/README.md b/packages/neo4j/README.md index e363877..ba441b2 100644 --- a/packages/neo4j/README.md +++ b/packages/neo4j/README.md @@ -40,20 +40,25 @@ The package owns: function / import edges), and one to remove the `:Knowledge` node itself. Called by the server's `DELETE /api/v1/repos/:knowledgeId` route. -- File-node CRUD (`upsertFileNode`) — composes the per-file relationships - (`:HAS_KEYWORD / :HAS_CLASS / :HAS_FUNCTION / :HAS_IMPORT_INTERNAL / -:HAS_IMPORT_EXTERNAL`), clearing stale relationships before - re-attaching for re-runs. The two-`:HAS_IMPORT_*` split mirrors - kube-package's distinction between relative imports and external - packages — downstream MCP queries can ask "which files import this - internal module" vs "which files import this external package" - cleanly +- File-node CRUD (`upsertFileNode`, `upsertFileNodesBatch`) — composes + the per-file relationships (`:HAS_KEYWORD / :HAS_CLASS / :HAS_FUNCTION +/ :HAS_IMPORT_INTERNAL / :HAS_IMPORT_EXTERNAL`), clearing stale + relationships before re-attaching for re-runs. The two-`:HAS_IMPORT_*` + split mirrors kube-package's distinction between relative imports and + external packages — downstream MCP queries can ask "which files + import this internal module" vs "which files import this external + package" cleanly. The `*Batch` variant lands an arbitrary number of + files in **one transaction** via Cypher `UNWIND` — same Cypher shape, + wrapped with an outer UNWIND so 50+ files cost the same 12 Cyphers a + single file used to cost. +- Folder-node CRUD (`upsertFolderNode`, `upsertFolderNodesBatch`) — + same shape as file CRUD; batched variant for bulk indexing. 
The package does **not** own: - Read queries — defer to a future `@bb/graph` once `@bb/mcp` retrieval has a use case -- Telemetry, retry, or transaction batching — driver defaults apply +- Telemetry — driver defaults apply. - Migration tooling — the `IF NOT EXISTS` constraint creates handle schema drift; richer migrations land later @@ -69,6 +74,9 @@ function upsertKnowledgeNode(doc: KnowledgeDoc): Promise; function setKnowledgeStateInGraph(knowledgeId: string, state: KnowledgeState): Promise; function deleteKnowledgeGraph(knowledgeId: string): Promise; function upsertFileNode(input: UpsertFileNodeInput): Promise; +function upsertFileNodesBatch(inputs: readonly UpsertFileNodeInput[]): Promise; +function upsertFolderNode(input: UpsertFolderNodeInput): Promise; +function upsertFolderNodesBatch(inputs: readonly UpsertFolderNodeInput[]): Promise; function runCypher(query: string, params?: Record): Promise; @@ -160,9 +168,12 @@ Neo4jPassword`). Repo-wide ESLint rule blocks `process.env`. "already exists" errors (Neo4j refuses constraints when a matching plain index exists). Operators must drop conflicting indexes manually if uniqueness guarantees matter. -6. **`upsertFileNode` clears stale relationships before re-attaching.** - Re-runs of the same `(knowledgeId, relativePath)` produce a clean - relationship set rather than accumulating outdated keywords/imports. +6. **`upsertFileNode` and `upsertFileNodesBatch` clear stale relationships + before re-attaching.** Re-runs of the same `(knowledgeId, relativePath)` + produce a clean relationship set rather than accumulating outdated + keywords/imports. In the batched variant the clear-then-attach happens + atomically inside one transaction per batch — partial failures roll + back, so re-runs always start from a consistent state. 7. **No raw `Driver` leaks.** `_getDriver()` is not in `src/index.ts`. Higher tiers go through the typed helpers. @@ -174,7 +185,6 @@ Neo4jPassword`). Repo-wide ESLint rule blocks `process.env`. 
## What is intentionally out of scope (v0) - Read queries (defer to `@bb/graph`) -- Cypher transactions / batch writes (single-statement per call) - Schema migrations / drops / renames (only `IF NOT EXISTS` creates) - Multi-database support (we use the default `neo4j` db) - Pub/sub / change-data-capture diff --git a/packages/neo4j/src/client.ts b/packages/neo4j/src/client.ts index 56207d2..dac5fbb 100644 --- a/packages/neo4j/src/client.ts +++ b/packages/neo4j/src/client.ts @@ -81,6 +81,35 @@ export async function _runCypher(query: string, params: Record; +} + +/** + * Run multiple Cypher statements inside one write transaction. All-or-nothing: + * either every statement commits or none do. Used by the batched upsert APIs + * so a 50-file batch lands as one transaction instead of 12 × 50 sessions. + * + * Uses the driver's `executeWrite`, which re-runs the callback on transient + * errors (deadlock, leader switch) until the driver's retry window expires. + */ +export async function _runInTransaction(steps: readonly CypherStep[]): Promise { + if (steps.length === 0) { + return; + } + const session: Session = _getDriver().session(); + try { + await session.executeWrite(async (tx) => { + for (const step of steps) { + await tx.run(step.query, step.params); + } + }); + } finally { + await session.close(); + } +} + export function toNeo4jInt(value: number): Integer { return int(value); } diff --git a/packages/neo4j/src/files.ts b/packages/neo4j/src/files.ts index eaf4182..7d049e3 100644 --- a/packages/neo4j/src/files.ts +++ b/packages/neo4j/src/files.ts @@ -1,5 +1,5 @@ import type { FileAnalysis } from "@bb/mongo"; -import { _runCypher } from "./client.ts"; +import { _runCypher, _runInTransaction, type CypherStep } from "./client.ts"; const UPSERT_FILE = ` MERGE (f:File {knowledgeId: $knowledgeId, relativePath: $relativePath}) @@ -133,6 +133,232 @@ export async function deleteFileNodes(knowledgeId: string, relativePaths: string await _runCypher(DELETE_FILES, { knowledgeId, relativePaths }); 
} +// ───────────────────────────────────────────────────────────────────────────── +// Batched upsert — used by the flat-folder indexing phase to land 50+ files in +// one transaction instead of 12 round-trips per file. Same Cypher shape as the +// single-shot path above; just wrapped with an outer UNWIND so one query +// services every file in the batch. The five rel types (HAS_KEYWORD / +// HAS_CLASS / HAS_FUNCTION / HAS_IMPORT_INTERNAL / HAS_IMPORT_EXTERNAL) each +// take two Cyphers: a batched DELETE that clears existing rels for every file +// in the batch by relativePath, then a batched UNWIND that attaches the new +// rels from flattened `(knowledgeId, relativePath, name)` triples. +// ───────────────────────────────────────────────────────────────────────────── + +const BATCH_UPSERT_FILES = ` +UNWIND $files AS f +MERGE (file:File {knowledgeId: f.knowledgeId, relativePath: f.relativePath}) +SET file.orgId = f.orgId, + file.repoId = f.repoId, + file.language = f.language, + file.sha = f.sha, + file.sizeBytes = f.sizeBytes, + file.purpose = f.purpose, + file.summary = f.summary, + file.businessContext = f.businessContext, + file.dataFlowDirection = f.dataFlowDirection, + file.ontologyConcepts = f.ontologyConcepts, + file.businessEntities = f.businessEntities, + file.systemCapabilities = f.systemCapabilities, + file.sideEffects = f.sideEffects, + file.configDependencies = f.configDependencies, + file.integrationSurface = f.integrationSurface, + file.contractsProvided = f.contractsProvided, + file.contractsConsumed = f.contractsConsumed, + file.sectionNames = f.sectionNames, + file.sectionDescriptions = f.sectionDescriptions, + file.isBigFile = f.isBigFile, + file.totalChunks = f.totalChunks, + file.totalTokenCount = f.totalTokenCount, + file.updatedAt = $updatedAt +WITH file, f +MATCH (k:Knowledge {knowledgeId: f.knowledgeId}) +MERGE (k)-[:HAS_FILE]->(file) +`; + +const BATCH_ATTACH_FILES_TO_FOLDERS = ` +UNWIND $pairs AS pair +MATCH (file:File {knowledgeId: 
pair.knowledgeId, relativePath: pair.relativePath}) +MATCH (folder:Folder {knowledgeId: pair.knowledgeId, folderPath: pair.folderPath}) +MERGE (folder)-[:CONTAINS]->(file) +`; + +const BATCH_CLEAR_RELS_BY_TYPE: Readonly> = { + HAS_KEYWORD: ` +UNWIND $files AS f +MATCH (file:File {knowledgeId: f.knowledgeId, relativePath: f.relativePath})-[r:HAS_KEYWORD]->() +DELETE r +`, + HAS_CLASS: ` +UNWIND $files AS f +MATCH (file:File {knowledgeId: f.knowledgeId, relativePath: f.relativePath})-[r:HAS_CLASS]->() +DELETE r +`, + HAS_FUNCTION: ` +UNWIND $files AS f +MATCH (file:File {knowledgeId: f.knowledgeId, relativePath: f.relativePath})-[r:HAS_FUNCTION]->() +DELETE r +`, + HAS_IMPORT_INTERNAL: ` +UNWIND $files AS f +MATCH (file:File {knowledgeId: f.knowledgeId, relativePath: f.relativePath})-[r:HAS_IMPORT_INTERNAL]->() +DELETE r +`, + HAS_IMPORT_EXTERNAL: ` +UNWIND $files AS f +MATCH (file:File {knowledgeId: f.knowledgeId, relativePath: f.relativePath})-[r:HAS_IMPORT_EXTERNAL]->() +DELETE r +`, +}; + +const BATCH_ATTACH_KEYWORDS = ` +UNWIND $pairs AS p +MATCH (file:File {knowledgeId: p.knowledgeId, relativePath: p.relativePath}) +MERGE (kw:Keyword {name: p.name}) +MERGE (file)-[:HAS_KEYWORD]->(kw) +`; + +const BATCH_ATTACH_CLASSES = ` +UNWIND $pairs AS p +MATCH (file:File {knowledgeId: p.knowledgeId, relativePath: p.relativePath}) +MERGE (c:Class {signature: p.signature}) +MERGE (file)-[:HAS_CLASS]->(c) +`; + +const BATCH_ATTACH_FUNCTIONS = ` +UNWIND $pairs AS p +MATCH (file:File {knowledgeId: p.knowledgeId, relativePath: p.relativePath}) +MERGE (fn:Function {signature: p.signature}) +MERGE (file)-[:HAS_FUNCTION]->(fn) +`; + +const BATCH_ATTACH_IMPORTS_INTERNAL = ` +UNWIND $pairs AS p +MATCH (file:File {knowledgeId: p.knowledgeId, relativePath: p.relativePath}) +MERGE (m:Module {name: p.name}) +MERGE (file)-[:HAS_IMPORT_INTERNAL]->(m) +`; + +const BATCH_ATTACH_IMPORTS_EXTERNAL = ` +UNWIND $pairs AS p +MATCH (file:File {knowledgeId: p.knowledgeId, relativePath: 
p.relativePath}) +MERGE (m:Module {name: p.name}) +MERGE (file)-[:HAS_IMPORT_EXTERNAL]->(m) +`; + +type RelType = "HAS_KEYWORD" | "HAS_CLASS" | "HAS_FUNCTION" | "HAS_IMPORT_INTERNAL" | "HAS_IMPORT_EXTERNAL"; + +interface FileRow { + knowledgeId: string; + relativePath: string; +} + +export async function upsertFileNodesBatch(inputs: readonly UpsertFileNodeInput[]): Promise { + if (inputs.length === 0) { + return; + } + const updatedAt = new Date().toISOString(); + const files = inputs.map((input) => fileRowFor(input)); + const fileKeys: FileRow[] = inputs.map((input) => ({ + knowledgeId: input.knowledgeId, + relativePath: input.relativePath, + })); + const folderPairs = inputs + .filter((input): input is UpsertFileNodeInput & { folderPath: string } => input.folderPath !== undefined) + .map((input) => ({ + knowledgeId: input.knowledgeId, + relativePath: input.relativePath, + folderPath: input.folderPath, + })); + + const keywordPairs = flattenPairs(inputs, "keywords", "name", (v) => v.toLowerCase()); + const classPairs = flattenPairs(inputs, "classes", "signature"); + const functionPairs = flattenPairs(inputs, "functions", "signature"); + const importsInternalPairs = flattenPairs(inputs, "importsInternal", "name"); + const importsExternalPairs = flattenPairs(inputs, "importsExternal", "name"); + + const steps: CypherStep[] = [{ query: BATCH_UPSERT_FILES, params: { files, updatedAt } }]; + if (folderPairs.length > 0) { + steps.push({ query: BATCH_ATTACH_FILES_TO_FOLDERS, params: { pairs: folderPairs } }); + } + // Clear existing rels of every type for every file in the batch. 
+ for (const relType of [ + "HAS_KEYWORD", + "HAS_CLASS", + "HAS_FUNCTION", + "HAS_IMPORT_INTERNAL", + "HAS_IMPORT_EXTERNAL", + ] as const) { + steps.push({ query: BATCH_CLEAR_RELS_BY_TYPE[relType], params: { files: fileKeys } }); + } + if (keywordPairs.length > 0) { + steps.push({ query: BATCH_ATTACH_KEYWORDS, params: { pairs: keywordPairs } }); + } + if (classPairs.length > 0) { + steps.push({ query: BATCH_ATTACH_CLASSES, params: { pairs: classPairs } }); + } + if (functionPairs.length > 0) { + steps.push({ query: BATCH_ATTACH_FUNCTIONS, params: { pairs: functionPairs } }); + } + if (importsInternalPairs.length > 0) { + steps.push({ query: BATCH_ATTACH_IMPORTS_INTERNAL, params: { pairs: importsInternalPairs } }); + } + if (importsExternalPairs.length > 0) { + steps.push({ query: BATCH_ATTACH_IMPORTS_EXTERNAL, params: { pairs: importsExternalPairs } }); + } + + await _runInTransaction(steps); +} + +function fileRowFor(input: UpsertFileNodeInput): Record { + const sectionMap = input.analysis.sectionMap ?? []; + return { + knowledgeId: input.knowledgeId, + relativePath: input.relativePath, + orgId: input.orgId ?? "local", + repoId: input.repoId ?? input.knowledgeId, + language: input.language, + sha: input.sha, + sizeBytes: input.sizeBytes, + purpose: input.analysis.purpose, + summary: input.analysis.summary, + businessContext: input.analysis.businessContext, + dataFlowDirection: input.analysis.dataFlowDirection ?? "", + ontologyConcepts: input.analysis.ontologyConcepts ?? [], + businessEntities: input.analysis.businessEntities ?? [], + systemCapabilities: input.analysis.systemCapabilities ?? [], + sideEffects: input.analysis.sideEffects ?? [], + configDependencies: input.analysis.configDependencies ?? [], + integrationSurface: input.analysis.integrationSurface ?? [], + contractsProvided: input.analysis.contractsProvided ?? [], + contractsConsumed: input.analysis.contractsConsumed ?? 
[], + sectionNames: sectionMap.map((s) => s.name), + sectionDescriptions: sectionMap.map((s) => s.description), + isBigFile: input.isBigFile ?? false, + totalChunks: input.totalChunks ?? 0, + totalTokenCount: input.totalTokenCount ?? 0, + }; +} + +function flattenPairs( + inputs: readonly UpsertFileNodeInput[], + field: "keywords" | "classes" | "functions" | "importsInternal" | "importsExternal", + valueKey: "name" | "signature", + normalize?: (v: string) => string, +): Array> { + const out: Array> = []; + for (const input of inputs) { + const values = input.analysis[field]; + if (!Array.isArray(values)) { + continue; + } + for (const raw of values) { + const value = normalize !== undefined ? normalize(raw) : raw; + out.push({ knowledgeId: input.knowledgeId, relativePath: input.relativePath, [valueKey]: value }); + } + } + return out; +} + export async function upsertFileNode(input: UpsertFileNodeInput): Promise { const params = { knowledgeId: input.knowledgeId, relativePath: input.relativePath }; const sectionMap = input.analysis.sectionMap ?? []; diff --git a/packages/neo4j/src/folder.ts b/packages/neo4j/src/folder.ts index e862c3e..f4c8ad8 100644 --- a/packages/neo4j/src/folder.ts +++ b/packages/neo4j/src/folder.ts @@ -1,4 +1,4 @@ -import { _runCypher } from "./client.ts"; +import { _runCypher, _runInTransaction, type CypherStep } from "./client.ts"; import type { NodeScope } from "./repo.ts"; export interface FolderSummaryPayload { @@ -41,6 +41,80 @@ MERGE (kw:Keyword {name: name}) MERGE (folder)-[:HAS_KEYWORD]->(kw) `; +// ───────────────────────────────────────────────────────────────────────────── +// Batched folder upsert. Same Cypher shape as the single-shot path; wrapped +// with an outer UNWIND so one transaction lands every folder in the batch. 
+// ───────────────────────────────────────────────────────────────────────────── + +const BATCH_UPSERT_FOLDERS = ` +UNWIND $folders AS fld +MERGE (folder:Folder {orgId: fld.orgId, knowledgeId: fld.knowledgeId, repoId: fld.repoId, folderPath: fld.folderPath}) +SET folder.purpose = fld.purpose, + folder.summary = fld.summary, + folder.dependencyGraph = fld.dependencyGraph, + folder.updatedAt = $updatedAt +WITH folder, fld +MATCH (r:Repo {orgId: fld.orgId, knowledgeId: fld.knowledgeId, repoId: fld.repoId}) +MERGE (r)-[:CONTAINS]->(folder) +`; + +const BATCH_CLEAR_FOLDER_KEYWORDS = ` +UNWIND $folders AS fld +MATCH (folder:Folder {orgId: fld.orgId, knowledgeId: fld.knowledgeId, repoId: fld.repoId, folderPath: fld.folderPath})-[rel:HAS_KEYWORD]->() +DELETE rel +`; + +const BATCH_ATTACH_FOLDER_KEYWORDS = ` +UNWIND $pairs AS p +MATCH (folder:Folder {orgId: p.orgId, knowledgeId: p.knowledgeId, repoId: p.repoId, folderPath: p.folderPath}) +MERGE (kw:Keyword {name: p.name}) +MERGE (folder)-[:HAS_KEYWORD]->(kw) +`; + +export async function upsertFolderNodesBatch(inputs: readonly UpsertFolderNodeInput[]): Promise { + if (inputs.length === 0) { + return; + } + const updatedAt = new Date().toISOString(); + const folders = inputs.map((input) => ({ + orgId: input.scope.orgId, + knowledgeId: input.scope.knowledgeId, + repoId: input.scope.repoId, + folderPath: input.folderPath, + purpose: input.summary.purpose, + summary: input.summary.summary, + dependencyGraph: input.summary.dependencyGraph, + })); + const folderKeys = inputs.map((input) => ({ + orgId: input.scope.orgId, + knowledgeId: input.scope.knowledgeId, + repoId: input.scope.repoId, + folderPath: input.folderPath, + })); + const keywordPairs: Array> = []; + for (const input of inputs) { + for (const raw of input.summary.keywords) { + keywordPairs.push({ + orgId: input.scope.orgId, + knowledgeId: input.scope.knowledgeId, + repoId: input.scope.repoId, + folderPath: input.folderPath, + name: raw.toLowerCase(), + }); + } + } + 
+ const steps: CypherStep[] = [ + { query: BATCH_UPSERT_FOLDERS, params: { folders, updatedAt } }, + { query: BATCH_CLEAR_FOLDER_KEYWORDS, params: { folders: folderKeys } }, + ]; + if (keywordPairs.length > 0) { + steps.push({ query: BATCH_ATTACH_FOLDER_KEYWORDS, params: { pairs: keywordPairs } }); + } + + await _runInTransaction(steps); +} + export async function upsertFolderNode(input: UpsertFolderNodeInput): Promise { const scope = input.scope; const params = { diff --git a/packages/neo4j/src/index.ts b/packages/neo4j/src/index.ts index 03b51c0..c581c80 100644 --- a/packages/neo4j/src/index.ts +++ b/packages/neo4j/src/index.ts @@ -12,13 +12,13 @@ export { deleteKnowledgeGraph, } from "./knowledge.ts"; -export { upsertFileNode, deleteFileNodes } from "./files.ts"; +export { upsertFileNode, upsertFileNodesBatch, deleteFileNodes } from "./files.ts"; export type { UpsertFileNodeInput } from "./files.ts"; export { upsertRepoNode } from "./repo.ts"; export type { NodeScope, RepoSummaryPayload, UpsertRepoNodeInput } from "./repo.ts"; -export { upsertFolderNode } from "./folder.ts"; +export { upsertFolderNode, upsertFolderNodesBatch } from "./folder.ts"; export type { FolderSummaryPayload, UpsertFolderNodeInput } from "./folder.ts"; export { snapshotFilesToVersion } from "./fileVersions.ts"; diff --git a/packages/neo4j/tsconfig.json b/packages/neo4j/tsconfig.json index d8a16a7..4ed0786 100644 --- a/packages/neo4j/tsconfig.json +++ b/packages/neo4j/tsconfig.json @@ -1,4 +1,4 @@ { - "extends": "../../../../tsconfig.base.json", + "extends": "../../tsconfig.base.json", "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/queue/tsconfig.json b/packages/queue/tsconfig.json index d8a16a7..4ed0786 100644 --- a/packages/queue/tsconfig.json +++ b/packages/queue/tsconfig.json @@ -1,4 +1,4 @@ { - "extends": "../../../../tsconfig.base.json", + "extends": "../../tsconfig.base.json", "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff 
--git a/packages/redis/tsconfig.json b/packages/redis/tsconfig.json index d8a16a7..4ed0786 100644 --- a/packages/redis/tsconfig.json +++ b/packages/redis/tsconfig.json @@ -1,4 +1,4 @@ { - "extends": "../../../../tsconfig.base.json", + "extends": "../../tsconfig.base.json", "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/server/tsconfig.json b/packages/server/tsconfig.json index d8a16a7..4ed0786 100644 --- a/packages/server/tsconfig.json +++ b/packages/server/tsconfig.json @@ -1,4 +1,4 @@ { - "extends": "../../../../tsconfig.base.json", + "extends": "../../tsconfig.base.json", "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/types/src/config.ts b/packages/types/src/config.ts index 882381a..c878718 100644 --- a/packages/types/src/config.ts +++ b/packages/types/src/config.ts @@ -23,6 +23,10 @@ export enum Config { BigFileConcurrency = "big.file.concurrency", AbsoluteFileSizeCap = "absolute.file.size.cap", ConcurrentWorkers = "concurrent.workers", + LlmConcurrency = "llm.concurrency", + FolderSummaryBatchSize = "folder.summary.batch.size", + FolderSummaryBatchMaxFiles = "folder.summary.batch.max.files", + Neo4jBatchSize = "neo4j.batch.size", CondenseContextLimit = "condense.context.limit", CondensePromptOverhead = "condense.prompt.overhead", SmallFileDedupThreshold = "small.file.dedup.threshold", diff --git a/packages/types/tsconfig.json b/packages/types/tsconfig.json index d8a16a7..4ed0786 100644 --- a/packages/types/tsconfig.json +++ b/packages/types/tsconfig.json @@ -1,4 +1,4 @@ { - "extends": "../../../../tsconfig.base.json", + "extends": "../../tsconfig.base.json", "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/tsconfig.base.json b/tsconfig.base.json index 6903d08..9226217 100644 --- a/tsconfig.base.json +++ b/tsconfig.base.json @@ -6,6 +6,7 @@ "module": "ESNext", "moduleResolution": "bundler", "moduleDetection": "force", + "jsx": "react-jsx", 
"allowImportingTsExtensions": true, "verbatimModuleSyntax": true, "isolatedModules": true, @@ -36,12 +37,9 @@ "types": ["bun"], - "composite": true, - "declaration": true, - "declarationMap": true, - "sourceMap": true, - "incremental": true, - "noEmit": false, - "emitDeclarationOnly": true + "composite": false, + "declaration": false, + "noEmit": true, + "incremental": true } } diff --git a/tsconfig.json b/tsconfig.json index 4f4863d..80c98f2 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -1,26 +1,8 @@ { "extends": "./tsconfig.base.json", "compilerOptions": { - "composite": false, - "declaration": false, - "declarationMap": false, "noEmit": true }, - "files": [], - "references": [ - { "path": "packages/types" }, - { "path": "packages/errors" }, - { "path": "packages/config" }, - { "path": "packages/logger" }, - { "path": "packages/mongo" }, - { "path": "packages/redis" }, - { "path": "packages/queue" }, - { "path": "packages/llm" }, - { "path": "packages/ingest-github" }, - { "path": "packages/ingest-business-context" }, - { "path": "packages/cli" }, - { "path": "packages/server" }, - { "path": "packages/neo4j" }, - { "path": "packages/mcp" } - ] + "include": ["packages/*/src/**/*.ts", "packages/*/src/**/*.tsx", "packages/*/src/**/*.json"], + "exclude": ["**/node_modules", "**/dist", "**/*.d.ts"] }