From 064ebf27fad655a5ed85a986f0d82f0618e26766 Mon Sep 17 00:00:00 2001 From: Dead-Bytes <143434285+Dead-Bytes@users.noreply.github.com> Date: Fri, 22 May 2026 13:01:56 +0530 Subject: [PATCH 01/11] refactor: update LLM credential handling in big-file processing and condensing --- .../src/strategies/flat-folder/README.md | 13 +++++----- .../strategies/flat-folder/big-file/README.md | 23 ++++++++++++------ .../flat-folder/big-file/condenser.ts | Bin 9844 -> 10066 bytes .../strategies/flat-folder/big-file/index.ts | 2 +- 4 files changed, 23 insertions(+), 15 deletions(-) diff --git a/packages/ingest-github/src/strategies/flat-folder/README.md b/packages/ingest-github/src/strategies/flat-folder/README.md index 8d26d9d..a454303 100644 --- a/packages/ingest-github/src/strategies/flat-folder/README.md +++ b/packages/ingest-github/src/strategies/flat-folder/README.md @@ -103,14 +103,15 @@ The strategy emits progress through the `ProgressContext` port defined in after `saveCondensed`; failures inside the sink are logged WARN and do not interrupt the analyse loop. The open-source binary never wires a sink — `archiveSink` is undefined and the call is skipped entirely. -- **Per-job LLM credentials thread through every phase.** The orchestrator +- **Per-call LLM credentials thread through every phase.** The orchestrator reads `context.llmCallContext` (an optional `AskLlmOptions` built by the runner from `GithubIndexPayload.{llmApiKey, llmProvider, llmModel}`) and forwards it into every phase that issues LLM calls: phase 1 via `classifyAndAnalyseSmall`'s `llmCallContext`, phase 2 via - `processBigFilesQueue`, phase 3 via `backfillMissingFields`, phase 4 via - `backfillBigFiles`, phase 5 via `runFolderSummaryPhase`, phase 6 via - `summariseRepo`. The phases pass the same option object through to - `askJsonLLM` so per-org overrides reach `@bb/llm` unchanged. 
OSS - standalone leaves `llmCallContext` undefined and falls back to + `processBigFilesQueue` (which threads it into **both** the chunk + analyzer and `condenseChunks`), phase 3 via `backfillMissingFields`, + phase 4 via `backfillBigFiles`, phase 5 via `runFolderSummaryPhase`, + phase 6 via `summariseRepo`. The phases pass the same option object + through to `askJsonLLM` so the per-call override reaches `@bb/llm` + unchanged. When `llmCallContext` is undefined the call falls back to `Config.OpenrouterApiKey` + `Config.LlmProvider`. diff --git a/packages/ingest-github/src/strategies/flat-folder/big-file/README.md b/packages/ingest-github/src/strategies/flat-folder/big-file/README.md index b1c974a..ba5e5f8 100644 --- a/packages/ingest-github/src/strategies/flat-folder/big-file/README.md +++ b/packages/ingest-github/src/strategies/flat-folder/big-file/README.md @@ -15,10 +15,15 @@ depending on chunk count and prompt budget. `askJsonLLM` with the chunk prompt; tolerates failures by returning an empty analysis. `llmCallContext` forwards per-job LLM credentials threaded through from `StrategyContext`. -- `condenser.ts` — `condenseChunks(relativePath, chunks)`: +- `condenser.ts` — `condenseChunks(relativePath, chunks, llmCallContext?)`: ≤ `SmallFileDedupThreshold` → deterministic merge (no LLM); - above → recursive map-reduce. Per-condense LLM failure falls back to - deterministic dedup so recursion always terminates. + above → recursive map-reduce. `llmCallContext` is threaded through + `condenseRecursively` and `condenseOne` to every `askJsonLLM` call so + the same per-call credential bag the chunk analyser uses also reaches + the condense step — without it, callers that rely on per-call overrides + instead of `Config.OpenrouterApiKey` would hit `LlmConfigError` here. + Per-condense LLM failure falls back to deterministic dedup so recursion + always terminates. 
- `storage.ts` — on-disk cache (chunk JSON, manifest, condensed analysis) + `iterateCondensed(metaPaths)` async iterator used by Phase 5. - `cache.ts` — `inspect(metaPaths, relativePath)` returns `complete`, @@ -28,11 +33,13 @@ depending on chunk count and prompt budget. sizeBytes, llmCallContext?, progressContext?})`. Sequential per file (chunk-level concurrency inside). Persists every intermediate artifact, so a restart resumes from the next unfinished chunk. `llmCallContext` - is forwarded to every chunk analyzer call so per-job LLM credentials - reach `@bb/llm`. When `progressContext` is present, the chunk pool runs - under a fixed-total reporter - (`subPhase: "big_file:"`, `total = chunks.length`) so - long single-file analyses surface as live `PHASE_TICK` envelopes + is forwarded to **both** sides of the big-file pipeline — every + `analyzeChunk` call inside the worker loop **and** the final + `condenseChunks(...)` call — so per-call LLM credentials reach + `@bb/llm` consistently across chunk analysis and condense. When + `progressContext` is present, the chunk pool runs under a fixed-total + reporter (`subPhase: "big_file:"`, `total = chunks.length`) + so long single-file analyses surface as live `PHASE_TICK` envelopes carrying per-chunk progress instead of looking frozen. 
## Invariants diff --git a/packages/ingest-github/src/strategies/flat-folder/big-file/condenser.ts b/packages/ingest-github/src/strategies/flat-folder/big-file/condenser.ts index a7b927655c05f013c4257bbd221f0150187bcf68..fdde9b835a295bd1137c8c84fd809dc1a180b219 100644 GIT binary patch delta 305 zcmez3bIEUltcZ?6No7H*f@5*EPfo6XK}lwQUhziRT})D33JOK3If*5iWvKy)B^g!< z#U(|VdFeV_3X|V3$teM8g`AvR=fs>G=lr~q)QS>&D^yb^&t_GfT*t$<*_in|qks;Y z>d9KH#R`Os*gTuHk_kiWW)Y6rSPWdxnU1Mcj7yklaxkm498qrlB8Kiv1$%pi>e|gk HyiAe+=QC=i delta 110 zcmccQ_r+&|>_)>~OwvWEIf*5iWvKy)B^g!<#U(|VdFeU|$r+`2*^?cGq$l&UDQwPS z{>wPIoV9qeGOzMxeYOy$&67E1GfuYSO5ePdOPFc0G#3vSLhocpX?c*g$!@$~HV5!A GNdf?c9waRQ diff --git a/packages/ingest-github/src/strategies/flat-folder/big-file/index.ts b/packages/ingest-github/src/strategies/flat-folder/big-file/index.ts index 255be0b..c35b234 100644 --- a/packages/ingest-github/src/strategies/flat-folder/big-file/index.ts +++ b/packages/ingest-github/src/strategies/flat-folder/big-file/index.ts @@ -74,7 +74,7 @@ export async function processBigFile(input: ProcessBigFileInput): Promise `chunks/${encodeFolder(input.relativePath)}/chunk-${i}.json`); const totalTokenCount = chunks.reduce((acc, c) => acc + c.tokenCount, 0); From f9949f640be03ca7e245b1a9a45be02054507efa Mon Sep 17 00:00:00 2001 From: Dead-Bytes <143434285+Dead-Bytes@users.noreply.github.com> Date: Fri, 22 May 2026 13:02:35 +0530 Subject: [PATCH 02/11] refactor: enhance OpenRouter provider routing to prevent fallback on slow calls --- .../seed-data/ignorePatterns.json | 3 ++- packages/llm/README.md | 22 +++++++++++++++---- packages/llm/src/README.md | 11 ++++++---- packages/llm/src/openrouter.ts | 13 +++++++++-- 4 files changed, 38 insertions(+), 11 deletions(-) diff --git a/packages/ingest-github/src/pipeline/skip-decisions/seed-data/ignorePatterns.json b/packages/ingest-github/src/pipeline/skip-decisions/seed-data/ignorePatterns.json index f7991f1..96de6e3 100644 --- 
a/packages/ingest-github/src/pipeline/skip-decisions/seed-data/ignorePatterns.json +++ b/packages/ingest-github/src/pipeline/skip-decisions/seed-data/ignorePatterns.json @@ -305,7 +305,8 @@ { "type": "exact", "pattern": "CODE_OF_CONDUCT.txt" }, { "type": "exact", "pattern": "FAQ.md" }, { "type": "exact", "pattern": "TROUBLESHOOTING.md" }, - { "type": "exact", "pattern": "UPGRADING.md" } + { "type": "exact", "pattern": "UPGRADING.md" }, + { "type": "extension", "pattern": ".md" } ], "logFiles": [ { "type": "extension", "pattern": ".log" }, diff --git a/packages/llm/README.md b/packages/llm/README.md index 5d659d0..2deb951 100644 --- a/packages/llm/README.md +++ b/packages/llm/README.md @@ -29,10 +29,15 @@ selected by `Config.LlmProvider` (`"openrouter"` default, or fallback chain. The request body includes a `models: [...]` array when the deduplicated chain has ≥2 non-empty entries and always sends `usage: { include: true }` so OpenRouter populates `usage.cost` in - the response. `usage.model` is the actual model the gateway picked. - Tokens come straight from OpenRouter's `usage.prompt_tokens` / - `usage.completion_tokens`; `costUsd` from `usage.cost` (defaults to - `0` when the provider omits it — common for `:free` models). + the response. The body also pins `provider: { allow_fallbacks: false }` + so OpenRouter does not silently cycle across upstream providers of the + same model — a slow or sick provider surfaces a real error to us + instead of consuming the wall-clock budget. Model-level fallback + through the `models` chain is unaffected. `usage.model` is the actual + model the gateway picked. Tokens come straight from OpenRouter's + `usage.prompt_tokens` / `usage.completion_tokens`; `costUsd` from + `usage.cost` (defaults to `0` when the provider omits it — common for + `:free` models). - **Ollama mode** — POST to `${Config.OllamaUrl}/api/chat` with `{ model: Config.OllamaModel, messages, stream: false }`. 
Single model per request — no fallback chain (Ollama does not have a @@ -151,6 +156,15 @@ it. The cost ledger described in [docs/arch.md](../../docs/arch.md) is sees a single `AskLlmResult`. BullMQ's `attempts: 3` wraps the whole call — retries walk the chain again, useful when a transient OpenRouter outage clears between retries. +4a. **No upstream-provider fallback.** Every request carries + `provider: { allow_fallbacks: false }`. This is orthogonal to the + `models` chain in invariant 4 — `models` controls *which model* the + gateway tries; `allow_fallbacks` controls whether OpenRouter routes + to a different upstream backend serving the same model when the first + one stalls. We disable the latter so a slow provider cannot eat the + wall-clock without ever producing tokens; the surfaced error becomes + actionable (specific provider, specific status) instead of a generic + timeout. 5. **Errors are typed, not strings.** `LlmConfigError` carries the exact `bytebell keys set` hint; `LlmError` carries `cause`. 6. **Timeout is enforced.** AbortController fires at `timeoutMs`; the diff --git a/packages/llm/src/README.md b/packages/llm/src/README.md index 61d122a..1b3bba7 100644 --- a/packages/llm/src/README.md +++ b/packages/llm/src/README.md @@ -21,10 +21,13 @@ package-level contract; this file documents how the source tree is split. or `Config.OpenrouterModel` + four fallback slots), caps the chain at 3 entries (OpenRouter's hard limit), POSTs to the chat-completions endpoint with an AbortController timeout, parses the typed - `OpenRouterResponse`, returns the first choice's content. `usage.model` - reflects which model OpenRouter actually routed to. Throws - `LlmConfigError` if the API key resolves to empty, `LlmError` on - timeout / HTTP non-2xx / empty completion. + `OpenRouterResponse`, returns the first choice's content. 
The body + always carries `provider: { allow_fallbacks: false }` so OpenRouter + cannot silently route across upstream providers of the same model; + see `OpenRouterProviderRouting` in `openrouter.ts` and invariant 4a in the + package README. `usage.model` reflects which model OpenRouter actually + routed to. Throws `LlmConfigError` if the API key resolves to empty, + `LlmError` on timeout / HTTP non-2xx / empty completion. - **[ollama.ts](ollama.ts)** — `callOllama` and `resolveOllamaChain`. Single-model per request (Ollama has no fan-out). Reads model from `opts.model ?? Config.OllamaModel`. Ignores `opts.apiKey` (Ollama is diff --git a/packages/llm/src/openrouter.ts b/packages/llm/src/openrouter.ts index 53b48b4..a4f99e7 100644 --- a/packages/llm/src/openrouter.ts +++ b/packages/llm/src/openrouter.ts @@ -20,11 +20,19 @@ interface OpenRouterUsageAccounting { include: true; } +interface OpenRouterProviderRouting { + // `false` pins OpenRouter to the first viable upstream provider. When omitted + // or `true`, OpenRouter silently cycles across providers on slow/failed calls + // and we lose the per-call wall-clock budget before a real error surfaces. + allow_fallbacks: boolean; +} + interface OpenRouterRequest { model: string; models?: string[]; messages: OpenRouterMessage[]; usage: OpenRouterUsageAccounting; + provider: OpenRouterProviderRouting; } interface OpenRouterResponse { @@ -67,10 +75,11 @@ export async function callOpenRouter(prompt: string, opts: AskLlmOptions, timeou messages.push({ role: "user", content: prompt }); const usageAccounting: OpenRouterUsageAccounting = { include: true }; + const providerRouting: OpenRouterProviderRouting = { allow_fallbacks: false }; const body: OpenRouterRequest = cappedChain.length > 1 - ? { model, models: cappedChain, messages, usage: usageAccounting } - : { model, messages, usage: usageAccounting }; + ? 
{ model, models: cappedChain, messages, usage: usageAccounting, provider: providerRouting } + : { model, messages, usage: usageAccounting, provider: providerRouting }; const controller = new AbortController(); const timer = setTimeout(() => controller.abort(), timeoutMs); From 665c4d124f027cb109bb4d42ec593214a1f0810b Mon Sep 17 00:00:00 2001 From: Dead-Bytes <143434285+Dead-Bytes@users.noreply.github.com> Date: Fri, 22 May 2026 14:07:37 +0530 Subject: [PATCH 03/11] refactor: restructure flat-folder phases for improved clarity and performance --- packages/config/src/schema.ts | 7 + packages/ingest-github/README.md | 20 +- packages/ingest-github/src/pipeline/paths.ts | 1 + .../strategies/flat-folder/analyse-changed.ts | 2 +- .../strategies/flat-folder/big-file/README.md | 31 ++- .../src/strategies/flat-folder/index.ts | 64 +++-- .../strategies/flat-folder/phases/README.md | 143 ++++++---- .../flat-folder/phases/analyse-small.ts | 133 +++++++++ .../phases/classify-and-analyse-small.ts | 161 ----------- .../flat-folder/phases/process-big-files.ts | 257 +++++++++++++++++- .../flat-folder/phases/scan-and-classify.ts | 131 +++++++++ .../strategies/flat-folder/scan-manifest.ts | 61 +++++ .../ingest-github/src/types/meta-paths.ts | 1 + packages/types/src/config.ts | 1 + 14 files changed, 763 insertions(+), 250 deletions(-) create mode 100644 packages/ingest-github/src/strategies/flat-folder/phases/analyse-small.ts delete mode 100644 packages/ingest-github/src/strategies/flat-folder/phases/classify-and-analyse-small.ts create mode 100644 packages/ingest-github/src/strategies/flat-folder/phases/scan-and-classify.ts create mode 100644 packages/ingest-github/src/strategies/flat-folder/scan-manifest.ts diff --git a/packages/config/src/schema.ts b/packages/config/src/schema.ts index 63a65d4..77a7468 100644 --- a/packages/config/src/schema.ts +++ b/packages/config/src/schema.ts @@ -41,6 +41,7 @@ export const configSchema = z "big.file.concurrency": 
z.number().int().positive().default(25), "absolute.file.size.cap": z.number().int().positive().default(52428800), "concurrent.workers": z.number().int().positive().default(4), + "llm.concurrency": z.number().int().positive().default(29), "condense.context.limit": z.number().int().positive().default(12000), "condense.prompt.overhead": z.number().int().nonnegative().default(1500), "small.file.dedup.threshold": z.number().int().positive().default(3), @@ -81,6 +82,7 @@ export type ConfigValueMap = { [Config.BigFileConcurrency]: number; [Config.AbsoluteFileSizeCap]: number; [Config.ConcurrentWorkers]: number; + [Config.LlmConcurrency]: number; [Config.CondenseContextLimit]: number; [Config.CondensePromptOverhead]: number; [Config.SmallFileDedupThreshold]: number; @@ -135,6 +137,7 @@ export const HINTS: Readonly> = { [Config.BigFileConcurrency]: "bytebell set big.file.concurrency ", [Config.AbsoluteFileSizeCap]: "bytebell set absolute.file.size.cap ", [Config.ConcurrentWorkers]: "bytebell set concurrent.workers ", + [Config.LlmConcurrency]: "bytebell set llm.concurrency ", [Config.CondenseContextLimit]: "bytebell set condense.context.limit ", [Config.CondensePromptOverhead]: "bytebell set condense.prompt.overhead ", [Config.SmallFileDedupThreshold]: "bytebell set small.file.dedup.threshold ", @@ -195,6 +198,8 @@ export function readField(cfg: BytebellConfig, key: K): Config return cfg["absolute.file.size.cap"] as ConfigValue; case Config.ConcurrentWorkers: return cfg["concurrent.workers"] as ConfigValue; + case Config.LlmConcurrency: + return cfg["llm.concurrency"] as ConfigValue; case Config.CondenseContextLimit: return cfg["condense.context.limit"] as ConfigValue; case Config.CondensePromptOverhead: @@ -264,6 +269,8 @@ export function writeField(cfg: BytebellConfig, key: K, value: return { ...cfg, "absolute.file.size.cap": value as number }; case Config.ConcurrentWorkers: return { ...cfg, "concurrent.workers": value as number }; + case Config.LlmConcurrency: + return { 
...cfg, "llm.concurrency": value as number }; case Config.CondenseContextLimit: return { ...cfg, "condense.context.limit": value as number }; case Config.CondensePromptOverhead: diff --git a/packages/ingest-github/README.md b/packages/ingest-github/README.md index b442726..6073339 100644 --- a/packages/ingest-github/README.md +++ b/packages/ingest-github/README.md @@ -132,14 +132,23 @@ worker hardcodes a single `IngestionStrategy` instance (currently - `:File` graph nodes + `:HAS_FILE` / `:HAS_KEYWORD` / `:HAS_CLASS` / `:HAS_FUNCTION` / `:HAS_IMPORT_INTERNAL` / `:HAS_IMPORT_EXTERNAL` relationships — written via `upsertFileNode` from `@bb/neo4j`. +- `meta-output/scan-manifest.json` — the canonical small/big/oversized + classification produced by Phase 1 (`scanAndClassify`). Per-file entries + carry `tokenCount`, `kind`, and (for big files) `estimatedChunks`. + Phases 2a (small) and 2b (big) consume the manifest in parallel. +- `meta-output/bigFiles.json` — legacy view written alongside the manifest + for the pull-path and backfill phases. The main strategy no longer + consumes it directly. ## Invariants -1. **Sequential per-file processing.** Intentionally degraded; one - `upsertRawFile` per file. The small-file path issues one `askLLM`; - the big-file path issues N (one per chunk) plus condensation calls, - all sequential — no `Promise.all`, no concurrency cap. Revisit when - the latency profile demands it. +1. **Shared LLM concurrency limiter.** The flat-folder strategy + constructs one `withConcurrency(Config.LlmConcurrency)` instance at + entry (default 29). The small-file phase, the big-file chunk phase, + and per-file condense calls all check out from this single pool, so + total in-flight LLM calls is bounded by one knob. The legacy + `processBigFile` driver used by the pull-path still uses its own + per-file pool sized by `Config.BigFileConcurrency`. 2. 
**Clone idempotent.** Re-runs (BullMQ retries) call `git fetch` + `git reset --hard` in the existing dir rather than re-cloning. Tokens are re-injected into the remote URL each time. @@ -179,7 +188,6 @@ worker hardcodes a single `IngestionStrategy` instance (currently - GitHub API streaming mode (always shell-clone) - Default-branch auto-detection (caller supplies `branch`; defaults to `"main"`) -- Concurrency control / parallel file processing - Folder-level summaries / `repoSummary.json` / `flat-folder` strategy - Semantic chunking (`SemanticChunker`) - Per-chunk persistence (we persist only the merged file-level diff --git a/packages/ingest-github/src/pipeline/paths.ts b/packages/ingest-github/src/pipeline/paths.ts index cdddc2f..ac52215 100644 --- a/packages/ingest-github/src/pipeline/paths.ts +++ b/packages/ingest-github/src/pipeline/paths.ts @@ -30,6 +30,7 @@ export function metaPathsFor(knowledgeId: string): MetaPaths { bigFileAnalysisDir: path.join(metaRoot, "big-file-analysis"), bigFileChunksDir: path.join(metaRoot, "big-file-analysis", "chunks"), bigFilesJson: path.join(metaRoot, "bigFiles.json"), + scanManifestJson: path.join(metaRoot, "scan-manifest.json"), repoSummaryJson: path.join(metaRoot, "repo-summary.json"), }; } diff --git a/packages/ingest-github/src/strategies/flat-folder/analyse-changed.ts b/packages/ingest-github/src/strategies/flat-folder/analyse-changed.ts index 982d0a7..17f0125 100644 --- a/packages/ingest-github/src/strategies/flat-folder/analyse-changed.ts +++ b/packages/ingest-github/src/strategies/flat-folder/analyse-changed.ts @@ -39,7 +39,7 @@ export interface AnalyseChangedResult { /** * Pull-time per-file dispatcher. Iterates the changed file set from the - * diff and runs the same per-file work as `classifyAndAnalyseSmall`, but + * diff and runs the same per-file work as `analyseSmallFiles`, but * targeted at known paths rather than a tree walk. 
* * Reads file content through `input.source` (a `SourceReader`) so the diff --git a/packages/ingest-github/src/strategies/flat-folder/big-file/README.md b/packages/ingest-github/src/strategies/flat-folder/big-file/README.md index ba5e5f8..3e4e6ef 100644 --- a/packages/ingest-github/src/strategies/flat-folder/big-file/README.md +++ b/packages/ingest-github/src/strategies/flat-folder/big-file/README.md @@ -42,11 +42,32 @@ sizeBytes, llmCallContext?, progressContext?})`. Sequential per file so long single-file analyses surface as live `PHASE_TICK` envelopes carrying per-chunk progress instead of looking frozen. +## Two callers + +These leaf helpers (`splitFileIntoChunks`, `analyzeChunk`, `condenseChunks`, +the storage / cache primitives) are consumed by **two** drivers: + +- `processBigFile` (`index.ts`) — legacy serial driver. One big file at a + time, chunks-within-file parallel under `Config.BigFileConcurrency`, + followed by a blocking condense. Used today by the pull-path + (`pipeline/pull.ts`) via `processBigFilesQueue` and by the Phase 4 + backfill. +- `analyseBigFiles` (`phases/process-big-files.ts`) — manifest-driven + chunk-task queue used by the main strategy entry. Every chunk of every + big file is an independent task scheduled through a strategy-wide + shared `ConcurrencyLimiter`. As soon as a file's last chunk lands, + that file's `condenseChunks` is scheduled through the same limiter — + multiple condenses run in parallel with chunks of slower files. + Reuses `splitFileIntoChunks`, `analyzeChunk`, `condenseChunks`, and + the storage helpers without modification. + ## Invariants -- One big file at a time. Concurrency lives at the chunk level inside - `processBigFile`, never across files, to bound peak memory. - Every artifact is durable on disk before the next step. The chunk cache - short-circuits on re-runs; the manifest plus condensed JSON are the - Phase 7 graph-store inputs. 
-- Cancellation is checked between chunks (`throwIfCancelled(knowledgeId)`). + short-circuits on re-runs (per-chunk granularity, not per-file); the + manifest plus condensed JSON are the Phase 7 graph-store inputs. +- Cancellation is checked between chunks and before each condense + dispatch (`throwIfCancelled(knowledgeId)`). +- `bigFiles.json` is now a derived view written by `scanAndClassify`. + The main strategy reads it indirectly via the manifest; the legacy + drivers (pull-path + backfill) continue to read it directly. diff --git a/packages/ingest-github/src/strategies/flat-folder/index.ts b/packages/ingest-github/src/strategies/flat-folder/index.ts index 09c03c6..924b26f 100644 --- a/packages/ingest-github/src/strategies/flat-folder/index.ts +++ b/packages/ingest-github/src/strategies/flat-folder/index.ts @@ -1,10 +1,14 @@ +import { Config } from "@bb/types"; +import { getConfigValue } from "@bb/config"; import { logger } from "@bb/logger"; import type { FileAnalyzer } from "#src/types/pipeline.ts"; import type { IngestStrategy, StrategyInput, StrategyResult } from "#src/types/strategy.ts"; import { throwIfCancelled } from "#src/pipeline/cancellation.ts"; import { classifyFailure } from "#src/pipeline/failure-classifier.ts"; -import { classifyAndAnalyseSmall } from "./phases/classify-and-analyse-small.ts"; -import { processBigFilesQueue } from "./phases/process-big-files.ts"; +import { withConcurrency } from "#src/pipeline/concurrency.ts"; +import { scanAndClassify } from "./phases/scan-and-classify.ts"; +import { analyseSmallFiles } from "./phases/analyse-small.ts"; +import { analyseBigFiles } from "./phases/process-big-files.ts"; import { backfillMissingFields } from "./backfill/fields.ts"; import { backfillBigFiles } from "./backfill/big-files.ts"; import { runFolderSummaryPhase } from "./folder-summary.ts"; @@ -28,43 +32,60 @@ export function createFlatFolderStrategy(deps: FlatFolderStrategyDeps): IngestSt const progressContext: ProgressContext = 
progressContextFactory(knowledgeId); try { - progressContext.phaseChanged("file_analysis"); + // Shared LLM limiter — small-file analyses, big-file chunk analyses, + // and per-file condense calls all check out from this single pool. + const llmConcurrency = getConfigValue(Config.LlmConcurrency); + const limiter = withConcurrency(llmConcurrency); + + progressContext.phaseChanged("scan"); + logger.info(`flat-folder: phase1 (scan + classify) starting for ${knowledgeId} limit=${llmConcurrency}`); + throwIfCancelled(knowledgeId); + const scanInput: Parameters[0] = { + knowledgeId, + source, + metaPaths, + progressContext, + }; + if (llmCallContext !== undefined) { + scanInput.llmCallContext = llmCallContext; + } + const { manifest } = await scanAndClassify(scanInput); - logger.info(`flat-folder: phase1 (classify + analyse small) starting for ${knowledgeId}`); + progressContext.phaseChanged("file_analysis"); + logger.info( + `flat-folder: phase2 (analyse small ${manifest.summary.smallCount} + big ${manifest.summary.bigCount}) starting in parallel`, + ); throwIfCancelled(knowledgeId); - const phase1Input: Parameters[0] = { + const smallInput: Parameters[0] = { knowledgeId, + manifest, source, metaPaths, analyzer: deps.fileAnalyzer, + limiter, progressContext, }; if (archiveSink !== undefined) { - phase1Input.archiveSink = archiveSink; + smallInput.archiveSink = archiveSink; } if (llmCallContext !== undefined) { - phase1Input.llmCallContext = llmCallContext; + smallInput.llmCallContext = llmCallContext; } - const phase1 = await classifyAndAnalyseSmall(phase1Input); - let totalInputTokens = phase1.tokenUsage.inputTokens; - let totalOutputTokens = phase1.tokenUsage.outputTokens; - let totalCostUsd = phase1.tokenUsage.costUsd; - - logger.info(`flat-folder: phase2 (process big files) starting`); - throwIfCancelled(knowledgeId); - const phase2Input: Parameters[0] = { + const bigInput: Parameters[0] = { knowledgeId, + manifest, source, metaPaths, + limiter, progressContext, }; 
if (llmCallContext !== undefined) { - phase2Input.llmCallContext = llmCallContext; + bigInput.llmCallContext = llmCallContext; } - const phase2 = await processBigFilesQueue(phase2Input); - totalInputTokens += phase2.tokenUsage.inputTokens; - totalOutputTokens += phase2.tokenUsage.outputTokens; - totalCostUsd += phase2.tokenUsage.costUsd; + const [smallResult, bigResult] = await Promise.all([analyseSmallFiles(smallInput), analyseBigFiles(bigInput)]); + let totalInputTokens = smallResult.tokenUsage.inputTokens + bigResult.tokenUsage.inputTokens; + let totalOutputTokens = smallResult.tokenUsage.outputTokens + bigResult.tokenUsage.outputTokens; + let totalCostUsd = smallResult.tokenUsage.costUsd + bigResult.tokenUsage.costUsd; logger.info(`flat-folder: phase3 (backfill missing fields) starting`); throwIfCancelled(knowledgeId); @@ -121,7 +142,8 @@ export function createFlatFolderStrategy(deps: FlatFolderStrategyDeps): IngestSt progressContext.completed(); return { - filesAnalyzed: phase1.smallFilesAnalysed + phase2.processed + phase2.cached + phase1.oversizedStubs, + filesAnalyzed: + smallResult.smallFilesAnalysed + smallResult.oversizedStubs + bigResult.processed + bigResult.cached, foldersSummarised: phase5.succeeded, repoSummarised, graphNodesWritten: phase7.nodesWritten, diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/README.md b/packages/ingest-github/src/strategies/flat-folder/phases/README.md index f0701a7..e2d218a 100644 --- a/packages/ingest-github/src/strategies/flat-folder/phases/README.md +++ b/packages/ingest-github/src/strategies/flat-folder/phases/README.md @@ -6,35 +6,50 @@ Backfill (Phases 3 and 4) lives in the sibling `backfill/` folder; folder and repo summarisation (Phases 5 and 6) live as `folder-summary.ts` and `repo-summary.ts` at the strategy root. +The strategy constructs a **shared LLM limiter** (`withConcurrency(Config.LlmConcurrency)`, +default 29) once at entry. 
Every LLM call across the small-file phase, +the big-file chunk phase, and per-file condense calls checks out from +the same pool — the single tunable for total in-flight LLM calls. + ## Files -- `classify-and-analyse-small.ts` — Phase 1. - `classifyAndAnalyseSmall({knowledgeId, source, metaPaths, analyzer, -skipDecider?, archiveSink?, llmCallContext?, progressContext?})` walks - `source.scan({ skipDecider, llmCallContext })` and per entry: - - `kind === "oversized"` → write a stub via `buildOversizedStub` + - `saveCondensed`, and append a `too-large` row to `bigFiles.json`. - - token count > `Config.ContextWindowLimit` → buffer a - `context-window-exceeded` row for Phase 2. - - otherwise → run `analyseScannedFile(analyzer, entry)` and persist via - `saveCondensed`, under a `withConcurrency(Config.ConcurrentWorkers)` - limiter so analyses run in parallel. - Cancellation is checked at scan boundaries and inside each task; the - buffered big-file list is flushed via `writeBigFiles` after all tasks - drain. -- `process-big-files.ts` — Phase 2. - `processBigFilesQueue({knowledgeId, source, metaPaths, llmCallContext?, progressContext?})` - reads `bigFiles.json`, skips `too-large` entries (counted as - `skippedOversized`), short-circuits when `inspect` returns `complete` - (counted as `cached`), reads the file via `source.readFile`, and - dispatches `processBigFile` sequentially per file with the per-job - `llmCallContext` threaded through. When `progressContext` is present - this phase opens a fixed-total reporter (`subPhase: "big_files_queue"`, - `total = entries.length`) and increments per entry — including - skipped/cached/failed paths so the percentage never stalls. The same - `progressContext` is forwarded into `processBigFile` so each big file - gets its own per-chunk sub-phase. Cancellation re-throws past the - phase; other errors are logged per file and counted as `failed`. +- `scan-and-classify.ts` — Phase 1. 
`scanAndClassify({knowledgeId, source, +metaPaths, skipDecider?, llmCallContext?, progressContext?})` walks + `source.scan({ skipDecider, llmCallContext })` exactly once, counts + tokens for every eligible entry, classifies each as `"small"`, + `"big"` (token count > `Config.ContextWindowLimit`), or `"oversized"` + (yielded as `kind === "oversized"` by `scanRepository`), and writes + `meta-output/scan-manifest.json` plus the legacy `bigFiles.json` (for + pull-path and backfill consumers that have not migrated). Big entries + get a cheap `estimatedChunks = ceil(tokenCount / Config.MaxTokensPerChunk)` + used by Phase 2b's `big_files_chunks` progress reporter. No LLM calls. No file analysis. +- `analyse-small.ts` — Phase 2a. `analyseSmallFiles({knowledgeId, manifest, +source, metaPaths, analyzer, limiter, archiveSink?, llmCallContext?, +progressContext?})` handles the manifest's `kind === "small"` entries: it + re-reads each file via `source.readFile`, runs the LLM file analyser, + and persists via `saveCondensed`. `kind === "oversized"` entries are also + handled here, as stub writes (no LLM). Every LLM dispatch goes through the + shared `limiter`. Progress is a fixed total — `smallCount + oversizedCount`. +- `process-big-files.ts` — Phase 2b plus the legacy queue. Exports two + functions: + - `analyseBigFiles({knowledgeId, manifest, source, metaPaths, limiter, +llmCallContext?, progressContext?})` — manifest-driven chunk-task + queue. Skips files already complete (per-file chunk manifest + condensed JSON on disk). + For each remaining big file: read content, split into chunks + via `splitFileIntoChunks`, register a per-file `pendingChunks` + counter. Every chunk becomes an independent task scheduled through + the shared limiter: cache-check via `loadChunkIfPresent`, otherwise + `analyzeChunk` + `saveChunk`. When a file's last chunk lands, that + file's condense is **immediately** scheduled through the same + limiter — condenses across multiple files run in parallel with + chunks of slower files. 
Two fixed-total progress sub-phases: + `"big_files_chunks"` (sum of `estimatedChunks`) and + `"big_files_condense"` (`bigCount`). + - `processBigFilesQueue({knowledgeId, source, metaPaths, llmCallContext?, +progressContext?})` — legacy serial driver kept for the pull-path + (`pipeline/pull.ts`) and any caller that has not migrated to + `analyseBigFiles(manifest, …)`. Reads `bigFiles.json`, dispatches + `processBigFile` once per file in a `for` loop. - `store-flat-analysis.ts` — Phase 7. `storeFlatAnalysis({scope, payload, branch, metaPaths})` ensures `flat-folder` Neo4j indexes, upserts `:Repo` (from `repo-summary.json` @@ -45,45 +60,69 @@ skipDecider?, archiveSink?, llmCallContext?, progressContext?})` walks `:Folder` so the `CONTAINS` edge always lands. `languageFromPath` fills `language` when the analysis left it blank. +## Execution order + +``` +scanAndClassify + ↓ (manifest in-memory + on disk) +┌── analyseSmallFiles ──┐ +│ │ (Promise.all, share one limiter) +└── analyseBigFiles ────┘ + ↓ +backfillMissingFields → backfillBigFiles → folderSummary → repoSummary → storeFlatAnalysis +``` + ## Public interfaces -- `classifyAndAnalyseSmall(input): Promise` — - `{ smallFilesAnalysed, bigFilesQueued, oversizedStubs, failed }`. - `input.progressContext?` opens a growing-total reporter - (`source.scan` size is not known up front); `incrementSeen()` fires per - scan yield and `increment()` fires per persisted entry. -- `processBigFilesQueue(input): Promise` — - `{ processed, cached, failed, skippedOversized }`. `input.progressContext?` - opens a fixed-total reporter sized by `bigFiles.json` and forwards - itself into the per-file `processBigFile` call. +- `scanAndClassify(input): Promise` — + `{ manifest }`. The manifest contains every eligible file plus a + `summary` with `totalFiles`, `smallCount`, `bigCount`, `oversizedCount`, + `totalTokens`, `estimatedBigChunks`. 
+- `analyseSmallFiles(input): Promise<AnalyseSmallResult>` — + `{ smallFilesAnalysed, oversizedStubs, failed, tokenUsage }`. + Progress: fixed-total reporter sized by `smallCount + oversizedCount`. +- `analyseBigFiles(input): Promise<ProcessBigFilesResult>` — + `{ processed, cached, failed, skippedOversized, tokenUsage }`. + Progress: two fixed-total reporters — one for chunks across all + big files, one for per-file condenses. +- `processBigFilesQueue(input): Promise<ProcessBigFilesResult>` — same + result shape; legacy driver used by the pull path. - `storeFlatAnalysis(input): Promise` — `{ nodesWritten, foldersWritten, filesWritten }`. -Each phase returns its own counter shape; the strategy aggregates them -into `FlatFolderResult`. - ## Data ownership -- Phase 1 writes condensed JSON (small files + oversized stubs) and - `bigFiles.json`. -- Phase 2 writes chunk artifacts, the chunk manifest, and condensed JSON - for big files via `processBigFile`. -- Phase 7 owns no disk artifacts. It reads the on-disk state produced by +- Phase 1 writes `scan-manifest.json` (canonical) and `bigFiles.json` + (legacy view for backfill + pull). It does not write per-file + analyses. +- Phase 2a writes condensed JSON for small files + oversized stubs. +- Phase 2b writes per-chunk JSON (`chunks/<encoded-path>/chunk-N.json`), + per-file chunk manifests (`.manifest.json`), and condensed JSON + for big files. +- Phase 7 owns no disk artifacts. It reads on-disk state produced by Phases 1–6 and writes Neo4j nodes (`:Repo`, `:Folder`, `:File`) plus the `CONTAINS` edge. ## Invariants - Disk is the inter-phase contract; nothing crosses a phase boundary in - memory. + memory (except the in-memory manifest object that scan returns directly + to the orchestrator, which is a convenience — the canonical copy on + disk is what later resume/backfill runs read). - `throwIfCancelled(knowledgeId)` runs at every scan boundary, every - big-file boundary, and before each Neo4j upsert in Phase 7.
-- Per-file LLM or I/O failures are logged and counted; phases do not - abort on a single bad file. Only `CancellationError` propagates. + per-chunk and per-file dispatch boundary, and before each Neo4j + upsert in Phase 7. +- Per-file LLM/I/O failures are logged and counted; per-chunk failures are logged but not counted, and the file is still condensed from the chunks that succeeded. Phases + do not abort on a single bad file. Only `CancellationError`, + `LlmConfigError`, and `LlmError` propagate. +- The shared LLM limiter is the only place LLM concurrency is bounded + during the small/big phases. `Config.BigFileConcurrency` is no longer + consulted from the chunk-queue path (it is still consulted by the + legacy `processBigFile` used by the pull-path driver). +- Phase 1 respects `Config.ContextWindowLimit` and + `Config.MaxTokensPerChunk`; do not hardcode either. - Phase 7 always emits a `:Repo` node, even when `repo-summary.json` is absent (logged as a `phase7` warning). -- Phase 1 respects `Config.ContextWindowLimit` and - `Config.ConcurrentWorkers`; do not hardcode either. ## External dependencies @@ -92,8 +131,8 @@ into `FlatFolderResult`. `upsertRepoNode`, `upsertFolderNode`, `upsertFileNode`, `NodeScope`), `pipeline/scan.ts`, `pipeline/concurrency.ts`, `pipeline/cancellation.ts`, and the sibling `flat-folder/{analyse-file, big-file, folder-summary, -folder-path}` modules plus `adapters/llm-file-analyzer.ts` -(`languageFromPath`). +folder-path, scan-manifest}` modules plus +`adapters/llm-file-analyzer.ts` (`languageFromPath`).
## Tier diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/analyse-small.ts b/packages/ingest-github/src/strategies/flat-folder/phases/analyse-small.ts new file mode 100644 index 0000000..5176f7f --- /dev/null +++ b/packages/ingest-github/src/strategies/flat-folder/phases/analyse-small.ts @@ -0,0 +1,133 @@ +import { logger } from "@bb/logger"; +import type { AskLlmOptions } from "@bb/llm"; +import { LlmConfigError, LlmError } from "@bb/errors"; +import type { ArchiveSink, FileAnalyzer, ScannedFile, SourceReader } from "#src/types/pipeline.ts"; +import type { MetaPaths } from "#src/types/meta-paths.ts"; +import type { ProgressContext } from "#src/progress/types.ts"; +import type { ConcurrencyLimiter } from "#src/pipeline/concurrency.ts"; +import { throwIfCancelled, CancellationError } from "#src/pipeline/cancellation.ts"; +import { analyseScannedFile, buildOversizedStub } from "#src/strategies/flat-folder/analyse-file.ts"; +import { saveCondensed } from "#src/strategies/flat-folder/big-file/storage.ts"; +import type { ScanManifest } from "#src/strategies/flat-folder/scan-manifest.ts"; + +export interface AnalyseSmallInput { + knowledgeId: string; + manifest: ScanManifest; + source: SourceReader; + metaPaths: MetaPaths; + analyzer: FileAnalyzer; + limiter: ConcurrencyLimiter; + archiveSink?: ArchiveSink; + llmCallContext?: AskLlmOptions; + progressContext?: ProgressContext; +} + +export interface AnalyseSmallResult { + smallFilesAnalysed: number; + oversizedStubs: number; + failed: number; + tokenUsage: { inputTokens: number; outputTokens: number; costUsd: number }; +} + +/** + * Consumes the `scan-manifest.json` produced by `scanAndClassify` and + * analyses every `kind: "small"` entry through the shared LLM limiter. + * + * Oversized stubs are also written here (they don't go through the LLM but + * still need a placeholder analysis row on disk so downstream phases see a + * complete file set). 
+ */ +export async function analyseSmallFiles(input: AnalyseSmallInput): Promise { + const smallEntries = input.manifest.entries.filter((e) => e.kind === "small"); + const oversizedEntries = input.manifest.entries.filter((e) => e.kind === "oversized"); + + let smallFilesAnalysed = 0; + let oversizedStubs = 0; + let failed = 0; + let totalInputTokens = 0; + let totalOutputTokens = 0; + let totalCostUsd = 0; + + const reporter = input.progressContext?.reporter({ + phase: "file_analysis", + subPhase: "analyse_small", + total: { kind: "fixed", total: smallEntries.length + oversizedEntries.length }, + }); + await reporter?.start(); + + try { + for (const entry of oversizedEntries) { + throwIfCancelled(input.knowledgeId); + try { + await saveCondensed(input.metaPaths, buildOversizedStub(entry.relativePath, entry.sizeBytes)); + oversizedStubs += 1; + } catch (cause: unknown) { + failed += 1; + logger.warn(`analyse-small: oversized stub write failed for ${entry.relativePath}: ${describe(cause)}`); + } + reporter?.increment(1, { fileName: entry.relativePath }); + } + + const pending: Promise[] = []; + for (const entry of smallEntries) { + pending.push( + input.limiter(async () => { + throwIfCancelled(input.knowledgeId); + try { + const content = await input.source.readFile(entry.relativePath); + const scanned: ScannedFile = { + kind: "file", + relativePath: entry.relativePath, + absolutePath: entry.absolutePath, + sizeBytes: entry.sizeBytes, + content, + }; + const condensed = await analyseScannedFile(input.analyzer, scanned, input.llmCallContext); + await saveCondensed(input.metaPaths, condensed); + if (input.archiveSink !== undefined) { + await input.archiveSink.push({ + knowledgeId: input.knowledgeId, + relativePath: entry.relativePath, + content, + }); + } + if (condensed.tokenUsage) { + totalInputTokens += condensed.tokenUsage.inputTokens; + totalOutputTokens += condensed.tokenUsage.outputTokens; + totalCostUsd += condensed.tokenUsage.costUsd; + } + smallFilesAnalysed 
+= 1; + reporter?.increment(1, { fileName: entry.relativePath }); + } catch (cause: unknown) { + if (cause instanceof CancellationError) { + throw cause; + } + if (cause instanceof LlmConfigError || cause instanceof LlmError) { + throw cause; + } + failed += 1; + logger.warn(`analyse-small: analyse failed for ${entry.relativePath}: ${describe(cause)}`); + reporter?.increment(1, { fileName: entry.relativePath }); + } + }), + ); + } + await Promise.all(pending); + } finally { + reporter?.stop(); + } + + logger.info( + `analyse-small done: smallFilesAnalysed=${smallFilesAnalysed} oversizedStubs=${oversizedStubs} failed=${failed}`, + ); + return { + smallFilesAnalysed, + oversizedStubs, + failed, + tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens, costUsd: totalCostUsd }, + }; +} + +function describe(cause: unknown): string { + return cause instanceof Error ? cause.message : String(cause); +} diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/classify-and-analyse-small.ts b/packages/ingest-github/src/strategies/flat-folder/phases/classify-and-analyse-small.ts deleted file mode 100644 index a9ad59a..0000000 --- a/packages/ingest-github/src/strategies/flat-folder/phases/classify-and-analyse-small.ts +++ /dev/null @@ -1,161 +0,0 @@ -import path from "node:path"; -import { tokenLen, type AskLlmOptions } from "@bb/llm"; -import { LlmConfigError, LlmError } from "@bb/errors"; -import { logger } from "@bb/logger"; -import { Config } from "@bb/types"; -import { getConfigValue } from "@bb/config"; -import type { ArchiveSink, FileAnalyzer, SkipDecider, SourceReader } from "#src/types/pipeline.ts"; -import type { MetaPaths } from "#src/types/meta-paths.ts"; -import type { BigFileEntry } from "#src/types/big-file.ts"; -import type { ProgressContext } from "#src/progress/types.ts"; -import { withConcurrency } from "#src/pipeline/concurrency.ts"; -import { throwIfCancelled, CancellationError } from "#src/pipeline/cancellation.ts"; 
-import { makeSkipDecider } from "#src/pipeline/skip-decisions/index.ts"; -import { analyseScannedFile, buildOversizedStub } from "#src/strategies/flat-folder/analyse-file.ts"; -import { saveCondensed } from "#src/strategies/flat-folder/big-file/storage.ts"; -import { writeBigFiles } from "#src/strategies/flat-folder/big-file/detector.ts"; - -export interface ClassifyPhaseInput { - knowledgeId: string; - source: SourceReader; - metaPaths: MetaPaths; - analyzer: FileAnalyzer; - skipDecider?: SkipDecider; - archiveSink?: ArchiveSink; - llmCallContext?: AskLlmOptions; - progressContext?: ProgressContext; -} - -export interface ClassifyPhaseResult { - smallFilesAnalysed: number; - bigFilesQueued: number; - oversizedStubs: number; - failed: number; - tokenUsage: { inputTokens: number; outputTokens: number; costUsd: number }; -} - -export async function classifyAndAnalyseSmall(input: ClassifyPhaseInput): Promise { - const contextWindowLimit = getConfigValue(Config.ContextWindowLimit); - const concurrentWorkers = getConfigValue(Config.ConcurrentWorkers); - const limit = withConcurrency(concurrentWorkers); - const bigFileBuffer: BigFileEntry[] = []; - let smallFilesAnalysed = 0; - let oversizedStubs = 0; - let failed = 0; - let totalInputTokens = 0; - let totalOutputTokens = 0; - let totalCostUsd = 0; - - const repositoryHint = - input.source.localRepoDir.length > 0 ? path.basename(input.source.localRepoDir) : input.knowledgeId; - const skipDecider = input.skipDecider ?? 
makeSkipDecider({ repositoryName: repositoryHint }); - - const pending: Promise[] = []; - - const reporter = input.progressContext?.reporter({ - phase: "file_analysis", - total: { kind: "growing" }, - }); - await reporter?.start(); - - try { - const scanDeps: Parameters[0] = { skipDecider }; - if (input.llmCallContext !== undefined) { - scanDeps.llmCallContext = input.llmCallContext; - } - for await (const entry of input.source.scan(scanDeps)) { - throwIfCancelled(input.knowledgeId); - reporter?.incrementSeen(); - - if (entry.kind === "oversized") { - bigFileBuffer.push({ - relativePath: entry.relativePath, - sizeBytes: entry.sizeBytes, - tokenCount: 0, - reason: "too-large", - }); - try { - await saveCondensed(input.metaPaths, buildOversizedStub(entry.relativePath, entry.sizeBytes)); - oversizedStubs += 1; - reporter?.increment(1, { fileName: entry.relativePath }); - } catch (cause: unknown) { - failed += 1; - logger.warn(`phase1: oversized stub write failed for ${entry.relativePath}: ${describe(cause)}`); - } - continue; - } - - const tokenCount = tokenLen(entry.content); - if (tokenCount > contextWindowLimit) { - bigFileBuffer.push({ - relativePath: entry.relativePath, - sizeBytes: entry.sizeBytes, - tokenCount, - reason: "context-window-exceeded", - }); - // Big files are accounted for here; phase 2 has its own reporter. 
- reporter?.increment(1, { fileName: entry.relativePath }); - continue; - } - - const fileContent = entry.content; - const filePath = entry.relativePath; - pending.push( - limit(async () => { - try { - throwIfCancelled(input.knowledgeId); - const condensed = await analyseScannedFile(input.analyzer, entry, input.llmCallContext); - await saveCondensed(input.metaPaths, condensed); - if (input.archiveSink !== undefined) { - await input.archiveSink.push({ - knowledgeId: input.knowledgeId, - relativePath: filePath, - content: fileContent, - }); - } - if (condensed.tokenUsage) { - totalInputTokens += condensed.tokenUsage.inputTokens; - totalOutputTokens += condensed.tokenUsage.outputTokens; - totalCostUsd += condensed.tokenUsage.costUsd; - } - smallFilesAnalysed += 1; - reporter?.increment(1, { fileName: filePath }); - } catch (cause: unknown) { - if (cause instanceof CancellationError) { - throw cause; - } - if (cause instanceof LlmConfigError || cause instanceof LlmError) { - // LLM unreachable — bail the whole job, don't keep iterating - // over the rest of the files producing the same failure. - throw cause; - } - failed += 1; - logger.warn(`phase1: analyse failed for ${entry.relativePath}: ${describe(cause)}`); - reporter?.increment(1, { fileName: filePath }); - } - }), - ); - } - - await Promise.all(pending); - - await writeBigFiles(input.metaPaths, bigFileBuffer); - } finally { - reporter?.stop(); - } - - logger.info( - `phase1 done: smallFilesAnalysed=${smallFilesAnalysed} bigFilesQueued=${bigFileBuffer.filter((e) => e.reason === "context-window-exceeded").length} oversizedStubs=${oversizedStubs} failed=${failed}`, - ); - return { - smallFilesAnalysed, - bigFilesQueued: bigFileBuffer.filter((e) => e.reason === "context-window-exceeded").length, - oversizedStubs, - failed, - tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens, costUsd: totalCostUsd }, - }; -} - -function describe(cause: unknown): string { - return cause instanceof Error ? 
cause.message : String(cause); -} diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts b/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts index 1197753..70d5102 100644 --- a/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts +++ b/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts @@ -1,13 +1,24 @@ +import { createHash } from "node:crypto"; import { logger } from "@bb/logger"; +import { Config } from "@bb/types"; +import { getConfigValue } from "@bb/config"; import type { AskLlmOptions } from "@bb/llm"; import { LlmConfigError, LlmError } from "@bb/errors"; import type { MetaPaths } from "#src/types/meta-paths.ts"; import type { SourceReader } from "#src/types/pipeline.ts"; import type { ProgressContext } from "#src/progress/types.ts"; +import type { ConcurrencyLimiter } from "#src/pipeline/concurrency.ts"; +import type { ChunkAnalysisResult, FileChunk, HugeFileManifest } from "#src/types/big-file.ts"; +import type { CondensedFileAnalysis } from "#src/types/condensed-file-analysis.ts"; import { throwIfCancelled, CancellationError } from "#src/pipeline/cancellation.ts"; import { readBigFiles } from "#src/strategies/flat-folder/big-file/detector.ts"; import { inspect } from "#src/strategies/flat-folder/big-file/cache.ts"; +import { splitFileIntoChunks } from "#src/strategies/flat-folder/big-file/chunker.ts"; +import { analyzeChunk } from "#src/strategies/flat-folder/big-file/chunk-analyzer.ts"; +import { condenseChunks } from "#src/strategies/flat-folder/big-file/condenser.ts"; +import { loadChunkIfPresent, saveChunk, saveCondensed, saveManifest } from "#src/strategies/flat-folder/big-file/storage.ts"; import { processBigFile } from "#src/strategies/flat-folder/big-file/index.ts"; +import type { ScanManifest, ScanManifestEntry } from "#src/strategies/flat-folder/scan-manifest.ts"; export interface ProcessBigFilesInput { knowledgeId: string; @@ 
-25,6 +36,12 @@ export interface ProcessBigFilesResult { tokenUsage: { inputTokens: number; outputTokens: number; costUsd: number }; } +/** + * Legacy big-file driver. Reads the deprecated `bigFiles.json`, processes + * each entry serially via `processBigFile` (which internally does + * chunk-then-condense). Kept for the pull-path (`pipeline/pull.ts`) and any + * caller that has not migrated to `analyseBigFiles(manifest, …)` yet. + */ export async function processBigFilesQueue(input: ProcessBigFilesInput): Promise { const entries = await readBigFiles(input.metaPaths); let processed = 0; @@ -61,13 +78,13 @@ export async function processBigFilesQueue(input: ProcessBigFilesInput): Promise content = await input.source.readFile(entry.relativePath); } catch (cause: unknown) { failed += 1; - logger.warn(`phase2: read failed for ${entry.relativePath}: ${describe(cause)}`); + logger.warn(`big-files-queue: read failed for ${entry.relativePath}: ${describe(cause)}`); reporter?.increment(1, { fileName: entry.relativePath }); continue; } if (content.length === 0) { failed += 1; - logger.warn(`phase2: empty content for ${entry.relativePath}; skipping`); + logger.warn(`big-files-queue: empty content for ${entry.relativePath}; skipping`); reporter?.increment(1, { fileName: entry.relativePath }); continue; } @@ -95,12 +112,12 @@ export async function processBigFilesQueue(input: ProcessBigFilesInput): Promise throw cause; } failed += 1; - logger.warn(`phase2: processBigFile failed for ${entry.relativePath}: ${describe(cause)}`); + logger.warn(`big-files-queue: processBigFile failed for ${entry.relativePath}: ${describe(cause)}`); } reporter?.increment(1, { fileName: entry.relativePath }); } logger.info( - `phase2 done: processed=${processed} cached=${cached} failed=${failed} skippedOversized=${skippedOversized}`, + `big-files-queue done: processed=${processed} cached=${cached} failed=${failed} skippedOversized=${skippedOversized}`, ); return { processed, @@ -114,6 +131,238 @@ export 
async function processBigFilesQueue(input: ProcessBigFilesInput): Promise } } +// --------------------------------------------------------------------------- +// Chunk-queue model (manifest-driven) +// --------------------------------------------------------------------------- + +export interface AnalyseBigFilesInput { + knowledgeId: string; + manifest: ScanManifest; + source: SourceReader; + metaPaths: MetaPaths; + limiter: ConcurrencyLimiter; + llmCallContext?: AskLlmOptions; + progressContext?: ProgressContext; +} + +interface BigFileState { + entry: ScanManifestEntry; + content: string; + chunks: FileChunk[]; + results: (ChunkAnalysisResult | undefined)[]; + pendingChunks: number; + fatal: boolean; +} + +/** + * Manifest-driven big-file phase. Every chunk of every big file is an + * independent task scheduled through the shared LLM limiter. As soon as the + * last chunk of a given file lands, that file's condense is scheduled — + * multiple condenses run in parallel with the still-pending chunks of slower + * files. All LLM calls (chunk + condense) check out from the same limiter. + * + * Files already fully processed (manifest + condensed on disk) are skipped. + */ +export async function analyseBigFiles(input: AnalyseBigFilesInput): Promise { + const maxTokensPerChunk = getConfigValue(Config.MaxTokensPerChunk); + const bigEntries = input.manifest.entries.filter((e) => e.kind === "big"); + + let cached = 0; + let skippedOversized = 0; + let failed = 0; + let processed = 0; + let totalInputTokens = 0; + let totalOutputTokens = 0; + let totalCostUsd = 0; + + // Per-file preparation: read content, chunk, record state. Sequential and + // cheap — no LLM calls here. 
+ const states: BigFileState[] = []; + for (const entry of bigEntries) { + throwIfCancelled(input.knowledgeId); + const status = await inspect(input.metaPaths, entry.relativePath); + if (status === "complete") { + cached += 1; + continue; + } + let content: string; + try { + content = await input.source.readFile(entry.relativePath); + } catch (cause: unknown) { + failed += 1; + logger.warn(`analyse-big: read failed for ${entry.relativePath}: ${describe(cause)}`); + continue; + } + if (content.length === 0) { + failed += 1; + logger.warn(`analyse-big: empty content for ${entry.relativePath}; skipping`); + continue; + } + const chunks = splitFileIntoChunks(entry.relativePath, content, maxTokensPerChunk); + states.push({ + entry, + content, + chunks, + results: new Array(chunks.length), + pendingChunks: chunks.length, + fatal: false, + }); + logger.info(`analyse-big: ${entry.relativePath} split into ${chunks.length} chunks`); + } + + const totalChunks = states.reduce((acc, s) => acc + s.chunks.length, 0); + const chunkReporter = input.progressContext?.reporter({ + phase: "file_analysis", + subPhase: "big_files_chunks", + total: { kind: "fixed", total: totalChunks }, + }); + await chunkReporter?.start(); + const condenseReporter = input.progressContext?.reporter({ + phase: "file_analysis", + subPhase: "big_files_condense", + total: { kind: "fixed", total: states.length }, + }); + await condenseReporter?.start(); + + // For oversized entries the legacy phase counted them; we accept the manifest + // already accounted for them via the small phase (which writes the stub). + // Surfaced here for parity with the legacy result shape. 
+ skippedOversized = input.manifest.entries.filter((e) => e.kind === "oversized").length; + + const condensePromises: Promise[] = []; + + function maybeScheduleCondense(state: BigFileState): void { + if (state.pendingChunks > 0 || state.fatal) { + return; + } + const definedResults = state.results.filter((r): r is ChunkAnalysisResult => r !== undefined); + condensePromises.push( + input.limiter(async () => { + throwIfCancelled(input.knowledgeId); + try { + const merged = await condenseChunks(state.entry.relativePath, definedResults, input.llmCallContext); + + const chunkInputTokens = definedResults.reduce((acc, r) => acc + (r.tokenUsage?.inputTokens ?? 0), 0); + const chunkOutputTokens = definedResults.reduce((acc, r) => acc + (r.tokenUsage?.outputTokens ?? 0), 0); + const chunkCostUsd = definedResults.reduce((acc, r) => acc + (r.tokenUsage?.costUsd ?? 0), 0); + const totalTokenCount = state.chunks.reduce((acc, c) => acc + c.tokenCount, 0); + const totalIn = chunkInputTokens + (merged.tokenUsage?.inputTokens ?? 0); + const totalOut = chunkOutputTokens + (merged.tokenUsage?.outputTokens ?? 0); + const totalCost = chunkCostUsd + (merged.tokenUsage?.costUsd ?? 
0); + + const manifest: HugeFileManifest = { + relativePath: state.entry.relativePath, + totalChunks: state.chunks.length, + totalTokenCount, + chunkPaths: state.chunks.map((_, i) => `chunks/${encodeFolder(state.entry.relativePath)}/chunk-${i}.json`), + generatedAt: new Date().toISOString(), + }; + await saveManifest(input.metaPaths, manifest); + + const condensed: CondensedFileAnalysis = { + relativePath: state.entry.relativePath, + language: merged.language, + sha256: sha256(state.content), + sizeBytes: state.entry.sizeBytes, + tokenCount: totalTokenCount, + isBigFile: true, + totalChunks: state.chunks.length, + totalTokenCount, + analysedAt: new Date().toISOString(), + analysis: merged.analysis, + tokenUsage: { inputTokens: totalIn, outputTokens: totalOut, costUsd: totalCost }, + }; + await saveCondensed(input.metaPaths, condensed); + + totalInputTokens += totalIn; + totalOutputTokens += totalOut; + totalCostUsd += totalCost; + processed += 1; + } catch (cause: unknown) { + if (cause instanceof CancellationError) { + throw cause; + } + if (cause instanceof LlmConfigError || cause instanceof LlmError) { + throw cause; + } + failed += 1; + logger.warn(`analyse-big: condense failed for ${state.entry.relativePath}: ${describe(cause)}`); + } finally { + condenseReporter?.increment(1, { fileName: state.entry.relativePath }); + } + }), + ); + } + + const chunkPromises: Promise[] = []; + for (const state of states) { + for (let i = 0; i < state.chunks.length; i += 1) { + const idx = i; + const chunk = state.chunks[idx]; + if (chunk === undefined) { + continue; + } + chunkPromises.push( + input.limiter(async () => { + throwIfCancelled(input.knowledgeId); + try { + const cachedChunk = await loadChunkIfPresent(input.metaPaths, state.entry.relativePath, idx); + if (cachedChunk !== null) { + state.results[idx] = cachedChunk; + } else { + const analyzed = await analyzeChunk(chunk, input.llmCallContext); + await saveChunk(input.metaPaths, analyzed); + state.results[idx] = 
analyzed; + } + } catch (cause: unknown) { + if (cause instanceof CancellationError) { + throw cause; + } + if (cause instanceof LlmConfigError || cause instanceof LlmError) { + state.fatal = true; + throw cause; + } + logger.warn( + `analyse-big: chunk ${idx + 1}/${state.chunks.length} failed for ${state.entry.relativePath}: ${describe(cause)}`, + ); + } finally { + state.pendingChunks -= 1; + chunkReporter?.increment(1, { fileName: `${state.entry.relativePath}#chunk-${String(idx)}` }); + maybeScheduleCondense(state); + } + }), + ); + } + } + + try { + await Promise.all(chunkPromises); + await Promise.all(condensePromises); + } finally { + chunkReporter?.stop(); + condenseReporter?.stop(); + } + + logger.info( + `analyse-big done: processed=${processed} cached=${cached} failed=${failed} skippedOversized=${skippedOversized}`, + ); + return { + processed, + cached, + failed, + skippedOversized, + tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens, costUsd: totalCostUsd }, + }; +} + +function sha256(content: string): string { + return createHash("sha256").update(content).digest("hex"); +} + +function encodeFolder(relativePath: string): string { + return relativePath.replace(/\//gu, "__SL__").replace(/\\/gu, "__BS__"); +} + function describe(cause: unknown): string { return cause instanceof Error ? 
cause.message : String(cause); } diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/scan-and-classify.ts b/packages/ingest-github/src/strategies/flat-folder/phases/scan-and-classify.ts new file mode 100644 index 0000000..786c9b0 --- /dev/null +++ b/packages/ingest-github/src/strategies/flat-folder/phases/scan-and-classify.ts @@ -0,0 +1,131 @@ +import path from "node:path"; +import { Config } from "@bb/types"; +import { getConfigValue } from "@bb/config"; +import { logger } from "@bb/logger"; +import type { AskLlmOptions } from "@bb/llm"; +import type { MetaPaths } from "#src/types/meta-paths.ts"; +import type { BigFileEntry } from "#src/types/big-file.ts"; +import type { SkipDecider, SourceReader } from "#src/types/pipeline.ts"; +import type { ProgressContext } from "#src/progress/types.ts"; +import { throwIfCancelled } from "#src/pipeline/cancellation.ts"; +import { makeSkipDecider } from "#src/pipeline/skip-decisions/index.ts"; +import { classifyByTokens, writeBigFiles } from "#src/strategies/flat-folder/big-file/detector.ts"; +import { + emptyManifest, + writeScanManifest, + type ScanManifest, + type ScanManifestEntry, +} from "#src/strategies/flat-folder/scan-manifest.ts"; + +export interface ScanAndClassifyInput { + knowledgeId: string; + source: SourceReader; + metaPaths: MetaPaths; + skipDecider?: SkipDecider; + llmCallContext?: AskLlmOptions; + progressContext?: ProgressContext; +} + +export interface ScanAndClassifyResult { + manifest: ScanManifest; +} + +/** + * Walks the repo once, classifies every eligible file as small / big / + * oversized by token count, and writes `scan-manifest.json`. The downstream + * small-file and big-file phases consume the manifest instead of re-walking. + * + * Also writes the legacy `bigFiles.json` so the pull-path and backfill phases + * (which still read it directly) keep working without migration. 
+ */ +export async function scanAndClassify(input: ScanAndClassifyInput): Promise { + const contextWindowLimit = getConfigValue(Config.ContextWindowLimit); + const maxTokensPerChunk = getConfigValue(Config.MaxTokensPerChunk); + const manifest = emptyManifest(); + const bigFileEntries: BigFileEntry[] = []; + + const repositoryHint = + input.source.localRepoDir.length > 0 ? path.basename(input.source.localRepoDir) : input.knowledgeId; + const skipDecider = input.skipDecider ?? makeSkipDecider({ repositoryName: repositoryHint }); + + const reporter = input.progressContext?.reporter({ + phase: "scan", + total: { kind: "growing" }, + }); + await reporter?.start(); + + try { + const scanDeps: Parameters[0] = { skipDecider }; + if (input.llmCallContext !== undefined) { + scanDeps.llmCallContext = input.llmCallContext; + } + + for await (const entry of input.source.scan(scanDeps)) { + throwIfCancelled(input.knowledgeId); + reporter?.incrementSeen(); + + if (entry.kind === "oversized") { + const manifestEntry: ScanManifestEntry = { + relativePath: entry.relativePath, + absolutePath: entry.absolutePath, + sizeBytes: entry.sizeBytes, + tokenCount: 0, + kind: "oversized", + }; + manifest.entries.push(manifestEntry); + manifest.summary.oversizedCount += 1; + manifest.summary.totalFiles += 1; + bigFileEntries.push({ + relativePath: entry.relativePath, + sizeBytes: entry.sizeBytes, + tokenCount: 0, + reason: "too-large", + }); + reporter?.increment(1, { fileName: entry.relativePath }); + continue; + } + + const { tokenCount, isBigFile } = classifyByTokens(entry.content, contextWindowLimit); + manifest.summary.totalFiles += 1; + manifest.summary.totalTokens += tokenCount; + if (isBigFile) { + const estimatedChunks = Math.max(1, Math.ceil(tokenCount / maxTokensPerChunk)); + manifest.entries.push({ + relativePath: entry.relativePath, + absolutePath: entry.absolutePath, + sizeBytes: entry.sizeBytes, + tokenCount, + kind: "big", + estimatedChunks, + }); + manifest.summary.bigCount += 
1; + manifest.summary.estimatedBigChunks += estimatedChunks; + bigFileEntries.push({ + relativePath: entry.relativePath, + sizeBytes: entry.sizeBytes, + tokenCount, + reason: "context-window-exceeded", + }); + } else { + manifest.entries.push({ + relativePath: entry.relativePath, + absolutePath: entry.absolutePath, + sizeBytes: entry.sizeBytes, + tokenCount, + kind: "small", + }); + manifest.summary.smallCount += 1; + } + reporter?.increment(1, { fileName: entry.relativePath }); + } + } finally { + reporter?.stop(); + } + + await writeScanManifest(input.metaPaths, manifest); + await writeBigFiles(input.metaPaths, bigFileEntries); + logger.info( + `scan-and-classify done: total=${manifest.summary.totalFiles} small=${manifest.summary.smallCount} big=${manifest.summary.bigCount} oversized=${manifest.summary.oversizedCount} totalTokens=${manifest.summary.totalTokens} estimatedBigChunks=${manifest.summary.estimatedBigChunks}`, + ); + return { manifest }; +} diff --git a/packages/ingest-github/src/strategies/flat-folder/scan-manifest.ts b/packages/ingest-github/src/strategies/flat-folder/scan-manifest.ts new file mode 100644 index 0000000..5caee3b --- /dev/null +++ b/packages/ingest-github/src/strategies/flat-folder/scan-manifest.ts @@ -0,0 +1,61 @@ +import { readFile, writeFile } from "node:fs/promises"; +import type { MetaPaths } from "#src/types/meta-paths.ts"; + +export type ScanEntryKind = "small" | "big" | "oversized"; + +export interface ScanManifestEntry { + relativePath: string; + absolutePath: string; + sizeBytes: number; + tokenCount: number; + kind: ScanEntryKind; + estimatedChunks?: number; +} + +export interface ScanManifestSummary { + totalFiles: number; + smallCount: number; + bigCount: number; + oversizedCount: number; + totalTokens: number; + estimatedBigChunks: number; +} + +export interface ScanManifest { + generatedAt: string; + summary: ScanManifestSummary; + entries: ScanManifestEntry[]; +} + +export function emptyManifest(): ScanManifest { + 
return { + generatedAt: new Date().toISOString(), + summary: { totalFiles: 0, smallCount: 0, bigCount: 0, oversizedCount: 0, totalTokens: 0, estimatedBigChunks: 0 }, + entries: [], + }; +} + +export async function writeScanManifest(metaPaths: MetaPaths, manifest: ScanManifest): Promise { + await writeFile(metaPaths.scanManifestJson, JSON.stringify(manifest, null, 2), "utf8"); +} + +export async function readScanManifest(metaPaths: MetaPaths): Promise { + try { + const raw = await readFile(metaPaths.scanManifestJson, "utf8"); + const parsed: unknown = JSON.parse(raw); + if (!isManifest(parsed)) { + return null; + } + return parsed; + } catch { + return null; + } +} + +function isManifest(value: unknown): value is ScanManifest { + if (typeof value !== "object" || value === null) { + return false; + } + const rec = value as Record; + return Array.isArray(rec["entries"]) && typeof rec["summary"] === "object" && typeof rec["generatedAt"] === "string"; +} diff --git a/packages/ingest-github/src/types/meta-paths.ts b/packages/ingest-github/src/types/meta-paths.ts index 8898df3..5da4f89 100644 --- a/packages/ingest-github/src/types/meta-paths.ts +++ b/packages/ingest-github/src/types/meta-paths.ts @@ -5,5 +5,6 @@ export interface MetaPaths { bigFileAnalysisDir: string; bigFileChunksDir: string; bigFilesJson: string; + scanManifestJson: string; repoSummaryJson: string; } diff --git a/packages/types/src/config.ts b/packages/types/src/config.ts index 882381a..950cb81 100644 --- a/packages/types/src/config.ts +++ b/packages/types/src/config.ts @@ -23,6 +23,7 @@ export enum Config { BigFileConcurrency = "big.file.concurrency", AbsoluteFileSizeCap = "absolute.file.size.cap", ConcurrentWorkers = "concurrent.workers", + LlmConcurrency = "llm.concurrency", CondenseContextLimit = "condense.context.limit", CondensePromptOverhead = "condense.prompt.overhead", SmallFileDedupThreshold = "small.file.dedup.threshold", From b6311ba7d58e37ec5842a3a988f9214c3484f61b Mon Sep 17 00:00:00 2001 
From: Dead-Bytes <143434285+Dead-Bytes@users.noreply.github.com> Date: Fri, 22 May 2026 14:26:40 +0530 Subject: [PATCH 04/11] refactor: implement FileAnalysisCache for improved performance in file analysis phases --- packages/ingest-github/README.md | 8 ++ packages/ingest-github/src/pipeline/pull.ts | 8 +- .../strategies/flat-folder/backfill/fields.ts | 9 +- .../flat-folder/file-analysis-cache.ts | 91 +++++++++++++++++++ .../flat-folder/folder-summary-selective.ts | 4 +- .../strategies/flat-folder/folder-summary.ts | 9 +- .../src/strategies/flat-folder/index.ts | 16 +++- .../strategies/flat-folder/phases/README.md | 11 +++ .../flat-folder/phases/store-flat-analysis.ts | 8 +- 9 files changed, 148 insertions(+), 16 deletions(-) create mode 100644 packages/ingest-github/src/strategies/flat-folder/file-analysis-cache.ts diff --git a/packages/ingest-github/README.md b/packages/ingest-github/README.md index 6073339..c9e1ca2 100644 --- a/packages/ingest-github/README.md +++ b/packages/ingest-github/README.md @@ -139,6 +139,14 @@ worker hardcodes a single `IngestionStrategy` instance (currently - `meta-output/bigFiles.json` — legacy view written alongside the manifest for the pull-path and backfill phases. The main strategy no longer consumes it directly. +- `FileAnalysisCache` (in-memory only, not persisted) — single + `Map<string, CondensedFileAnalysis>` (keyed by `relativePath`) loaded once between the + analyse and backfill phases via parallel `readdir + readFile`. Replaces + three sequential `iterateCondensed` walks (phases 3, 5, 7) with one + parallel preload + three in-memory iterations. The pull workflow loads + its own cache instance; only one strategy run owns a given + `metaPaths` directory at a time. For repos beyond ~50k analysed files + consider a streaming-mode fallback (not implemented today). 
## Invariants diff --git a/packages/ingest-github/src/pipeline/pull.ts b/packages/ingest-github/src/pipeline/pull.ts index 930b7be..6ffab0b 100644 --- a/packages/ingest-github/src/pipeline/pull.ts +++ b/packages/ingest-github/src/pipeline/pull.ts @@ -20,6 +20,7 @@ import { analyseChangedFiles } from "#src/strategies/flat-folder/analyse-changed import { processBigFilesQueue } from "#src/strategies/flat-folder/phases/process-big-files.ts"; import { backfillMissingFields } from "#src/strategies/flat-folder/backfill/fields.ts"; import { backfillBigFiles } from "#src/strategies/flat-folder/backfill/big-files.ts"; +import { FileAnalysisCache } from "#src/strategies/flat-folder/file-analysis-cache.ts"; import { runSelectiveFolderSummary } from "#src/strategies/flat-folder/folder-summary-selective.ts"; import { makeRepoSummaryEnvelope, @@ -192,9 +193,13 @@ export async function runPull( totalOutputTokens += phase2.tokenUsage.outputTokens; totalCostUsd += phase2.tokenUsage.costUsd; + logger.info(`pull: loading file-analysis cache`); + throwIfCancelled(knowledgeId); + const fileAnalysisCache = await FileAnalysisCache.loadAll(metaPaths); + logger.info(`pull: phase backfill fields starting`); throwIfCancelled(knowledgeId); - await backfillMissingFields(metaPaths, llmCallContext, progressContext); + await backfillMissingFields(metaPaths, fileAnalysisCache, llmCallContext, progressContext); logger.info(`pull: phase backfill big-files starting`); throwIfCancelled(knowledgeId); @@ -215,6 +220,7 @@ export async function runPull( const selectiveInput: Parameters[0] = { knowledgeId, metaPaths, + cache: fileAnalysisCache, affectedFolders, }; if (llmCallContext !== undefined) { diff --git a/packages/ingest-github/src/strategies/flat-folder/backfill/fields.ts b/packages/ingest-github/src/strategies/flat-folder/backfill/fields.ts index b6db25e..7836520 100644 --- a/packages/ingest-github/src/strategies/flat-folder/backfill/fields.ts +++ 
b/packages/ingest-github/src/strategies/flat-folder/backfill/fields.ts @@ -4,8 +4,8 @@ import { logger } from "@bb/logger"; import type { FileAnalysis, FileAnalysisSection } from "@bb/mongo"; import type { MetaPaths } from "#src/types/meta-paths.ts"; import type { ProgressContext } from "#src/progress/types.ts"; -import { iterateCondensed } from "#src/strategies/flat-folder/big-file/storage.ts"; import { saveCondensed } from "#src/strategies/flat-folder/big-file/storage.ts"; +import type { FileAnalysisCache } from "#src/strategies/flat-folder/file-analysis-cache.ts"; import { BACKFILL_SYSTEM_PROMPT, buildBackfillUserPrompt } from "#src/strategies/flat-folder/prompts/backfill.ts"; const EXTENDED_ARRAY_KEYS = [ @@ -44,6 +44,7 @@ interface NeededFlags { export async function backfillMissingFields( metaPaths: MetaPaths, + cache: FileAnalysisCache, llmCallContext?: AskLlmOptions, progressContext?: ProgressContext, ): Promise<{ updated: number; failed: number }> { @@ -52,12 +53,11 @@ export async function backfillMissingFields( const reporter = progressContext?.reporter({ phase: "file_analysis", subPhase: "backfill", - total: { kind: "growing" }, + total: { kind: "fixed", total: cache.size }, }); await reporter?.start(); try { - for await (const entry of iterateCondensed(metaPaths)) { - reporter?.incrementSeen(); + for (const entry of cache.values()) { const a = entry.analysis; const needed = computeNeeded(a); if (!hasAnyMissing(needed)) { @@ -74,6 +74,7 @@ export async function backfillMissingFields( } applyBackfill(a, result, needed); await saveCondensed(metaPaths, entry); + cache.set(entry); updated += 1; } catch (cause: unknown) { if (cause instanceof LlmConfigError || cause instanceof LlmError) { diff --git a/packages/ingest-github/src/strategies/flat-folder/file-analysis-cache.ts b/packages/ingest-github/src/strategies/flat-folder/file-analysis-cache.ts new file mode 100644 index 0000000..4405682 --- /dev/null +++ 
b/packages/ingest-github/src/strategies/flat-folder/file-analysis-cache.ts @@ -0,0 +1,91 @@ +import { readdir, readFile } from "node:fs/promises"; +import path from "node:path"; +import { logger } from "@bb/logger"; +import type { CondensedFileAnalysis } from "#src/types/condensed-file-analysis.ts"; +import type { MetaPaths } from "#src/types/meta-paths.ts"; +import { withConcurrency } from "#src/pipeline/concurrency.ts"; + +const LOAD_CONCURRENCY = 20; + +/** + * In-memory snapshot of every `CondensedFileAnalysis` JSON under + * `metaPaths.fileAnalysisDir`. Loaded once per strategy run between the + * analyse phases (2a/2b) and the backfill / folder-summary / graph-store + * phases. The downstream consumers iterate `.values()` (full sweeps) or + * `.get(relativePath)` (random-access); Phase 3 also calls `.set(...)` + * to keep the map in sync with disk writes. + * + * Replaces three sequential `iterateCondensed` walks (one per consumer) + * with one parallel preload + three in-memory iterations. 
+ */ +export class FileAnalysisCache { + private readonly map: Map; + + private constructor(map: Map) { + this.map = map; + } + + static async loadAll(metaPaths: MetaPaths): Promise { + const startedAt = Date.now(); + let filenames: string[]; + try { + filenames = await readdir(metaPaths.fileAnalysisDir); + } catch (cause: unknown) { + logger.warn(`file-analysis-cache: readdir failed for ${metaPaths.fileAnalysisDir}: ${describe(cause)}`); + return new FileAnalysisCache(new Map()); + } + const jsonFiles = filenames.filter((n) => n.endsWith(".json")); + const map = new Map(); + const limit = withConcurrency(LOAD_CONCURRENCY); + const tasks: Promise[] = []; + for (const name of jsonFiles) { + tasks.push( + limit(async () => { + const full = path.join(metaPaths.fileAnalysisDir, name); + try { + const raw = await readFile(full, "utf8"); + const parsed: unknown = JSON.parse(raw); + if (typeof parsed !== "object" || parsed === null) { + return; + } + const entry = parsed as CondensedFileAnalysis; + if (typeof entry.relativePath !== "string" || entry.relativePath.length === 0) { + return; + } + map.set(entry.relativePath, entry); + } catch (cause: unknown) { + logger.warn(`file-analysis-cache: failed to read ${name}: ${describe(cause)}`); + } + }), + ); + } + await Promise.all(tasks); + const elapsedMs = Date.now() - startedAt; + logger.info(`file-analysis-cache: loaded ${map.size} entries in ${elapsedMs} ms`); + return new FileAnalysisCache(map); + } + + get(relativePath: string): CondensedFileAnalysis | undefined { + return this.map.get(relativePath); + } + + set(entry: CondensedFileAnalysis): void { + this.map.set(entry.relativePath, entry); + } + + values(): IterableIterator { + return this.map.values(); + } + + entries(): IterableIterator<[string, CondensedFileAnalysis]> { + return this.map.entries(); + } + + get size(): number { + return this.map.size; + } +} + +function describe(cause: unknown): string { + return cause instanceof Error ? 
cause.message : String(cause); +} diff --git a/packages/ingest-github/src/strategies/flat-folder/folder-summary-selective.ts b/packages/ingest-github/src/strategies/flat-folder/folder-summary-selective.ts index d053d82..9b4e71c 100644 --- a/packages/ingest-github/src/strategies/flat-folder/folder-summary-selective.ts +++ b/packages/ingest-github/src/strategies/flat-folder/folder-summary-selective.ts @@ -5,6 +5,7 @@ import type { AskLlmOptions } from "@bb/llm"; import type { MetaPaths } from "#src/types/meta-paths.ts"; import { withConcurrency } from "#src/pipeline/concurrency.ts"; import { throwIfCancelled, CancellationError } from "#src/pipeline/cancellation.ts"; +import type { FileAnalysisCache } from "#src/strategies/flat-folder/file-analysis-cache.ts"; import { groupByDirectFolder, persistFolderSummary, @@ -14,6 +15,7 @@ import { export interface SelectiveFolderSummaryInput { knowledgeId: string; metaPaths: MetaPaths; + cache: FileAnalysisCache; affectedFolders: Set; llmCallContext?: AskLlmOptions; } @@ -35,7 +37,7 @@ export async function runSelectiveFolderSummary( ): Promise { const concurrentWorkers = getConfigValue(Config.ConcurrentWorkers); const limit = withConcurrency(concurrentWorkers); - const groups = await groupByDirectFolder(input.metaPaths); + const groups = groupByDirectFolder(input.cache); let succeeded = 0; let failed = 0; let skipped = 0; diff --git a/packages/ingest-github/src/strategies/flat-folder/folder-summary.ts b/packages/ingest-github/src/strategies/flat-folder/folder-summary.ts index 4fa175b..805eae6 100644 --- a/packages/ingest-github/src/strategies/flat-folder/folder-summary.ts +++ b/packages/ingest-github/src/strategies/flat-folder/folder-summary.ts @@ -11,14 +11,14 @@ import { encodeMetaPath } from "#src/pipeline/paths.ts"; import { withConcurrency } from "#src/pipeline/concurrency.ts"; import { throwIfCancelled, CancellationError } from "#src/pipeline/cancellation.ts"; import type { ProgressContext } from "#src/progress/types.ts"; 
-import { iterateCondensed } from "./big-file/storage.ts"; +import type { FileAnalysisCache } from "./file-analysis-cache.ts"; import { directFolderOf } from "./folder-path.ts"; import { FOLDER_ANALYSIS_SYSTEM_PROMPT, folderAnalysisUserPrompt } from "./prompts/folder-summary.ts"; import type { FolderSummary } from "./types.ts"; -export async function groupByDirectFolder(metaPaths: MetaPaths): Promise> { +export function groupByDirectFolder(cache: FileAnalysisCache): Map { const groups = new Map(); - for await (const entry of iterateCondensed(metaPaths)) { + for (const entry of cache.values()) { const folder = directFolderOf(entry.relativePath); const bucket = groups.get(folder) ?? []; bucket.push(entry); @@ -113,6 +113,7 @@ export async function* iterateFolderSummaries(metaPaths: MetaPaths): AsyncGenera export async function runFolderSummaryPhase( knowledgeId: string, metaPaths: MetaPaths, + cache: FileAnalysisCache, llmCallContext?: AskLlmOptions, progressContext?: ProgressContext, ): Promise<{ @@ -122,7 +123,7 @@ export async function runFolderSummaryPhase( }> { const concurrentWorkers = getConfigValue(Config.ConcurrentWorkers); const limit = withConcurrency(concurrentWorkers); - const groups = await groupByDirectFolder(metaPaths); + const groups = groupByDirectFolder(cache); let succeeded = 0; let failed = 0; let totalInputTokens = 0; diff --git a/packages/ingest-github/src/strategies/flat-folder/index.ts b/packages/ingest-github/src/strategies/flat-folder/index.ts index 924b26f..de9211b 100644 --- a/packages/ingest-github/src/strategies/flat-folder/index.ts +++ b/packages/ingest-github/src/strategies/flat-folder/index.ts @@ -11,6 +11,7 @@ import { analyseSmallFiles } from "./phases/analyse-small.ts"; import { analyseBigFiles } from "./phases/process-big-files.ts"; import { backfillMissingFields } from "./backfill/fields.ts"; import { backfillBigFiles } from "./backfill/big-files.ts"; +import { FileAnalysisCache } from "./file-analysis-cache.ts"; import { 
runFolderSummaryPhase } from "./folder-summary.ts"; import { makeRepoSummaryEnvelope, persistRepoSummary, summariseRepo } from "./repo-summary.ts"; import { storeFlatAnalysis } from "./phases/store-flat-analysis.ts"; @@ -87,9 +88,13 @@ export function createFlatFolderStrategy(deps: FlatFolderStrategyDeps): IngestSt let totalOutputTokens = smallResult.tokenUsage.outputTokens + bigResult.tokenUsage.outputTokens; let totalCostUsd = smallResult.tokenUsage.costUsd + bigResult.tokenUsage.costUsd; + logger.info(`flat-folder: loading file-analysis cache`); + throwIfCancelled(knowledgeId); + const fileAnalysisCache = await FileAnalysisCache.loadAll(metaPaths); + logger.info(`flat-folder: phase3 (backfill missing fields) starting`); throwIfCancelled(knowledgeId); - await backfillMissingFields(metaPaths, llmCallContext, progressContext); + await backfillMissingFields(metaPaths, fileAnalysisCache, llmCallContext, progressContext); logger.info(`flat-folder: phase4 (backfill big files) starting`); throwIfCancelled(knowledgeId); @@ -107,7 +112,13 @@ export function createFlatFolderStrategy(deps: FlatFolderStrategyDeps): IngestSt progressContext.phaseChanged("folder_analysis"); logger.info(`flat-folder: phase5 (folder summaries) starting`); throwIfCancelled(knowledgeId); - const phase5 = await runFolderSummaryPhase(knowledgeId, metaPaths, llmCallContext, progressContext); + const phase5 = await runFolderSummaryPhase( + knowledgeId, + metaPaths, + fileAnalysisCache, + llmCallContext, + progressContext, + ); totalInputTokens += phase5.tokenUsage.inputTokens; totalOutputTokens += phase5.tokenUsage.outputTokens; totalCostUsd += phase5.tokenUsage.costUsd; @@ -136,6 +147,7 @@ export function createFlatFolderStrategy(deps: FlatFolderStrategyDeps): IngestSt payload, branch, metaPaths, + cache: fileAnalysisCache, progressContext, }); diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/README.md b/packages/ingest-github/src/strategies/flat-folder/phases/README.md index 
e2d218a..05ee606 100644 --- a/packages/ingest-github/src/strategies/flat-folder/phases/README.md +++ b/packages/ingest-github/src/strategies/flat-folder/phases/README.md @@ -69,9 +69,17 @@ scanAndClassify │ │ (Promise.all, share one limiter) └── analyseBigFiles ────┘ ↓ +FileAnalysisCache.loadAll (one parallel readdir+readFile pass) + ↓ backfillMissingFields → backfillBigFiles → folderSummary → repoSummary → storeFlatAnalysis + (cache read+write) (no cache) (cache read) (cache read) ``` +`FileAnalysisCache` is a `Map<string, CondensedFileAnalysis>` (keyed by `relativePath`) loaded +once between phase 2 and phase 3. Phases 3, 5, 7 all consume the same +instance — phase 3 also calls `cache.set(...)` after each backfill write +so phases 5 and 7 see the updated entries without re-reading disk. + ## Public interfaces - `scanAndClassify(input): Promise` — @@ -99,6 +107,9 @@ backfillMissingFields → backfillBigFiles → folderSummary → repoSummary → - Phase 2b writes per-chunk JSON (`chunks//chunk-N.json`), per-file chunk manifests (`.manifest.json`), and condensed JSON for big files. +- `FileAnalysisCache` is an in-memory artifact owned by the strategy + run (not persisted). It loads from `fileAnalysisDir` once and is + passed by reference to phases 3, 5, and 7. - Phase 7 owns no disk artifacts. It reads on-disk state produced by Phases 1–6 and writes Neo4j nodes (`:Repo`, `:Folder`, `:File`) plus the `CONTAINS` edge. 
diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/store-flat-analysis.ts b/packages/ingest-github/src/strategies/flat-folder/phases/store-flat-analysis.ts index dbcbb30..adeb0a6 100644 --- a/packages/ingest-github/src/strategies/flat-folder/phases/store-flat-analysis.ts +++ b/packages/ingest-github/src/strategies/flat-folder/phases/store-flat-analysis.ts @@ -4,7 +4,7 @@ import { ensureFlatFolderIndexes, upsertFileNode, upsertFolderNode, upsertRepoNo import type { GithubIndexPayload } from "@bb/types"; import type { MetaPaths } from "#src/types/meta-paths.ts"; import { throwIfCancelled } from "#src/pipeline/cancellation.ts"; -import { iterateCondensed } from "#src/strategies/flat-folder/big-file/storage.ts"; +import type { FileAnalysisCache } from "#src/strategies/flat-folder/file-analysis-cache.ts"; import { iterateFolderSummaries } from "#src/strategies/flat-folder/folder-summary.ts"; import { directFolderOf } from "#src/strategies/flat-folder/folder-path.ts"; import { languageFromPath } from "#src/adapters/llm-file-analyzer.ts"; @@ -16,6 +16,7 @@ export interface StoreFlatAnalysisInput { payload: GithubIndexPayload; branch: string; metaPaths: MetaPaths; + cache: FileAnalysisCache; progressContext?: ProgressContext; } @@ -89,13 +90,12 @@ export async function storeFlatAnalysis(input: StoreFlatAnalysisInput): Promise< const fileReporter = input.progressContext?.reporter({ phase: "indexing", subPhase: "files", - total: { kind: "growing" }, + total: { kind: "fixed", total: input.cache.size }, }); await fileReporter?.start(); try { - for await (const file of iterateCondensed(input.metaPaths)) { + for (const file of input.cache.values()) { throwIfCancelled(input.scope.knowledgeId); - fileReporter?.incrementSeen(); const folderPath = directFolderOf(file.relativePath); if (!folderPaths.has(folderPath)) { await upsertFolderNode({ From 13970c7dd0c0b2e07ea20abf4d31b7339cc1ee26 Mon Sep 17 00:00:00 2001 From: Dead-Bytes 
<143434285+Dead-Bytes@users.noreply.github.com> Date: Fri, 22 May 2026 14:49:57 +0530 Subject: [PATCH 05/11] refactor: add folder summary batching configuration and enhance folder summary processing --- packages/config/src/schema.ts | 14 + packages/ingest-github/README.md | 17 +- packages/ingest-github/src/pipeline/pull.ts | 6 +- .../flat-folder/folder-summary-selective.ts | 76 ++--- .../strategies/flat-folder/folder-summary.ts | 292 +++++++++++++++--- .../src/strategies/flat-folder/index.ts | 1 + .../strategies/flat-folder/phases/README.md | 14 +- .../flat-folder/prompts/folder-summary.ts | 54 ++++ packages/types/src/config.ts | 2 + 9 files changed, 378 insertions(+), 98 deletions(-) diff --git a/packages/config/src/schema.ts b/packages/config/src/schema.ts index 77a7468..1f7a021 100644 --- a/packages/config/src/schema.ts +++ b/packages/config/src/schema.ts @@ -42,6 +42,8 @@ export const configSchema = z "absolute.file.size.cap": z.number().int().positive().default(52428800), "concurrent.workers": z.number().int().positive().default(4), "llm.concurrency": z.number().int().positive().default(29), + "folder.summary.batch.size": z.number().int().positive().default(10), + "folder.summary.batch.max.files": z.number().int().positive().default(15), "condense.context.limit": z.number().int().positive().default(12000), "condense.prompt.overhead": z.number().int().nonnegative().default(1500), "small.file.dedup.threshold": z.number().int().positive().default(3), @@ -83,6 +85,8 @@ export type ConfigValueMap = { [Config.AbsoluteFileSizeCap]: number; [Config.ConcurrentWorkers]: number; [Config.LlmConcurrency]: number; + [Config.FolderSummaryBatchSize]: number; + [Config.FolderSummaryBatchMaxFiles]: number; [Config.CondenseContextLimit]: number; [Config.CondensePromptOverhead]: number; [Config.SmallFileDedupThreshold]: number; @@ -138,6 +142,8 @@ export const HINTS: Readonly> = { [Config.AbsoluteFileSizeCap]: "bytebell set absolute.file.size.cap ", [Config.ConcurrentWorkers]: 
"bytebell set concurrent.workers ", [Config.LlmConcurrency]: "bytebell set llm.concurrency ", + [Config.FolderSummaryBatchSize]: "bytebell set folder.summary.batch.size ", + [Config.FolderSummaryBatchMaxFiles]: "bytebell set folder.summary.batch.max.files ", [Config.CondenseContextLimit]: "bytebell set condense.context.limit ", [Config.CondensePromptOverhead]: "bytebell set condense.prompt.overhead ", [Config.SmallFileDedupThreshold]: "bytebell set small.file.dedup.threshold ", @@ -200,6 +206,10 @@ export function readField(cfg: BytebellConfig, key: K): Config return cfg["concurrent.workers"] as ConfigValue; case Config.LlmConcurrency: return cfg["llm.concurrency"] as ConfigValue; + case Config.FolderSummaryBatchSize: + return cfg["folder.summary.batch.size"] as ConfigValue; + case Config.FolderSummaryBatchMaxFiles: + return cfg["folder.summary.batch.max.files"] as ConfigValue; case Config.CondenseContextLimit: return cfg["condense.context.limit"] as ConfigValue; case Config.CondensePromptOverhead: @@ -271,6 +281,10 @@ export function writeField(cfg: BytebellConfig, key: K, value: return { ...cfg, "concurrent.workers": value as number }; case Config.LlmConcurrency: return { ...cfg, "llm.concurrency": value as number }; + case Config.FolderSummaryBatchSize: + return { ...cfg, "folder.summary.batch.size": value as number }; + case Config.FolderSummaryBatchMaxFiles: + return { ...cfg, "folder.summary.batch.max.files": value as number }; case Config.CondenseContextLimit: return { ...cfg, "condense.context.limit": value as number }; case Config.CondensePromptOverhead: diff --git a/packages/ingest-github/README.md b/packages/ingest-github/README.md index c9e1ca2..9d28387 100644 --- a/packages/ingest-github/README.md +++ b/packages/ingest-github/README.md @@ -153,10 +153,19 @@ worker hardcodes a single `IngestionStrategy` instance (currently 1. 
**Shared LLM concurrency limiter.** The flat-folder strategy constructs one `withConcurrency(Config.LlmConcurrency)` instance at entry (default 29). The small-file phase, the big-file chunk phase, - and per-file condense calls all check out from this single pool, so - total in-flight LLM calls is bounded by one knob. The legacy - `processBigFile` driver used by the pull-path still uses its own - per-file pool sized by `Config.BigFileConcurrency`. + per-file condense calls, **and the folder-summary phase** all check + out from this single pool, so the total number of in-flight LLM calls + is bounded by one knob. The pull-path constructs its own shared limiter at + `runPull` entry and threads it into the selective folder-summary + phase. The legacy `processBigFile` driver used by the pull-path + still uses its own per-file pool sized by `Config.BigFileConcurrency`. +2. **Folder-summary batching by default.** Phase 5 groups small folders + (`≤ Config.FolderSummaryBatchMaxFiles`, default 15) into batches of + up to `Config.FolderSummaryBatchSize` (default 10) and asks the LLM + for one JSON object, keyed by integer label, containing one summary + per folder. Bigger folders take the individual single-folder path. + Roll back to one LLM call per folder via + `bytebell set folder.summary.batch.size 1`. 2. **Clone idempotent.** Re-runs (BullMQ retries) call `git fetch` + `git reset --hard` in the existing dir rather than re-cloning. Tokens are re-injected into the remote URL each time. 
diff --git a/packages/ingest-github/src/pipeline/pull.ts b/packages/ingest-github/src/pipeline/pull.ts index 6ffab0b..8a4c706 100644 --- a/packages/ingest-github/src/pipeline/pull.ts +++ b/packages/ingest-github/src/pipeline/pull.ts @@ -1,4 +1,6 @@ -import { KnowledgeState, type GithubPullPayload, type JobMessage } from "@bb/types"; +import { Config, KnowledgeState, type GithubPullPayload, type JobMessage } from "@bb/types"; +import { getConfigValue } from "@bb/config"; +import { withConcurrency } from "./concurrency.ts"; import { getKnowledge, markKnowledgeFailed, setKnowledgeCommit, setKnowledgeState } from "@bb/mongo"; import { setKnowledgeStateInGraph, snapshotFilesToVersion, type NodeScope } from "@bb/neo4j"; import type { PipelineSummary } from "#src/types/pipeline.ts"; @@ -196,6 +198,7 @@ export async function runPull( logger.info(`pull: loading file-analysis cache`); throwIfCancelled(knowledgeId); const fileAnalysisCache = await FileAnalysisCache.loadAll(metaPaths); + const limiter = withConcurrency(getConfigValue(Config.LlmConcurrency)); logger.info(`pull: phase backfill fields starting`); throwIfCancelled(knowledgeId); @@ -221,6 +224,7 @@ export async function runPull( knowledgeId, metaPaths, cache: fileAnalysisCache, + limiter, affectedFolders, }; if (llmCallContext !== undefined) { diff --git a/packages/ingest-github/src/strategies/flat-folder/folder-summary-selective.ts b/packages/ingest-github/src/strategies/flat-folder/folder-summary-selective.ts index 9b4e71c..17ac699 100644 --- a/packages/ingest-github/src/strategies/flat-folder/folder-summary-selective.ts +++ b/packages/ingest-github/src/strategies/flat-folder/folder-summary-selective.ts @@ -1,21 +1,19 @@ import { logger } from "@bb/logger"; -import { Config } from "@bb/types"; -import { getConfigValue } from "@bb/config"; import type { AskLlmOptions } from "@bb/llm"; +import type { CondensedFileAnalysis } from "#src/types/condensed-file-analysis.ts"; import type { MetaPaths } from 
"#src/types/meta-paths.ts"; -import { withConcurrency } from "#src/pipeline/concurrency.ts"; -import { throwIfCancelled, CancellationError } from "#src/pipeline/cancellation.ts"; +import type { ConcurrencyLimiter } from "#src/pipeline/concurrency.ts"; import type { FileAnalysisCache } from "#src/strategies/flat-folder/file-analysis-cache.ts"; import { + dispatchFolderSummaries, groupByDirectFolder, - persistFolderSummary, - summariseFolder, } from "#src/strategies/flat-folder/folder-summary.ts"; export interface SelectiveFolderSummaryInput { knowledgeId: string; metaPaths: MetaPaths; cache: FileAnalysisCache; + limiter: ConcurrencyLimiter; affectedFolders: Set; llmCallContext?: AskLlmOptions; } @@ -29,57 +27,39 @@ export interface SelectiveFolderSummaryResult { /** * Pull-time folder summary. Same machinery as `runFolderSummaryPhase` but - * only regenerates folders the caller flagged as affected. Reads condensed - * file analyses from disk; the dispatcher must have populated them already. + * only regenerates folders the caller flagged as affected. Filters by + * `affectedFolders` BEFORE batching so skipped folders never enter a batch. 
*/ export async function runSelectiveFolderSummary( input: SelectiveFolderSummaryInput, ): Promise { - const concurrentWorkers = getConfigValue(Config.ConcurrentWorkers); - const limit = withConcurrency(concurrentWorkers); - const groups = groupByDirectFolder(input.cache); - let succeeded = 0; - let failed = 0; + const allGroups = groupByDirectFolder(input.cache); + const affectedGroups = new Map(); let skipped = 0; - let totalInputTokens = 0; - let totalOutputTokens = 0; - let totalCostUsd = 0; - const tasks: Promise[] = []; - for (const [folderPath, files] of groups.entries()) { - if (!input.affectedFolders.has(folderPath)) { + for (const [folderPath, files] of allGroups.entries()) { + if (input.affectedFolders.has(folderPath)) { + affectedGroups.set(folderPath, files); + } else { skipped += 1; - continue; } - tasks.push( - limit(async () => { - try { - throwIfCancelled(input.knowledgeId); - const { summary, tokenUsage } = await summariseFolder(folderPath, files, input.llmCallContext); - totalInputTokens += tokenUsage.inputTokens; - totalOutputTokens += tokenUsage.outputTokens; - totalCostUsd += tokenUsage.costUsd; - if (summary !== null) { - await persistFolderSummary(input.metaPaths, summary); - succeeded += 1; - } else { - failed += 1; - } - } catch (cause: unknown) { - if (cause instanceof CancellationError) { - throw cause; - } - failed += 1; - logger.warn(`pull-folder-summary: failed for ${folderPath || ""}`); - } - }), - ); } - await Promise.all(tasks); - logger.info(`pull-folder-summary done: succeeded=${succeeded} failed=${failed} skipped=${skipped}`); + + const totals = await dispatchFolderSummaries( + affectedGroups, + input.metaPaths, + input.limiter, + input.llmCallContext, + undefined, + input.knowledgeId, + "pull-folder-summary", + ); + logger.info( + `pull-folder-summary done: succeeded=${totals.succeeded} failed=${totals.failed} skipped=${skipped}`, + ); return { - succeeded, - failed, + succeeded: totals.succeeded, + failed: totals.failed, 
skipped, - tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens, costUsd: totalCostUsd }, + tokenUsage: { inputTokens: totals.inputTokens, outputTokens: totals.outputTokens, costUsd: totals.costUsd }, }; } diff --git a/packages/ingest-github/src/strategies/flat-folder/folder-summary.ts b/packages/ingest-github/src/strategies/flat-folder/folder-summary.ts index 805eae6..a5d95a3 100644 --- a/packages/ingest-github/src/strategies/flat-folder/folder-summary.ts +++ b/packages/ingest-github/src/strategies/flat-folder/folder-summary.ts @@ -8,12 +8,18 @@ import { getConfigValue } from "@bb/config"; import type { CondensedFileAnalysis } from "#src/types/condensed-file-analysis.ts"; import type { MetaPaths } from "#src/types/meta-paths.ts"; import { encodeMetaPath } from "#src/pipeline/paths.ts"; -import { withConcurrency } from "#src/pipeline/concurrency.ts"; +import type { ConcurrencyLimiter } from "#src/pipeline/concurrency.ts"; import { throwIfCancelled, CancellationError } from "#src/pipeline/cancellation.ts"; import type { ProgressContext } from "#src/progress/types.ts"; import type { FileAnalysisCache } from "./file-analysis-cache.ts"; import { directFolderOf } from "./folder-path.ts"; -import { FOLDER_ANALYSIS_SYSTEM_PROMPT, folderAnalysisUserPrompt } from "./prompts/folder-summary.ts"; +import { + FOLDER_ANALYSIS_SYSTEM_PROMPT, + FOLDER_BATCH_SYSTEM_PROMPT, + folderAnalysisUserPrompt, + folderBatchUserPrompt, + type BatchedFolderInput, +} from "./prompts/folder-summary.ts"; import type { FolderSummary } from "./types.ts"; export function groupByDirectFolder(cache: FileAnalysisCache): Map { @@ -38,6 +44,52 @@ interface FolderSummaryJson { dependencyGraph?: unknown; } +export interface FolderBucket { + folderPath: string; + files: CondensedFileAnalysis[]; +} + +/** + * Splits the folder groups into "individual" (one LLM call per folder, used + * for big folders or when batching is disabled) and "batches" (N small + * folders summarised in one 
LLM call). Driven by `Config.FolderSummaryBatchSize` + * (set to 1 to disable batching entirely) and `Config.FolderSummaryBatchMaxFiles` + * (folders exceeding this file count always take the individual path). + * + * Folders are sorted by path so that two runs of the same repo produce the + * same batch composition — helpful when A/B-comparing outputs. + */ +export function groupFoldersForBatching(groups: Map): { + individual: FolderBucket[]; + batches: FolderBucket[][]; +} { + const batchSize = getConfigValue(Config.FolderSummaryBatchSize); + const maxFiles = getConfigValue(Config.FolderSummaryBatchMaxFiles); + const sorted: FolderBucket[] = [...groups.entries()] + .map(([folderPath, files]) => ({ folderPath, files })) + .sort((a, b) => a.folderPath.localeCompare(b.folderPath)); + + if (batchSize <= 1) { + return { individual: sorted, batches: [] }; + } + + const individual: FolderBucket[] = []; + const batchable: FolderBucket[] = []; + for (const bucket of sorted) { + if (bucket.files.length > maxFiles) { + individual.push(bucket); + } else { + batchable.push(bucket); + } + } + + const batches: FolderBucket[][] = []; + for (let i = 0; i < batchable.length; i += batchSize) { + batches.push(batchable.slice(i, i + batchSize)); + } + return { individual, batches }; +} + export async function summariseFolder( folderPath: string, files: CondensedFileAnalysis[], @@ -82,6 +134,72 @@ export async function summariseFolder( } } +/** + * Multi-folder summary. Builds a label-indexed prompt, parses the keyed JSON + * response, returns one `FolderSummary | null` per folder. Folders missing + * from the response (or whose entry fails shape validation) are surfaced as + * `null` with a warn log; the caller counts those as failed. 
+ */ +export async function summariseFolderBatch( + batch: FolderBucket[], + llmCallContext?: AskLlmOptions, +): Promise<{ + summaries: Map; + tokenUsage: { inputTokens: number; outputTokens: number; costUsd: number }; +}> { + const labeled: BatchedFolderInput[] = batch.map((b, i) => ({ label: i, folderPath: b.folderPath, files: b.files })); + const userPrompt = folderBatchUserPrompt(labeled); + const summaries = new Map(); + try { + const response = await askJsonLLM>( + FOLDER_BATCH_SYSTEM_PROMPT, + userPrompt, + llmCallContext ?? {}, + ); + if (response.result === null) { + logger.warn(`summariseFolderBatch: batch of ${batch.length} returned unparseable JSON`); + for (const b of batch) { + summaries.set(b.folderPath, null); + } + return { + summaries, + tokenUsage: { + inputTokens: response.usage.inputTokens, + outputTokens: response.usage.outputTokens, + costUsd: response.usage.costUsd, + }, + }; + } + for (const b of labeled) { + const raw = response.result[String(b.label)]; + if (raw === undefined || typeof raw !== "object" || raw === null) { + logger.warn(`summariseFolderBatch: missing/invalid entry for label ${b.label} (${b.folderPath || ""})`); + summaries.set(b.folderPath, null); + continue; + } + summaries.set(b.folderPath, shapeFolderSummary(b.folderPath, raw)); + } + return { + summaries, + tokenUsage: { + inputTokens: response.usage.inputTokens, + outputTokens: response.usage.outputTokens, + costUsd: response.usage.costUsd, + }, + }; + } catch (cause: unknown) { + if (cause instanceof LlmConfigError || cause instanceof LlmError) { + throw cause; + } + const msg = cause instanceof Error ? 
cause.message : String(cause); + logger.warn(`summariseFolderBatch: batch of ${batch.length} askJsonLLM failed: ${msg}`); + for (const b of batch) { + summaries.set(b.folderPath, null); + } + return { summaries, tokenUsage: { inputTokens: 0, outputTokens: 0, costUsd: 0 } }; + } +} + export async function persistFolderSummary(metaPaths: MetaPaths, summary: FolderSummary): Promise { const file = path.join(metaPaths.folderSummariesDir, `${encodeMetaPath(summary.folderPath || "__ROOT__")}.json`); await writeFile(file, JSON.stringify(summary, null, 2), "utf8"); @@ -110,10 +228,134 @@ export async function* iterateFolderSummaries(metaPaths: MetaPaths): AsyncGenera } } +interface FolderSummaryTotals { + succeeded: number; + failed: number; + inputTokens: number; + outputTokens: number; + costUsd: number; +} + +/** + * Dispatches a single folder through `summariseFolder` and persists the + * result. Shared between `runFolderSummaryPhase` and `runSelectiveFolderSummary`. + */ +async function dispatchIndividual( + bucket: FolderBucket, + metaPaths: MetaPaths, + totals: FolderSummaryTotals, + llmCallContext: AskLlmOptions | undefined, + reporter: ReturnType> | undefined, + knowledgeId: string, + phaseLabel: string, +): Promise { + try { + throwIfCancelled(knowledgeId); + const { summary, tokenUsage } = await summariseFolder(bucket.folderPath, bucket.files, llmCallContext); + totals.inputTokens += tokenUsage.inputTokens; + totals.outputTokens += tokenUsage.outputTokens; + totals.costUsd += tokenUsage.costUsd; + if (summary !== null) { + await persistFolderSummary(metaPaths, summary); + totals.succeeded += 1; + } else { + totals.failed += 1; + } + } catch (cause: unknown) { + if (cause instanceof CancellationError) { + throw cause; + } + totals.failed += 1; + logger.warn(`${phaseLabel}: folder summary failed for ${bucket.folderPath || ""}`); + } finally { + reporter?.increment(1, { fileName: bucket.folderPath || "" }); + } +} + +/** + * Dispatches a multi-folder batch through 
`summariseFolderBatch`. Each + * non-null per-folder summary is persisted; missing/null entries count + * toward `failed`. Progress increments once per folder. + */ +async function dispatchBatch( + batch: FolderBucket[], + metaPaths: MetaPaths, + totals: FolderSummaryTotals, + llmCallContext: AskLlmOptions | undefined, + reporter: ReturnType> | undefined, + knowledgeId: string, + phaseLabel: string, +): Promise { + try { + throwIfCancelled(knowledgeId); + const { summaries, tokenUsage } = await summariseFolderBatch(batch, llmCallContext); + totals.inputTokens += tokenUsage.inputTokens; + totals.outputTokens += tokenUsage.outputTokens; + totals.costUsd += tokenUsage.costUsd; + for (const bucket of batch) { + const summary = summaries.get(bucket.folderPath) ?? null; + if (summary !== null) { + try { + await persistFolderSummary(metaPaths, summary); + totals.succeeded += 1; + } catch (cause: unknown) { + totals.failed += 1; + logger.warn( + `${phaseLabel}: persist failed for ${bucket.folderPath || ""}: ${cause instanceof Error ? cause.message : String(cause)}`, + ); + } + } else { + totals.failed += 1; + } + reporter?.increment(1, { fileName: bucket.folderPath || "" }); + } + } catch (cause: unknown) { + if (cause instanceof CancellationError) { + throw cause; + } + totals.failed += batch.length; + for (const bucket of batch) { + reporter?.increment(1, { fileName: bucket.folderPath || "" }); + } + logger.warn( + `${phaseLabel}: batch summary failed for ${batch.length} folders: ${cause instanceof Error ? cause.message : String(cause)}`, + ); + } +} + +/** + * Dispatch helper used by both `runFolderSummaryPhase` and + * `runSelectiveFolderSummary`. Splits `groups` into individual + batched + * buckets, schedules every task through the shared `limiter`, awaits all, + * and returns the aggregated totals. 
+ */ +export async function dispatchFolderSummaries( + groups: Map, + metaPaths: MetaPaths, + limiter: ConcurrencyLimiter, + llmCallContext: AskLlmOptions | undefined, + reporter: ReturnType> | undefined, + knowledgeId: string, + phaseLabel: string, +): Promise { + const totals: FolderSummaryTotals = { succeeded: 0, failed: 0, inputTokens: 0, outputTokens: 0, costUsd: 0 }; + const { individual, batches } = groupFoldersForBatching(groups); + const tasks: Promise[] = []; + for (const bucket of individual) { + tasks.push(limiter(() => dispatchIndividual(bucket, metaPaths, totals, llmCallContext, reporter, knowledgeId, phaseLabel))); + } + for (const batch of batches) { + tasks.push(limiter(() => dispatchBatch(batch, metaPaths, totals, llmCallContext, reporter, knowledgeId, phaseLabel))); + } + await Promise.all(tasks); + return totals; +} + export async function runFolderSummaryPhase( knowledgeId: string, metaPaths: MetaPaths, cache: FileAnalysisCache, + limiter: ConcurrencyLimiter, llmCallContext?: AskLlmOptions, progressContext?: ProgressContext, ): Promise<{ @@ -121,57 +363,23 @@ export async function runFolderSummaryPhase( failed: number; tokenUsage: { inputTokens: number; outputTokens: number; costUsd: number }; }> { - const concurrentWorkers = getConfigValue(Config.ConcurrentWorkers); - const limit = withConcurrency(concurrentWorkers); const groups = groupByDirectFolder(cache); - let succeeded = 0; - let failed = 0; - let totalInputTokens = 0; - let totalOutputTokens = 0; - let totalCostUsd = 0; const reporter = progressContext?.reporter({ phase: "folder_analysis", total: { kind: "fixed", total: groups.size }, }); await reporter?.start(); + let totals: FolderSummaryTotals; try { - const tasks: Promise[] = []; - for (const [folderPath, files] of groups.entries()) { - tasks.push( - limit(async () => { - try { - throwIfCancelled(knowledgeId); - const { summary, tokenUsage } = await summariseFolder(folderPath, files, llmCallContext); - totalInputTokens += 
tokenUsage.inputTokens; - totalOutputTokens += tokenUsage.outputTokens; - totalCostUsd += tokenUsage.costUsd; - if (summary !== null) { - await persistFolderSummary(metaPaths, summary); - succeeded += 1; - } else { - failed += 1; - } - } catch (cause: unknown) { - if (cause instanceof CancellationError) { - throw cause; - } - failed += 1; - logger.warn(`phase5: folder summary failed for ${folderPath || ""}`); - } finally { - reporter?.increment(1, { fileName: folderPath || "" }); - } - }), - ); - } - await Promise.all(tasks); + totals = await dispatchFolderSummaries(groups, metaPaths, limiter, llmCallContext, reporter, knowledgeId, "phase5"); } finally { reporter?.stop(); } - logger.info(`phase5 done: foldersSummarised=${succeeded} failed=${failed}`); + logger.info(`phase5 done: foldersSummarised=${totals.succeeded} failed=${totals.failed}`); return { - succeeded, - failed, - tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens, costUsd: totalCostUsd }, + succeeded: totals.succeeded, + failed: totals.failed, + tokenUsage: { inputTokens: totals.inputTokens, outputTokens: totals.outputTokens, costUsd: totals.costUsd }, }; } diff --git a/packages/ingest-github/src/strategies/flat-folder/index.ts b/packages/ingest-github/src/strategies/flat-folder/index.ts index de9211b..c23f42c 100644 --- a/packages/ingest-github/src/strategies/flat-folder/index.ts +++ b/packages/ingest-github/src/strategies/flat-folder/index.ts @@ -116,6 +116,7 @@ export function createFlatFolderStrategy(deps: FlatFolderStrategyDeps): IngestSt knowledgeId, metaPaths, fileAnalysisCache, + limiter, llmCallContext, progressContext, ); diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/README.md b/packages/ingest-github/src/strategies/flat-folder/phases/README.md index 05ee606..cdcfddb 100644 --- a/packages/ingest-github/src/strategies/flat-folder/phases/README.md +++ b/packages/ingest-github/src/strategies/flat-folder/phases/README.md @@ -127,9 +127,17 @@ so 
phases 5 and 7 see the updated entries without re-reading disk. do not abort on a single bad file. Only `CancellationError`, `LlmConfigError`, and `LlmError` propagate. - The shared LLM limiter is the only place LLM concurrency is bounded - during the small/big phases. `Config.BigFileConcurrency` is no longer - consulted from the chunk-queue path (it is still consulted by the - legacy `processBigFile` used by the pull-path driver). + during the small/big phases **and the folder-summary phase**. + `Config.BigFileConcurrency` is no longer consulted from the chunk-queue + path (it is still consulted by the legacy `processBigFile` used by the + pull-path driver). `Config.ConcurrentWorkers` is no longer consulted + by the folder-summary phase. +- Phase 5 batches small folders by default. `Config.FolderSummaryBatchSize` + (default 10) controls batch size; set to 1 to disable and restore one + LLM call per folder. `Config.FolderSummaryBatchMaxFiles` (default 15) + is the per-folder file ceiling above which a folder always takes the + individual path so the LLM still sees the full per-file context. Large + folders run side-by-side with batches under the same shared limiter. - Phase 1 respects `Config.ContextWindowLimit` and `Config.MaxTokensPerChunk`; do not hardcode either. - Phase 7 always emits a `:Repo` node, even when `repo-summary.json` is diff --git a/packages/ingest-github/src/strategies/flat-folder/prompts/folder-summary.ts b/packages/ingest-github/src/strategies/flat-folder/prompts/folder-summary.ts index 10276a8..30e110b 100644 --- a/packages/ingest-github/src/strategies/flat-folder/prompts/folder-summary.ts +++ b/packages/ingest-github/src/strategies/flat-folder/prompts/folder-summary.ts @@ -40,3 +40,57 @@ Per-file analyses (direct children only): ${serialised}`; } + +export const FOLDER_BATCH_SYSTEM_PROMPT = `You are summarising MULTIPLE small folders of a source repository in one pass. 
The user will provide several folders, each labeled with an integer ID (0, 1, 2, ...). Each folder lists the files directly inside it (subfolders are summarised separately and are NOT in your input). + +Return ONLY a JSON object whose keys are the integer labels as strings ("0", "1", ...) and whose values are folder-summary objects with EXACTLY these keys: + +- purpose : string — one-paragraph explanation of what this folder is responsible for. +- summary : string — natural-language summary of how the files in this folder work together. Plain English, no key-value pairs. ≤ 300 tokens. +- keywords : string[] — up to 10 domain keywords describing this folder. +- classes : string[] — most important class/type entries, deduplicated. Format "Name: short purpose". Max 15 entries. +- functions : string[] — most important function/method entries, deduplicated. Format "name: short purpose". Max 15 entries. +- importsInternal : string[] — significant relative imports observed across the folder's files. Max 15 entries. +- importsExternal : string[] — significant external packages observed across the folder's files. Max 15 entries. +- dependencyGraph : string — Mermaid \`graph LR\` block (no triple-backtick fences) of inter-file dependencies. Empty string if not enough signal. + +You MUST return one entry per labeled folder, even if some fields are empty arrays. Do NOT invent files not listed. Do NOT speculate about subfolders. Do NOT add keys outside the integer-label set; do NOT add commentary outside the JSON object.`; + +export interface BatchedFolderInput { + label: number; + folderPath: string; + files: CondensedFileAnalysis[]; +} + +export function folderBatchUserPrompt(batch: BatchedFolderInput[]): string { + const sections = batch.map((b) => { + const folderLabel = b.folderPath.length === 0 ? 
"" : b.folderPath; + const fileLines = b.files.map((f) => `- ${f.relativePath}: ${f.analysis.purpose}`).join("\n"); + const aggregatedKeywords = aggregateKeywords(b.files, 10); + return `### Folder ${b.label} :: ${folderLabel} +Files: ${b.files.length} +${fileLines} +Aggregated keywords: ${JSON.stringify(aggregatedKeywords)}`; + }); + return `You are summarising ${batch.length} folder(s). Produce one folder-summary object per labeled folder. + +${sections.join("\n\n")}`; +} + +function aggregateKeywords(files: CondensedFileAnalysis[], cap: number): string[] { + const seen = new Set(); + const out: string[] = []; + for (const f of files) { + for (const k of f.analysis.keywords) { + if (typeof k !== "string" || k.length === 0 || seen.has(k)) { + continue; + } + seen.add(k); + out.push(k); + if (out.length >= cap) { + return out; + } + } + } + return out; +} diff --git a/packages/types/src/config.ts b/packages/types/src/config.ts index 950cb81..1e72f67 100644 --- a/packages/types/src/config.ts +++ b/packages/types/src/config.ts @@ -24,6 +24,8 @@ export enum Config { AbsoluteFileSizeCap = "absolute.file.size.cap", ConcurrentWorkers = "concurrent.workers", LlmConcurrency = "llm.concurrency", + FolderSummaryBatchSize = "folder.summary.batch.size", + FolderSummaryBatchMaxFiles = "folder.summary.batch.max.files", CondenseContextLimit = "condense.context.limit", CondensePromptOverhead = "condense.prompt.overhead", SmallFileDedupThreshold = "small.file.dedup.threshold", From d4b99b12bf5ab7f8c8a99520486cd9d99dcaaef5 Mon Sep 17 00:00:00 2001 From: Dead-Bytes <143434285+Dead-Bytes@users.noreply.github.com> Date: Fri, 22 May 2026 14:59:27 +0530 Subject: [PATCH 06/11] refactor: remove backfillBigFiles phase and update related documentation --- packages/ingest-github/src/pipeline/pull.ts | 14 ---- .../strategies/flat-folder/backfill/README.md | 77 +++++++++--------- .../flat-folder/backfill/big-files.ts | 78 ------------------- .../strategies/flat-folder/big-file/README.md | 21 
++--- .../src/strategies/flat-folder/index.ts | 14 ---- .../strategies/flat-folder/phases/README.md | 4 +- .../flat-folder/phases/process-big-files.ts | 48 ++++++++++-- 7 files changed, 93 insertions(+), 163 deletions(-) delete mode 100644 packages/ingest-github/src/strategies/flat-folder/backfill/big-files.ts diff --git a/packages/ingest-github/src/pipeline/pull.ts b/packages/ingest-github/src/pipeline/pull.ts index 8a4c706..fbc960a 100644 --- a/packages/ingest-github/src/pipeline/pull.ts +++ b/packages/ingest-github/src/pipeline/pull.ts @@ -21,7 +21,6 @@ import { nullProgressContextFactory } from "#src/progress/NullProgressReporter.t import { analyseChangedFiles } from "#src/strategies/flat-folder/analyse-changed.ts"; import { processBigFilesQueue } from "#src/strategies/flat-folder/phases/process-big-files.ts"; import { backfillMissingFields } from "#src/strategies/flat-folder/backfill/fields.ts"; -import { backfillBigFiles } from "#src/strategies/flat-folder/backfill/big-files.ts"; import { FileAnalysisCache } from "#src/strategies/flat-folder/file-analysis-cache.ts"; import { runSelectiveFolderSummary } from "#src/strategies/flat-folder/folder-summary-selective.ts"; import { @@ -204,19 +203,6 @@ export async function runPull( throwIfCancelled(knowledgeId); await backfillMissingFields(metaPaths, fileAnalysisCache, llmCallContext, progressContext); - logger.info(`pull: phase backfill big-files starting`); - throwIfCancelled(knowledgeId); - const backfillBigFilesInput: Parameters[0] = { - knowledgeId, - source, - metaPaths, - progressContext, - }; - if (llmCallContext !== undefined) { - backfillBigFilesInput.llmCallContext = llmCallContext; - } - await backfillBigFiles(backfillBigFilesInput); - progressContext.phaseChanged("folder_analysis"); logger.info(`pull: phase selective folder summary (${affectedFolders.size} folders) starting`); throwIfCancelled(knowledgeId); diff --git a/packages/ingest-github/src/strategies/flat-folder/backfill/README.md 
b/packages/ingest-github/src/strategies/flat-folder/backfill/README.md index dfa3d72..34f744d 100644 --- a/packages/ingest-github/src/strategies/flat-folder/backfill/README.md +++ b/packages/ingest-github/src/strategies/flat-folder/backfill/README.md @@ -1,66 +1,65 @@ # `@bb/ingest-github/src/strategies/flat-folder/backfill` -Post-analysis top-up phases. After Phases 1 and 2 have produced -`CondensedFileAnalysis` JSON on disk, the backfill phases sweep the cache -to fill gaps left by per-file LLM noise or by interrupted big-file runs. -Both are idempotent and skip entries that already look complete. +Post-analysis top-up. After Phases 1 and 2 have produced +`CondensedFileAnalysis` JSON on disk, this phase sweeps the in-memory +cache to fill extended-analysis fields the main per-file prompt left +empty. Idempotent — entries that already look complete are skipped +without an LLM call. + +The big-file backfill phase that used to live here was removed: the +new chunk-task-queue model in `phases/process-big-files.ts` handles +crash recovery directly via the per-chunk disk cache and `inspect()`, +and same-run condense failures now get one in-place retry before +being marked failed. ## Files -- `fields.ts` — Phase 3. `backfillMissingFields(metaPaths, llmCallContext?, progressContext?)` - iterates every condensed entry via `iterateCondensed`, computes which - extended-analysis fields are missing (`keywords`, `ontologyConcepts`, - `businessEntities`, `systemCapabilities`, `sideEffects`, - `configDependencies`, `dataFlowDirection`, `integrationSurface`, - `contractsProvided`, `contractsConsumed`, `sectionMap`), and asks one - LLM call per file to fill only the missing slots. The response is - validated and normalised (`pickStringArray`, `pickSections`) before - being written back via `saveCondensed`. Entries with nothing missing - are skipped without an LLM call.
When `progressContext` is present - this phase opens a growing-total reporter (`subPhase: "backfill"`) - because `iterateCondensed`'s size is not known up front. -- `big-files.ts` — Phase 4. `backfillBigFiles({knowledgeId, repoDir, -metaPaths, llmCallContext?, progressContext?})` re-reads - `bigFiles.json`, skips `reason === "too-large"`, and for each - non-complete entry (per `inspect`) re-runs `processBigFile` against - the file on disk so the condensed JSON is rebuilt from cached chunks - where possible. When `progressContext` is present this phase opens a - fixed-total reporter (`subPhase: "backfill:big_files"`, sized by - `bigFiles.json`) and forwards itself into `processBigFile` so per-file - chunk pulses also surface. +- `fields.ts` — Phase 3. `backfillMissingFields(metaPaths, cache, llmCallContext?, progressContext?)` + iterates every condensed entry from the shared `FileAnalysisCache`, + computes which extended-analysis fields are missing (`keywords`, + `ontologyConcepts`, `businessEntities`, `systemCapabilities`, + `sideEffects`, `configDependencies`, `dataFlowDirection`, + `integrationSurface`, `contractsProvided`, `contractsConsumed`, + `sectionMap`), and asks one LLM call per file to fill only the + missing slots. The response is validated and normalised + (`pickStringArray`, `pickSections`) before being written back via + `saveCondensed` **and** mirrored into the cache via `cache.set(entry)` + so downstream phases (folder summary, graph store) see the updated + entry without re-reading disk. Entries with nothing missing are + skipped without an LLM call. Progress reporter is fixed-total sized + by `cache.size`. 
## Public interfaces -- `backfillMissingFields(metaPaths, llmCallContext?, progressContext?): Promise<{ updated, failed }>` -- `backfillBigFiles(input: BackfillBigFilesInput): Promise` - — `BackfillBigFilesInput` carries an optional `llmCallContext?: AskLlmOptions` that the inner `processBigFile` call uses to forward per-job LLM credentials, and an optional `progressContext?: ProgressContext` for the per-phase reporter described above. +- `backfillMissingFields(metaPaths, cache, llmCallContext?, progressContext?): Promise<{ updated, failed }>` -Both return phase-summary counters consumed by `createFlatFolderStrategy` +Returns phase-summary counters consumed by `createFlatFolderStrategy` to roll up into the strategy result. ## Data ownership -These phases own no new on-disk artifacts. They mutate existing condensed -JSON in place via `saveCondensed`, and (Phase 4) drive `processBigFile` to -refresh the chunk and condensed caches under `big-file/storage.ts`. +This phase owns no new on-disk artifacts. It mutates existing +condensed JSON in place via `saveCondensed` and mirrors the same +mutation into `FileAnalysisCache`. ## Invariants - Idempotent: a second run is a no-op once every entry passes the completeness check. - Per-file LLM failure is logged and counted, never thrown. The phase - continues to the next entry. -- LLM output is untrusted: missing slots are filled only when the response - yields a non-empty value of the expected shape; partial responses leave - unfilled slots for a future pass. -- Phase 4 never touches `reason === "too-large"` entries — those stay as - stubs forever. + continues to the next entry. Only `LlmConfigError` / `LlmError` + propagate (treated as job-fatal upstream). +- LLM output is untrusted: missing slots are filled only when the + response yields a non-empty value of the expected shape; partial + responses leave unfilled slots for a future pass. 
+- Cache and disk stay in lockstep — every `saveCondensed` is paired + with a `cache.set(entry)` in the same code path. ## External dependencies `@bb/llm` (`askJsonLLM`), `@bb/logger`, `@bb/mongo` (types only — `FileAnalysis`, `FileAnalysisSection`), the sibling -`flat-folder/big-file/` cache layer, and the prompts under +`flat-folder/file-analysis-cache.ts`, and the prompts under `flat-folder/prompts/backfill.ts`. ## Tier diff --git a/packages/ingest-github/src/strategies/flat-folder/backfill/big-files.ts b/packages/ingest-github/src/strategies/flat-folder/backfill/big-files.ts deleted file mode 100644 index 587808c..0000000 --- a/packages/ingest-github/src/strategies/flat-folder/backfill/big-files.ts +++ /dev/null @@ -1,78 +0,0 @@ -import { logger } from "@bb/logger"; -import type { AskLlmOptions } from "@bb/llm"; -import { LlmConfigError, LlmError } from "@bb/errors"; -import type { MetaPaths } from "#src/types/meta-paths.ts"; -import type { SourceReader } from "#src/types/pipeline.ts"; -import type { ProgressContext } from "#src/progress/types.ts"; -import { readBigFiles } from "#src/strategies/flat-folder/big-file/detector.ts"; -import { inspect } from "#src/strategies/flat-folder/big-file/cache.ts"; -import { processBigFile } from "#src/strategies/flat-folder/big-file/index.ts"; - -export interface BackfillBigFilesInput { - knowledgeId: string; - source: SourceReader; - metaPaths: MetaPaths; - llmCallContext?: AskLlmOptions; - progressContext?: ProgressContext; -} - -export interface BackfillBigFilesResult { - reCondensed: number; - failed: number; -} - -export async function backfillBigFiles(input: BackfillBigFilesInput): Promise { - const entries = await readBigFiles(input.metaPaths); - let reCondensed = 0; - let failed = 0; - const reporter = input.progressContext?.reporter({ - phase: "file_analysis", - subPhase: "backfill:big_files", - total: { kind: "fixed", total: entries.length }, - }); - await reporter?.start(); - try { - for (const entry of entries) 
{ - if (entry.reason === "too-large") { - reporter?.increment(1, { fileName: entry.relativePath }); - continue; - } - const status = await inspect(input.metaPaths, entry.relativePath); - if (status === "complete") { - reporter?.increment(1, { fileName: entry.relativePath }); - continue; - } - try { - const content = await input.source.readFile(entry.relativePath); - if (content.length === 0) { - failed += 1; - logger.warn(`phase4: empty content for ${entry.relativePath}; skipping`); - reporter?.increment(1, { fileName: entry.relativePath }); - continue; - } - await processBigFile({ - knowledgeId: input.knowledgeId, - metaPaths: input.metaPaths, - relativePath: entry.relativePath, - content, - sizeBytes: entry.sizeBytes, - ...(input.llmCallContext !== undefined ? { llmCallContext: input.llmCallContext } : {}), - ...(input.progressContext !== undefined ? { progressContext: input.progressContext } : {}), - }); - reCondensed += 1; - } catch (cause: unknown) { - if (cause instanceof LlmConfigError || cause instanceof LlmError) { - throw cause; - } - failed += 1; - const msg = cause instanceof Error ? cause.message : String(cause); - logger.warn(`phase4: re-condense failed for ${entry.relativePath}: ${msg}`); - } - reporter?.increment(1, { fileName: entry.relativePath }); - } - logger.info(`phase4 done: reCondensed=${reCondensed} failed=${failed}`); - return { reCondensed, failed }; - } finally { - reporter?.stop(); - } -} diff --git a/packages/ingest-github/src/strategies/flat-folder/big-file/README.md b/packages/ingest-github/src/strategies/flat-folder/big-file/README.md index 3e4e6ef..264d8ea 100644 --- a/packages/ingest-github/src/strategies/flat-folder/big-file/README.md +++ b/packages/ingest-github/src/strategies/flat-folder/big-file/README.md @@ -27,8 +27,11 @@ depending on chunk count and prompt budget. - `storage.ts` — on-disk cache (chunk JSON, manifest, condensed analysis) + `iterateCondensed(metaPaths)` async iterator used by Phase 5. 
- `cache.ts` — `inspect(metaPaths, relativePath)` returns `complete`, - `stale-condensed`, or `missing`. Used by Phase 2 to short-circuit and by - Phase 4 to find candidates for cheap re-condense. + `stale-condensed`, or `missing`. Used by Phase 2 to short-circuit + already-finished big files on resume. The chunk task queue then + re-uses cached chunks via `loadChunkIfPresent` and re-runs condense + to recover any `stale-condensed` files — this is the crash-recovery + pathway that replaced the deleted Phase 4 backfill. - `index.ts` — `processBigFile({knowledgeId, metaPaths, relativePath, content, sizeBytes, llmCallContext?, progressContext?})`. Sequential per file (chunk-level concurrency inside). Persists every intermediate artifact, @@ -49,17 +52,17 @@ the storage / cache primitives) are consumed by **two** drivers: - `processBigFile` (`index.ts`) — legacy serial driver. One big file at a time, chunks-within-file parallel under `Config.BigFileConcurrency`, - followed by a blocking condense. Used today by the pull-path - (`pipeline/pull.ts`) via `processBigFilesQueue` and by the Phase 4 - backfill. + followed by a blocking condense. Used today only by the pull-path + (`pipeline/pull.ts`) via `processBigFilesQueue`. - `analyseBigFiles` (`phases/process-big-files.ts`) — manifest-driven chunk-task queue used by the main strategy entry. Every chunk of every big file is an independent task scheduled through a strategy-wide shared `ConcurrencyLimiter`. As soon as a file's last chunk lands, - that file's `condenseChunks` is scheduled through the same limiter — - multiple condenses run in parallel with chunks of slower files. - Reuses `splitFileIntoChunks`, `analyzeChunk`, `condenseChunks`, and - the storage helpers without modification. + that file's `condenseChunks` is scheduled through the same limiter + (with one in-place retry on transient failure) — multiple condenses + run in parallel with chunks of slower files. 
Reuses + `splitFileIntoChunks`, `analyzeChunk`, `condenseChunks`, and the + storage helpers without modification. ## Invariants diff --git a/packages/ingest-github/src/strategies/flat-folder/index.ts b/packages/ingest-github/src/strategies/flat-folder/index.ts index c23f42c..e70396f 100644 --- a/packages/ingest-github/src/strategies/flat-folder/index.ts +++ b/packages/ingest-github/src/strategies/flat-folder/index.ts @@ -10,7 +10,6 @@ import { scanAndClassify } from "./phases/scan-and-classify.ts"; import { analyseSmallFiles } from "./phases/analyse-small.ts"; import { analyseBigFiles } from "./phases/process-big-files.ts"; import { backfillMissingFields } from "./backfill/fields.ts"; -import { backfillBigFiles } from "./backfill/big-files.ts"; import { FileAnalysisCache } from "./file-analysis-cache.ts"; import { runFolderSummaryPhase } from "./folder-summary.ts"; import { makeRepoSummaryEnvelope, persistRepoSummary, summariseRepo } from "./repo-summary.ts"; @@ -96,19 +95,6 @@ export function createFlatFolderStrategy(deps: FlatFolderStrategyDeps): IngestSt throwIfCancelled(knowledgeId); await backfillMissingFields(metaPaths, fileAnalysisCache, llmCallContext, progressContext); - logger.info(`flat-folder: phase4 (backfill big files) starting`); - throwIfCancelled(knowledgeId); - const phase4Input: Parameters[0] = { - knowledgeId, - source, - metaPaths, - progressContext, - }; - if (llmCallContext !== undefined) { - phase4Input.llmCallContext = llmCallContext; - } - await backfillBigFiles(phase4Input); - progressContext.phaseChanged("folder_analysis"); logger.info(`flat-folder: phase5 (folder summaries) starting`); throwIfCancelled(knowledgeId); diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/README.md b/packages/ingest-github/src/strategies/flat-folder/phases/README.md index cdcfddb..6301e38 100644 --- a/packages/ingest-github/src/strategies/flat-folder/phases/README.md +++ b/packages/ingest-github/src/strategies/flat-folder/phases/README.md 
@@ -71,8 +71,8 @@ scanAndClassify ↓ FileAnalysisCache.loadAll (one parallel readdir+readFile pass) ↓ -backfillMissingFields → backfillBigFiles → folderSummary → repoSummary → storeFlatAnalysis - (cache read+write) (no cache) (cache read) (cache read) +backfillMissingFields → folderSummary → repoSummary → storeFlatAnalysis + (cache read+write) (cache read) (cache read) ``` `FileAnalysisCache` is a `Map` loaded diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts b/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts index 70d5102..1577849 100644 --- a/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts +++ b/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts @@ -5,7 +5,7 @@ import { getConfigValue } from "@bb/config"; import type { AskLlmOptions } from "@bb/llm"; import { LlmConfigError, LlmError } from "@bb/errors"; import type { MetaPaths } from "#src/types/meta-paths.ts"; -import type { SourceReader } from "#src/types/pipeline.ts"; +import type { AnalyzedFileResult, SourceReader } from "#src/types/pipeline.ts"; import type { ProgressContext } from "#src/progress/types.ts"; import type { ConcurrencyLimiter } from "#src/pipeline/concurrency.ts"; import type { ChunkAnalysisResult, FileChunk, HugeFileManifest } from "#src/types/big-file.ts"; @@ -20,6 +20,9 @@ import { loadChunkIfPresent, saveChunk, saveCondensed, saveManifest } from "#src import { processBigFile } from "#src/strategies/flat-folder/big-file/index.ts"; import type { ScanManifest, ScanManifestEntry } from "#src/strategies/flat-folder/scan-manifest.ts"; +const CONDENSE_MAX_ATTEMPTS = 2; +const CONDENSE_RETRY_BACKOFF_MS = 2000; + export interface ProcessBigFilesInput { knowledgeId: string; source: SourceReader; @@ -239,9 +242,37 @@ export async function analyseBigFiles(input: AnalyseBigFilesInput): Promise { throwIfCancelled(input.knowledgeId); - try { - const merged = await 
condenseChunks(state.entry.relativePath, definedResults, input.llmCallContext); + let merged: AnalyzedFileResult | null = null; + for (let attempt = 1; attempt <= CONDENSE_MAX_ATTEMPTS; attempt += 1) { + try { + merged = await condenseChunks(state.entry.relativePath, definedResults, input.llmCallContext); + break; + } catch (cause: unknown) { + if (cause instanceof CancellationError) { + throw cause; + } + if (cause instanceof LlmConfigError || cause instanceof LlmError) { + throw cause; + } + if (attempt < CONDENSE_MAX_ATTEMPTS) { + logger.warn( + `analyse-big: condense attempt ${attempt}/${CONDENSE_MAX_ATTEMPTS} failed for ${state.entry.relativePath}; retrying: ${describe(cause)}`, + ); + await sleep(CONDENSE_RETRY_BACKOFF_MS); + continue; + } + failed += 1; + logger.warn( + `analyse-big: condense failed after ${CONDENSE_MAX_ATTEMPTS} attempts for ${state.entry.relativePath}: ${describe(cause)}`, + ); + } + } + if (merged === null) { + condenseReporter?.increment(1, { fileName: state.entry.relativePath }); + return; + } + try { const chunkInputTokens = definedResults.reduce((acc, r) => acc + (r.tokenUsage?.inputTokens ?? 0), 0); const chunkOutputTokens = definedResults.reduce((acc, r) => acc + (r.tokenUsage?.outputTokens ?? 0), 0); const chunkCostUsd = definedResults.reduce((acc, r) => acc + (r.tokenUsage?.costUsd ?? 
0), 0); @@ -282,11 +313,8 @@ export async function analyseBigFiles(input: AnalyseBigFilesInput): Promise { + return new Promise((resolve) => { + setTimeout(resolve, ms); + }); +} From 1afd5d68e0ff3792744df8f601baab1825251e53 Mon Sep 17 00:00:00 2001 From: Dead-Bytes <143434285+Dead-Bytes@users.noreply.github.com> Date: Fri, 22 May 2026 17:25:07 +0530 Subject: [PATCH 07/11] Refactor backfill process to use concurrency limiter and batch Neo4j upserts --- packages/config/src/schema.ts | 7 + packages/ingest-github/src/pipeline/README.md | 27 ++- packages/ingest-github/src/pipeline/pull.ts | 2 +- packages/ingest-github/src/pipeline/scan.ts | 197 ++++++++++++++-- .../src/pipeline/skip-decisions/README.md | 46 +++- .../src/pipeline/skip-decisions/decider.ts | 119 ++++++---- .../src/strategies/flat-folder/README.md | 160 ++++++++----- .../strategies/flat-folder/backfill/README.md | 27 ++- .../strategies/flat-folder/backfill/fields.ts | 52 +++-- .../src/strategies/flat-folder/index.ts | 3 +- .../strategies/flat-folder/phases/README.md | 23 +- .../flat-folder/phases/scan-and-classify.ts | 12 + .../flat-folder/phases/store-flat-analysis.ts | 121 ++++++---- packages/ingest-github/src/types/README.md | 16 +- packages/ingest-github/src/types/pipeline.ts | 35 +++ packages/neo4j/README.md | 36 +-- packages/neo4j/src/client.ts | 29 +++ packages/neo4j/src/files.ts | 221 +++++++++++++++++- packages/neo4j/src/folder.ts | 76 +++++- packages/neo4j/src/index.ts | 4 +- packages/types/src/config.ts | 1 + 21 files changed, 992 insertions(+), 222 deletions(-) diff --git a/packages/config/src/schema.ts b/packages/config/src/schema.ts index 1f7a021..d5bae9d 100644 --- a/packages/config/src/schema.ts +++ b/packages/config/src/schema.ts @@ -44,6 +44,7 @@ export const configSchema = z "llm.concurrency": z.number().int().positive().default(29), "folder.summary.batch.size": z.number().int().positive().default(10), "folder.summary.batch.max.files": z.number().int().positive().default(15), + 
"neo4j.batch.size": z.number().int().positive().default(50), "condense.context.limit": z.number().int().positive().default(12000), "condense.prompt.overhead": z.number().int().nonnegative().default(1500), "small.file.dedup.threshold": z.number().int().positive().default(3), @@ -87,6 +88,7 @@ export type ConfigValueMap = { [Config.LlmConcurrency]: number; [Config.FolderSummaryBatchSize]: number; [Config.FolderSummaryBatchMaxFiles]: number; + [Config.Neo4jBatchSize]: number; [Config.CondenseContextLimit]: number; [Config.CondensePromptOverhead]: number; [Config.SmallFileDedupThreshold]: number; @@ -144,6 +146,7 @@ export const HINTS: Readonly> = { [Config.LlmConcurrency]: "bytebell set llm.concurrency ", [Config.FolderSummaryBatchSize]: "bytebell set folder.summary.batch.size ", [Config.FolderSummaryBatchMaxFiles]: "bytebell set folder.summary.batch.max.files ", + [Config.Neo4jBatchSize]: "bytebell set neo4j.batch.size ", [Config.CondenseContextLimit]: "bytebell set condense.context.limit ", [Config.CondensePromptOverhead]: "bytebell set condense.prompt.overhead ", [Config.SmallFileDedupThreshold]: "bytebell set small.file.dedup.threshold ", @@ -210,6 +213,8 @@ export function readField(cfg: BytebellConfig, key: K): Config return cfg["folder.summary.batch.size"] as ConfigValue; case Config.FolderSummaryBatchMaxFiles: return cfg["folder.summary.batch.max.files"] as ConfigValue; + case Config.Neo4jBatchSize: + return cfg["neo4j.batch.size"] as ConfigValue; case Config.CondenseContextLimit: return cfg["condense.context.limit"] as ConfigValue; case Config.CondensePromptOverhead: @@ -285,6 +290,8 @@ export function writeField(cfg: BytebellConfig, key: K, value: return { ...cfg, "folder.summary.batch.size": value as number }; case Config.FolderSummaryBatchMaxFiles: return { ...cfg, "folder.summary.batch.max.files": value as number }; + case Config.Neo4jBatchSize: + return { ...cfg, "neo4j.batch.size": value as number }; case Config.CondenseContextLimit: return { ...cfg, 
"condense.context.limit": value as number }; case Config.CondensePromptOverhead: diff --git a/packages/ingest-github/src/pipeline/README.md b/packages/ingest-github/src/pipeline/README.md index 0c57d78..ae9da32 100644 --- a/packages/ingest-github/src/pipeline/README.md +++ b/packages/ingest-github/src/pipeline/README.md @@ -28,7 +28,7 @@ Domain (sub-folder of `@bb/ingest-github`). - `skip-decisions/` — LLM-backed unknown-extension gate. See `skip-decisions/README.md`. Active when `Config.SkipDecisionEnabled = true` (default). Consumed by `scan.ts` via the optional `skipDecider` - dep; built by `classifyAndAnalyseSmall` if not injected. + dep; built by `scanAndClassify` (Phase 1) if not injected. - `disk-source-reader.ts` — `createDiskSourceReader({ repoDir, commitHash })` returns a `SourceReader` that wraps `scanRepository` + `node:fs.readFile`. The default reader the open-source binary always uses, unless the caller @@ -40,9 +40,26 @@ true` (default). Consumed by `scan.ts` via the optional `skipDecider` enters the big-file phase). Both thresholds are config-driven — no magic numbers in this file. `deps.llmCallContext` (when present) is forwarded into every `SkipDeciderInput` so the LLM branch of the - unknown-extension gate uses per-job credentials. `readScannedFile` - re-reads a file by absolute path for the big-file phase which streams - content lazily. + unknown-extension gate uses per-job credentials. + + **Two scan modes:** + - **Two-pass (default for the flat-folder strategy)** — activated when + `deps.skipDecider` AND `deps.limiter` are both supplied. Pass 1 walks + the tree calling `decider.decideStatic(...)`; static-resolved files + yield immediately, "needs LLM" files go into a pending buffer with + their content. Pass 2 dedupes pending entries by `ext:` or + `filename:`, dispatches one `decider.decideAndDeferSave(...)` per + unique key through the shared limiter via `Promise.all`, then calls + `decider.persist()` exactly once. 
Pass 3 drains pending — every + `decideStatic` call is now a cache hit, so the drain is sync at the + decider boundary and yields each kept file with its buffered content. + - **Legacy inline (`walk()`)** — used when `deps.limiter` is omitted (e.g. + a custom `SourceFactory` consumer that didn't opt in). Inline `await +deps.skipDecider.decide(input)` per file. Same semantics as before this + refactor; preserved for backwards compatibility. + + `readScannedFile` re-reads a file by absolute path for the big-file phase + which streams content lazily. - `run.ts` — `createPipelineRunner({ reposRootDir, strategy, sourceFactory?, progressContextFactory? })` builds an `IngestRunnerDeps`. GitHub payloads run: branch resolve, source-reader construction, strategy execute, commit persistence. Local @@ -76,7 +93,7 @@ archiveSink?}` and `runPull` skips `syncRepository` + `materialiseEndpoints` (open-source default), the legacy git-based path runs. Either path produces the same downstream pipeline: snapshot prior version, `analyseChangedFiles` (now reading via `SourceReader`), - `processBigFilesQueue`, `backfillMissingFields`, `backfillBigFiles`, + `processBigFilesQueue`, `backfillMissingFields`, `runSelectiveFolderSummary`, `summariseRepo`, `storePullAnalysis`. 
Mirrors the index-side strategy orchestrator for progress: builds one `ProgressContext` per job from the optional `progressContextFactory` diff --git a/packages/ingest-github/src/pipeline/pull.ts b/packages/ingest-github/src/pipeline/pull.ts index fbc960a..be344a6 100644 --- a/packages/ingest-github/src/pipeline/pull.ts +++ b/packages/ingest-github/src/pipeline/pull.ts @@ -201,7 +201,7 @@ export async function runPull( logger.info(`pull: phase backfill fields starting`); throwIfCancelled(knowledgeId); - await backfillMissingFields(metaPaths, fileAnalysisCache, llmCallContext, progressContext); + await backfillMissingFields(metaPaths, fileAnalysisCache, limiter, llmCallContext, progressContext); progressContext.phaseChanged("folder_analysis"); logger.info(`pull: phase selective folder summary (${affectedFolders.size} folders) starting`); diff --git a/packages/ingest-github/src/pipeline/scan.ts b/packages/ingest-github/src/pipeline/scan.ts index 02d17ea..fda9236 100644 --- a/packages/ingest-github/src/pipeline/scan.ts +++ b/packages/ingest-github/src/pipeline/scan.ts @@ -5,7 +5,8 @@ import { getConfigValue } from "@bb/config"; import type { AskLlmOptions } from "@bb/llm"; import { logger } from "@bb/logger"; import { SKIP_DIRS, looksBinary, passesPathFilters } from "./filters.ts"; -import type { ScanEntry, SkipDecider } from "#src/types/pipeline.ts"; +import type { ConcurrencyLimiter } from "./concurrency.ts"; +import type { ScanEntry, SkipDecider, SkipDeciderInput } from "#src/types/pipeline.ts"; interface ScanLimits { absoluteCap: number; @@ -15,18 +16,7 @@ interface ScanLimits { export interface ScanRepositoryDeps { skipDecider?: SkipDecider; llmCallContext?: AskLlmOptions; -} - -export async function* scanRepository(rootDir: string, deps: ScanRepositoryDeps = {}): AsyncGenerator { - const limits: ScanLimits = { - absoluteCap: getConfigValue(Config.AbsoluteFileSizeCap), - bigFileLineThreshold: getConfigValue(Config.BigFileLineThreshold), - }; - const counts = { 
acceptStatic: 0, acceptLlm: 0, rejectStatic: 0, rejectLlm: 0, oversized: 0, binary: 0 }; - yield* walk(rootDir, rootDir, limits, deps, counts); - logger.info( - `scan: acceptStatic=${counts.acceptStatic} acceptLlm=${counts.acceptLlm} rejectStatic=${counts.rejectStatic} rejectLlm=${counts.rejectLlm} oversized=${counts.oversized} binary=${counts.binary}`, - ); + limiter?: ConcurrencyLimiter; } interface ScanCounts { @@ -38,6 +28,44 @@ interface ScanCounts { binary: number; } +interface PendingFile { + relativePath: string; + absolutePath: string; + sizeBytes: number; + content: string; + ext: string; + input: SkipDeciderInput; +} + +function newCounts(): ScanCounts { + return { acceptStatic: 0, acceptLlm: 0, rejectStatic: 0, rejectLlm: 0, oversized: 0, binary: 0 }; +} + +function logCounts(counts: ScanCounts): void { + logger.info( + `scan: acceptStatic=${counts.acceptStatic} acceptLlm=${counts.acceptLlm} rejectStatic=${counts.rejectStatic} rejectLlm=${counts.rejectLlm} oversized=${counts.oversized} binary=${counts.binary}`, + ); +} + +export async function* scanRepository(rootDir: string, deps: ScanRepositoryDeps = {}): AsyncGenerator { + const limits: ScanLimits = { + absoluteCap: getConfigValue(Config.AbsoluteFileSizeCap), + bigFileLineThreshold: getConfigValue(Config.BigFileLineThreshold), + }; + + // Two-pass parallel mode requires both a skip-decider AND a limiter so that + // pending LLM resolutions can be deduplicated and dispatched concurrently. + // Without either, fall back to the inline-await walk that's been here all along. 
+ if (deps.skipDecider !== undefined && deps.limiter !== undefined) { + yield* twoPassScan(rootDir, limits, deps.skipDecider, deps.limiter, deps); + return; + } + + const counts = newCounts(); + yield* walk(rootDir, rootDir, limits, deps, counts); + logCounts(counts); +} + async function* walk( rootDir: string, currentDir: string, @@ -82,7 +110,7 @@ async function* walk( continue; } if (deps.skipDecider !== undefined) { - const deciderInput: Parameters[0] = { relativePath, absolutePath: abs, ext }; + const deciderInput: SkipDeciderInput = { relativePath, absolutePath: abs, ext }; if (deps.llmCallContext !== undefined) { deciderInput.llmCallContext = deps.llmCallContext; } @@ -113,6 +141,147 @@ async function* walk( } } +async function* twoPassScan( + rootDir: string, + limits: ScanLimits, + decider: SkipDecider, + limiter: ConcurrencyLimiter, + deps: ScanRepositoryDeps, +): AsyncGenerator { + const counts = newCounts(); + const pending: PendingFile[] = []; + + // Pass 1: walk + categorize. Static-decided files yield immediately; + // "needs LLM" files go into `pending` for batch resolution. + yield* walkAndCategorize(rootDir, rootDir, limits, deps, decider, counts, pending); + + // Pass 2: dedupe pending by decision key (extension or filename), schedule + // one LLM call per unique key through the shared limiter, then persist the + // decider's cache once. + if (pending.length > 0) { + const unique = new Map(); + for (const p of pending) { + const key = decisionKey(p); + if (!unique.has(key)) { + unique.set(key, p.input); + } + } + logger.info(`scan: resolving ${unique.size} unique skip-decision keys for ${pending.length} pending files`); + await Promise.all( + Array.from(unique.values()).map((input) => limiter(() => decider.decideAndDeferSave(input))), + ); + decider.persist(); + } + + // Pass 3: drain pending. Every decideStatic call is now a cache hit. 
+ for (const p of pending) { + const decision = decider.decideStatic(p.input); + if (decision === "reject-static" || decision === null) { + counts.rejectStatic += 1; + continue; + } + if (decision === "reject-llm") { + counts.rejectLlm += 1; + continue; + } + if (decision === "accept-llm") { + counts.acceptLlm += 1; + } else { + counts.acceptStatic += 1; + } + yield { + kind: "file", + relativePath: p.relativePath, + absolutePath: p.absolutePath, + sizeBytes: p.sizeBytes, + content: p.content, + }; + } + + logCounts(counts); +} + +async function* walkAndCategorize( + rootDir: string, + currentDir: string, + limits: ScanLimits, + deps: ScanRepositoryDeps, + decider: SkipDecider, + counts: ScanCounts, + pending: PendingFile[], +): AsyncGenerator { + const dir = await opendir(currentDir); + for await (const entry of dir) { + const abs = path.join(currentDir, entry.name); + if (entry.isDirectory()) { + if (SKIP_DIRS.has(entry.name)) { + continue; + } + yield* walkAndCategorize(rootDir, abs, limits, deps, decider, counts, pending); + continue; + } + if (!entry.isFile()) { + continue; + } + if (!passesPathFilters(entry.name, path.extname(entry.name))) { + counts.rejectStatic += 1; + continue; + } + const sizeBytes = (await stat(abs)).size; + const relativePath = path.relative(rootDir, abs); + const ext = path.extname(entry.name).toLowerCase(); + if (sizeBytes > limits.absoluteCap) { + counts.oversized += 1; + yield { kind: "oversized", relativePath, absolutePath: abs, sizeBytes }; + continue; + } + const buf = await readFile(abs); + if (looksBinary(buf)) { + counts.binary += 1; + continue; + } + const content = buf.toString("utf8"); + if (countLines(content) > limits.bigFileLineThreshold) { + counts.oversized += 1; + yield { kind: "oversized", relativePath, absolutePath: abs, sizeBytes }; + continue; + } + const deciderInput: SkipDeciderInput = { relativePath, absolutePath: abs, ext }; + if (deps.llmCallContext !== undefined) { + deciderInput.llmCallContext = 
deps.llmCallContext; + } + const sync = decider.decideStatic(deciderInput); + if (sync === "reject-static") { + counts.rejectStatic += 1; + continue; + } + if (sync === "reject-llm") { + counts.rejectLlm += 1; + continue; + } + if (sync === "accept-llm") { + counts.acceptLlm += 1; + yield { kind: "file", relativePath, absolutePath: abs, sizeBytes, content }; + continue; + } + if (sync === "accept") { + counts.acceptStatic += 1; + yield { kind: "file", relativePath, absolutePath: abs, sizeBytes, content }; + continue; + } + // sync === null → needs LLM. Defer to pass 2. + pending.push({ relativePath, absolutePath: abs, sizeBytes, content, ext, input: deciderInput }); + } +} + +function decisionKey(p: PendingFile): string { + if (p.ext.length > 0) { + return `ext:${p.ext}`; + } + const segments = p.relativePath.split("/"); + return `filename:${segments[segments.length - 1] ?? p.relativePath}`; +} + function countLines(content: string): number { if (content.length === 0) { return 0; diff --git a/packages/ingest-github/src/pipeline/skip-decisions/README.md b/packages/ingest-github/src/pipeline/skip-decisions/README.md index f4e0273..4a6fa2f 100644 --- a/packages/ingest-github/src/pipeline/skip-decisions/README.md +++ b/packages/ingest-github/src/pipeline/skip-decisions/README.md @@ -17,6 +17,36 @@ single-tenant public layout. 8. Persist verdict to ~/.bytebell/llmDecisions.json. LLM failure → reject + cache the rejection. ``` +Steps 1-6 are pure CPU + cached lookup — they run synchronously via +`decideStatic`. Step 7 is the slow LLM branch; `decide` performs it +inline, while `decideAndDeferSave` performs it without flushing the +cache to disk so a batched caller can `persist()` once at the end of +its batch. 
+ +## Public methods (`SkipDecider`) + +```ts +interface SkipDecider { + decide(input): Promise<SkipDecision>; // legacy single-shot path + decideStatic(input): SkipDecision | null; // sync; null = needs LLM + decideAndDeferSave(input): Promise<SkipDecision>; // LLM call, no disk save + persist(): void; // flush cache to disk once +} +``` + +- `decide` — the original single-shot API. Calls `decideStatic`; if that + returns `null`, runs the LLM call and `persist()`s the cache. Used by + the legacy `walk()` in `scan.ts` when no shared limiter is passed + (e.g. custom `SourceFactory` consumers that don't opt into two-pass). +- `decideStatic` — synchronous. Returns the resolved `SkipDecision` for + steps 1-6; returns `null` to signal "would need an LLM call". Used by + the two-pass scan to categorise files without blocking the walk. +- `decideAndDeferSave` — runs the LLM call and mutates the in-memory + cache but does **not** flush to disk. Scan calls this concurrently + for unique extension/filename keys under a shared limiter; the disk + write happens once via `persist()` after the batch. +- `persist` — best-effort cache flush; swallows I/O errors. + ## Files - `seed.ts` — loads the four bundled JSON files (directory/filename/pattern/extension lists) @@ -36,7 +66,10 @@ single-tenant public layout. factory time; when disabled the decider degrades to "accept everything past the static blocklist". The LLM branch forwards `SkipDeciderInput.llmCallContext` (when set by the runner) into - `askYesNoLLM` so per-job credentials reach the decision call. + `askYesNoLLM` so per-job credentials reach the decision call. The three + decision methods (`decide`, `decideStatic`, `decideAndDeferSave`) share + one internal `staticDecision()` helper so the seed-list + cache-lookup + branch is defined exactly once. - `seed-data/` — the five JSON files copied from kube's `shared/`: `directoryIgnore.json`, `filenameIgnore.json`, `ignorePatterns.json`, `extensions.json`, `llmDecisionsBase.json`.
`llmDecisionsBase.json` is @@ -56,8 +89,15 @@ single-tenant public layout. beyond reading the cache file once at factory time. Only the LLM branch reads file content from disk, and even that is bounded by `Config.SkipDecisionMaxCharsForLlm`. -- Every LLM verdict is flushed to disk immediately so a crash mid-scan does - not lose decisions made earlier in the run. +- `decide` flushes to disk immediately after each LLM verdict — same + semantics as before this refactor, so a crash mid-scan does not lose + decisions made earlier in the run when the legacy inline path is in use. +- `decideAndDeferSave` does **not** flush; the batched caller (two-pass + scan) is responsible for calling `persist()` exactly once after the + parallel batch resolves. This avoids racing tmp/rename writes when many + unique extensions resolve concurrently. Crash recovery in two-pass mode + is acceptable because the batch is short and re-running the scan + re-resolves the same decisions. - LLM failure defaults to reject and caches the rejection — matches kube's one-shot-rule behavior. Users can hand-edit the cache to revisit. - The decider is process-local: tests may construct one with `cachePath` diff --git a/packages/ingest-github/src/pipeline/skip-decisions/decider.ts b/packages/ingest-github/src/pipeline/skip-decisions/decider.ts index 455f633..50185e8 100644 --- a/packages/ingest-github/src/pipeline/skip-decisions/decider.ts +++ b/packages/ingest-github/src/pipeline/skip-decisions/decider.ts @@ -29,6 +29,11 @@ export interface SkipDeciderDeps { cachePath?: string; } +interface StaticDecisionContext { + filename: string; + segments: string[]; +} + export function makeSkipDecider(deps: SkipDeciderDeps = {}): SkipDecider { const enabled = getConfigValue(Config.SkipDecisionEnabled); const cachePath = deps.cachePath ??
defaultCachePath(); @@ -37,54 +42,90 @@ export function makeSkipDecider(deps: SkipDeciderDeps = {}): SkipDecider { logCacheSummary(cache); } - return { - async decide(input: SkipDeciderInput): Promise { - const segments = input.relativePath.split("/"); - const filename = segments[segments.length - 1] ?? input.relativePath; - for (const segment of segments.slice(0, -1)) { - if (SEED_DIRECTORIES.has(segment)) { - return "reject-static"; - } - } - if (SEED_FILENAMES.has(filename)) { - return "reject-static"; - } - if (input.ext.length > 0 && SEED_EXTENSIONS.has(input.ext)) { - return "reject-static"; - } - if (matchesAnyGlob(filename)) { + function contextFor(input: SkipDeciderInput): StaticDecisionContext { + const segments = input.relativePath.split("/"); + const filename = segments[segments.length - 1] ?? input.relativePath; + return { filename, segments }; + } + + function staticDecision(input: SkipDeciderInput): SkipDecision | null { + const { filename, segments } = contextFor(input); + for (const segment of segments.slice(0, -1)) { + if (SEED_DIRECTORIES.has(segment)) { return "reject-static"; } + } + if (SEED_FILENAMES.has(filename)) { + return "reject-static"; + } + if (input.ext.length > 0 && SEED_EXTENSIONS.has(input.ext)) { + return "reject-static"; + } + if (matchesAnyGlob(filename)) { + return "reject-static"; + } - if (input.ext.length > 0 && KNOWN_LANGUAGE_EXTENSIONS.has(input.ext)) { - return "accept"; - } + if (input.ext.length > 0 && KNOWN_LANGUAGE_EXTENSIONS.has(input.ext)) { + return "accept"; + } - if (!enabled) { - return "accept"; - } + if (!enabled) { + return "accept"; + } - const cacheKey = input.ext.length > 0 ? input.ext : filename; - const section = input.ext.length > 0 ? cache.extensions : cache.filenames; - const cached = section[cacheKey]; - if (cached !== undefined) { - return cached.ignore ? "reject-llm" : "accept-llm"; - } + const cacheKey = input.ext.length > 0 ? input.ext : filename; + const section = input.ext.length > 0 ? 
cache.extensions : cache.filenames; + const cached = section[cacheKey]; + if (cached !== undefined) { + return cached.ignore ? "reject-llm" : "accept-llm"; + } + return null; + } + + async function resolveLlm(input: SkipDeciderInput): Promise { + const { filename } = contextFor(input); + const decision = await askLlmDecision(input, deps.repositoryName, input.llmCallContext); + if (input.ext.length > 0) { + setExtensionDecision(cache, input.ext, !decision, "llm", deps.repositoryName, input.relativePath); + } else { + setFilenameDecision(cache, filename, !decision, "llm", deps.repositoryName, input.relativePath); + } + return decision ? "accept-llm" : "reject-llm"; + } + + function persist(): void { + if (!enabled) { + return; + } + try { + saveCache(cachePath, cache); + } catch (cause: unknown) { + const msg = cause instanceof Error ? cause.message : String(cause); + logger.warn(`skip-decisions: failed to save cache to ${cachePath}: ${msg}`); + } + } - const decision = await askLlmDecision(input, deps.repositoryName, input.llmCallContext); - if (input.ext.length > 0) { - setExtensionDecision(cache, input.ext, !decision, "llm", deps.repositoryName, input.relativePath); - } else { - setFilenameDecision(cache, filename, !decision, "llm", deps.repositoryName, input.relativePath); + return { + async decide(input: SkipDeciderInput): Promise { + const sync = staticDecision(input); + if (sync !== null) { + return sync; } - try { - saveCache(cachePath, cache); - } catch (cause: unknown) { - const msg = cause instanceof Error ? cause.message : String(cause); - logger.warn(`skip-decisions: failed to save cache to ${cachePath}: ${msg}`); + const result = await resolveLlm(input); + persist(); + return result; + }, + decideStatic(input: SkipDeciderInput): SkipDecision | null { + return staticDecision(input); + }, + async decideAndDeferSave(input: SkipDeciderInput): Promise { + const sync = staticDecision(input); + if (sync !== null) { + return sync; } - return decision ? 
"accept-llm" : "reject-llm"; + return await resolveLlm(input); }, + persist, }; } diff --git a/packages/ingest-github/src/strategies/flat-folder/README.md b/packages/ingest-github/src/strategies/flat-folder/README.md index a454303..5a725f4 100644 --- a/packages/ingest-github/src/strategies/flat-folder/README.md +++ b/packages/ingest-github/src/strategies/flat-folder/README.md @@ -1,32 +1,50 @@ # `@bb/ingest-github/src/strategies/flat-folder` -The v2 ingestion strategy: clone → scan → big-file split → per-file analyse → -folder summary → repo summary → graph store. Each phase persists artifacts on -disk before the next begins, so a crash resumes cleanly from the next -sub-phase boundary. +The v2 ingestion strategy: scan + classify → analyse small + big in parallel → +field backfill → folder summary → repo summary → graph store. Each phase +persists artifacts on disk before the next begins, so a crash resumes cleanly +from the next sub-phase boundary. + +The strategy constructs **one shared `ConcurrencyLimiter`** at entry (sized by +`Config.LlmConcurrency`, default 29). Every LLM call across small-file +analyses, big-file chunk analyses, per-file condense calls, the skip-decision +LLM gate (during scan), field backfill, and folder summaries checks out from +this single pool. One knob bounds total in-flight LLM concurrency. ## Phases -1. **classify-and-analyse-small** (`phases/classify-and-analyse-small.ts`) — - walks `source.scan({ skipDecider })`; small files → LLM file-analysis → - write `CondensedFileAnalysis` Oversized files → write a stub. Big-by-tokens - files → append to `bigFiles.json` for Phase 2. -2. **process-big-files** (`phases/process-big-files.ts`) — reads - `bigFiles.json`, calls `source.readFile(relativePath)` per entry, - dispatches `processBigFile` sequentially (chunk-level concurrency - inside). -3. 
**backfill-fields** (`backfill/fields.ts`) — top up `keywords`, - `sideEffects`, `configDependencies`, `dataFlowDirection` on condensed - entries that miss them. Idempotent. -4. **backfill-big-files** (`backfill/big-files.ts`) — re-condense entries - whose chunks exist but condensed JSON is stale or missing. -5. **summarise-folders** (`folder-summary.ts`) — group condensed entries by - `path.posix.dirname` (root = ""), one LLM call per folder, persist to - `folder-summaries/.json`. -6. **summarise-repo** (`repo-summary.ts`) — load folder summaries +1. **scan-and-classify** (`phases/scan-and-classify.ts`) — walks + `source.scan({ skipDecider, limiter })` once, tokenises each file, classifies + as `small` / `big` / `oversized`, and writes + `meta-output/scan-manifest.json` (canonical) plus the legacy + `bigFiles.json` (for the pull-path consumers). Scan internally uses a + **two-pass** strategy: walk + cache-only `decideStatic` first, then + parallel-deduplicated LLM resolution for unknown extensions/filenames + through the shared limiter, then drain. +2a. **analyse-small** (`phases/analyse-small.ts`) — reads the manifest's + `kind: "small"` entries, re-opens content, runs the LLM file-analyser + per file under the shared limiter, writes `CondensedFileAnalysis` JSON. + Also writes oversized stubs. +2b. **analyse-big-files** (`phases/process-big-files.ts` — + `analyseBigFiles`) — chunk-task queue across all big files. Every chunk + is an independent task on the shared limiter; per-file condense is + scheduled as soon as that file's last chunk lands (one in-place retry + on transient condense failures). Runs **concurrently with 2a**. +3. **backfill-fields** (`backfill/fields.ts`) — for each cached condensed + entry with missing extended fields (`keywords`, `sideEffects`, + `dataFlowDirection`, `sectionMap`, …) dispatches one LLM call through + the shared limiter to fill the gaps. Idempotent — no-op on a complete + entry. +4. 
**summarise-folders** (`folder-summary.ts`) — groups condensed entries + by direct parent folder. Small folders + (`≤ Config.FolderSummaryBatchMaxFiles`, default 15) are batched up to + `Config.FolderSummaryBatchSize` (default 10) per LLM call. Bigger + folders take the individual single-folder path. Both flows run through + the shared limiter. +5. **summarise-repo** (`repo-summary.ts`) — load folder summaries shallowest-first; one call if it fits `ContextWindowLimit`, batch + merge otherwise; persist `repo-summary.json` with the v2-flat envelope. -7. **store-flat-analysis** (`phases/store-flat-analysis.ts`) — ensure +6. **store-flat-analysis** (`phases/store-flat-analysis.ts`) — ensure flat-folder indexes, upsert `:Repo`, then every `:Folder`, then every `:File` with the extended analysis + Folder→File `CONTAINS` edge. @@ -38,50 +56,68 @@ The strategy emits progress through the `ProgressContext` port defined in (no-op, OSS default). - **Boundary events** are split between the runner and the strategy: - - `phaseChanged("clone")` and `phaseChanged("scan")` are emitted by - `pipeline/run.ts` (the runner) before `strategy.execute` is called, - so the SSE stream stays alive during the network/disk-bound prelude. - - `phaseChanged("file_analysis")` is emitted by `index.ts` before phase 1 - - `phaseChanged("folder_analysis")` before phase 5 - - `phaseChanged("indexing")` before phase 6 (which feeds phase 7) - - `completed()` after phase 7 returns - - `failed(message)` from a `try/catch` wrapping the whole `execute` -- **Intra-phase ticks** are emitted by each phase via per-phase reporters - created from `progressContext.reporter(...)`. 
Sub-phase labels: - phase 1 → no sub-phase (the main file-analysis loop) - phase 2 → `big_files_queue`; inner `processBigFile` adds - `big_file:` for chunk pulses - phase 3 → `backfill`; phase 4 → `backfill:big_files` - phase 5 → no sub-phase, fixed total = directly-grouped folder count - phase 7 → `folders` then `files`, both `growing` (drained from - on-disk async generators) -- **Total mode**: phase 1, phase 3, and any other streaming-iterator loop - use `total: { kind: "growing" }` (denominator grows as `source.scan` - yields). Phases 2 and 4, plus the big-file chunk pool, know their size - up front and use `total: { kind: "fixed", total: N }`. + - `phaseChanged("clone")` is emitted by `pipeline/run.ts` (the runner) + before `syncRepository`, so the SSE stream stays alive during the + network/disk-bound prelude. + - `phaseChanged("scan")` is emitted by `index.ts` before phase 1. + - `phaseChanged("file_analysis")` before the parallel 2a/2b block. + - `phaseChanged("folder_analysis")` before phase 4 (folder summaries). + - `phaseChanged("indexing")` before phase 5 (which feeds phase 6). + - `completed()` after phase 6 returns. + - `failed(message)` from a `try/catch` wrapping the whole `execute`. +- **Intra-phase ticks** are emitted via per-phase reporters created from + `progressContext.reporter(...)`. Sub-phase labels: + - phase 1 (scan) → no sub-phase, growing total (driven by `incrementSeen`). + - phase 2a (analyse-small) → `analyse_small`, fixed total = + `smallCount + oversizedCount`. + - phase 2b (analyse-big) → two reporters: `big_files_chunks` (fixed total + = sum of estimated chunks across all big files) and `big_files_condense` + (fixed total = `bigCount`). + - phase 3 → `backfill`, fixed total = `cache.size`. + - phase 4 → no sub-phase, fixed total = directly-grouped folder count. + - phase 6 → `folders` (fixed total = summarised plus synthesised folders) then `files` (fixed total = `cache.size`). 
+- **Pull-path-only sub-phases** (emitted by `pipeline/pull.ts` workflow, + not the main strategy): `big_files_queue` (legacy single-file driver), + `big_file:` (per-big-file chunk pulses inside the legacy + driver), `pull` (`analyse-changed.ts` selective file analysis). +- **Total mode**: scan is the only main-strategy phase that uses + `growing` mode. Everything else has fixed totals known up front from the + scan manifest, the file-analysis cache, or the folder grouping. - The cancellation path in `execute` lets `CancellationError` propagate past the orchestrator; `failed()` only fires for non-cancellation errors. ## Files -- `index.ts` — `createFlatFolderStrategy(deps)` orchestrates the 7 phases. +- `index.ts` — `createFlatFolderStrategy(deps)` orchestrates the phases. Accepts `{ fileAnalyzer, progressContextFactory? }`. Constructs one - `ProgressContext` per job and threads it into every phase that takes a - `progressContext?` field. + `ProgressContext` per job AND one shared `ConcurrencyLimiter` per job + (sized by `Config.LlmConcurrency`); threads both into every phase that + needs them. - `types.ts` — `AnalyzedFileEntry`, `FolderSummary`, `RepoSummary`, `RepoSummaryEnvelope`, `FlatFolderResult`. - `analyse-file.ts` — `analyseScannedFile(analyzer, file, llmCallContext?)` + `buildOversizedStub`. -- `analyse-changed.ts` — `analyseChangedFiles({knowledgeId, source, metaPaths, analyzer, diff, llmCallContext?, archiveSink?, progressContext?})`. Pull-time per-file dispatcher. Reads changed file content through `input.source` (a `SourceReader`) so it works with both the disk-backed reader (OSS default) and any HTTP-backed alternative supplied via the `pullFactory` hook. Mirrors `classifyAndAnalyseSmall`'s small-file path: filter → fetch → size cap → binary detect → line count → analyse → save + archive push. Does NOT invoke the skip-decision LLM gate. 
When `progressContext` is present it creates a fixed-total reporter (`subPhase: "pull"`, `total = dedupedPaths.length`) and increments per-path so the pull SSE stream stays live. +- `analyse-changed.ts` — `analyseChangedFiles({knowledgeId, source, metaPaths, analyzer, diff, llmCallContext?, archiveSink?, progressContext?})`. Pull-time per-file dispatcher. Reads changed file content through `input.source` (a `SourceReader`) so it works with both the disk-backed reader (OSS default) and any HTTP-backed alternative supplied via the `pullFactory` hook. Mirrors `analyseSmallFiles`'s per-file path: filter → fetch → size cap → binary detect → line count → analyse → save + archive push. Does NOT invoke the skip-decision LLM gate. When `progressContext` is present it creates a fixed-total reporter (`subPhase: "pull"`, `total = dedupedPaths.length`) and increments per-path so the pull SSE stream stays live. +- `file-analysis-cache.ts` — in-memory `Map` + loaded once between phase 2 and phase 3; shared read-only by phases 3, 4, + 6; mutated by phase 3 backfill via `cache.set(entry)` so downstream phases + see updated entries without re-reading disk. +- `scan-manifest.ts` — `ScanManifest` shape, `readScanManifest`, + `writeScanManifest`. The canonical handoff between phase 1 and phases 2a/2b. - `folder-path.ts` — `directFolderOf`, `affectedFolderPaths`. -- `folder-summary.ts` — group + summarise + persist + iterate folder summaries. +- `folder-summary.ts` — group + summarise (individual or batched) + persist + + iterate folder summaries; shared `dispatchFolderSummaries` used by both + the main strategy and the pull-path's selective folder phase. +- `folder-summary-selective.ts` — pull-time selective folder summary phase. - `repo-summary.ts` — single-shot or batched repo summary with envelope writer. -- `phases/classify-and-analyse-small.ts` — Phase 1. -- `phases/process-big-files.ts` — Phase 2. -- `phases/store-flat-analysis.ts` — Phase 7. -- `backfill/fields.ts` — Phase 3. 
-- `backfill/big-files.ts` — Phase 4. -- `big-file/` — chunker, analyzer, condenser, storage, cache for Phase 2 & 4. +- `phases/scan-and-classify.ts` — Phase 1. +- `phases/analyse-small.ts` — Phase 2a. +- `phases/process-big-files.ts` — Phase 2b (`analyseBigFiles`, chunk-task + queue) plus the legacy `processBigFilesQueue` driver used by the pull-path. +- `phases/store-flat-analysis.ts` — Phase 6. +- `backfill/fields.ts` — Phase 3 (parallel via shared limiter). +- `big-file/` — chunker, analyzer, condenser, storage, cache used by both + big-file drivers. - `prompts/` — LLM prompts shared across the phases. ## Invariants @@ -107,11 +143,11 @@ The strategy emits progress through the `ProgressContext` port defined in reads `context.llmCallContext` (an optional `AskLlmOptions` built by the runner from `GithubIndexPayload.{llmApiKey, llmProvider, llmModel}`) and forwards it into every phase that issues LLM calls: phase 1 via - `classifyAndAnalyseSmall`'s `llmCallContext`, phase 2 via - `processBigFilesQueue` (which threads it into **both** the chunk - analyzer and `condenseChunks`), phase 3 via `backfillMissingFields`, - phase 4 via `backfillBigFiles`, phase 5 via `runFolderSummaryPhase`, - phase 6 via `summariseRepo`. The phases pass the same option object - through to `askJsonLLM` so the per-call override reaches `@bb/llm` - unchanged. When `llmCallContext` is undefined the call falls back to - `Config.OpenrouterApiKey` + `Config.LlmProvider`. + `scanAndClassify` (forwarded into `source.scan({ llmCallContext })` for + the skip-decision LLM gate), phase 2a via `analyseSmallFiles`, phase 2b + via `analyseBigFiles` (which threads it into **both** the chunk analyzer + and `condenseChunks`), phase 3 via `backfillMissingFields`, phase 4 via + `runFolderSummaryPhase`, phase 5 via `summariseRepo`. The phases pass + the same option object through to `askJsonLLM` so the per-call override + reaches `@bb/llm` unchanged. 
When `llmCallContext` is undefined the call + falls back to `Config.OpenrouterApiKey` + `Config.LlmProvider`. diff --git a/packages/ingest-github/src/strategies/flat-folder/backfill/README.md b/packages/ingest-github/src/strategies/flat-folder/backfill/README.md index 34f744d..f580f19 100644 --- a/packages/ingest-github/src/strategies/flat-folder/backfill/README.md +++ b/packages/ingest-github/src/strategies/flat-folder/backfill/README.md @@ -14,24 +14,27 @@ being marked failed. ## Files -- `fields.ts` — Phase 3. `backfillMissingFields(metaPaths, cache, llmCallContext?, progressContext?)` +- `fields.ts` — Phase 3. `backfillMissingFields(metaPaths, cache, limiter, llmCallContext?, progressContext?)` iterates every condensed entry from the shared `FileAnalysisCache`, computes which extended-analysis fields are missing (`keywords`, `ontologyConcepts`, `businessEntities`, `systemCapabilities`, `sideEffects`, `configDependencies`, `dataFlowDirection`, `integrationSurface`, `contractsProvided`, `contractsConsumed`, - `sectionMap`), and asks one LLM call per file to fill only the - missing slots. The response is validated and normalised - (`pickStringArray`, `pickSections`) before being written back via - `saveCondensed` **and** mirrored into the cache via `cache.set(entry)` - so downstream phases (folder summary, graph store) see the updated - entry without re-reading disk. Entries with nothing missing are - skipped without an LLM call. Progress reporter is fixed-total sized - by `cache.size`. + `sectionMap`), and dispatches one LLM call per file **through the shared + `ConcurrencyLimiter`** to fill only the missing slots. Tasks run + concurrently up to `Config.LlmConcurrency`; the loop builds the task + array and awaits `Promise.all` at the end. 
The response is validated and + normalised (`pickStringArray`, `pickSections`) before being written back + via `saveCondensed` **and** mirrored into the cache via `cache.set(entry)` + so downstream phases (folder summary, graph store) see the updated entry + without re-reading disk. Entries with nothing missing are skipped + without an LLM call. Progress reporter is fixed-total sized by + `cache.size`. Logs `phase3 dispatching N backfill tasks` once all tasks are queued (before awaiting them) so the + caller can see how many tasks went through the limiter. ## Public interfaces -- `backfillMissingFields(metaPaths, cache, llmCallContext?, progressContext?): Promise<{ updated, failed }>` +- `backfillMissingFields(metaPaths, cache, limiter, llmCallContext?, progressContext?): Promise<{ updated, failed }>` Returns phase-summary counters consumed by `createFlatFolderStrategy` to roll up into the strategy result. @@ -54,6 +57,10 @@ mutation into `FileAnalysisCache`. responses leave unfilled slots for a future pass. - Cache and disk stay in lockstep — every `saveCondensed` is paired with a `cache.set(entry)` in the same code path. +- Concurrency is bounded by the shared `ConcurrencyLimiter` (today's + `Config.LlmConcurrency`). Counters (`updated`, `failed`, token totals) + are mutated from inside the concurrent tasks — safe under JS's + single-threaded event loop, no locking needed. 
## External dependencies diff --git a/packages/ingest-github/src/strategies/flat-folder/backfill/fields.ts b/packages/ingest-github/src/strategies/flat-folder/backfill/fields.ts index 7836520..9effedb 100644 --- a/packages/ingest-github/src/strategies/flat-folder/backfill/fields.ts +++ b/packages/ingest-github/src/strategies/flat-folder/backfill/fields.ts @@ -4,6 +4,7 @@ import { logger } from "@bb/logger"; import type { FileAnalysis, FileAnalysisSection } from "@bb/mongo"; import type { MetaPaths } from "#src/types/meta-paths.ts"; import type { ProgressContext } from "#src/progress/types.ts"; +import type { ConcurrencyLimiter } from "#src/pipeline/concurrency.ts"; import { saveCondensed } from "#src/strategies/flat-folder/big-file/storage.ts"; import type { FileAnalysisCache } from "#src/strategies/flat-folder/file-analysis-cache.ts"; import { BACKFILL_SYSTEM_PROMPT, buildBackfillUserPrompt } from "#src/strategies/flat-folder/prompts/backfill.ts"; @@ -45,11 +46,13 @@ interface NeededFlags { export async function backfillMissingFields( metaPaths: MetaPaths, cache: FileAnalysisCache, + limiter: ConcurrencyLimiter, llmCallContext?: AskLlmOptions, progressContext?: ProgressContext, ): Promise<{ updated: number; failed: number }> { let updated = 0; let failed = 0; + let dispatched = 0; const reporter = progressContext?.reporter({ phase: "file_analysis", subPhase: "backfill", @@ -57,6 +60,7 @@ export async function backfillMissingFields( }); await reporter?.start(); try { + const tasks: Promise[] = []; for (const entry of cache.values()) { const a = entry.analysis; const needed = computeNeeded(a); @@ -64,27 +68,35 @@ export async function backfillMissingFields( reporter?.increment(1, { fileName: entry.relativePath }); continue; } - const userPrompt = buildBackfillUserPrompt(entry.relativePath, entry.analysis); - try { - const response = await askJsonLLM(BACKFILL_SYSTEM_PROMPT, userPrompt, llmCallContext ?? 
{}); - const result = response.result; - if (result === null) { - reporter?.increment(1, { fileName: entry.relativePath }); - continue; - } - applyBackfill(a, result, needed); - await saveCondensed(metaPaths, entry); - cache.set(entry); - updated += 1; - } catch (cause: unknown) { - if (cause instanceof LlmConfigError || cause instanceof LlmError) { - throw cause; - } - failed += 1; - logger.warn(`phase3: backfill failed for ${entry.relativePath}: ${describe(cause)}`); - } - reporter?.increment(1, { fileName: entry.relativePath }); + dispatched += 1; + tasks.push( + limiter(async () => { + const userPrompt = buildBackfillUserPrompt(entry.relativePath, entry.analysis); + try { + const response = await askJsonLLM(BACKFILL_SYSTEM_PROMPT, userPrompt, llmCallContext ?? {}); + const result = response.result; + if (result === null) { + reporter?.increment(1, { fileName: entry.relativePath }); + return; + } + applyBackfill(a, result, needed); + await saveCondensed(metaPaths, entry); + cache.set(entry); + updated += 1; + } catch (cause: unknown) { + if (cause instanceof LlmConfigError || cause instanceof LlmError) { + throw cause; + } + failed += 1; + logger.warn(`phase3: backfill failed for ${entry.relativePath}: ${describe(cause)}`); + } finally { + reporter?.increment(1, { fileName: entry.relativePath }); + } + }), + ); } + logger.info(`phase3 dispatching ${dispatched} backfill tasks`); + await Promise.all(tasks); logger.info(`phase3 done: updated=${updated} failed=${failed}`); return { updated, failed }; } finally { diff --git a/packages/ingest-github/src/strategies/flat-folder/index.ts b/packages/ingest-github/src/strategies/flat-folder/index.ts index e70396f..5093568 100644 --- a/packages/ingest-github/src/strategies/flat-folder/index.ts +++ b/packages/ingest-github/src/strategies/flat-folder/index.ts @@ -44,6 +44,7 @@ export function createFlatFolderStrategy(deps: FlatFolderStrategyDeps): IngestSt knowledgeId, source, metaPaths, + limiter, progressContext, }; if 
(llmCallContext !== undefined) { @@ -93,7 +94,7 @@ export function createFlatFolderStrategy(deps: FlatFolderStrategyDeps): IngestSt logger.info(`flat-folder: phase3 (backfill missing fields) starting`); throwIfCancelled(knowledgeId); - await backfillMissingFields(metaPaths, fileAnalysisCache, llmCallContext, progressContext); + await backfillMissingFields(metaPaths, fileAnalysisCache, limiter, llmCallContext, progressContext); progressContext.phaseChanged("folder_analysis"); logger.info(`flat-folder: phase5 (folder summaries) starting`); diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/README.md b/packages/ingest-github/src/strategies/flat-folder/phases/README.md index 6301e38..64cfc96 100644 --- a/packages/ingest-github/src/strategies/flat-folder/phases/README.md +++ b/packages/ingest-github/src/strategies/flat-folder/phases/README.md @@ -51,14 +51,23 @@ progressContext?})` — legacy serial driver kept for the pull-path `analyseBigFiles(manifest, …)`. Reads `bigFiles.json`, dispatches `processBigFile` once per file in a `for` loop. - `store-flat-analysis.ts` — Phase 7. - `storeFlatAnalysis({scope, payload, branch, metaPaths})` ensures + `storeFlatAnalysis({scope, payload, branch, metaPaths, cache})` ensures `flat-folder` Neo4j indexes, upserts `:Repo` (from `repo-summary.json` - if present, empty payload otherwise), then iterates folder summaries - via `iterateFolderSummaries` to upsert `:Folder`, then iterates - condensed entries via `iterateCondensed` to upsert `:File`. Files whose - containing folder was not in the summaries set get a synthesised empty - `:Folder` so the `CONTAINS` edge always lands. `languageFromPath` - fills `language` when the analysis left it blank. + if present, empty payload otherwise), then **dispatches `:Folder` and + `:File` upserts in batches of `Config.Neo4jBatchSize` (default 50)** + via `upsertFolderNodesBatch` / `upsertFileNodesBatch` from `@bb/neo4j`. 
+ Each batch is one Neo4j write transaction containing the same 12 + Cyphers (1 MERGE + 1 folder-attach + 5 rel CLEARs + 5 rel ATTACHes via + UNWIND) that a single upsert used to issue — so a 1 000-file repo + collapses from ~12 000 round-trips to ~240. Files whose containing + folder was not in the summaries set get a synthesised empty `:Folder` + entry added to the folder batch list **up front** (before any batch + dispatches) so the `CONTAINS` edge always lands. + `languageFromPath` fills `language` when the analysis left it blank. + Both progress reporters (`folders`, `files`) open at phase entry with + their fixed totals so the indexing overall-progress aggregate sees + both denominators from the first tick — fixes the prior "leaps to 100 + then sits there" UX bug. ## Execution order diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/scan-and-classify.ts b/packages/ingest-github/src/strategies/flat-folder/phases/scan-and-classify.ts index 786c9b0..6dc92a7 100644 --- a/packages/ingest-github/src/strategies/flat-folder/phases/scan-and-classify.ts +++ b/packages/ingest-github/src/strategies/flat-folder/phases/scan-and-classify.ts @@ -7,6 +7,7 @@ import type { MetaPaths } from "#src/types/meta-paths.ts"; import type { BigFileEntry } from "#src/types/big-file.ts"; import type { SkipDecider, SourceReader } from "#src/types/pipeline.ts"; import type { ProgressContext } from "#src/progress/types.ts"; +import type { ConcurrencyLimiter } from "#src/pipeline/concurrency.ts"; import { throwIfCancelled } from "#src/pipeline/cancellation.ts"; import { makeSkipDecider } from "#src/pipeline/skip-decisions/index.ts"; import { classifyByTokens, writeBigFiles } from "#src/strategies/flat-folder/big-file/detector.ts"; @@ -24,6 +25,14 @@ export interface ScanAndClassifyInput { skipDecider?: SkipDecider; llmCallContext?: AskLlmOptions; progressContext?: ProgressContext; + /** + * Shared LLM-concurrency limiter. 
When supplied the underlying + * `scanRepository` runs its two-pass strategy: walk + cache-only decisions + * first, then parallel-deduplicated LLM resolution for unknown + * extensions/filenames under this limiter. Optional so the function + * still works standalone. + */ + limiter?: ConcurrencyLimiter; } export interface ScanAndClassifyResult { @@ -56,6 +65,9 @@ export async function scanAndClassify(input: ScanAndClassifyInput): Promise[0] = { skipDecider }; + if (input.limiter !== undefined) { + scanDeps.limiter = input.limiter; + } if (input.llmCallContext !== undefined) { scanDeps.llmCallContext = input.llmCallContext; } diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/store-flat-analysis.ts b/packages/ingest-github/src/strategies/flat-folder/phases/store-flat-analysis.ts index adeb0a6..7db4433 100644 --- a/packages/ingest-github/src/strategies/flat-folder/phases/store-flat-analysis.ts +++ b/packages/ingest-github/src/strategies/flat-folder/phases/store-flat-analysis.ts @@ -1,6 +1,16 @@ import { readFile } from "node:fs/promises"; +import { Config } from "@bb/types"; +import { getConfigValue } from "@bb/config"; import { logger } from "@bb/logger"; -import { ensureFlatFolderIndexes, upsertFileNode, upsertFolderNode, upsertRepoNode, type NodeScope } from "@bb/neo4j"; +import { + ensureFlatFolderIndexes, + upsertFileNodesBatch, + upsertFolderNodesBatch, + upsertRepoNode, + type NodeScope, + type UpsertFileNodeInput, + type UpsertFolderNodeInput, +} from "@bb/neo4j"; import type { GithubIndexPayload } from "@bb/types"; import type { MetaPaths } from "#src/types/meta-paths.ts"; import { throwIfCancelled } from "#src/pipeline/cancellation.ts"; @@ -30,10 +40,10 @@ export async function storeFlatAnalysis(input: StoreFlatAnalysisInput): Promise< throwIfCancelled(input.scope.knowledgeId); await ensureFlatFolderIndexes(); - let nodesWritten = 0; - let foldersWritten = 0; - let filesWritten = 0; + const batchSize = 
getConfigValue(Config.Neo4jBatchSize); + // 1. :Repo node — single upsert, not batched (one repo per knowledge). + let nodesWritten = 0; const repoSummary = await readRepoSummary(input.metaPaths); if (repoSummary !== null) { await upsertRepoNode({ @@ -50,7 +60,6 @@ export async function storeFlatAnalysis(input: StoreFlatAnalysisInput): Promise< keyPatterns: repoSummary.keyPatterns, }, }); - nodesWritten += 1; } else { logger.warn(`phase7: no repo summary on disk; writing :Repo with empty summary`); await upsertRepoNode({ @@ -59,60 +68,79 @@ export async function storeFlatAnalysis(input: StoreFlatAnalysisInput): Promise< branch: input.branch, summary: emptyRepoSummaryPayload(), }); - nodesWritten += 1; } + nodesWritten += 1; - const folderReporter = input.progressContext?.reporter({ - phase: "indexing", - subPhase: "folders", - total: { kind: "growing" }, - }); - await folderReporter?.start(); + // 2. Collect every folder we'll upsert: the on-disk folder summaries plus + // synthesised parents for any file whose folder didn't get a summary. Doing + // this up front gives both reporters real fixed totals so `overallProgress` + // doesn't leap to 100 the moment the folder loop completes (the previous + // UX bug where the file sub-phase registered too late to dilute the + // indexing aggregate). 
+ const folderInputs: UpsertFolderNodeInput[] = []; const folderPaths = new Set(); - try { - for await (const folder of iterateFolderSummaries(input.metaPaths)) { - throwIfCancelled(input.scope.knowledgeId); - folderReporter?.incrementSeen(); - await upsertFolderNode({ + for await (const folder of iterateFolderSummaries(input.metaPaths)) { + folderInputs.push({ + scope: input.scope, + folderPath: folder.folderPath, + summary: shapeFolderPayload(folder), + }); + folderPaths.add(folder.folderPath); + } + for (const file of input.cache.values()) { + const folderPath = directFolderOf(file.relativePath); + if (!folderPaths.has(folderPath)) { + folderInputs.push({ scope: input.scope, - folderPath: folder.folderPath, - summary: shapeFolderPayload(folder), + folderPath, + summary: emptyFolderPayload(), }); - folderPaths.add(folder.folderPath); - foldersWritten += 1; - nodesWritten += 1; - folderReporter?.increment(1, { fileName: folder.folderPath || "" }); + folderPaths.add(folderPath); } - } finally { - folderReporter?.stop(); } + // 3. Both reporters open at phase entry with their true totals so the + // overall-progress aggregate sees both denominators from the first tick. + const folderReporter = input.progressContext?.reporter({ + phase: "indexing", + subPhase: "folders", + total: { kind: "fixed", total: folderInputs.length }, + }); const fileReporter = input.progressContext?.reporter({ phase: "indexing", subPhase: "files", total: { kind: "fixed", total: input.cache.size }, }); + await folderReporter?.start(); await fileReporter?.start(); + + let foldersWritten = 0; + let filesWritten = 0; try { - for (const file of input.cache.values()) { + // 4. Batched folder upserts. 
+ logger.info( + `phase7: folder upsert dispatching ${Math.ceil(folderInputs.length / batchSize)} batches of up to ${batchSize} folders (total=${folderInputs.length})`, + ); + for (let i = 0; i < folderInputs.length; i += batchSize) { throwIfCancelled(input.scope.knowledgeId); - const folderPath = directFolderOf(file.relativePath); - if (!folderPaths.has(folderPath)) { - await upsertFolderNode({ - scope: input.scope, - folderPath, - summary: emptyFolderPayload(), - }); - folderPaths.add(folderPath); - foldersWritten += 1; - nodesWritten += 1; + const batch = folderInputs.slice(i, i + batchSize); + await upsertFolderNodesBatch(batch); + foldersWritten += batch.length; + nodesWritten += batch.length; + for (const item of batch) { + folderReporter?.increment(1, { fileName: item.folderPath || "" }); } - await upsertFileNode({ + } + + // 5. Batched file upserts. + const fileInputs: UpsertFileNodeInput[] = []; + for (const file of input.cache.values()) { + fileInputs.push({ orgId: input.scope.orgId, knowledgeId: input.scope.knowledgeId, repoId: input.scope.repoId, relativePath: file.relativePath, - folderPath, + folderPath: directFolderOf(file.relativePath), language: file.language.length > 0 ? 
file.language : languageFromPath(file.relativePath), sha: file.sha256, sizeBytes: file.sizeBytes, @@ -121,11 +149,22 @@ export async function storeFlatAnalysis(input: StoreFlatAnalysisInput): Promise< totalChunks: file.totalChunks, totalTokenCount: file.totalTokenCount, }); - filesWritten += 1; - nodesWritten += 1; - fileReporter?.increment(1, { fileName: file.relativePath }); + } + logger.info( + `phase7: file upsert dispatching ${Math.ceil(fileInputs.length / batchSize)} batches of up to ${batchSize} files (total=${fileInputs.length})`, + ); + for (let i = 0; i < fileInputs.length; i += batchSize) { + throwIfCancelled(input.scope.knowledgeId); + const batch = fileInputs.slice(i, i + batchSize); + await upsertFileNodesBatch(batch); + filesWritten += batch.length; + nodesWritten += batch.length; + for (const item of batch) { + fileReporter?.increment(1, { fileName: item.relativePath }); + } } } finally { + folderReporter?.stop(); fileReporter?.stop(); } diff --git a/packages/ingest-github/src/types/README.md b/packages/ingest-github/src/types/README.md index 87b2cea..1fd8479 100644 --- a/packages/ingest-github/src/types/README.md +++ b/packages/ingest-github/src/types/README.md @@ -19,9 +19,21 @@ llmCallContext? }`; `llmCallContext` is the optional `AskLlmOptions` - `pipeline.ts` — `ScannedFile`, `OversizedFile`, `ScanEntry`, `FileAnalyzer` port, `AnalyzedFileResult`, `PipelineDeps`, `PipelineSummary`, `SkipDecider` / `SkipDeciderInput` / `SkipDecision` (the unknown-extension - gate port; implementation lives under `pipeline/skip-decisions/`), + gate port; implementation lives under `pipeline/skip-decisions/`). The + `SkipDecider` interface exposes four methods: `decide` (legacy async + single-shot), `decideStatic` (synchronous; returns the resolved decision + or `null` to signal "needs an LLM call"), `decideAndDeferSave` (async LLM + call that mutates the in-memory cache without flushing to disk), and + `persist` (one-shot cache flush). 
The two-pass scan in `scan.ts` uses the + latter three so unknown-extension probes fan out under the shared LLM + limiter and the disk cache is written exactly once at the end of the + batch. `SourceReader` / `ScanDeps` (the repository-read abstraction; default - implementation in `pipeline/disk-source-reader.ts`), `ArchiveSink` / + implementation in `pipeline/disk-source-reader.ts`). `ScanDeps.limiter` + is the optional shared `ConcurrencyLimiter`; when supplied together with + `skipDecider`, `scanRepository` switches to its two-pass strategy + instead of the legacy inline-await walk. + `ArchiveSink` / `ArchiveSinkInput` (an optional non-fatal sink that the open-source binary never calls), `SourceFactory` / `SourceFactoryInput` / `SourceFactoryResult` (the optional index-side injection hook surfaced diff --git a/packages/ingest-github/src/types/pipeline.ts b/packages/ingest-github/src/types/pipeline.ts index 9f5c0be..aaf13a5 100644 --- a/packages/ingest-github/src/types/pipeline.ts +++ b/packages/ingest-github/src/types/pipeline.ts @@ -1,6 +1,7 @@ import type { GithubIndexPayload, GithubPullPayload } from "@bb/types"; import type { AskLlmOptions } from "@bb/llm"; import type { FileAnalysis } from "@bb/mongo"; +import type { ConcurrencyLimiter } from "#src/pipeline/concurrency.ts"; import type { DiffResult } from "#src/pipeline/git-diff.ts"; export interface ScannedFile { @@ -59,6 +60,14 @@ export interface ScanDeps { * invokes the LLM branch. Absent in OSS standalone runs. */ llmCallContext?: AskLlmOptions; + /** + * Shared LLM-concurrency limiter. When set, `scanRepository` uses a + * two-pass strategy: walk + cache-only decisions in pass 1, parallel + * deduplicated LLM resolution under this limiter in pass 2, drain the + * pending list in pass 3 (all cache-hits). When absent (e.g. legacy + * `SourceFactory` consumers), scan falls back to inline-await per file. 
+ */ + limiter?: ConcurrencyLimiter; } export interface SourceReader { @@ -152,5 +161,31 @@ export interface SkipDeciderInput { } export interface SkipDecider { + /** + * Single-shot decision: applies static filters, consults the in-memory + * + on-disk caches, and falls through to the LLM when neither resolves + * the decision. Persists the cache to disk after each LLM call. + * Kept for non-scan callers and the legacy inline-await path. + */ decide(input: SkipDeciderInput): Promise; + /** + * Synchronous static-only decision. Returns the resolved `SkipDecision` + * when static filters or cache hit resolves it; returns `null` to signal + * "this needs an LLM call to resolve". Used by `scanRepository` in its + * two-pass mode to collect pending entries without blocking the walk. + */ + decideStatic(input: SkipDeciderInput): SkipDecision | null; + /** + * Asynchronous LLM-resolution path that **mutates the in-memory cache** + * but does NOT persist to disk. The caller (typically `scanRepository`) + * batches these under a `ConcurrencyLimiter` and then calls `persist()` + * exactly once at the end of the batch, so concurrent `saveCache` calls + * don't race on the tmp/rename atomicity. + */ + decideAndDeferSave(input: SkipDeciderInput): Promise; + /** + * Persist the in-memory decision cache to disk. Best-effort: swallows + * I/O errors. Called once at the end of a `decideAndDeferSave` batch. + */ + persist(): void; } diff --git a/packages/neo4j/README.md b/packages/neo4j/README.md index e363877..f597e1b 100644 --- a/packages/neo4j/README.md +++ b/packages/neo4j/README.md @@ -40,20 +40,25 @@ The package owns: function / import edges), and one to remove the `:Knowledge` node itself. Called by the server's `DELETE /api/v1/repos/:knowledgeId` route. 
-- File-node CRUD (`upsertFileNode`) — composes the per-file relationships - (`:HAS_KEYWORD / :HAS_CLASS / :HAS_FUNCTION / :HAS_IMPORT_INTERNAL / -:HAS_IMPORT_EXTERNAL`), clearing stale relationships before - re-attaching for re-runs. The two-`:HAS_IMPORT_*` split mirrors - kube-package's distinction between relative imports and external - packages — downstream MCP queries can ask "which files import this - internal module" vs "which files import this external package" - cleanly +- File-node CRUD (`upsertFileNode`, `upsertFileNodesBatch`) — composes + the per-file relationships (`:HAS_KEYWORD / :HAS_CLASS / :HAS_FUNCTION + / :HAS_IMPORT_INTERNAL / :HAS_IMPORT_EXTERNAL`), clearing stale + relationships before re-attaching for re-runs. The two-`:HAS_IMPORT_*` + split mirrors kube-package's distinction between relative imports and + external packages — downstream MCP queries can ask "which files + import this internal module" vs "which files import this external + package" cleanly. The `*Batch` variant lands an arbitrary number of + files in **one transaction** via Cypher `UNWIND` — same Cypher shape, + wrapped with an outer UNWIND so 50+ files cost the same 12 Cyphers a + single file used to cost. +- Folder-node CRUD (`upsertFolderNode`, `upsertFolderNodesBatch`) — + same shape as file CRUD; batched variant for bulk indexing. The package does **not** own: - Read queries — defer to a future `@bb/graph` once `@bb/mcp` retrieval has a use case -- Telemetry, retry, or transaction batching — driver defaults apply +- Telemetry — driver defaults apply. 
- Migration tooling — the `IF NOT EXISTS` constraint creates handle schema drift; richer migrations land later @@ -69,6 +74,9 @@ function upsertKnowledgeNode(doc: KnowledgeDoc): Promise; function setKnowledgeStateInGraph(knowledgeId: string, state: KnowledgeState): Promise; function deleteKnowledgeGraph(knowledgeId: string): Promise; function upsertFileNode(input: UpsertFileNodeInput): Promise; +function upsertFileNodesBatch(inputs: readonly UpsertFileNodeInput[]): Promise; +function upsertFolderNode(input: UpsertFolderNodeInput): Promise; +function upsertFolderNodesBatch(inputs: readonly UpsertFolderNodeInput[]): Promise; function runCypher(query: string, params?: Record): Promise; @@ -160,9 +168,12 @@ Neo4jPassword`). Repo-wide ESLint rule blocks `process.env`. "already exists" errors (Neo4j refuses constraints when a matching plain index exists). Operators must drop conflicting indexes manually if uniqueness guarantees matter. -6. **`upsertFileNode` clears stale relationships before re-attaching.** - Re-runs of the same `(knowledgeId, relativePath)` produce a clean - relationship set rather than accumulating outdated keywords/imports. +6. **`upsertFileNode` and `upsertFileNodesBatch` clear stale relationships + before re-attaching.** Re-runs of the same `(knowledgeId, relativePath)` + produce a clean relationship set rather than accumulating outdated + keywords/imports. In the batched variant the clear-then-attach happens + atomically inside one transaction per batch — partial failures roll + back, so re-runs always start from a consistent state. 7. **No raw `Driver` leaks.** `_getDriver()` is not in `src/index.ts`. Higher tiers go through the typed helpers. @@ -174,7 +185,6 @@ Neo4jPassword`). Repo-wide ESLint rule blocks `process.env`. 
## What is intentionally out of scope (v0) - Read queries (defer to `@bb/graph`) -- Cypher transactions / batch writes (single-statement per call) - Schema migrations / drops / renames (only `IF NOT EXISTS` creates) - Multi-database support (we use the default `neo4j` db) - Pub/sub / change-data-capture diff --git a/packages/neo4j/src/client.ts b/packages/neo4j/src/client.ts index 56207d2..dac5fbb 100644 --- a/packages/neo4j/src/client.ts +++ b/packages/neo4j/src/client.ts @@ -81,6 +81,35 @@ export async function _runCypher(query: string, params: Record; +} + +/** + * Run multiple Cypher statements inside one write transaction. All-or-nothing: + * either every statement commits or none do. Used by the batched upsert APIs + * so a 50-file batch lands as one transaction instead of 12 × 50 sessions. + * + * Uses the driver's `executeWrite` which retries automatically on transient + * errors (deadlock, leader switch) up to a few attempts. + */ +export async function _runInTransaction(steps: readonly CypherStep[]): Promise { + if (steps.length === 0) { + return; + } + const session: Session = _getDriver().session(); + try { + await session.executeWrite(async (tx) => { + for (const step of steps) { + await tx.run(step.query, step.params); + } + }); + } finally { + await session.close(); + } +} + export function toNeo4jInt(value: number): Integer { return int(value); } diff --git a/packages/neo4j/src/files.ts b/packages/neo4j/src/files.ts index eaf4182..01695ea 100644 --- a/packages/neo4j/src/files.ts +++ b/packages/neo4j/src/files.ts @@ -1,5 +1,5 @@ import type { FileAnalysis } from "@bb/mongo"; -import { _runCypher } from "./client.ts"; +import { _runCypher, _runInTransaction, type CypherStep } from "./client.ts"; const UPSERT_FILE = ` MERGE (f:File {knowledgeId: $knowledgeId, relativePath: $relativePath}) @@ -133,6 +133,225 @@ export async function deleteFileNodes(knowledgeId: string, relativePaths: string await _runCypher(DELETE_FILES, { knowledgeId, relativePaths }); 
} +// ───────────────────────────────────────────────────────────────────────────── +// Batched upsert — used by the flat-folder indexing phase to land 50+ files in +// one transaction instead of 12 round-trips per file. Same Cypher shape as the +// single-shot path above; just wrapped with an outer UNWIND so one query +// services every file in the batch. The five rel types (HAS_KEYWORD / +// HAS_CLASS / HAS_FUNCTION / HAS_IMPORT_INTERNAL / HAS_IMPORT_EXTERNAL) each +// take two Cyphers: a batched DELETE that clears existing rels for every file +// in the batch by relativePath, then a batched UNWIND that attaches the new +// rels from flattened `(knowledgeId, relativePath, name)` triples. +// ───────────────────────────────────────────────────────────────────────────── + +const BATCH_UPSERT_FILES = ` +UNWIND $files AS f +MERGE (file:File {knowledgeId: f.knowledgeId, relativePath: f.relativePath}) +SET file.orgId = f.orgId, + file.repoId = f.repoId, + file.language = f.language, + file.sha = f.sha, + file.sizeBytes = f.sizeBytes, + file.purpose = f.purpose, + file.summary = f.summary, + file.businessContext = f.businessContext, + file.dataFlowDirection = f.dataFlowDirection, + file.ontologyConcepts = f.ontologyConcepts, + file.businessEntities = f.businessEntities, + file.systemCapabilities = f.systemCapabilities, + file.sideEffects = f.sideEffects, + file.configDependencies = f.configDependencies, + file.integrationSurface = f.integrationSurface, + file.contractsProvided = f.contractsProvided, + file.contractsConsumed = f.contractsConsumed, + file.sectionNames = f.sectionNames, + file.sectionDescriptions = f.sectionDescriptions, + file.isBigFile = f.isBigFile, + file.totalChunks = f.totalChunks, + file.totalTokenCount = f.totalTokenCount, + file.updatedAt = $updatedAt +WITH file, f +MATCH (k:Knowledge {knowledgeId: f.knowledgeId}) +MERGE (k)-[:HAS_FILE]->(file) +`; + +const BATCH_ATTACH_FILES_TO_FOLDERS = ` +UNWIND $pairs AS pair +MATCH (file:File {knowledgeId: 
pair.knowledgeId, relativePath: pair.relativePath}) +MATCH (folder:Folder {knowledgeId: pair.knowledgeId, folderPath: pair.folderPath}) +MERGE (folder)-[:CONTAINS]->(file) +`; + +const BATCH_CLEAR_RELS_BY_TYPE: Readonly> = { + HAS_KEYWORD: ` +UNWIND $files AS f +MATCH (file:File {knowledgeId: f.knowledgeId, relativePath: f.relativePath})-[r:HAS_KEYWORD]->() +DELETE r +`, + HAS_CLASS: ` +UNWIND $files AS f +MATCH (file:File {knowledgeId: f.knowledgeId, relativePath: f.relativePath})-[r:HAS_CLASS]->() +DELETE r +`, + HAS_FUNCTION: ` +UNWIND $files AS f +MATCH (file:File {knowledgeId: f.knowledgeId, relativePath: f.relativePath})-[r:HAS_FUNCTION]->() +DELETE r +`, + HAS_IMPORT_INTERNAL: ` +UNWIND $files AS f +MATCH (file:File {knowledgeId: f.knowledgeId, relativePath: f.relativePath})-[r:HAS_IMPORT_INTERNAL]->() +DELETE r +`, + HAS_IMPORT_EXTERNAL: ` +UNWIND $files AS f +MATCH (file:File {knowledgeId: f.knowledgeId, relativePath: f.relativePath})-[r:HAS_IMPORT_EXTERNAL]->() +DELETE r +`, +}; + +const BATCH_ATTACH_KEYWORDS = ` +UNWIND $pairs AS p +MATCH (file:File {knowledgeId: p.knowledgeId, relativePath: p.relativePath}) +MERGE (kw:Keyword {name: p.name}) +MERGE (file)-[:HAS_KEYWORD]->(kw) +`; + +const BATCH_ATTACH_CLASSES = ` +UNWIND $pairs AS p +MATCH (file:File {knowledgeId: p.knowledgeId, relativePath: p.relativePath}) +MERGE (c:Class {signature: p.signature}) +MERGE (file)-[:HAS_CLASS]->(c) +`; + +const BATCH_ATTACH_FUNCTIONS = ` +UNWIND $pairs AS p +MATCH (file:File {knowledgeId: p.knowledgeId, relativePath: p.relativePath}) +MERGE (fn:Function {signature: p.signature}) +MERGE (file)-[:HAS_FUNCTION]->(fn) +`; + +const BATCH_ATTACH_IMPORTS_INTERNAL = ` +UNWIND $pairs AS p +MATCH (file:File {knowledgeId: p.knowledgeId, relativePath: p.relativePath}) +MERGE (m:Module {name: p.name}) +MERGE (file)-[:HAS_IMPORT_INTERNAL]->(m) +`; + +const BATCH_ATTACH_IMPORTS_EXTERNAL = ` +UNWIND $pairs AS p +MATCH (file:File {knowledgeId: p.knowledgeId, relativePath: 
p.relativePath}) +MERGE (m:Module {name: p.name}) +MERGE (file)-[:HAS_IMPORT_EXTERNAL]->(m) +`; + +type RelType = "HAS_KEYWORD" | "HAS_CLASS" | "HAS_FUNCTION" | "HAS_IMPORT_INTERNAL" | "HAS_IMPORT_EXTERNAL"; + +interface FileRow { + knowledgeId: string; + relativePath: string; +} + +export async function upsertFileNodesBatch(inputs: readonly UpsertFileNodeInput[]): Promise { + if (inputs.length === 0) { + return; + } + const updatedAt = new Date().toISOString(); + const files = inputs.map((input) => fileRowFor(input)); + const fileKeys: FileRow[] = inputs.map((input) => ({ knowledgeId: input.knowledgeId, relativePath: input.relativePath })); + const folderPairs = inputs + .filter((input): input is UpsertFileNodeInput & { folderPath: string } => input.folderPath !== undefined) + .map((input) => ({ + knowledgeId: input.knowledgeId, + relativePath: input.relativePath, + folderPath: input.folderPath, + })); + + const keywordPairs = flattenPairs(inputs, "keywords", "name", (v) => v.toLowerCase()); + const classPairs = flattenPairs(inputs, "classes", "signature"); + const functionPairs = flattenPairs(inputs, "functions", "signature"); + const importsInternalPairs = flattenPairs(inputs, "importsInternal", "name"); + const importsExternalPairs = flattenPairs(inputs, "importsExternal", "name"); + + const steps: CypherStep[] = [ + { query: BATCH_UPSERT_FILES, params: { files, updatedAt } }, + ]; + if (folderPairs.length > 0) { + steps.push({ query: BATCH_ATTACH_FILES_TO_FOLDERS, params: { pairs: folderPairs } }); + } + // Clear existing rels of every type for every file in the batch. 
+ for (const relType of ["HAS_KEYWORD", "HAS_CLASS", "HAS_FUNCTION", "HAS_IMPORT_INTERNAL", "HAS_IMPORT_EXTERNAL"] as const) { + steps.push({ query: BATCH_CLEAR_RELS_BY_TYPE[relType], params: { files: fileKeys } }); + } + if (keywordPairs.length > 0) { + steps.push({ query: BATCH_ATTACH_KEYWORDS, params: { pairs: keywordPairs } }); + } + if (classPairs.length > 0) { + steps.push({ query: BATCH_ATTACH_CLASSES, params: { pairs: classPairs } }); + } + if (functionPairs.length > 0) { + steps.push({ query: BATCH_ATTACH_FUNCTIONS, params: { pairs: functionPairs } }); + } + if (importsInternalPairs.length > 0) { + steps.push({ query: BATCH_ATTACH_IMPORTS_INTERNAL, params: { pairs: importsInternalPairs } }); + } + if (importsExternalPairs.length > 0) { + steps.push({ query: BATCH_ATTACH_IMPORTS_EXTERNAL, params: { pairs: importsExternalPairs } }); + } + + await _runInTransaction(steps); +} + +function fileRowFor(input: UpsertFileNodeInput): Record { + const sectionMap = input.analysis.sectionMap ?? []; + return { + knowledgeId: input.knowledgeId, + relativePath: input.relativePath, + orgId: input.orgId ?? "local", + repoId: input.repoId ?? input.knowledgeId, + language: input.language, + sha: input.sha, + sizeBytes: input.sizeBytes, + purpose: input.analysis.purpose, + summary: input.analysis.summary, + businessContext: input.analysis.businessContext, + dataFlowDirection: input.analysis.dataFlowDirection ?? "", + ontologyConcepts: input.analysis.ontologyConcepts ?? [], + businessEntities: input.analysis.businessEntities ?? [], + systemCapabilities: input.analysis.systemCapabilities ?? [], + sideEffects: input.analysis.sideEffects ?? [], + configDependencies: input.analysis.configDependencies ?? [], + integrationSurface: input.analysis.integrationSurface ?? [], + contractsProvided: input.analysis.contractsProvided ?? [], + contractsConsumed: input.analysis.contractsConsumed ?? 
[], + sectionNames: sectionMap.map((s) => s.name), + sectionDescriptions: sectionMap.map((s) => s.description), + isBigFile: input.isBigFile ?? false, + totalChunks: input.totalChunks ?? 0, + totalTokenCount: input.totalTokenCount ?? 0, + }; +} + +function flattenPairs( + inputs: readonly UpsertFileNodeInput[], + field: "keywords" | "classes" | "functions" | "importsInternal" | "importsExternal", + valueKey: "name" | "signature", + normalize?: (v: string) => string, +): Array> { + const out: Array> = []; + for (const input of inputs) { + const values = input.analysis[field]; + if (!Array.isArray(values)) { + continue; + } + for (const raw of values) { + const value = normalize !== undefined ? normalize(raw) : raw; + out.push({ knowledgeId: input.knowledgeId, relativePath: input.relativePath, [valueKey]: value }); + } + } + return out; +} + export async function upsertFileNode(input: UpsertFileNodeInput): Promise { const params = { knowledgeId: input.knowledgeId, relativePath: input.relativePath }; const sectionMap = input.analysis.sectionMap ?? []; diff --git a/packages/neo4j/src/folder.ts b/packages/neo4j/src/folder.ts index e862c3e..f4c8ad8 100644 --- a/packages/neo4j/src/folder.ts +++ b/packages/neo4j/src/folder.ts @@ -1,4 +1,4 @@ -import { _runCypher } from "./client.ts"; +import { _runCypher, _runInTransaction, type CypherStep } from "./client.ts"; import type { NodeScope } from "./repo.ts"; export interface FolderSummaryPayload { @@ -41,6 +41,80 @@ MERGE (kw:Keyword {name: name}) MERGE (folder)-[:HAS_KEYWORD]->(kw) `; +// ───────────────────────────────────────────────────────────────────────────── +// Batched folder upsert. Same Cypher shape as the single-shot path; wrapped +// with an outer UNWIND so one transaction lands every folder in the batch. 
+// ───────────────────────────────────────────────────────────────────────────── + +const BATCH_UPSERT_FOLDERS = ` +UNWIND $folders AS fld +MERGE (folder:Folder {orgId: fld.orgId, knowledgeId: fld.knowledgeId, repoId: fld.repoId, folderPath: fld.folderPath}) +SET folder.purpose = fld.purpose, + folder.summary = fld.summary, + folder.dependencyGraph = fld.dependencyGraph, + folder.updatedAt = $updatedAt +WITH folder, fld +MATCH (r:Repo {orgId: fld.orgId, knowledgeId: fld.knowledgeId, repoId: fld.repoId}) +MERGE (r)-[:CONTAINS]->(folder) +`; + +const BATCH_CLEAR_FOLDER_KEYWORDS = ` +UNWIND $folders AS fld +MATCH (folder:Folder {orgId: fld.orgId, knowledgeId: fld.knowledgeId, repoId: fld.repoId, folderPath: fld.folderPath})-[rel:HAS_KEYWORD]->() +DELETE rel +`; + +const BATCH_ATTACH_FOLDER_KEYWORDS = ` +UNWIND $pairs AS p +MATCH (folder:Folder {orgId: p.orgId, knowledgeId: p.knowledgeId, repoId: p.repoId, folderPath: p.folderPath}) +MERGE (kw:Keyword {name: p.name}) +MERGE (folder)-[:HAS_KEYWORD]->(kw) +`; + +export async function upsertFolderNodesBatch(inputs: readonly UpsertFolderNodeInput[]): Promise { + if (inputs.length === 0) { + return; + } + const updatedAt = new Date().toISOString(); + const folders = inputs.map((input) => ({ + orgId: input.scope.orgId, + knowledgeId: input.scope.knowledgeId, + repoId: input.scope.repoId, + folderPath: input.folderPath, + purpose: input.summary.purpose, + summary: input.summary.summary, + dependencyGraph: input.summary.dependencyGraph, + })); + const folderKeys = inputs.map((input) => ({ + orgId: input.scope.orgId, + knowledgeId: input.scope.knowledgeId, + repoId: input.scope.repoId, + folderPath: input.folderPath, + })); + const keywordPairs: Array> = []; + for (const input of inputs) { + for (const raw of input.summary.keywords) { + keywordPairs.push({ + orgId: input.scope.orgId, + knowledgeId: input.scope.knowledgeId, + repoId: input.scope.repoId, + folderPath: input.folderPath, + name: raw.toLowerCase(), + }); + } + } + 
+ const steps: CypherStep[] = [ + { query: BATCH_UPSERT_FOLDERS, params: { folders, updatedAt } }, + { query: BATCH_CLEAR_FOLDER_KEYWORDS, params: { folders: folderKeys } }, + ]; + if (keywordPairs.length > 0) { + steps.push({ query: BATCH_ATTACH_FOLDER_KEYWORDS, params: { pairs: keywordPairs } }); + } + + await _runInTransaction(steps); +} + export async function upsertFolderNode(input: UpsertFolderNodeInput): Promise { const scope = input.scope; const params = { diff --git a/packages/neo4j/src/index.ts b/packages/neo4j/src/index.ts index 03b51c0..c581c80 100644 --- a/packages/neo4j/src/index.ts +++ b/packages/neo4j/src/index.ts @@ -12,13 +12,13 @@ export { deleteKnowledgeGraph, } from "./knowledge.ts"; -export { upsertFileNode, deleteFileNodes } from "./files.ts"; +export { upsertFileNode, upsertFileNodesBatch, deleteFileNodes } from "./files.ts"; export type { UpsertFileNodeInput } from "./files.ts"; export { upsertRepoNode } from "./repo.ts"; export type { NodeScope, RepoSummaryPayload, UpsertRepoNodeInput } from "./repo.ts"; -export { upsertFolderNode } from "./folder.ts"; +export { upsertFolderNode, upsertFolderNodesBatch } from "./folder.ts"; export type { FolderSummaryPayload, UpsertFolderNodeInput } from "./folder.ts"; export { snapshotFilesToVersion } from "./fileVersions.ts"; diff --git a/packages/types/src/config.ts b/packages/types/src/config.ts index 1e72f67..c878718 100644 --- a/packages/types/src/config.ts +++ b/packages/types/src/config.ts @@ -26,6 +26,7 @@ export enum Config { LlmConcurrency = "llm.concurrency", FolderSummaryBatchSize = "folder.summary.batch.size", FolderSummaryBatchMaxFiles = "folder.summary.batch.max.files", + Neo4jBatchSize = "neo4j.batch.size", CondenseContextLimit = "condense.context.limit", CondensePromptOverhead = "condense.prompt.overhead", SmallFileDedupThreshold = "small.file.dedup.threshold", From e45277dba6888422915ad378f4d86c3dbc12fd60 Mon Sep 17 00:00:00 2001 From: Dead-Bytes 
<143434285+Dead-Bytes@users.noreply.github.com> Date: Fri, 22 May 2026 18:11:30 +0530 Subject: [PATCH 08/11] chore(format): clean up README formatting and improve code readability across multiple files --- packages/ingest-github/README.md | 10 +++++----- packages/ingest-github/src/pipeline/README.md | 1 + packages/ingest-github/src/pipeline/scan.ts | 4 +--- .../src/pipeline/skip-decisions/README.md | 8 ++++---- .../src/strategies/flat-folder/README.md | 16 ++++++++-------- .../flat-folder/big-file/condenser.ts | Bin 10066 -> 10091 bytes .../flat-folder/folder-summary-selective.ts | 9 ++------- .../strategies/flat-folder/folder-summary.ts | 8 ++++++-- .../flat-folder/phases/process-big-files.ts | 7 ++++++- packages/llm/README.md | 4 ++-- packages/mongo/src/aggregateStats.ts | 8 +------- packages/neo4j/README.md | 2 +- packages/neo4j/src/files.ts | 17 ++++++++++++----- 13 files changed, 49 insertions(+), 45 deletions(-) diff --git a/packages/ingest-github/README.md b/packages/ingest-github/README.md index 9d28387..93d7786 100644 --- a/packages/ingest-github/README.md +++ b/packages/ingest-github/README.md @@ -166,17 +166,17 @@ worker hardcodes a single `IngestionStrategy` instance (currently per folder. Bigger folders take the individual single-folder path. Roll back to one LLM call per folder via `bytebell set folder.summary.batch.size 1`. -2. **Clone idempotent.** Re-runs (BullMQ retries) call `git fetch` + +3. **Clone idempotent.** Re-runs (BullMQ retries) call `git fetch` + `git reset --hard` in the existing dir rather than re-cloning. Tokens are re-injected into the remote URL each time. -3. **Token redaction.** `GitCloneError` carries the **redacted** repo +4. **Token redaction.** `GitCloneError` carries the **redacted** repo URL (`https://user:***@host`) — the raw `gitToken` never appears in error messages or logs. -4. **State transition order.** `Processing` is set _before_ any clone +5. **State transition order.** `Processing` is set _before_ any clone work. 
`Processed` is set _only_ after the entire scan + analyze loop completes. On any thrown error, the handler best-effort sets `Failed` then re-throws so BullMQ records the retry. -5. **Fail-soft analysis, fail-hard infra.** A single file's LLM call +6. **Fail-soft analysis, fail-hard infra.** A single file's LLM call failing falls back to an empty-analysis Raw doc and processing continues. In the big-file path, a single chunk failure contributes an empty analysis to the merge but does not stop the file; a @@ -184,7 +184,7 @@ worker hardcodes a single `IngestionStrategy` instance (currently `dedupAnalyses` so the merged result is always well-formed. A clone failure or Mongo write failure throws and propagates to BullMQ for retry under the queue's `attempts: 3`. -6. **Hardcoded filters only.** No LLM-based ignore decisions in v0. The +7. **Hardcoded filters only.** No LLM-based ignore decisions in v0. The directory / file / extension blocklists in `scan.ts` are the only way files get skipped. diff --git a/packages/ingest-github/src/pipeline/README.md b/packages/ingest-github/src/pipeline/README.md index ae9da32..7c7b0d6 100644 --- a/packages/ingest-github/src/pipeline/README.md +++ b/packages/ingest-github/src/pipeline/README.md @@ -60,6 +60,7 @@ deps.skipDecider.decide(input)` per file. Same semantics as before this `readScannedFile` re-reads a file by absolute path for the big-file phase which streams content lazily. + - `run.ts` — `createPipelineRunner({ reposRootDir, strategy, sourceFactory?, progressContextFactory? })` builds an `IngestRunnerDeps`. GitHub payloads run: branch resolve, source-reader construction, strategy execute, commit persistence. 
Local diff --git a/packages/ingest-github/src/pipeline/scan.ts b/packages/ingest-github/src/pipeline/scan.ts index fda9236..d7d9db6 100644 --- a/packages/ingest-github/src/pipeline/scan.ts +++ b/packages/ingest-github/src/pipeline/scan.ts @@ -167,9 +167,7 @@ async function* twoPassScan( } } logger.info(`scan: resolving ${unique.size} unique skip-decision keys for ${pending.length} pending files`); - await Promise.all( - Array.from(unique.values()).map((input) => limiter(() => decider.decideAndDeferSave(input))), - ); + await Promise.all(Array.from(unique.values()).map((input) => limiter(() => decider.decideAndDeferSave(input)))); decider.persist(); } diff --git a/packages/ingest-github/src/pipeline/skip-decisions/README.md b/packages/ingest-github/src/pipeline/skip-decisions/README.md index 4a6fa2f..18d80bb 100644 --- a/packages/ingest-github/src/pipeline/skip-decisions/README.md +++ b/packages/ingest-github/src/pipeline/skip-decisions/README.md @@ -27,10 +27,10 @@ its batch. ```ts interface SkipDecider { - decide(input): Promise; // legacy single-shot path - decideStatic(input): SkipDecision | null; // sync; null = needs LLM - decideAndDeferSave(input): Promise; // LLM call, no disk save - persist(): void; // flush cache to disk once + decide(input): Promise; // legacy single-shot path + decideStatic(input): SkipDecision | null; // sync; null = needs LLM + decideAndDeferSave(input): Promise; // LLM call, no disk save + persist(): void; // flush cache to disk once } ``` diff --git a/packages/ingest-github/src/strategies/flat-folder/README.md b/packages/ingest-github/src/strategies/flat-folder/README.md index 5a725f4..78d8acf 100644 --- a/packages/ingest-github/src/strategies/flat-folder/README.md +++ b/packages/ingest-github/src/strategies/flat-folder/README.md @@ -21,30 +21,30 @@ this single pool. One knob bounds total in-flight LLM concurrency. 
**two-pass** strategy: walk + cache-only `decideStatic` first, then parallel-deduplicated LLM resolution for unknown extensions/filenames through the shared limiter, then drain. -2a. **analyse-small** (`phases/analyse-small.ts`) — reads the manifest's + 2a. **analyse-small** (`phases/analyse-small.ts`) — reads the manifest's `kind: "small"` entries, re-opens content, runs the LLM file-analyser per file under the shared limiter, writes `CondensedFileAnalysis` JSON. Also writes oversized stubs. -2b. **analyse-big-files** (`phases/process-big-files.ts` — + 2b. **analyse-big-files** (`phases/process-big-files.ts` — `analyseBigFiles`) — chunk-task queue across all big files. Every chunk is an independent task on the shared limiter; per-file condense is scheduled as soon as that file's last chunk lands (one in-place retry on transient condense failures). Runs **concurrently with 2a**. -3. **backfill-fields** (`backfill/fields.ts`) — for each cached condensed +2. **backfill-fields** (`backfill/fields.ts`) — for each cached condensed entry with missing extended fields (`keywords`, `sideEffects`, `dataFlowDirection`, `sectionMap`, …) dispatches one LLM call through the shared limiter to fill the gaps. Idempotent — no-op on a complete entry. -4. **summarise-folders** (`folder-summary.ts`) — groups condensed entries +3. **summarise-folders** (`folder-summary.ts`) — groups condensed entries by direct parent folder. Small folders (`≤ Config.FolderSummaryBatchMaxFiles`, default 15) are batched up to `Config.FolderSummaryBatchSize` (default 10) per LLM call. Bigger folders take the individual single-folder path. Both flows run through the shared limiter. -5. **summarise-repo** (`repo-summary.ts`) — load folder summaries +4. **summarise-repo** (`repo-summary.ts`) — load folder summaries shallowest-first; one call if it fits `ContextWindowLimit`, batch + merge otherwise; persist `repo-summary.json` with the v2-flat envelope. -6. 
**store-flat-analysis** (`phases/store-flat-analysis.ts`) — ensure +5. **store-flat-analysis** (`phases/store-flat-analysis.ts`) — ensure flat-folder indexes, upsert `:Repo`, then every `:Folder`, then every `:File` with the extended analysis + Folder→File `CONTAINS` edge. @@ -106,8 +106,8 @@ The strategy emits progress through the `ProgressContext` port defined in `writeScanManifest`. The canonical handoff between phase 1 and phases 2a/2b. - `folder-path.ts` — `directFolderOf`, `affectedFolderPaths`. - `folder-summary.ts` — group + summarise (individual or batched) + persist - + iterate folder summaries; shared `dispatchFolderSummaries` used by both - the main strategy and the pull-path's selective folder phase. + - iterate folder summaries; shared `dispatchFolderSummaries` used by both + the main strategy and the pull-path's selective folder phase. - `folder-summary-selective.ts` — pull-time selective folder summary phase. - `repo-summary.ts` — single-shot or batched repo summary with envelope writer. - `phases/scan-and-classify.ts` — Phase 1. 
diff --git a/packages/ingest-github/src/strategies/flat-folder/big-file/condenser.ts b/packages/ingest-github/src/strategies/flat-folder/big-file/condenser.ts index fdde9b835a295bd1137c8c84fd809dc1a180b219..a4663bd8edfe7468d01368a7aeb5affc0a9bdc0e 100644 GIT binary patch delta 94 zcmccQ_u6lR1YaPR0vI^^`?IXTYxc_pb8B?|WT3e~j`xy=E5Op*W+{~C$_ delta 50 zcmaFucgb&q1m9$NK5<2b(&E&jfTH}|f)X8toSa[] = []; for (const bucket of individual) { - tasks.push(limiter(() => dispatchIndividual(bucket, metaPaths, totals, llmCallContext, reporter, knowledgeId, phaseLabel))); + tasks.push( + limiter(() => dispatchIndividual(bucket, metaPaths, totals, llmCallContext, reporter, knowledgeId, phaseLabel)), + ); } for (const batch of batches) { - tasks.push(limiter(() => dispatchBatch(batch, metaPaths, totals, llmCallContext, reporter, knowledgeId, phaseLabel))); + tasks.push( + limiter(() => dispatchBatch(batch, metaPaths, totals, llmCallContext, reporter, knowledgeId, phaseLabel)), + ); } await Promise.all(tasks); return totals; diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts b/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts index 1577849..2f8b7ba 100644 --- a/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts +++ b/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts @@ -16,7 +16,12 @@ import { inspect } from "#src/strategies/flat-folder/big-file/cache.ts"; import { splitFileIntoChunks } from "#src/strategies/flat-folder/big-file/chunker.ts"; import { analyzeChunk } from "#src/strategies/flat-folder/big-file/chunk-analyzer.ts"; import { condenseChunks } from "#src/strategies/flat-folder/big-file/condenser.ts"; -import { loadChunkIfPresent, saveChunk, saveCondensed, saveManifest } from "#src/strategies/flat-folder/big-file/storage.ts"; +import { + loadChunkIfPresent, + saveChunk, + saveCondensed, + saveManifest, +} from 
"#src/strategies/flat-folder/big-file/storage.ts"; import { processBigFile } from "#src/strategies/flat-folder/big-file/index.ts"; import type { ScanManifest, ScanManifestEntry } from "#src/strategies/flat-folder/scan-manifest.ts"; diff --git a/packages/llm/README.md b/packages/llm/README.md index 2deb951..64e6cef 100644 --- a/packages/llm/README.md +++ b/packages/llm/README.md @@ -156,9 +156,9 @@ it. The cost ledger described in [docs/arch.md](../../docs/arch.md) is sees a single `AskLlmResult`. BullMQ's `attempts: 3` wraps the whole call — retries walk the chain again, useful when a transient OpenRouter outage clears between retries. -4a. **No upstream-provider fallback.** Every request carries + 4a. **No upstream-provider fallback.** Every request carries `provider: { allow_fallbacks: false }`. This is orthogonal to the - `models` chain in invariant 4 — `models` controls *which model* the + `models` chain in invariant 4 — `models` controls _which model_ the gateway tries; `allow_fallbacks` controls whether OpenRouter routes to a different upstream backend serving the same model when the first one stalls. We disable the latter so a slow provider cannot eat the diff --git a/packages/mongo/src/aggregateStats.ts b/packages/mongo/src/aggregateStats.ts index 0cfa6a8..95f7d59 100644 --- a/packages/mongo/src/aggregateStats.ts +++ b/packages/mongo/src/aggregateStats.ts @@ -1,10 +1,4 @@ -import type { - KnowledgeDoc, - StatsCommitEntry, - StatsRepoEntry, - StatsResponse, - StatsTotals, -} from "@bb/types"; +import type { KnowledgeDoc, StatsCommitEntry, StatsRepoEntry, StatsResponse, StatsTotals } from "@bb/types"; import { _getDb } from "./client.ts"; import { Collections } from "./collections.ts"; diff --git a/packages/neo4j/README.md b/packages/neo4j/README.md index f597e1b..ba441b2 100644 --- a/packages/neo4j/README.md +++ b/packages/neo4j/README.md @@ -42,7 +42,7 @@ The package owns: route. 
- File-node CRUD (`upsertFileNode`, `upsertFileNodesBatch`) — composes the per-file relationships (`:HAS_KEYWORD / :HAS_CLASS / :HAS_FUNCTION - / :HAS_IMPORT_INTERNAL / :HAS_IMPORT_EXTERNAL`), clearing stale +/ :HAS_IMPORT_INTERNAL / :HAS_IMPORT_EXTERNAL`), clearing stale relationships before re-attaching for re-runs. The two-`:HAS_IMPORT_*` split mirrors kube-package's distinction between relative imports and external packages — downstream MCP queries can ask "which files diff --git a/packages/neo4j/src/files.ts b/packages/neo4j/src/files.ts index 01695ea..7d049e3 100644 --- a/packages/neo4j/src/files.ts +++ b/packages/neo4j/src/files.ts @@ -258,7 +258,10 @@ export async function upsertFileNodesBatch(inputs: readonly UpsertFileNodeInput[ } const updatedAt = new Date().toISOString(); const files = inputs.map((input) => fileRowFor(input)); - const fileKeys: FileRow[] = inputs.map((input) => ({ knowledgeId: input.knowledgeId, relativePath: input.relativePath })); + const fileKeys: FileRow[] = inputs.map((input) => ({ + knowledgeId: input.knowledgeId, + relativePath: input.relativePath, + })); const folderPairs = inputs .filter((input): input is UpsertFileNodeInput & { folderPath: string } => input.folderPath !== undefined) .map((input) => ({ @@ -273,14 +276,18 @@ export async function upsertFileNodesBatch(inputs: readonly UpsertFileNodeInput[ const importsInternalPairs = flattenPairs(inputs, "importsInternal", "name"); const importsExternalPairs = flattenPairs(inputs, "importsExternal", "name"); - const steps: CypherStep[] = [ - { query: BATCH_UPSERT_FILES, params: { files, updatedAt } }, - ]; + const steps: CypherStep[] = [{ query: BATCH_UPSERT_FILES, params: { files, updatedAt } }]; if (folderPairs.length > 0) { steps.push({ query: BATCH_ATTACH_FILES_TO_FOLDERS, params: { pairs: folderPairs } }); } // Clear existing rels of every type for every file in the batch. 
- for (const relType of ["HAS_KEYWORD", "HAS_CLASS", "HAS_FUNCTION", "HAS_IMPORT_INTERNAL", "HAS_IMPORT_EXTERNAL"] as const) { + for (const relType of [ + "HAS_KEYWORD", + "HAS_CLASS", + "HAS_FUNCTION", + "HAS_IMPORT_INTERNAL", + "HAS_IMPORT_EXTERNAL", + ] as const) { steps.push({ query: BATCH_CLEAR_RELS_BY_TYPE[relType], params: { files: fileKeys } }); } if (keywordPairs.length > 0) { From f5cdaa3beba9b61d94355ec1ade0094425ad7a97 Mon Sep 17 00:00:00 2001 From: Dead-Bytes <143434285+Dead-Bytes@users.noreply.github.com> Date: Fri, 22 May 2026 18:13:54 +0530 Subject: [PATCH 09/11] chore: update bun.lock to reflect dependency changes --- bun.lock | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/bun.lock b/bun.lock index 92fb911..0042f56 100644 --- a/bun.lock +++ b/bun.lock @@ -1,6 +1,5 @@ { "lockfileVersion": 1, - "configVersion": 0, "workspaces": { "": { "name": "bytebell-public", @@ -56,6 +55,20 @@ "@bb/types": "workspace:*", }, }, + "packages/ingest-business-context": { + "name": "@bb/ingest-business-context", + "version": "0.0.0", + "dependencies": { + "@bb/config": "workspace:*", + "@bb/errors": "workspace:*", + "@bb/ingest-github": "workspace:*", + "@bb/llm": "workspace:*", + "@bb/logger": "workspace:*", + "@bb/neo4j": "workspace:*", + "@bb/queue": "workspace:*", + "@bb/types": "workspace:*", + }, + }, "packages/ingest-github": { "name": "@bb/ingest-github", "version": "0.0.0", @@ -191,6 +204,8 @@ "@bb/errors": ["@bb/errors@workspace:packages/errors"], + "@bb/ingest-business-context": ["@bb/ingest-business-context@workspace:packages/ingest-business-context"], + "@bb/ingest-github": ["@bb/ingest-github@workspace:packages/ingest-github"], "@bb/llm": ["@bb/llm@workspace:packages/llm"], From 0a58a29d18c0b52302fe5e9c9fd4a3fad457511e Mon Sep 17 00:00:00 2001 From: Dead-Bytes <143434285+Dead-Bytes@users.noreply.github.com> Date: Fri, 22 May 2026 18:23:16 +0530 Subject: [PATCH 10/11] refactor: rename import for analyseBigFiles and 
remove legacy processBigFiles code --- .../src/strategies/flat-folder/index.ts | 2 +- .../flat-folder/phases/analyse-big-files.ts | 287 ++++++++++++++++++ .../flat-folder/phases/process-big-files.ts | 286 +---------------- 3 files changed, 290 insertions(+), 285 deletions(-) create mode 100644 packages/ingest-github/src/strategies/flat-folder/phases/analyse-big-files.ts diff --git a/packages/ingest-github/src/strategies/flat-folder/index.ts b/packages/ingest-github/src/strategies/flat-folder/index.ts index 5093568..86797a6 100644 --- a/packages/ingest-github/src/strategies/flat-folder/index.ts +++ b/packages/ingest-github/src/strategies/flat-folder/index.ts @@ -8,7 +8,7 @@ import { classifyFailure } from "#src/pipeline/failure-classifier.ts"; import { withConcurrency } from "#src/pipeline/concurrency.ts"; import { scanAndClassify } from "./phases/scan-and-classify.ts"; import { analyseSmallFiles } from "./phases/analyse-small.ts"; -import { analyseBigFiles } from "./phases/process-big-files.ts"; +import { analyseBigFiles } from "./phases/analyse-big-files.ts"; import { backfillMissingFields } from "./backfill/fields.ts"; import { FileAnalysisCache } from "./file-analysis-cache.ts"; import { runFolderSummaryPhase } from "./folder-summary.ts"; diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/analyse-big-files.ts b/packages/ingest-github/src/strategies/flat-folder/phases/analyse-big-files.ts new file mode 100644 index 0000000..33f6446 --- /dev/null +++ b/packages/ingest-github/src/strategies/flat-folder/phases/analyse-big-files.ts @@ -0,0 +1,287 @@ +import { createHash } from "node:crypto"; +import { logger } from "@bb/logger"; +import { Config } from "@bb/types"; +import { getConfigValue } from "@bb/config"; +import type { AskLlmOptions } from "@bb/llm"; +import { LlmConfigError, LlmError } from "@bb/errors"; +import type { MetaPaths } from "#src/types/meta-paths.ts"; +import type { AnalyzedFileResult, SourceReader } from 
"#src/types/pipeline.ts"; +import type { ProgressContext } from "#src/progress/types.ts"; +import type { ConcurrencyLimiter } from "#src/pipeline/concurrency.ts"; +import type { ChunkAnalysisResult, FileChunk, HugeFileManifest } from "#src/types/big-file.ts"; +import type { CondensedFileAnalysis } from "#src/types/condensed-file-analysis.ts"; +import { throwIfCancelled, CancellationError } from "#src/pipeline/cancellation.ts"; +import { inspect } from "#src/strategies/flat-folder/big-file/cache.ts"; +import { splitFileIntoChunks } from "#src/strategies/flat-folder/big-file/chunker.ts"; +import { analyzeChunk } from "#src/strategies/flat-folder/big-file/chunk-analyzer.ts"; +import { condenseChunks } from "#src/strategies/flat-folder/big-file/condenser.ts"; +import { + loadChunkIfPresent, + saveChunk, + saveCondensed, + saveManifest, +} from "#src/strategies/flat-folder/big-file/storage.ts"; +import type { ScanManifest, ScanManifestEntry } from "#src/strategies/flat-folder/scan-manifest.ts"; +import type { ProcessBigFilesResult } from "#src/strategies/flat-folder/phases/process-big-files.ts"; +import { describe } from "#src/strategies/flat-folder/phases/process-big-files.ts"; + +const CONDENSE_MAX_ATTEMPTS = 2; +const CONDENSE_RETRY_BACKOFF_MS = 2000; + +export interface AnalyseBigFilesInput { + knowledgeId: string; + manifest: ScanManifest; + source: SourceReader; + metaPaths: MetaPaths; + limiter: ConcurrencyLimiter; + llmCallContext?: AskLlmOptions; + progressContext?: ProgressContext; +} + +interface BigFileState { + entry: ScanManifestEntry; + content: string; + chunks: FileChunk[]; + results: (ChunkAnalysisResult | undefined)[]; + pendingChunks: number; + fatal: boolean; +} + +/** + * Manifest-driven big-file phase. Every chunk of every big file is an + * independent task scheduled through the shared LLM limiter. 
As soon as the + * last chunk of a given file lands, that file's condense is scheduled — + * multiple condenses run in parallel with the still-pending chunks of slower + * files. All LLM calls (chunk + condense) check out from the same limiter. + * + * Files already fully processed (manifest + condensed on disk) are skipped. + */ +export async function analyseBigFiles(input: AnalyseBigFilesInput): Promise { + const maxTokensPerChunk = getConfigValue(Config.MaxTokensPerChunk); + const bigEntries = input.manifest.entries.filter((e) => e.kind === "big"); + + let cached = 0; + let failed = 0; + let processed = 0; + let totalInputTokens = 0; + let totalOutputTokens = 0; + let totalCostUsd = 0; + + // Per-file preparation: read content, chunk, record state. Sequential and + // cheap — no LLM calls here. + const states: BigFileState[] = []; + for (const entry of bigEntries) { + throwIfCancelled(input.knowledgeId); + const status = await inspect(input.metaPaths, entry.relativePath); + if (status === "complete") { + cached += 1; + continue; + } + let content: string; + try { + content = await input.source.readFile(entry.relativePath); + } catch (cause: unknown) { + failed += 1; + logger.warn(`analyse-big: read failed for ${entry.relativePath}: ${describe(cause)}`); + continue; + } + if (content.length === 0) { + failed += 1; + logger.warn(`analyse-big: empty content for ${entry.relativePath}; skipping`); + continue; + } + const chunks = splitFileIntoChunks(entry.relativePath, content, maxTokensPerChunk); + states.push({ + entry, + content, + chunks, + results: new Array(chunks.length), + pendingChunks: chunks.length, + fatal: false, + }); + logger.info(`analyse-big: ${entry.relativePath} split into ${chunks.length} chunks`); + } + + const totalChunks = states.reduce((acc, s) => acc + s.chunks.length, 0); + const chunkReporter = input.progressContext?.reporter({ + phase: "file_analysis", + subPhase: "big_files_chunks", + total: { kind: "fixed", total: totalChunks }, + }); + 
await chunkReporter?.start(); + const condenseReporter = input.progressContext?.reporter({ + phase: "file_analysis", + subPhase: "big_files_condense", + total: { kind: "fixed", total: states.length }, + }); + await condenseReporter?.start(); + + // For oversized entries the legacy phase counted them; we accept the manifest + // already accounted for them via the small phase (which writes the stub). + // Surfaced here for parity with the legacy result shape. + const skippedOversized = input.manifest.entries.filter((e) => e.kind === "oversized").length; + + const condensePromises: Promise[] = []; + + function maybeScheduleCondense(state: BigFileState): void { + if (state.pendingChunks > 0 || state.fatal) { + return; + } + const definedResults = state.results.filter((r): r is ChunkAnalysisResult => r !== undefined); + condensePromises.push( + input.limiter(async () => { + throwIfCancelled(input.knowledgeId); + let merged: AnalyzedFileResult | null = null; + for (let attempt = 1; attempt <= CONDENSE_MAX_ATTEMPTS; attempt += 1) { + try { + merged = await condenseChunks(state.entry.relativePath, definedResults, input.llmCallContext); + break; + } catch (cause: unknown) { + if (cause instanceof CancellationError) { + throw cause; + } + if (cause instanceof LlmConfigError || cause instanceof LlmError) { + throw cause; + } + if (attempt < CONDENSE_MAX_ATTEMPTS) { + logger.warn( + `analyse-big: condense attempt ${attempt}/${CONDENSE_MAX_ATTEMPTS} failed for ${state.entry.relativePath}; retrying: ${describe(cause)}`, + ); + await sleep(CONDENSE_RETRY_BACKOFF_MS); + continue; + } + failed += 1; + logger.warn( + `analyse-big: condense failed after ${CONDENSE_MAX_ATTEMPTS} attempts for ${state.entry.relativePath}: ${describe(cause)}`, + ); + } + } + if (merged === null) { + condenseReporter?.increment(1, { fileName: state.entry.relativePath }); + return; + } + + try { + const chunkInputTokens = definedResults.reduce((acc, r) => acc + (r.tokenUsage?.inputTokens ?? 
0), 0); + const chunkOutputTokens = definedResults.reduce((acc, r) => acc + (r.tokenUsage?.outputTokens ?? 0), 0); + const chunkCostUsd = definedResults.reduce((acc, r) => acc + (r.tokenUsage?.costUsd ?? 0), 0); + const totalTokenCount = state.chunks.reduce((acc, c) => acc + c.tokenCount, 0); + const totalIn = chunkInputTokens + (merged.tokenUsage?.inputTokens ?? 0); + const totalOut = chunkOutputTokens + (merged.tokenUsage?.outputTokens ?? 0); + const totalCost = chunkCostUsd + (merged.tokenUsage?.costUsd ?? 0); + + const manifest: HugeFileManifest = { + relativePath: state.entry.relativePath, + totalChunks: state.chunks.length, + totalTokenCount, + chunkPaths: state.chunks.map((_, i) => `chunks/${encodeFolder(state.entry.relativePath)}/chunk-${i}.json`), + generatedAt: new Date().toISOString(), + }; + await saveManifest(input.metaPaths, manifest); + + const condensed: CondensedFileAnalysis = { + relativePath: state.entry.relativePath, + language: merged.language, + sha256: sha256(state.content), + sizeBytes: state.entry.sizeBytes, + tokenCount: totalTokenCount, + isBigFile: true, + totalChunks: state.chunks.length, + totalTokenCount, + analysedAt: new Date().toISOString(), + analysis: merged.analysis, + tokenUsage: { inputTokens: totalIn, outputTokens: totalOut, costUsd: totalCost }, + }; + await saveCondensed(input.metaPaths, condensed); + + totalInputTokens += totalIn; + totalOutputTokens += totalOut; + totalCostUsd += totalCost; + processed += 1; + } catch (cause: unknown) { + if (cause instanceof CancellationError) { + throw cause; + } + failed += 1; + logger.warn(`analyse-big: persist failed for ${state.entry.relativePath}: ${describe(cause)}`); + } finally { + condenseReporter?.increment(1, { fileName: state.entry.relativePath }); + } + }), + ); + } + + const chunkPromises: Promise[] = []; + for (const state of states) { + for (let i = 0; i < state.chunks.length; i += 1) { + const idx = i; + const chunk = state.chunks[idx]; + if (chunk === undefined) { + 
continue; + } + chunkPromises.push( + input.limiter(async () => { + throwIfCancelled(input.knowledgeId); + try { + const cachedChunk = await loadChunkIfPresent(input.metaPaths, state.entry.relativePath, idx); + if (cachedChunk !== null) { + state.results[idx] = cachedChunk; + } else { + const analyzed = await analyzeChunk(chunk, input.llmCallContext); + await saveChunk(input.metaPaths, analyzed); + state.results[idx] = analyzed; + } + } catch (cause: unknown) { + if (cause instanceof CancellationError) { + throw cause; + } + if (cause instanceof LlmConfigError || cause instanceof LlmError) { + state.fatal = true; + throw cause; + } + logger.warn( + `analyse-big: chunk ${idx + 1}/${state.chunks.length} failed for ${state.entry.relativePath}: ${describe(cause)}`, + ); + } finally { + state.pendingChunks -= 1; + chunkReporter?.increment(1, { fileName: `${state.entry.relativePath}#chunk-${String(idx)}` }); + maybeScheduleCondense(state); + } + }), + ); + } + } + + try { + await Promise.all(chunkPromises); + await Promise.all(condensePromises); + } finally { + chunkReporter?.stop(); + condenseReporter?.stop(); + } + + logger.info( + `analyse-big done: processed=${processed} cached=${cached} failed=${failed} skippedOversized=${skippedOversized}`, + ); + return { + processed, + cached, + failed, + skippedOversized, + tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens, costUsd: totalCostUsd }, + }; +} + +function sha256(content: string): string { + return createHash("sha256").update(content).digest("hex"); +} + +function encodeFolder(relativePath: string): string { + return relativePath.replace(/\//gu, "__SL__").replace(/\\/gu, "__BS__"); +} + +function sleep(ms: number): Promise { + return new Promise((resolve) => { + setTimeout(resolve, ms); + }); +} diff --git a/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts b/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts index 2f8b7ba..951b10e 
100644 --- a/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts +++ b/packages/ingest-github/src/strategies/flat-folder/phases/process-big-files.ts @@ -1,32 +1,13 @@ -import { createHash } from "node:crypto"; import { logger } from "@bb/logger"; -import { Config } from "@bb/types"; -import { getConfigValue } from "@bb/config"; import type { AskLlmOptions } from "@bb/llm"; import { LlmConfigError, LlmError } from "@bb/errors"; import type { MetaPaths } from "#src/types/meta-paths.ts"; -import type { AnalyzedFileResult, SourceReader } from "#src/types/pipeline.ts"; +import type { SourceReader } from "#src/types/pipeline.ts"; import type { ProgressContext } from "#src/progress/types.ts"; -import type { ConcurrencyLimiter } from "#src/pipeline/concurrency.ts"; -import type { ChunkAnalysisResult, FileChunk, HugeFileManifest } from "#src/types/big-file.ts"; -import type { CondensedFileAnalysis } from "#src/types/condensed-file-analysis.ts"; import { throwIfCancelled, CancellationError } from "#src/pipeline/cancellation.ts"; import { readBigFiles } from "#src/strategies/flat-folder/big-file/detector.ts"; import { inspect } from "#src/strategies/flat-folder/big-file/cache.ts"; -import { splitFileIntoChunks } from "#src/strategies/flat-folder/big-file/chunker.ts"; -import { analyzeChunk } from "#src/strategies/flat-folder/big-file/chunk-analyzer.ts"; -import { condenseChunks } from "#src/strategies/flat-folder/big-file/condenser.ts"; -import { - loadChunkIfPresent, - saveChunk, - saveCondensed, - saveManifest, -} from "#src/strategies/flat-folder/big-file/storage.ts"; import { processBigFile } from "#src/strategies/flat-folder/big-file/index.ts"; -import type { ScanManifest, ScanManifestEntry } from "#src/strategies/flat-folder/scan-manifest.ts"; - -const CONDENSE_MAX_ATTEMPTS = 2; -const CONDENSE_RETRY_BACKOFF_MS = 2000; export interface ProcessBigFilesInput { knowledgeId: string; @@ -139,269 +120,6 @@ export async function 
processBigFilesQueue(input: ProcessBigFilesInput): Promise } } -// --------------------------------------------------------------------------- -// Chunk-queue model (manifest-driven) -// --------------------------------------------------------------------------- - -export interface AnalyseBigFilesInput { - knowledgeId: string; - manifest: ScanManifest; - source: SourceReader; - metaPaths: MetaPaths; - limiter: ConcurrencyLimiter; - llmCallContext?: AskLlmOptions; - progressContext?: ProgressContext; -} - -interface BigFileState { - entry: ScanManifestEntry; - content: string; - chunks: FileChunk[]; - results: (ChunkAnalysisResult | undefined)[]; - pendingChunks: number; - fatal: boolean; -} - -/** - * Manifest-driven big-file phase. Every chunk of every big file is an - * independent task scheduled through the shared LLM limiter. As soon as the - * last chunk of a given file lands, that file's condense is scheduled — - * multiple condenses run in parallel with the still-pending chunks of slower - * files. All LLM calls (chunk + condense) check out from the same limiter. - * - * Files already fully processed (manifest + condensed on disk) are skipped. - */ -export async function analyseBigFiles(input: AnalyseBigFilesInput): Promise { - const maxTokensPerChunk = getConfigValue(Config.MaxTokensPerChunk); - const bigEntries = input.manifest.entries.filter((e) => e.kind === "big"); - - let cached = 0; - let skippedOversized = 0; - let failed = 0; - let processed = 0; - let totalInputTokens = 0; - let totalOutputTokens = 0; - let totalCostUsd = 0; - - // Per-file preparation: read content, chunk, record state. Sequential and - // cheap — no LLM calls here. 
- const states: BigFileState[] = []; - for (const entry of bigEntries) { - throwIfCancelled(input.knowledgeId); - const status = await inspect(input.metaPaths, entry.relativePath); - if (status === "complete") { - cached += 1; - continue; - } - let content: string; - try { - content = await input.source.readFile(entry.relativePath); - } catch (cause: unknown) { - failed += 1; - logger.warn(`analyse-big: read failed for ${entry.relativePath}: ${describe(cause)}`); - continue; - } - if (content.length === 0) { - failed += 1; - logger.warn(`analyse-big: empty content for ${entry.relativePath}; skipping`); - continue; - } - const chunks = splitFileIntoChunks(entry.relativePath, content, maxTokensPerChunk); - states.push({ - entry, - content, - chunks, - results: new Array(chunks.length), - pendingChunks: chunks.length, - fatal: false, - }); - logger.info(`analyse-big: ${entry.relativePath} split into ${chunks.length} chunks`); - } - - const totalChunks = states.reduce((acc, s) => acc + s.chunks.length, 0); - const chunkReporter = input.progressContext?.reporter({ - phase: "file_analysis", - subPhase: "big_files_chunks", - total: { kind: "fixed", total: totalChunks }, - }); - await chunkReporter?.start(); - const condenseReporter = input.progressContext?.reporter({ - phase: "file_analysis", - subPhase: "big_files_condense", - total: { kind: "fixed", total: states.length }, - }); - await condenseReporter?.start(); - - // For oversized entries the legacy phase counted them; we accept the manifest - // already accounted for them via the small phase (which writes the stub). - // Surfaced here for parity with the legacy result shape. 
- skippedOversized = input.manifest.entries.filter((e) => e.kind === "oversized").length; - - const condensePromises: Promise[] = []; - - function maybeScheduleCondense(state: BigFileState): void { - if (state.pendingChunks > 0 || state.fatal) { - return; - } - const definedResults = state.results.filter((r): r is ChunkAnalysisResult => r !== undefined); - condensePromises.push( - input.limiter(async () => { - throwIfCancelled(input.knowledgeId); - let merged: AnalyzedFileResult | null = null; - for (let attempt = 1; attempt <= CONDENSE_MAX_ATTEMPTS; attempt += 1) { - try { - merged = await condenseChunks(state.entry.relativePath, definedResults, input.llmCallContext); - break; - } catch (cause: unknown) { - if (cause instanceof CancellationError) { - throw cause; - } - if (cause instanceof LlmConfigError || cause instanceof LlmError) { - throw cause; - } - if (attempt < CONDENSE_MAX_ATTEMPTS) { - logger.warn( - `analyse-big: condense attempt ${attempt}/${CONDENSE_MAX_ATTEMPTS} failed for ${state.entry.relativePath}; retrying: ${describe(cause)}`, - ); - await sleep(CONDENSE_RETRY_BACKOFF_MS); - continue; - } - failed += 1; - logger.warn( - `analyse-big: condense failed after ${CONDENSE_MAX_ATTEMPTS} attempts for ${state.entry.relativePath}: ${describe(cause)}`, - ); - } - } - if (merged === null) { - condenseReporter?.increment(1, { fileName: state.entry.relativePath }); - return; - } - - try { - const chunkInputTokens = definedResults.reduce((acc, r) => acc + (r.tokenUsage?.inputTokens ?? 0), 0); - const chunkOutputTokens = definedResults.reduce((acc, r) => acc + (r.tokenUsage?.outputTokens ?? 0), 0); - const chunkCostUsd = definedResults.reduce((acc, r) => acc + (r.tokenUsage?.costUsd ?? 0), 0); - const totalTokenCount = state.chunks.reduce((acc, c) => acc + c.tokenCount, 0); - const totalIn = chunkInputTokens + (merged.tokenUsage?.inputTokens ?? 0); - const totalOut = chunkOutputTokens + (merged.tokenUsage?.outputTokens ?? 
0); - const totalCost = chunkCostUsd + (merged.tokenUsage?.costUsd ?? 0); - - const manifest: HugeFileManifest = { - relativePath: state.entry.relativePath, - totalChunks: state.chunks.length, - totalTokenCount, - chunkPaths: state.chunks.map((_, i) => `chunks/${encodeFolder(state.entry.relativePath)}/chunk-${i}.json`), - generatedAt: new Date().toISOString(), - }; - await saveManifest(input.metaPaths, manifest); - - const condensed: CondensedFileAnalysis = { - relativePath: state.entry.relativePath, - language: merged.language, - sha256: sha256(state.content), - sizeBytes: state.entry.sizeBytes, - tokenCount: totalTokenCount, - isBigFile: true, - totalChunks: state.chunks.length, - totalTokenCount, - analysedAt: new Date().toISOString(), - analysis: merged.analysis, - tokenUsage: { inputTokens: totalIn, outputTokens: totalOut, costUsd: totalCost }, - }; - await saveCondensed(input.metaPaths, condensed); - - totalInputTokens += totalIn; - totalOutputTokens += totalOut; - totalCostUsd += totalCost; - processed += 1; - } catch (cause: unknown) { - if (cause instanceof CancellationError) { - throw cause; - } - failed += 1; - logger.warn(`analyse-big: persist failed for ${state.entry.relativePath}: ${describe(cause)}`); - } finally { - condenseReporter?.increment(1, { fileName: state.entry.relativePath }); - } - }), - ); - } - - const chunkPromises: Promise[] = []; - for (const state of states) { - for (let i = 0; i < state.chunks.length; i += 1) { - const idx = i; - const chunk = state.chunks[idx]; - if (chunk === undefined) { - continue; - } - chunkPromises.push( - input.limiter(async () => { - throwIfCancelled(input.knowledgeId); - try { - const cachedChunk = await loadChunkIfPresent(input.metaPaths, state.entry.relativePath, idx); - if (cachedChunk !== null) { - state.results[idx] = cachedChunk; - } else { - const analyzed = await analyzeChunk(chunk, input.llmCallContext); - await saveChunk(input.metaPaths, analyzed); - state.results[idx] = analyzed; - } - } catch 
(cause: unknown) { - if (cause instanceof CancellationError) { - throw cause; - } - if (cause instanceof LlmConfigError || cause instanceof LlmError) { - state.fatal = true; - throw cause; - } - logger.warn( - `analyse-big: chunk ${idx + 1}/${state.chunks.length} failed for ${state.entry.relativePath}: ${describe(cause)}`, - ); - } finally { - state.pendingChunks -= 1; - chunkReporter?.increment(1, { fileName: `${state.entry.relativePath}#chunk-${String(idx)}` }); - maybeScheduleCondense(state); - } - }), - ); - } - } - - try { - await Promise.all(chunkPromises); - await Promise.all(condensePromises); - } finally { - chunkReporter?.stop(); - condenseReporter?.stop(); - } - - logger.info( - `analyse-big done: processed=${processed} cached=${cached} failed=${failed} skippedOversized=${skippedOversized}`, - ); - return { - processed, - cached, - failed, - skippedOversized, - tokenUsage: { inputTokens: totalInputTokens, outputTokens: totalOutputTokens, costUsd: totalCostUsd }, - }; -} - -function sha256(content: string): string { - return createHash("sha256").update(content).digest("hex"); -} - -function encodeFolder(relativePath: string): string { - return relativePath.replace(/\//gu, "__SL__").replace(/\\/gu, "__BS__"); -} - -function describe(cause: unknown): string { +export function describe(cause: unknown): string { return cause instanceof Error ? 
cause.message : String(cause); } - -function sleep(ms: number): Promise { - return new Promise((resolve) => { - setTimeout(resolve, ms); - }); -} From 29e6cc53b2a0b36dd421a5d4969b1be67b7be4af Mon Sep 17 00:00:00 2001 From: Dead-Bytes <143434285+Dead-Bytes@users.noreply.github.com> Date: Fri, 22 May 2026 18:35:55 +0530 Subject: [PATCH 11/11] chore(ts-cleanup): tsconfig files cleared --- packages/cli/src/output.d.ts | 16 -- packages/cli/tsconfig.json | 2 +- packages/config/tsconfig.json | 2 +- packages/errors/tsconfig.json | 2 +- .../ingest-business-context/tsconfig.json | 2 +- packages/ingest-github/tsconfig.json | 2 +- packages/ingest-github/types/index.d.ts | 137 ------------------ packages/llm/tsconfig.json | 2 +- packages/logger/tsconfig.json | 2 +- packages/mcp/tsconfig.json | 2 +- packages/mongo/tsconfig.json | 2 +- packages/neo4j/tsconfig.json | 2 +- packages/queue/tsconfig.json | 2 +- packages/redis/tsconfig.json | 2 +- packages/server/tsconfig.json | 2 +- packages/types/tsconfig.json | 2 +- tsconfig.base.json | 12 +- tsconfig.json | 22 +-- 18 files changed, 21 insertions(+), 194 deletions(-) delete mode 100644 packages/cli/src/output.d.ts delete mode 100644 packages/ingest-github/types/index.d.ts diff --git a/packages/cli/src/output.d.ts b/packages/cli/src/output.d.ts deleted file mode 100644 index e20f44b..0000000 --- a/packages/cli/src/output.d.ts +++ /dev/null @@ -1,16 +0,0 @@ -export declare function success(line: string): void; -export declare function error(line: string, hint?: string): void; -export declare function list(label: string, items: readonly string[]): void; -export interface Spinner { - update(text: string): void; - stop(success: boolean, finalMsg?: string): void; -} -export declare function createSpinner(initialText: string): Spinner; -export interface ProgressBar { - update(current: number, total: number, text?: string): void; - stop(success: boolean, finalMsg?: string): void; -} -export declare function createProgressBar(initialText: 
string): ProgressBar; -export declare function table(headers: string[], rows: string[][]): void; -export declare function info(line: string): void; -//# sourceMappingURL=output.d.ts.map diff --git a/packages/cli/tsconfig.json b/packages/cli/tsconfig.json index d8a16a7..4ed0786 100644 --- a/packages/cli/tsconfig.json +++ b/packages/cli/tsconfig.json @@ -1,4 +1,4 @@ { - "extends": "../../../../tsconfig.base.json", + "extends": "../../tsconfig.base.json", "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/config/tsconfig.json b/packages/config/tsconfig.json index d8a16a7..4ed0786 100644 --- a/packages/config/tsconfig.json +++ b/packages/config/tsconfig.json @@ -1,4 +1,4 @@ { - "extends": "../../../../tsconfig.base.json", + "extends": "../../tsconfig.base.json", "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/errors/tsconfig.json b/packages/errors/tsconfig.json index d8a16a7..4ed0786 100644 --- a/packages/errors/tsconfig.json +++ b/packages/errors/tsconfig.json @@ -1,4 +1,4 @@ { - "extends": "../../../../tsconfig.base.json", + "extends": "../../tsconfig.base.json", "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/ingest-business-context/tsconfig.json b/packages/ingest-business-context/tsconfig.json index d8a16a7..4ed0786 100644 --- a/packages/ingest-business-context/tsconfig.json +++ b/packages/ingest-business-context/tsconfig.json @@ -1,4 +1,4 @@ { - "extends": "../../../../tsconfig.base.json", + "extends": "../../tsconfig.base.json", "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/ingest-github/tsconfig.json b/packages/ingest-github/tsconfig.json index d8a16a7..4ed0786 100644 --- a/packages/ingest-github/tsconfig.json +++ b/packages/ingest-github/tsconfig.json @@ -1,4 +1,4 @@ { - "extends": "../../../../tsconfig.base.json", + "extends": "../../tsconfig.base.json", "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff 
--git a/packages/ingest-github/types/index.d.ts b/packages/ingest-github/types/index.d.ts deleted file mode 100644 index 98445ad..0000000 --- a/packages/ingest-github/types/index.d.ts +++ /dev/null @@ -1,137 +0,0 @@ -export interface RegisterGithubWorkersDeps { - sourceFactory?: SourceFactory; - pullFactory?: PullFactory; - progressContextFactory?: ProgressContextFactory; -} - -export type ProgressPhase = "file_analysis" | "folder_analysis" | "indexing"; - -export type ProgressTotalMode = { kind: "fixed"; total: number } | { kind: "growing"; initialTotal?: number }; - -export interface ProgressReporterInput { - readonly phase: ProgressPhase; - readonly subPhase?: string; - readonly total: ProgressTotalMode; - readonly resolveInitialProcessed?: () => Promise | number; -} - -export interface ProgressReporter { - start(): Promise; - increment(delta?: number, meta?: { fileName?: string }): void; - incrementSeen(delta?: number): void; - setTotal(total: number): void; - stop(): void; -} - -export interface ProgressContext { - reporter(input: ProgressReporterInput): ProgressReporter; - phaseChanged(phase: ProgressPhase): void; - completed(message?: string): void; - failed(error: string, phase?: ProgressPhase): void; -} - -export type ProgressContextFactory = (knowledgeId: string) => ProgressContext; - -export declare const nullProgressContextFactory: ProgressContextFactory; - -export declare function registerGithubWorkers(deps?: RegisterGithubWorkersDeps): void; -export declare function registerLocalIngestWorker(): void; - -export interface FlatFolderStrategyDeps { - fileAnalyzer: FileAnalyzer; - progressContextFactory?: ProgressContextFactory; -} -export declare function createFlatFolderStrategy(deps: FlatFolderStrategyDeps): IngestStrategy; -export declare const createLlmFileAnalyzer: (...args: any[]) => any; -export declare const createDiskSourceReader: (...args: any[]) => any; -export declare const createPipelineRunner: (...args: any[]) => any; -export declare const 
createGithubIngestHandler: (...args: any[]) => any; -export declare const createLocalIngestHandler: (...args: any[]) => any; -export declare const runPull: (...args: any[]) => any; -export declare const reposRoot: (...args: any[]) => string; -export declare const repoCloneDir: (knowledgeId: string) => string; -export declare const metaRootFor: (knowledgeId: string) => string; -export declare const metaPathsFor: (knowledgeId: string) => unknown; -export declare const commitMetaDir: (knowledgeId: string, commitHash: string) => string; -export declare const businessContextDir: (knowledgeId: string, commitHash: string, sanitizedTitle: string) => string; -export declare const orgRegistryDir: (knowledgeId: string, orgId: string) => string; -export declare function fetchLatestCommitHash( - repoUrl: string, - branch: string, - gitToken?: string, -): Promise; -export declare function fetchRecentCommits( - repoUrl: string, - branch: string, - limit?: number, - gitToken?: string, -): Promise; -export declare function fetchDefaultBranch(repoUrl: string, gitToken?: string): Promise; -export declare function fetchBranches( - repoUrl: string, - gitToken?: string, - limit?: number, -): Promise<{ status: "ok"; branches: string[] } | { status: "error"; message: string }>; -export declare function parseGithubRepo(repoUrl: string): ParsedRepo | null; - -export interface BootstrapRuntimeOptions { - config: unknown; - loggerFactory: (scope: string) => unknown; -} -export declare function bootstrapRuntime(opts: BootstrapRuntimeOptions): Promise; - -export declare const COMBINED_CODE_ANALYSIS_SYSTEM_PROMPT: string; -export declare function buildFileAnalysisUserPrompt(input: { relativePath: string; content: string }): string; - -export type CreatePipelineRunnerDeps = any; -export type IngestJobHandlerDeps = any; -export type IngestRunnerDeps = any; -export type IngestRunnerInput = any; -export type IngestStrategy = any; -export type StrategyInput = any; -export type StrategyResult = any; 
-export type StrategyContext = any; -export type FileAnalyzer = any; -export type AnalyzedFileResult = any; -export type ScanEntry = any; -export type ScannedFile = any; -export type OversizedFile = any; -export type ScanDeps = any; -export type SourceReader = any; -export type ArchiveSink = any; -export type ArchiveSinkInput = any; -export type SourceFactory = any; -export type SourceFactoryInput = any; -export type SourceFactoryResult = any; -export type PullFactory = any; -export type PullFactoryInput = any; -export type PullFactoryResult = any; -export type DiffResult = any; -export type RenamedFile = any; -export type CondensedFileAnalysis = any; -export interface CommitEntry { - sha: string; - message: string; - author: string; - timestamp: string; -} - -export type FetchCommitsResult = - | { status: "ok"; commits: CommitEntry[] } - | { status: "not_found" } - | { status: "unauthorized" } - | { status: "rate_limited" } - | { status: "error"; message: string }; - -export interface ParsedRepo { - owner: string; - repo: string; - branch?: string; -} - -export type DefaultBranchResult = - | { status: "ok"; branch: string } - | { status: "not_found" } - | { status: "unauthorized" } - | { status: "rate_limited" } - | { status: "error"; message: string }; diff --git a/packages/llm/tsconfig.json b/packages/llm/tsconfig.json index d8a16a7..4ed0786 100644 --- a/packages/llm/tsconfig.json +++ b/packages/llm/tsconfig.json @@ -1,4 +1,4 @@ { - "extends": "../../../../tsconfig.base.json", + "extends": "../../tsconfig.base.json", "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/logger/tsconfig.json b/packages/logger/tsconfig.json index d8a16a7..4ed0786 100644 --- a/packages/logger/tsconfig.json +++ b/packages/logger/tsconfig.json @@ -1,4 +1,4 @@ { - "extends": "../../../../tsconfig.base.json", + "extends": "../../tsconfig.base.json", "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/mcp/tsconfig.json 
b/packages/mcp/tsconfig.json index d8a16a7..4ed0786 100644 --- a/packages/mcp/tsconfig.json +++ b/packages/mcp/tsconfig.json @@ -1,4 +1,4 @@ { - "extends": "../../../../tsconfig.base.json", + "extends": "../../tsconfig.base.json", "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/mongo/tsconfig.json b/packages/mongo/tsconfig.json index d8a16a7..4ed0786 100644 --- a/packages/mongo/tsconfig.json +++ b/packages/mongo/tsconfig.json @@ -1,4 +1,4 @@ { - "extends": "../../../../tsconfig.base.json", + "extends": "../../tsconfig.base.json", "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/neo4j/tsconfig.json b/packages/neo4j/tsconfig.json index d8a16a7..4ed0786 100644 --- a/packages/neo4j/tsconfig.json +++ b/packages/neo4j/tsconfig.json @@ -1,4 +1,4 @@ { - "extends": "../../../../tsconfig.base.json", + "extends": "../../tsconfig.base.json", "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/queue/tsconfig.json b/packages/queue/tsconfig.json index d8a16a7..4ed0786 100644 --- a/packages/queue/tsconfig.json +++ b/packages/queue/tsconfig.json @@ -1,4 +1,4 @@ { - "extends": "../../../../tsconfig.base.json", + "extends": "../../tsconfig.base.json", "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/redis/tsconfig.json b/packages/redis/tsconfig.json index d8a16a7..4ed0786 100644 --- a/packages/redis/tsconfig.json +++ b/packages/redis/tsconfig.json @@ -1,4 +1,4 @@ { - "extends": "../../../../tsconfig.base.json", + "extends": "../../tsconfig.base.json", "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/packages/server/tsconfig.json b/packages/server/tsconfig.json index d8a16a7..4ed0786 100644 --- a/packages/server/tsconfig.json +++ b/packages/server/tsconfig.json @@ -1,4 +1,4 @@ { - "extends": "../../../../tsconfig.base.json", + "extends": "../../tsconfig.base.json", "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] 
} diff --git a/packages/types/tsconfig.json b/packages/types/tsconfig.json index d8a16a7..4ed0786 100644 --- a/packages/types/tsconfig.json +++ b/packages/types/tsconfig.json @@ -1,4 +1,4 @@ { - "extends": "../../../../tsconfig.base.json", + "extends": "../../tsconfig.base.json", "include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"] } diff --git a/tsconfig.base.json b/tsconfig.base.json index 6903d08..9226217 100644 --- a/tsconfig.base.json +++ b/tsconfig.base.json @@ -6,6 +6,7 @@ "module": "ESNext", "moduleResolution": "bundler", "moduleDetection": "force", + "jsx": "react-jsx", "allowImportingTsExtensions": true, "verbatimModuleSyntax": true, "isolatedModules": true, @@ -36,12 +37,9 @@ "types": ["bun"], - "composite": true, - "declaration": true, - "declarationMap": true, - "sourceMap": true, - "incremental": true, - "noEmit": false, - "emitDeclarationOnly": true + "composite": false, + "declaration": false, + "noEmit": true, + "incremental": true } } diff --git a/tsconfig.json b/tsconfig.json index 4f4863d..80c98f2 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -1,26 +1,8 @@ { "extends": "./tsconfig.base.json", "compilerOptions": { - "composite": false, - "declaration": false, - "declarationMap": false, "noEmit": true }, - "files": [], - "references": [ - { "path": "packages/types" }, - { "path": "packages/errors" }, - { "path": "packages/config" }, - { "path": "packages/logger" }, - { "path": "packages/mongo" }, - { "path": "packages/redis" }, - { "path": "packages/queue" }, - { "path": "packages/llm" }, - { "path": "packages/ingest-github" }, - { "path": "packages/ingest-business-context" }, - { "path": "packages/cli" }, - { "path": "packages/server" }, - { "path": "packages/neo4j" }, - { "path": "packages/mcp" } - ] + "include": ["packages/*/src/**/*.ts", "packages/*/src/**/*.tsx", "packages/*/src/**/*.json"], + "exclude": ["**/node_modules", "**/dist", "**/*.d.ts"] }