Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 16 additions & 1 deletion bun.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

16 changes: 0 additions & 16 deletions packages/cli/src/output.d.ts

This file was deleted.

2 changes: 1 addition & 1 deletion packages/cli/tsconfig.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{
"extends": "../../../../tsconfig.base.json",
"extends": "../../tsconfig.base.json",
"include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"]
}
28 changes: 28 additions & 0 deletions packages/config/src/schema.ts
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,10 @@ export const configSchema = z
"big.file.concurrency": z.number().int().positive().default(25),
"absolute.file.size.cap": z.number().int().positive().default(52428800),
"concurrent.workers": z.number().int().positive().default(4),
"llm.concurrency": z.number().int().positive().default(29),
"folder.summary.batch.size": z.number().int().positive().default(10),
"folder.summary.batch.max.files": z.number().int().positive().default(15),
"neo4j.batch.size": z.number().int().positive().default(50),
"condense.context.limit": z.number().int().positive().default(12000),
"condense.prompt.overhead": z.number().int().nonnegative().default(1500),
"small.file.dedup.threshold": z.number().int().positive().default(3),
Expand Down Expand Up @@ -81,6 +85,10 @@ export type ConfigValueMap = {
[Config.BigFileConcurrency]: number;
[Config.AbsoluteFileSizeCap]: number;
[Config.ConcurrentWorkers]: number;
[Config.LlmConcurrency]: number;
[Config.FolderSummaryBatchSize]: number;
[Config.FolderSummaryBatchMaxFiles]: number;
[Config.Neo4jBatchSize]: number;
[Config.CondenseContextLimit]: number;
[Config.CondensePromptOverhead]: number;
[Config.SmallFileDedupThreshold]: number;
Expand Down Expand Up @@ -135,6 +143,10 @@ export const HINTS: Readonly<Record<Config, string>> = {
[Config.BigFileConcurrency]: "bytebell set big.file.concurrency <n>",
[Config.AbsoluteFileSizeCap]: "bytebell set absolute.file.size.cap <bytes>",
[Config.ConcurrentWorkers]: "bytebell set concurrent.workers <n>",
[Config.LlmConcurrency]: "bytebell set llm.concurrency <n>",
[Config.FolderSummaryBatchSize]: "bytebell set folder.summary.batch.size <n>",
[Config.FolderSummaryBatchMaxFiles]: "bytebell set folder.summary.batch.max.files <n>",
[Config.Neo4jBatchSize]: "bytebell set neo4j.batch.size <n>",
[Config.CondenseContextLimit]: "bytebell set condense.context.limit <n>",
[Config.CondensePromptOverhead]: "bytebell set condense.prompt.overhead <n>",
[Config.SmallFileDedupThreshold]: "bytebell set small.file.dedup.threshold <n>",
Expand Down Expand Up @@ -195,6 +207,14 @@ export function readField<K extends Config>(cfg: BytebellConfig, key: K): Config
return cfg["absolute.file.size.cap"] as ConfigValue<K>;
case Config.ConcurrentWorkers:
return cfg["concurrent.workers"] as ConfigValue<K>;
case Config.LlmConcurrency:
return cfg["llm.concurrency"] as ConfigValue<K>;
case Config.FolderSummaryBatchSize:
return cfg["folder.summary.batch.size"] as ConfigValue<K>;
case Config.FolderSummaryBatchMaxFiles:
return cfg["folder.summary.batch.max.files"] as ConfigValue<K>;
case Config.Neo4jBatchSize:
return cfg["neo4j.batch.size"] as ConfigValue<K>;
case Config.CondenseContextLimit:
return cfg["condense.context.limit"] as ConfigValue<K>;
case Config.CondensePromptOverhead:
Expand Down Expand Up @@ -264,6 +284,14 @@ export function writeField<K extends Config>(cfg: BytebellConfig, key: K, value:
return { ...cfg, "absolute.file.size.cap": value as number };
case Config.ConcurrentWorkers:
return { ...cfg, "concurrent.workers": value as number };
case Config.LlmConcurrency:
return { ...cfg, "llm.concurrency": value as number };
case Config.FolderSummaryBatchSize:
return { ...cfg, "folder.summary.batch.size": value as number };
case Config.FolderSummaryBatchMaxFiles:
return { ...cfg, "folder.summary.batch.max.files": value as number };
case Config.Neo4jBatchSize:
return { ...cfg, "neo4j.batch.size": value as number };
case Config.CondenseContextLimit:
return { ...cfg, "condense.context.limit": value as number };
case Config.CondensePromptOverhead:
Expand Down
2 changes: 1 addition & 1 deletion packages/config/tsconfig.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{
"extends": "../../../../tsconfig.base.json",
"extends": "../../tsconfig.base.json",
"include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"]
}
2 changes: 1 addition & 1 deletion packages/errors/tsconfig.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{
"extends": "../../../../tsconfig.base.json",
"extends": "../../tsconfig.base.json",
"include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"]
}
2 changes: 1 addition & 1 deletion packages/ingest-business-context/tsconfig.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{
"extends": "../../../../tsconfig.base.json",
"extends": "../../tsconfig.base.json",
"include": ["src/**/*.ts", "src/**/*.tsx", "src/**/*.json"]
}
47 changes: 36 additions & 11 deletions packages/ingest-github/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -132,33 +132,59 @@ worker hardcodes a single `IngestionStrategy` instance (currently
- `:File` graph nodes + `:HAS_FILE` / `:HAS_KEYWORD` / `:HAS_CLASS` /
`:HAS_FUNCTION` / `:HAS_IMPORT_INTERNAL` / `:HAS_IMPORT_EXTERNAL` relationships — written via
`upsertFileNode` from `@bb/neo4j`.
- `meta-output/scan-manifest.json` — the canonical small/big/oversized
classification produced by Phase 1 (`scanAndClassify`). Per-file entries
carry `tokenCount`, `kind`, and (for big files) `estimatedChunks`.
Phases 2a (small) and 2b (big) consume the manifest in parallel.
- `meta-output/bigFiles.json` — legacy view written alongside the manifest
for the pull-path and backfill phases. The main strategy no longer
consumes it directly.
- `FileAnalysisCache` (in-memory only, not persisted) — single
`Map<relativePath, CondensedFileAnalysis>` loaded once between the
analyse and backfill phases via parallel `readdir + readFile`. Replaces
three sequential `iterateCondensed` walks (phases 3, 5, 7) with one
parallel preload + three in-memory iterations. The pull workflow loads
its own cache instance; only one strategy run owns a given
`metaPaths` directory at a time. For repos beyond ~50k analysed files
consider a streaming-mode fallback (not implemented today).

## Invariants

1. **Sequential per-file processing.** Intentionally degraded; one
`upsertRawFile` per file. The small-file path issues one `askLLM`;
the big-file path issues N (one per chunk) plus condensation calls,
all sequential — no `Promise.all`, no concurrency cap. Revisit when
the latency profile demands it.
2. **Clone idempotent.** Re-runs (BullMQ retries) call `git fetch` +
1. **Shared LLM concurrency limiter.** The flat-folder strategy
constructs one `withConcurrency(Config.LlmConcurrency)` instance at
entry (default 29). The small-file phase, the big-file chunk phase,
per-file condense calls, **and the folder-summary phase** all check
out from this single pool, so total in-flight LLM calls is bounded
by one knob. The pull-path constructs its own shared limiter at
`runPull` entry and threads it into the selective folder-summary
phase. The legacy `processBigFile` driver used by the pull-path
still uses its own per-file pool sized by `Config.BigFileConcurrency`.
2. **Folder-summary batching by default.** Phase 5 groups small folders
(`≤ Config.FolderSummaryBatchMaxFiles`, default 15) into batches of
up to `Config.FolderSummaryBatchSize` (default 10) and asks the LLM
for one JSON object keyed by integer label that returns one summary
per folder. Bigger folders take the individual single-folder path.
Roll back to one LLM call per folder via
`bytebell set folder.summary.batch.size 1`.
3. **Clone idempotent.** Re-runs (BullMQ retries) call `git fetch` +
`git reset --hard` in the existing dir rather than re-cloning.
Tokens are re-injected into the remote URL each time.
3. **Token redaction.** `GitCloneError` carries the **redacted** repo
4. **Token redaction.** `GitCloneError` carries the **redacted** repo
URL (`https://user:***@host`) — the raw `gitToken` never appears in
error messages or logs.
4. **State transition order.** `Processing` is set _before_ any clone
5. **State transition order.** `Processing` is set _before_ any clone
work. `Processed` is set _only_ after the entire scan + analyze loop
completes. On any thrown error, the handler best-effort sets `Failed`
then re-throws so BullMQ records the retry.
5. **Fail-soft analysis, fail-hard infra.** A single file's LLM call
6. **Fail-soft analysis, fail-hard infra.** A single file's LLM call
failing falls back to an empty-analysis Raw doc and processing
continues. In the big-file path, a single chunk failure contributes
an empty analysis to the merge but does not stop the file; a
condensation-call failure falls through to deterministic
`dedupAnalyses` so the merged result is always well-formed. A clone
failure or Mongo write failure throws and propagates to BullMQ for
retry under the queue's `attempts: 3`.
6. **Hardcoded filters only.** No LLM-based ignore decisions in v0. The
7. **Hardcoded filters only.** No LLM-based ignore decisions in v0. The
directory / file / extension blocklists in `scan.ts` are the only
way files get skipped.

Expand All @@ -179,7 +205,6 @@ worker hardcodes a single `IngestionStrategy` instance (currently
- GitHub API streaming mode (always shell-clone)
- Default-branch auto-detection (caller supplies `branch`; defaults to
`"main"`)
- Concurrency control / parallel file processing
- Folder-level summaries / `repoSummary.json` / `flat-folder` strategy
- Semantic chunking (`SemanticChunker`)
- Per-chunk persistence (we persist only the merged file-level
Expand Down
28 changes: 23 additions & 5 deletions packages/ingest-github/src/pipeline/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ Domain (sub-folder of `@bb/ingest-github`).
- `skip-decisions/` — LLM-backed unknown-extension gate. See
`skip-decisions/README.md`. Active when `Config.SkipDecisionEnabled =
true` (default). Consumed by `scan.ts` via the optional `skipDecider`
dep; built by `classifyAndAnalyseSmall` if not injected.
dep; built by `scanAndClassify` (Phase 1) if not injected.
- `disk-source-reader.ts` — `createDiskSourceReader({ repoDir, commitHash })`
returns a `SourceReader` that wraps `scanRepository` + `node:fs.readFile`.
The default reader the open-source binary always uses, unless the caller
Expand All @@ -40,9 +40,27 @@ true` (default). Consumed by `scan.ts` via the optional `skipDecider`
enters the big-file phase). Both thresholds are config-driven — no
magic numbers in this file. `deps.llmCallContext` (when present) is
forwarded into every `SkipDeciderInput` so the LLM branch of the
unknown-extension gate uses per-job credentials. `readScannedFile`
re-reads a file by absolute path for the big-file phase which streams
content lazily.
unknown-extension gate uses per-job credentials.

**Two scan modes:**
- **Two-pass (default for the flat-folder strategy)** — activated when
`deps.skipDecider` AND `deps.limiter` are both supplied. Pass 1 walks
the tree calling `decider.decideStatic(...)`; static-resolved files
yield immediately, "needs LLM" files go into a pending buffer with
their content. Pass 2 dedupes pending entries by `ext:<ext>` or
`filename:<name>`, dispatches one `decider.decideAndDeferSave(...)` per
unique key through the shared limiter via `Promise.all`, then calls
`decider.persist()` exactly once. Pass 3 drains pending — every
`decideStatic` call is now a cache hit, so the drain is sync at the
decider boundary and yields each kept file with its buffered content.
- **Legacy inline (`walk()`)** — used when `deps.limiter` is omitted (e.g.
a custom `SourceFactory` consumer that didn't opt in). Inline `await
deps.skipDecider.decide(input)` per file. Same semantics as before this
refactor; preserved for backwards compatibility.

`readScannedFile` re-reads a file by absolute path for the big-file phase
which streams content lazily.

- `run.ts` — `createPipelineRunner({ reposRootDir, strategy, sourceFactory?, progressContextFactory? })`
builds an `IngestRunnerDeps`. GitHub payloads run: branch resolve,
source-reader construction, strategy execute, commit persistence. Local
Expand Down Expand Up @@ -76,7 +94,7 @@ archiveSink?}` and `runPull` skips `syncRepository` + `materialiseEndpoints`
(open-source default), the legacy git-based path runs. Either path
produces the same downstream pipeline: snapshot prior version,
`analyseChangedFiles` (now reading via `SourceReader`),
`processBigFilesQueue`, `backfillMissingFields`, `backfillBigFiles`,
`processBigFilesQueue`, `backfillMissingFields`,
`runSelectiveFolderSummary`, `summariseRepo`, `storePullAnalysis`.
Mirrors the index-side strategy orchestrator for progress: builds one
`ProgressContext` per job from the optional `progressContextFactory`
Expand Down
1 change: 1 addition & 0 deletions packages/ingest-github/src/pipeline/paths.ts
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ export function metaPathsFor(knowledgeId: string): MetaPaths {
bigFileAnalysisDir: path.join(metaRoot, "big-file-analysis"),
bigFileChunksDir: path.join(metaRoot, "big-file-analysis", "chunks"),
bigFilesJson: path.join(metaRoot, "bigFiles.json"),
scanManifestJson: path.join(metaRoot, "scan-manifest.json"),
repoSummaryJson: path.join(metaRoot, "repo-summary.json"),
};
}
Expand Down
26 changes: 11 additions & 15 deletions packages/ingest-github/src/pipeline/pull.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import { KnowledgeState, type GithubPullPayload, type JobMessage } from "@bb/types";
import { Config, KnowledgeState, type GithubPullPayload, type JobMessage } from "@bb/types";
import { getConfigValue } from "@bb/config";
import { withConcurrency } from "./concurrency.ts";
import { getKnowledge, markKnowledgeFailed, setKnowledgeCommit, setKnowledgeState } from "@bb/mongo";
import { setKnowledgeStateInGraph, snapshotFilesToVersion, type NodeScope } from "@bb/neo4j";
import type { PipelineSummary } from "#src/types/pipeline.ts";
Expand All @@ -19,7 +21,7 @@ import { nullProgressContextFactory } from "#src/progress/NullProgressReporter.t
import { analyseChangedFiles } from "#src/strategies/flat-folder/analyse-changed.ts";
import { processBigFilesQueue } from "#src/strategies/flat-folder/phases/process-big-files.ts";
import { backfillMissingFields } from "#src/strategies/flat-folder/backfill/fields.ts";
import { backfillBigFiles } from "#src/strategies/flat-folder/backfill/big-files.ts";
import { FileAnalysisCache } from "#src/strategies/flat-folder/file-analysis-cache.ts";
import { runSelectiveFolderSummary } from "#src/strategies/flat-folder/folder-summary-selective.ts";
import {
makeRepoSummaryEnvelope,
Expand Down Expand Up @@ -192,29 +194,23 @@ export async function runPull(
totalOutputTokens += phase2.tokenUsage.outputTokens;
totalCostUsd += phase2.tokenUsage.costUsd;

logger.info(`pull: phase backfill fields starting`);
logger.info(`pull: loading file-analysis cache`);
throwIfCancelled(knowledgeId);
await backfillMissingFields(metaPaths, llmCallContext, progressContext);
const fileAnalysisCache = await FileAnalysisCache.loadAll(metaPaths);
const limiter = withConcurrency(getConfigValue(Config.LlmConcurrency));

logger.info(`pull: phase backfill big-files starting`);
logger.info(`pull: phase backfill fields starting`);
throwIfCancelled(knowledgeId);
const backfillBigFilesInput: Parameters<typeof backfillBigFiles>[0] = {
knowledgeId,
source,
metaPaths,
progressContext,
};
if (llmCallContext !== undefined) {
backfillBigFilesInput.llmCallContext = llmCallContext;
}
await backfillBigFiles(backfillBigFilesInput);
await backfillMissingFields(metaPaths, fileAnalysisCache, limiter, llmCallContext, progressContext);

progressContext.phaseChanged("folder_analysis");
logger.info(`pull: phase selective folder summary (${affectedFolders.size} folders) starting`);
throwIfCancelled(knowledgeId);
const selectiveInput: Parameters<typeof runSelectiveFolderSummary>[0] = {
knowledgeId,
metaPaths,
cache: fileAnalysisCache,
limiter,
affectedFolders,
};
if (llmCallContext !== undefined) {
Expand Down
Loading
Loading