Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions packages/ingest-github/src/strategies/flat-folder/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,14 @@ this single pool. One knob bounds total in-flight LLM concurrency.
**two-pass** strategy: walk + cache-only `decideStatic` first, then
parallel-deduplicated LLM resolution for unknown extensions/filenames
through the shared limiter, then drain.
1b. **write-eligible-files** (`eligible-files.ts`) — between scan and the
2a/2b parallel block, persists `.bytebell/eligible_files.json` (paths +
parent folders for every `small`/`big` entry, plus the commit hash) to
the source layer (local disk under `source.localRepoDir/.bytebell/` and/or
the `archiveSink`). Read back by `@bytebell/knowledge-validation` to
verify every file the analyzer was asked to process landed in Neo4j.
Hard-fails (throws) if neither write target is available — a knowledge
that was indexed but cannot be validated is not a state we want.
2a. **analyse-small** (`phases/analyse-small.ts`) — reads the manifest's
`kind: "small"` entries, re-opens content, runs the LLM file-analyser
per file under the shared limiter, writes `CondensedFileAnalysis` JSON.
Expand Down Expand Up @@ -104,6 +112,7 @@ The strategy emits progress through the `ProgressContext` port defined in
see updated entries without re-reading disk.
- `scan-manifest.ts` — `ScanManifest` shape, `readScanManifest`,
`writeScanManifest`. The canonical handoff between phase 1 and phases 2a/2b.
- `eligible-files.ts` — `writeEligibleFiles({knowledgeId, manifest, source, archiveSink?})`. Writes `.bytebell/eligible_files.json` to the source layer between phase 1 and 2a/2b. The validation service (`@bytebell/knowledge-validation`) reads this artifact to cross-check post-indexing consistency.
- `folder-path.ts` — `directFolderOf`, `affectedFolderPaths`.
- `folder-summary.ts` — group + summarise (individual or batched) + persist
- iterate folder summaries; shared `dispatchFolderSummaries` used by both
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import { mkdir, writeFile } from "node:fs/promises";
import path from "node:path";
import { logger } from "@bb/logger";
import type { ArchiveSink, SourceReader } from "#src/types/pipeline.ts";
import { affectedFolderPaths } from "./folder-path.ts";
import type { ScanManifest } from "./scan-manifest.ts";

/**
 * Location of the eligible-files document relative to the root of the source
 * layer. Passed as `relativePath` when the document is pushed to the archive
 * sink; the local-disk write in `writeEligibleFiles` targets the same
 * `.bytebell/` directory and file name under `source.localRepoDir`.
 */
export const ELIGIBLE_FILES_RELATIVE_PATH = ".bytebell/eligible_files.json";

/**
 * JSON shape that `writeEligibleFiles` serialises into
 * `.bytebell/eligible_files.json`.
 */
export interface EligibleFilesDocument {
/** Identifier of the knowledge this list belongs to (copied from the writer's input). */
knowledgeId: string;
/** Commit hash reported by the source reader (`source.commitHash`) when the list was written. */
commitHash: string;
/** ISO-8601 timestamp of when the document was built (`Date#toISOString`). */
generatedAt: string;
/** Sorted relative paths of every manifest entry whose kind is `small` or `big`. */
files: string[];
/** Folder paths derived from `files` via `affectedFolderPaths`. */
folders: string[];
}

/** Arguments accepted by `writeEligibleFiles`. */
export interface WriteEligibleFilesInput {
/** Knowledge identifier; stored in the document and included in log and error messages. */
knowledgeId: string;
/** Scan manifest whose `small` / `big` entries supply the file list. */
manifest: ScanManifest;
/** Source reader; supplies `commitHash` and, when `localRepoDir` is non-empty, the local write target. */
source: SourceReader;
/** Optional archive sink; when provided, the document is also pushed to it. */
archiveSink?: ArchiveSink;
}

/**
 * Writes the eligible-files document: the sorted relative paths of every
 * `small` / `big` manifest entry, the folders derived from them, the source
 * commit hash and a generation timestamp.
 *
 * Intended to run before any small-file or big-file LLM analysis so that the
 * downstream `@bytebell/knowledge-validation` service can later read the
 * artifact from the source layer and check that every eligible file landed
 * in Neo4j.
 *
 * Every available target receives a copy: the local repo directory when
 * `source.localRepoDir` is non-empty, and the archive sink when one is
 * supplied. Both are written when both are present.
 *
 * @param input - Knowledge id, scan manifest, source reader and optional archive sink.
 * @throws Error when there is neither a local repo directory nor an archive
 *   sink, because an indexed knowledge that cannot be validated is not an
 *   acceptable outcome. Failures from the underlying writes propagate as-is.
 */
export async function writeEligibleFiles(input: WriteEligibleFilesInput): Promise<void> {
  const { knowledgeId, manifest, source, archiveSink } = input;

  // Only `small` and `big` entries are listed; every other kind is left out.
  const files: string[] = [];
  for (const entry of manifest.entries) {
    if (entry.kind === "small" || entry.kind === "big") {
      files.push(entry.relativePath);
    }
  }
  files.sort();
  const folders = affectedFolderPaths(files);

  const payload: EligibleFilesDocument = {
    knowledgeId,
    commitHash: source.commitHash,
    generatedAt: new Date().toISOString(),
    files,
    folders,
  };
  const content = JSON.stringify(payload, null, 2);

  // Counts the targets that accepted the document; zero means nothing was persisted.
  let targetsWritten = 0;

  if (source.localRepoDir.length > 0) {
    const bytebellDir = path.join(source.localRepoDir, ".bytebell");
    await mkdir(bytebellDir, { recursive: true });
    await writeFile(path.join(bytebellDir, "eligible_files.json"), content, "utf8");
    targetsWritten += 1;
  }

  if (archiveSink !== undefined) {
    await archiveSink.push({
      knowledgeId,
      relativePath: ELIGIBLE_FILES_RELATIVE_PATH,
      content,
    });
    targetsWritten += 1;
  }

  if (targetsWritten === 0) {
    throw new Error(
      `flat-folder: cannot persist eligible_files.json for ${knowledgeId}: source reader has no localRepoDir and no archiveSink is configured`,
    );
  }

  logger.info(
    `flat-folder: persisted eligible_files.json for ${knowledgeId} (files=${String(files.length)} folders=${String(folders.length)})`,
  );
}
17 changes: 17 additions & 0 deletions packages/ingest-github/src/strategies/flat-folder/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import { withConcurrency } from "#src/pipeline/concurrency.ts";
import { scanAndClassify } from "./phases/scan-and-classify.ts";
import { analyseSmallFiles } from "./phases/analyse-small.ts";
import { analyseBigFiles } from "./phases/analyse-big-files.ts";
import { writeEligibleFiles } from "./eligible-files.ts";
import { backfillMissingFields } from "./backfill/fields.ts";
import { FileAnalysisCache } from "./file-analysis-cache.ts";
import { runFolderSummaryPhase } from "./folder-summary.ts";
Expand Down Expand Up @@ -52,6 +53,22 @@ export function createFlatFolderStrategy(deps: FlatFolderStrategyDeps): IngestSt
}
const { manifest } = await scanAndClassify(scanInput);

// Persist the canonical eligible-files list BEFORE any small- or
// big-file LLM call runs. Read back by `@bytebell/knowledge-validation`
// to verify every file the analyzer was asked to process landed in
// Neo4j. Must be the last step before analysis dispatch — if this
// fails, the knowledge is not validatable post-hoc and we'd rather
// fail the run than ship an un-checkable index.
const eligibleInput: Parameters<typeof writeEligibleFiles>[0] = {
knowledgeId,
manifest,
source,
};
if (archiveSink !== undefined) {
eligibleInput.archiveSink = archiveSink;
}
await writeEligibleFiles(eligibleInput);

progressContext.phaseChanged("file_analysis");
logger.info(
`flat-folder: phase2 (analyse small ${manifest.summary.smallCount} + big ${manifest.summary.bigCount}) starting in parallel`,
Expand Down
Loading