diff --git a/packages/ingest-github/src/strategies/flat-folder/README.md b/packages/ingest-github/src/strategies/flat-folder/README.md index 78d8acf..b391d3c 100644 --- a/packages/ingest-github/src/strategies/flat-folder/README.md +++ b/packages/ingest-github/src/strategies/flat-folder/README.md @@ -21,6 +21,14 @@ this single pool. One knob bounds total in-flight LLM concurrency. **two-pass** strategy: walk + cache-only `decideStatic` first, then parallel-deduplicated LLM resolution for unknown extensions/filenames through the shared limiter, then drain. + 1b. **write-eligible-files** (`eligible-files.ts`) — between scan and the + 2a/2b parallel block, persists `.bytebell/eligible_files.json` (paths + + parent folders for every `small`/`big` entry, plus the commit hash) to + the source layer (local disk under `source.localRepoDir/.bytebell/` and/or + the `archiveSink`). Read back by `@bytebell/knowledge-validation` to + verify every file the analyzer was asked to process landed in Neo4j. + Hard-fails if neither write target is available — an un-validatable + knowledge is not a state we want. 2a. **analyse-small** (`phases/analyse-small.ts`) — reads the manifest's `kind: "small"` entries, re-opens content, runs the LLM file-analyser per file under the shared limiter, writes `CondensedFileAnalysis` JSON. @@ -104,6 +112,7 @@ The strategy emits progress through the `ProgressContext` port defined in see updated entries without re-reading disk. - `scan-manifest.ts` — `ScanManifest` shape, `readScanManifest`, `writeScanManifest`. The canonical handoff between phase 1 and phases 2a/2b. +- `eligible-files.ts` — `writeEligibleFiles({knowledgeId, manifest, source, archiveSink?})`. Writes `.bytebell/eligible_files.json` to the source layer between phase 1 and 2a/2b. The validation service (`@bytebell/knowledge-validation`) reads this artifact to cross-check post-indexing consistency. - `folder-path.ts` — `directFolderOf`, `affectedFolderPaths`. 
- `folder-summary.ts` — group + summarise (individual or batched) + persist - iterate folder summaries; shared `dispatchFolderSummaries` used by both diff --git a/packages/ingest-github/src/strategies/flat-folder/eligible-files.ts b/packages/ingest-github/src/strategies/flat-folder/eligible-files.ts new file mode 100644 index 0000000..0205889 --- /dev/null +++ b/packages/ingest-github/src/strategies/flat-folder/eligible-files.ts @@ -0,0 +1,75 @@ +import { mkdir, writeFile } from "node:fs/promises"; +import path from "node:path"; +import { logger } from "@bb/logger"; +import type { ArchiveSink, SourceReader } from "#src/types/pipeline.ts"; +import { affectedFolderPaths } from "./folder-path.ts"; +import type { ScanManifest } from "./scan-manifest.ts"; + +export const ELIGIBLE_FILES_RELATIVE_PATH = ".bytebell/eligible_files.json"; + +export interface EligibleFilesDocument { + knowledgeId: string; + commitHash: string; + generatedAt: string; + files: string[]; + folders: string[]; +} + +export interface WriteEligibleFilesInput { + knowledgeId: string; + manifest: ScanManifest; + source: SourceReader; + archiveSink?: ArchiveSink; +} + +/** + * Persist the canonical list of files the analyzer is about to process, + * BEFORE any small-file or big-file LLM call runs. The downstream + * `@bytebell/knowledge-validation` service reads this artifact via the same + * source layer to cross-check that every eligible file landed in Neo4j. + * + * Writes to every available target: local disk when the source reader is + * disk-backed (`source.localRepoDir` is non-empty) and the archive sink when + * one is supplied. Throws if neither target is available, since a + * successfully-indexed but un-validatable knowledge is not a state we want.
+ */ +export async function writeEligibleFiles(input: WriteEligibleFilesInput): Promise<void> { + const files = input.manifest.entries + .filter((entry) => entry.kind === "small" || entry.kind === "big") + .map((entry) => entry.relativePath) + .sort(); + const folders = affectedFolderPaths(files); + const doc: EligibleFilesDocument = { + knowledgeId: input.knowledgeId, + commitHash: input.source.commitHash, + generatedAt: new Date().toISOString(), + files, + folders, + }; + const content = JSON.stringify(doc, null, 2); + + let wrote = false; + if (input.source.localRepoDir.length > 0) { + const targetDir = path.join(input.source.localRepoDir, ".bytebell"); + const targetFile = path.join(targetDir, "eligible_files.json"); + await mkdir(targetDir, { recursive: true }); + await writeFile(targetFile, content, "utf8"); + wrote = true; + } + if (input.archiveSink !== undefined) { + await input.archiveSink.push({ + knowledgeId: input.knowledgeId, + relativePath: ELIGIBLE_FILES_RELATIVE_PATH, + content, + }); + wrote = true; + } + if (!wrote) { + throw new Error( + `flat-folder: cannot persist eligible_files.json for ${input.knowledgeId}: source reader has no localRepoDir and no archiveSink is configured`, + ); + } + logger.info( + `flat-folder: persisted eligible_files.json for ${input.knowledgeId} (files=${String(files.length)} folders=${String(folders.length)})`, + ); +} diff --git a/packages/ingest-github/src/strategies/flat-folder/index.ts b/packages/ingest-github/src/strategies/flat-folder/index.ts index 86797a6..4fcd091 100644 --- a/packages/ingest-github/src/strategies/flat-folder/index.ts +++ b/packages/ingest-github/src/strategies/flat-folder/index.ts @@ -9,6 +9,7 @@ import { withConcurrency } from "#src/pipeline/concurrency.ts"; import { scanAndClassify } from "./phases/scan-and-classify.ts"; import { analyseSmallFiles } from "./phases/analyse-small.ts"; import { analyseBigFiles } from "./phases/analyse-big-files.ts"; +import { writeEligibleFiles } from
"./eligible-files.ts"; import { backfillMissingFields } from "./backfill/fields.ts"; import { FileAnalysisCache } from "./file-analysis-cache.ts"; import { runFolderSummaryPhase } from "./folder-summary.ts"; @@ -52,6 +53,22 @@ export function createFlatFolderStrategy(deps: FlatFolderStrategyDeps): IngestSt } const { manifest } = await scanAndClassify(scanInput); + // Persist the canonical eligible-files list BEFORE any small- or + // big-file LLM call runs. Read back by `@bytebell/knowledge-validation` + // to verify every file the analyzer was asked to process landed in + // Neo4j. Must be the last step before analysis dispatch — if this + // fails, the knowledge is not validatable post-hoc and we'd rather + // fail the run than ship an un-checkable index. + const eligibleInput: Parameters<typeof writeEligibleFiles>[0] = { + knowledgeId, + manifest, + source, + }; + if (archiveSink !== undefined) { + eligibleInput.archiveSink = archiveSink; + } + await writeEligibleFiles(eligibleInput); + progressContext.phaseChanged("file_analysis"); logger.info( `flat-folder: phase2 (analyse small ${manifest.summary.smallCount} + big ${manifest.summary.bigCount}) starting in parallel`,