From e5305c1c6e4b91654f20a8617c808f08864aed03 Mon Sep 17 00:00:00 2001 From: unknown Date: Tue, 7 Apr 2026 23:11:36 +0000 Subject: [PATCH 1/8] =?UTF-8?q?=E5=87=8F=E5=B0=91=E4=B8=B2=E8=A1=8Cembeddi?= =?UTF-8?q?ng=E8=AF=B7=E6=B1=82=E5=AF=B9=E4=BA=8E=E6=9C=AC=E5=9C=B0?= =?UTF-8?q?=E6=9C=8D=E5=8A=A1=E5=9C=BA=E6=99=AF=E4=B8=8B=E7=9A=84=E6=80=A7?= =?UTF-8?q?=E8=83=BD=E6=8D=9F=E5=A4=B1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- package-lock.json | 14 ++++- src/smart-extractor.ts | 123 +++++++++++++++++++++++++++++++---------- 2 files changed, 105 insertions(+), 32 deletions(-) diff --git a/package-lock.json b/package-lock.json index fcbf1b04..de165655 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "memory-lancedb-pro", - "version": "1.1.0-beta.9", + "version": "1.1.0-beta.10", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "memory-lancedb-pro", - "version": "1.1.0-beta.9", + "version": "1.1.0-beta.10", "license": "MIT", "dependencies": { "@lancedb/lancedb": "^0.26.2", @@ -20,6 +20,13 @@ "commander": "^14.0.0", "jiti": "^2.6.0", "typescript": "^5.9.3" + }, + "optionalDependencies": { + "@lancedb/lancedb-darwin-arm64": "^0.26.2", + "@lancedb/lancedb-darwin-x64": "^0.26.2", + "@lancedb/lancedb-linux-arm64-gnu": "^0.26.2", + "@lancedb/lancedb-linux-x64-gnu": "^0.26.2", + "@lancedb/lancedb-win32-x64-msvc": "^0.26.2" } }, "node_modules/@lancedb/lancedb": { @@ -71,6 +78,9 @@ "node": ">= 18" } }, + "node_modules/@lancedb/lancedb-darwin-x64": { + "optional": true + }, "node_modules/@lancedb/lancedb-linux-arm64-gnu": { "version": "0.26.2", "resolved": "https://registry.npmjs.org/@lancedb/lancedb-linux-arm64-gnu/-/lancedb-linux-arm64-gnu-0.26.2.tgz", diff --git a/src/smart-extractor.ts b/src/smart-extractor.ts index dab1bb56..7837c660 100644 --- a/src/smart-extractor.ts +++ b/src/smart-extractor.ts @@ -287,10 +287,9 @@ export class SmartExtractor { let survivingCandidates = capped; try { const abstracts = capped.map((c) => c.abstract); - const vectors = await Promise.all( - abstracts.map((a) => this.embedder.embed(a).catch(() => [] as number[])), - ); - const dedupResult = batchDedup(abstracts, vectors); + const vectors = await this.embedder.embedBatch(abstracts); + const safeVectors = vectors.map((v) => v || []); + const dedupResult = batchDedup(abstracts, safeVectors); if (dedupResult.duplicateIndices.length > 0) { survivingCandidates = dedupResult.survivingIndices.map((i) => capped[i]); stats.skipped += dedupResult.duplicateIndices.length; @@ -305,7 +304,35 @@ export class SmartExtractor { } // Step 2: Process each surviving candidate through dedup pipeline - for (const candidate of survivingCandidates) { + // Pre-compute vectors for non-profile candidates in a single batch API call + // to reduce embedding round-trips from N to 1. + const precomputedVectors = new Map(); + const nonProfileEntries: { index: number; text: string }[] = []; + for (let i = 0; i < survivingCandidates.length; i++) { + const c = survivingCandidates[i]; + if (!ALWAYS_MERGE_CATEGORIES.has(c.category)) { + nonProfileEntries.push({ index: i, text: `${c.abstract} ${c.content}` }); + } + } + if (nonProfileEntries.length > 0) { + try { + const batchTexts = nonProfileEntries.map((e) => e.text); + const batchVectors = await this.embedder.embedBatch(batchTexts); + for (let j = 0; j < nonProfileEntries.length; j++) { + const vec = batchVectors[j]; + if (vec && vec.length > 0) { + precomputedVectors.set(nonProfileEntries[j].index, vec); + } + } + } catch (err) { + this.log( + `memory-pro: smart-extractor: batch pre-embed failed, will embed individually: ${String(err)}`, + ); + } + } + + for (let idx = 0; idx < survivingCandidates.length; idx++) { + const candidate = survivingCandidates[idx]; if ( isUserMdExclusiveMemory( { @@ -332,6 +359,7 @@ export class SmartExtractor { stats, targetScope, scopeFilter, + precomputedVectors.get(idx), ); } catch (err) { this.log( @@ -351,38 +379,68 @@ export class SmartExtractor { * Filter out texts that match noise prototypes by embedding similarity. * Long texts (>300 chars) are passed through without checking. * Only active when noiseBank is configured and initialized. + * + * Uses batch embedding to reduce API round-trips from N to 1. */ async filterNoiseByEmbedding(texts: string[]): Promise { const noiseBank = this.config.noiseBank; if (!noiseBank || !noiseBank.initialized) return texts; - const result: string[] = []; - for (const text of texts) { - // Very short texts lack semantic signal — skip noise check to avoid false positives - if (text.length <= 8) { - result.push(text); - continue; - } - // Long texts are unlikely to be pure noise queries - if (text.length > 300) { - result.push(text); - continue; + // Partition: short/long texts bypass noise check; mid-length need embedding + const SHORT_THRESHOLD = 8; + const LONG_THRESHOLD = 300; + const bypassFlags: boolean[] = texts.map( + (t) => t.length <= SHORT_THRESHOLD || t.length > LONG_THRESHOLD, + ); + + const needsEmbedIndices: number[] = []; + const needsEmbedTexts: string[] = []; + for (let i = 0; i < texts.length; i++) { + if (!bypassFlags[i]) { + needsEmbedIndices.push(i); + needsEmbedTexts.push(texts[i]); } + } + + // Batch embed all mid-length texts in a single API call + let vectors: number[][] = []; + if (needsEmbedTexts.length > 0) { try { - const vec = await this.embedder.embed(text); - if (!vec || vec.length === 0 || !noiseBank.isNoise(vec)) { - result.push(text); - } else { - this.debugLog( - `memory-lancedb-pro: smart-extractor: embedding noise filtered: ${text.slice(0, 80)}`, - ); - } + vectors = await this.embedder.embedBatch(needsEmbedTexts); } catch { - // Embedding failed — pass text through - result.push(text); + // Batch failed — pass all through + return texts.slice(); + } + } + + const result: string[] = new Array(texts.length); + // First, fill in bypass texts (always kept) + for (let i = 0; i < texts.length; i++) { + if (bypassFlags[i]) { + result[i] = texts[i]; + } + } + + // Then, check noise for embedded texts + for (let j = 0; j < needsEmbedIndices.length; j++) { + const idx = needsEmbedIndices[j]; + const vec = vectors[j]; + if (!vec || vec.length === 0) { + result[idx] = texts[idx]; + continue; + } + if (noiseBank.isNoise(vec)) { + this.debugLog( + `memory-lancedb-pro: smart-extractor: embedding noise filtered: ${texts[idx].slice(0, 80)}`, + ); + // Leave result[idx] as undefined — will be compacted below + } else { + result[idx] = texts[idx]; } } - return result; + + // Compact: remove undefined slots (filtered-out entries) + return result.filter(Boolean); } /** @@ -513,6 +571,10 @@ export class SmartExtractor { /** * Process a single candidate memory: dedup → merge/create → store + * + * @param precomputedVector - Optional pre-embedded vector for the candidate. + * When provided (from batch pre-embedding), skips the per-candidate embed + * call to reduce API round-trips. */ private async processCandidate( candidate: CandidateMemory, @@ -521,6 +583,7 @@ export class SmartExtractor { stats: ExtractionStats, targetScope: string, scopeFilter?: string[], + precomputedVector?: number[], ): Promise { // Profile always merges (skip dedup — admission control still applies) if (ALWAYS_MERGE_CATEGORIES.has(candidate.category)) { @@ -541,9 +604,9 @@ export class SmartExtractor { return; } - // Embed the candidate for vector dedup - const embeddingText = `${candidate.abstract} ${candidate.content}`; - const vector = await this.embedder.embed(embeddingText); + // Use pre-computed vector if available (batch embed optimization), + // otherwise fall back to per-candidate embed call. + const vector = precomputedVector ?? await this.embedder.embed(`${candidate.abstract} ${candidate.content}`); if (!vector || vector.length === 0) { this.log("memory-pro: smart-extractor: embedding failed, storing as-is"); await this.storeCandidate(candidate, vector || [], sessionKey, targetScope); From c3c53791656a4fc5db6881d0de88c555e6a2b533 Mon Sep 17 00:00:00 2001 From: unknown Date: Tue, 7 Apr 2026 23:19:09 +0000 Subject: [PATCH 2/8] =?UTF-8?q?=E4=BF=AE=E4=B8=AAbug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/smart-extractor.ts | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/smart-extractor.ts b/src/smart-extractor.ts index 7837c660..74ebf6fb 100644 --- a/src/smart-extractor.ts +++ b/src/smart-extractor.ts @@ -439,8 +439,10 @@ export class SmartExtractor { } } - // Compact: remove undefined slots (filtered-out entries) - return result.filter(Boolean); + // Compact: remove undefined slots (filtered-out entries). + // Use explicit undefined check rather than filter(Boolean) to preserve + // empty strings that were legitimately in bypass slots. + return result.filter((x): x is string => x !== undefined); } /** From 02e6ba2a3f627bf5635e44b5ebec152b719b6a6e Mon Sep 17 00:00:00 2001 From: unknown Date: Wed, 8 Apr 2026 10:19:52 +0000 Subject: [PATCH 3/8] =?UTF-8?q?=E5=87=8F=E5=B0=91=E4=B8=B2=E8=A1=8Cembeddi?= =?UTF-8?q?ng=E8=AF=B7=E6=B1=82=E5=AF=B9=E4=BA=8E=E6=9C=AC=E5=9C=B0?= =?UTF-8?q?=E6=9C=8D=E5=8A=A1=E5=9C=BA=E6=99=AF=E4=B8=8B=E7=9A=84=E6=80=A7?= =?UTF-8?q?=E8=83=BD=E6=8D=9F=E5=A4=B1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 将 smart-extractor.ts 中的三处串行 embed 调用改为批量 embedBatch 调用 --- src/smart-extractor.ts | 123 +++++++++++++++++++++++++++++++---------- 1 file changed, 93 insertions(+), 30 deletions(-) diff --git a/src/smart-extractor.ts b/src/smart-extractor.ts index dab1bb56..7837c660 100644 --- a/src/smart-extractor.ts +++ b/src/smart-extractor.ts @@ -287,10 +287,9 @@ export class SmartExtractor { let survivingCandidates = capped; try { const abstracts = capped.map((c) => c.abstract); - const vectors = await Promise.all( - abstracts.map((a) => this.embedder.embed(a).catch(() => [] as number[])), - ); - const dedupResult = batchDedup(abstracts, vectors); + const vectors = await this.embedder.embedBatch(abstracts); + const safeVectors = vectors.map((v) => v || []); + const dedupResult = batchDedup(abstracts, safeVectors); if (dedupResult.duplicateIndices.length > 0) { survivingCandidates = dedupResult.survivingIndices.map((i) => capped[i]); stats.skipped += dedupResult.duplicateIndices.length; @@ -305,7 +304,35 @@ export class SmartExtractor { } // Step 2: Process each surviving candidate through dedup pipeline - for (const candidate of survivingCandidates) { + // Pre-compute vectors for non-profile candidates in a single batch API call + // to reduce embedding round-trips from N to 1. + const precomputedVectors = new Map(); + const nonProfileEntries: { index: number; text: string }[] = []; + for (let i = 0; i < survivingCandidates.length; i++) { + const c = survivingCandidates[i]; + if (!ALWAYS_MERGE_CATEGORIES.has(c.category)) { + nonProfileEntries.push({ index: i, text: `${c.abstract} ${c.content}` }); + } + } + if (nonProfileEntries.length > 0) { + try { + const batchTexts = nonProfileEntries.map((e) => e.text); + const batchVectors = await this.embedder.embedBatch(batchTexts); + for (let j = 0; j < nonProfileEntries.length; j++) { + const vec = batchVectors[j]; + if (vec && vec.length > 0) { + precomputedVectors.set(nonProfileEntries[j].index, vec); + } + } + } catch (err) { + this.log( + `memory-pro: smart-extractor: batch pre-embed failed, will embed individually: ${String(err)}`, + ); + } + } + + for (let idx = 0; idx < survivingCandidates.length; idx++) { + const candidate = survivingCandidates[idx]; if ( isUserMdExclusiveMemory( { @@ -332,6 +359,7 @@ export class SmartExtractor { stats, targetScope, scopeFilter, + precomputedVectors.get(idx), ); } catch (err) { this.log( @@ -351,38 +379,68 @@ export class SmartExtractor { * Filter out texts that match noise prototypes by embedding similarity. * Long texts (>300 chars) are passed through without checking. * Only active when noiseBank is configured and initialized. + * + * Uses batch embedding to reduce API round-trips from N to 1. */ async filterNoiseByEmbedding(texts: string[]): Promise { const noiseBank = this.config.noiseBank; if (!noiseBank || !noiseBank.initialized) return texts; - const result: string[] = []; - for (const text of texts) { - // Very short texts lack semantic signal — skip noise check to avoid false positives - if (text.length <= 8) { - result.push(text); - continue; - } - // Long texts are unlikely to be pure noise queries - if (text.length > 300) { - result.push(text); - continue; + // Partition: short/long texts bypass noise check; mid-length need embedding + const SHORT_THRESHOLD = 8; + const LONG_THRESHOLD = 300; + const bypassFlags: boolean[] = texts.map( + (t) => t.length <= SHORT_THRESHOLD || t.length > LONG_THRESHOLD, + ); + + const needsEmbedIndices: number[] = []; + const needsEmbedTexts: string[] = []; + for (let i = 0; i < texts.length; i++) { + if (!bypassFlags[i]) { + needsEmbedIndices.push(i); + needsEmbedTexts.push(texts[i]); } + } + + // Batch embed all mid-length texts in a single API call + let vectors: number[][] = []; + if (needsEmbedTexts.length > 0) { try { - const vec = await this.embedder.embed(text); - if (!vec || vec.length === 0 || !noiseBank.isNoise(vec)) { - result.push(text); - } else { - this.debugLog( - `memory-lancedb-pro: smart-extractor: embedding noise filtered: ${text.slice(0, 80)}`, - ); - } + vectors = await this.embedder.embedBatch(needsEmbedTexts); } catch { - // Embedding failed — pass text through - result.push(text); + // Batch failed — pass all through + return texts.slice(); + } + } + + const result: string[] = new Array(texts.length); + // First, fill in bypass texts (always kept) + for (let i = 0; i < texts.length; i++) { + if (bypassFlags[i]) { + result[i] = texts[i]; + } + } + + // Then, check noise for embedded texts + for (let j = 0; j < needsEmbedIndices.length; j++) { + const idx = needsEmbedIndices[j]; + const vec = vectors[j]; + if (!vec || vec.length === 0) { + result[idx] = texts[idx]; + continue; + } + if (noiseBank.isNoise(vec)) { + this.debugLog( + `memory-lancedb-pro: smart-extractor: embedding noise filtered: ${texts[idx].slice(0, 80)}`, + ); + // Leave result[idx] as undefined — will be compacted below + } else { + result[idx] = texts[idx]; } } - return result; + + // Compact: remove undefined slots (filtered-out entries) + return result.filter(Boolean); } /** @@ -513,6 +571,10 @@ export class SmartExtractor { /** * Process a single candidate memory: dedup → merge/create → store + * + * @param precomputedVector - Optional pre-embedded vector for the candidate. + * When provided (from batch pre-embedding), skips the per-candidate embed + * call to reduce API round-trips. */ private async processCandidate( candidate: CandidateMemory, @@ -521,6 +583,7 @@ export class SmartExtractor { stats: ExtractionStats, targetScope: string, scopeFilter?: string[], + precomputedVector?: number[], ): Promise { // Profile always merges (skip dedup — admission control still applies) if (ALWAYS_MERGE_CATEGORIES.has(candidate.category)) { @@ -541,9 +604,9 @@ export class SmartExtractor { return; } - // Embed the candidate for vector dedup - const embeddingText = `${candidate.abstract} ${candidate.content}`; - const vector = await this.embedder.embed(embeddingText); + // Use pre-computed vector if available (batch embed optimization), + // otherwise fall back to per-candidate embed call. + const vector = precomputedVector ?? await this.embedder.embed(`${candidate.abstract} ${candidate.content}`); if (!vector || vector.length === 0) { this.log("memory-pro: smart-extractor: embedding failed, storing as-is"); await this.storeCandidate(candidate, vector || [], sessionKey, targetScope); From 57b8b2ffdbd8b1ef519507df81ac30edd8ae8541 Mon Sep 17 00:00:00 2001 From: unknown Date: Wed, 8 Apr 2026 10:20:05 +0000 Subject: [PATCH 4/8] chore: update dependencies (npm install) --- package-lock.json | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/package-lock.json b/package-lock.json index fcbf1b04..de165655 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "memory-lancedb-pro", - "version": "1.1.0-beta.9", + "version": "1.1.0-beta.10", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "memory-lancedb-pro", - "version": "1.1.0-beta.9", + "version": "1.1.0-beta.10", "license": "MIT", "dependencies": { "@lancedb/lancedb": "^0.26.2", @@ -20,6 +20,13 @@ "commander": "^14.0.0", "jiti": "^2.6.0", "typescript": "^5.9.3" + }, + "optionalDependencies": { + "@lancedb/lancedb-darwin-arm64": "^0.26.2", + "@lancedb/lancedb-darwin-x64": "^0.26.2", + "@lancedb/lancedb-linux-arm64-gnu": "^0.26.2", + "@lancedb/lancedb-linux-x64-gnu": "^0.26.2", + "@lancedb/lancedb-win32-x64-msvc": "^0.26.2" } }, "node_modules/@lancedb/lancedb": { @@ -71,6 +78,9 @@ "node": ">= 18" } }, + "node_modules/@lancedb/lancedb-darwin-x64": { + "optional": true + }, "node_modules/@lancedb/lancedb-linux-arm64-gnu": { "version": "0.26.2", "resolved": "https://registry.npmjs.org/@lancedb/lancedb-linux-arm64-gnu/-/lancedb-linux-arm64-gnu-0.26.2.tgz", From 59c043437d1edcab8d4aa94c7321e28024b89bdf Mon Sep 17 00:00:00 2001 From: unknown Date: Wed, 8 Apr 2026 10:20:25 +0000 Subject: [PATCH 5/8] fix: use explicit undefined check in filterNoiseByEmbedding compaction --- src/smart-extractor.ts | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/smart-extractor.ts b/src/smart-extractor.ts index 7837c660..74ebf6fb 100644 --- a/src/smart-extractor.ts +++ b/src/smart-extractor.ts @@ -439,8 +439,10 @@ export class SmartExtractor { } } - // Compact: remove undefined slots (filtered-out entries) - return result.filter(Boolean); + // Compact: remove undefined slots (filtered-out entries). + // Use explicit undefined check rather than filter(Boolean) to preserve + // empty strings that were legitimately in bypass slots. + return result.filter((x): x is string => x !== undefined); } /** From ecd1309a9b45b2beebda38379945cca4a07aa454 Mon Sep 17 00:00:00 2001 From: unknown Date: Wed, 8 Apr 2026 10:29:40 +0000 Subject: [PATCH 6/8] test: add explicit batch embedding path tests for SmartExtractor Covers 7 scenarios: Step 1b dedup batch, filterNoiseByEmbedding batch, candidate pre-compute batch, batch failure fallback, noise filter bypass correctness, profile exclusion from pre-computation --- test/smart-extractor-batch-embed.test.mjs | 371 ++++++++++++++++++++++ 1 file changed, 371 insertions(+) create mode 100644 test/smart-extractor-batch-embed.test.mjs diff --git a/test/smart-extractor-batch-embed.test.mjs b/test/smart-extractor-batch-embed.test.mjs new file mode 100644 index 00000000..b044b410 --- /dev/null +++ b/test/smart-extractor-batch-embed.test.mjs @@ -0,0 +1,371 @@ +/** + * Explicit tests for batch embedding paths in SmartExtractor. + * + * Verifies that the three refactored sites use embedBatch/embedBatch + * instead of serial per-element embed() calls, and that graceful + * fallback works when batch fails. + * + * NOTE: SmartExtractor uses INTERNAL categories (profile/preferences/entities/ + * events/cases/patterns), NOT store categories (preference/fact/decision/entity/ + * other). See src/memory-categories.ts for the canonical list. + */ + +import { describe, it } from "node:test"; +import assert from "node:assert/strict"; +import jitiFactory from "jiti"; + +const jiti = jitiFactory(import.meta.url, { interopDefault: true }); +const { SmartExtractor } = jiti("../src/smart-extractor.ts"); + +// ============================================================================ +// Helpers +// ============================================================================ + +/** Create a mock embedder with call counters for each method. */ +function makeCountingEmbedder(options = {}) { + const { + /** If set, embedBatch will throw (simulates batch failure). */ + batchShouldFail = false, + /** If set, embed will throw (simulates single embed failure). */ + embedShouldFail = false, + } = options; + + const calls = { embed: 0, embedBatch: 0 }; + + const embedder = { + async embed(text) { + calls.embed++; + if (embedShouldFail) throw new Error("mock embed failure"); + // Deterministic vector based on text length for dedup stability + return Array(256).fill(0).map((_, i) => (text.length > 0 ? (text.charCodeAt(i % text.length) / 255) : 0)); + }, + async embedBatch(texts) { + calls.embedBatch++; + if (batchShouldFail) throw new Error("mock batch failure"); + return Promise.all(texts.map((t) => embedder.embed(t))); + }, + get calls() { + return { ...calls }; + }, + }; + + return { embedder, calls }; +} + +/** Create a minimal LLM client that returns configurable candidates. + * Categories must use SmartExtractor INTERNAL names: + * profile | preferences | entities | events | cases | patterns + */ +function makeLlm(candidates) { + return { + async completeJson(_prompt, mode) { + if (mode === "extract-candidates") { + return { memories: candidates }; + } + if (mode === "dedup-decision") { + return { decision: "create", reason: "no match" }; + } + if (mode === "merge-memory") { + return candidates[0] ?? null; + } + return null; + }, + }; +} + +/** Create a minimal store that records all writes. */ +function makeStore() { + const entries = []; + const store = { + async vectorSearch(_vector, _limit, _minScore, _scopeFilter) { + return []; + }, + async store(entry) { + entries.push({ action: "store", entry }); + return entry; + }, + async update(_id, _patch, _scopeFilter) { + entries.push({ action: "update", id: _id }); + }, + async getById(_id, _scopeFilter) { + return null; + }, + get entries() { + return [...entries]; + }, + }; + return store; +} + +function makeExtractor(embedder, llm, store, config = {}) { + return new SmartExtractor(store, embedder, llm, { + user: "User", + extractMinMessages: 1, + extractMaxChars: 8000, + defaultScope: "global", + log() {}, + debugLog() {}, + ...config, + }); +} + +// ============================================================================ +// Tests +// ============================================================================ + +describe("SmartExtractor batch embedding paths", () => { + + // -------------------------------------------------------------------------- + // Test 1: Step 1b batchDedup uses embedBatch (not N×embed) + // -------------------------------------------------------------------------- + it("uses embedBatch for batch-internal dedup of candidate abstracts", async () => { + const { embedder, calls } = makeCountingEmbedder(); + const llm = makeLlm([ + { + category: "cases", + abstract: "用户居住在上海市浦东新区张江高科技园区", + overview: "地址信息", + content: "用户的居住地是上海市浦东新区张江高科技园区附近。", + }, + { + category: "cases", + abstract: "用户非常喜欢使用Python进行数据分析工作", + overview: "职业兴趣", + content: "用户对编程很感兴趣,特别是Python数据分析方向。", + }, + ]); + const store = makeStore(); + const extractor = makeExtractor(embedder, llm, store); + + await extractor.extractAndPersist("用户说:我住上海,喜欢编程。", "s1"); + + // Should have called embedBatch once for the abstracts (Step 1b) + assert.ok( + calls.embedBatch >= 1, + `Expected at least 1 embedBatch call for Step 1b dedup, got ${calls.embedBatch}`, + ); + }); + + // -------------------------------------------------------------------------- + // Test 2: filterNoiseByEmbedding uses embedBatch + // -------------------------------------------------------------------------- + it("uses embedBatch in filterNoiseByEmbedding when noise bank is active", async () => { + const { embedder, calls } = makeCountingEmbedder(); + const llm = makeLlm([ + { + category: "cases", + abstract: "这是一条测试记忆用于验证批量嵌入功能", + overview: "概览描述", + content: "详细的内容描述文本", + }, + ]); + const store = makeStore(); + + // Noise bank with isNoise that rejects everything (no noise) + const noiseBank = { + initialized: true, + isNoise(_vec) { return false; }, + learn(_vec) {}, + }; + + const extractor = makeExtractor(embedder, llm, store, { noiseBank }); + + await extractor.extractAndPersist("测试对话文本内容", "s1"); + + // filterNoiseByEmbedding should have used embedBatch for mid-length texts + assert.ok( + calls.embedBatch >= 1, + `Expected embedBatch to be called by filterNoiseByEmbedding, total=${calls.embedBatch}`, + ); + }); + + // -------------------------------------------------------------------------- + // Test 3: Batch pre-compute for non-profile candidates uses embedBatch + // -------------------------------------------------------------------------- + it("pre-computes vectors via embedBatch before processing candidates", async () => { + const { embedder, calls } = makeCountingEmbedder(); + const llm = makeLlm([ + { + category: "preferences", + abstract: "用户偏好使用深色主题来减少眼睛疲劳", + overview: "", + content: "用户明确表示偏好深色主题界面设置", + }, + { + category: "entities", + abstract: "张三是用户经常提到的同事名字", + overview: "", + content: "张三在用户的对话中多次被提及为同事关系", + }, + { + category: "events", + abstract: "上周参加了公司年度技术分享会议", + overview: "", + content: "用户参与了公司的年度技术分享活动", + }, + ]); + const store = makeStore(); + const extractor = makeExtractor(embedder, llm, store); + + await extractor.extractAndPersist("多候选对话内容用于测试预计算", "s1"); + + // At least one embedBatch call for pre-computing non-profile candidate vectors + assert.ok( + calls.embedBatch >= 1, + `Expected embedBatch for candidate pre-computation, got ${calls.embedBatch}`, + ); + }); + + // -------------------------------------------------------------------------- + // Test 4: Batch failure falls back gracefully (no crash) + // -------------------------------------------------------------------------- + it("falls back to individual embed when batch pre-computation fails", async () => { + const { embedder, calls } = makeCountingEmbedder({ + batchShouldFail: true, + }); + const llm = makeLlm([ + { + category: "cases", + abstract: "回退路径测试用例验证降级逻辑正确性", + overview: "", + content: "当batch失败时应该回退到单条embed调用方式", + }, + ]); + const store = makeStore(); + const extractor = makeExtractor(embedder, llm, store); + + // Should NOT throw — batch failure is caught and logged + const stats = await extractor.extractAndPersist("回退测试对话内容", "s1"); + + // Extraction should still succeed (fallback path) + assert.ok(stats.created >= 0 || stats.merged >= 0 || stats.skipped >= 0, + `Extraction should produce stats, got ${JSON.stringify(stats)}`); + + // Individual embed calls should have been made as fallback + assert.ok( + calls.embed >= 1, + `Expected fallback embed calls after batch failure, got embed=${calls.embed}, embedBatch=${calls.embedBatch}`, + ); + }); + + // -------------------------------------------------------------------------- + // Test 5: filterNoiseByEmbedding batch failure passes all texts through + // -------------------------------------------------------------------------- + it("passes all texts through when filterNoiseByEmbedding batch fails", async () => { + const { embedder } = makeCountingEmbedder({ + batchShouldFail: true, + }); + const llm = makeLlm([ + { + category: "cases", + abstract: "噪声过滤回退测试用例文本内容", + overview: "", + content: "详细内容描述", + }, + ]); + const store = makeStore(); + + const noiseBank = { + initialized: true, + isNoise(_vec) { return false; }, + learn(_vec) {}, + }; + + const extractor = makeExtractor(embedder, llm, store, { noiseBank }); + + // Should NOT throw — batch failure returns all texts unfiltered + const stats = await extractor.extractAndPersist("噪声过滤回退测试对话", "s1"); + + assert.ok(stats, "Extraction should complete despite noise filter batch failure"); + }); + + // -------------------------------------------------------------------------- + // Test 6: Bypass texts (short/long) are not sent to embedBatch + // -------------------------------------------------------------------------- + it("does not send bypass texts (short/long) to embedBatch in noise filter", async () => { + let lastBatchInput = null; + const embedder = { + async embed() { return [0.1]; }, + async embedBatch(texts) { + lastBatchInput = texts; + return texts.map(() => [0.1]); + }, + }; + const llm = makeLlm([ + { + category: "cases", + abstract: "正常长度文本用于噪声过滤测试验证逻辑", + overview: "", + content: "详细内容", + }, + ]); + const store = makeStore(); + + const noiseBank = { + initialized: true, + isNoise(_vec) { return false; }, + learn(_vec) {}, + }; + + const extractor = makeExtractor(embedder, llm, store, { noiseBank }); + await extractor.extractAndPersist("正常对话内容", "s1"); + + // If embedBatch was called, verify no extremely short or long texts were included + if (lastBatchInput !== null) { + for (const t of lastBatchInput) { + assert.ok( + t.length > 8 && t.length <= 300, + `Text sent to embedBatch should be in (8, 300] range, got length=${t.length}: "${t.slice(0, 40)}"`, + ); + } + } + }); + + // -------------------------------------------------------------------------- + // Test 7: Profile candidates are excluded from batch pre-computation + // -------------------------------------------------------------------------- + it("excludes profile-category candidates from batch pre-computation (Step 2)", async () => { + // Track all embedBatch calls to distinguish Step 1b (dedup) from Step 2 (pre-compute) + const allBatchCalls = []; + const embedder = { + async embed() { return Array(256).fill(0.1); }, + async embedBatch(texts) { + allBatchCalls.push([...texts]); + return texts.map(() => Array(256).fill(0.1)); + }, + }; + const llm = makeLlm([ + { + category: "profile", + abstract: "用户基本画像信息包括职业和地理位置偏好", + overview: "", + content: "这是用户的基本画像信息汇总数据。", + }, + ]); + const store = makeStore(); + const extractor = makeExtractor(embedder, llm, store); + + await extractor.extractAndPersist("画像提取测试对话内容", "s1"); + + // There should be at least 2 embedBatch calls: + // Call 1: Step 1b batchDedup (abstracts) — may include profile + // Call 2 (or later): Step 2 pre-computation — must NOT include profile + assert.ok(allBatchCalls.length >= 1, + `Expected at least 1 embedBatch call, got ${allBatchCalls.length}`); + + // The LAST embedBatch call(s) are for Step 2 pre-computation. + // Check that none of them contain profile candidate text. + const profileTexts = allBatchCalls.filter((call) => + call.some((t) => t.includes("用户基本画像") || t.includes("画像信息")), + ); + + // Step 1b dedup MAY include profile abstract (that's expected). + // But Step 2 pre-compute MUST exclude it. + // With a single profile candidate, we expect at most 1 call that includes + // profile text (the Step 1b dedup call). If there are more, that's a bug. + assert.ok( + profileTexts.length <= 1, + `Only Step 1b dedup may include profile text, but got ${profileTexts.length} calls with profile text`, + ); + }); +}); From 35e1a0ccc614fac56bcbcfe9ee130b17d14efb45 Mon Sep 17 00:00:00 2001 From: unknown Date: Wed, 8 Apr 2026 10:57:36 +0000 Subject: [PATCH 7/8] fix: make filterNoiseByEmbedding tests call the method directly Tests 2/5/6 were exercising extractAndPersist (which never calls filterNoiseByEmbedding) and asserting embedBatch was called for the wrong reason. Now they call filterNoiseByEmbedding() directly with controlled inputs, verifying batch path, failure fallback, and bypass correctness. --- test/smart-extractor-batch-embed.test.mjs | 127 +++++++++++++--------- 1 file changed, 77 insertions(+), 50 deletions(-) diff --git a/test/smart-extractor-batch-embed.test.mjs b/test/smart-extractor-batch-embed.test.mjs index b044b410..5fb9966a 100644 --- a/test/smart-extractor-batch-embed.test.mjs +++ b/test/smart-extractor-batch-embed.test.mjs @@ -42,7 +42,10 @@ function makeCountingEmbedder(options = {}) { async embedBatch(texts) { calls.embedBatch++; if (batchShouldFail) throw new Error("mock batch failure"); - return Promise.all(texts.map((t) => embedder.embed(t))); + // Return vectors directly WITHOUT calling this.embed() to keep counters independent + return (texts || []).map((t) => + Array(256).fill(0).map((_, i) => (t.length > 0 ? (t.charCodeAt(i % t.length) / 255) : 0)), + ); }, get calls() { return { ...calls }; @@ -147,21 +150,13 @@ describe("SmartExtractor batch embedding paths", () => { }); // -------------------------------------------------------------------------- - // Test 2: filterNoiseByEmbedding uses embedBatch + // Test 2: filterNoiseByEmbedding uses embedBatch (direct call) // -------------------------------------------------------------------------- it("uses embedBatch in filterNoiseByEmbedding when noise bank is active", async () => { const { embedder, calls } = makeCountingEmbedder(); - const llm = makeLlm([ - { - category: "cases", - abstract: "这是一条测试记忆用于验证批量嵌入功能", - overview: "概览描述", - content: "详细的内容描述文本", - }, - ]); - const store = makeStore(); + const llm = makeLlm([]); // not used by filterNoiseByEmbedding + const store = makeStore(); // not used by filterNoiseByEmbedding - // Noise bank with isNoise that rejects everything (no noise) const noiseBank = { initialized: true, isNoise(_vec) { return false; }, @@ -170,13 +165,28 @@ describe("SmartExtractor batch embedding paths", () => { const extractor = makeExtractor(embedder, llm, store, { noiseBank }); - await extractor.extractAndPersist("测试对话文本内容", "s1"); + // Call filterNoiseByEmbedding DIRECTLY — this is the method under test. + // Mix of lengths: short (bypass), mid-length (needs embedding), long (bypass). + const inputTexts = [ + "短", // ≤8 → bypass + "这是一条中等长度的测试文本用于验证批量嵌入功能", // 9-300 → needs embed + "这是一条另一条中等长度文本内容", // 9-300 → needs embed + "x".repeat(350), // >300 → bypass + ]; - // filterNoiseByEmbedding should have used embedBatch for mid-length texts - assert.ok( - calls.embedBatch >= 1, - `Expected embedBatch to be called by filterNoiseByEmbedding, total=${calls.embedBatch}`, - ); + const result = await extractor.filterNoiseByEmbedding(inputTexts); + + // All texts should pass through (isNoise returns false for everything) + assert.strictEqual(result.length, 4, + `Expected all 4 texts to pass through, got ${result.length}`); + + // embedBatch should have been called exactly once for the 2 mid-length texts + assert.strictEqual(calls.embedBatch, 1, + `Expected 1 embedBatch call for filterNoiseByEmbedding, got ${calls.embedBatch}`); + + // embed() should NOT have been called (batch path used instead) + assert.strictEqual(calls.embed, 0, + `Expected 0 embed calls (batch path), got ${calls.embed}`); }); // -------------------------------------------------------------------------- @@ -249,21 +259,14 @@ describe("SmartExtractor batch embedding paths", () => { }); // -------------------------------------------------------------------------- - // Test 5: filterNoiseByEmbedding batch failure passes all texts through + // Test 5: filterNoiseByEmbedding batch failure passes all texts through (direct call) // -------------------------------------------------------------------------- it("passes all texts through when filterNoiseByEmbedding batch fails", async () => { const { embedder } = makeCountingEmbedder({ batchShouldFail: true, }); - const llm = makeLlm([ - { - category: "cases", - abstract: "噪声过滤回退测试用例文本内容", - overview: "", - content: "详细内容描述", - }, - ]); - const store = makeStore(); + const llm = makeLlm([]); // not used + const store = makeStore(); // not used const noiseBank = { initialized: true, @@ -273,14 +276,23 @@ describe("SmartExtractor batch embedding paths", () => { const extractor = makeExtractor(embedder, llm, store, { noiseBank }); + // Call filterNoiseByEmbedding DIRECTLY with mid-length texts that would + // normally be sent to embedBatch. + const inputTexts = [ + "噪声过滤回退测试用例文本内容第一段", + "噪声过滤回退测试用例文本内容第二段", + "噪声过滤回退测试用例文本内容第三段", + ]; + // Should NOT throw — batch failure returns all texts unfiltered - const stats = await extractor.extractAndPersist("噪声过滤回退测试对话", "s1"); + const result = await extractor.filterNoiseByEmbedding(inputTexts); - assert.ok(stats, "Extraction should complete despite noise filter batch failure"); + assert.strictEqual(result.length, inputTexts.length, + `Expected all ${inputTexts.length} texts to pass through on batch failure, got ${result.length}`); }); // -------------------------------------------------------------------------- - // Test 6: Bypass texts (short/long) are not sent to embedBatch + // Test 6: Bypass texts (short/long) are not sent to embedBatch in noise filter (direct call) // -------------------------------------------------------------------------- it("does not send bypass texts (short/long) to embedBatch in noise filter", async () => { let lastBatchInput = null; @@ -291,15 +303,8 @@ describe("SmartExtractor batch embedding paths", () => { return texts.map(() => [0.1]); }, }; - const llm = makeLlm([ - { - category: "cases", - abstract: "正常长度文本用于噪声过滤测试验证逻辑", - overview: "", - content: "详细内容", - }, - ]); - const store = makeStore(); + const llm = makeLlm([]); // not used + const store = makeStore(); // not used const noiseBank = { initialized: true, @@ -308,17 +313,39 @@ describe("SmartExtractor batch embedding paths", () => { }; const extractor = makeExtractor(embedder, llm, store, { noiseBank }); - await extractor.extractAndPersist("正常对话内容", "s1"); - - // If embedBatch was called, verify no extremely short or long texts were included - if (lastBatchInput !== null) { - for (const t of lastBatchInput) { - assert.ok( - t.length > 8 && t.length <= 300, - `Text sent to embedBatch should be in (8, 300] range, got length=${t.length}: "${t.slice(0, 40)}"`, - ); - } + + // Call filterNoiseByEmbedding DIRECTLY with a mix of lengths + const inputTexts = [ + "短", // ≤8 → bypass + "正常长度文本用于噪声过滤测试验证逻辑正确性", // 9-300 → needs embed + "x".repeat(5), // ≤8 → bypass + "另一条正常长度文本内容用于测试", // 9-300 → needs embed + "x".repeat(350), // >300 → bypass + ]; + + await extractor.filterNoiseByEmbedding(inputTexts); + + // embedBatch should have been called with ONLY mid-length texts + assert.ok(lastBatchInput !== null, + "Expected embedBatch to be called for mid-length texts"); + + for (const t of lastBatchInput) { + assert.ok( + t.length > 8 && t.length <= 300, + `Text sent to embedBatch should be in (8, 300] range, got length=${t.length}: "${t.slice(0, 40)}"`, + ); } + + // Verify the specific texts that should have been batched + const batchedTexts = lastBatchInput.map((t) => t); + assert.ok( + batchedTexts.some((t) => t.includes("正常长度文本")), + "Expected mid-length text '正常长度文本...' in batch input", + ); + assert.ok( + batchedTexts.some((t) => t.includes("另一条正常长度")), + "Expected mid-length text '另一条正常长度...' in batch input", + ); }); // -------------------------------------------------------------------------- From 06c28c11caa8930af28f0e52750aa6f7c1b30b25 Mon Sep 17 00:00:00 2001 From: unknown Date: Sun, 12 Apr 2026 12:28:52 +0000 Subject: [PATCH 8/8] fix: address rwmjhb review nits EF2: Register test/smart-extractor-batch-embed.test.mjs in CI core-regression group EF1: TypeScript verified - no new errors (pre-existing handleSupersede scopeFilter mismatch unrelated) MR1: Filter boundary-excluded candidates before batch pre-embedding to avoid wasted embed calls --- scripts/ci-test-manifest.mjs | 1 + scripts/verify-ci-test-manifest.mjs | 1 + src/smart-extractor.ts | 71 ++++++++++++++++------------- 3 files changed, 41 insertions(+), 32 deletions(-) diff --git a/scripts/ci-test-manifest.mjs b/scripts/ci-test-manifest.mjs index 77bc1d98..48286b9f 100644 --- a/scripts/ci-test-manifest.mjs +++ b/scripts/ci-test-manifest.mjs @@ -23,6 +23,7 @@ export const CI_TEST_MANIFEST = [ { group: "core-regression", runner: "node", file: "test/retriever-rerank-regression.mjs" }, { group: "core-regression", runner: "node", file: "test/smart-memory-lifecycle.mjs" }, { group: "core-regression", runner: "node", file: "test/smart-extractor-branches.mjs" }, + { group: "core-regression", runner: "node", file: "test/smart-extractor-batch-embed.test.mjs" }, { group: "packaging-and-workflow", runner: "node", file: "test/plugin-manifest-regression.mjs" }, { group: "core-regression", runner: "node", file: "test/session-summary-before-reset.test.mjs", args: ["--test"] }, { group: "packaging-and-workflow", runner: "node", file: "test/sync-plugin-version.test.mjs", args: ["--test"] }, diff --git a/scripts/verify-ci-test-manifest.mjs b/scripts/verify-ci-test-manifest.mjs index 1a7d652a..df8090dc 100644 --- a/scripts/verify-ci-test-manifest.mjs +++ b/scripts/verify-ci-test-manifest.mjs @@ -24,6 +24,7 @@ const EXPECTED_BASELINE = [ { group: "core-regression", runner: "node", file: "test/retriever-rerank-regression.mjs" }, { group: "core-regression", runner: "node", file: "test/smart-memory-lifecycle.mjs" }, { group: "core-regression", runner: "node", file: "test/smart-extractor-branches.mjs" }, + { group: "core-regression", runner: "node", file: "test/smart-extractor-batch-embed.test.mjs" }, { group: "packaging-and-workflow", runner: "node", file: "test/plugin-manifest-regression.mjs" }, { group: "core-regression", runner: "node", file: "test/session-summary-before-reset.test.mjs", args: ["--test"] }, { group: "packaging-and-workflow", runner: "node", file: "test/sync-plugin-version.test.mjs", args: ["--test"] }, diff --git a/src/smart-extractor.ts b/src/smart-extractor.ts index 74ebf6fb..fcdd68f2 100644 --- a/src/smart-extractor.ts +++ b/src/smart-extractor.ts @@ -303,25 +303,51 @@ export class SmartExtractor { ); } - // Step 2: Process each surviving candidate through dedup pipeline - // Pre-compute vectors for non-profile candidates in a single batch API call - // to reduce embedding round-trips from N to 1. - const precomputedVectors = new Map(); - const nonProfileEntries: { index: number; text: string }[] = []; + // Step 2: Process each surviving candidate through dedup pipeline. + // + // Optimization: filter boundary-excluded candidates BEFORE batch embedding + // to avoid wasting embed API calls on candidates that will be skipped. + // See MR1 from code review. + const processableCandidates: { index: number; candidate: CandidateMemory }[] = []; for (let i = 0; i < survivingCandidates.length; i++) { const c = survivingCandidates[i]; - if (!ALWAYS_MERGE_CATEGORIES.has(c.category)) { - nonProfileEntries.push({ index: i, text: `${c.abstract} ${c.content}` }); + if ( + isUserMdExclusiveMemory( + { + memoryCategory: c.category, + abstract: c.abstract, + content: c.content, + }, + this.config.workspaceBoundary, + ) + ) { + stats.skipped += 1; + stats.boundarySkipped = (stats.boundarySkipped ?? 0) + 1; + this.log( + `memory-pro: smart-extractor: skipped USER.md-exclusive [${c.category}] ${c.abstract.slice(0, 60)}`, + ); + continue; } + processableCandidates.push({ index: i, candidate: c }); } - if (nonProfileEntries.length > 0) { + + // Pre-compute vectors for processable non-profile candidates in a single batch API call + // to reduce embedding round-trips from N to 1. + const precomputedVectors = new Map(); + const nonProfileToEmbed: { index: number; text: string }[] = []; + for (const { index, candidate } of processableCandidates) { + if (!ALWAYS_MERGE_CATEGORIES.has(candidate.category)) { + nonProfileToEmbed.push({ index, text: `${candidate.abstract} ${candidate.content}` }); + } + } + if (nonProfileToEmbed.length > 0) { try { - const batchTexts = nonProfileEntries.map((e) => e.text); + const batchTexts = nonProfileToEmbed.map((e) => e.text); const batchVectors = await this.embedder.embedBatch(batchTexts); - for (let j = 0; j < nonProfileEntries.length; j++) { + for (let j = 0; j < nonProfileToEmbed.length; j++) { const vec = batchVectors[j]; if (vec && vec.length > 0) { - precomputedVectors.set(nonProfileEntries[j].index, vec); + precomputedVectors.set(nonProfileToEmbed[j].index, vec); } } } catch (err) { @@ -331,26 +357,7 @@ export class SmartExtractor { } } - for (let idx = 0; idx < survivingCandidates.length; idx++) { - const candidate = survivingCandidates[idx]; - if ( - isUserMdExclusiveMemory( - { - memoryCategory: candidate.category, - abstract: candidate.abstract, - content: candidate.content, - }, - this.config.workspaceBoundary, - ) - ) { - stats.skipped += 1; - stats.boundarySkipped = (stats.boundarySkipped ?? 0) + 1; - this.log( - `memory-pro: smart-extractor: skipped USER.md-exclusive [${candidate.category}] ${candidate.abstract.slice(0, 60)}`, - ); - continue; - } - + for (const { index, candidate } of processableCandidates) { try { await this.processCandidate( candidate, @@ -359,7 +366,7 @@ export class SmartExtractor { stats, targetScope, scopeFilter, - precomputedVectors.get(idx), + precomputedVectors.get(index), ); } catch (err) { this.log(