diff --git a/package-lock.json b/package-lock.json index fcbf1b04..de165655 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "memory-lancedb-pro", - "version": "1.1.0-beta.9", + "version": "1.1.0-beta.10", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "memory-lancedb-pro", - "version": "1.1.0-beta.9", + "version": "1.1.0-beta.10", "license": "MIT", "dependencies": { "@lancedb/lancedb": "^0.26.2", @@ -20,6 +20,13 @@ "commander": "^14.0.0", "jiti": "^2.6.0", "typescript": "^5.9.3" + }, + "optionalDependencies": { + "@lancedb/lancedb-darwin-arm64": "^0.26.2", + "@lancedb/lancedb-darwin-x64": "^0.26.2", + "@lancedb/lancedb-linux-arm64-gnu": "^0.26.2", + "@lancedb/lancedb-linux-x64-gnu": "^0.26.2", + "@lancedb/lancedb-win32-x64-msvc": "^0.26.2" } }, "node_modules/@lancedb/lancedb": { @@ -71,6 +78,9 @@ "node": ">= 18" } }, + "node_modules/@lancedb/lancedb-darwin-x64": { + "optional": true + }, "node_modules/@lancedb/lancedb-linux-arm64-gnu": { "version": "0.26.2", "resolved": "https://registry.npmjs.org/@lancedb/lancedb-linux-arm64-gnu/-/lancedb-linux-arm64-gnu-0.26.2.tgz", diff --git a/scripts/ci-test-manifest.mjs b/scripts/ci-test-manifest.mjs index 77bc1d98..48286b9f 100644 --- a/scripts/ci-test-manifest.mjs +++ b/scripts/ci-test-manifest.mjs @@ -23,6 +23,7 @@ export const CI_TEST_MANIFEST = [ { group: "core-regression", runner: "node", file: "test/retriever-rerank-regression.mjs" }, { group: "core-regression", runner: "node", file: "test/smart-memory-lifecycle.mjs" }, { group: "core-regression", runner: "node", file: "test/smart-extractor-branches.mjs" }, + { group: "core-regression", runner: "node", file: "test/smart-extractor-batch-embed.test.mjs" }, { group: "packaging-and-workflow", runner: "node", file: "test/plugin-manifest-regression.mjs" }, { group: "core-regression", runner: "node", file: "test/session-summary-before-reset.test.mjs", args: ["--test"] }, { group: "packaging-and-workflow", runner: "node", 
file: "test/sync-plugin-version.test.mjs", args: ["--test"] }, diff --git a/scripts/verify-ci-test-manifest.mjs b/scripts/verify-ci-test-manifest.mjs index 1a7d652a..df8090dc 100644 --- a/scripts/verify-ci-test-manifest.mjs +++ b/scripts/verify-ci-test-manifest.mjs @@ -24,6 +24,7 @@ const EXPECTED_BASELINE = [ { group: "core-regression", runner: "node", file: "test/retriever-rerank-regression.mjs" }, { group: "core-regression", runner: "node", file: "test/smart-memory-lifecycle.mjs" }, { group: "core-regression", runner: "node", file: "test/smart-extractor-branches.mjs" }, + { group: "core-regression", runner: "node", file: "test/smart-extractor-batch-embed.test.mjs" }, { group: "packaging-and-workflow", runner: "node", file: "test/plugin-manifest-regression.mjs" }, { group: "core-regression", runner: "node", file: "test/session-summary-before-reset.test.mjs", args: ["--test"] }, { group: "packaging-and-workflow", runner: "node", file: "test/sync-plugin-version.test.mjs", args: ["--test"] }, diff --git a/src/smart-extractor.ts b/src/smart-extractor.ts index dab1bb56..fcdd68f2 100644 --- a/src/smart-extractor.ts +++ b/src/smart-extractor.ts @@ -287,10 +287,9 @@ export class SmartExtractor { let survivingCandidates = capped; try { const abstracts = capped.map((c) => c.abstract); - const vectors = await Promise.all( - abstracts.map((a) => this.embedder.embed(a).catch(() => [] as number[])), - ); - const dedupResult = batchDedup(abstracts, vectors); + const vectors = await this.embedder.embedBatch(abstracts); + const safeVectors = vectors.map((v) => v || []); + const dedupResult = batchDedup(abstracts, safeVectors); if (dedupResult.duplicateIndices.length > 0) { survivingCandidates = dedupResult.survivingIndices.map((i) => capped[i]); stats.skipped += dedupResult.duplicateIndices.length; @@ -304,14 +303,20 @@ export class SmartExtractor { ); } - // Step 2: Process each surviving candidate through dedup pipeline - for (const candidate of survivingCandidates) { + // Step 
2: Process each surviving candidate through dedup pipeline. + // + // Optimization: filter boundary-excluded candidates BEFORE batch embedding + // to avoid wasting embed API calls on candidates that will be skipped. + // See MR1 from code review. + const processableCandidates: { index: number; candidate: CandidateMemory }[] = []; + for (let i = 0; i < survivingCandidates.length; i++) { + const c = survivingCandidates[i]; if ( isUserMdExclusiveMemory( { - memoryCategory: candidate.category, - abstract: candidate.abstract, - content: candidate.content, + memoryCategory: c.category, + abstract: c.abstract, + content: c.content, }, this.config.workspaceBoundary, ) @@ -319,11 +324,40 @@ export class SmartExtractor { stats.skipped += 1; stats.boundarySkipped = (stats.boundarySkipped ?? 0) + 1; this.log( - `memory-pro: smart-extractor: skipped USER.md-exclusive [${candidate.category}] ${candidate.abstract.slice(0, 60)}`, + `memory-pro: smart-extractor: skipped USER.md-exclusive [${c.category}] ${c.abstract.slice(0, 60)}`, ); continue; } + processableCandidates.push({ index: i, candidate: c }); + } + // Pre-compute vectors for processable non-profile candidates in a single batch API call + // to reduce embedding round-trips from N to 1. 
+ const precomputedVectors = new Map(); + const nonProfileToEmbed: { index: number; text: string }[] = []; + for (const { index, candidate } of processableCandidates) { + if (!ALWAYS_MERGE_CATEGORIES.has(candidate.category)) { + nonProfileToEmbed.push({ index, text: `${candidate.abstract} ${candidate.content}` }); + } + } + if (nonProfileToEmbed.length > 0) { + try { + const batchTexts = nonProfileToEmbed.map((e) => e.text); + const batchVectors = await this.embedder.embedBatch(batchTexts); + for (let j = 0; j < nonProfileToEmbed.length; j++) { + const vec = batchVectors[j]; + if (vec && vec.length > 0) { + precomputedVectors.set(nonProfileToEmbed[j].index, vec); + } + } + } catch (err) { + this.log( + `memory-pro: smart-extractor: batch pre-embed failed, will embed individually: ${String(err)}`, + ); + } + } + + for (const { index, candidate } of processableCandidates) { try { await this.processCandidate( candidate, @@ -332,6 +366,7 @@ export class SmartExtractor { stats, targetScope, scopeFilter, + precomputedVectors.get(index), ); } catch (err) { this.log( @@ -351,38 +386,70 @@ export class SmartExtractor { * Filter out texts that match noise prototypes by embedding similarity. * Long texts (>300 chars) are passed through without checking. * Only active when noiseBank is configured and initialized. + * + * Uses batch embedding to reduce API round-trips from N to 1. 
*/ async filterNoiseByEmbedding(texts: string[]): Promise<string[]> { const noiseBank = this.config.noiseBank; if (!noiseBank || !noiseBank.initialized) return texts; - const result: string[] = []; - for (const text of texts) { - // Very short texts lack semantic signal — skip noise check to avoid false positives - if (text.length <= 8) { - result.push(text); - continue; - } - // Long texts are unlikely to be pure noise queries - if (text.length > 300) { - result.push(text); - continue; + // Partition: short/long texts bypass noise check; mid-length need embedding + const SHORT_THRESHOLD = 8; + const LONG_THRESHOLD = 300; + const bypassFlags: boolean[] = texts.map( + (t) => t.length <= SHORT_THRESHOLD || t.length > LONG_THRESHOLD, + ); + + const needsEmbedIndices: number[] = []; + const needsEmbedTexts: string[] = []; + for (let i = 0; i < texts.length; i++) { + if (!bypassFlags[i]) { + needsEmbedIndices.push(i); + needsEmbedTexts.push(texts[i]); } + } + + // Batch embed all mid-length texts in a single API call + let vectors: number[][] = []; + if (needsEmbedTexts.length > 0) { try { - const vec = await this.embedder.embed(text); - if (!vec || vec.length === 0 || !noiseBank.isNoise(vec)) { - result.push(text); - } else { - this.debugLog( - `memory-lancedb-pro: smart-extractor: embedding noise filtered: ${text.slice(0, 80)}`, - ); - } + vectors = await this.embedder.embedBatch(needsEmbedTexts); } catch { - // Embedding failed — pass text through - result.push(text); + // Batch failed — pass all through + return texts.slice(); + } + } + + const result: string[] = new Array(texts.length); + // First, fill in bypass texts (always kept) + for (let i = 0; i < texts.length; i++) { + if (bypassFlags[i]) { + result[i] = texts[i]; + } + } + + // Then, check noise for embedded texts + for (let j = 0; j < needsEmbedIndices.length; j++) { + const idx = needsEmbedIndices[j]; + const vec = vectors[j]; + if (!vec || vec.length === 0) { + result[idx] = texts[idx]; + continue; + } + if 
(noiseBank.isNoise(vec)) { + this.debugLog( + `memory-lancedb-pro: smart-extractor: embedding noise filtered: ${texts[idx].slice(0, 80)}`, + ); + // Leave result[idx] as undefined — will be compacted below + } else { + result[idx] = texts[idx]; } } - return result; + + // Compact: remove undefined slots (filtered-out entries). + // Use explicit undefined check rather than filter(Boolean) to preserve + // empty strings that were legitimately in bypass slots. + return result.filter((x): x is string => x !== undefined); } /** @@ -513,6 +580,10 @@ export class SmartExtractor { /** * Process a single candidate memory: dedup → merge/create → store + * + * @param precomputedVector - Optional pre-embedded vector for the candidate. + * When provided (from batch pre-embedding), skips the per-candidate embed + * call to reduce API round-trips. */ private async processCandidate( candidate: CandidateMemory, @@ -521,6 +592,7 @@ stats: ExtractionStats, targetScope: string, scopeFilter?: string[], + precomputedVector?: number[], ): Promise<void> { // Profile always merges (skip dedup — admission control still applies) if (ALWAYS_MERGE_CATEGORIES.has(candidate.category)) { @@ -541,9 +613,9 @@ return; } - // Embed the candidate for vector dedup - const embeddingText = `${candidate.abstract} ${candidate.content}`; - const vector = await this.embedder.embed(embeddingText); + // Use pre-computed vector if available (batch embed optimization), + // otherwise fall back to per-candidate embed call. + const vector = precomputedVector ?? 
await this.embedder.embed(`${candidate.abstract} ${candidate.content}`); if (!vector || vector.length === 0) { this.log("memory-pro: smart-extractor: embedding failed, storing as-is"); await this.storeCandidate(candidate, vector || [], sessionKey, targetScope); diff --git a/test/smart-extractor-batch-embed.test.mjs b/test/smart-extractor-batch-embed.test.mjs new file mode 100644 index 00000000..5fb9966a --- /dev/null +++ b/test/smart-extractor-batch-embed.test.mjs @@ -0,0 +1,398 @@ +/** + * Explicit tests for batch embedding paths in SmartExtractor. + * + * Verifies that the three refactored sites use embedBatch + * instead of serial per-element embed() calls, and that graceful + * fallback works when batch fails. + * + * NOTE: SmartExtractor uses INTERNAL categories (profile/preferences/entities/ + * events/cases/patterns), NOT store categories (preference/fact/decision/entity/ + * other). See src/memory-categories.ts for the canonical list. + */ + +import { describe, it } from "node:test"; +import assert from "node:assert/strict"; +import jitiFactory from "jiti"; + +const jiti = jitiFactory(import.meta.url, { interopDefault: true }); +const { SmartExtractor } = jiti("../src/smart-extractor.ts"); + +// ============================================================================ +// Helpers +// ============================================================================ + +/** Create a mock embedder with call counters for each method. */ +function makeCountingEmbedder(options = {}) { + const { + /** If set, embedBatch will throw (simulates batch failure). */ + batchShouldFail = false, + /** If set, embed will throw (simulates single embed failure). 
*/ + embedShouldFail = false, + } = options; + + const calls = { embed: 0, embedBatch: 0 }; + + const embedder = { + async embed(text) { + calls.embed++; + if (embedShouldFail) throw new Error("mock embed failure"); + // Deterministic vector based on text length for dedup stability + return Array(256).fill(0).map((_, i) => (text.length > 0 ? (text.charCodeAt(i % text.length) / 255) : 0)); + }, + async embedBatch(texts) { + calls.embedBatch++; + if (batchShouldFail) throw new Error("mock batch failure"); + // Return vectors directly WITHOUT calling this.embed() to keep counters independent + return (texts || []).map((t) => + Array(256).fill(0).map((_, i) => (t.length > 0 ? (t.charCodeAt(i % t.length) / 255) : 0)), + ); + }, + get calls() { + return { ...calls }; + }, + }; + + return { embedder, calls }; +} + +/** Create a minimal LLM client that returns configurable candidates. + * Categories must use SmartExtractor INTERNAL names: + * profile | preferences | entities | events | cases | patterns + */ +function makeLlm(candidates) { + return { + async completeJson(_prompt, mode) { + if (mode === "extract-candidates") { + return { memories: candidates }; + } + if (mode === "dedup-decision") { + return { decision: "create", reason: "no match" }; + } + if (mode === "merge-memory") { + return candidates[0] ?? null; + } + return null; + }, + }; +} + +/** Create a minimal store that records all writes. 
*/ +function makeStore() { + const entries = []; + const store = { + async vectorSearch(_vector, _limit, _minScore, _scopeFilter) { + return []; + }, + async store(entry) { + entries.push({ action: "store", entry }); + return entry; + }, + async update(_id, _patch, _scopeFilter) { + entries.push({ action: "update", id: _id }); + }, + async getById(_id, _scopeFilter) { + return null; + }, + get entries() { + return [...entries]; + }, + }; + return store; +} + +function makeExtractor(embedder, llm, store, config = {}) { + return new SmartExtractor(store, embedder, llm, { + user: "User", + extractMinMessages: 1, + extractMaxChars: 8000, + defaultScope: "global", + log() {}, + debugLog() {}, + ...config, + }); +} + +// ============================================================================ +// Tests +// ============================================================================ + +describe("SmartExtractor batch embedding paths", () => { + + // -------------------------------------------------------------------------- + // Test 1: Step 1b batchDedup uses embedBatch (not N×embed) + // -------------------------------------------------------------------------- + it("uses embedBatch for batch-internal dedup of candidate abstracts", async () => { + const { embedder, calls } = makeCountingEmbedder(); + const llm = makeLlm([ + { + category: "cases", + abstract: "用户居住在上海市浦东新区张江高科技园区", + overview: "地址信息", + content: "用户的居住地是上海市浦东新区张江高科技园区附近。", + }, + { + category: "cases", + abstract: "用户非常喜欢使用Python进行数据分析工作", + overview: "职业兴趣", + content: "用户对编程很感兴趣,特别是Python数据分析方向。", + }, + ]); + const store = makeStore(); + const extractor = makeExtractor(embedder, llm, store); + + await extractor.extractAndPersist("用户说:我住上海,喜欢编程。", "s1"); + + // Should have called embedBatch once for the abstracts (Step 1b) + assert.ok( + calls.embedBatch >= 1, + `Expected at least 1 embedBatch call for Step 1b dedup, got ${calls.embedBatch}`, + ); + }); + + // 
-------------------------------------------------------------------------- + // Test 2: filterNoiseByEmbedding uses embedBatch (direct call) + // -------------------------------------------------------------------------- + it("uses embedBatch in filterNoiseByEmbedding when noise bank is active", async () => { + const { embedder, calls } = makeCountingEmbedder(); + const llm = makeLlm([]); // not used by filterNoiseByEmbedding + const store = makeStore(); // not used by filterNoiseByEmbedding + + const noiseBank = { + initialized: true, + isNoise(_vec) { return false; }, + learn(_vec) {}, + }; + + const extractor = makeExtractor(embedder, llm, store, { noiseBank }); + + // Call filterNoiseByEmbedding DIRECTLY — this is the method under test. + // Mix of lengths: short (bypass), mid-length (needs embedding), long (bypass). + const inputTexts = [ + "短", // ≤8 → bypass + "这是一条中等长度的测试文本用于验证批量嵌入功能", // 9-300 → needs embed + "这是一条另一条中等长度文本内容", // 9-300 → needs embed + "x".repeat(350), // >300 → bypass + ]; + + const result = await extractor.filterNoiseByEmbedding(inputTexts); + + // All texts should pass through (isNoise returns false for everything) + assert.strictEqual(result.length, 4, + `Expected all 4 texts to pass through, got ${result.length}`); + + // embedBatch should have been called exactly once for the 2 mid-length texts + assert.strictEqual(calls.embedBatch, 1, + `Expected 1 embedBatch call for filterNoiseByEmbedding, got ${calls.embedBatch}`); + + // embed() should NOT have been called (batch path used instead) + assert.strictEqual(calls.embed, 0, + `Expected 0 embed calls (batch path), got ${calls.embed}`); + }); + + // -------------------------------------------------------------------------- + // Test 3: Batch pre-compute for non-profile candidates uses embedBatch + // -------------------------------------------------------------------------- + it("pre-computes vectors via embedBatch before processing candidates", async () => { + const { embedder, calls } 
= makeCountingEmbedder(); + const llm = makeLlm([ + { + category: "preferences", + abstract: "用户偏好使用深色主题来减少眼睛疲劳", + overview: "", + content: "用户明确表示偏好深色主题界面设置", + }, + { + category: "entities", + abstract: "张三是用户经常提到的同事名字", + overview: "", + content: "张三在用户的对话中多次被提及为同事关系", + }, + { + category: "events", + abstract: "上周参加了公司年度技术分享会议", + overview: "", + content: "用户参与了公司的年度技术分享活动", + }, + ]); + const store = makeStore(); + const extractor = makeExtractor(embedder, llm, store); + + await extractor.extractAndPersist("多候选对话内容用于测试预计算", "s1"); + + // At least one embedBatch call for pre-computing non-profile candidate vectors + assert.ok( + calls.embedBatch >= 1, + `Expected embedBatch for candidate pre-computation, got ${calls.embedBatch}`, + ); + }); + + // -------------------------------------------------------------------------- + // Test 4: Batch failure falls back gracefully (no crash) + // -------------------------------------------------------------------------- + it("falls back to individual embed when batch pre-computation fails", async () => { + const { embedder, calls } = makeCountingEmbedder({ + batchShouldFail: true, + }); + const llm = makeLlm([ + { + category: "cases", + abstract: "回退路径测试用例验证降级逻辑正确性", + overview: "", + content: "当batch失败时应该回退到单条embed调用方式", + }, + ]); + const store = makeStore(); + const extractor = makeExtractor(embedder, llm, store); + + // Should NOT throw — batch failure is caught and logged + const stats = await extractor.extractAndPersist("回退测试对话内容", "s1"); + + // Extraction should still succeed (fallback path) + assert.ok(stats.created >= 0 || stats.merged >= 0 || stats.skipped >= 0, + `Extraction should produce stats, got ${JSON.stringify(stats)}`); + + // Individual embed calls should have been made as fallback + assert.ok( + calls.embed >= 1, + `Expected fallback embed calls after batch failure, got embed=${calls.embed}, embedBatch=${calls.embedBatch}`, + ); + }); + + // 
-------------------------------------------------------------------------- + // Test 5: filterNoiseByEmbedding batch failure passes all texts through (direct call) + // -------------------------------------------------------------------------- + it("passes all texts through when filterNoiseByEmbedding batch fails", async () => { + const { embedder } = makeCountingEmbedder({ + batchShouldFail: true, + }); + const llm = makeLlm([]); // not used + const store = makeStore(); // not used + + const noiseBank = { + initialized: true, + isNoise(_vec) { return false; }, + learn(_vec) {}, + }; + + const extractor = makeExtractor(embedder, llm, store, { noiseBank }); + + // Call filterNoiseByEmbedding DIRECTLY with mid-length texts that would + // normally be sent to embedBatch. + const inputTexts = [ + "噪声过滤回退测试用例文本内容第一段", + "噪声过滤回退测试用例文本内容第二段", + "噪声过滤回退测试用例文本内容第三段", + ]; + + // Should NOT throw — batch failure returns all texts unfiltered + const result = await extractor.filterNoiseByEmbedding(inputTexts); + + assert.strictEqual(result.length, inputTexts.length, + `Expected all ${inputTexts.length} texts to pass through on batch failure, got ${result.length}`); + }); + + // -------------------------------------------------------------------------- + // Test 6: Bypass texts (short/long) are not sent to embedBatch in noise filter (direct call) + // -------------------------------------------------------------------------- + it("does not send bypass texts (short/long) to embedBatch in noise filter", async () => { + let lastBatchInput = null; + const embedder = { + async embed() { return [0.1]; }, + async embedBatch(texts) { + lastBatchInput = texts; + return texts.map(() => [0.1]); + }, + }; + const llm = makeLlm([]); // not used + const store = makeStore(); // not used + + const noiseBank = { + initialized: true, + isNoise(_vec) { return false; }, + learn(_vec) {}, + }; + + const extractor = makeExtractor(embedder, llm, store, { noiseBank }); + + // Call 
filterNoiseByEmbedding DIRECTLY with a mix of lengths + const inputTexts = [ + "短", // ≤8 → bypass + "正常长度文本用于噪声过滤测试验证逻辑正确性", // 9-300 → needs embed + "x".repeat(5), // ≤8 → bypass + "另一条正常长度文本内容用于测试", // 9-300 → needs embed + "x".repeat(350), // >300 → bypass + ]; + + await extractor.filterNoiseByEmbedding(inputTexts); + + // embedBatch should have been called with ONLY mid-length texts + assert.ok(lastBatchInput !== null, + "Expected embedBatch to be called for mid-length texts"); + + for (const t of lastBatchInput) { + assert.ok( + t.length > 8 && t.length <= 300, + `Text sent to embedBatch should be in (8, 300] range, got length=${t.length}: "${t.slice(0, 40)}"`, + ); + } + + // Verify the specific texts that should have been batched + const batchedTexts = lastBatchInput.map((t) => t); + assert.ok( + batchedTexts.some((t) => t.includes("正常长度文本")), + "Expected mid-length text '正常长度文本...' in batch input", + ); + assert.ok( + batchedTexts.some((t) => t.includes("另一条正常长度")), + "Expected mid-length text '另一条正常长度...' 
in batch input", + ); + }); + + // -------------------------------------------------------------------------- + // Test 7: Profile candidates are excluded from batch pre-computation + // -------------------------------------------------------------------------- + it("excludes profile-category candidates from batch pre-computation (Step 2)", async () => { + // Track all embedBatch calls to distinguish Step 1b (dedup) from Step 2 (pre-compute) + const allBatchCalls = []; + const embedder = { + async embed() { return Array(256).fill(0.1); }, + async embedBatch(texts) { + allBatchCalls.push([...texts]); + return texts.map(() => Array(256).fill(0.1)); + }, + }; + const llm = makeLlm([ + { + category: "profile", + abstract: "用户基本画像信息包括职业和地理位置偏好", + overview: "", + content: "这是用户的基本画像信息汇总数据。", + }, + ]); + const store = makeStore(); + const extractor = makeExtractor(embedder, llm, store); + + await extractor.extractAndPersist("画像提取测试对话内容", "s1"); + + // There should be at least 1 embedBatch call: + // Call 1: Step 1b batchDedup (abstracts) — may include profile + // Any Step 2 pre-computation call must NOT include profile + assert.ok(allBatchCalls.length >= 1, + `Expected at least 1 embedBatch call, got ${allBatchCalls.length}`); + + // Any embedBatch call after Step 1b would be Step 2 pre-computation. + // Check that none of them contain profile candidate text. + const profileTexts = allBatchCalls.filter((call) => + call.some((t) => t.includes("用户基本画像") || t.includes("画像信息")), + ); + + // Step 1b dedup MAY include profile abstract (that's expected). + // But Step 2 pre-compute MUST exclude it. + // With a single profile candidate, we expect at most 1 call that includes + // profile text (the Step 1b dedup call). If there are more, that's a bug. + assert.ok( + profileTexts.length <= 1, + `Only Step 1b dedup may include profile text, but got ${profileTexts.length} calls with profile text`, + ); + }); +});