From e988a9a2c1ac0c3355014e34897d8cdb4ba175bc Mon Sep 17 00:00:00 2001 From: Ssyn Date: Sun, 29 Mar 2026 06:05:10 -0700 Subject: [PATCH 1/7] test: add failing tests for adaptive-retrieval CJK and edge case bugs Co-Authored-By: Claude Sonnet 4.6 --- test/adaptive-retrieval.test.mjs | 100 +++++++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) create mode 100644 test/adaptive-retrieval.test.mjs diff --git a/test/adaptive-retrieval.test.mjs b/test/adaptive-retrieval.test.mjs new file mode 100644 index 00000000..fab58072 --- /dev/null +++ b/test/adaptive-retrieval.test.mjs @@ -0,0 +1,100 @@ +import { describe, it } from "node:test"; +import assert from "node:assert/strict"; +import jitiFactory from "jiti"; + +const jiti = jitiFactory(import.meta.url, { interopDefault: true }); +const { shouldSkipRetrieval } = jiti("../src/adaptive-retrieval.ts"); + +describe("shouldSkipRetrieval", () => { + // --- emoji regex fix --- + describe("emoji regex should not match digits", () => { + it("does not skip pure digit strings", () => { + assert.equal(shouldSkipRetrieval("12345"), false); + }); + + it("does not skip port numbers", () => { + assert.equal(shouldSkipRetrieval("8080"), false); + }); + + it("does not skip hash-prefixed numbers", () => { + assert.equal(shouldSkipRetrieval("#123"), false); + }); + + it("skips pure emoji input", () => { + assert.equal(shouldSkipRetrieval("\ud83d\udc4d\ud83c\udf89\ud83d\ude80"), true); + }); + + it("does not skip emoji mixed with text", () => { + assert.equal(shouldSkipRetrieval("\ud83d\udc68\u200d\ud83d\udc69\u200d\ud83d\udc67 family trip plan"), false); + }); + }); + + // --- slash command regex fix --- + describe("slash command regex should not match file paths", () => { + it("skips single-word slash commands", () => { + assert.equal(shouldSkipRetrieval("/help"), true); + }); + + it("skips slash command with trailing space", () => { + assert.equal(shouldSkipRetrieval("/recall "), true); + }); + + it("does not skip file paths", () => { + assert.equal(shouldSkipRetrieval("/usr/bin/node"), false); + }); + + it("does not skip path with question", () => { + assert.equal(shouldSkipRetrieval("/etc/nginx/nginx.conf \u600e\u4e48\u914d\u7f6e"), false); + }); + + it("does not skip API paths", () => { + assert.equal(shouldSkipRetrieval("/api/v2/users"), false); + }); + }); + + // --- CJK short text threshold fix --- + describe("CJK short text should not be killed by hard threshold", () => { + it("does not skip 4-char CJK query", () => { + assert.equal(shouldSkipRetrieval("\u4ed6\u559c\u6b22\u732b"), false); + }); + + it("does not skip 4-char CJK query (residence)", () => { + assert.equal(shouldSkipRetrieval("\u6211\u4f4f\u5317\u4eac"), false); + }); + + it("does not skip 3-char mixed CJK query", () => { + assert.equal(shouldSkipRetrieval("\u7528Go\u5199"), false); + }); + + it("does not skip CJK query with question mark", () => { + assert.equal(shouldSkipRetrieval("\u5bc6\u7801\u662f\u5565\uff1f"), false); + }); + + it("skips single CJK character", () => { + assert.equal(shouldSkipRetrieval("\u597d"), true); + }); + }); + + // --- existing behavior preserved --- + describe("existing skip/force behavior preserved", () => { + it("skips greetings", () => { + assert.equal(shouldSkipRetrieval("hi"), true); + }); + + it("skips short English affirmations", () => { + assert.equal(shouldSkipRetrieval("ok"), true); + }); + + it("does not skip memory-related queries (English)", () => { + assert.equal(shouldSkipRetrieval("remember my name is Alice"), false); + }); + + it("does not skip memory-related queries (Chinese)", () => { + assert.equal(shouldSkipRetrieval("\u4f60\u8bb0\u5f97\u5417"), false); + }); + + it("does not skip normal length queries", () => { + assert.equal(shouldSkipRetrieval("what was the database schema we discussed"), false); + }); + }); +}); From 804fff12e186c270305359edc4330c6aba0d21e1 Mon Sep 17 00:00:00 2001 From: Ssyn Date: Sun, 29 Mar 2026 06:10:41 -0700 Subject: [PATCH 2/7] test: add failing tests for noise-filter CJK short text bug --- test/noise-filter.test.mjs | 96 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 test/noise-filter.test.mjs diff --git a/test/noise-filter.test.mjs b/test/noise-filter.test.mjs new file mode 100644 index 00000000..3ed86173 --- /dev/null +++ b/test/noise-filter.test.mjs @@ -0,0 +1,96 @@ +import { describe, it } from "node:test"; +import assert from "node:assert/strict"; +import jitiFactory from "jiti"; + +const jiti = jitiFactory(import.meta.url, { interopDefault: true }); +const { isNoise, filterNoise } = jiti("../src/noise-filter.ts"); + +describe("isNoise", () => { + // --- CJK short text fix --- + describe("CJK short text should not be marked as noise", () => { + it("4-char CJK is not noise", () => { + assert.equal(isNoise("\u4ed6\u559c\u6b22\u732b"), false); + }); + + it("3-char mixed CJK is not noise", () => { + assert.equal(isNoise("\u7528Go\u5199"), false); + }); + + it("2-char CJK is not noise", () => { + assert.equal(isNoise("\u5b66\u4e60"), false); + }); + + it("single CJK char is noise", () => { + assert.equal(isNoise("\u597d"), true); + }); + }); + + // --- English short text preserved --- + describe("English short text filtering preserved", () => { + it("marks 'ok' as noise", () => { + assert.equal(isNoise("ok"), true); + }); + + it("marks 'hi' as noise", () => { + assert.equal(isNoise("hi"), true); + }); + + it("marks 'test' as noise", () => { + assert.equal(isNoise("test"), true); + }); + + it("does not mark 5+ char English as noise by length alone", () => { + assert.equal(isNoise("hello world this is a real memory"), false); + }); + }); + + // --- pattern filters --- + describe("denial pattern filtering", () => { + it("marks agent denial as noise", () => { + assert.equal(isNoise("I don't have any information about that"), true); + }); + }); + + describe("meta-question pattern filtering", () => { + it("marks meta-question as noise", () => { + assert.equal(isNoise("do you remember what I said"), true); + }); + }); + + describe("boilerplate pattern filtering", () => { + it("marks greeting as noise", () => { + assert.equal(isNoise("hello there"), true); + }); + }); + + // --- options control --- + describe("options control", () => { + it("respects filterBoilerplate: false", () => { + assert.equal(isNoise("hello there", { filterBoilerplate: false }), false); + }); + + it("respects filterDenials: false", () => { + assert.equal(isNoise("I don't have any information", { filterDenials: false }), false); + }); + + it("respects filterMetaQuestions: false", () => { + assert.equal(isNoise("do you remember", { filterMetaQuestions: false }), false); + }); + }); +}); + +describe("filterNoise", () => { + it("filters noise items from array", () => { + const items = [ + { id: 1, text: "\u4ed6\u559c\u6b22\u732b" }, + { id: 2, text: "ok" }, + { id: 3, text: "I prefer dark mode for all editors" }, + { id: 4, text: "\u597d" }, + ]; + const result = filterNoise(items, (item) => item.text); + assert.deepEqual( + result.map((r) => r.id), + [1, 3] + ); + }); +}); From 938d691390c3b4318efc14e05de832935b802089 Mon Sep 17 00:00:00 2001 From: Ssyn Date: Sun, 29 Mar 2026 06:19:59 -0700 Subject: [PATCH 3/7] fix: improve adaptive retrieval accuracy for CJK and edge cases - Use \p{Extended_Pictographic} instead of \p{Emoji} to avoid matching digits - Narrow slash command regex to /word format, no longer matches file paths - Add CJK-aware hard threshold (length < 2 for CJK, < 5 for non-CJK) - Exempt digit-containing strings (port numbers, issue IDs) from length thresholds - Lower CJK defaultMinLength from 6 to 3 for short meaningful CJK queries - Lower non-CJK defaultMinLength from 15 to 13 to allow file path queries - Prevent FORCE_RETRIEVE from hijacking slash commands like /recall --- src/adaptive-retrieval.ts | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/adaptive-retrieval.ts b/src/adaptive-retrieval.ts index 88770aa9..c5454002 100644 --- a/src/adaptive-retrieval.ts +++ b/src/adaptive-retrieval.ts @@ -10,14 +10,14 @@ const SKIP_PATTERNS = [ // Greetings & pleasantries /^(hi|hello|hey|good\s*(morning|afternoon|evening|night)|greetings|yo|sup|howdy|what'?s up)\b/i, // System/bot commands - /^\//, // slash commands + /^\/[a-z][\w-]*\s*$/i, // slash commands like /help, /recall /^(run|build|test|ls|cd|git|npm|pip|docker|curl|cat|grep|find|make|sudo)\b/i, // Simple affirmations/negations /^(yes|no|yep|nope|ok|okay|sure|fine|thanks|thank you|thx|ty|got it|understood|cool|nice|great|good|perfect|awesome|👍|👎|✅|❌)\s*[.!]?$/i, // Continuation prompts /^(go ahead|continue|proceed|do it|start|begin|next|实施|實施|开始|開始|继续|繼續|好的|可以|行)\s*[.!]?$/i, // Pure emoji - /^[\p{Emoji}\s]+$/u, + /^[\p{Extended_Pictographic}\u200d\ufe0f\s]+$/u, // Heartbeat/system (match anywhere, not just at start, to handle prefixed formats) /HEARTBEAT/i, /^\[System/i, @@ -72,10 +72,15 @@ export function shouldSkipRetrieval(query: string, minLength?: number): boolean // Force retrieve if query has memory-related intent (checked FIRST, // before length check, so short CJK queries like "你记得吗" aren't skipped) - if (FORCE_RETRIEVE_PATTERNS.some(p => p.test(trimmed))) return false; + // 注意:slash 命令(如 /recall)优先走 SKIP 路径,不走 FORCE 路径 + const isSlashCmd = /^\/[a-z][\w-]*\s*$/i.test(trimmed); + if (!isSlashCmd && FORCE_RETRIEVE_PATTERNS.some(p => p.test(trimmed))) return false; // Too short to be meaningful - if (trimmed.length < 5) return true; + // 含数字的字符串(如端口号 8080、issue 号 #123)携带语义信息,豁免长度截断 + const hasCJKEarly = /[\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af]/.test(trimmed); + const hasDigit = /\d/.test(trimmed); + if (!hasDigit && trimmed.length < (hasCJKEarly ? 2 : 5)) return true; // Skip if matches any skip pattern if (SKIP_PATTERNS.some(p => p.test(trimmed))) return true; @@ -88,9 +93,10 @@ export function shouldSkipRetrieval(query: string, minLength?: number): boolean // Skip very short non-question messages (likely commands or affirmations) // CJK characters carry more meaning per character, so use a lower threshold + // 含数字的字符串豁免此规则(端口号、issue 号等均属有语义内容) const hasCJK = /[\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af]/.test(trimmed); - const defaultMinLength = hasCJK ? 6 : 15; - if (trimmed.length < defaultMinLength && !trimmed.includes('?') && !trimmed.includes('?')) return true; + const defaultMinLength = hasCJK ? 3 : 13; + if (!hasDigit && trimmed.length < defaultMinLength && !trimmed.includes('?') && !trimmed.includes('?')) return true; // Default: do retrieve return false; From 18a4da9414aaa750c06fedc69335791e17f143f6 Mon Sep 17 00:00:00 2001 From: Ssyn Date: Sun, 29 Mar 2026 06:24:37 -0700 Subject: [PATCH 4/7] fix: add CJK-aware threshold to noise filter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Short CJK text (2+ chars) is no longer falsely marked as noise. Uses same CJK detection pattern as adaptive-retrieval. Also tightens boilerplate greeting regex to only match standalone greetings (≤1 trailing word), so real memories starting with "hello" are not incorrectly filtered. --- src/noise-filter.ts | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/noise-filter.ts b/src/noise-filter.ts index b21cf37b..198b371a 100644 --- a/src/noise-filter.ts +++ b/src/noise-filter.ts @@ -37,7 +37,7 @@ const META_QUESTION_PATTERNS = [ // Session boilerplate const BOILERPLATE_PATTERNS = [ - /^(hi|hello|hey|good morning|good evening|greetings)/i, + /^(hi|hello|hey|good morning|good evening|greetings)(\s+\w+)?[!,.]?\s*$/i, /^fresh session/i, /^new session/i, /^HEARTBEAT/i, @@ -73,7 +73,8 @@ export function isNoise(text: string, options: NoiseFilterOptions = {}): boolean const opts = { ...DEFAULT_OPTIONS, ...options }; const trimmed = text.trim(); - if (trimmed.length < 5) return true; + const hasCJK = /[\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af]/.test(trimmed); + if (trimmed.length < (hasCJK ? 2 : 5)) return true; if (opts.filterDenials && DENIAL_PATTERNS.some(p => p.test(trimmed))) return true; if (opts.filterMetaQuestions && META_QUESTION_PATTERNS.some(p => p.test(trimmed))) return true; From 1978a7d7e240f31a71f91b24cbf3b079ab7324a8 Mon Sep 17 00:00:00 2001 From: Ssyn Date: Sun, 29 Mar 2026 06:40:31 -0700 Subject: [PATCH 5/7] refactor: deduplicate hasCJK variable in shouldSkipRetrieval --- src/adaptive-retrieval.ts | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/adaptive-retrieval.ts b/src/adaptive-retrieval.ts index c5454002..bc0133a5 100644 --- a/src/adaptive-retrieval.ts +++ b/src/adaptive-retrieval.ts @@ -78,9 +78,9 @@ export function shouldSkipRetrieval(query: string, minLength?: number): boolean // Too short to be meaningful // 含数字的字符串(如端口号 8080、issue 号 #123)携带语义信息,豁免长度截断 - const hasCJKEarly = /[\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af]/.test(trimmed); + const hasCJK = /[\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af]/.test(trimmed); const hasDigit = /\d/.test(trimmed); - if (!hasDigit && trimmed.length < (hasCJKEarly ? 2 : 5)) return true; + if (!hasDigit && trimmed.length < (hasCJK ? 2 : 5)) return true; // Skip if matches any skip pattern if (SKIP_PATTERNS.some(p => p.test(trimmed))) return true; @@ -94,7 +94,6 @@ export function shouldSkipRetrieval(query: string, minLength?: number): boolean // Skip very short non-question messages (likely commands or affirmations) // CJK characters carry more meaning per character, so use a lower threshold // 含数字的字符串豁免此规则(端口号、issue 号等均属有语义内容) - const hasCJK = /[\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af]/.test(trimmed); const defaultMinLength = hasCJK ? 3 : 13; if (!hasDigit && trimmed.length < defaultMinLength && !trimmed.includes('?') && !trimmed.includes('?')) return true; From 61c864dd76d33dfec170236e88446793a9d316b5 Mon Sep 17 00:00:00 2001 From: Ssyn Date: Sun, 29 Mar 2026 06:53:12 -0700 Subject: [PATCH 6/7] chore: register adaptive-retrieval and noise-filter tests in npm test --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index f40ecbcd..47852116 100644 --- a/package.json +++ b/package.json @@ -38,7 +38,7 @@ ] }, "scripts": { - "test": "node test/embedder-error-hints.test.mjs && node test/cjk-recursion-regression.test.mjs && node test/migrate-legacy-schema.test.mjs && node --test test/config-session-strategy-migration.test.mjs && node --test test/scope-access-undefined.test.mjs && node --test test/reflection-bypass-hook.test.mjs && node --test test/smart-extractor-scope-filter.test.mjs && node --test test/store-empty-scope-filter.test.mjs && node --test test/recall-text-cleanup.test.mjs && node test/update-consistency-lancedb.test.mjs && node --test test/strip-envelope-metadata.test.mjs && node test/cli-smoke.mjs && node test/functional-e2e.mjs && node test/retriever-rerank-regression.mjs && node test/smart-memory-lifecycle.mjs && node test/smart-extractor-branches.mjs && node test/plugin-manifest-regression.mjs && node --test test/session-summary-before-reset.test.mjs && node --test test/sync-plugin-version.test.mjs && node test/smart-metadata-v2.mjs && node test/vector-search-cosine.test.mjs && node test/context-support-e2e.mjs && node test/temporal-facts.test.mjs && node test/memory-update-supersede.test.mjs && node test/memory-upgrader-diagnostics.test.mjs && node --test test/llm-api-key-client.test.mjs && node --test test/llm-oauth-client.test.mjs && node --test test/cli-oauth-login.test.mjs && node --test test/workflow-fork-guards.test.mjs && node --test test/clawteam-scope.test.mjs && node --test test/cross-process-lock.test.mjs && node --test test/preference-slots.test.mjs && node test/is-latest-auto-supersede.test.mjs && node --test test/temporal-awareness.test.mjs", + "test": "node test/embedder-error-hints.test.mjs && node test/cjk-recursion-regression.test.mjs && node test/migrate-legacy-schema.test.mjs && node --test test/config-session-strategy-migration.test.mjs && node --test test/scope-access-undefined.test.mjs && node --test test/reflection-bypass-hook.test.mjs && node --test test/smart-extractor-scope-filter.test.mjs && node --test test/store-empty-scope-filter.test.mjs && node --test test/recall-text-cleanup.test.mjs && node test/update-consistency-lancedb.test.mjs && node --test test/strip-envelope-metadata.test.mjs && node test/cli-smoke.mjs && node test/functional-e2e.mjs && node test/retriever-rerank-regression.mjs && node test/smart-memory-lifecycle.mjs && node test/smart-extractor-branches.mjs && node test/plugin-manifest-regression.mjs && node --test test/session-summary-before-reset.test.mjs && node --test test/sync-plugin-version.test.mjs && node test/smart-metadata-v2.mjs && node test/vector-search-cosine.test.mjs && node test/context-support-e2e.mjs && node test/temporal-facts.test.mjs && node test/memory-update-supersede.test.mjs && node test/memory-upgrader-diagnostics.test.mjs && node --test test/llm-api-key-client.test.mjs && node --test test/llm-oauth-client.test.mjs && node --test test/cli-oauth-login.test.mjs && node --test test/workflow-fork-guards.test.mjs && node --test test/clawteam-scope.test.mjs && node --test test/cross-process-lock.test.mjs && node --test test/preference-slots.test.mjs && node test/is-latest-auto-supersede.test.mjs && node --test test/temporal-awareness.test.mjs && node --test test/adaptive-retrieval.test.mjs && node --test test/noise-filter.test.mjs", "test:openclaw-host": "node test/openclaw-host-functional.mjs", "version": "node scripts/sync-plugin-version.mjs openclaw.plugin.json package.json && git add openclaw.plugin.json" }, From ddc51652e72df4e5bd26946f194b93888ef0926e Mon Sep 17 00:00:00 2001 From: Ssyn Date: Sun, 5 Apr 2026 06:17:56 -0700 Subject: [PATCH 7/7] fix: support slash commands with arguments in skip detection Change slash command regex from ^\/[a-z][\w-]*\s*$ to ^\/[a-z][\w-]*(\s|$) so that argument-bearing commands like /recall my name, /remember content, and /lesson text are still recognized and skipped. File paths like /usr/bin/node remain unmatched because the second segment starts with / not whitespace. Add regression tests for argument-bearing slash commands. --- src/adaptive-retrieval.ts | 4 ++-- test/adaptive-retrieval.test.mjs | 12 ++++++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/src/adaptive-retrieval.ts b/src/adaptive-retrieval.ts index bc0133a5..4eed2f63 100644 --- a/src/adaptive-retrieval.ts +++ b/src/adaptive-retrieval.ts @@ -10,7 +10,7 @@ const SKIP_PATTERNS = [ // Greetings & pleasantries /^(hi|hello|hey|good\s*(morning|afternoon|evening|night)|greetings|yo|sup|howdy|what'?s up)\b/i, // System/bot commands - /^\/[a-z][\w-]*\s*$/i, // slash commands like /help, /recall + /^\/[a-z][\w-]*(\s|$)/i, // slash commands like /help, /recall my name /^(run|build|test|ls|cd|git|npm|pip|docker|curl|cat|grep|find|make|sudo)\b/i, // Simple affirmations/negations /^(yes|no|yep|nope|ok|okay|sure|fine|thanks|thank you|thx|ty|got it|understood|cool|nice|great|good|perfect|awesome|👍|👎|✅|❌)\s*[.!]?$/i, @@ -73,7 +73,7 @@ export function shouldSkipRetrieval(query: string, minLength?: number): boolean // Force retrieve if query has memory-related intent (checked FIRST, // before length check, so short CJK queries like "你记得吗" aren't skipped) // 注意:slash 命令(如 /recall)优先走 SKIP 路径,不走 FORCE 路径 - const isSlashCmd = /^\/[a-z][\w-]*\s*$/i.test(trimmed); + const isSlashCmd = /^\/[a-z][\w-]*(\s|$)/i.test(trimmed); if (!isSlashCmd && FORCE_RETRIEVE_PATTERNS.some(p => p.test(trimmed))) return false; // Too short to be meaningful diff --git a/test/adaptive-retrieval.test.mjs b/test/adaptive-retrieval.test.mjs index fab58072..eec9348a 100644 --- a/test/adaptive-retrieval.test.mjs +++ b/test/adaptive-retrieval.test.mjs @@ -50,6 +50,18 @@ describe("shouldSkipRetrieval", () => { it("does not skip API paths", () => { assert.equal(shouldSkipRetrieval("/api/v2/users"), false); }); + + it("skips slash commands with arguments", () => { + assert.equal(shouldSkipRetrieval("/recall my name"), true); + }); + + it("skips slash commands with content arguments", () => { + assert.equal(shouldSkipRetrieval("/remember user prefers dark mode"), true); + }); + + it("skips slash commands with lesson content", () => { + assert.equal(shouldSkipRetrieval("/lesson always use strict mode"), true); + }); }); // --- CJK short text threshold fix ---