diff --git a/package.json b/package.json index f40ecbcd..47852116 100644 --- a/package.json +++ b/package.json @@ -38,7 +38,7 @@ ] }, "scripts": { - "test": "node test/embedder-error-hints.test.mjs && node test/cjk-recursion-regression.test.mjs && node test/migrate-legacy-schema.test.mjs && node --test test/config-session-strategy-migration.test.mjs && node --test test/scope-access-undefined.test.mjs && node --test test/reflection-bypass-hook.test.mjs && node --test test/smart-extractor-scope-filter.test.mjs && node --test test/store-empty-scope-filter.test.mjs && node --test test/recall-text-cleanup.test.mjs && node test/update-consistency-lancedb.test.mjs && node --test test/strip-envelope-metadata.test.mjs && node test/cli-smoke.mjs && node test/functional-e2e.mjs && node test/retriever-rerank-regression.mjs && node test/smart-memory-lifecycle.mjs && node test/smart-extractor-branches.mjs && node test/plugin-manifest-regression.mjs && node --test test/session-summary-before-reset.test.mjs && node --test test/sync-plugin-version.test.mjs && node test/smart-metadata-v2.mjs && node test/vector-search-cosine.test.mjs && node test/context-support-e2e.mjs && node test/temporal-facts.test.mjs && node test/memory-update-supersede.test.mjs && node test/memory-upgrader-diagnostics.test.mjs && node --test test/llm-api-key-client.test.mjs && node --test test/llm-oauth-client.test.mjs && node --test test/cli-oauth-login.test.mjs && node --test test/workflow-fork-guards.test.mjs && node --test test/clawteam-scope.test.mjs && node --test test/cross-process-lock.test.mjs && node --test test/preference-slots.test.mjs && node test/is-latest-auto-supersede.test.mjs && node --test test/temporal-awareness.test.mjs", + "test": "node test/embedder-error-hints.test.mjs && node test/cjk-recursion-regression.test.mjs && node test/migrate-legacy-schema.test.mjs && node --test test/config-session-strategy-migration.test.mjs && node --test test/scope-access-undefined.test.mjs && node --test 
test/reflection-bypass-hook.test.mjs && node --test test/smart-extractor-scope-filter.test.mjs && node --test test/store-empty-scope-filter.test.mjs && node --test test/recall-text-cleanup.test.mjs && node test/update-consistency-lancedb.test.mjs && node --test test/strip-envelope-metadata.test.mjs && node test/cli-smoke.mjs && node test/functional-e2e.mjs && node test/retriever-rerank-regression.mjs && node test/smart-memory-lifecycle.mjs && node test/smart-extractor-branches.mjs && node test/plugin-manifest-regression.mjs && node --test test/session-summary-before-reset.test.mjs && node --test test/sync-plugin-version.test.mjs && node test/smart-metadata-v2.mjs && node test/vector-search-cosine.test.mjs && node test/context-support-e2e.mjs && node test/temporal-facts.test.mjs && node test/memory-update-supersede.test.mjs && node test/memory-upgrader-diagnostics.test.mjs && node --test test/llm-api-key-client.test.mjs && node --test test/llm-oauth-client.test.mjs && node --test test/cli-oauth-login.test.mjs && node --test test/workflow-fork-guards.test.mjs && node --test test/clawteam-scope.test.mjs && node --test test/cross-process-lock.test.mjs && node --test test/preference-slots.test.mjs && node test/is-latest-auto-supersede.test.mjs && node --test test/temporal-awareness.test.mjs && node --test test/adaptive-retrieval.test.mjs && node --test test/noise-filter.test.mjs", "test:openclaw-host": "node test/openclaw-host-functional.mjs", "version": "node scripts/sync-plugin-version.mjs openclaw.plugin.json package.json && git add openclaw.plugin.json" }, diff --git a/src/adaptive-retrieval.ts b/src/adaptive-retrieval.ts index 88770aa9..4eed2f63 100644 --- a/src/adaptive-retrieval.ts +++ b/src/adaptive-retrieval.ts @@ -10,14 +10,14 @@ const SKIP_PATTERNS = [ // Greetings & pleasantries /^(hi|hello|hey|good\s*(morning|afternoon|evening|night)|greetings|yo|sup|howdy|what'?s up)\b/i, // System/bot commands - /^\//, // slash commands + /^\/[a-z][\w-]*(\s|$)/i, // slash 
commands like /help, /recall my name /^(run|build|test|ls|cd|git|npm|pip|docker|curl|cat|grep|find|make|sudo)\b/i, // Simple affirmations/negations /^(yes|no|yep|nope|ok|okay|sure|fine|thanks|thank you|thx|ty|got it|understood|cool|nice|great|good|perfect|awesome|👍|👎|✅|❌)\s*[.!]?$/i, // Continuation prompts /^(go ahead|continue|proceed|do it|start|begin|next|实施|實施|开始|開始|继续|繼續|好的|可以|行)\s*[.!]?$/i, // Pure emoji - /^[\p{Emoji}\s]+$/u, + /^[\p{Extended_Pictographic}\u200d\ufe0f\s]+$/u, // Heartbeat/system (match anywhere, not just at start, to handle prefixed formats) /HEARTBEAT/i, /^\[System/i, @@ -72,10 +72,15 @@ export function shouldSkipRetrieval(query: string, minLength?: number): boolean // Force retrieve if query has memory-related intent (checked FIRST, // before length check, so short CJK queries like "你记得吗" aren't skipped) - if (FORCE_RETRIEVE_PATTERNS.some(p => p.test(trimmed))) return false; + // Note: slash commands (e.g. /recall) take the SKIP path first and never the FORCE path + const isSlashCmd = /^\/[a-z][\w-]*(\s|$)/i.test(trimmed); + if (!isSlashCmd && FORCE_RETRIEVE_PATTERNS.some(p => p.test(trimmed))) return false; // Too short to be meaningful - if (trimmed.length < 5) return true; + // Strings containing digits (e.g. port number 8080, issue number #123) carry semantic information, so they are exempt from the length cutoff + const hasCJK = /[\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af]/.test(trimmed); + const hasDigit = /\d/.test(trimmed); + if (!hasDigit && trimmed.length < (hasCJK ? 2 : 5)) return true; // Skip if matches any skip pattern if (SKIP_PATTERNS.some(p => p.test(trimmed))) return true; @@ -88,9 +93,9 @@ export function shouldSkipRetrieval(query: string, minLength?: number): boolean // Skip very short non-question messages (likely commands or affirmations) // CJK characters carry more meaning per character, so use a lower threshold - const hasCJK = /[\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af]/.test(trimmed); - const defaultMinLength = hasCJK ? 
6 : 15; - if (trimmed.length < defaultMinLength && !trimmed.includes('?') && !trimmed.includes('?')) return true; + // Strings containing digits are exempt from this rule (port numbers, issue numbers, etc. all carry semantic content). NOTE(review): the two includes('?') tests in the condition below are identical; the second presumably should test the full-width question mark - confirm + const defaultMinLength = hasCJK ? 3 : 13; + if (!hasDigit && trimmed.length < defaultMinLength && !trimmed.includes('?') && !trimmed.includes('?')) return true; // Default: do retrieve return false; diff --git a/src/noise-filter.ts b/src/noise-filter.ts index b21cf37b..198b371a 100644 --- a/src/noise-filter.ts +++ b/src/noise-filter.ts @@ -37,7 +37,7 @@ const META_QUESTION_PATTERNS = [ // Session boilerplate const BOILERPLATE_PATTERNS = [ - /^(hi|hello|hey|good morning|good evening|greetings)/i, + /^(hi|hello|hey|good morning|good evening|greetings)(\s+\w+)?[!,.]?\s*$/i, /^fresh session/i, /^new session/i, /^HEARTBEAT/i, @@ -73,7 +73,8 @@ export function isNoise(text: string, options: NoiseFilterOptions = {}): boolean const opts = { ...DEFAULT_OPTIONS, ...options }; const trimmed = text.trim(); - if (trimmed.length < 5) return true; + const hasCJK = /[\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af]/.test(trimmed); + if (trimmed.length < (hasCJK ? 
2 : 5)) return true; if (opts.filterDenials && DENIAL_PATTERNS.some(p => p.test(trimmed))) return true; if (opts.filterMetaQuestions && META_QUESTION_PATTERNS.some(p => p.test(trimmed))) return true; diff --git a/test/adaptive-retrieval.test.mjs b/test/adaptive-retrieval.test.mjs new file mode 100644 index 00000000..eec9348a --- /dev/null +++ b/test/adaptive-retrieval.test.mjs @@ -0,0 +1,112 @@ +import { describe, it } from "node:test"; +import assert from "node:assert/strict"; +import jitiFactory from "jiti"; + +const jiti = jitiFactory(import.meta.url, { interopDefault: true }); +const { shouldSkipRetrieval } = jiti("../src/adaptive-retrieval.ts"); + +describe("shouldSkipRetrieval", () => { + // --- emoji regex fix --- + describe("emoji regex should not match digits", () => { + it("does not skip pure digit strings", () => { + assert.equal(shouldSkipRetrieval("12345"), false); + }); + + it("does not skip port numbers", () => { + assert.equal(shouldSkipRetrieval("8080"), false); + }); + + it("does not skip hash-prefixed numbers", () => { + assert.equal(shouldSkipRetrieval("#123"), false); + }); + + it("skips pure emoji input", () => { + assert.equal(shouldSkipRetrieval("\ud83d\udc4d\ud83c\udf89\ud83d\ude80"), true); + }); + + it("does not skip emoji mixed with text", () => { + assert.equal(shouldSkipRetrieval("\ud83d\udc68\u200d\ud83d\udc69\u200d\ud83d\udc67 family trip plan"), false); + }); + }); + + // --- slash command regex fix --- + describe("slash command regex should not match file paths", () => { + it("skips single-word slash commands", () => { + assert.equal(shouldSkipRetrieval("/help"), true); + }); + + it("skips slash command with trailing space", () => { + assert.equal(shouldSkipRetrieval("/recall "), true); + }); + + it("does not skip file paths", () => { + assert.equal(shouldSkipRetrieval("/usr/bin/node"), false); + }); + + it("does not skip path with question", () => { + assert.equal(shouldSkipRetrieval("/etc/nginx/nginx.conf 
\u600e\u4e48\u914d\u7f6e"), false); + }); + + it("does not skip API paths", () => { + assert.equal(shouldSkipRetrieval("/api/v2/users"), false); + }); + + it("skips slash commands with arguments", () => { + assert.equal(shouldSkipRetrieval("/recall my name"), true); + }); + + it("skips slash commands with content arguments", () => { + assert.equal(shouldSkipRetrieval("/remember user prefers dark mode"), true); + }); + + it("skips slash commands with lesson content", () => { + assert.equal(shouldSkipRetrieval("/lesson always use strict mode"), true); + }); + }); + + // --- CJK short text threshold fix --- + describe("CJK short text should not be killed by hard threshold", () => { + it("does not skip 4-char CJK query", () => { + assert.equal(shouldSkipRetrieval("\u4ed6\u559c\u6b22\u732b"), false); + }); + + it("does not skip 4-char CJK query (residence)", () => { + assert.equal(shouldSkipRetrieval("\u6211\u4f4f\u5317\u4eac"), false); + }); + + it("does not skip 3-char mixed CJK query", () => { + assert.equal(shouldSkipRetrieval("\u7528Go\u5199"), false); + }); + + it("does not skip CJK query with question mark", () => { + assert.equal(shouldSkipRetrieval("\u5bc6\u7801\u662f\u5565\uff1f"), false); + }); + + it("skips single CJK character", () => { + assert.equal(shouldSkipRetrieval("\u597d"), true); + }); + }); + + // --- existing behavior preserved --- + describe("existing skip/force behavior preserved", () => { + it("skips greetings", () => { + assert.equal(shouldSkipRetrieval("hi"), true); + }); + + it("skips short English affirmations", () => { + assert.equal(shouldSkipRetrieval("ok"), true); + }); + + it("does not skip memory-related queries (English)", () => { + assert.equal(shouldSkipRetrieval("remember my name is Alice"), false); + }); + + it("does not skip memory-related queries (Chinese)", () => { + assert.equal(shouldSkipRetrieval("\u4f60\u8bb0\u5f97\u5417"), false); + }); + + it("does not skip normal length queries", () => { + 
assert.equal(shouldSkipRetrieval("what was the database schema we discussed"), false); + }); + }); +}); diff --git a/test/noise-filter.test.mjs b/test/noise-filter.test.mjs new file mode 100644 index 00000000..3ed86173 --- /dev/null +++ b/test/noise-filter.test.mjs @@ -0,0 +1,96 @@ +import { describe, it } from "node:test"; +import assert from "node:assert/strict"; +import jitiFactory from "jiti"; + +const jiti = jitiFactory(import.meta.url, { interopDefault: true }); +const { isNoise, filterNoise } = jiti("../src/noise-filter.ts"); + +describe("isNoise", () => { + // --- CJK short text fix --- + describe("CJK short text should not be marked as noise", () => { + it("4-char CJK is not noise", () => { + assert.equal(isNoise("\u4ed6\u559c\u6b22\u732b"), false); + }); + + it("3-char mixed CJK is not noise", () => { + assert.equal(isNoise("\u7528Go\u5199"), false); + }); + + it("2-char CJK is not noise", () => { + assert.equal(isNoise("\u5b66\u4e60"), false); + }); + + it("single CJK char is noise", () => { + assert.equal(isNoise("\u597d"), true); + }); + }); + + // --- English short text preserved --- + describe("English short text filtering preserved", () => { + it("marks 'ok' as noise", () => { + assert.equal(isNoise("ok"), true); + }); + + it("marks 'hi' as noise", () => { + assert.equal(isNoise("hi"), true); + }); + + it("marks 'test' as noise", () => { + assert.equal(isNoise("test"), true); + }); + + it("does not mark 5+ char English as noise by length alone", () => { + assert.equal(isNoise("hello world this is a real memory"), false); + }); + }); + + // --- pattern filters --- + describe("denial pattern filtering", () => { + it("marks agent denial as noise", () => { + assert.equal(isNoise("I don't have any information about that"), true); + }); + }); + + describe("meta-question pattern filtering", () => { + it("marks meta-question as noise", () => { + assert.equal(isNoise("do you remember what I said"), true); + }); + }); + + describe("boilerplate pattern 
filtering", () => { + it("marks greeting as noise", () => { + assert.equal(isNoise("hello there"), true); + }); + }); + + // --- options control --- + describe("options control", () => { + it("respects filterBoilerplate: false", () => { + assert.equal(isNoise("hello there", { filterBoilerplate: false }), false); + }); + + it("respects filterDenials: false", () => { + assert.equal(isNoise("I don't have any information", { filterDenials: false }), false); + }); + + it("respects filterMetaQuestions: false", () => { + assert.equal(isNoise("do you remember", { filterMetaQuestions: false }), false); + }); + }); +}); + +describe("filterNoise", () => { + it("filters noise items from array", () => { + const items = [ + { id: 1, text: "\u4ed6\u559c\u6b22\u732b" }, + { id: 2, text: "ok" }, + { id: 3, text: "I prefer dark mode for all editors" }, + { id: 4, text: "\u597d" }, + ]; + const result = filterNoise(items, (item) => item.text); + assert.deepEqual( + result.map((r) => r.id), + [1, 3] + ); + }); +});