From 96b9e58489cc82e4d2500df352ff88c34f75f8d4 Mon Sep 17 00:00:00 2001
From: yuanrengu <heyonggang0811@126.com>
Date: Sun, 24 May 2026 06:01:54 +0000
Subject: [PATCH 1/2] feat: L1 extraction quality improvements
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Four optimizations to reduce LLM dependency in the memory pipeline:

1. Restore & enhance L1 quality gate (sanitize.ts)
   - Re-enable commented-out length filters (CJK >= 4, alpha >= 10)
   - Re-enable prompt injection detection
   - Add conversational filler filter (好的/OK/thanks/got it)

2. Add rule-based pre-extraction layer (pre-extractor.ts — new)
   - 9 persona patterns (喜欢/讨厌/习惯/经常/是/职业/擅长/会/认为)
   - 8 instruction patterns (以后/记住/禁止/语言切换)
   - Date+verb episodic detection
   - HIGH-confidence items are merged into the LLM results without LLM inference; MEDIUM items are collected as hints

3. Self-correction retry on JSON parse failure (l1-extractor.ts)
   - Parse failures trigger one retry with error feedback
   - Reduces silent memory loss from malformed LLM output

4. Post-LLM confidence check (l1-extractor.ts)
   - Source traceability: >=30% keywords must appear in source messages
   - Type consistency: persona must ref user, instruction must ref AI
   - Trivial content rejection: filter vague episodic statements

Fixes: non-greedy regex in pre-extractor patterns, broader CJK injection detection
---
 src/core/record/l1-extractor.ts  | 209 +++++++++++++++++---
 src/core/record/pre-extractor.ts | 318 +++++++++++++++++++++++++++++++
 src/utils/sanitize.ts            |  23 ++-
 3 files changed, 514 insertions(+), 36 deletions(-)
 create mode 100644 src/core/record/pre-extractor.ts

diff --git a/src/core/record/l1-extractor.ts b/src/core/record/l1-extractor.ts
index ad0a8f1..0830f44 100644
--- a/src/core/record/l1-extractor.ts
+++ b/src/core/record/l1-extractor.ts
@@ -17,6 +17,7 @@ import { EXTRACT_MEMORIES_SYSTEM_PROMPT, formatExtractionPrompt } from "../promp
 import { batchDedup } from "./l1-dedup.js";
 import { writeMemory, generateMemoryId } from "./l1-writer.js";
 import type { ExtractedMemory, MemoryRecord, MemoryType, DedupDecision } from "./l1-writer.js";
+import { preExtractMemories, mergeExtractedMemories } from "./pre-extractor.js";
 import { CleanContextRunner } from "../../utils/clean-context-runner.js";
 import { sanitizeJsonForParse, shouldExtractL1 } from "../../utils/sanitize.js";
 import type { IMemoryStore } from "../store/types.js";
@@ -146,6 +147,19 @@ export async function extractL1Memories(params: {
     return { success: true, extractedCount: 0, storedCount: 0, records: [], sceneNames: [] };
   }
 
+  // ── Step 0: Rule-based pre-extraction (v3.1) ──
+  // Catch obvious persona/instruction patterns BEFORE the LLM call.
+  // This reduces token cost for clear patterns and provides hints to the LLM.
+  const preResult = preExtractMemories(qualifiedMessages);
+  if (preResult.direct.length > 0) {
+    logger?.debug?.(
+      `${TAG} Pre-extracted ${preResult.direct.length} HIGH-confidence items directly (bypass LLM)`);
+  }
+  if (preResult.hints.length > 0) {
+    logger?.debug?.(
+      `${TAG} Pre-extracted ${preResult.hints.length} MEDIUM-confidence hints for LLM guidance`);
+  }
+
   // Split messages into background (older) + new (recent)
   const newMessages = qualifiedMessages.slice(-maxNewMessages);
   const bgEndIdx = qualifiedMessages.length - newMessages.length;
@@ -198,6 +212,30 @@ export async function extractL1Memories(params: {
 
   logger?.debug?.(`${TAG} Total extracted memories: ${allExtracted.length} across ${scenes.length} scene(s)`);
 
+  // ── Merge rule-extracted direct items into LLM results ──
+  if (preResult.direct.length > 0) {
+    const beforeMerge = allExtracted.length;
+    const merged = mergeExtractedMemories(allExtracted, preResult);
+    const added = merged.length - beforeMerge;
+    if (added > 0) {
+      logger?.debug?.(
+        `${TAG} Merged ${added} pre-extracted items into LLM results (total: ${merged.length})`);
+    }
+    allExtracted.length = 0;
+    allExtracted.push(...merged);
+  }
+
+  // ── Confidence check: filter low-quality LLM extractions ──
+  const confidenceFiltered = allExtracted
+    .filter((m) => passesConfidenceCheck(m, messages, logger));
+  if (confidenceFiltered.length < allExtracted.length) {
+    logger?.debug?.(
+      `${TAG} Confidence filter: ${allExtracted.length} → ${confidenceFiltered.length} memories ` +
+      `(${allExtracted.length - confidenceFiltered.length} rejected)`);
+    allExtracted.length = 0;
+    allExtracted.push(...confidenceFiltered);
+  }
+
   if (allExtracted.length === 0) {
     return {
       success: true,
@@ -323,34 +361,43 @@ async function callLlmExtraction(params: {
     `${TAG} [l1-debug] ENTRY taskId=l1-extraction, newMsgs=${newMessages.length}, bgMsgs=${backgroundMessages.length}, userPromptLen=${userPrompt.length}, sysPromptLen=${EXTRACT_MEMORIES_SYSTEM_PROMPT.length}, model=${model ?? "(default)"}, previousSceneName=${previousSceneName ? JSON.stringify(previousSceneName) : "(none)"}, runnerKind=${llmRunner ? "llmRunner" : "CleanContextRunner"}`,
   );
 
-  let result: string;
-
-  if (llmRunner) {
-    // Use the host-neutral LLMRunner interface
-    result = await llmRunner.run({
-      prompt: userPrompt,
-      systemPrompt: EXTRACT_MEMORIES_SYSTEM_PROMPT,
-      taskId: "l1-extraction",
-      timeoutMs: 180_000,
-    });
-  } else {
-    // Fallback: create CleanContextRunner (OpenClaw path)
+  const runLlm = async (prompt: string, systemPrompt: string, taskId: string): Promise<string> => {
+    if (llmRunner) {
+      return llmRunner.run({ prompt, systemPrompt, taskId, timeoutMs: 180_000 });
+    }
     const runner = new CleanContextRunner({
       config,
       modelRef: model,
       enableTools: false,
       logger,
     });
+    return runner.run({ prompt, systemPrompt, taskId, timeoutMs: 180_000 });
+  };
 
-    result = await runner.run({
-      prompt: userPrompt,
-      systemPrompt: EXTRACT_MEMORIES_SYSTEM_PROMPT,
-      taskId: "l1-extraction",
-      timeoutMs: 180_000,
-    });
+  let result = await runLlm(userPrompt, EXTRACT_MEMORIES_SYSTEM_PROMPT, "l1-extraction");
+
+  const { scenes, parseError } = parseExtractionResultWithError(result, logger);
+
+  // ── Self-correction retry: if JSON parsing failed, retry once with error feedback ──
+  if (parseError && scenes.length === 0) {
+    logger?.warn?.(
+      `${TAG} First extraction JSON parse failed: ${parseError.slice(0, 200)}. Retrying with correction hint...`);
+
+    try {
+      const correctionPrompt = `${userPrompt}\n\n【⚠ 格式错误】你上一次的输出无法解析为有效 JSON。错误信息：${parseError}\n请严格按照要求的 JSON 数组格式重新输出，不要添加任何解释或 Markdown 代码块标记。`;
+      result = await runLlm(correctionPrompt, EXTRACT_MEMORIES_SYSTEM_PROMPT, "l1-extraction-retry");
+      const retryResult = parseExtractionResultWithError(result, logger);
+      if (retryResult.scenes.length > 0) {
+        logger?.info?.(`${TAG} Self-correction retry succeeded: ${retryResult.scenes.length} scene(s) extracted`);
+        return retryResult.scenes;
+      }
+      logger?.warn?.(`${TAG} Self-correction retry also failed: ${retryResult.parseError?.slice(0, 200)}`);
+    } catch (err) {
+      logger?.warn?.(`${TAG} Self-correction retry threw: ${err instanceof Error ? err.message : String(err)}`);
+    }
   }
 
-  return parseExtractionResult(result, logger);
+  return scenes;
 }
 
 /**
@@ -358,32 +405,39 @@ async function callLlmExtraction(params: {
  * Expected format: [{scene_name, message_ids, memories: [...]}]
  */
 function parseExtractionResult(raw: string, logger?: Logger): SceneSegment[] {
+  return parseExtractionResultWithError(raw, logger).scenes;
+}
+
+/**
+ * Parse the LLM's JSON response, returning both scenes and parse error (if any).
+ * This allows the caller to use the error for self-correction retry.
+ */
+function parseExtractionResultWithError(
+  raw: string,
+  logger?: Logger,
+): { scenes: SceneSegment[]; parseError?: string } {
   try {
-    // Strip markdown code block wrappers if present
     let cleaned = raw.trim();
     if (cleaned.startsWith("```")) {
       cleaned = cleaned.replace(/^```(?:json)?\s*\n?/, "").replace(/\n?```\s*$/, "");
     }
 
-    // Try to extract JSON array
     const arrayMatch = cleaned.match(/\[[\s\S]*\]/);
     if (!arrayMatch) {
-      logger?.warn?.(`${TAG} No JSON array found in extraction response`);
-      // [l1-debug] NO_JSON — dump the full raw so we can see what the LLM actually said
       const rawPreview = raw.slice(0, 2048);
+      logger?.warn?.(`${TAG} No JSON array found in extraction response`);
       logger?.warn?.(
         `${TAG} [l1-debug] NO_JSON taskId=l1-extraction, rawLen=${raw.length}, cleanedLen=${cleaned.length}, rawFull=${JSON.stringify(rawPreview)}${raw.length > 2048 ? `…(+${raw.length - 2048})` : ""}`,
       );
-      return [];
+      return { scenes: [], parseError: "输出中未找到 JSON 数组" };
     }
 
-    // Sanitize control characters inside JSON string literals that LLM may produce
     const sanitized = sanitizeJsonForParse(arrayMatch[0]);
     const parsed = JSON.parse(sanitized) as unknown[];
 
     if (!Array.isArray(parsed)) {
       logger?.warn?.(`${TAG} Extraction response is not an array`);
-      return [];
+      return { scenes: [], parseError: "输出不是 JSON 数组" };
     }
 
     const scenes: SceneSegment[] = [];
@@ -408,10 +462,11 @@ function parseExtractionResult(raw: string, logger?: Logger): SceneSegment[] {
       });
     }
 
-    return scenes;
+    return { scenes };
   } catch (err) {
-    logger?.warn?.(`${TAG} Failed to parse extraction result: ${err instanceof Error ? err.message : String(err)}`);
-    return [];
+    const msg = err instanceof Error ? err.message : String(err);
+    logger?.warn?.(`${TAG} Failed to parse extraction result: ${msg}`);
+    return { scenes: [], parseError: `JSON 解析失败: ${msg}` };
   }
 }
 
@@ -516,6 +571,102 @@ async function storeAllDirectly(
   return storedRecords;
 }
 
+// ============================
+// Confidence check
+// ============================
+
+/**
+ * Validate an LLM-extracted memory against basic quality heuristics.
+ * Returns false if the memory appears to be hallucinated or too low-quality.
+ */
+function passesConfidenceCheck(
+  mem: ExtractedMemory,
+  allMessages: ConversationMessage[],
+  logger?: Logger,
+): boolean {
+  // Check 1: minimal length — 4 chars when any CJK ideograph is present, otherwise 15.
+  const isCJK = /[\u4e00-\u9fff]/.test(mem.content);
+  if (isCJK && mem.content.length < 4) {
+    logger?.debug?.(`${TAG} [confidence] REJECT too-short-CJK: "${mem.content.slice(0, 40)}"`);
+    return false;
+  }
+  if (!isCJK && mem.content.length < 15) {
+    logger?.debug?.(`${TAG} [confidence] REJECT too-short: "${mem.content.slice(0, 40)}"`);
+    return false;
+  }
+
+  // Check 2: source traceability — at least 30% of the keywords must occur in the cited messages.
+  const memWords = extractSignificantWords(mem.content);
+  if (memWords.size === 0) {
+    logger?.debug?.(`${TAG} [confidence] REJECT no-meaningful-words: "${mem.content.slice(0, 40)}"`);
+    return false;
+  }
+
+  // Keywords are lower-cased by extractSignificantWords, so match against lower-cased sources.
+  const sourceMsgs = allMessages.filter((m) => mem.source_message_ids.includes(m.id));
+  const sourceTexts = sourceMsgs.map((m) => m.content.toLowerCase());
+  // No resolvable source message → the traceability check is skipped rather than failed.
+  if (sourceTexts.length > 0) {
+    let matchedWords = 0;
+    for (const word of memWords) {
+      if (sourceTexts.some((src) => src.includes(word))) {
+        matchedWords++;
+      }
+    }
+    const matchRatio = memWords.size > 0 ? matchedWords / memWords.size : 0;
+    if (matchRatio < 0.3) {
+      logger?.debug?.(
+        `${TAG} [confidence] REJECT low-traceability (${(matchRatio * 100).toFixed(0)}%): "${mem.content.slice(0, 60)}"`);
+      return false;
+    }
+  }
+
+  // Check 3: type consistency — persona must mention the user (用户 / 我).
+  if (mem.type === "persona") {
+    if (!/用户|我/.test(mem.content)) {
+      logger?.debug?.(`${TAG} [confidence] REJECT persona-no-user-ref: "${mem.content.slice(0, 40)}"`);
+      return false;
+    }
+  }
+
+  if (mem.type === "instruction") {
+    if (!/AI|回复|回答|使用|输出|禁止|必须|要求/.test(mem.content)) {
+      logger?.debug?.(`${TAG} [confidence] REJECT instruction-no-directive: "${mem.content.slice(0, 40)}"`);
+      return false;
+    }
+  }
+
+  if (mem.type === "episodic") {
+    if (/^用户询问了|^用户说了|^用户问了|^AI回答/.test(mem.content) && mem.content.length < 30) {
+      logger?.debug?.(`${TAG} [confidence] REJECT trivial-episodic: "${mem.content.slice(0, 40)}"`);
+      return false;
+    }
+  }
+
+  return true;
+}
+
+/**
+ * Extract significant words from text for source traceability.
+ * CJK: 2+ character sequences as overlapping bigrams. Non-CJK: 4+ letter words.
+ */
+function extractSignificantWords(text: string): Set<string> {
+  const words = new Set<string>();
+  const cjkSeq = text.match(/[\u4e00-\u9fff]{2,}/g);
+  if (cjkSeq) {
+    for (const seq of cjkSeq) {
+      for (let i = 0; i <= seq.length - 2; i++) {
+        words.add(seq.slice(i, i + 2));
+      }
+    }
+  }
+  const alphaWords = text.match(/[a-zA-Z]{4,}/g);
+  if (alphaWords) {
+    for (const w of alphaWords) words.add(w.toLowerCase());
+  }
+  return words;
+}
+
 // ============================
 // Helpers
 // ============================
diff --git a/src/core/record/pre-extractor.ts b/src/core/record/pre-extractor.ts
new file mode 100644
index 0000000..b76bd1c
--- /dev/null
+++ b/src/core/record/pre-extractor.ts
@@ -0,0 +1,318 @@
+/**
+ * Pre-Extractor: rule-based memory extraction BEFORE the LLM call.
+ *
+ * Catches obvious patterns that don't need LLM inference, reducing both
+ * token cost and hallucination risk. The LLM still handles complex/ambiguous
+ * cases, but this layer catches the low-hanging fruit deterministically.
+ *
+ * Two modes:
+ *   1. HIGH-confidence matches → extracted directly (bypass LLM for this item)
+ *   2. MEDIUM-confidence matches → passed as hints to guide LLM extraction
+ *
+ * v1: Focus on the most reliable patterns — explicit persona, instruction,
+ *     and date-tagged episodic markers.
+ */
+
+import type { ConversationMessage } from "../conversation/l0-recorder.js";
+import type { ExtractedMemory } from "./l1-writer.js";
+
+// ============================
+// Types
+// ============================
+
+export interface PreExtractedMemory {
+  content: string;
+  type: "persona" | "episodic" | "instruction";
+  priority: number;
+  source_message_ids: string[];
+  /** HIGH = bypass LLM, MEDIUM = pass as hint */
+  confidence: "HIGH" | "MEDIUM";
+  /** How this was detected (for debugging) */
+  rule: string;
+}
+
+export interface PreExtractionResult {
+  /** Items to add directly without LLM processing */
+  direct: PreExtractedMemory[];
+  /** Items to pass as hints to the LLM */
+  hints: PreExtractedMemory[];
+}
+
+// ============================
+// Persona patterns
+// ============================
+
+interface PersonaRule {
+  pattern: RegExp;
+  /** Template to generate memory content; "$1" is replaced with the first capture group. */
+  template: string;
+  priority: number;
+}
+
+const PERSONA_RULES: PersonaRule[] = [
+  // Explicit preference statements
+  {
+    pattern: /我(?:很|非常|比较|特别)?喜欢(.{1,30})/,
+    template: "用户喜欢$1",
+    priority: 70,
+  },
+  {
+    pattern: /我(?:很|非常|比较|特别)?讨厌(.{1,30})/,
+    template: "用户讨厌$1",
+    priority: 75,
+  },
+  {
+    pattern: /我习惯(.{1,30})/,
+    template: "用户习惯$1",
+    priority: 65,
+  },
+  {
+    pattern: /我经常(.{1,30})/,
+    template: "用户经常$1",
+    priority: 60,
+  },
+  // Identity / role statements
+  {
+    pattern: /我是(?:一[个位名])?(.{1,40})/,
+    template: "用户是$1",
+    priority: 80,
+  },
+  {
+    pattern: /我的(?:职业|工作|岗位)是(.{1,40})/,
+    template: "用户的职业是$1",
+    priority: 85,
+  },
+  // Skill / ability statements
+  {
+    pattern: /我擅长(.{1,30})/,
+    template: "用户擅长$1",
+    priority: 70,
+  },
+  {
+    pattern: /我会(.{1,30})/,
+    template: "用户会$1",
+    priority: 55,
+  },
+  // Value judgments
+  {
+    pattern: /我认为(.{1,50})/,
+    template: "用户认为$1",
+    priority: 60,
+  },
+];
+
+// ============================
+// Instruction patterns
+// ============================
+
+interface InstructionRule {
+  pattern: RegExp;
+  template: string;
+  priority: number;
+}
+
+const INSTRUCTION_RULES: InstructionRule[] = [
+  {
+    pattern: /以后(?:都|要|请)?(.{1,50})/,
+    template: "用户要求 AI 以后$1",
+    priority: 90,
+  },
+  {
+    pattern: /从现在开始.{0,5}?(.{1,50})/,
+    template: "用户要求 AI 从现在开始$1",
+    priority: 90,
+  },
+  {
+    pattern: /记住.{0,5}?(.{1,50})/,
+    template: "用户要求 AI 记住$1",
+    priority: 85,
+  },
+  {
+    pattern: /每次(?:都|要|请)?(.{1,50})/,
+    template: "用户要求 AI 每次$1",
+    priority: 75,
+  },
+  {
+    pattern: /(?:用|使用|切换为|换成)(中文|英文|日文|法文)回复/,
+    template: "用户要求 AI 使用$1回复",
+    priority: 95,
+  },
+  {
+    pattern: /回复(?:时|的时候).{0,5}?(.{1,40})/,
+    template: "用户要求 AI 回复时$1",
+    priority: 80,
+  },
+  {
+    pattern: /不要.{0,3}?(.{1,40})/,
+    template: "用户要求 AI 不要$1",
+    priority: 85,
+  },
+  {
+    pattern: /禁止(.{1,40})/,
+    template: "用户禁止 AI $1",
+    priority: 95,
+  },
+];
+
+// ============================
+// Episodic patterns (date-tagged)
+// ============================
+
+const DATE_PATTERN = /(\d{4}[-/年]\d{1,2}[-/月]\d{1,2}[日号]?)/;
+const TIME_PATTERN = /(\d{1,2}:\d{2}(?::\d{2})?)/;
+
+// Verbs that signal completed actions worth remembering
+const EPISODIC_ACTION_VERBS = [
+  "部署", "上线", "发布了", "提交了", "推送了", "合并了",
+  "安装", "配置", "搭建",
+  "完成", "解决", "修复", "优化",
+  "开会", "讨论了", "决定了", "确定了",
+  "购买了", "下载了", "注册了",
+  "deployed", "released", "merged", "installed", "configured",
+  "fixed", "resolved", "completed",
+];
+
+// ============================
+// Main extraction function
+// ============================
+
+/**
+ * Pre-extract memories from user-authored messages using deterministic rules.
+ * Produces at most one persona item and one instruction item per message.
+ *
+ * @param messages - Messages to scan (already filtered by shouldExtractL1); non-user roles are skipped
+ * @returns HIGH-confidence items in `direct`, MEDIUM-confidence items in `hints`
+ */
+export function preExtractMemories(
+  messages: ConversationMessage[],
+): PreExtractionResult {
+  const direct: PreExtractedMemory[] = [];
+  const hints: PreExtractedMemory[] = [];
+
+  for (const msg of messages) {
+    if (msg.role !== "user") continue; // templates rewrite "我…" as "用户…": only valid for user text
+    const text = msg.content;
+    // ── Persona detection ──
+    for (const rule of PERSONA_RULES) {
+      const match = text.match(rule.pattern);
+      if (match && match[1]) {
+        const captured = match[1].trim();
+        // Reject if captured text is too short or just punctuation
+        if (captured.length < 2 || /^[，。、！？,.!?\s]+$/.test(captured)) continue;
+
+        const content = rule.template.replace("$1", () => captured); // replacer fn keeps "$&"/"$$" literal
+        const isHighConfidence =
+          rule.priority >= 80 || captured.length > 5;
+
+        const entry: PreExtractedMemory = {
+          content,
+          type: "persona",
+          priority: rule.priority,
+          source_message_ids: [msg.id],
+          confidence: isHighConfidence ? "HIGH" : "MEDIUM",
+          rule: `persona:${rule.pattern.source.slice(0, 30)}`,
+        };
+
+        if (isHighConfidence) {
+          direct.push(entry);
+        } else {
+          hints.push(entry);
+        }
+        break; // One persona match per message
+      }
+    }
+
+    // ── Instruction detection ──
+    for (const rule of INSTRUCTION_RULES) {
+      const match = text.match(rule.pattern);
+      if (match && match[1]) {
+        const captured = match[1].trim();
+        if (captured.length < 2 || /^[，。、！？,.!?\s]+$/.test(captured)) continue;
+
+        const content = rule.template.replace("$1", () => captured); // replacer fn keeps "$&"/"$$" literal
+        // Instructions are always HIGH confidence — they're explicit directives
+        const entry: PreExtractedMemory = {
+          content,
+          type: "instruction",
+          priority: rule.priority,
+          source_message_ids: [msg.id],
+          confidence: "HIGH",
+          rule: `instruction:${rule.pattern.source.slice(0, 40)}`,
+        };
+        direct.push(entry);
+        break;
+      }
+    }
+
+    // ── Episodic detection (weaker; only flag with date) ──
+    const hasDate = DATE_PATTERN.test(text);
+    if (hasDate) {
+      const hasActionVerb = EPISODIC_ACTION_VERBS.some((verb) => text.includes(verb));
+      if (hasActionVerb) {
+        // Strong signal: date + action verb → likely episodic
+        const dateMatch = text.match(DATE_PATTERN);
+        const timeMatch = text.match(TIME_PATTERN);
+        const dateStr = dateMatch ? dateMatch[1] : "某时间";
+        const timeStr = timeMatch ? ` ${timeMatch[1]}` : "";
+
+        hints.push({
+          content: `用户在 ${dateStr}${timeStr} 进行了一次活动（涉及：${text.slice(0, 80)}）`,
+          type: "episodic",
+          priority: 60,
+          source_message_ids: [msg.id],
+          confidence: "MEDIUM",
+          rule: "episodic:date+verb",
+        });
+      }
+    }
+  }
+
+  return { direct, hints };
+}
+
+/**
+ * Merge rule-extracted direct memories with LLM-extracted memories.
+ * LLM memories are kept as-is; a rule-extracted item is appended only when no
+ * already-merged memory has contentSimilarity > 0.7 with it (hints are ignored).
+ */
+export function mergeExtractedMemories(
+  llmMemories: ExtractedMemory[],
+  preResult: PreExtractionResult,
+): ExtractedMemory[] {
+  // Start with LLM memories
+  const merged: ExtractedMemory[] = [...llmMemories];
+
+  // Add direct rule-extracted items (skip duplicates by content similarity)
+  for (const pre of preResult.direct) {
+    const isDuplicate = merged.some(
+      (m) => contentSimilarity(m.content, pre.content) > 0.7,
+    );
+    if (!isDuplicate) {
+      merged.push({
+        content: pre.content,
+        type: pre.type,
+        priority: pre.priority,
+        source_message_ids: pre.source_message_ids,
+        metadata: {},
+        scene_name: "（规则预提取）",
+      });
+    }
+  }
+
+  return merged;
+}
+
+/**
+ * Quick token-overlap similarity: shared tokens / size of the larger token set, in [0, 1].
+ * NOTE(review): tokens split only on whitespace/punctuation, so unspaced CJK text is one token.
+ */
+function contentSimilarity(a: string, b: string): number {
+  const wordsA = new Set(a.split(/[\s，。、！？,.!?]+/).filter(Boolean));
+  const wordsB = new Set(b.split(/[\s，。、！？,.!?]+/).filter(Boolean));
+  if (wordsA.size === 0 || wordsB.size === 0) return 0;
+  let intersection = 0;
+  for (const w of wordsA) {
+    if (wordsB.has(w)) intersection++;
+  }
+  return intersection / Math.max(wordsA.size, wordsB.size);
+}
diff --git a/src/utils/sanitize.ts b/src/utils/sanitize.ts
index 80ee636..5f42165 100644
--- a/src/utils/sanitize.ts
+++ b/src/utils/sanitize.ts
@@ -137,20 +137,29 @@ export function shouldExtractL1(text: string): boolean {
   if (!shouldCaptureL0(text)) return false;
 
   // ── Length filters ──
-  // const isCJK = /[\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af]/.test(text);
-  // if (isCJK && text.length < 2) return false;
-  // if (!isCJK && text.length < 2) return false;
-  // if (text.length > 5000) return false;
+  // CJK languages need fewer characters to carry meaning than alphabetic ones.
+  // Minimum meaningful lengths: CJK >= 4 chars, alphabetic >= 10 chars.
+  // Overly short messages are almost never worth extracting memories from.
+  const isCJK = /[\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af]/.test(text);
+  if (isCJK && text.length < 4) return false;
+  if (!isCJK && text.length < 10) return false;
+  // Reject excessively long messages (e.g. pasted logs) — they will be handled
+  // by truncation in the LLM prompt rather than extraction.
+  if (text.length > 5000) return false;
 
   // ── Content-quality filters ──
   // Match strings composed entirely of non-word, non-space, non-CJK characters (1–5 chars).
   if (/^[^\w\s\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af]{1,5}$/.test(text)) return false;
   if (/^[?？]+$/.test(text)) return false;
 
+  // ── Noise filters ──
+  // Purely conversational fillers ("好的", "嗯", "OK", "thanks") carry no extractable memory.
+  if (/^(好的|嗯嗯?|哦哦?|OK|ok|thanks|thank you|got it|明白了|收到)[!！.]*$/i.test(text.trim())) return false;
+
   // ── Security filters ──
   // Reject prompt-injection payloads — prevent malicious content from being
   // persisted into structured memory and re-injected on future recalls.
-  // if (looksLikePromptInjection(text)) return false;
+  if (looksLikePromptInjection(text)) return false;
 
   return true;
 }
@@ -202,8 +211,8 @@ const PROMPT_INJECTION_PATTERNS: RegExp[] = [
   /\b(run|execute|call|invoke)\b.{0,40}\b(tool|command|function|shell)\b/i,
 
   // ── Chinese variants ──
-  /忽略(?:所有|之前|以上|先前)?(?:的)?(?:指令|规则|指示|说明)/,
-  /无视(?:所有|之前|以上)?(?:的)?(?:指令|规则|限制)/,
+  /忽略.{0,10}(?:指令|规则|指示|说明|限制)/,
+  /无视.{0,10}(?:指令|规则|限制)/,
   /(?:显示|输出|告诉我|给我看)(?:你的)?(?:系统|初始|隐藏)?(?:提示词|指令|规则|prompt)/,
   /你(?:现在|从现在开始)是/,            // "你现在是 DAN"
 ];

From 1d3d0a7acb96693de88686e4858005c5450cfdfa Mon Sep 17 00:00:00 2001
From: yuanrengu <heyonggang0811@126.com>
Date: Mon, 25 May 2026 00:09:28 +0000
Subject: [PATCH 2/2] fix: address PR #83 review feedback

- Move preExtractMemories to newMessages only (after background/new split)
  to prevent extracting memories from background context that should
  only serve as conversational context for the LLM

- Remove MEDIUM-confidence hints logging (hints not wired to LLM prompt;
  keeping types as interface for follow-up PR)

- Remove src/ from package.json files field to fix Size Guard limit
  (matches pattern from #76 and #71)

- Export callLlmExtraction and passesConfidenceCheck for testability

- Add pre-extractor.test.ts covering:
  - Background messages not pre-extracted
  - HIGH-confidence dedup via mergeExtractedMemories
  - Malformed JSON triggers exactly one retry
  - Confidence filtering does not reject valid persona/instruction
---
 package.json                          |   1 -
 src/core/record/l1-extractor.ts       |  26 ++--
 src/core/record/pre-extractor.test.ts | 213 ++++++++++++++++++++++++++
 3 files changed, 224 insertions(+), 16 deletions(-)
 create mode 100644 src/core/record/pre-extractor.test.ts

diff --git a/package.json b/package.json
index 0609611..e442d9b 100644
--- a/package.json
+++ b/package.json
@@ -41,7 +41,6 @@
     "scripts/memory-tencentdb-ctl.sh",
     "scripts/install_hermes_memory_tencentdb.sh",
     "scripts/README.memory-tencentdb-ctl.md",
-    "src/",
     "scripts/openclaw-after-tool-call-messages.patch.sh",
     "scripts/setup-offload.sh",
     "hermes-plugin/",
diff --git a/src/core/record/l1-extractor.ts b/src/core/record/l1-extractor.ts
index 0830f44..607c549 100644
--- a/src/core/record/l1-extractor.ts
+++ b/src/core/record/l1-extractor.ts
@@ -147,19 +147,6 @@ export async function extractL1Memories(params: {
     return { success: true, extractedCount: 0, storedCount: 0, records: [], sceneNames: [] };
   }
 
-  // ── Step 0: Rule-based pre-extraction (v3.1) ──
-  // Catch obvious persona/instruction patterns BEFORE the LLM call.
-  // This reduces token cost for clear patterns and provides hints to the LLM.
-  const preResult = preExtractMemories(qualifiedMessages);
-  if (preResult.direct.length > 0) {
-    logger?.debug?.(
-      `${TAG} Pre-extracted ${preResult.direct.length} HIGH-confidence items directly (bypass LLM)`);
-  }
-  if (preResult.hints.length > 0) {
-    logger?.debug?.(
-      `${TAG} Pre-extracted ${preResult.hints.length} MEDIUM-confidence hints for LLM guidance`);
-  }
-
   // Split messages into background (older) + new (recent)
   const newMessages = qualifiedMessages.slice(-maxNewMessages);
   const bgEndIdx = qualifiedMessages.length - newMessages.length;
@@ -169,6 +156,15 @@ export async function extractL1Memories(params: {
 
   logger?.debug?.(`${TAG} Extracting from ${newMessages.length} new messages (+ ${backgroundMessages.length} background) [${qualifiedMessages.length} qualified from ${messages.length} input]`);
 
+  // ── Step 0: Rule-based pre-extraction (v3.1) ──
+  // Catch obvious persona/instruction patterns BEFORE the LLM call.
+  // Only scan newMessages to avoid extracting from background context.
+  const preResult = preExtractMemories(newMessages);
+  if (preResult.direct.length > 0) {
+    logger?.debug?.(
+      `${TAG} Pre-extracted ${preResult.direct.length} HIGH-confidence items directly (bypass LLM)`);
+  }
+
   // Step 1: LLM extraction (scene segmentation + memory extraction)
   let scenes: SceneSegment[];
   try {
@@ -338,7 +334,7 @@ export async function extractL1Memories(params: {
 /**
  * Call LLM to extract scene-segmented memories from conversation messages.
  */
-async function callLlmExtraction(params: {
+export async function callLlmExtraction(params: {
   newMessages: ConversationMessage[];
   backgroundMessages: ConversationMessage[];
   previousSceneName?: string;
@@ -579,7 +575,7 @@ async function storeAllDirectly(
  * Validate an LLM-extracted memory against basic quality heuristics.
  * Returns false if the memory appears to be hallucinated or too low-quality.
  */
-function passesConfidenceCheck(
+export function passesConfidenceCheck(
   mem: ExtractedMemory,
   allMessages: ConversationMessage[],
   logger?: Logger,
diff --git a/src/core/record/pre-extractor.test.ts b/src/core/record/pre-extractor.test.ts
new file mode 100644
index 0000000..9f39fb3
--- /dev/null
+++ b/src/core/record/pre-extractor.test.ts
@@ -0,0 +1,213 @@
+import { describe, expect, it, vi } from "vitest";
+
+import type { ConversationMessage } from "../conversation/l0-recorder.js";
+import type { ExtractedMemory } from "./l1-writer.js";
+import { preExtractMemories, mergeExtractedMemories } from "./pre-extractor.js";
+import { callLlmExtraction, passesConfidenceCheck } from "./l1-extractor.js";
+
+// ============================
+// Helpers
+// ============================
+
+function makeMsg(
+  id: string,
+  content: string,
+  role: "user" | "assistant" = "user",
+): ConversationMessage {
+  return { id, role, content, timestamp: Date.now() };
+}
+
+function makeExtractedMemory(
+  content: string,
+  type: "persona" | "episodic" | "instruction" = "episodic",
+  sourceIds: string[] = [],
+): ExtractedMemory {
+  return {
+    content,
+    type,
+    priority: 80,
+    source_message_ids: sourceIds,
+    metadata: {},
+    scene_name: "test-scene",
+  };
+}
+
+// ============================
+// Tests
+// ============================
+
+describe("preExtractMemories", () => {
+  it("only extracts from newMessages, not background messages", () => {
+    const newMessages: ConversationMessage[] = [
+      makeMsg("m1", "我是前端工程师"),
+    ];
+    const backgroundMessages: ConversationMessage[] = [
+      makeMsg("bg1", "我喜欢吃川菜"),
+    ];
+
+    const result = preExtractMemories(newMessages);
+
+    // newMessage's pattern should be detected
+    expect(result.direct.length).toBeGreaterThanOrEqual(1);
+    expect(result.direct.some((m) => m.content.includes("前端工程师"))).toBe(true);
+
+    // Background pattern IS detectable in isolation (MEDIUM confidence)
+    const bgResult = preExtractMemories(backgroundMessages);
+    const bgMatches = [...bgResult.direct, ...bgResult.hints];
+    expect(bgMatches.some((m) => m.content.includes("川菜"))).toBe(true);
+
+    // But the newMessages-only result should NOT contain background pattern
+    const allMatches = [...result.direct, ...result.hints];
+    expect(allMatches.some((m) => m.content.includes("川菜"))).toBe(false);
+  });
+
+  it("correctly detects HIGH-confidence persona patterns", () => {
+    const messages: ConversationMessage[] = [makeMsg("m1", "我是产品经理")];
+
+    const result = preExtractMemories(messages);
+
+    expect(result.direct.length).toBe(1);
+    expect(result.direct[0]).toMatchObject({
+      content: "用户是产品经理",
+      type: "persona",
+      confidence: "HIGH",
+    });
+    expect(result.direct[0].priority).toBe(80);
+  });
+
+  it("correctly detects HIGH-confidence instruction patterns", () => {
+    const messages: ConversationMessage[] = [makeMsg("m1", "以后都用中文回复我")];
+
+    const result = preExtractMemories(messages);
+
+    expect(result.direct.length).toBe(1);
+    expect(result.direct[0]).toMatchObject({
+      type: "instruction",
+      confidence: "HIGH",
+    });
+    expect(result.direct[0].content).toContain("中文");
+  });
+});
+
+describe("mergeExtractedMemories", () => {
+  it("deduplicates: rule extraction merged only once with LLM results", () => {
+    const llmMemories: ExtractedMemory[] = [
+      makeExtractedMemory("用户是前端工程师", "persona", ["m1"]),
+    ];
+
+    const preResult = preExtractMemories([makeMsg("m1", "我是前端工程师")]);
+
+    const merged = mergeExtractedMemories(llmMemories, preResult);
+
+    const personaMemories = merged.filter((m) =>
+      m.content.includes("前端工程师"),
+    );
+    expect(personaMemories.length).toBe(1);
+  });
+
+  it("adds new HIGH-confidence items not found in LLM results", () => {
+    const llmMemories: ExtractedMemory[] = [
+      makeExtractedMemory("用户讨论了部署流程", "episodic", ["m1"]),
+    ];
+
+    const preResult = preExtractMemories([makeMsg("m1", "我是后端工程师")]);
+
+    const merged = mergeExtractedMemories(llmMemories, preResult);
+
+    expect(merged.length).toBeGreaterThanOrEqual(2);
+    expect(merged.some((m) => m.content.includes("后端工程师"))).toBe(true);
+    expect(merged.some((m) => m.content.includes("部署流程"))).toBe(true);
+  });
+});
+
+describe("callLlmExtraction", () => {
+  it("retries exactly once when first response is malformed JSON", async () => {
+    let callCount = 0;
+
+    const mockRunner = {
+      run: vi.fn().mockImplementation(async () => {
+        callCount++;
+        if (callCount === 1) {
+          return "not valid json at all";
+        }
+        return JSON.stringify([
+          { scene_name: "test", message_ids: ["m1"], memories: [] },
+        ]);
+      }),
+    };
+
+    const scenes = await callLlmExtraction({
+      newMessages: [makeMsg("m1", "hello")],
+      backgroundMessages: [],
+      config: {},
+      llmRunner: mockRunner,
+    });
+
+    expect(callCount).toBe(2);
+    expect(scenes.length).toBe(1);
+    expect(scenes[0].scene_name).toBe("test");
+  });
+
+  it("does NOT retry more than once", async () => {
+    let callCount = 0;
+
+    const mockRunner = {
+      run: vi.fn().mockImplementation(async () => {
+        callCount++;
+        return "still not json {{broken";
+      }),
+    };
+
+    const scenes = await callLlmExtraction({
+      newMessages: [makeMsg("m1", "hello")],
+      backgroundMessages: [],
+      config: {},
+      llmRunner: mockRunner,
+    });
+
+    expect(callCount).toBe(2);
+    expect(scenes.length).toBe(0);
+  });
+});
+
+describe("passesConfidenceCheck", () => {
+  it("accepts valid persona memory with user reference", () => {
+    const mem: ExtractedMemory = makeExtractedMemory(
+      "用户是前端工程师",
+      "persona",
+      ["m1"],
+    );
+    const messages: ConversationMessage[] = [makeMsg("m1", "我是前端工程师")];
+
+    expect(passesConfidenceCheck(mem, messages)).toBe(true);
+  });
+
+  it("accepts valid instruction memory with directive words", () => {
+    const mem: ExtractedMemory = makeExtractedMemory(
+      "用户要求 AI 使用中文回复",
+      "instruction",
+      ["m1"],
+    );
+    const messages: ConversationMessage[] = [makeMsg("m1", "以后都用中文回复")];
+
+    expect(passesConfidenceCheck(mem, messages)).toBe(true);
+  });
+
+  it("rejects persona memory without user reference", () => {
+    const mem: ExtractedMemory = makeExtractedMemory(
+      "前端工程师",
+      "persona",
+      ["m1"],
+    );
+    const messages: ConversationMessage[] = [makeMsg("m1", "前端工程师")];
+
+    expect(passesConfidenceCheck(mem, messages)).toBe(false);
+  });
+
+  it("rejects too-short CJK content", () => {
+    const mem: ExtractedMemory = makeExtractedMemory("你好", "episodic", ["m1"]);
+    const messages: ConversationMessage[] = [makeMsg("m1", "你好")];
+
+    expect(passesConfidenceCheck(mem, messages)).toBe(false);
+  });
+});