diff --git a/src/smart-extractor.ts b/src/smart-extractor.ts index 2b6f6808..fbc51c57 100644 --- a/src/smart-extractor.ts +++ b/src/smart-extractor.ts @@ -56,61 +56,6 @@ import { batchDedup } from "./batch-dedup.js"; // Envelope Metadata Stripping // ============================================================================ -const RUNTIME_WRAPPER_LINE_RE = /^\[(?:Subagent Context|Subagent Task)\]\s*/i; -const RUNTIME_WRAPPER_PREFIX_RE = /^\[(?:Subagent Context|Subagent Task)\]/i; -const RUNTIME_WRAPPER_BOILERPLATE_RE = - /(?:You are running as a subagent\b.*?(?:$|(?<=\.)\s+)|Results auto-announce to your requester\.?\s*|do not busy-poll for status\.?\s*|Reply with a brief acknowledgment only\.?\s*|Do not use any memory tools\.?\s*)/gi; - -function stripRuntimeWrapperBoilerplate(text: string): string { - return text - .replace(RUNTIME_WRAPPER_BOILERPLATE_RE, "") - .replace(/\s{2,}/g, " ") - .trim(); -} - -function stripLeadingRuntimeWrappers(text: string): string { - const trimmed = text.trim(); - if (!trimmed) { - return trimmed; - } - - const lines = trimmed.split("\n"); - const cleanedLines: string[] = []; - let strippingLeadIn = true; - - for (const line of lines) { - const current = line.trim(); - - if (strippingLeadIn && current === "") { - continue; - } - - if (strippingLeadIn && RUNTIME_WRAPPER_PREFIX_RE.test(current)) { - const remainder = current.replace(RUNTIME_WRAPPER_LINE_RE, "").trim(); - const cleaned = remainder ? stripRuntimeWrapperBoilerplate(remainder) : ""; - if (cleaned) { - cleanedLines.push(cleaned); - strippingLeadIn = false; - } - continue; - } - - if ( - strippingLeadIn && - /^(?:Results auto-announce to your requester\.?|do not busy-poll for status\.?|Reply with a brief acknowledgment only\.?|Do not use any memory tools\.?)$/i.test( - current, - ) - ) { - continue; - } - - strippingLeadIn = false; - cleanedLines.push(line); - } - - return cleanedLines.join("\n").trim(); -} - /** * Strip platform envelope metadata injected by OpenClaw channels before * the conversation text reaches the extraction LLM. These envelopes contain @@ -124,45 +69,127 @@ function stripLeadingRuntimeWrappers(text: string): string { * - "Sender (untrusted metadata):" + JSON code blocks * - "Replied message (untrusted, for context):" + JSON code blocks * - Standalone JSON blocks containing message_id/sender_id fields + * + * Note: stripLeadingRuntimeWrappers and stripRuntimeWrapperBoilerplate from + * the old implementation are dead code after this refactor — they are not + * called anywhere in the pipeline. They have been removed. */ export function stripEnvelopeMetadata(text: string): string { - // 0. PR #444: strip runtime orchestration wrappers (leading only, not global) - // Preserves PR #444's stripLeadingRuntimeWrappers() — do NOT replace with global regex. - let cleaned = stripLeadingRuntimeWrappers(text); + // Matches wrapper lines: [Subagent Context] or [Subagent Task], possibly with + // inline content on the same line (e.g. "[Subagent Task] Reply with brief ack."). + // Also matches when the wrapper prefix is on its own line ("]\n" = no content after ]). + const WRAPPER_LINE_RE = /^\[(?:Subagent Context|Subagent Task)\](?:\s|$|\n)?/i; + const BOILERPLATE_RE = /^(?:Results auto-announce to your requester\.?|do not busy-poll for status\.?|Reply with a brief acknowledgment only\.?|Do not use any memory tools\.?)$/im; + // Anchored inline variant: only strip boilerplate when it starts the wrapper + // remainder. This avoids erasing legitimate inline payload that merely quotes + // a boilerplate phrase later in the sentence. + // Repeat the anchored segment so composite wrappers like "You are running... + // Results auto-announce..." are fully removed before preserving any payload. + // The subagent running phrase uses (?<=\.)\s+|$ alternation (same as old + // RUNTIME_WRAPPER_BOILERPLATE_RE) so that parenthetical depth like "(depth 1/1)." + // is included before the ending whitespace, correctly stripping the full phrase. + const INLINE_BOILERPLATE_RE = + /^(?:(?:You are running as a subagent\b.*?(?:(?<=\.)\s+|$)|Results auto-announce to your requester\.?\s*|do not busy-poll for status\.?\s*|Reply with a brief acknowledgment only\.?\s*|Do not use any memory tools\.?\s*))+/i; + // Anchor to start of line — prevents quoted/cited false-positives + const SUBAGENT_RUNNING_RE = /^You are running as a subagent\b/i; + + const originalLines = text.split("\n"); + + // Pre-scan: determine if there are leading wrappers. + // Needed to decide whether boilerplate in the leading zone should be stripped + // (boilerplate without a wrapper prefix is preserved — it may be legitimate user text). + // + // FIX (Must Fix 2): Only scan the ACTUAL leading zone — lines before the first + // real user content. Previously scanned ALL lines, causing false positives when + // a wrapper appeared in the trailing zone (e.g. user-pasted quoted text). + let foundLeadingWrapper = false; + for (let i = 0; i < originalLines.length; i++) { + const trimmed = originalLines[i].trim(); + if (trimmed === "") continue; // blank lines are part of leading zone + if (WRAPPER_LINE_RE.test(trimmed)) { foundLeadingWrapper = true; continue; } + if (BOILERPLATE_RE.test(trimmed)) continue; + // First real user content — stop scanning, this is the leading zone boundary + break; + } - // 0b. PR #481: strip Discord/channel forwarded message envelope blocks (per-line) - cleaned = cleaned.replace( - /^<< { const noiseBank = this.config.noiseBank; - if (!noiseBank || !noiseBank.initialized) return texts; const result: string[] = []; diff --git a/test/strip-envelope-metadata.test.mjs b/test/strip-envelope-metadata.test.mjs index a3f8f484..36aed611 100644 --- a/test/strip-envelope-metadata.test.mjs +++ b/test/strip-envelope-metadata.test.mjs @@ -142,6 +142,114 @@ describe("stripEnvelopeMetadata", () => { assert.equal(result, "Actual user content starts here."); }); + // rwmjhb Must Fix #1: "Reply with a brief acknowledgment only." on its own line + // followed by user content — must still be stripped (boilerplate, not user text) + it("strips standalone Reply-with-ack line when followed by user content", () => { + const input = [ + "[Subagent Task] Reply with a brief acknowledgment only.", + "Actual user content starts here.", + ].join("\n"); + + const result = stripEnvelopeMetadata(input); + assert.equal(result, "Actual user content starts here."); + }); + + // rwmjhb Nice to Have #2: multiline wrapper where "You are running as a subagent..." + // appears on a separate line after [Subagent Context] prefix — must be stripped + it("strips multiline wrapper with 'You are running as a subagent' on separate line", () => { + const input = [ + "[Subagent Context]", + "You are running as a subagent (depth 1/1).", + "Results auto-announce to your requester.", + "Actual user content starts here.", + ].join("\n"); + + const result = stripEnvelopeMetadata(input); + assert.equal(result, "Actual user content starts here."); + }); + + // Do-not-false-positive: legitimate user text that happens to match a boilerplate + // phrase — must NOT be stripped when followed by user content + it("preserves legitimate user text that matches boilerplate phrases", () => { + const input = [ + "Do not use any memory tools.", + "I need you to remember my preferences.", + ].join("\n"); + + const result = stripEnvelopeMetadata(input); + assert.match(result, /Do not use any memory tools/); + assert.match(result, /I need you to remember my preferences/); + }); + + // FIX 1 (MAJOR): boilerplate BEFORE wrapper in leading zone — must be PRESERVED + // Root cause: encounteredWrapperYet flag ensures boilerplate is only stripped + // when a wrapper has ALREADY appeared on a previous line, not just because + // a wrapper exists somewhere in the leading zone. + it("preserves boilerplate that appears BEFORE wrapper in leading zone", () => { + const input = [ + "Do not use any memory tools.", + "[Subagent Context]", + "Actual user content starts here.", + ].join("\n"); + + const result = stripEnvelopeMetadata(input); + // Boilerplate BEFORE wrapper must be preserved (not a false positive) + assert.match(result, /Do not use any memory tools/); + assert.match(result, /Actual user content starts here/); + assert.doesNotMatch(result, /Subagent Context/); + }); + + // FIX 2 (MINOR): wrapper with inline content — preserve non-boilerplate remainder + // Old implementation stripped only the wrapper prefix, preserving inline payload. + // New implementation initially dropped the entire line (regression). + // This fix restores inline content preservation. + it("preserves non-boilerplate inline content after wrapper prefix", () => { + const input = [ + "[Subagent Context] Actual user content starts here.", + ].join("\n"); + + const result = stripEnvelopeMetadata(input); + assert.match(result, /Actual user content starts here/); + assert.doesNotMatch(result, /Subagent Context/); + }); + + it("preserves inline wrapper payload that only mentions boilerplate later in the sentence", () => { + const input = [ + "[Subagent Context] User quoted the phrase Reply with a brief acknowledgment only. for documentation.", + ].join("\n"); + + const result = stripEnvelopeMetadata(input); + assert.equal(result, "User quoted the phrase Reply with a brief acknowledgment only. for documentation."); + }); + + // FIX 2 regression: wrapper inline boilerplate should still be stripped + it("strips boilerplate-only inline content after wrapper prefix", () => { + const input = [ + "[Subagent Task] Reply with a brief acknowledgment only.", + ].join("\n"); + + const result = stripEnvelopeMetadata(input); + assert.equal(result, ""); + }); + + it("strips leading inline boilerplate but preserves payload that follows it", () => { + const input = [ + "[Subagent Task] Reply with a brief acknowledgment only. Then summarize the failing test.", + ].join("\n"); + + const result = stripEnvelopeMetadata(input); + assert.equal(result, "Then summarize the failing test."); + }); + + it("strips multiple leading boilerplate phrases before preserving inline payload", () => { + const input = [ + "[Subagent Task] Reply with a brief acknowledgment only. Do not use any memory tools. Actual user content starts here.", + ].join("\n"); + + const result = stripEnvelopeMetadata(input); + assert.equal(result, "Actual user content starts here."); + }); + it("handles Telegram-style envelope headers", () => { const input = [ "System: [2026-03-18 14:21:36 GMT+8] Telegram[bot123] DM | user_456 [msg:12345]", @@ -168,6 +276,22 @@ describe("stripEnvelopeMetadata", () => { assert.doesNotMatch(result, /message_id/); }); + it("strips standalone JSON blocks when sender_id appears before message_id", () => { + const input = [ + "Some text before", + "```json", + '{"sender_id": "ou_yyy", "message_id": "om_xxx", "timestamp": "2026-03-18"}', + "```", + "Some text after", + ].join("\n"); + + const result = stripEnvelopeMetadata(input); + assert.match(result, /Some text before/); + assert.match(result, /Some text after/); + assert.doesNotMatch(result, /message_id/); + assert.doesNotMatch(result, /sender_id/); + }); + it("collapses excessive blank lines after stripping", () => { const input = [ "System: [2026-03-18 14:21:36 GMT+8] Feishu[default] DM | ou_xxx [msg:om_xxx]", @@ -227,4 +351,49 @@ describe("stripEnvelopeMetadata", () => { // regex requires both message_id AND sender_id assert.match(result, /message_id/); }); + + // ----------------------------------------------------------------------- + // Fix 1 regression tests: user content BEFORE boilerplate + // ----------------------------------------------------------------------- + it("preserves boilerplate that appears BEFORE user content (user content first)", () => { + const input = [ + "[Subagent Context]", + "User content first.", + "Results auto-announce to your requester.", + ].join("\n"); + + const result = stripEnvelopeMetadata(input); + // Boilerplate appears AFTER user content, so it's outside the leading zone + // and must be preserved + assert.equal(result, "User content first.\nResults auto-announce to your requester."); + }); + + // ----------------------------------------------------------------------- + // Fix 3 regression tests: consecutive subagent content lines + // ----------------------------------------------------------------------- + it("strips all consecutive subagent content lines in the leading zone", () => { + const input = [ + "[Subagent Context]", + "You are running as a subagent (depth 1/1).", + "You are running as a subagent (depth 2/2).", + "Actual.", + ].join("\n"); + + const result = stripEnvelopeMetadata(input); + assert.equal(result, "Actual."); + }); + + // ----------------------------------------------------------------------- + // Edge case: only wrapper + boilerplate, no user content at all + // ----------------------------------------------------------------------- + it("strips everything when there is only wrapper and boilerplate with no user content", () => { + const input = [ + "[Subagent Context]", + "You are running as a subagent (depth 1/1).", + "Results auto-announce to your requester.", + ].join("\n"); + + const result = stripEnvelopeMetadata(input); + assert.equal(result, ""); + }); });