From 67ca0ddac738c864c522438bdcc0403b9be5b3f4 Mon Sep 17 00:00:00 2001 From: jlin53882 Date: Fri, 10 Apr 2026 15:48:31 +0800 Subject: [PATCH 1/2] fix: stripEnvelopeMetadata - Claim 1 + Claim 2 bug fixes - FIX 1 (MAJOR): encounteredWrapperYet flag replaces global hasLeadingWrapper Boilerplate is now only stripped when a wrapper has ALREADY appeared on a previous line, not just because a wrapper exists somewhere in the leading zone. This fixes the false positive where boilerplate text like 'Do not use any memory tools.' appearing BEFORE a wrapper in the input would incorrectly be deleted. - FIX 2 (MINOR): preserve inline content after wrapper prefix Old implementation stripped only the wrapper prefix, preserving inline payload. New implementation initially dropped the entire line (regression). This fix restores inline content preservation by: 1. Stripping wrapper prefix 2. Removing boilerplate phrases from remainder using INLINE_BOILERPLATE_RE 3. Keeping remainder if non-empty (non-boilerplate content preserved) - Add 3 new test cases covering both fixes Closes: (PR 531 review fixes) --- src/smart-extractor.ts | 204 ++++++++++++++------------ test/strip-envelope-metadata.test.mjs | 126 ++++++++++++++++ 2 files changed, 239 insertions(+), 91 deletions(-) diff --git a/src/smart-extractor.ts b/src/smart-extractor.ts index 2b6f6808..7e33cfdd 100644 --- a/src/smart-extractor.ts +++ b/src/smart-extractor.ts @@ -56,61 +56,6 @@ import { batchDedup } from "./batch-dedup.js"; // Envelope Metadata Stripping // ============================================================================ -const RUNTIME_WRAPPER_LINE_RE = /^\[(?:Subagent Context|Subagent Task)\]\s*/i; -const RUNTIME_WRAPPER_PREFIX_RE = /^\[(?:Subagent Context|Subagent Task)\]/i; -const RUNTIME_WRAPPER_BOILERPLATE_RE = - /(?:You are running as a subagent\b.*?(?:$|(?<=\.)\s+)|Results auto-announce to your requester\.?\s*|do not busy-poll for status\.?\s*|Reply with a brief acknowledgment only\.?\s*|Do not use any 
memory tools\.?\s*)/gi; - -function stripRuntimeWrapperBoilerplate(text: string): string { - return text - .replace(RUNTIME_WRAPPER_BOILERPLATE_RE, "") - .replace(/\s{2,}/g, " ") - .trim(); -} - -function stripLeadingRuntimeWrappers(text: string): string { - const trimmed = text.trim(); - if (!trimmed) { - return trimmed; - } - - const lines = trimmed.split("\n"); - const cleanedLines: string[] = []; - let strippingLeadIn = true; - - for (const line of lines) { - const current = line.trim(); - - if (strippingLeadIn && current === "") { - continue; - } - - if (strippingLeadIn && RUNTIME_WRAPPER_PREFIX_RE.test(current)) { - const remainder = current.replace(RUNTIME_WRAPPER_LINE_RE, "").trim(); - const cleaned = remainder ? stripRuntimeWrapperBoilerplate(remainder) : ""; - if (cleaned) { - cleanedLines.push(cleaned); - strippingLeadIn = false; - } - continue; - } - - if ( - strippingLeadIn && - /^(?:Results auto-announce to your requester\.?|do not busy-poll for status\.?|Reply with a brief acknowledgment only\.?|Do not use any memory tools\.?)$/i.test( - current, - ) - ) { - continue; - } - - strippingLeadIn = false; - cleanedLines.push(line); - } - - return cleanedLines.join("\n").trim(); -} - /** * Strip platform envelope metadata injected by OpenClaw channels before * the conversation text reaches the extraction LLM. These envelopes contain @@ -124,45 +69,123 @@ function stripLeadingRuntimeWrappers(text: string): string { * - "Sender (untrusted metadata):" + JSON code blocks * - "Replied message (untrusted, for context):" + JSON code blocks * - Standalone JSON blocks containing message_id/sender_id fields + * + * Note: stripLeadingRuntimeWrappers and stripRuntimeWrapperBoilerplate from + * the old implementation are dead code after this refactor — they are not + * called anywhere in the pipeline. They have been removed. */ export function stripEnvelopeMetadata(text: string): string { - // 0. 
PR #444: strip runtime orchestration wrappers (leading only, not global) - // Preserves PR #444's stripLeadingRuntimeWrappers() — do NOT replace with global regex. - let cleaned = stripLeadingRuntimeWrappers(text); + // Matches wrapper lines: [Subagent Context] or [Subagent Task], possibly with + // inline content on the same line (e.g. "[Subagent Task] Reply with brief ack."). + // Also matches when the wrapper prefix is on its own line ("]\n" = no content after ]). + const WRAPPER_LINE_RE = /^\[(?:Subagent Context|Subagent Task)\](?:\s|$|\n)?/i; + const BOILERPLATE_RE = /^(?:Results auto-announce to your requester\.?|do not busy-poll for status\.?|Reply with a brief acknowledgment only\.?|Do not use any memory tools\.?)$/im; + // Non-anchored version for inline content: matches boilerplate phrases anywhere in a string. + // The subagent running phrase uses (?<=\.)\s+|$ alternation (same as old + // RUNTIME_WRAPPER_BOILERPLATE_RE) so that parenthetical depth like "(depth 1/1)." + // is included before the ending whitespace, correctly stripping the full phrase. + const INLINE_BOILERPLATE_RE = + /(?:You are running as a subagent\b.*?(?:(?<=\.)\s+|$)|Results auto-announce to your requester\.?|do not busy-poll for status\.?|Reply with a brief acknowledgment only\.?|Do not use any memory tools\.?)/gi; + // Anchor to start of line — prevents quoted/cited false-positives + const SUBAGENT_RUNNING_RE = /^You are running as a subagent\b/i; + + const originalLines = text.split("\n"); + + // Pre-scan: determine if there are leading wrappers. + // Needed to decide whether boilerplate in the leading zone should be stripped + // (boilerplate without a wrapper prefix is preserved — it may be legitimate user text). + // + // FIX (Must Fix 2): Only scan the ACTUAL leading zone — lines before the first + // real user content. Previously scanned ALL lines, causing false positives when + // a wrapper appeared in the trailing zone (e.g. user-pasted quoted text). 
+ let foundLeadingWrapper = false; + for (let i = 0; i < originalLines.length; i++) { + const trimmed = originalLines[i].trim(); + if (trimmed === "") continue; // blank lines are part of leading zone + if (WRAPPER_LINE_RE.test(trimmed)) { foundLeadingWrapper = true; continue; } + if (BOILERPLATE_RE.test(trimmed)) continue; + // First real user content — stop scanning, this is the leading zone boundary + break; + } - // 0b. PR #481: strip Discord/channel forwarded message envelope blocks (per-line) - cleaned = cleaned.replace( - /^<< { const noiseBank = this.config.noiseBank; - if (!noiseBank || !noiseBank.initialized) return texts; const result: string[] = []; diff --git a/test/strip-envelope-metadata.test.mjs b/test/strip-envelope-metadata.test.mjs index a3f8f484..4108de22 100644 --- a/test/strip-envelope-metadata.test.mjs +++ b/test/strip-envelope-metadata.test.mjs @@ -142,6 +142,87 @@ describe("stripEnvelopeMetadata", () => { assert.equal(result, "Actual user content starts here."); }); + // rwmjhb Must Fix #1: "Reply with a brief acknowledgment only." inline after the wrapper prefix, + // followed by user content on the next line — must still be stripped (boilerplate, not user text) + it("strips standalone Reply-with-ack line when followed by user content", () => { + const input = [ + "[Subagent Task] Reply with a brief acknowledgment only.", + "Actual user content starts here.", + ].join("\n"); + + const result = stripEnvelopeMetadata(input); + assert.equal(result, "Actual user content starts here."); + }); + + // rwmjhb Nice to Have #2: multiline wrapper where "You are running as a subagent..." 
+ // appears on a separate line after [Subagent Context] prefix — must be stripped + it("strips multiline wrapper with 'You are running as a subagent' on separate line", () => { + const input = [ + "[Subagent Context]", + "You are running as a subagent (depth 1/1).", + "Results auto-announce to your requester.", + "Actual user content starts here.", + ].join("\n"); + + const result = stripEnvelopeMetadata(input); + assert.equal(result, "Actual user content starts here."); + }); + + // Do-not-false-positive: legitimate user text that happens to match a boilerplate + // phrase — must NOT be stripped when followed by user content + it("preserves legitimate user text that matches boilerplate phrases", () => { + const input = [ + "Do not use any memory tools.", + "I need you to remember my preferences.", + ].join("\n"); + + const result = stripEnvelopeMetadata(input); + assert.match(result, /Do not use any memory tools/); + assert.match(result, /I need you to remember my preferences/); + }); + + // FIX 1 (MAJOR): boilerplate BEFORE wrapper in leading zone — must be PRESERVED + // Root cause: encounteredWrapperYet flag ensures boilerplate is only stripped + // when a wrapper has ALREADY appeared on a previous line, not just because + // a wrapper exists somewhere in the leading zone. + it("preserves boilerplate that appears BEFORE wrapper in leading zone", () => { + const input = [ + "Do not use any memory tools.", + "[Subagent Context]", + "Actual user content starts here.", + ].join("\n"); + + const result = stripEnvelopeMetadata(input); + // Boilerplate BEFORE wrapper must be preserved (not a false positive) + assert.match(result, /Do not use any memory tools/); + assert.match(result, /Actual user content starts here/); + assert.doesNotMatch(result, /Subagent Context/); + }); + + // FIX 2 (MINOR): wrapper with inline content — preserve non-boilerplate remainder + // Old implementation stripped only the wrapper prefix, preserving inline payload. 
+ // New implementation initially dropped the entire line (regression). + // This fix restores inline content preservation. + it("preserves non-boilerplate inline content after wrapper prefix", () => { + const input = [ + "[Subagent Context] Actual user content starts here.", + ].join("\n"); + + const result = stripEnvelopeMetadata(input); + assert.match(result, /Actual user content starts here/); + assert.doesNotMatch(result, /Subagent Context/); + }); + + // FIX 2 regression: wrapper inline boilerplate should still be stripped + it("strips boilerplate-only inline content after wrapper prefix", () => { + const input = [ + "[Subagent Task] Reply with a brief acknowledgment only.", + ].join("\n"); + + const result = stripEnvelopeMetadata(input); + assert.equal(result, ""); + }); + it("handles Telegram-style envelope headers", () => { const input = [ "System: [2026-03-18 14:21:36 GMT+8] Telegram[bot123] DM | user_456 [msg:12345]", @@ -227,4 +308,49 @@ describe("stripEnvelopeMetadata", () => { // regex requires both message_id AND sender_id assert.match(result, /message_id/); }); + + // ----------------------------------------------------------------------- + // Fix 1 regression tests: user content BEFORE boilerplate + // ----------------------------------------------------------------------- + it("preserves boilerplate that appears BEFORE user content (user content first)", () => { + const input = [ + "[Subagent Context]", + "User content first.", + "Results auto-announce to your requester.", + ].join("\n"); + + const result = stripEnvelopeMetadata(input); + // Boilerplate appears AFTER user content, so it's outside the leading zone + // and must be preserved + assert.equal(result, "User content first.\nResults auto-announce to your requester."); + }); + + // ----------------------------------------------------------------------- + // Fix 3 regression tests: consecutive subagent content lines + // 
----------------------------------------------------------------------- + it("strips all consecutive subagent content lines in the leading zone", () => { + const input = [ + "[Subagent Context]", + "You are running as a subagent (depth 1/1).", + "You are running as a subagent (depth 2/2).", + "Actual.", + ].join("\n"); + + const result = stripEnvelopeMetadata(input); + assert.equal(result, "Actual."); + }); + + // ----------------------------------------------------------------------- + // Edge case: only wrapper + boilerplate, no user content at all + // ----------------------------------------------------------------------- + it("strips everything when there is only wrapper and boilerplate with no user content", () => { + const input = [ + "[Subagent Context]", + "You are running as a subagent (depth 1/1).", + "Results auto-announce to your requester.", + ].join("\n"); + + const result = stripEnvelopeMetadata(input); + assert.equal(result, ""); + }); }); From c4bca34f4d8edfa59e98bbb933a1b98ebd601623 Mon Sep 17 00:00:00 2001 From: jlin53882 Date: Mon, 13 Apr 2026 01:04:37 +0800 Subject: [PATCH 2/2] fix: stripEnvelopeMetadata - MR2 anchored regex + hidden JSON key-order bug - MR2: INLINE_BOILERPLATE_RE now anchored with ^ to avoid stripping quoted boilerplate later in payload - Hidden bug: JSON stripper regex only worked when message_id came before sender_id; now uses order-independent lookahead - Added 4 new test cases covering the fixes --- src/smart-extractor.ts | 14 +++++---- test/strip-envelope-metadata.test.mjs | 43 +++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 5 deletions(-) diff --git a/src/smart-extractor.ts b/src/smart-extractor.ts index 7e33cfdd..fbc51c57 100644 --- a/src/smart-extractor.ts +++ b/src/smart-extractor.ts @@ -80,12 +80,16 @@ export function stripEnvelopeMetadata(text: string): string { // Also matches when the wrapper prefix is on its own line ("]\n" = no content after ]). 
const WRAPPER_LINE_RE = /^\[(?:Subagent Context|Subagent Task)\](?:\s|$|\n)?/i; const BOILERPLATE_RE = /^(?:Results auto-announce to your requester\.?|do not busy-poll for status\.?|Reply with a brief acknowledgment only\.?|Do not use any memory tools\.?)$/im; - // Non-anchored version for inline content: matches boilerplate phrases anywhere in a string. + // Anchored inline variant: only strip boilerplate when it starts the wrapper + // remainder. This avoids erasing legitimate inline payload that merely quotes + // a boilerplate phrase later in the sentence. + // Repeat the anchored segment so composite wrappers like "You are running... + // Results auto-announce..." are fully removed before preserving any payload. // The subagent running phrase uses (?<=\.)\s+|$ alternation (same as old // RUNTIME_WRAPPER_BOILERPLATE_RE) so that parenthetical depth like "(depth 1/1)." // is included before the ending whitespace, correctly stripping the full phrase. const INLINE_BOILERPLATE_RE = - /(?:You are running as a subagent\b.*?(?:(?<=\.)\s+|$)|Results auto-announce to your requester\.?|do not busy-poll for status\.?|Reply with a brief acknowledgment only\.?|Do not use any memory tools\.?)/gi; + /^(?:(?:You are running as a subagent\b.*?(?:(?<=\.)\s+|$)|Results auto-announce to your requester\.?\s*|do not busy-poll for status\.?\s*|Reply with a brief acknowledgment only\.?\s*|Do not use any memory tools\.?\s*))+/i; // Anchor to start of line — prevents quoted/cited false-positives const SUBAGENT_RUNNING_RE = /^You are running as a subagent\b/i; @@ -133,8 +137,8 @@ export function stripEnvelopeMetadata(text: string): string { let remainder = afterPrefix; // 2. Remove all boilerplate phrases from remainder (handles inline // wrapper+boilerplate like "[Subagent Context] ... Results auto-announce..."). - // Use INLINE_BOILERPLATE_RE (non-anchored, includes subagent phrase) so - // boilerplate embedded anywhere in the inline content is also removed. 
+ // Use INLINE_BOILERPLATE_RE (anchored, includes subagent phrase) so only + // leading wrapper boilerplate is removed while quoted user payload remains. remainder = remainder.replace(INLINE_BOILERPLATE_RE, "").replace(/\s{2,}/g, " ").trim(); // 3. Keep remainder if non-empty (non-boilerplate inline content preserved); // strip the whole line if only boilerplate was present @@ -202,7 +206,7 @@ export function stripEnvelopeMetadata(text: string): string { // 3. Strip any remaining JSON blocks that look like envelope metadata // (contain message_id and sender_id fields) cleaned = cleaned.replace( - /```json\s*\{[^}]*"message_id"\s*:[^}]*"sender_id"\s*:[^}]*\}\s*```/g, + /```json\s*(?=\{[\s\S]*?"message_id"\s*:)(?=\{[\s\S]*?"sender_id"\s*:)\{[\s\S]*?\}\s*```/g, "", ); diff --git a/test/strip-envelope-metadata.test.mjs b/test/strip-envelope-metadata.test.mjs index 4108de22..36aed611 100644 --- a/test/strip-envelope-metadata.test.mjs +++ b/test/strip-envelope-metadata.test.mjs @@ -213,6 +213,15 @@ describe("stripEnvelopeMetadata", () => { assert.doesNotMatch(result, /Subagent Context/); }); + it("preserves inline wrapper payload that only mentions boilerplate later in the sentence", () => { + const input = [ + "[Subagent Context] User quoted the phrase Reply with a brief acknowledgment only. for documentation.", + ].join("\n"); + + const result = stripEnvelopeMetadata(input); + assert.equal(result, "User quoted the phrase Reply with a brief acknowledgment only. for documentation."); + }); + // FIX 2 regression: wrapper inline boilerplate should still be stripped it("strips boilerplate-only inline content after wrapper prefix", () => { const input = [ @@ -223,6 +232,24 @@ describe("stripEnvelopeMetadata", () => { assert.equal(result, ""); }); + it("strips leading inline boilerplate but preserves payload that follows it", () => { + const input = [ + "[Subagent Task] Reply with a brief acknowledgment only. 
Then summarize the failing test.", + ].join("\n"); + + const result = stripEnvelopeMetadata(input); + assert.equal(result, "Then summarize the failing test."); + }); + + it("strips multiple leading boilerplate phrases before preserving inline payload", () => { + const input = [ + "[Subagent Task] Reply with a brief acknowledgment only. Do not use any memory tools. Actual user content starts here.", + ].join("\n"); + + const result = stripEnvelopeMetadata(input); + assert.equal(result, "Actual user content starts here."); + }); + it("handles Telegram-style envelope headers", () => { const input = [ "System: [2026-03-18 14:21:36 GMT+8] Telegram[bot123] DM | user_456 [msg:12345]", @@ -249,6 +276,22 @@ describe("stripEnvelopeMetadata", () => { assert.doesNotMatch(result, /message_id/); }); + it("strips standalone JSON blocks when sender_id appears before message_id", () => { + const input = [ + "Some text before", + "```json", + '{"sender_id": "ou_yyy", "message_id": "om_xxx", "timestamp": "2026-03-18"}', + "```", + "Some text after", + ].join("\n"); + + const result = stripEnvelopeMetadata(input); + assert.match(result, /Some text before/); + assert.match(result, /Some text after/); + assert.doesNotMatch(result, /message_id/); + assert.doesNotMatch(result, /sender_id/); + }); + it("collapses excessive blank lines after stripping", () => { const input = [ "System: [2026-03-18 14:21:36 GMT+8] Feishu[default] DM | ou_xxx [msg:om_xxx]",