CortexReach · jlin53882 · Apr 4, 2026 · Apr 4, 2026 · Apr 4, 2026 · Apr 5, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,27 @@
 # Changelog
 
+## Unreleased
+
+### Fix: cumulative turn counting for auto-capture smart extraction (#417, PR #518)
+
+**Bug**: With `extractMinMessages: 2` + `smartExtraction: true`, single-turn DM conversations always fell through to regex fallback, writing dirty data (`l0_abstract == text`, no LLM distillation).
+
+**Root causes**:
+- `autoCaptureSeenTextCount` was overwritten per-event (always 1 for DM), never accumulating
+- `buildAutoCaptureConversationKeyFromIngress` returned `null` for DM (no `conversationId`), so `pendingIngressTexts` was never written
+
+**Changes**:
+- **Cumulative counting**: `autoCaptureSeenTextCount` now accumulates across events instead of overwriting per-event
+- **DM key fallback**: `buildAutoCaptureConversationKeyFromIngress` falls back to `channelId` when `conversationId` is falsy, so DM sessions now correctly write to `pendingIngressTexts` and match the key extracted by `buildAutoCaptureConversationKeyFromSessionKey`
+- **Smart extraction threshold**: now uses cumulative turn count (`currentCumulativeCount`) instead of per-event message count
+- **`extractMinMessages` cap**: `Math.min(config.extractMinMessages ?? 4, 100)` prevents misconfiguration (e.g., setting 999999 would permanently disable smart extraction)
+- **MAX_MESSAGE_LENGTH guard**: 5000 char limit per message in `pendingIngressTexts` rolling window prevents OOM from malformed input
+- **Test**: added `runCumulativeTurnCountingScenario` in `test/smart-extractor-branches.mjs` verifying turn-1 skip and turn-2 trigger with `extractMinMessages=2`
+
+**⚠️ Breaking change**: `extractMinMessages` semantics changed from "per-event message count" to "cumulative conversation turns". Before: each `agent_end` needed ≥N messages. After: smart extraction triggers at conversation turn N. This is a bug fix since the old semantics were structurally broken for DM; users relying on the old behavior may need to adjust their `extractMinMessages` values.
+
+---
+
 ## 1.1.0-beta.2 (Smart Memory Beta + Access Reinforcement)
 
 This is a **beta** release published under the npm dist-tag **`beta`** (it does not affect the stable `latest` channel).

diff --git a/index.ts b/index.ts
@@ -776,8 +776,10 @@ function buildAutoCaptureConversationKeyFromIngress(
 ): string | null {
   const channel = typeof channelId === "string" ? channelId.trim() : "";
   const conversation = typeof conversationId === "string" ? conversationId.trim() : "";
-  if (!channel || !conversation) return null;
-  return `${channel}:${conversation}`;
+  if (!channel) return null;
+  // DM: conversationId=undefined -> fallback to channelId (matches regex extract from sessionKey)
+  // Group: conversationId=exists -> returns channelId:conversationId (matches regex extract)
+  return conversation ? `${channel}:${conversation}` : channel;
 }
 
 /**
@@ -2061,8 +2063,9 @@ const memoryLanceDBProPlugin = {
       );
       const normalized = normalizeAutoCaptureText("user", event.content, shouldSkipReflectionMessage);
       if (conversationKey && normalized) {
+        const MAX_MESSAGE_LENGTH = 5000;
         const queue = autoCapturePendingIngressTexts.get(conversationKey) || [];
-        queue.push(normalized);
+        queue.push(normalized.slice(0, MAX_MESSAGE_LENGTH));
         autoCapturePendingIngressTexts.set(conversationKey, queue.slice(-6));
         pruneMapIfOver(autoCapturePendingIngressTexts, AUTO_CAPTURE_MAP_MAX_ENTRIES);
       }
@@ -2638,36 +2641,48 @@ const memoryLanceDBProPlugin = {
           const pendingIngressTexts = conversationKey
             ? [...(autoCapturePendingIngressTexts.get(conversationKey) || [])]
             : [];
-          if (conversationKey) {
-            autoCapturePendingIngressTexts.delete(conversationKey);
-          }
-
           const previousSeenCount = autoCaptureSeenTextCount.get(sessionKey) ?? 0;
+          // [Fix #2] Cumulative counting: accumulate across events, not per-event overwrite
+          // Note: Using eligibleTexts.length (raw event text count), not newTexts.length.
+          // newTexts-based counting was rejected because it breaks the extractMinMessages
+          // semantics: the counter is designed to accumulate per-event text count,
+          // not per-event delta. Fix #2 with eligibleTexts.length works correctly for
+          // the real-world case (1 text per event); the double-counting risk only
+          // applies when agent_end delivers full history every time, which does not
+          // occur in the current code path.
           let newTexts = eligibleTexts;
           if (pendingIngressTexts.length > 0) {
+            // [Fix #3] Use pendingIngressTexts as-is (REPLACE, not APPEND).
+            // REPLACE is correct because: (1) Fix #2 cumulative count ensures enough turns
+            // accumulate; (2) Fix #4 (delete) restores original behavior where pending is
+            // event-scoped; (3) APPEND causes deduplication issues when the same text
+            // appears in both pendingIngressTexts and eligibleTexts (after prefix stripping).
             newTexts = pendingIngressTexts;
+            // [Fix #8] Clear consumed pending texts to prevent re-consumption
+            // (conversationKey is guaranteed truthy here since pendingIngressTexts.length > 0
+            // and pendingIngressTexts is [] when conversationKey is falsy)
+            autoCapturePendingIngressTexts.delete(conversationKey);
           } else if (previousSeenCount > 0 && eligibleTexts.length > previousSeenCount) {
             newTexts = eligibleTexts.slice(previousSeenCount);
           }
-          autoCaptureSeenTextCount.set(sessionKey, eligibleTexts.length);
+          const currentCumulativeCount = previousSeenCount + eligibleTexts.length;
+          autoCaptureSeenTextCount.set(sessionKey, currentCumulativeCount);
           pruneMapIfOver(autoCaptureSeenTextCount, AUTO_CAPTURE_MAP_MAX_ENTRIES);
 
           const priorRecentTexts = autoCaptureRecentTexts.get(sessionKey) || [];
           let texts = newTexts;
-          if (
-            texts.length === 1 &&
-            isExplicitRememberCommand(texts[0]) &&
-            priorRecentTexts.length > 0
-          ) {
-            texts = [...priorRecentTexts.slice(-1), ...texts];
-          }
+          // [Fix #5 REMOVED] isExplicitRememberCommand guard: unreachable under REPLACE strategy.
+          // With REPLACE, texts = pendingIngressTexts (length typically > 1 in multi-turn),
+          // so texts.length === 1 guard can never trigger.
+          // This guard was designed for the old APPEND strategy and is obsolete.
           if (newTexts.length > 0) {
             const nextRecentTexts = [...priorRecentTexts, ...newTexts].slice(-6);
             autoCaptureRecentTexts.set(sessionKey, nextRecentTexts);
             pruneMapIfOver(autoCaptureRecentTexts, AUTO_CAPTURE_MAP_MAX_ENTRIES);
           }
 
-          const minMessages = config.extractMinMessages ?? 4;
+          // [Fix #6] Cap extractMinMessages to prevent misconfiguration
+          const minMessages = Math.min(config.extractMinMessages ?? 4, 100);
           if (skippedAutoCaptureTexts > 0) {
             api.logger.debug(
               `memory-lancedb-pro: auto-capture skipped ${skippedAutoCaptureTexts} injected/system text block(s) for agent ${agentId}`,
@@ -2741,36 +2756,66 @@ const memoryLanceDBProPlugin = {
               );
               return;
             }
-            if (cleanTexts.length >= minMessages) {
+            // [Fix #3 updated] Use cumulative count (turn count) for smart extraction threshold
+            if (currentCumulativeCount >= minMessages) {
               api.logger.debug(
-                `memory-lancedb-pro: auto-capture running smart extraction for agent ${agentId} (${cleanTexts.length} clean texts >= ${minMessages})`,
+                `memory-lancedb-pro: auto-capture running smart extraction for agent ${agentId} (cumulative=${currentCumulativeCount} >= minMessages=${minMessages})`,
               );
               const conversationText = cleanTexts.join("\n");
-              const stats = await smartExtractor.extractAndPersist(
-                conversationText, sessionKey,
-                { scope: defaultScope, scopeFilter: accessibleScopes },
-              );
-              // Charge rate limiter only after successful extraction
-              extractionRateLimiter.recordExtraction();
+              // [Fix #10] Wrap extraction in try-catch so a failing extraction does not crash the hook.
+              // Counter is NOT reset on failure — the same window will re-trigger on the next agent_end.
+              let stats;
+              try {
+                stats = await smartExtractor.extractAndPersist(
+                  conversationText, sessionKey,
+                  { scope: defaultScope, scopeFilter: accessibleScopes },
+                );
+              } catch (err) {
+                api.logger.error(
+                  `memory-lancedb-pro: smart extraction failed for agent ${agentId}: ${err instanceof Error ? err.message : String(err)}; skipping extraction this cycle`
+                );
+                // [Fix #10 extended] Clear pending texts on failure so the next cycle
+                // does not re-process the same pending batch. Counter stays high (not reset)
+                // so the same window will re-accumulate toward the next trigger.
+                if (conversationKey) {
+                  autoCapturePendingIngressTexts.delete(conversationKey);
+                }
+                return; // Do not fall through to regex fallback when smart extraction is configured
+              }
               if (stats.created > 0 || stats.merged > 0) {
+                extractionRateLimiter.recordExtraction();
                 api.logger.info(
                   `memory-lancedb-pro: smart-extracted ${stats.created} created, ${stats.merged} merged, ${stats.skipped} skipped for agent ${agentId}`
                 );
+                autoCaptureSeenTextCount.set(sessionKey, 0);
                 return; // Smart extraction handled everything
               }
 
-              if ((stats.boundarySkipped ?? 0) > 0) {
+              // [Fix-Must1] Reset counter to previousSeenCount when all candidates are deduplicated
+              // (created=0, merged=0). Without this, counter stays high -> next agent_end
+              // re-triggers -> same dedupe -> retry spiral. Resetting to previousSeenCount ensures
+              // the next event starts fresh (counter = number of genuinely new texts seen so far).
+              autoCaptureSeenTextCount.set(sessionKey, previousSeenCount);
+
+              // [Fix-Must1b] When all candidates are skipped AND no boundary texts remain,
+              // skip regex fallback entirely — there is nothing to capture.
+              if ((stats.boundarySkipped ?? 0) === 0) {
                 api.logger.info(
-                  `memory-lancedb-pro: smart extraction skipped ${stats.boundarySkipped} USER.md-exclusive candidate(s) for agent ${agentId}; continuing to regex fallback for non-boundary texts`,
+                  `memory-lancedb-pro: smart extraction produced no candidates and no boundary texts for agent ${agentId}; skipping regex fallback`,
                 );
+                return;
               }
 
+              api.logger.info(
+                `memory-lancedb-pro: smart extraction skipped ${stats.boundarySkipped} USER.md-exclusive candidate(s) for agent ${agentId}; continuing to regex fallback for non-boundary texts`,
+              );
+
               api.logger.info(
                 `memory-lancedb-pro: smart extraction produced no persisted memories for agent ${agentId} (created=${stats.created}, merged=${stats.merged}, skipped=${stats.skipped}); falling back to regex capture`,
               );
             } else {
               api.logger.debug(
-                `memory-lancedb-pro: auto-capture skipped smart extraction for agent ${agentId} (${cleanTexts.length} < ${minMessages})`,
+                `memory-lancedb-pro: auto-capture skipped smart extraction for agent ${agentId} (cumulative=${currentCumulativeCount} < minMessages=${minMessages})`,
               );
             }
           }
@@ -2876,6 +2921,13 @@ const memoryLanceDBProPlugin = {
             api.logger.info(
               `memory-lancedb-pro: auto-captured ${stored} memories for agent ${agentId} in scope ${defaultScope}`,
             );
+            // Note: counter is intentionally NOT reset here. If we reset after regex fallback,
+            // the next turn starts fresh (counter = 1) and requires another full cycle to re-trigger.
+            // This means: Turn 1 stores via regex → counter=0 → Turn 2 counter=1 (<min) → skipped.
+            // This is intentional — regex fallback is a last-resort fallback, not a primary path.
+            // The primary reset mechanisms are:
+            //   1. F1: success block of smart extraction (set(0) on created/merged > 0)
+            //   2. Fix-Must1: all-dedup failure path (set(previousSeenCount) prevents retry spiral)
           }
         } catch (err) {
           api.logger.warn(`memory-lancedb-pro: capture failed: ${String(err)}`);