From 5a9beb0239779ccd8dc11907ccb33b44d52a4abd Mon Sep 17 00:00:00 2001
From: vakrahul <vakitirahul@gmail.com>
Date: Tue, 2 Jun 2026 10:54:33 +0530
Subject: [PATCH 1/9] feat(agents): implement dynamic chunking and recursive
 map-reduce in summarizer

---
 src/agents/summarizer.py | 168 ++++++++++++++++++++++++++++++++++++++-
 src/models/registry.py   | 119 ++++++++++++++++++++++++++-
 2 files changed, 281 insertions(+), 6 deletions(-)

diff --git a/src/agents/summarizer.py b/src/agents/summarizer.py
index ea01b31d..dff62ad6 100644
--- a/src/agents/summarizer.py
+++ b/src/agents/summarizer.py
@@ -7,6 +7,7 @@
 
 from __future__ import annotations
 
+import asyncio
 from typing import Any, Dict
 
 from langchain_core.language_models import BaseChatModel
@@ -14,15 +15,178 @@
 from src.agents.base import BaseAgent
 from src.prompts.summarizer import build_system_prompt, pack_summary_query
 from src.schemas.summary import SummaryResult
+from src.models.registry import get_model_context_window
 
 
 class SummarizerAgent(BaseAgent):
+    # Dynamic Chunking Configuration
+    MAX_RECURSION_DEPTH = 3
+    CHUNK_OVERLAP_TOKENS = 200
+    SAFE_THRESHOLD_RATIO = 0.8  # Use 80% of context window as safe limit
+
+    # Rate limit retry configuration
+    MAX_RETRY_ATTEMPTS = 3
+    INITIAL_BACKOFF_SECONDS = 1.0
+
     def __init__(self, model: BaseChatModel) -> None:
         super().__init__(
             model=model,
             name="summarizer",
             system_prompt=build_system_prompt(),
         )
+        # Dynamically determine max chunk tokens from the model's context window
+        self._init_dynamic_chunk_tokens()
+
+    def _init_dynamic_chunk_tokens(self) -> None:
+        """
+        Initialize MAX_CHUNK_TOKENS based on the active model's context window.
+        This ensures we stay well within the model's limits across all providers.
+        """
+        try:
+            # Try to detect provider and model name from the model instance
+            provider = self._detect_provider()
+            model_name = getattr(
+                self.model, "model", getattr(self.model, "model_name", None)
+            )
+
+            context_window = get_model_context_window(provider, model_name)
+            # Calculate safe chunk size at 80% of context window
+            self.MAX_CHUNK_TOKENS = max(
+                int(context_window * self.SAFE_THRESHOLD_RATIO), 2000
+            )
+
+            self.logger.info(
+                f"Initialized dynamic chunking: context_window={context_window}, "
+                f"max_chunk_tokens={self.MAX_CHUNK_TOKENS}"
+            )
+        except Exception as e:
+            # Fallback to conservative default
+            self.MAX_CHUNK_TOKENS = 3000
+            self.logger.warning(
+                f"Failed to initialize dynamic chunk tokens, using fallback (3000): {e}"
+            )
+
+    def _detect_provider(self) -> str:
+        """Detect the provider from the model instance."""
+        model_type = type(self.model).__name__
+
+        # Map LangChain model classes to providers
+        provider_map = {
+            "ChatAnthropic": "claude",
+            "ChatOpenAI": "openai",
+            "ChatGoogleGenerativeAI": "gemini",
+            "ChatGroq": "groq",
+            "OllamaLLM": "ollama",
+            "ChatBedrock": "bedrock",
+        }
+
+        for class_name, provider in provider_map.items():
+            if class_name in model_type:
+                return provider
+
+        # Default to openai if we can't determine
+        return "openai"
+
+    async def _call_model_with_retry(self, messages: list) -> str:
+        """
+        Call the model with exponential backoff retry logic for rate limits.
+
+        Handles rate limit (429) responses gracefully by retrying with
+        exponential backoff instead of failing immediately.
+        """
+        backoff_seconds = self.INITIAL_BACKOFF_SECONDS
+
+        for attempt in range(self.MAX_RETRY_ATTEMPTS):
+            try:
+                return await self._call_model(messages)
+            except Exception as e:
+                error_msg = str(e).lower()
+                is_rate_limit = (
+                    "429" in error_msg
+                    or "rate limit" in error_msg
+                    or "quota" in error_msg
+                    or "too many requests" in error_msg
+                )
+
+                if not is_rate_limit or attempt == self.MAX_RETRY_ATTEMPTS - 1:
+                    # Not a rate limit error, or last attempt - raise
+                    raise
+
+                self.logger.warning(
+                    f"Rate limit hit (attempt {attempt + 1}/{self.MAX_RETRY_ATTEMPTS}). "
+                    f"Retrying in {backoff_seconds:.1f}s..."
+                )
+                await asyncio.sleep(backoff_seconds)
+                backoff_seconds *= 2  # Exponential backoff
+
+    def _estimate_tokens(self, text: str) -> int:
+        """Lightweight token estimation (approx 4 characters per token)."""
+        return len(text) // 4
+
+    def _chunk_payload(self, text: str) -> list[str]:
+        """Splits text into overlapping chunks based on token limits."""
+        words = text.split(" ")
+        chunks = []
+        current_chunk = []
+        current_tokens = 0
+
+        for word in words:
+            word_tokens = self._estimate_tokens(word + " ")
+            if current_tokens + word_tokens > self.MAX_CHUNK_TOKENS and current_chunk:
+                # Save the current chunk
+                chunks.append(" ".join(current_chunk))
+                # Keep the overlap at the end of the chunk to prevent lost context
+                overlap_words = (
+                    current_chunk[-(self.CHUNK_OVERLAP_TOKENS // 2) :]
+                    if self.CHUNK_OVERLAP_TOKENS
+                    else []
+                )
+                current_chunk = overlap_words + [word]
+                current_tokens = sum(
+                    self._estimate_tokens(w + " ") for w in current_chunk
+                )
+            else:
+                current_chunk.append(word)
+                current_tokens += word_tokens
+
+        if current_chunk:
+            chunks.append(" ".join(current_chunk))
+
+        return chunks
+
+    async def _recursive_summarize(self, text: str, depth: int = 0) -> str:
+        """Stateful graph-based loop to chunk, summarize, and map-reduce."""
+        if depth >= self.MAX_RECURSION_DEPTH:
+            self.logger.warning(
+                f"Max recursion depth ({self.MAX_RECURSION_DEPTH}) reached. Truncating payload."
+            )
+            messages = self._build_messages(text[: self.MAX_CHUNK_TOKENS * 4])
+            return await self._call_model_with_retry(messages)
+
+        estimated_tokens = self._estimate_tokens(text)
+
+        # Base Case: Payload fits safely within the context window
+        if estimated_tokens <= self.MAX_CHUNK_TOKENS:
+            messages = self._build_messages(text)
+            return await self._call_model_with_retry(messages)
+
+        # Recursive Case: Split large payloads and map-reduce
+        self.logger.info(
+            f"Payload too large ({estimated_tokens} tokens). Splitting into chunks (Depth: {depth})."
+        )
+        chunks = self._chunk_payload(text)
+
+        chunk_summaries = []
+        for i, chunk in enumerate(chunks):
+            self.logger.debug(f"Summarizing chunk {i + 1}/{len(chunks)}...")
+            messages = self._build_messages(chunk)
+            chunk_summary = await self._call_model_with_retry(messages)
+            chunk_summaries.append(chunk_summary.strip())
+
+        # Map-reduce: Combine partial summaries and feed them back into the loop
+        aggregated_text = "\n\n--- PARTIAL SUMMARIES ---\n\n".join(chunk_summaries)
+
+        return await self._recursive_summarize(aggregated_text, depth=depth + 1)
 
     async def arun(
         self,
@@ -36,9 +200,9 @@ async def arun(
             return SummaryResult()
 
         user_message = pack_summary_query(user_query, agent_response)
-        messages = self._build_messages(user_message)
-        raw_content = await self._call_model(messages)
 
+        # Route through the new dynamic chunking pipeline
+        raw_content = await self._recursive_summarize(user_message)
         summary = raw_content.strip()
 
         # Treat empty-like responses as no summary
diff --git a/src/models/registry.py b/src/models/registry.py
index 8c3fd163..24a74946 100644
--- a/src/models/registry.py
+++ b/src/models/registry.py
@@ -23,6 +23,71 @@
 logger = logging.getLogger("xmem.models")
 
 
+# ─────────────────────────────────────────────────────────────────────────────
+# Context Window Mappings (in tokens) for each model
+# ─────────────────────────────────────────────────────────────────────────────
+
+_CONTEXT_WINDOWS = {
+    # Claude models
+    "claude": {
+        "claude-3-5-sonnet-20241022": 200000,
+        "claude-3-5-sonnet": 200000,
+        "claude-3-sonnet-20240229": 200000,
+        "claude-3-opus-20240229": 200000,
+        "claude-3-haiku-20240307": 200000,
+        "claude-opus": 200000,
+        "claude-sonnet": 200000,
+        "claude-haiku": 200000,
+        "default": 200000,
+    },
+    # OpenAI models
+    "openai": {
+        "gpt-4o": 128000,
+        "gpt-4-turbo": 128000,
+        "gpt-4": 8192,
+        "gpt-3.5-turbo": 16385,
+        "default": 128000,
+    },
+    # Gemini models
+    "gemini": {
+        "gemini-2.0-flash": 1000000,
+        "gemini-2.0-pro": 1000000,
+        "gemini-1.5-pro": 1000000,
+        "gemini-1.5-flash": 1000000,
+        "gemini-pro": 32768,
+        "default": 1000000,
+    },
+    # DeepSeek models
+    "deepseek": {
+        "deepseek-chat": 128000,
+        "deepseek-coder": 128000,
+        "default": 128000,
+    },
+    # Groq models
+    "groq": {
+        "mixtral-8x7b-32768": 32768,
+        "llama2-70b-4096": 4096,
+        "default": 32768,
+    },
+    # OpenRouter (varies by model, use conservative default)
+    "openrouter": {
+        "default": 128000,
+    },
+    # Ollama (local, typically depends on model)
+    "ollama": {
+        "default": 8000,
+    },
+    # Bedrock (varies by model)
+    "bedrock": {
+        "default": 100000,
+    },
+    # Mimo
+    "mimo": {
+        "default": 32768,
+    },
+}
+
+
 def _build_from_module(module_name: str, func_name: str, **kwargs) -> BaseChatModel:
     module = importlib.import_module(f"src.models.{module_name}")
     factory_fn = getattr(module, func_name)
@@ -33,10 +98,14 @@ def _build_from_module(module_name: str, func_name: str, **kwargs) -> BaseChatMo
     "gemini": lambda **kw: _build_from_module("gemini", "build_gemini_model", **kw),
     "claude": lambda **kw: _build_from_module("claude", "build_claude_model", **kw),
     "openai": lambda **kw: _build_from_module("openai", "build_openai_model", **kw),
-    "deepseek": lambda **kw: _build_from_module("deepseek", "build_deepseek_model", **kw),
+    "deepseek": lambda **kw: _build_from_module(
+        "deepseek", "build_deepseek_model", **kw
+    ),
     "groq": lambda **kw: _build_from_module("groq", "build_groq_model", **kw),
     "mimo": lambda **kw: _build_from_module("mimo", "build_mimo_model", **kw),
-    "openrouter": lambda **kw: _build_from_module("openrouter", "build_openrouter_model", **kw),
+    "openrouter": lambda **kw: _build_from_module(
+        "openrouter", "build_openrouter_model", **kw
+    ),
     "bedrock": lambda **kw: _build_from_module("bedrock", "build_bedrock_model", **kw),
     "ollama": lambda **kw: _build_from_module("ollama", "build_ollama_model", **kw),
 }
@@ -55,6 +124,43 @@ def _build_from_module(module_name: str, func_name: str, **kwargs) -> BaseChatMo
 }
 
 
+def get_model_context_window(
+    provider: Provider, model_name: Optional[str] = None
+) -> int:
+    """
+    Retrieve the context window (max tokens) for a given provider and model.
+
+    Args:
+        provider: The provider name (e.g., 'claude', 'openai', 'gemini')
+        model_name: Specific model name. If None, uses the provider default.
+
+    Returns:
+        Context window size in tokens.
+    """
+    if provider not in _CONTEXT_WINDOWS:
+        logger.warning(
+            f"Provider '{provider}' not found in context window mapping. Using 8192 default."
+        )
+        return 8192
+
+    provider_windows = _CONTEXT_WINDOWS[provider]
+
+    if model_name:
+        # Try exact match first
+        if model_name in provider_windows:
+            return provider_windows[model_name]
+        # Try partial match (e.g., "gpt-4o" matches "gpt-4o-mini")
+        for key, window in provider_windows.items():
+            if key != "default" and key in model_name:
+                logger.debug(
+                    f"Matched model '{model_name}' to key '{key}' with context window {window}"
+                )
+                return window
+
+    # Fall back to provider default
+    return provider_windows.get("default", 8192)
+
+
 @lru_cache(maxsize=16)
 def get_model(
     provider: Optional[Provider] = None,
@@ -130,7 +236,9 @@ def get_vision_model(
     """
     if provider:
         vision_name = _VISION_MODEL_MAP[provider]()
-        return get_model(provider=provider, model_name=vision_name, temperature=temperature)
+        return get_model(
+            provider=provider, model_name=vision_name, temperature=temperature
+        )
 
     # Auto-select from fallback order
     errors: list[str] = []
@@ -139,7 +247,10 @@ def get_vision_model(
         if key_fn and key_fn():
             try:
                 vision_name = _VISION_MODEL_MAP[p]()
-                model = _BUILDERS[p](model_name=vision_name, **({"temperature": temperature} if temperature is not None else {}))
+                model = _BUILDERS[p](
+                    model_name=vision_name,
+                    **({"temperature": temperature} if temperature is not None else {}),
+                )
                 logger.info("Using vision provider: %s (model: %s)", p, vision_name)
                 return model
             except Exception as exc:

From 206c45dfe66a4b757fecdaa953d16533e7128795 Mon Sep 17 00:00:00 2001
From: vakrahul <vakitirahul@gmail.com>
Date: Tue, 2 Jun 2026 11:11:33 +0530
Subject: [PATCH 2/9] feat(agents): implement dynamic chunking and recursive
 map-reduce in summarizer and few changes

---
 src/agents/summarizer.py | 61 +++++++++++++++++++++++++++++-----------
 1 file changed, 44 insertions(+), 17 deletions(-)

diff --git a/src/agents/summarizer.py b/src/agents/summarizer.py
index dff62ad6..36cf58d6 100644
--- a/src/agents/summarizer.py
+++ b/src/agents/summarizer.py
@@ -50,9 +50,11 @@ def _init_dynamic_chunk_tokens(self) -> None:
             )
 
             context_window = get_model_context_window(provider, model_name)
-            # Calculate safe chunk size at 80% of context window
-            self.MAX_CHUNK_TOKENS = max(
-                int(context_window * self.SAFE_THRESHOLD_RATIO), 2000
+            # Calculate safe chunk size at 80% of context window, capped at 12k
+            # to prevent "lost in the middle" degradation and output token exhaustion
+            self.MAX_CHUNK_TOKENS = min(
+                max(int(context_window * self.SAFE_THRESHOLD_RATIO), 2000),
+                12000
             )
 
             self.logger.info(
@@ -67,8 +69,13 @@ def _init_dynamic_chunk_tokens(self) -> None:
             )
 
     def _detect_provider(self) -> str:
-        """Detect the provider from the model instance."""
-        model_type = type(self.model).__name__
+        """Detect the provider from the model instance, unwrapping RunnableBinding if needed."""
+        # Unwrap RunnableBinding and RunnableWithFallbacks to get the actual model
+        model = self.model
+        while hasattr(model, "bound"):
+            model = model.bound
+
+        model_type = type(model).__name__
 
         # Map LangChain model classes to providers
         provider_map = {
@@ -124,8 +131,13 @@ def _estimate_tokens(self, text: str) -> int:
         return len(text) // 4
 
     def _chunk_payload(self, text: str) -> list[str]:
-        """Splits text into overlapping chunks based on token limits."""
-        words = text.split(" ")
+        """Splits text into overlapping chunks based on token limits.
+        
+        Fixes: Uses text.split() for proper whitespace handling and ensures
+        overlap calculation doesn't create infinite loops when single words
+        exceed MAX_CHUNK_TOKENS.
+        """
+        words = text.split()
         chunks = []
         current_chunk = []
         current_tokens = 0
@@ -135,12 +147,22 @@ def _chunk_payload(self, text: str) -> list[str]:
             if current_tokens + word_tokens > self.MAX_CHUNK_TOKENS and current_chunk:
                 # Save the current chunk
                 chunks.append(" ".join(current_chunk))
-                # Keep the overlap at the end of the chunk to prevent lost context
-                overlap_words = (
-                    current_chunk[-(self.CHUNK_OVERLAP_TOKENS // 2) :]
-                    if self.CHUNK_OVERLAP_TOKENS
-                    else []
-                )
+
+                # Calculate overlap by counting tokens from the end of current_chunk
+                overlap_words = []
+                overlap_tokens = 0
+                for w in reversed(current_chunk):
+                    w_tokens = self._estimate_tokens(w + " ")
+                    if overlap_tokens + w_tokens > self.CHUNK_OVERLAP_TOKENS:
+                        break
+                    overlap_words.insert(0, w)
+                    overlap_tokens += w_tokens
+
+                # Safety check: ensure overlap is strictly smaller than current_chunk
+                # to prevent infinite loops/bloat when single words exceed MAX_CHUNK_TOKENS
+                if len(overlap_words) >= len(current_chunk):
+                    overlap_words = current_chunk[1:] if len(current_chunk) > 1 else []
+
                 current_chunk = overlap_words + [word]
                 current_tokens = sum(
                     self._estimate_tokens(w + " ") for w in current_chunk
@@ -176,12 +198,17 @@ async def _recursive_summarize(self, text: str, depth: int = 0) -> str:
         )
         chunks = self._chunk_payload(text)
 
-        chunk_summaries = []
+        # Summarize chunks concurrently to improve performance
+        # (avoids sequential processing that causes high latency)
+        tasks = []
         for i, chunk in enumerate(chunks):
-            self.logger.debug(f"Summarizing chunk {i + 1}/{len(chunks)}...")
+            self.logger.debug(f"Queuing chunk {i + 1}/{len(chunks)} for concurrent summarization...")
             messages = self._build_messages(chunk)
-            chunk_summary = await self._call_model_with_retry(messages)
-            chunk_summaries.append(chunk_summary.strip())
+            tasks.append(self._call_model_with_retry(messages))
+
+        self.logger.debug(f"Processing {len(chunks)} chunks concurrently...")
+        chunk_summaries = await asyncio.gather(*tasks)
+        chunk_summaries = [s.strip() for s in chunk_summaries]
 
         # Map-reduce: Combine partial summaries and feed them back into the loop
         aggregated_text = "\n\n--- PARTIAL SUMMARIES ---\n\n".join(chunk_summaries)

From 9248d51e3a79d8ffa37cc27938597b8aaab5793c Mon Sep 17 00:00:00 2001
From: Rahul vakiti <143106829+vakrahul@users.noreply.github.com>
Date: Tue, 2 Jun 2026 11:17:37 +0530
Subject: [PATCH 3/9] Update src/agents/summarizer.py

Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
---
 src/agents/summarizer.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/agents/summarizer.py b/src/agents/summarizer.py
index 36cf58d6..bea484d9 100644
--- a/src/agents/summarizer.py
+++ b/src/agents/summarizer.py
@@ -84,7 +84,10 @@ def _detect_provider(self) -> str:
             "ChatGoogleGenerativeAI": "gemini",
             "ChatGroq": "groq",
             "OllamaLLM": "ollama",
+            "ChatOllama": "ollama",
             "ChatBedrock": "bedrock",
+            "ChatDeepSeek": "deepseek",
+            "ChatMimo": "mimo",
         }
 
         for class_name, provider in provider_map.items():

From 9d327dc3f4a26f9d4e6a1fd5a91e6d491bda3c58 Mon Sep 17 00:00:00 2001
From: Rahul vakiti <143106829+vakrahul@users.noreply.github.com>
Date: Tue, 2 Jun 2026 11:17:51 +0530
Subject: [PATCH 4/9] Update src/agents/summarizer.py

Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
---
 src/agents/summarizer.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/agents/summarizer.py b/src/agents/summarizer.py
index bea484d9..b2d27f65 100644
--- a/src/agents/summarizer.py
+++ b/src/agents/summarizer.py
@@ -210,8 +210,14 @@ async def _recursive_summarize(self, text: str, depth: int = 0) -> str:
             tasks.append(self._call_model_with_retry(messages))
 
         self.logger.debug(f"Processing {len(chunks)} chunks concurrently...")
-        chunk_summaries = await asyncio.gather(*tasks)
-        chunk_summaries = [s.strip() for s in chunk_summaries]
+        results = await asyncio.gather(*tasks, return_exceptions=True)
+
+        # Surface any failures rather than silently producing wrong output
+        exceptions = [r for r in results if isinstance(r, BaseException)]
+        if exceptions:
+            raise exceptions[0]
+
+        chunk_summaries = [str(s).strip() for s in results]
 
         # Map-reduce: Combine partial summaries and feed them back into the loop
         aggregated_text = "\n\n--- PARTIAL SUMMARIES ---\n\n".join(chunk_summaries)

From e5e0634f021dfa3dced6c98c448f7e253f6934d5 Mon Sep 17 00:00:00 2001
From: Rahul vakiti <143106829+vakrahul@users.noreply.github.com>
Date: Tue, 2 Jun 2026 11:39:29 +0530
Subject: [PATCH 5/9] Update src/models/registry.py

Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
---
 src/models/registry.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/models/registry.py b/src/models/registry.py
index 24a74946..862c5e4a 100644
--- a/src/models/registry.py
+++ b/src/models/registry.py
@@ -150,8 +150,12 @@ def get_model_context_window(
         if model_name in provider_windows:
             return provider_windows[model_name]
         # Try partial match (e.g., "gpt-4o" matches "gpt-4o-mini")
-        for key, window in provider_windows.items():
-            if key != "default" and key in model_name:
+        # Sort by key length descending so more-specific keys win over shorter prefixes
+        for key, window in sorted(
+            ((k, v) for k, v in provider_windows.items() if k != "default"),
+            key=lambda kv: len(kv[0]),
+            reverse=True,
+        ):
                 logger.debug(
                     f"Matched model '{model_name}' to key '{key}' with context window {window}"
                 )

From ca9992bfab0c5e5a0e94c2b768276cc1284edb5e Mon Sep 17 00:00:00 2001
From: Rahul vakiti <143106829+vakrahul@users.noreply.github.com>
Date: Tue, 2 Jun 2026 11:44:21 +0530
Subject: [PATCH 6/9] Update src/models/registry.py

Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
---
 src/models/registry.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/models/registry.py b/src/models/registry.py
index 862c5e4a..128e22b1 100644
--- a/src/models/registry.py
+++ b/src/models/registry.py
@@ -156,6 +156,7 @@ def get_model_context_window(
             key=lambda kv: len(kv[0]),
             reverse=True,
         ):
+            if key in model_name:
                 logger.debug(
                     f"Matched model '{model_name}' to key '{key}' with context window {window}"
                 )

From 2de7f313dc2ef5b6b97816b6e830a4b037159e0f Mon Sep 17 00:00:00 2001
From: Rahul vakiti <143106829+vakrahul@users.noreply.github.com>
Date: Tue, 2 Jun 2026 13:15:33 +0530
Subject: [PATCH 7/9] Refactor SummarizerAgent for dynamic chunking

---
 src/agents/summarizer.py | 45 +++++++++++++++++++---------------------
 1 file changed, 21 insertions(+), 24 deletions(-)

diff --git a/src/agents/summarizer.py b/src/agents/summarizer.py
index b2d27f65..1f1794bf 100644
--- a/src/agents/summarizer.py
+++ b/src/agents/summarizer.py
@@ -19,12 +19,12 @@
 
 
 class SummarizerAgent(BaseAgent):
-    # Dynamic Chunking Configuration
+  
     MAX_RECURSION_DEPTH = 3
     CHUNK_OVERLAP_TOKENS = 200
-    SAFE_THRESHOLD_RATIO = 0.8  # Use 80% of context window as safe limit
+    SAFE_THRESHOLD_RATIO = 0.8  
 
-    # Rate limit retry configuration
+  
     MAX_RETRY_ATTEMPTS = 3
     INITIAL_BACKOFF_SECONDS = 1.0
 
@@ -34,7 +34,7 @@ def __init__(self, model: BaseChatModel) -> None:
             name="summarizer",
             system_prompt=build_system_prompt(),
         )
-        # Dynamically determine max chunk tokens from the model's context window
+ 
         self._init_dynamic_chunk_tokens()
 
     def _init_dynamic_chunk_tokens(self) -> None:
@@ -43,15 +43,14 @@ def _init_dynamic_chunk_tokens(self) -> None:
         This ensures we stay well within the model's limits across all providers.
         """
         try:
-            # Try to detect provider and model name from the model instance
+         
             provider = self._detect_provider()
             model_name = getattr(
                 self.model, "model", getattr(self.model, "model_name", None)
             )
 
             context_window = get_model_context_window(provider, model_name)
-            # Calculate safe chunk size at 80% of context window, capped at 12k
-            # to prevent "lost in the middle" degradation and output token exhaustion
+         
             self.MAX_CHUNK_TOKENS = min(
                 max(int(context_window * self.SAFE_THRESHOLD_RATIO), 2000),
                 12000
@@ -62,7 +61,7 @@ def _init_dynamic_chunk_tokens(self) -> None:
                 f"max_chunk_tokens={self.MAX_CHUNK_TOKENS}"
             )
         except Exception as e:
-            # Fallback to conservative default
+         
             self.MAX_CHUNK_TOKENS = 3000
             self.logger.warning(
                 f"Failed to initialize dynamic chunk tokens, using fallback (3000): {e}"
@@ -70,14 +69,14 @@ def _init_dynamic_chunk_tokens(self) -> None:
 
     def _detect_provider(self) -> str:
         """Detect the provider from the model instance, unwrapping RunnableBinding if needed."""
-        # Unwrap RunnableBinding and RunnableWithFallbacks to get the actual model
+
         model = self.model
         while hasattr(model, "bound"):
             model = model.bound
 
         model_type = type(model).__name__
 
-        # Map LangChain model classes to providers
+      
         provider_map = {
             "ChatAnthropic": "claude",
             "ChatOpenAI": "openai",
@@ -94,7 +93,7 @@ def _detect_provider(self) -> str:
             if class_name in model_type:
                 return provider
 
-        # Default to openai if we can't determine
+      
         return "openai"
 
     async def _call_model_with_retry(self, messages: list) -> str:
@@ -119,7 +118,7 @@ async def _call_model_with_retry(self, messages: list) -> str:
                 )
 
                 if not is_rate_limit or attempt == self.MAX_RETRY_ATTEMPTS - 1:
-                    # Not a rate limit error, or last attempt - raise
+                 
                     raise
 
                 self.logger.warning(
@@ -127,7 +126,7 @@ async def _call_model_with_retry(self, messages: list) -> str:
                     f"Retrying in {backoff_seconds:.1f}s..."
                 )
                 await asyncio.sleep(backoff_seconds)
-                backoff_seconds *= 2  # Exponential backoff
+                backoff_seconds *= 2  
 
     def _estimate_tokens(self, text: str) -> int:
         """Lightweight token estimation (approx 4 characters per token)."""
@@ -148,10 +147,10 @@ def _chunk_payload(self, text: str) -> list[str]:
         for word in words:
             word_tokens = self._estimate_tokens(word + " ")
             if current_tokens + word_tokens > self.MAX_CHUNK_TOKENS and current_chunk:
-                # Save the current chunk
+               
                 chunks.append(" ".join(current_chunk))
 
-                # Calculate overlap by counting tokens from the end of current_chunk
+               
                 overlap_words = []
                 overlap_tokens = 0
                 for w in reversed(current_chunk):
@@ -161,8 +160,8 @@ def _chunk_payload(self, text: str) -> list[str]:
                     overlap_words.insert(0, w)
                     overlap_tokens += w_tokens
 
-                # Safety check: ensure overlap is strictly smaller than current_chunk
-                # to prevent infinite loops/bloat when single words exceed MAX_CHUNK_TOKENS
+             
+              
                 if len(overlap_words) >= len(current_chunk):
                     overlap_words = current_chunk[1:] if len(current_chunk) > 1 else []
 
@@ -195,14 +194,13 @@ async def _recursive_summarize(self, text: str, depth: int = 0) -> str:
             messages = self._build_messages(text)
             return await self._call_model_with_retry(messages)
 
-        # Recursive Case: Split large payloads and map-reduce
+      
         self.logger.info(
             f"Payload too large ({estimated_tokens} tokens). Splitting into chunks (Depth: {depth})."
         )
         chunks = self._chunk_payload(text)
 
-        # Summarize chunks concurrently to improve performance
-        # (avoids sequential processing that causes high latency)
+    
         tasks = []
         for i, chunk in enumerate(chunks):
             self.logger.debug(f"Queuing chunk {i + 1}/{len(chunks)} for concurrent summarization...")
@@ -212,14 +210,13 @@ async def _recursive_summarize(self, text: str, depth: int = 0) -> str:
         self.logger.debug(f"Processing {len(chunks)} chunks concurrently...")
         results = await asyncio.gather(*tasks, return_exceptions=True)
 
-        # Surface any failures rather than silently producing wrong output
+       
         exceptions = [r for r in results if isinstance(r, BaseException)]
         if exceptions:
             raise exceptions[0]
 
         chunk_summaries = [str(s).strip() for s in results]
 
-        # Map-reduce: Combine partial summaries and feed them back into the loop
         aggregated_text = "\n\n--- PARTIAL SUMMARIES ---\n\n".join(chunk_summaries)
 
         return await self._recursive_summarize(aggregated_text, depth=depth + 1)
@@ -237,11 +234,11 @@ async def arun(
 
         user_message = pack_summary_query(user_query, agent_response)
 
-        # Route through the new dynamic chunking pipeline
+       
         raw_content = await self._recursive_summarize(user_message)
         summary = raw_content.strip()
 
-        # Treat empty-like responses as no summary
+        
         if summary in ('""', "''", "empty", "(empty)", "(empty string)"):
             summary = ""
 

From 747033e1bb24f865d665d554dd47fb77faac447d Mon Sep 17 00:00:00 2001
From: Rahul vakiti <143106829+vakrahul@users.noreply.github.com>
Date: Tue, 2 Jun 2026 13:17:57 +0530
Subject: [PATCH 8/9] Refactor registry.py by removing comments and whitespace

Removed commented sections and cleaned up formatting.
---
 src/models/registry.py | 26 +++++++++++---------------
 1 file changed, 11 insertions(+), 15 deletions(-)

diff --git a/src/models/registry.py b/src/models/registry.py
index 128e22b1..af76003f 100644
--- a/src/models/registry.py
+++ b/src/models/registry.py
@@ -23,12 +23,10 @@
 logger = logging.getLogger("xmem.models")
 
 
-# ─────────────────────────────────────────────────────────────────────────────
-# Context Window Mappings (in tokens) for each model
-# ─────────────────────────────────────────────────────────────────────────────
+
 
 _CONTEXT_WINDOWS = {
-    # Claude models
+
     "claude": {
         "claude-3-5-sonnet-20241022": 200000,
         "claude-3-5-sonnet": 200000,
@@ -40,7 +38,7 @@
         "claude-haiku": 200000,
         "default": 200000,
     },
-    # OpenAI models
+    
     "openai": {
         "gpt-4o": 128000,
         "gpt-4-turbo": 128000,
@@ -63,7 +61,7 @@
         "deepseek-coder": 128000,
         "default": 128000,
     },
-    # Groq models
+    
     "groq": {
         "mixtral-8x7b-32768": 32768,
         "llama2-70b-4096": 4096,
@@ -146,11 +144,11 @@ def get_model_context_window(
     provider_windows = _CONTEXT_WINDOWS[provider]
 
     if model_name:
-        # Try exact match first
+        
         if model_name in provider_windows:
             return provider_windows[model_name]
-        # Try partial match (e.g., "gpt-4o" matches "gpt-4o-mini")
-        # Sort by key length descending so more-specific keys win over shorter prefixes
+       
+        
         for key, window in sorted(
             ((k, v) for k, v in provider_windows.items() if k != "default"),
             key=lambda kv: len(kv[0]),
@@ -162,7 +160,7 @@ def get_model_context_window(
                 )
                 return window
 
-    # Fall back to provider default
+    
     return provider_windows.get("default", 8192)
 
 
@@ -187,7 +185,7 @@ def get_model(
     if provider:
         return _BUILDERS[provider](**kw)
 
-    # Auto-select from fallback order
+   
     errors: list[str] = []
     for p in settings.fallback_order:
         key_fn = _KEY_MAP.get(p)
@@ -206,9 +204,7 @@ def get_model(
     )
 
 
-# ---------------------------------------------------------------------------
-# Vision model (for image analysis)
-# ---------------------------------------------------------------------------
+
 
 _VISION_MODEL_MAP = {
     "gemini": lambda: settings.gemini_vision_model,
@@ -245,7 +241,7 @@ def get_vision_model(
             provider=provider, model_name=vision_name, temperature=temperature
         )
 
-    # Auto-select from fallback order
+    
     errors: list[str] = []
     for p in settings.fallback_order:
         key_fn = _KEY_MAP.get(p)

From 8110e256d114f643983b5e1afb324f4a31e1c4b6 Mon Sep 17 00:00:00 2001
From: Rahul vakiti <143106829+vakrahul@users.noreply.github.com>
Date: Tue, 2 Jun 2026 13:27:20 +0530
Subject: [PATCH 9/9] Update src/agents/summarizer.py

Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
---
 src/agents/summarizer.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/agents/summarizer.py b/src/agents/summarizer.py
index 1f1794bf..ce8e6bb8 100644
--- a/src/agents/summarizer.py
+++ b/src/agents/summarizer.py
@@ -51,10 +51,7 @@ def _init_dynamic_chunk_tokens(self) -> None:
 
             context_window = get_model_context_window(provider, model_name)
          
-            self.MAX_CHUNK_TOKENS = min(
-                max(int(context_window * self.SAFE_THRESHOLD_RATIO), 2000),
-                12000
-            )
+            self.MAX_CHUNK_TOKENS = max(int(context_window * self.SAFE_THRESHOLD_RATIO), 2000)
 
             self.logger.info(
                 f"Initialized dynamic chunking: context_window={context_window}, "