From 5a9beb0239779ccd8dc11907ccb33b44d52a4abd Mon Sep 17 00:00:00 2001 From: vakrahul Date: Tue, 2 Jun 2026 10:54:33 +0530 Subject: [PATCH 1/9] feat(agents): implement dynamic chunking and recursive map-reduce in summarizer --- src/agents/summarizer.py | 168 ++++++++++++++++++++++++++++++++++++++- src/models/registry.py | 119 ++++++++++++++++++++++++++- 2 files changed, 281 insertions(+), 6 deletions(-) diff --git a/src/agents/summarizer.py b/src/agents/summarizer.py index ea01b31d..dff62ad6 100644 --- a/src/agents/summarizer.py +++ b/src/agents/summarizer.py @@ -7,6 +7,7 @@ from __future__ import annotations +import asyncio from typing import Any, Dict from langchain_core.language_models import BaseChatModel @@ -14,15 +15,178 @@ from src.agents.base import BaseAgent from src.prompts.summarizer import build_system_prompt, pack_summary_query from src.schemas.summary import SummaryResult +from src.models.registry import get_model_context_window class SummarizerAgent(BaseAgent): + # Dynamic Chunking Configuration + MAX_RECURSION_DEPTH = 3 + CHUNK_OVERLAP_TOKENS = 200 + SAFE_THRESHOLD_RATIO = 0.8 # Use 80% of context window as safe limit + + # Rate limit retry configuration + MAX_RETRY_ATTEMPTS = 3 + INITIAL_BACKOFF_SECONDS = 1.0 + def __init__(self, model: BaseChatModel) -> None: super().__init__( model=model, name="summarizer", system_prompt=build_system_prompt(), ) + # Dynamically determine max chunk tokens from the model's context window + self._init_dynamic_chunk_tokens() + + def _init_dynamic_chunk_tokens(self) -> None: + """ + Initialize MAX_CHUNK_TOKENS based on the active model's context window. + This ensures we stay well within the model's limits across all providers. + """ + try: + # Try to detect provider and model name from the model instance + provider = self._detect_provider() + model_name = getattr( + self.model, "model", getattr(self.model, "model_name", None) + ) + + context_window = get_model_context_window(provider, model_name) + # Calculate safe chunk size at 80% of context window + self.MAX_CHUNK_TOKENS = max( + int(context_window * self.SAFE_THRESHOLD_RATIO), 2000 + ) + + self.logger.info( + f"Initialized dynamic chunking: context_window={context_window}, " + f"max_chunk_tokens={self.MAX_CHUNK_TOKENS}" + ) + except Exception as e: + # Fallback to conservative default + self.MAX_CHUNK_TOKENS = 3000 + self.logger.warning( + f"Failed to initialize dynamic chunk tokens, using fallback (3000): {e}" + ) + + def _detect_provider(self) -> str: + """Detect the provider from the model instance.""" + model_type = type(self.model).__name__ + + # Map LangChain model classes to providers + provider_map = { + "ChatAnthropic": "claude", + "ChatOpenAI": "openai", + "ChatGoogleGenerativeAI": "gemini", + "ChatGroq": "groq", + "OllamaLLM": "ollama", + "ChatBedrock": "bedrock", + } + + for class_name, provider in provider_map.items(): + if class_name in model_type: + return provider + + # Default to openai if we can't determine + return "openai" + + async def _call_model_with_retry(self, messages: list) -> str: + """ + Call the model with exponential backoff retry logic for rate limits. + + Handles rate limit (429) responses gracefully by retrying with + exponential backoff instead of failing immediately. + """ + backoff_seconds = self.INITIAL_BACKOFF_SECONDS + + for attempt in range(self.MAX_RETRY_ATTEMPTS): + try: + return await self._call_model(messages) + except Exception as e: + error_msg = str(e).lower() + is_rate_limit = ( + "429" in error_msg + or "rate limit" in error_msg + or "quota" in error_msg + or "too many requests" in error_msg + ) + + if not is_rate_limit or attempt == self.MAX_RETRY_ATTEMPTS - 1: + # Not a rate limit error, or last attempt - raise + raise + + self.logger.warning( + f"Rate limit hit (attempt {attempt + 1}/{self.MAX_RETRY_ATTEMPTS}). " + f"Retrying in {backoff_seconds:.1f}s..." + ) + await asyncio.sleep(backoff_seconds) + backoff_seconds *= 2 # Exponential backoff + + def _estimate_tokens(self, text: str) -> int: + """Lightweight token estimation (approx 4 characters per token).""" + return len(text) // 4 + + def _chunk_payload(self, text: str) -> list[str]: + """Splits text into overlapping chunks based on token limits.""" + words = text.split(" ") + chunks = [] + current_chunk = [] + current_tokens = 0 + + for word in words: + word_tokens = self._estimate_tokens(word + " ") + if current_tokens + word_tokens > self.MAX_CHUNK_TOKENS and current_chunk: + # Save the current chunk + chunks.append(" ".join(current_chunk)) + # Keep the overlap at the end of the chunk to prevent lost context + overlap_words = ( + current_chunk[-(self.CHUNK_OVERLAP_TOKENS // 2) :] + if self.CHUNK_OVERLAP_TOKENS + else [] + ) + current_chunk = overlap_words + [word] + current_tokens = sum( + self._estimate_tokens(w + " ") for w in current_chunk + ) + else: + current_chunk.append(word) + current_tokens += word_tokens + + if current_chunk: + chunks.append(" ".join(current_chunk)) + + return chunks + + async def _recursive_summarize(self, text: str, depth: int = 0) -> str: + """Stateful graph-based loop to chunk, summarize, and map-reduce.""" + if depth >= self.MAX_RECURSION_DEPTH: + self.logger.warning( + f"Max recursion depth ({self.MAX_RECURSION_DEPTH}) reached. Truncating payload." + ) + messages = self._build_messages(text[: self.MAX_CHUNK_TOKENS * 4]) + return await self._call_model_with_retry(messages) + + estimated_tokens = self._estimate_tokens(text) + + # Base Case: Payload fits safely within the context window + if estimated_tokens <= self.MAX_CHUNK_TOKENS: + messages = self._build_messages(text) + return await self._call_model_with_retry(messages) + + # Recursive Case: Split large payloads and map-reduce + self.logger.info( + f"Payload too large ({estimated_tokens} tokens). Splitting into chunks (Depth: {depth})." + ) + chunks = self._chunk_payload(text) + + chunk_summaries = [] + for i, chunk in enumerate(chunks): + self.logger.debug(f"Summarizing chunk {i + 1}/{len(chunks)}...") + messages = self._build_messages(chunk) + chunk_summary = await self._call_model_with_retry(messages) + chunk_summaries.append(chunk_summary.strip()) + + # Map-reduce: Combine partial summaries and feed them back into the loop + aggregated_text = "\n\n--- PARTIAL SUMMARIES ---\n\n".join(chunk_summaries) + + return await self._recursive_summarize(aggregated_text, depth=depth + 1) async def arun( self, @@ -36,9 +200,9 @@ async def arun( return SummaryResult() user_message = pack_summary_query(user_query, agent_response) - messages = self._build_messages(user_message) - raw_content = await self._call_model(messages) + # Route through the new dynamic chunking pipeline + raw_content = await self._recursive_summarize(user_message) summary = raw_content.strip() # Treat empty-like responses as no summary diff --git a/src/models/registry.py b/src/models/registry.py index 8c3fd163..24a74946 100644 --- a/src/models/registry.py +++ b/src/models/registry.py @@ -23,6 +23,71 @@ logger = logging.getLogger("xmem.models") +# ───────────────────────────────────────────────────────────────────────────── +# Context Window Mappings (in tokens) for each model +# ───────────────────────────────────────────────────────────────────────────── + +_CONTEXT_WINDOWS = { + # Claude models + "claude": { + "claude-3-5-sonnet-20241022": 200000, + "claude-3-5-sonnet": 200000, + "claude-3-sonnet-20240229": 200000, + "claude-3-opus-20240229": 200000, + "claude-3-haiku-20240307": 200000, + "claude-opus": 200000, + "claude-sonnet": 200000, + "claude-haiku": 200000, + "default": 200000, + }, + # OpenAI models + "openai": { + "gpt-4o": 128000, + "gpt-4-turbo": 128000, + "gpt-4": 8192, + "gpt-3.5-turbo": 16385, + "default": 128000, + }, + # Gemini models + "gemini": { + "gemini-2.0-flash": 1000000, + "gemini-2.0-pro": 1000000, + "gemini-1.5-pro": 1000000, + "gemini-1.5-flash": 1000000, + "gemini-pro": 32768, + "default": 1000000, + }, + # DeepSeek models + "deepseek": { + "deepseek-chat": 128000, + "deepseek-coder": 128000, + "default": 128000, + }, + # Groq models + "groq": { + "mixtral-8x7b-32768": 32768, + "llama2-70b-4096": 4096, + "default": 32768, + }, + # OpenRouter (varies by model, use conservative default) + "openrouter": { + "default": 128000, + }, + # Ollama (local, typically depends on model) + "ollama": { + "default": 8000, + }, + # Bedrock (varies by model) + "bedrock": { + "default": 100000, + }, + # Mimo + "mimo": { + "default": 32768, + }, +} + + def _build_from_module(module_name: str, func_name: str, **kwargs) -> BaseChatModel: module = importlib.import_module(f"src.models.{module_name}") factory_fn = getattr(module, func_name) @@ -33,10 +98,14 @@ def _build_from_module(module_name: str, func_name: str, **kwargs) -> BaseChatMo "gemini": lambda **kw: _build_from_module("gemini", "build_gemini_model", **kw), "claude": lambda **kw: _build_from_module("claude", "build_claude_model", **kw), "openai": lambda **kw: _build_from_module("openai", "build_openai_model", **kw), - "deepseek": lambda **kw: _build_from_module("deepseek", "build_deepseek_model", **kw), + "deepseek": lambda **kw: _build_from_module( + "deepseek", "build_deepseek_model", **kw + ), "groq": lambda **kw: _build_from_module("groq", "build_groq_model", **kw), "mimo": lambda **kw: _build_from_module("mimo", "build_mimo_model", **kw), - "openrouter": lambda **kw: _build_from_module("openrouter", "build_openrouter_model", **kw), + "openrouter": lambda **kw: _build_from_module( + "openrouter", "build_openrouter_model", **kw + ), "bedrock": lambda **kw: _build_from_module("bedrock", "build_bedrock_model", **kw), "ollama": lambda **kw: _build_from_module("ollama", "build_ollama_model", **kw), } @@ -55,6 +124,43 @@ def _build_from_module(module_name: str, func_name: str, **kwargs) -> BaseChatMo } +def get_model_context_window( + provider: Provider, model_name: Optional[str] = None +) -> int: + """ + Retrieve the context window (max tokens) for a given provider and model. + + Args: + provider: The provider name (e.g., 'claude', 'openai', 'gemini') + model_name: Specific model name. If None, uses the provider default. + + Returns: + Context window size in tokens. + """ + if provider not in _CONTEXT_WINDOWS: + logger.warning( + f"Provider '{provider}' not found in context window mapping. Using 8192 default." + ) + return 8192 + + provider_windows = _CONTEXT_WINDOWS[provider] + + if model_name: + # Try exact match first + if model_name in provider_windows: + return provider_windows[model_name] + # Try partial match (e.g., "gpt-4o" matches "gpt-4o-mini") + for key, window in provider_windows.items(): + if key != "default" and key in model_name: + logger.debug( + f"Matched model '{model_name}' to key '{key}' with context window {window}" + ) + return window + + # Fall back to provider default + return provider_windows.get("default", 8192) + + @lru_cache(maxsize=16) def get_model( provider: Optional[Provider] = None, @@ -130,7 +236,9 @@ def get_vision_model( """ if provider: vision_name = _VISION_MODEL_MAP[provider]() - return get_model(provider=provider, model_name=vision_name, temperature=temperature) + return get_model( + provider=provider, model_name=vision_name, temperature=temperature + ) # Auto-select from fallback order errors: list[str] = [] @@ -139,7 +247,10 @@ def get_vision_model( if key_fn and key_fn(): try: vision_name = _VISION_MODEL_MAP[p]() - model = _BUILDERS[p](model_name=vision_name, **({"temperature": temperature} if temperature is not None else {})) + model = _BUILDERS[p]( + model_name=vision_name, + **({"temperature": temperature} if temperature is not None else {}), + ) logger.info("Using vision provider: %s (model: %s)", p, vision_name) return model except Exception as exc: From 206c45dfe66a4b757fecdaa953d16533e7128795 Mon Sep 17 00:00:00 2001 From: vakrahul Date: Tue, 2 Jun 2026 11:11:33 +0530 Subject: [PATCH 2/9] feat(agents): implement dynamic chunking and recursive map-reduce in summarizer and few changes --- src/agents/summarizer.py | 61 +++++++++++++++++++++++++++++----------- 1 file changed, 44 insertions(+), 17 deletions(-) diff --git a/src/agents/summarizer.py b/src/agents/summarizer.py index dff62ad6..36cf58d6 100644 --- a/src/agents/summarizer.py +++ b/src/agents/summarizer.py @@ -50,9 +50,11 @@ def _init_dynamic_chunk_tokens(self) -> None: ) context_window = get_model_context_window(provider, model_name) - # Calculate safe chunk size at 80% of context window - self.MAX_CHUNK_TOKENS = max( - int(context_window * self.SAFE_THRESHOLD_RATIO), 2000 + # Calculate safe chunk size at 80% of context window, capped at 12k + # to prevent "lost in the middle" degradation and output token exhaustion + self.MAX_CHUNK_TOKENS = min( + max(int(context_window * self.SAFE_THRESHOLD_RATIO), 2000), + 12000 ) self.logger.info( @@ -67,8 +69,13 @@ def _init_dynamic_chunk_tokens(self) -> None: ) def _detect_provider(self) -> str: - """Detect the provider from the model instance.""" - model_type = type(self.model).__name__ + """Detect the provider from the model instance, unwrapping RunnableBinding if needed.""" + # Unwrap RunnableBinding and RunnableWithFallbacks to get the actual model + model = self.model + while hasattr(model, "bound"): + model = model.bound + + model_type = type(model).__name__ # Map LangChain model classes to providers provider_map = { @@ -124,8 +131,13 @@ def _estimate_tokens(self, text: str) -> int: return len(text) // 4 def _chunk_payload(self, text: str) -> list[str]: - """Splits text into overlapping chunks based on token limits.""" - words = text.split(" ") + """Splits text into overlapping chunks based on token limits. + + Fixes: Uses text.split() for proper whitespace handling and ensures + overlap calculation doesn't create infinite loops when single words + exceed MAX_CHUNK_TOKENS. + """ + words = text.split() chunks = [] current_chunk = [] current_tokens = 0 @@ -135,12 +147,22 @@ def _chunk_payload(self, text: str) -> list[str]: if current_tokens + word_tokens > self.MAX_CHUNK_TOKENS and current_chunk: # Save the current chunk chunks.append(" ".join(current_chunk)) - # Keep the overlap at the end of the chunk to prevent lost context - overlap_words = ( - current_chunk[-(self.CHUNK_OVERLAP_TOKENS // 2) :] - if self.CHUNK_OVERLAP_TOKENS - else [] - ) + + # Calculate overlap by counting tokens from the end of current_chunk + overlap_words = [] + overlap_tokens = 0 + for w in reversed(current_chunk): + w_tokens = self._estimate_tokens(w + " ") + if overlap_tokens + w_tokens > self.CHUNK_OVERLAP_TOKENS: + break + overlap_words.insert(0, w) + overlap_tokens += w_tokens + + # Safety check: ensure overlap is strictly smaller than current_chunk + # to prevent infinite loops/bloat when single words exceed MAX_CHUNK_TOKENS + if len(overlap_words) >= len(current_chunk): + overlap_words = current_chunk[1:] if len(current_chunk) > 1 else [] + current_chunk = overlap_words + [word] current_tokens = sum( self._estimate_tokens(w + " ") for w in current_chunk @@ -176,12 +198,17 @@ async def _recursive_summarize(self, text: str, depth: int = 0) -> str: ) chunks = self._chunk_payload(text) - chunk_summaries = [] + # Summarize chunks concurrently to improve performance + # (avoids sequential processing that causes high latency) + tasks = [] for i, chunk in enumerate(chunks): - self.logger.debug(f"Summarizing chunk {i + 1}/{len(chunks)}...") + self.logger.debug(f"Queuing chunk {i + 1}/{len(chunks)} for concurrent summarization...") messages = self._build_messages(chunk) - chunk_summary = await self._call_model_with_retry(messages) - chunk_summaries.append(chunk_summary.strip()) + tasks.append(self._call_model_with_retry(messages)) + + self.logger.debug(f"Processing {len(chunks)} chunks concurrently...") + chunk_summaries = await asyncio.gather(*tasks) + chunk_summaries = [s.strip() for s in chunk_summaries] # Map-reduce: Combine partial summaries and feed them back into the loop aggregated_text = "\n\n--- PARTIAL SUMMARIES ---\n\n".join(chunk_summaries) From 9248d51e3a79d8ffa37cc27938597b8aaab5793c Mon Sep 17 00:00:00 2001 From: Rahul vakiti <143106829+vakrahul@users.noreply.github.com> Date: Tue, 2 Jun 2026 11:17:37 +0530 Subject: [PATCH 3/9] Update src/agents/summarizer.py Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> --- src/agents/summarizer.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/agents/summarizer.py b/src/agents/summarizer.py index 36cf58d6..bea484d9 100644 --- a/src/agents/summarizer.py +++ b/src/agents/summarizer.py @@ -84,7 +84,10 @@ def _detect_provider(self) -> str: "ChatGoogleGenerativeAI": "gemini", "ChatGroq": "groq", "OllamaLLM": "ollama", + "ChatOllama": "ollama", "ChatBedrock": "bedrock", + "ChatDeepSeek": "deepseek", + "ChatMimo": "mimo", } for class_name, provider in provider_map.items(): From 9d327dc3f4a26f9d4e6a1fd5a91e6d491bda3c58 Mon Sep 17 00:00:00 2001 From: Rahul vakiti <143106829+vakrahul@users.noreply.github.com> Date: Tue, 2 Jun 2026 11:17:51 +0530 Subject: [PATCH 4/9] Update src/agents/summarizer.py Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> --- src/agents/summarizer.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/agents/summarizer.py b/src/agents/summarizer.py index bea484d9..b2d27f65 100644 --- a/src/agents/summarizer.py +++ b/src/agents/summarizer.py @@ -210,8 +210,14 @@ async def _recursive_summarize(self, text: str, depth: int = 0) -> str: tasks.append(self._call_model_with_retry(messages)) self.logger.debug(f"Processing {len(chunks)} chunks concurrently...") - chunk_summaries = await asyncio.gather(*tasks) - chunk_summaries = [s.strip() for s in chunk_summaries] + results = await asyncio.gather(*tasks, return_exceptions=True) + + # Surface any failures rather than silently producing wrong output + exceptions = [r for r in results if isinstance(r, BaseException)] + if exceptions: + raise exceptions[0] + + chunk_summaries = [str(s).strip() for s in results] # Map-reduce: Combine partial summaries and feed them back into the loop aggregated_text = "\n\n--- PARTIAL SUMMARIES ---\n\n".join(chunk_summaries) From e5e0634f021dfa3dced6c98c448f7e253f6934d5 Mon Sep 17 00:00:00 2001 From: Rahul vakiti <143106829+vakrahul@users.noreply.github.com> Date: Tue, 2 Jun 2026 11:39:29 +0530 Subject: [PATCH 5/9] Update src/models/registry.py Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> --- src/models/registry.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/models/registry.py b/src/models/registry.py index 24a74946..862c5e4a 100644 --- a/src/models/registry.py +++ b/src/models/registry.py @@ -150,8 +150,12 @@ def get_model_context_window( if model_name in provider_windows: return provider_windows[model_name] # Try partial match (e.g., "gpt-4o" matches "gpt-4o-mini") - for key, window in provider_windows.items(): - if key != "default" and key in model_name: + # Sort by key length descending so more-specific keys win over shorter prefixes + for key, window in sorted( + ((k, v) for k, v in provider_windows.items() if k != "default"), + key=lambda kv: len(kv[0]), + reverse=True, + ): logger.debug( f"Matched model '{model_name}' to key '{key}' with context window {window}" ) From ca9992bfab0c5e5a0e94c2b768276cc1284edb5e Mon Sep 17 00:00:00 2001 From: Rahul vakiti <143106829+vakrahul@users.noreply.github.com> Date: Tue, 2 Jun 2026 11:44:21 +0530 Subject: [PATCH 6/9] Update src/models/registry.py Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> --- src/models/registry.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/models/registry.py b/src/models/registry.py index 862c5e4a..128e22b1 100644 --- a/src/models/registry.py +++ b/src/models/registry.py @@ -156,6 +156,7 @@ def get_model_context_window( key=lambda kv: len(kv[0]), reverse=True, ): + if key in model_name: logger.debug( f"Matched model '{model_name}' to key '{key}' with context window {window}" ) From 2de7f313dc2ef5b6b97816b6e830a4b037159e0f Mon Sep 17 00:00:00 2001 From: Rahul vakiti <143106829+vakrahul@users.noreply.github.com> Date: Tue, 2 Jun 2026 13:15:33 +0530 Subject: [PATCH 7/9] Refactor SummarizerAgent for dynamic chunking --- src/agents/summarizer.py | 45 +++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/src/agents/summarizer.py b/src/agents/summarizer.py index b2d27f65..1f1794bf 100644 --- a/src/agents/summarizer.py +++ b/src/agents/summarizer.py @@ -19,12 +19,12 @@ class SummarizerAgent(BaseAgent): - # Dynamic Chunking Configuration + MAX_RECURSION_DEPTH = 3 CHUNK_OVERLAP_TOKENS = 200 - SAFE_THRESHOLD_RATIO = 0.8 # Use 80% of context window as safe limit + SAFE_THRESHOLD_RATIO = 0.8 - # Rate limit retry configuration + MAX_RETRY_ATTEMPTS = 3 INITIAL_BACKOFF_SECONDS = 1.0 @@ -34,7 +34,7 @@ def __init__(self, model: BaseChatModel) -> None: name="summarizer", system_prompt=build_system_prompt(), ) - # Dynamically determine max chunk tokens from the model's context window + self._init_dynamic_chunk_tokens() def _init_dynamic_chunk_tokens(self) -> None: @@ -43,15 +43,14 @@ def _init_dynamic_chunk_tokens(self) -> None: This ensures we stay well within the model's limits across all providers. """ try: - # Try to detect provider and model name from the model instance + provider = self._detect_provider() model_name = getattr( self.model, "model", getattr(self.model, "model_name", None) ) context_window = get_model_context_window(provider, model_name) - # Calculate safe chunk size at 80% of context window, capped at 12k - # to prevent "lost in the middle" degradation and output token exhaustion + self.MAX_CHUNK_TOKENS = min( max(int(context_window * self.SAFE_THRESHOLD_RATIO), 2000), 12000 @@ -62,7 +61,7 @@ def _init_dynamic_chunk_tokens(self) -> None: f"max_chunk_tokens={self.MAX_CHUNK_TOKENS}" ) except Exception as e: - # Fallback to conservative default + self.MAX_CHUNK_TOKENS = 3000 self.logger.warning( f"Failed to initialize dynamic chunk tokens, using fallback (3000): {e}" @@ -70,14 +69,14 @@ def _init_dynamic_chunk_tokens(self) -> None: def _detect_provider(self) -> str: """Detect the provider from the model instance, unwrapping RunnableBinding if needed.""" - # Unwrap RunnableBinding and RunnableWithFallbacks to get the actual model + model = self.model while hasattr(model, "bound"): model = model.bound model_type = type(model).__name__ - # Map LangChain model classes to providers + provider_map = { "ChatAnthropic": "claude", "ChatOpenAI": "openai", @@ -94,7 +93,7 @@ def _detect_provider(self) -> str: if class_name in model_type: return provider - # Default to openai if we can't determine + return "openai" async def _call_model_with_retry(self, messages: list) -> str: @@ -119,7 +118,7 @@ async def _call_model_with_retry(self, messages: list) -> str: ) if not is_rate_limit or attempt == self.MAX_RETRY_ATTEMPTS - 1: - # Not a rate limit error, or last attempt - raise + raise self.logger.warning( @@ -127,7 +126,7 @@ async def _call_model_with_retry(self, messages: list) -> str: f"Retrying in {backoff_seconds:.1f}s..." ) await asyncio.sleep(backoff_seconds) - backoff_seconds *= 2 # Exponential backoff + backoff_seconds *= 2 def _estimate_tokens(self, text: str) -> int: """Lightweight token estimation (approx 4 characters per token).""" @@ -148,10 +147,10 @@ def _chunk_payload(self, text: str) -> list[str]: for word in words: word_tokens = self._estimate_tokens(word + " ") if current_tokens + word_tokens > self.MAX_CHUNK_TOKENS and current_chunk: - # Save the current chunk + chunks.append(" ".join(current_chunk)) - # Calculate overlap by counting tokens from the end of current_chunk + overlap_words = [] overlap_tokens = 0 for w in reversed(current_chunk): @@ -161,8 +160,8 @@ def _chunk_payload(self, text: str) -> list[str]: overlap_words.insert(0, w) overlap_tokens += w_tokens - # Safety check: ensure overlap is strictly smaller than current_chunk - # to prevent infinite loops/bloat when single words exceed MAX_CHUNK_TOKENS + + if len(overlap_words) >= len(current_chunk): overlap_words = current_chunk[1:] if len(current_chunk) > 1 else [] @@ -195,14 +194,13 @@ async def _recursive_summarize(self, text: str, depth: int = 0) -> str: messages = self._build_messages(text) return await self._call_model_with_retry(messages) - # Recursive Case: Split large payloads and map-reduce + self.logger.info( f"Payload too large ({estimated_tokens} tokens). Splitting into chunks (Depth: {depth})." ) chunks = self._chunk_payload(text) - # Summarize chunks concurrently to improve performance - # (avoids sequential processing that causes high latency) + tasks = [] for i, chunk in enumerate(chunks): self.logger.debug(f"Queuing chunk {i + 1}/{len(chunks)} for concurrent summarization...") @@ -212,14 +210,13 @@ async def _recursive_summarize(self, text: str, depth: int = 0) -> str: self.logger.debug(f"Processing {len(chunks)} chunks concurrently...") results = await asyncio.gather(*tasks, return_exceptions=True) - # Surface any failures rather than silently producing wrong output + exceptions = [r for r in results if isinstance(r, BaseException)] if exceptions: raise exceptions[0] chunk_summaries = [str(s).strip() for s in results] - # Map-reduce: Combine partial summaries and feed them back into the loop aggregated_text = "\n\n--- PARTIAL SUMMARIES ---\n\n".join(chunk_summaries) return await self._recursive_summarize(aggregated_text, depth=depth + 1) @@ -237,11 +234,11 @@ async def arun( user_message = pack_summary_query(user_query, agent_response) - # Route through the new dynamic chunking pipeline + raw_content = await self._recursive_summarize(user_message) summary = raw_content.strip() - # Treat empty-like responses as no summary + if summary in ('""', "''", "empty", "(empty)", "(empty string)"): summary = "" From 747033e1bb24f865d665d554dd47fb77faac447d Mon Sep 17 00:00:00 2001 From: Rahul vakiti <143106829+vakrahul@users.noreply.github.com> Date: Tue, 2 Jun 2026 13:17:57 +0530 Subject: [PATCH 8/9] Refactor registry.py by removing comments and whitespace Removed commented sections and cleaned up formatting. --- src/models/registry.py | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/src/models/registry.py b/src/models/registry.py index 128e22b1..af76003f 100644 --- a/src/models/registry.py +++ b/src/models/registry.py @@ -23,12 +23,10 @@ logger = logging.getLogger("xmem.models") -# ───────────────────────────────────────────────────────────────────────────── -# Context Window Mappings (in tokens) for each model -# ───────────────────────────────────────────────────────────────────────────── + _CONTEXT_WINDOWS = { - # Claude models + "claude": { "claude-3-5-sonnet-20241022": 200000, "claude-3-5-sonnet": 200000, @@ -40,7 +38,7 @@ "claude-haiku": 200000, "default": 200000, }, - # OpenAI models + "openai": { "gpt-4o": 128000, "gpt-4-turbo": 128000, @@ -63,7 +61,7 @@ "deepseek-coder": 128000, "default": 128000, }, - # Groq models + "groq": { "mixtral-8x7b-32768": 32768, "llama2-70b-4096": 4096, @@ -146,11 +144,11 @@ def get_model_context_window( provider_windows = _CONTEXT_WINDOWS[provider] if model_name: - # Try exact match first + if model_name in provider_windows: return provider_windows[model_name] - # Try partial match (e.g., "gpt-4o" matches "gpt-4o-mini") - # Sort by key length descending so more-specific keys win over shorter prefixes + + for key, window in sorted( ((k, v) for k, v in provider_windows.items() if k != "default"), key=lambda kv: len(kv[0]), @@ -162,7 +160,7 @@ def get_model_context_window( ) return window - # Fall back to provider default + return provider_windows.get("default", 8192) @@ -187,7 +185,7 @@ def get_model( if provider: return _BUILDERS[provider](**kw) - # Auto-select from fallback order + errors: list[str] = [] for p in settings.fallback_order: key_fn = _KEY_MAP.get(p) @@ -206,9 +204,7 @@ def get_model( ) -# --------------------------------------------------------------------------- -# Vision model (for image analysis) -# --------------------------------------------------------------------------- + _VISION_MODEL_MAP = { "gemini": lambda: settings.gemini_vision_model, @@ -245,7 +241,7 @@ def get_vision_model( provider=provider, model_name=vision_name, temperature=temperature ) - # Auto-select from fallback order + errors: list[str] = [] for p in settings.fallback_order: key_fn = _KEY_MAP.get(p) From 8110e256d114f643983b5e1afb324f4a31e1c4b6 Mon Sep 17 00:00:00 2001 From: Rahul vakiti <143106829+vakrahul@users.noreply.github.com> Date: Tue, 2 Jun 2026 13:27:20 +0530 Subject: [PATCH 9/9] Update src/agents/summarizer.py Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> --- src/agents/summarizer.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/agents/summarizer.py b/src/agents/summarizer.py index 1f1794bf..ce8e6bb8 100644 --- a/src/agents/summarizer.py +++ b/src/agents/summarizer.py @@ -51,10 +51,7 @@ def _init_dynamic_chunk_tokens(self) -> None: context_window = get_model_context_window(provider, model_name) - self.MAX_CHUNK_TOKENS = min( - max(int(context_window * self.SAFE_THRESHOLD_RATIO), 2000), - 12000 - ) + self.MAX_CHUNK_TOKENS = max(int(context_window * self.SAFE_THRESHOLD_RATIO), 2000) self.logger.info( f"Initialized dynamic chunking: context_window={context_window}, "