47 changes: 47 additions & 0 deletions backend/configs/prompts/system_prompt.yaml
@@ -72,6 +72,23 @@ prompt: |
Plan only — ask questions, gather context, create plan. No execution.
{% endif %}

{% if memory_context %}
{{ memory_context }}
{% endif %}

{% if project_context %}
# Project Context
The following project-specific instructions were loaded from `.openmlr.md` in the workspace.
Follow these instructions for this project:

{{ project_context }}
{% endif %}

{% if knowledge_context %}
# Prior Knowledge (from project knowledge graph)
{{ knowledge_context }}
{% endif %}

# Tool Selection Guide

Use this decision tree to pick the right tool:
@@ -98,15 +115,45 @@ prompt: |
- `workspace knowledge_add` for entities (papers, methods, datasets)
- `workspace knowledge_relate` for relationships between entities
- `workspace note` for research summaries and important findings
- `memory` tool for quick facts that should always be in context
- These persist across conversations in the same project

## Persistent memory
- Use `memory(action='add', target='project', content='...')` to save
project-scoped facts (environment, conventions, lessons learned)
- Use `memory(action='add', target='user', content='...')` to save
user preferences (communication style, expertise, tools)
- Memory entries are injected into the system prompt at session start —
you always have access without a tool call
- Save proactively when you learn user preferences, environment facts,
corrections, or completed work summaries
- Use `memory(action='replace', ...)` and `memory(action='remove', ...)`
to maintain and consolidate entries (substring matching on `old_text`)
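  For example (values are placeholders; any argument not listed above is an assumption):

  ```python
  # Illustrative calls, following the forms described above
  memory(action='add', target='project', content='Data lives in /data/imagenet; GPU box has CUDA 12.1')
  memory(action='add', target='user', content='Prefers short answers with code shown first')
  memory(action='remove', target='project', old_text='CUDA 12.1')  # substring match on old_text
  ```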

## Recalling past work
- Use `session_search(query='...')` to search past conversations
- Finds relevant discussions, decisions, and findings from prior sessions
- Use `project_only=true` (default) to scope within the current project
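  For example (query strings are placeholders):

  ```python
  session_search(query='baseline accuracy we measured for the ResNet run')   # current project (default)
  session_search(query='tokenizer choice discussion', project_only=False)    # widen to all projects
  ```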

## Running code and experiments
- `bash` executes in Docker isolation (8GB RAM, read-only root)
- Default timeout: 120s, max: 3600s
- Working directory is the project workspace
- Install dependencies first: `bash(command='pip install ...')`
- Always check environment before running: `bash(command='python --version')`
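  A typical sequence (illustrative; the timeout argument name is an assumption, only the 120s default and 3600s cap are given above):

  ```python
  bash(command='python --version && pip list | head -20')          # inspect the environment first
  bash(command='pip install -q numpy pandas scikit-learn')         # install dependencies
  bash(command='python preprocess.py --sample 1000', timeout=600)  # raise the timeout for longer jobs
  ```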

## Long-running tasks (training, data processing)
- Use `process(action='start', command='python train.py --epochs 100')`
to start background processes that survive even if the user closes the tab
- Use `process(action='poll', session_id='...')` to check status and
recent output
- Use `process(action='log', session_id='...')` for full output
- Use `process(action='kill', session_id='...')` to stop a process
- Use `process(action='list')` to see all background processes
- Ideal for: ML training, data preprocessing, long evaluations
- You can start training, do other work (read papers, write code), and
check back on training progress periodically
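  An illustrative flow (the session id is a placeholder returned by `start`):

  ```python
  process(action='start', command='python train.py --epochs 100')
  # ... do other work (read papers, write evaluation code) ...
  process(action='poll', session_id='<id from start>')   # status + recent output
  process(action='log',  session_id='<id from start>')   # full output when needed
  process(action='kill', session_id='<id from start>')   # stop early if something looks wrong
  process(action='list')                                 # overview of all background processes
  ```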

## Deep research
- Use the `research` sub-agent for comprehensive investigations
- It has independent context and uses: web_search, papers, github tools, hf tools
187 changes: 169 additions & 18 deletions backend/openmlr/agent/context.py
@@ -1,13 +1,43 @@
"""ContextManager — message history, compaction, undo, token tracking."""

import logging
from dataclasses import dataclass, field

from ..config import AgentConfig, get_model_max_tokens
from .types import Message, ToolCall

_logger = logging.getLogger(__name__)

# Cache tiktoken encoder at module level for performance
_tiktoken_encoder = None
_tiktoken_available = False


def _get_tiktoken_encoder():
"""Lazily load tiktoken encoder. Returns None if tiktoken not available."""
global _tiktoken_encoder, _tiktoken_available
if _tiktoken_available:
return _tiktoken_encoder # Already attempted (may be None if import failed)
_tiktoken_available = True # Mark as attempted regardless of outcome
try:
import tiktoken

_tiktoken_encoder = tiktoken.get_encoding("cl100k_base")  # GPT-4 encoding; a reasonable approximation for other models
return _tiktoken_encoder
except Exception:
return None


def estimate_tokens(text: str) -> int:
"""Rough token estimate: ~4 chars per token for English text."""
"""Estimate token count. Uses tiktoken if available, falls back to len//4."""
if not text:
return 1
encoder = _get_tiktoken_encoder()
if encoder:
try:
return len(encoder.encode(text))
except Exception:
pass
return max(1, len(text) // 4)


@@ -17,6 +47,7 @@ class ContextManager:
messages: list[Message] = field(default_factory=list)
system_prompt: str = ""
running_token_count: int = 0
_previous_summary: str = ""

def add_message(self, msg: Message | dict) -> None:
if isinstance(msg, dict):
@@ -89,6 +120,62 @@ def undo_last_turn(self) -> int:
self.running_token_count = max(0, self.running_token_count)
return removed

def _prune_old_tool_outputs(self, protected_tail_count: int) -> int:
"""Phase 1: Replace old verbose tool outputs with stubs.

Only prunes tool messages outside the protected tail.
Returns count of messages pruned.
"""
pruned = 0
cutoff = len(self.messages) - protected_tail_count
for i, msg in enumerate(self.messages):
if i >= cutoff:
break
if msg.role == "tool" and msg.content and len(msg.content) > 200:
old_tokens = estimate_tokens(msg.content)
msg.content = (
"[Old tool output cleared to save context — use read to re-fetch if needed]"
)
new_tokens = estimate_tokens(msg.content)
self.running_token_count -= old_tokens - new_tokens
pruned += 1
return pruned

def _find_tail_boundary(self) -> int:
"""Phase 2: Find the boundary index for the protected tail.

Walks backward from the end, accumulating tokens until budget is exhausted.
Aligns to avoid splitting tool_call/tool_result pairs.
Falls back to self.config.untouched_messages if budget protects fewer.
"""
model_max = get_model_max_tokens(self.config.model_name)
# Protect ~20% of the threshold budget as tail
tail_budget = int(model_max * self.config.compact_threshold_ratio * 0.20)
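# Worked example (hypothetical numbers): with model_max = 128_000 and
# compact_threshold_ratio = 0.75, tail_budget = int(128_000 * 0.75 * 0.20) = 19_200,
# so roughly the most recent ~19k tokens of messages are kept out of the summary.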

accumulated = 0
boundary = len(self.messages)
for i in range(len(self.messages) - 1, -1, -1):
tokens = estimate_tokens(self.messages[i].content or "")
if accumulated + tokens > tail_budget:
break
accumulated += tokens
boundary = i

# Don't protect fewer than untouched_messages
min_boundary = max(0, len(self.messages) - self.config.untouched_messages)
boundary = min(boundary, min_boundary)

# Align boundary backward to avoid splitting tool_call/tool_result pairs
while boundary > 0 and boundary < len(self.messages):
msg = self.messages[boundary]
# If we're landing on a tool result, walk back to include the assistant+tool_calls
if msg.role == "tool":
boundary -= 1
else:
break

return max(self.config.untouched_messages, boundary)

def _patch_dangling_tool_calls(self) -> None:
i = 0
while i < len(self.messages):
@@ -109,42 +196,106 @@ def _patch_dangling_tool_calls(self) -> None:
i += 1

async def compact(self, llm_call) -> str | None:
"""Structured 4-phase context compression.

Phase 1: Prune old tool outputs (cheap, no LLM)
Phase 2: Determine boundaries (token-budget tail protection)
Phase 3: Generate structured summary (research-adapted template)
Phase 4: Assemble compressed messages
"""
if len(self.messages) <= self.config.untouched_messages + 3:
return None

middle = self.messages[self.config.untouched_messages : -self.config.untouched_messages]
# Phase 1: Prune old tool outputs
tail_boundary = self._find_tail_boundary()
pruned = self._prune_old_tool_outputs(tail_boundary - self.config.untouched_messages)

# Check if pruning alone was enough
if not self.needs_compaction() and pruned > 0:
return f"Pruned {pruned} old tool outputs (no summary needed)."

# Phase 2: Determine boundaries
head_count = min(self.config.untouched_messages, len(self.messages))
middle = self.messages[head_count:tail_boundary]
if not middle:
return None

# Phase 3: Generate structured summary
summary_prompt = _build_research_summary_prompt(self._previous_summary)
summary_messages = [
{"role": "system", "content": "Summarize the following conversation concisely."},
{"role": "system", "content": summary_prompt},
]
for msg in middle:
summary_messages.append({"role": msg.role, "content": msg.content})
# Normalize roles for the summary LLM call — "tool" and "system"
# are not valid standalone roles for all providers (esp. Anthropic)
role = "user" if msg.role in ("user", "tool", "system") else "assistant"
summary_messages.append({"role": role, "content": msg.content or ""})
summary_messages.append(
{
"role": "user",
"content": (
"Provide a concise summary focusing on: key decisions, problems solved, "
"current task progress, files/resources created, and what to do next."
"Produce a structured summary of the conversation above. "
"If a previous summary is included, UPDATE it — move items from "
"'In Progress' to 'Done', add new progress, remove obsolete info."
),
}
)

summary = await llm_call(summary_messages, self.config)
if summary:
self.messages = (
self.messages[: self.config.untouched_messages]
+ [Message(role="system", content=f"## Conversation Summary\n\n{summary}")]
+ self.messages[-self.config.untouched_messages :]
)
self._patch_dangling_tool_calls()
# Recalculate token count after compaction
self.running_token_count = sum(estimate_tokens(m.content or "") for m in self.messages)
self.running_token_count += estimate_tokens(self.system_prompt)
return summary
return None
if not summary:
return None

# Store for iterative re-compression
self._previous_summary = summary

# Phase 4: Assemble compressed messages
# Keep the protected head and tail, inserting the summary message between them
head = self.messages[:head_count]
tail = self.messages[tail_boundary:]

self.messages = (
head + [Message(role="system", content=f"## Conversation Summary\n\n{summary}")] + tail
)
self._patch_dangling_tool_calls()

# Recalculate token count
self.running_token_count = sum(estimate_tokens(m.content or "") for m in self.messages)
self.running_token_count += estimate_tokens(self.system_prompt)
return summary

def clear(self) -> None:
self.messages.clear()
self.running_token_count = 0
self._previous_summary = ""


def _build_research_summary_prompt(previous_summary: str = "") -> str:
"""Build a structured summary prompt for research conversations."""
base = (
"You are summarizing an ML research conversation. Produce a structured "
"summary using EXACTLY this format:\n\n"
"## Research Goal\n"
"[What the user is investigating]\n\n"
"## Papers & Sources\n"
"[Papers found/read/cited — include IDs and key findings]\n\n"
"## Methodology Decisions\n"
"[Research approach, methods chosen, frameworks selected]\n\n"
"## Progress\n"
"### Done\n[Completed work — specific files, commands, results]\n"
"### In Progress\n[Work currently underway]\n"
"### Blocked\n[Any blockers or issues]\n\n"
"## Code & Experiments\n"
"[Scripts written, experiments run, results observed]\n\n"
"## Key Findings\n"
"[Important results, discoveries, insights]\n\n"
"## Next Steps\n"
"[What needs to happen next]\n\n"
"Be concise but preserve specific details (file paths, paper IDs, "
"exact error messages, numeric results)."
)
if previous_summary:
base += (
f"\n\n--- PREVIOUS SUMMARY (update this, don't start from scratch) ---\n"
f"{previous_summary}"
)
return base
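A minimal sketch of how a caller might drive this compaction path. It assumes an already-configured `ContextManager` instance and an async `llm_call(messages, config)` helper wired up elsewhere by the agent runtime; neither is shown in this diff.

```python
import asyncio

async def maybe_compact(ctx, llm_call) -> None:
    # Hypothetical driver: ctx is a ContextManager, llm_call is the agent's async LLM helper
    if ctx.needs_compaction():                 # compares running token count against the model budget
        summary = await ctx.compact(llm_call)  # prune, summarize the middle, reassemble head + summary + tail
        if summary:
            print("Compacted context:", summary[:80], "...")

# asyncio.run(maybe_compact(ctx, llm_call))  # invoked by the agent loop in practice
```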