From 5f0d57537aa707c25255bfaa9f7381d8564e9fdf Mon Sep 17 00:00:00 2001
From: Mike Abernathy <mabernathy87@gmail.com>
Date: Fri, 17 Apr 2026 16:55:34 -0600
Subject: [PATCH] fix(planner): handle tool-loop exhaustion + tool_use-only
 responses
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A post-merge manual smoke test of #200 surfaced two bugs on the first
tool-using turn: the LLM called filesystem_list and hit
MAX_PLANNER_TOOL_TURNS=4 while still mid-call, and the raw tool_use
block then leaked into the chat UI as

  [{'id': 'toolu_01...', 'input': {'__arg1': 'dashboard/src/lib'},
    'name': 'filesystem_list', 'type': 'tool_use'}]

Backend logs showed:
  [PLANNER] Hit max tool turns (4)
  Unexpected LLM response content type: list

Fixes:
- Bump MAX_PLANNER_TOOL_TURNS default from 4 to 6. Four turns is tight
  for the Planner's from-scratch exploration (ls dashboard/src/lib,
  ls dashboard/src/lib/components, read package.json, then produce the
  final answer: easily 4+ turns). Architect Phase 2 uses 4 but only
  runs AFTER Phase 1 has already narrowed the work.
- `_extract_text_from_content` now silently skips tool_use /
  tool_result / input_json_delta blocks and returns "" when a list
  contains only those (instead of stringifying the dict). Empty list
  also returns "".
- `_invoke_with_optional_tools` detects tool-loop exhaustion (returned
  response has no usable text) and retries the unbound LLM on the
  original messages. Note: we can't replay the loop's mid-flight
  messages because the final tool_use has no paired tool_result —
  Anthropic would reject that sequence. The wrap-up loses the loop's
  learnings but produces a valid conversational response instead of
  the raw-dict dump the user saw.

Tests: 6 new in TestExtractTextFromContent + TestPlannerReadOnlyTools,
covering plain-string and text-block passthrough, tool_use-only
response handling, empty-list handling, mixed text + tool_use, and the
exhaustion wrap-up path. 87/87 planner tests pass; 293/293 related
tests pass; ruff clean.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 dev-suite/src/agents/planner.py | 46 +++++++++++++--
 dev-suite/tests/test_planner.py | 99 +++++++++++++++++++++++++++++++++
 2 files changed, 140 insertions(+), 5 deletions(-)

diff --git a/dev-suite/src/agents/planner.py b/dev-suite/src/agents/planner.py
index a97a535..3895a5c 100644
--- a/dev-suite/src/agents/planner.py
+++ b/dev-suite/src/agents/planner.py
@@ -60,9 +60,9 @@
 # budget — enough turns for a few ls/read calls to orient the spec,
 # bounded so a misbehaving LLM can't drain tokens chasing the codebase.
 try:
-    MAX_PLANNER_TOOL_TURNS = int(os.getenv("MAX_PLANNER_TOOL_TURNS", "4"))
+    MAX_PLANNER_TOOL_TURNS = int(os.getenv("MAX_PLANNER_TOOL_TURNS", "6"))
 except ValueError:
-    MAX_PLANNER_TOOL_TURNS = 4
+    MAX_PLANNER_TOOL_TURNS = 6
 
 # Issue #193: loose heuristic for "the user mentioned a `#N` ref but we
 # couldn't resolve it." Used only for warning diagnostics when the
@@ -1082,7 +1082,23 @@ async def _invoke_with_optional_tools(
             trace=[],
             agent_name="planner",
         )
-        return _extract_text_from_content(response.content)
+        text = _extract_text_from_content(response.content)
+        if text.strip():
+            return text
+
+        # Loop exhausted mid-tool-call — response is a pure tool_use
+        # block with no user-facing text. We can't simply replay the
+        # loop's `current_messages` because the final tool_use has no
+        # paired tool_result (Anthropic rejects unpaired sequences),
+        # so fall back to a no-tools call on the ORIGINAL messages.
+        # The LLM loses its mid-loop learnings but produces a valid
+        # conversational response instead of a raw tool_use dump.
+        logger.warning(
+            "[PLANNER] Tool loop exhausted without final text; "
+            "retrying unbound on original messages"
+        )
+        fallback_response = await llm.ainvoke(lc_messages)
+        return _extract_text_from_content(fallback_response.content)
     except Exception as exc:  # noqa: BLE001
         logger.warning(
             "[PLANNER] Tool loop failed (%s); falling back to no-tool call",
@@ -1100,20 +1116,40 @@ def _extract_text_from_content(content: Any) -> str:
     - list of content blocks: concatenates text from all blocks
       with type='text' (Gemini 3.x, some Anthropic responses)
       e.g. [{'type': 'text', 'text': '...', 'extras': {...}}]
+    - list of only tool_use blocks (Anthropic mid-loop): returns ""
+      rather than dumping the raw dicts to the user.
     - other: falls back to str() conversion
     """
     if isinstance(content, str):
         return content
 
     if isinstance(content, list):
+        if not content:
+            return ""
         text_parts = []
+        unknown_block_seen = False
         for block in content:
-            if isinstance(block, dict) and block.get("type") == "text":
-                text_parts.append(block.get("text", ""))
+            if isinstance(block, dict):
+                btype = block.get("type")
+                if btype == "text":
+                    text_parts.append(block.get("text", ""))
+                elif btype in ("tool_use", "tool_result", "input_json_delta"):
+                    # Intermediate tool-calling blocks — not user-facing
+                    # text. Silently skip so we don't render raw dicts.
+                    continue
+                else:
+                    unknown_block_seen = True
             elif isinstance(block, str):
                 text_parts.append(block)
+            else:
+                unknown_block_seen = True
         if text_parts:
             return "\n".join(text_parts)
+        if not unknown_block_seen:
+            # Pure tool_use / empty-recognized response — no text to
+            # surface. Return empty string; caller decides how to
+            # handle (typically: force a final no-tool call).
+            return ""
 
     # Last resort — should not normally reach here
     logger.warning(
diff --git a/dev-suite/tests/test_planner.py b/dev-suite/tests/test_planner.py
index 16d19d3..8913fb1 100644
--- a/dev-suite/tests/test_planner.py
+++ b/dev-suite/tests/test_planner.py
@@ -1314,3 +1314,102 @@ async def exploding_loop(*args, **kwargs):
         result = await _invoke_with_optional_tools(llm, [], tools=fake_tools)
         assert result == "fallback response"
         llm.ainvoke.assert_awaited()
+
+    @pytest.mark.asyncio
+    async def test_invoke_with_optional_tools_wraps_up_on_exhaustion(
+        self, monkeypatch,
+    ):
+        """Max-turns regression: if the loop returns a pure tool_use
+        response with no text, we retry the unbound LLM on the original
+        messages so the user sees prose instead of a raw dict dump.
+        Repro for: `Hit max tool turns (4)` + "Unexpected LLM response
+        content type: list" from the initial PR #200 smoke test.
+        """
+        from src.agents.planner import _invoke_with_optional_tools
+
+        # _run_tool_loop returns a tool_use-only response — what
+        # Anthropic sends mid-call when max turns hits.
+        exhausted_response = AsyncMock()
+        exhausted_response.content = [
+            {
+                "id": "toolu_01GxiJrS8Xj4jN4FeFT3vsJu",
+                "input": {"__arg1": "dashboard/src/lib"},
+                "name": "filesystem_list",
+                "type": "tool_use",
+            },
+        ]
+
+        async def fake_loop(*args, **kwargs):
+            return exhausted_response, 0, []
+
+        monkeypatch.setattr("src.orchestrator._run_tool_loop", fake_loop)
+
+        wrap_up_response = AsyncMock()
+        wrap_up_response.content = (
+            "Here's the task spec based on what I already know..."
+        )
+        llm = AsyncMock()
+        llm.bind_tools = lambda _tools: llm
+        llm.ainvoke = AsyncMock(return_value=wrap_up_response)
+
+        fake_tools = [AsyncMock(name="filesystem_list")]
+        result = await _invoke_with_optional_tools(llm, [], tools=fake_tools)
+        # User sees prose, not "[{'id': 'toolu_...', ...}]"
+        assert result == "Here's the task spec based on what I already know..."
+        assert "toolu_" not in result
+        assert "tool_use" not in result
+        llm.ainvoke.assert_awaited()  # The wrap-up call fired
+
+
+class TestExtractTextFromContent:
+    """Regression coverage for the raw-dict-dump bug: when Anthropic
+    returns a list of content blocks that are all tool_use (no text),
+    we must render empty string, not `str(content)`.
+    """
+
+    def test_string_content_returned_as_is(self):
+        from src.agents.planner import _extract_text_from_content
+
+        assert _extract_text_from_content("hello") == "hello"
+
+    def test_list_with_text_blocks_joins_them(self):
+        from src.agents.planner import _extract_text_from_content
+
+        content = [
+            {"type": "text", "text": "part one"},
+            {"type": "text", "text": "part two"},
+        ]
+        assert _extract_text_from_content(content) == "part one\npart two"
+
+    def test_list_with_only_tool_use_returns_empty(self):
+        """This was the bug the user hit — max-turns exhaustion left
+        Anthropic's response as a pure tool_use block, which used to
+        render as the raw dict string in the chat UI.
+        """
+        from src.agents.planner import _extract_text_from_content
+
+        content = [
+            {
+                "id": "toolu_xyz",
+                "input": {"__arg1": "dashboard/src/lib"},
+                "name": "filesystem_list",
+                "type": "tool_use",
+            },
+        ]
+        assert _extract_text_from_content(content) == ""
+
+    def test_mixed_text_and_tool_use_returns_only_text(self):
+        from src.agents.planner import _extract_text_from_content
+
+        content = [
+            {"type": "text", "text": "Let me check the filesystem."},
+            {"type": "tool_use", "name": "filesystem_list", "input": {}},
+        ]
+        assert _extract_text_from_content(content) == (
+            "Let me check the filesystem."
+        )
+
+    def test_empty_list_returns_empty_string(self):
+        from src.agents.planner import _extract_text_from_content
+
+        assert _extract_text_from_content([]) == ""