From 5f0d57537aa707c25255bfaa9f7381d8564e9fdf Mon Sep 17 00:00:00 2001 From: Mike Abernathy Date: Fri, 17 Apr 2026 16:55:34 -0600 Subject: [PATCH] fix(planner): handle tool-loop exhaustion + tool_use-only responses MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Post-merge manual smoke test of #200 surfaced two bugs on the first tool-using turn: the LLM called filesystem_list, hit MAX_PLANNER_TOOL_TURNS=4 while still mid-call, and the raw tool_use block leaked into the chat UI as [{'id': 'toolu_01...', 'input': {'__arg1': 'dashboard/src/lib'}, 'name': 'filesystem_list', 'type': 'tool_use'}] Backend logs showed: [PLANNER] Hit max tool turns (4) Unexpected LLM response content type: list Fixes: - Bump MAX_PLANNER_TOOL_TURNS default from 4 to 6. Four turns is tight for the Planner's from-scratch exploration (ls dashboard/src/lib, ls dashboard/src/lib/components, read package.json, produce final answer = easily 4+). Architect Phase 2 uses 4 but only runs AFTER Phase 1 has already narrowed the work. - `_extract_text_from_content` now silently skips tool_use / tool_result / input_json_delta blocks and returns "" when a list contains only those (instead of stringifying the raw block list). An empty list also returns "". - `_invoke_with_optional_tools` detects tool-loop exhaustion (returned response has no usable text) and retries the unbound LLM on the original messages. Note: we can't replay the loop's mid-flight messages because the final tool_use has no paired tool_result — Anthropic would reject that sequence. The wrap-up loses the loop's learnings but produces a valid conversational response instead of the raw-dict dump the user saw. Tests: 6 new in TestExtractTextFromContent + TestPlannerReadOnlyTools — covering tool_use-only response handling, empty-list handling, mixed text + tool_use, and the exhaustion wrap-up path. 87/87 planner tests pass; 293/293 related tests pass; ruff clean. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- dev-suite/src/agents/planner.py | 46 +++++++++++++-- dev-suite/tests/test_planner.py | 99 +++++++++++++++++++++++++++++++++ 2 files changed, 140 insertions(+), 5 deletions(-) diff --git a/dev-suite/src/agents/planner.py b/dev-suite/src/agents/planner.py index a97a535..3895a5c 100644 --- a/dev-suite/src/agents/planner.py +++ b/dev-suite/src/agents/planner.py @@ -60,9 +60,9 @@ # budget — enough turns for a few ls/read calls to orient the spec, # bounded so a misbehaving LLM can't drain tokens chasing the codebase. try: - MAX_PLANNER_TOOL_TURNS = int(os.getenv("MAX_PLANNER_TOOL_TURNS", "4")) + MAX_PLANNER_TOOL_TURNS = int(os.getenv("MAX_PLANNER_TOOL_TURNS", "6")) except ValueError: - MAX_PLANNER_TOOL_TURNS = 4 + MAX_PLANNER_TOOL_TURNS = 6 # Issue #193: loose heuristic for "the user mentioned a `#N` ref but we # couldn't resolve it." Used only for warning diagnostics when the @@ -1082,7 +1082,23 @@ async def _invoke_with_optional_tools( trace=[], agent_name="planner", ) - return _extract_text_from_content(response.content) + text = _extract_text_from_content(response.content) + if text.strip(): + return text + + # Loop exhausted mid-tool-call — response is a pure tool_use + # block with no user-facing text. We can't simply replay the + # loop's `current_messages` because the final tool_use has no + # paired tool_result (Anthropic rejects unpaired sequences), + # so fall back to a no-tools call on the ORIGINAL messages. + # The LLM loses its mid-loop learnings but produces a valid + # conversational response instead of a raw tool_use dump. 
+ logger.warning( + "[PLANNER] Tool loop exhausted without final text; " + "retrying unbound on original messages" + ) + fallback_response = await llm.ainvoke(lc_messages) + return _extract_text_from_content(fallback_response.content) except Exception as exc: # noqa: BLE001 logger.warning( "[PLANNER] Tool loop failed (%s); falling back to no-tool call", @@ -1100,20 +1116,40 @@ def _extract_text_from_content(content: Any) -> str: - list of content blocks: concatenates text from all blocks with type='text' (Gemini 3.x, some Anthropic responses) e.g. [{'type': 'text', 'text': '...', 'extras': {...}}] + - list of only tool_use blocks (Anthropic mid-loop): returns "" + rather than dumping the raw dicts to the user. - other: falls back to str() conversion """ if isinstance(content, str): return content if isinstance(content, list): + if not content: + return "" text_parts = [] + unknown_block_seen = False for block in content: - if isinstance(block, dict) and block.get("type") == "text": - text_parts.append(block.get("text", "")) + if isinstance(block, dict): + btype = block.get("type") + if btype == "text": + text_parts.append(block.get("text", "")) + elif btype in ("tool_use", "tool_result", "input_json_delta"): + # Intermediate tool-calling blocks — not user-facing + # text. Silently skip so we don't render raw dicts. + continue + else: + unknown_block_seen = True elif isinstance(block, str): text_parts.append(block) + else: + unknown_block_seen = True if text_parts: return "\n".join(text_parts) + if not unknown_block_seen: + # Pure tool_use / empty-recognized response — no text to + # surface. Return empty string; caller decides how to + # handle (typically: force a final no-tool call). 
+ return "" # Last resort — should not normally reach here logger.warning( diff --git a/dev-suite/tests/test_planner.py b/dev-suite/tests/test_planner.py index 16d19d3..8913fb1 100644 --- a/dev-suite/tests/test_planner.py +++ b/dev-suite/tests/test_planner.py @@ -1314,3 +1314,102 @@ async def exploding_loop(*args, **kwargs): result = await _invoke_with_optional_tools(llm, [], tools=fake_tools) assert result == "fallback response" llm.ainvoke.assert_awaited() + + @pytest.mark.asyncio + async def test_invoke_with_optional_tools_wraps_up_on_exhaustion( + self, monkeypatch, + ): + """Max-turns regression: if the loop returns a pure tool_use + response with no text, we retry the unbound LLM on the original + messages so the user sees prose instead of a raw dict dump. + Repro for: `Hit max tool turns (4)` + "Unexpected LLM response + content type: list" from the initial PR #200 smoke test. + """ + from src.agents.planner import _invoke_with_optional_tools + + # _run_tool_loop returns a tool_use-only response — what + # Anthropic sends mid-call when max turns hits. + exhausted_response = AsyncMock() + exhausted_response.content = [ + { + "id": "toolu_01GxiJrS8Xj4jN4FeFT3vsJu", + "input": {"__arg1": "dashboard/src/lib"}, + "name": "filesystem_list", + "type": "tool_use", + }, + ] + + async def fake_loop(*args, **kwargs): + return exhausted_response, 0, [] + + monkeypatch.setattr("src.orchestrator._run_tool_loop", fake_loop) + + wrap_up_response = AsyncMock() + wrap_up_response.content = ( + "Here's the task spec based on what I already know..." + ) + llm = AsyncMock() + llm.bind_tools = lambda _tools: llm + llm.ainvoke = AsyncMock(return_value=wrap_up_response) + + fake_tools = [AsyncMock(name="filesystem_list")] + result = await _invoke_with_optional_tools(llm, [], tools=fake_tools) + # User sees prose, not "[{'id': 'toolu_...', ...}]" + assert result == "Here's the task spec based on what I already know..." 
+ assert "toolu_" not in result + assert "tool_use" not in result + llm.ainvoke.assert_awaited() # The wrap-up call fired + + +class TestExtractTextFromContent: + """Regression coverage for the raw-dict-dump bug: when Anthropic + returns a list of content blocks that are all tool_use (no text), + we must render empty string, not `str(content)`. + """ + + def test_string_content_returned_as_is(self): + from src.agents.planner import _extract_text_from_content + + assert _extract_text_from_content("hello") == "hello" + + def test_list_with_text_blocks_joins_them(self): + from src.agents.planner import _extract_text_from_content + + content = [ + {"type": "text", "text": "part one"}, + {"type": "text", "text": "part two"}, + ] + assert _extract_text_from_content(content) == "part one\npart two" + + def test_list_with_only_tool_use_returns_empty(self): + """This was the bug the user hit — max-turns exhaustion left + Anthropic's response as a pure tool_use block, which used to + render as the raw dict string in the chat UI. + """ + from src.agents.planner import _extract_text_from_content + + content = [ + { + "id": "toolu_xyz", + "input": {"__arg1": "dashboard/src/lib"}, + "name": "filesystem_list", + "type": "tool_use", + }, + ] + assert _extract_text_from_content(content) == "" + + def test_mixed_text_and_tool_use_returns_only_text(self): + from src.agents.planner import _extract_text_from_content + + content = [ + {"type": "text", "text": "Let me check the filesystem."}, + {"type": "tool_use", "name": "filesystem_list", "input": {}}, + ] + assert _extract_text_from_content(content) == ( + "Let me check the filesystem." + ) + + def test_empty_list_returns_empty_string(self): + from src.agents.planner import _extract_text_from_content + + assert _extract_text_from_content([]) == ""