From 7240cdb6da4862d579d54cd57fdd6c6d68aedc49 Mon Sep 17 00:00:00 2001 From: Kevin Date: Mon, 22 Jun 2026 18:03:47 -0700 Subject: [PATCH] fix(models): surface empty LiteLlm streaming completions as error event MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Streaming completions where the provider returns a finish_reason but no text + no tool calls currently produce ZERO yielded LlmResponse events: ``aggregated_llm_response`` only gets set when ``(text or reasoning_parts)`` is truthy, and ``aggregated_llm_response_with_tool_call`` needs a function_call. With neither, the loop simply exits and the downstream Runner observes a silent successful empty stream. This pattern is reported across multiple stalled fix attempts: * #5394 — AnthropicLlm never populates finish_reason on LlmResponse * #5006 — retry with resume message when model returns empty response * #5636 — surface error when model returns STOP with empty content * #3618 / #3699 — Handle empty message in LiteLLM response It hits providers under several real conditions: anthropic content_filter, gemini 2.5-flash-lite STOP-with-empty after tool calls, 0-token completions under safety, model_not_found responses normalized to stop, etc. From the user's perspective the agent "successfully" ends a turn with no visible output. Fix - Track ``last_finish_reason`` + ``last_model_version`` across the stream so we can attribute the empty response. - After both ``aggregated_llm_response`` and ``aggregated_llm_response_with_tool_call`` checks, if BOTH are None AND a finish_reason was observed, yield ONE LlmResponse with ``error_code`` set to the mapped finish_reason, ``error_message`` describing the failure mode, and the provider's ``model_version`` preserved. ``usage_metadata`` + ``grounding_metadata`` (if any) attach to that response so callers do not lose them. - Minimum-surface change: the guard only fires when the stream produced no aggregated response AND a finish_reason was observed. Streams that genuinely yield nothing (test doubles, empty iterators) stay byte-identical. Tests - tests/unittests/models/test_litellm.py adds 4 cases: * content_filter-empty → surfaces with SAFETY error_code * stop-empty → surfaces with STOP finish_reason + error_message * normal text stream → empty-guard does NOT fire (regression) * literally-empty stream (no chunks, no finish_reason) → byte-identical zero responses 281 lite_llm tests pass + 1 skip; 0 regressions. --- src/google/adk/models/lite_llm.py | 49 +++++++++ tests/unittests/models/test_litellm.py | 144 +++++++++++++++++++++++++ 2 files changed, 193 insertions(+) diff --git a/src/google/adk/models/lite_llm.py b/src/google/adk/models/lite_llm.py index 3c8c4b5f71..f7fd2deea4 100644 --- a/src/google/adk/models/lite_llm.py +++ b/src/google/adk/models/lite_llm.py @@ -2413,6 +2413,16 @@ async def generate_content_async( usage_metadata = None grounding_metadata = None fallback_index = 0 + # Track the latest finish_reason + model so we can surface an + # explicit error event when the stream completes with neither text + # nor tool calls (silent-empty pattern). Without this guard the + # aggregator simply yields nothing and the downstream Runner sees + # zero events — the user-visible "empty turn that succeeded" bug + # tracked across #5394 / #5006 / #3618 / user reports against + # anthropic, gemini, and other providers under content_filter, + # safety, model_not_found, and 0-token-completion conditions. + last_finish_reason: str | None = None + last_model_version: str | None = None def _finalize_tool_call_response( *, model_version: str, finish_reason: str @@ -2502,7 +2512,11 @@ def _reset_stream_buffers() -> None: part_grounding = _extract_grounding_metadata(part) if part_grounding: grounding_metadata = part_grounding + if getattr(part, "model", None): + last_model_version = part.model for chunk, finish_reason in _model_response_to_chunk(part): + if finish_reason: + last_finish_reason = finish_reason if isinstance(chunk, FunctionChunk): index = chunk.index or fallback_index if index not in function_calls: @@ -2616,6 +2630,41 @@ def _reset_stream_buffers() -> None: ) yield aggregated_llm_response_with_tool_call + # If we observed a finish_reason but produced no aggregated response + # (no text, no tool calls), surface it as an explicit error event so + # downstream consumers see actionable signal instead of a silent + # zero-yield stream. This is the "empty completion" pattern: + # provider returns 200 OK with a finish_reason (content_filter, + # safety, length-without-content, model_not_found, 0-token + # response, ...) but no text or tool deltas. Tracked across: + # #5394 — AnthropicLlm never populates finish_reason + # #5006 — retry with resume message when model returns empty + # #5636 — surface error when model returns STOP with empty content + # #3618 — Handle empty message in LiteLLM response + # Multiple stalled PRs (#5512, #5636, #3699) attempted variants of + # this fix. This guard keeps the change minimal: ONLY fires when + # the entire stream produced nothing meaningful AND the provider + # told us why via finish_reason. Mapped error_code lets the + # downstream agent loop react (retry, surface, escalate) without + # losing the provider's actual signal. + if ( + last_finish_reason + and aggregated_llm_response is None + and aggregated_llm_response_with_tool_call is None + ): + mapped_empty_finish = _map_finish_reason(last_finish_reason) + empty_response = LlmResponse( + error_code=mapped_empty_finish, + error_message=_finish_reason_to_error_message(mapped_empty_finish), + finish_reason=mapped_empty_finish, + model_version=last_model_version, + ) + if usage_metadata: + empty_response.usage_metadata = usage_metadata + if grounding_metadata: + empty_response.grounding_metadata = grounding_metadata + yield empty_response + else: response = await self.llm_client.acompletion(**completion_args) yield _model_response_to_generate_content_response(response) diff --git a/tests/unittests/models/test_litellm.py b/tests/unittests/models/test_litellm.py index 190f84171d..de5a420aae 100644 --- a/tests/unittests/models/test_litellm.py +++ b/tests/unittests/models/test_litellm.py @@ -5275,3 +5275,147 @@ def test_redact_file_uri_for_log_http_url_keeps_scheme_and_tail(): _redact_file_uri_for_log("https://example.com/path/file.pdf") == "https:///file.pdf" ) + + +# --------------------------------------------------------------------------- +# Empty-completion guard (fixes #5394 / #5006 / #5636 / #3618 family) +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_streaming_empty_completion_with_content_filter_surfaces_error( + mock_completion, lite_llm_instance +): + """A stream that ends with finish_reason but no text/tool deltas must + surface as an explicit error event, not a silent zero-yield stream. + + Reproduces the "empty completion" pattern reported across + #5394 / #5006 / #5636 / #3618: provider returns 200 OK with a + finish_reason (content_filter / safety / model_not_found / etc.) + but no content. Pre-fix the aggregator simply yielded nothing and + the downstream Runner saw zero events — silent successful empty + turn from the user's perspective. + """ + stream_chunks = [ + ModelResponseStream( + model="anthropic/claude-opus-4-8", + choices=[ + StreamingChoices(finish_reason="content_filter", delta=Delta()) + ], + ), + ] + mock_completion.return_value = iter(stream_chunks) + + responses = [ + response + async for response in lite_llm_instance.generate_content_async( + LLM_REQUEST_WITH_FUNCTION_DECLARATION, stream=True + ) + ] + + assert len(responses) == 1, ( + "empty completion with finish_reason must surface exactly one error " + f"LlmResponse, got {len(responses)}: {responses}" + ) + empty_response = responses[0] + assert empty_response.error_code == types.FinishReason.SAFETY + assert empty_response.finish_reason == types.FinishReason.SAFETY + assert empty_response.error_message + # The model name must propagate so downstream consumers know what failed. + assert empty_response.model_version == "anthropic/claude-opus-4-8" + + +@pytest.mark.asyncio +async def test_streaming_empty_completion_with_stop_finish_reason_still_surfaces( + mock_completion, lite_llm_instance +): + """A stream that ends with finish_reason='stop' but no content also + surfaces — the model said "I'm done" but produced nothing. This is the + most common shape of the silent-empty bug (#5636 specifically called + out gemini-2.5-flash-lite returning STOP with empty content after a + tool call). Pre-fix, the (text or reasoning_parts) guard skipped + finalization for this case and the aggregator yielded nothing. + """ + stream_chunks = [ + ModelResponseStream( + model="google/gemini-3.5-flash", + choices=[StreamingChoices(finish_reason="stop", delta=Delta())], + ), + ] + mock_completion.return_value = iter(stream_chunks) + + responses = [ + response + async for response in lite_llm_instance.generate_content_async( + LLM_REQUEST_WITH_FUNCTION_DECLARATION, stream=True + ) + ] + + assert len(responses) == 1, ( + "STOP-with-empty stream must surface exactly one response, got " + f"{len(responses)}: {responses}" + ) + empty_response = responses[0] + # The finish_reason is preserved so callers see the model's signal + # ("I stopped cleanly") + the empty body is also visible (no text + # content). error_message names the failure mode so the operator + # does not have to dig. + assert empty_response.finish_reason == types.FinishReason.STOP + assert empty_response.error_message + assert empty_response.model_version == "google/gemini-3.5-flash" + + +@pytest.mark.asyncio +async def test_streaming_text_response_does_not_trigger_empty_guard( + mock_completion, lite_llm_instance +): + """Regression guard: a normal text completion must NOT trigger the + empty-guard — the existing aggregated_llm_response handles it and + exactly one response is yielded with the actual text.""" + stream_chunks = [ + ModelResponseStream( + choices=[ + StreamingChoices( + finish_reason=None, + delta=Delta(role="assistant", content="hello"), + ) + ] + ), + ModelResponseStream( + choices=[StreamingChoices(finish_reason="stop", delta=Delta())] + ), + ] + mock_completion.return_value = iter(stream_chunks) + + responses = [ + response + async for response in lite_llm_instance.generate_content_async( + LLM_REQUEST_WITH_FUNCTION_DECLARATION, stream=True + ) + ] + + # The text deltas yield partial responses + one final aggregated + # response; the empty-guard MUST NOT fire because text was produced. + assert len(responses) >= 2 + final = responses[-1] + assert final.error_code is None + assert final.finish_reason == types.FinishReason.STOP + + +@pytest.mark.asyncio +async def test_streaming_no_chunks_and_no_finish_reason_is_byte_identical( + mock_completion, lite_llm_instance +): + """The empty-guard ONLY fires when there's a finish_reason. A stream + that yields literally nothing (test doubles, etc.) must stay + byte-identical to pre-fix — zero responses, no synthesized error.""" + mock_completion.return_value = iter([]) + + responses = [ + response + async for response in lite_llm_instance.generate_content_async( + LLM_REQUEST_WITH_FUNCTION_DECLARATION, stream=True + ) + ] + + assert responses == []