From 19f8284957df8b48d02ff4d88e89c9d3af948d69 Mon Sep 17 00:00:00 2001
From: vismaytiwari <vismay.t@gmail.com>
Date: Sun, 21 Jun 2026 21:40:18 +0530
Subject: [PATCH] fix(langchain): mark handled tool errors as errors

---
 langfuse/langchain/CallbackHandler.py | 24 ++++++++++++++----
 tests/unit/test_langchain.py          | 35 ++++++++++++++++++++++++++-
 2 files changed, 53 insertions(+), 6 deletions(-)

diff --git a/langfuse/langchain/CallbackHandler.py b/langfuse/langchain/CallbackHandler.py
index 2989ef216..2f6efe84f 100644
--- a/langfuse/langchain/CallbackHandler.py
+++ b/langfuse/langchain/CallbackHandler.py
@@ -1091,7 +1091,7 @@ def on_retriever_end(
 
     def on_tool_end(
         self,
-        output: str,
+        output: Any,
         *,
         run_id: UUID,
         parent_run_id: Optional[UUID] = None,
@@ -1105,10 +1105,24 @@ def on_tool_end(
             if observation is not None:
                 if parent_run_id is None:
                     self._clear_root_run_resume_key(run_id)
-                observation.update(
-                    output=output,
-                    input=kwargs.get("inputs"),
-                ).end()
+
+                update_kwargs: Dict[str, Any] = {
+                    "output": output,
+                    "input": kwargs.get("inputs"),
+                }
+
+                if (
+                    isinstance(output, ToolMessage)
+                    and getattr(output, "status", None) == "error"
+                ):
+                    update_kwargs["level"] = "ERROR"
+                    update_kwargs["status_message"] = (
+                        output.content
+                        if isinstance(output.content, str)
+                        else str(output.content)
+                    )
+
+                observation.update(**update_kwargs).end()
 
         except Exception as e:
             langfuse_logger.exception(e)
diff --git a/tests/unit/test_langchain.py b/tests/unit/test_langchain.py
index fa7934ba7..8d3b3517b 100644
--- a/tests/unit/test_langchain.py
+++ b/tests/unit/test_langchain.py
@@ -5,7 +5,7 @@
 
 import pytest
 from langchain.messages import HumanMessage
-from langchain_core.messages import AIMessage
+from langchain_core.messages import AIMessage, ToolMessage
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.outputs import ChatGeneration, ChatResult, Generation, LLMResult
 from langchain_core.prompts import ChatPromptTemplate
@@ -791,6 +791,39 @@ class DummyControlFlowError(RuntimeError):
     assert not _has_run_state(handler, retriever_run_id)
 
 
+def test_handled_tool_error_marks_observation_error(
+    langfuse_memory_client, get_span, json_attr
+):
+    handler = CallbackHandler()
+    run_id = uuid4()
+    # End the tool run with a ToolMessage flagged status="error"; nothing is raised.
+    handler.on_tool_start(
+        {"name": "failing_tool"},
+        '{"query": "x"}',
+        run_id=run_id,
+    )
+    handler.on_tool_end(
+        ToolMessage(
+            content="handled failure",
+            tool_call_id="call_1",
+            status="error",
+        ),
+        run_id=run_id,
+    )
+    # Flush first, then fetch the span using the name passed to on_tool_start.
+    langfuse_memory_client.flush()
+    span = get_span("failing_tool")
+    # Expect level ERROR, the message content as status message, status in output.
+    assert span.attributes[LangfuseOtelSpanAttributes.OBSERVATION_LEVEL] == "ERROR"
+    assert (
+        span.attributes[LangfuseOtelSpanAttributes.OBSERVATION_STATUS_MESSAGE]
+        == "handled failure"
+    )
+    assert json_attr(span, LangfuseOtelSpanAttributes.OBSERVATION_OUTPUT)["status"] == (
+        "error"
+    )
+
+
 def test_pending_resume_contexts_are_capped(langfuse_memory_client, monkeypatch):
     class DummyControlFlowError(RuntimeError):
         pass