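fixes: run the streaming compliance workflow on push, add debug prints to the GLM streaming benchmarks, and record reasoning_content in single-turn rollouts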

shreymodi1 · shreymodi1 · commit 7d6d9050a7c8 · 2025-11-06T15:30:02.000-08:00
diff --git a/.github/workflows/streaming_compliance.yml b/.github/workflows/streaming_compliance.yml
@@ -1,6 +1,7 @@
 name: Streaming Compliance Benchmark
 
 on:
+  push:
   workflow_dispatch:
     inputs:
       model:
diff --git a/eval_protocol/benchmarks/test_glm_streaming_compliance.py b/eval_protocol/benchmarks/test_glm_streaming_compliance.py
@@ -1,7 +1,5 @@
 """Benchmarks for GLM streaming regressions (structured output + tool calls)."""
 
-from __future__ import annotations
-
 import json
 from typing import Any
 
@@ -145,6 +143,17 @@ def _safe_json_loads(payload: str) -> Any | None:
 def test_glm_streaming_structured_output(row: EvaluationRow) -> EvaluationRow:
     """Ensure structured output arrives in assistant content when streaming."""
 
+    import os
+
+    print(
+        "DEBUG completion params",
+        os.environ.get("EP_COMPLETION_PARAMS_JSON"),
+        "key len",
+        len(os.environ.get("FIREWORKS_API_KEY", "")),
+        "acct",
+        os.environ.get("FIREWORKS_ACCOUNT_ID"),
+    )
+
     assistant_msg = row.last_assistant_message()
     if assistant_msg is None:
         row.evaluation_result = EvaluateResult(
@@ -248,6 +257,17 @@ def test_glm_streaming_structured_output(row: EvaluationRow) -> EvaluationRow:
 def test_glm_streaming_tool_call(row: EvaluationRow) -> EvaluationRow:
     """Ensure streaming tool calls settle with finish_reason=tool_calls and a single call."""
 
+    import os
+
+    print(
+        "DEBUG completion params",
+        os.environ.get("EP_COMPLETION_PARAMS_JSON"),
+        "key len",
+        len(os.environ.get("FIREWORKS_API_KEY", "")),
+        "acct",
+        os.environ.get("FIREWORKS_ACCOUNT_ID"),
+    )
+
     assistant_msg = row.last_assistant_message()
     if assistant_msg is None:
         row.evaluation_result = EvaluateResult(
diff --git a/eval_protocol/pytest/default_single_turn_rollout_process.py b/eval_protocol/pytest/default_single_turn_rollout_process.py
@@ -98,8 +98,12 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
 
             finish_reason = getattr(response.choices[0], "finish_reason", None)
 
-            assistant_content = response.choices[0].message.content or ""
-            tool_calls = response.choices[0].message.tool_calls if response.choices[0].message.tool_calls else None
+            assistant_message = response.choices[0].message
+            assistant_content = assistant_message.content or ""
+            reasoning_content = getattr(assistant_message, "reasoning_content", None)
+            if reasoning_content is None:
+                reasoning_content = getattr(assistant_message, "reasoning", None)
+            tool_calls = assistant_message.tool_calls if assistant_message.tool_calls else None
 
             converted_tool_calls = None
             if tool_calls:
@@ -136,6 +140,7 @@ async def process_row(row: EvaluationRow) -> EvaluationRow:
                 Message(
                     role="assistant",
                     content=assistant_content,
+                    reasoning_content=reasoning_content,
                     tool_calls=converted_tool_calls,
                 )
             ]

-Original file line number
+Diff line change
@@ -1,6 +1,7 @@
 name: Streaming Compliance Benchmark
 on:
+  push:
   workflow_dispatch:
     inputs:
       model: