From 3f9ed0f0b564e27ae6c9b0e367472a3d691300db Mon Sep 17 00:00:00 2001 From: Yuki Oshima <39944763+os1ma@users.noreply.github.com> Date: Thu, 22 Jan 2026 08:39:16 +0000 Subject: [PATCH] =?UTF-8?q?Trajectory=E3=81=AE=E8=A9=95=E4=BE=A1=E3=81=AE?= =?UTF-8?q?=E4=BE=8B=E3=82=92=E8=BF=BD=E5=8A=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- evals/eval_day1_5_naive_agent.py | 68 ++++++++++++++++++++++++++++++++ pages/day1_5_naive_agent.py | 3 +- 2 files changed, 70 insertions(+), 1 deletion(-) create mode 100644 evals/eval_day1_5_naive_agent.py diff --git a/evals/eval_day1_5_naive_agent.py b/evals/eval_day1_5_naive_agent.py new file mode 100644 index 0000000..f6fbc80 --- /dev/null +++ b/evals/eval_day1_5_naive_agent.py @@ -0,0 +1,68 @@ +import asyncio +from typing import Any + +import weave +from dotenv import load_dotenv +from langchain_core.messages import AIMessage, HumanMessage +from weave import Evaluation, Model + +from pages.day1_5_naive_agent import create_agent_with_tools + + +class MyModel(Model): + model_name: str + reasoning_effort: str + + @weave.op + def predict(self, message: str) -> Any: + # Create the agent + agent = create_agent_with_tools( + model_name=self.model_name, + reasoning_effort=self.reasoning_effort, + ) + + # Run the agent + return agent.invoke({"messages": [HumanMessage(content=message)]}) + + +@weave.op +def exist_tool_call(output: Any, expected_tool: str) -> int: + messages = output["messages"] + + # Check whether any tool_call is named expected_tool + for message in messages: + if isinstance(message, AIMessage): + if len(message.tool_calls) > 0: + for tool_call in message.tool_calls: + if tool_call["name"] == expected_tool: + return 1 + + return 0 + + +dataset = [ + {"message": "東京の今日の天気は?", "expected_tool": "tavily_search"}, + {"message": "東京の今日の天気は?Web検索して", "expected_tool": "tavily_search"}, +] + + +def main() -> None: + load_dotenv(override=True) + weave.init("training-llm-app") + + # Prepare the model to be evaluated + my_model = 
MyModel( + model_name="gpt-5-nano", + reasoning_effort="minimal", + ) + + # Run the evaluation + evaluation = Evaluation( + dataset=dataset, + scorers=[exist_tool_call], + ) + asyncio.run(evaluation.evaluate(my_model)) + + +if __name__ == "__main__": + main() diff --git a/pages/day1_5_naive_agent.py b/pages/day1_5_naive_agent.py index 08b1d90..2c9d920 100644 --- a/pages/day1_5_naive_agent.py +++ b/pages/day1_5_naive_agent.py @@ -132,4 +132,5 @@ def app() -> None: messages.extend(new_messages) -app() +if __name__ == "__main__": + app()