-
Notifications
You must be signed in to change notification settings - Fork 8
Expand file tree
/
Copy path single_agent.py
More file actions
115 lines (96 loc) · 4.96 KB
/
single_agent.py
File metadata and controls
115 lines (96 loc) · 4.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import os
import time
import json
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_openai import ChatOpenAI
from langchain.agents import create_openai_tools_agent, AgentExecutor
# Import utility functions (e.g., for post-processing and question sentence generation)
from util import post_process, create_question_sentence, prepare_intermediate_steps
# Retrieve the OpenAI API key from the environment
# OpenAI API key, supplied via the environment; None if unset.
openai_api_key = os.getenv("OPENAI_API_KEY")

# Shared GPT-4o chat model for the single agent. Streaming is disabled
# because the AgentExecutor consumes complete responses.
llm_openai = ChatOpenAI(
    api_key=openai_api_key,
    model='gpt-4o',
    temperature=0.7,
    disable_streaming=True,
)
def create_agent(llm, tools: list, system_prompt: str):
    """
    Build an AgentExecutor wired with the given LLM, tools, and system prompt.

    The prompt exposes two placeholders: ``messages`` for the conversation
    history and ``agent_scratchpad`` for the agent's intermediate tool calls.
    Intermediate steps are returned so callers can inspect the tool trace.
    """
    chat_prompt = ChatPromptTemplate.from_messages([
        SystemMessage(content=system_prompt),
        MessagesPlaceholder(variable_name="messages"),
        MessagesPlaceholder(variable_name="agent_scratchpad"),
    ])
    tools_agent = create_openai_tools_agent(llm, tools, chat_prompt)
    return AgentExecutor(agent=tools_agent, tools=tools, return_intermediate_steps=True)
def execute_single_agent(tools, use_summary_info):
    """
    Execute the VideoQuestionAnswering task using a single agent.

    The agent analyzes the video through the available tools and selects the
    most plausible answer among the five options provided.

    Args:
        tools: List of LangChain tools the agent may invoke.
        use_summary_info: When True, append pre-computed video summaries
            (read from the SUMMARY_INFO env var) to the system prompt.

    Returns:
        Tuple of (prediction_result, agents_result_dict, agent_prompts) where
        prediction_result is the post-processed answer index, agents_result_dict
        holds the raw output and intermediate tool steps, and agent_prompts
        records the system prompt used.

    Environment variables:
        QA_JSON_STR: JSON-encoded question data (required).
        SUMMARY_INFO: JSON with "entire_summary" and "detail_summaries" keys
            (required only when use_summary_info is True).
        DATASET: Optional dataset name; controls debug printing only.
    """
    # Load the question data from an environment variable
    target_question_data = json.loads(os.getenv("QA_JSON_STR"))
    # System prompt carries only the task instructions; the question itself is
    # delivered as the human message so it is not duplicated in the prompt.
    system_prompt = (
        "Your task is to perform Video Question Answering. Analyze the video using the available tools, "
        "carefully reasoning through each step. Then, select the most plausible answer from the five given options. "
        "Finally, respond with 'FINISH' followed by your final answer, which should be one of the following: "
        "'Option A', 'Option B', 'Option C', 'Option D', or 'Option E'."
    )
    if use_summary_info:
        summary_info = json.loads(os.getenv("SUMMARY_INFO"))
        system_prompt += "\n\n[Video Summary Information]\n"
        system_prompt += "Entire Summary: \n" + summary_info["entire_summary"] + "\n\n"
        system_prompt += "Detail Summaries: \n" + summary_info["detail_summaries"]
    # Generate the question sentence using the provided utility (do not include this in the system prompt)
    question_sentence = create_question_sentence(target_question_data)
    # Create the single agent with the defined system prompt and tools
    single_agent = create_agent(llm_openai, tools, system_prompt=system_prompt)
    # Print the input message for debugging purposes
    print("******** Single Agent Input Message **********")
    print(question_sentence)
    print("*****************************************************")
    # Create the input state message with the question sentence
    state = {"messages": [HumanMessage(content=question_sentence, name="system")]}
    result = single_agent.invoke(state)
    output_content = result["output"]
    # Process the output result (e.g., converting answer to expected format)
    prediction_result = post_process(output_content)
    # If the result is invalid, retry the task
    if prediction_result == -1:
        print("***********************************************************")
        print("Error: The result is -1. Retrying VideoQuestionAnswering with the single agent.")
        print("***********************************************************")
        time.sleep(1)
        # BUG FIX: the retry previously called execute_single_agent(tools),
        # dropping the required use_summary_info argument and raising
        # TypeError exactly when a retry was needed. Forward both arguments.
        return execute_single_agent(tools, use_summary_info)
    # Print the result for debugging purposes
    print("*********** Single Agent Result **************")
    print(output_content)
    print("******************************************************")
    # Display truth and prediction if a dataset is specified via environment variable
    if os.getenv("DATASET") in ["egoschema", "nextqa", "intentqa", "hourvideo"]:
        if 0 <= prediction_result <= 4:
            print(
                f"Truth: {target_question_data['truth']}, "
                f"Pred: {prediction_result} (Option {['A', 'B', 'C', 'D', 'E'][prediction_result]})"
            )
        else:
            print("Error: Invalid prediction result value")
    elif os.getenv("DATASET") == "momaqa":
        print(f"Truth: {target_question_data['truth']}, Pred: {prediction_result}")
    print("******************************************************")
    # Build additional outputs for debugging and traceability
    intermediates = prepare_intermediate_steps(result.get("intermediate_steps", []))
    agents_result_dict = {"output": output_content, "intermediate_steps": intermediates}
    agent_prompts = {"system_prompt": system_prompt}
    return prediction_result, agents_result_dict, agent_prompts
if __name__ == "__main__":
    # BUG FIX: execute_single_agent requires two positional arguments; the
    # original call passed only the tools list and always raised TypeError.
    # Run with no tools and without summary info by default.
    execute_single_agent([], False)