-
Notifications
You must be signed in to change notification settings - Fork 8
Expand file tree
/
Copy path single_agent.py
More file actions
115 lines (96 loc) · 4.96 KB
/
single_agent.py
File metadata and controls
115 lines (96 loc) · 4.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import os
import time
import json
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_openai import ChatOpenAI
from langchain.agents import create_openai_tools_agent, AgentExecutor
# Import utility functions (e.g., for post-processing and question sentence generation)
from util import post_process, create_question_sentence, prepare_intermediate_steps
# Retrieve the OpenAI API key from the environment
# OpenAI API key, supplied via the environment; None if unset.
openai_api_key = os.getenv("OPENAI_API_KEY")

# Shared GPT-4o chat model for the single agent. Streaming is disabled
# because the AgentExecutor consumes complete responses.
llm_openai = ChatOpenAI(
    api_key=openai_api_key,
    model='gpt-4o',
    temperature=0.7,
    disable_streaming=True,
)
def create_agent(llm, tools: list, system_prompt: str):
    """
    Build an AgentExecutor wired with the given LLM, tools, and system prompt.

    The prompt exposes two placeholders: ``messages`` for the conversation
    history and ``agent_scratchpad`` for the agent's intermediate tool calls.
    Intermediate steps are returned so callers can inspect the tool trace.
    """
    chat_prompt = ChatPromptTemplate.from_messages([
        SystemMessage(content=system_prompt),
        MessagesPlaceholder(variable_name="messages"),
        MessagesPlaceholder(variable_name="agent_scratchpad"),
    ])
    tools_agent = create_openai_tools_agent(llm, tools, chat_prompt)
    return AgentExecutor(agent=tools_agent, tools=tools, return_intermediate_steps=True)
def execute_single_agent(tools, use_summary_info):
    """
    Execute the VideoQuestionAnswering task using a single agent.

    The agent analyzes the video through the available tools and selects the
    most plausible answer among the five options provided.

    Args:
        tools: List of LangChain tools the agent may invoke.
        use_summary_info: When True, append pre-computed video summaries
            (read from the SUMMARY_INFO env var) to the system prompt.

    Returns:
        Tuple of (prediction_result, agents_result_dict, agent_prompts) where
        prediction_result is the post-processed answer index, agents_result_dict
        holds the raw output and intermediate tool steps, and agent_prompts
        records the system prompt used.

    Environment variables:
        QA_JSON_STR: JSON-encoded question data (required).
        SUMMARY_INFO: JSON with "entire_summary" and "detail_summaries" keys
            (required only when use_summary_info is True).
        DATASET: Optional dataset name; controls debug printing only.
    """
    # Load the question data from an environment variable
    target_question_data = json.loads(os.getenv("QA_JSON_STR"))
    # System prompt carries only the task instructions; the question itself is
    # delivered as the human message so it is not duplicated in the prompt.
    system_prompt = (
        "Your task is to perform Video Question Answering. Analyze the video using the available tools, "
        "carefully reasoning through each step. Then, select the most plausible answer from the five given options. "
        "Finally, respond with 'FINISH' followed by your final answer, which should be one of the following: "
        "'Option A', 'Option B', 'Option C', 'Option D', or 'Option E'."
    )
    if use_summary_info:
        summary_info = json.loads(os.getenv("SUMMARY_INFO"))
        system_prompt += "\n\n[Video Summary Information]\n"
        system_prompt += "Entire Summary: \n" + summary_info["entire_summary"] + "\n\n"
        system_prompt += "Detail Summaries: \n" + summary_info["detail_summaries"]
    # Generate the question sentence using the provided utility (do not include this in the system prompt)
    question_sentence = create_question_sentence(target_question_data)
    # Create the single agent with the defined system prompt and tools
    single_agent = create_agent(llm_openai, tools, system_prompt=system_prompt)
    # Print the input message for debugging purposes
    print("******** Single Agent Input Message **********")
    print(question_sentence)
    print("*****************************************************")
    # Create the input state message with the question sentence
    state = {"messages": [HumanMessage(content=question_sentence, name="system")]}
    result = single_agent.invoke(state)
    output_content = result["output"]
    # Process the output result (e.g., converting answer to expected format)
    prediction_result = post_process(output_content)
    # If the result is invalid, retry the task
    if prediction_result == -1:
        print("***********************************************************")
        print("Error: The result is -1. Retrying VideoQuestionAnswering with the single agent.")
        print("***********************************************************")
        time.sleep(1)
        # BUG FIX: the retry previously called execute_single_agent(tools),
        # dropping the required use_summary_info argument and raising
        # TypeError exactly when a retry was needed. Forward both arguments.
        return execute_single_agent(tools, use_summary_info)
    # Print the result for debugging purposes
    print("*********** Single Agent Result **************")
    print(output_content)
    print("******************************************************")
    # Display truth and prediction if a dataset is specified via environment variable
    if os.getenv("DATASET") in ["egoschema", "nextqa", "intentqa", "hourvideo"]:
        if 0 <= prediction_result <= 4:
            print(
                f"Truth: {target_question_data['truth']}, "
                f"Pred: {prediction_result} (Option {['A', 'B', 'C', 'D', 'E'][prediction_result]})"
            )
        else:
            print("Error: Invalid prediction result value")
    elif os.getenv("DATASET") == "momaqa":
        print(f"Truth: {target_question_data['truth']}, Pred: {prediction_result}")
    print("******************************************************")
    # Build additional outputs for debugging and traceability
    intermediates = prepare_intermediate_steps(result.get("intermediate_steps", []))
    agents_result_dict = {"output": output_content, "intermediate_steps": intermediates}
    agent_prompts = {"system_prompt": system_prompt}
    return prediction_result, agents_result_dict, agent_prompts
if __name__ == "__main__":
    # BUG FIX: execute_single_agent requires two positional arguments; the
    # original call passed only the tools list and always raised TypeError.
    # Run with no tools and without summary info by default.
    execute_single_agent([], False)