14 changes: 14 additions & 0 deletions README.md
@@ -140,6 +140,20 @@ To test out the online endpoint, run:
python src/run.py --invoke
```

## Step 7: Simulate a multi-turn conversation
You can simulate a multi-turn conversation between the copilot and a simulated user. Note that you must provide a persona profile in JSON format for the simulator to start from; a sample profile is provided in `src/tests/example_persona.json`. Use the `--num_conversation_turns` flag to specify the number of conversation turns to simulate.
```
python src/run.py --implementation aisdk --simulate_conversation --num_conversation_turns 2
```
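The persona profile drives the simulated user's goals, background, and tone. A trimmed-down sketch of the kind of fields it contains (see `src/tests/example_persona.json` for the full example) might look like:
```
{
  "name": "Jane",
  "profile": "Jane Doe is a 28-year-old outdoor enthusiast who loves camping and hiking trips...",
  "tone": "happy",
  "metadata": {"customer_info": "## customer_info name: Jane Doe ... recent_purchases ..."},
  "task": "Find the best hiking backpacks for weekend camping trips and compare the options before purchasing.",
  "chatbot_name": "ChatBot"
}
```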
You can try out the different sample implementations by setting the `--implementation` flag to `promptflow`, `semantickernel`, `langchain`, or `aisdk`. For example, to run with Semantic Kernel:
```
python src/run.py --implementation semantickernel --simulate_conversation --num_conversation_turns 2
```
The `--evaluate` flag can be combined with the simulator to simulate a multi-turn conversation and then evaluate it:
```
python src/run.py --implementation aisdk --simulate_conversation --num_conversation_turns 2 --evaluate
```
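These CLI flags wrap helper functions in `src/run.py`, so you can also run the simulation and evaluation from Python, as the tests in `src/tests/test_aisdk_copilot.py` do. A minimal sketch (assuming the `aisdk` implementation and that `src` is on your Python path):
```
from copilot_aisdk import chat
from run import simulate_conversation_and_evaluate

# simulate a 2-turn conversation with the sample persona and evaluate it
result, tabular_result = simulate_conversation_and_evaluate(
    chat_completion_fn=chat.chat_completion,
    persona_profile="src/tests/example_persona.json",
    num_conv_turn=2, max_tokens=500, temperature=0.0,
    eval_name="aisdk-copilot-simulator")  # eval_name is just a run label
print(result.metrics_summary)
```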

## Additional Tips and Resources

### Customize the development container
2 changes: 1 addition & 1 deletion requirements.txt
@@ -2,7 +2,7 @@
ipykernel

# generative ai SDK dependencies
-azure-ai-generative[evaluate,index,promptflow]
+azure-ai-generative[evaluate,index,promptflow,simulator]

# hardcoded the version of azureml-mlflow here for faster Docker image building speed
azureml-mlflow==1.53.0
181 changes: 158 additions & 23 deletions src/run.py
@@ -17,11 +17,14 @@
import pathlib
import pandas as pd
import shutil
from typing import Callable, Dict, List

from azure.ai.resources.client import AIClient
from azure.ai.resources.entities.models import Model
from azure.ai.resources.entities.deployment import Deployment
from azure.identity import DefaultAzureCredential
from azure.ai.resources.entities import AzureOpenAIModelConfiguration
from azure.ai.generative.synthetic.simulator import Simulator, SimulatorTemplates

source_path = "./src"

@@ -73,6 +76,39 @@ def copilot_qna(question, chat_completion_fn):
"context": response["context"]
}

def evaluate_result(task_type,
data_mapping,
metrics_list,
dataset,
target=None,
evaluation_name="baseline-evaluation",
output_path="./evaluation_output"):
from azure.ai.generative.evaluate import evaluate

client = AIClient.from_config(DefaultAzureCredential())

model_config = {
"api_version": os.getenv("OPENAI_API_VERSION"),
"api_base": os.getenv("OPENAI_API_BASE"),
"api_type": os.getenv("OPENAI_API_TYPE"),
"api_key": os.getenv("OPENAI_API_KEY"),
"deployment_id": os.environ["AZURE_OPENAI_EVALUATION_DEPLOYMENT"]
}

result = evaluate(
evaluation_name=evaluation_name,
data=dataset,
target=target,
task_type=task_type,
model_config=model_config,
data_mapping = data_mapping,
metrics_list = metrics_list,
tracking_uri=client.tracking_uri,
output_path=output_path,
)
tabular_result = pd.read_json(os.path.join(output_path, "eval_results.jsonl"), lines=True)
return result, tabular_result


# Define helper methods
def load_jsonl(path):
@@ -91,32 +127,21 @@ def run_evaluation(chat_completion_fn, name, dataset_path):
# temp: generate a single-turn qna wrapper over the chat completion function
qna_fn = lambda question: copilot_qna(question, chat_completion_fn)
output_path = "./evaluation_output"

-client = AIClient.from_config(DefaultAzureCredential())
-result = evaluate(
-evaluation_name=name,
-target=qna_fn,
-data=dataset,
-task_type="qa",
-data_mapping={
+data_mapping={
"questions": "question",
"contexts": "context",
"y_pred": "answer",
"y_test": "truth"
-},
-model_config={
-"api_version": "2023-05-15",
-"api_base": os.getenv("OPENAI_API_BASE"),
-"api_type": "azure",
-"api_key": os.getenv("OPENAI_API_KEY"),
-"deployment_id": os.getenv("AZURE_OPENAI_EVALUATION_DEPLOYMENT")
-},
-metrics_list=["exact_match", "gpt_groundedness", "gpt_relevance", "gpt_coherence"],
-tracking_uri=client.tracking_uri,
-output_path=output_path,
-)
-
-tabular_result = pd.read_json(os.path.join(output_path, "eval_results.jsonl"), lines=True)
+}
+metrics_list=["exact_match", "gpt_groundedness", "gpt_relevance", "gpt_coherence"]

+result, tabular_result = evaluate_result(task_type="qa",
+data_mapping=data_mapping,
+metrics_list=metrics_list,
+dataset=dataset,
+target=qna_fn,
+evaluation_name=name,
+output_path=output_path)

return result, tabular_result

@@ -200,7 +225,91 @@ def invoke_deployment(deployment_name: str, stream: bool = False):
else:
print(response.json())

def create_simulator_callback_fn(context_key, chat_completion_fn: Callable[[str, List[Dict], dict], str] = None, callback_citation_key: str = "callback_citations"):
async def sim_callback(question, conversation_history, meta_data):
# convert the simulator's conversation history so far into OpenAI chat format
messages = []
for i in range(len(conversation_history)):
turn = conversation_history[i]
message = turn.to_openai_chat_format()
messages.append(message)

response = await chat_completion_fn(messages=messages)
# capture the context (citations) the copilot returned for this turn
context_dict = {"turn_" + str(i + 1): response["choices"][0]['context']}
meta_data[context_key]["callback_citation_key"] = callback_citation_key
if callback_citation_key in meta_data[context_key].keys():
meta_data[context_key][callback_citation_key].update(context_dict)
else:
meta_data[context_key][callback_citation_key] = context_dict
return response["choices"][0]["message"]['content']
return sim_callback

def simulate_conversation(chat_completion_fn, persona_profile='./src/tests/example_persona.json',
num_conv_turn=2, max_tokens=500, temperature=0.0, save=True):
import logging
logging.disable(logging.INFO)
client = AIClient.from_config(DefaultAzureCredential())

# initialize system bot model
system_bot_model = AzureOpenAIModelConfiguration.from_connection(
connection=client.get_default_aoai_connection(),
model_name= os.environ["AZURE_OPENAI_CHAT_MODEL"],
deployment_name=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT"],
max_tokens=max_tokens,
temperature=temperature
)
st = SimulatorTemplates()
# retrieve template for conversation task
conv_template = st.get_template("conversation")

# initialize the conversation template parameters
with open(persona_profile, 'r') as f:
content = f.read()
conv_parameters = json.loads(content)

# create the sim_callback function with the custom chat_completion function
try:
context_key = conv_template.context_key[0]
except Exception:
raise Exception(f"No context key found in {persona_profile}.")
sim_callback = create_simulator_callback_fn(context_key=context_key,
chat_completion_fn=chat_completion_fn)
# simulate the conversation
simulator = Simulator(simulate_callback=sim_callback, systemConnection=system_bot_model)
conv_callback = asyncio.run(simulator.simulate_async(
conv_template,
conv_parameters,
max_conversation_turns=num_conv_turn,
api_call_delay_sec=10,
api_call_retry_sleep_sec=10,
api_call_retry_max_count=2,
))
if save:
data_output_file = 'simulator_conv.jsonl'
with open(data_output_file, 'w', encoding='utf-8') as file:
json.dump(conv_callback, file)
return conv_callback

def play_conversation(conversation_history):
for turn in conversation_history['messages']:
print(f"Conversation Turn {turn['turn_number']}:")
print(f"role: {turn['role']}\ncontent: {turn['content']}")
print('-' * 150)

def simulate_conversation_and_evaluate(chat_completion_fn, persona_profile, num_conv_turn, max_tokens, temperature,
eval_name = "baseline_evaluation"):
simulated_conversation = simulate_conversation(chat_completion_fn=chat_completion_fn, persona_profile=persona_profile,
num_conv_turn=num_conv_turn, max_tokens=max_tokens, temperature=temperature, save=True)
play_conversation(simulated_conversation)
output_path = "./evaluation_output"
result, tabular_result = evaluate_result(task_type="chat",
data_mapping={"y_pred": "messages"},
metrics_list=['gpt_groundedness', 'gpt_relevance', 'gpt_retrieval_score'],
dataset='simulator_conv.jsonl',
evaluation_name=eval_name,
output_path=output_path)
return result, tabular_result

# Run a single chat message through one of the co-pilot implementations
if __name__ == "__main__":
@@ -226,6 +335,14 @@ def invoke_deployment(deployment_name: str, stream: bool = False):
parser.add_argument("--build-index", help="Build an index with the default docs", action='store_true')
parser.add_argument("--stream", help="Whether response from a particular implementation should be streamed or not", action="store_true")
parser.add_argument("--invoke-deployment", help="Invoke a deployment and print out response", action="store_true")
parser.add_argument("--simulate_conversation", help="Simulate multi-turn conversation", action='store_true')
parser.add_argument("--persona_profile", help="Persona Profile to use for simulator",
default="src/tests/example_persona.json", type=str)
parser.add_argument("--num_conversation_turns", help="Number of conversation turns to simulate", default=2, type=int)
parser.add_argument("--max_tokens", help="The max number of tokens to generate in simulator", default=500, type=int)
parser.add_argument("--temperature",
help="hyperparameter that regulates the randomness, or creativity, of the model responses",
default=0.0, type=float)
args = parser.parse_args()

if args.implementation:
@@ -255,7 +372,7 @@ def invoke_deployment(deployment_name: str, stream: bool = False):

if args.build_index:
build_cogsearch_index(os.getenv("AZURE_AI_SEARCH_INDEX_NAME"), "./data/3-product-info")
-elif args.evaluate:
+elif args.evaluate and not args.simulate_conversation:
result, tabular_result = run_evaluation(chat_completion, name=f"test-{args.implementation}-copilot",
dataset_path=args.dataset_path)
pprint("-----Summarized Metrics-----")
@@ -268,6 +385,24 @@ def invoke_deployment(deployment_name: str, stream: bool = False):
deploy_flow(deployment_name, deployment_folder, chat_module)
elif args.invoke_deployment:
invoke_deployment(args.deployment_name, stream=args.stream)
elif args.simulate_conversation and args.evaluate:
result, tabular_result = simulate_conversation_and_evaluate(chat_completion_fn=chat_completion,
persona_profile=args.persona_profile,
num_conv_turn=args.num_conversation_turns,
max_tokens=args.max_tokens,
temperature=args.temperature,
eval_name = f"test-{args.implementation}-copilot-simulator")
pprint("-----Summarized Metrics-----")
pprint(result.metrics_summary)
pprint("-----Tabular Result-----")
pprint(tabular_result)
pprint(f"View evaluation results in AI Studio: {result.studio_url}")
elif args.simulate_conversation:
conversation = simulate_conversation(chat_completion_fn=chat_completion, persona_profile=args.persona_profile,
num_conv_turn=args.num_conversation_turns,
max_tokens=args.max_tokens,
temperature=args.temperature, save=True)
play_conversation(conversation)
else:
question = "which tent is the most waterproof?"
if args.question:
1 change: 1 addition & 0 deletions src/tests/example_persona.json
@@ -0,0 +1 @@
{"name": "Jane", "profile": "\n Jane Doe is a 28-year-old outdoor enthusiast \n who lives in Seattle, Washington. \n She has a passion for exploring nature and loves going on camping and hiking trips with her friends. \n She has recently become a member of the company's loyalty program and has achieved Bronze level status.Jane has a busy schedule, but she always makes time for her outdoor adventures.\n She is constantly looking for high-quality gear that can help her make the most of her trips and ensure she has a comfortable experience in the outdoors.Recently, Jane purchased a TrailMaster X4 Tent from the company. \n This tent is perfect for her needs, as it is both durable and spacious, allowing her to enjoy her camping trips with ease. \n The price of the tent was $250, and it has already proved to be a great investment.In addition to the tent, Jane also bought a Pathfinder Pro-1 Adventure Compass for $39.99. This compass has helped her navigate challenging trails with confidence, ensuring that she never loses her way during her adventures.Finally, Jane decided to upgrade her sleeping gear by purchasing a CozyNights Sleeping Bag for $100. This sleeping bag has made her camping nights even more enjoyable, as it provides her with the warmth and comfort she needs after a long day of hiking.", "tone": "happy", "metadata": {"customer_info": "## customer_info name: Jane Doe age: 28 phone_number: 555-987-6543 email: jane.doe@example.com address: 789 Broadway St, Seattle, WA 98101 loyalty_program: True loyalty_program Level: Bronze ## recent_purchases order_number: 5 date: 2023-05-01 item: - description: TrailMaster X4 Tent, quantity 1, price $250 item_number: 1 order_number: 18 date: 2023-05-04 item: - description: Pathfinder Pro-1 Adventure Compass, quantity 1, price $39.99 item_number: 4 order_number: 28 date: 2023-04-15 item: - description: CozyNights Sleeping Bag, quantity 1, price $100 item_number: 7"}, "task": "Jane is trying to accomplish the task of finding out the best hiking backpacks suitable for her weekend camping trips, and how they compare with other options available in the market. She wants to make an informed decision before making a purchase from the outdoor gear company's website or visiting their physical store.Jane uses Google to search for 'best hiking backpacks for weekend trips,' hoping to find reliable and updated information from official sources or trusted websites. She expects to see a list of top-rated backpacks, their features, capacity, comfort, durability, and prices. She is also interested in customer reviews to understand the pros and cons of each backpack.Furthermore, Jane wants to see the specifications, materials used, waterproof capabilities, and available colors for each backpack. She also wants to compare the chosen backpacks with other popular brands like Osprey, Deuter, or Gregory. Jane plans to spend about 20 minutes on this task and shortlist two or three options that suit her requirements and budget.Finally, as a Bronze level member of the outdoor gear company's loyalty program, Jane might also want to contact customer service to inquire about any special deals or discounts available on her shortlisted backpacks, ensuring she gets the best value for her purchase.", "chatbot_name": "ChatBot"}
28 changes: 20 additions & 8 deletions src/tests/test_aisdk_copilot.py
@@ -1,6 +1,6 @@
import pytest

-from run import run_evaluation
+from run import run_evaluation, simulate_conversation_and_evaluate

# Test results are stored here
evaluation_results : any
@@ -9,18 +9,30 @@
def run_before_any_test():
from copilot_aisdk import chat

-global metrics_summary, tabular_result
+global metrics_summary, metrics_summary_chat
metrics_summary, tabular_result = run_evaluation(chat.chat_completion,
"test_aisdk_copilot",
"src/tests/evaluation_dataset.jsonl")
metrics_summary_chat, tabular_result_chat = simulate_conversation_and_evaluate(
chat_completion_fn=chat.chat_completion,
persona_profile="src/tests/example_persona.json",
num_conv_turn=2, max_tokens=500, temperature=0.0,
eval_name = "test_aisdk_chat")

def test_gpt_groundedness_atleast4():
-assert(metrics_summary['mean_gpt_groundedness'] >= 4)
+assert(metrics_summary.metrics_summary['mean_gpt_groundedness'] >= 4)

def test_gpt_relevance_atleast4():
-assert(metrics_summary['mean_gpt_relevance'] >= 4)
+assert(metrics_summary.metrics_summary['mean_gpt_relevance'] >= 4)

def test_gpt_coherence_atleast4():
-assert(metrics_summary['mean_gpt_coherence'] >= 4)
+assert(metrics_summary.metrics_summary['mean_gpt_coherence'] >= 4)

def test_chat_gpt_groundedness_atleast4():
assert(metrics_summary_chat.metrics_summary['mean_gpt_groundedness'] >= 4)

def test_chat_gpt_relevance_atleast4():
assert(metrics_summary_chat.metrics_summary['mean_gpt_relevance'] >= 4)

def test_chat_gpt_retrieval_score_atleast4():
assert(metrics_summary_chat.metrics_summary['mean_gpt_retrieval_score'] >= 4)
30 changes: 21 additions & 9 deletions src/tests/test_promptflow_copilot.py
@@ -1,6 +1,6 @@
import pytest

-from run import run_evaluation
+from run import run_evaluation, simulate_conversation_and_evaluate

# Test results are stored here
evaluation_results : any
@@ -9,18 +9,30 @@
def run_before_any_test():
from copilot_promptflow import chat

-global metrics_summary
+global metrics_summary, metrics_summary_chat
metrics_summary, tabular_result = run_evaluation(chat.chat_completion,
"test_promptflow_copilot",
"src/tests/evaluation_dataset.jsonl")

metrics_summary_chat, tabular_result_chat = simulate_conversation_and_evaluate(
chat_completion_fn=chat.chat_completion,
persona_profile="src/tests/example_persona.json",
num_conv_turn=2, max_tokens=500, temperature=0.0,
eval_name = "test_promptflow_chat")

def test_gpt_groundedness_atleast4():
-assert(metrics_summary['mean_gpt_groundedness'] >= 4)
+assert(metrics_summary.metrics_summary['mean_gpt_groundedness'] >= 4)

def test_gpt_relevance_atleast4():
-assert(metrics_summary['mean_gpt_relevance'] >= 4)
+assert(metrics_summary.metrics_summary['mean_gpt_relevance'] >= 4)

def test_gpt_coherence_atleast4():
-assert(metrics_summary['mean_gpt_coherence'] >= 4)
+assert(metrics_summary.metrics_summary['mean_gpt_coherence'] >= 4)

def test_chat_gpt_groundedness_atleast4():
assert(metrics_summary_chat.metrics_summary['mean_gpt_groundedness'] >= 4)

def test_chat_gpt_relevance_atleast4():
assert(metrics_summary_chat.metrics_summary['mean_gpt_relevance'] >= 4)

def test_chat_gpt_retrieval_score_atleast4():
assert(metrics_summary_chat.metrics_summary['mean_gpt_retrieval_score'] >= 4)