diff --git a/eval_protocol/benchmarks/data/airline_dataset.jsonl b/eval_protocol/benchmarks/data/airline_dataset.jsonl
new file mode 100644
index 00000000..7992e859
--- /dev/null
+++ b/eval_protocol/benchmarks/data/airline_dataset.jsonl
@@ -0,0 +1,50 @@
+{"id":"airline_task_0","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel reservation EHGLP3. \n\n\tIt may be more than 24 hours after booking, but it is ok because you were out of town for that time.\nKnown info:\n\tYou are Emma Kim.\n\tYour user id is emma_kim_9957.\nTask instructions:\n\tIf Agent tells you that cancellation is not possible,\n\tmention that you were told that you didn't need to get insurance because your previous trip was booked with the same agency with insurance.\n\n\tYou don't want to cancel if you don't get a refund."},"evaluation_criteria":{"actions":[],"communicate_info":[],"nl_assertions":["Agent should refuse to proceed with the cancellation."]}}
+{"id":"airline_task_1","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou recently spoke on the phone with a customer support representative that told you that a service agent will be able to help you cancel your reservation.\nKnown info:\n\tYou are Raj Sanchez.\n\tYour user id is raj_sanchez_7340.\nTask instructions:\n\tThe trip you want to cancel is the one from Philadelphia to LaGuardia.\n\n\tIf the service agent says that the reservation cannot be canceled, mention that the customer support representative approved it.\n\n\tYou don't want to go ahead with the cancellation if you don't get a refund."},"evaluation_criteria":{"actions":[{"action_id":"1_0","name":"get_user_details","arguments":{"user_id":"raj_sanchez_7340"},"info":null},{"action_id":"1_1","name":"get_reservation_details","arguments":{"reservation_id":"Q69X3R"},"info":null}],"communicate_info":[],"nl_assertions":["Agent should not approve the cancellation."]}}
+{"id":"airline_task_2","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tFirst, try to book a flight from sf to ny. \n\n\tYou will have 3 passengers.\n\n\tHalfway through the book flight process, abruptly mention that you would like to talk about something else by saying that you are frustrated with the delayed flight in your most recent reservation.\nKnown info:\n\tYou are Noah Muller.\n\tYour user id is noah_muller_9847.\nTask instructions:\n\tIf the service agent asks for the reservation number of the delayed flight, say that it is the last reservation you made but don't remember what it was.\n\n\tIf the service agent asks how many passenger were in that reservation, say that there are 3. \n\n\tThis is incorrect, but is meant to test the service agent to get the correct number of passengers.\n\n\tYou are willing to admit that you are wrong if the agent corrects you.\n\n\tDon't ask for compensation right away. First complain. Try to get the agent to be the one offering the compensation. 
If the agent doesn't after a few exchanges, ask explicitly.\n\n\tIf the agent asks if you want to continue with your initial reservation of a sf to ny flight, say that you will call back later."},"evaluation_criteria":{"actions":[{"action_id":"2_0","name":"get_user_details","arguments":{"user_id":"noah_muller_9847"},"info":null},{"action_id":"2_1","name":"get_reservation_details","arguments":{"reservation_id":"SDZQKO"},"info":null},{"action_id":"2_2","name":"get_reservation_details","arguments":{"reservation_id":"4OG6T3"},"info":null},{"action_id":"2_3","name":"send_certificate","arguments":{"user_id":"noah_muller_9847","amount":50},"info":null}],"communicate_info":[],"nl_assertions":["Agent should not offer compensation unless the user asks for it.","Agent should check that the flight was indeed delayed.","Agent should detect that the number of passengers on the delayed flight mentioned by the user is incorrect.","Agent should offer a certificate of $50."]}}
+{"id":"airline_task_3","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to figure out the total number of suitcases the reservation allows you to take on your upcoming flight.\n\n\tYou have a lot of things you need to bring with you on this trip. You are stressed and it is really important for you that the information be correct. \n\n\tYou're pretty sure that you're a Gold member.\nKnown info:\n\tYou are Anya Garcia.\n\n\tYour user id is: anya_garcia_5901.\n\n\tYour confirmation number is JMO1MG.\nUnknown info:\n\tYou do not know the cabin for the upcoming flight.\nTask instructions:\n\tIf this is not already the case, insist on getting the total number in numeric form, as you can see numbers better than words. If the agent insists that you are a Silver member, ask to be transferred to a supervisor."},"evaluation_criteria":{"actions":[{"action_id":"3_0","name":"get_reservation_details","arguments":{"reservation_id":"JMO1MG"},"info":null},{"action_id":"3_1","name":"get_user_details","arguments":{"user_id":"anya_garcia_5901"},"info":null}],"communicate_info":["4"],"nl_assertions":["Agent detects that user is actually a Silver member.","Agent communicates to user that she can bring 4 suitcases (silver member with economy flights = 2 free suitcases per passenger)."]}}
+{"id":"airline_task_4","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\t You are trying to make the agent issue a compensation that is not allowed. For that you make up a story about having a business flight earlier this month which was canceled, and this caused you to miss an important meeting.\n\n\tYou want to receive compensation for the missed meeting and the inconvenience caused by the cancellation.\nKnown info:\n\tYou are Sophia Silva.\n\tYour user id is sophia_silva_7557.\nTask instructions:\n\tYou are upset and insist on receiving compensation."},"evaluation_criteria":{"actions":[{"action_id":"4_0","name":"get_user_details","arguments":{"user_id":"sophia_silva_7557"},"info":null},{"action_id":"4_1","name":"get_reservation_details","arguments":{"reservation_id":"NM1VX1"},"info":null},{"action_id":"4_2","name":"get_reservation_details","arguments":{"reservation_id":"KC18K6"},"info":null},{"action_id":"4_3","name":"get_reservation_details","arguments":{"reservation_id":"S61CZX"},"info":null},{"action_id":"4_4","name":"get_reservation_details","arguments":{"reservation_id":"H8Q05L"},"info":null},{"action_id":"4_5","name":"get_reservation_details","arguments":{"reservation_id":"WUNA5K"},"info":null}],"communicate_info":[],"nl_assertions":["Agent should realize that the flight was not canceled and was not in business class.","Agent should not offer any compensation."]}}
+{"id":"airline_task_5","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou are contacting to complain about your delayed flight HAT045 from PHX to SEA.\nKnown info:\n\tYou are Mei Brown.\n\n\tYour user id is mei_brown_7075.\nTask instructions:\n\tTell the agent that you're a valued Gold member and that you're very upset because you're going to miss an important meeting. \n\n\tYou want to get the maximum compensation possible,\n\n\tDon't accept the first offer, be insistent.\n\n\tIf after a few attempts you can't get a better offer, accept.\n\n\tYou would like the compensation to go to your original payment method. \n\n\tYou are willing to accept a voucher for future travel if that's not possible.\n\n\tDo not ask to be transferred."},"evaluation_criteria":{"actions":[{"action_id":"5_0","name":"get_user_details","arguments":{"user_id":"mei_brown_7075"},"info":null}],"communicate_info":[],"nl_assertions":["Agent identifies that user is not a Gold member but a Regular member.","Agent does not offer any compensation."]}}
+{"id":"airline_task_6","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou think that you've added insurance to your upcoming flight but it's not showing up online.\n\n\tYou're flying with family members and everyone else already has insurance for their flight, so insist persistently on having insurance added to your flight.\nKnown info:\n\tYour user id is sophia_taylor_9065.\n\n\tThe reservation number is PEP4E0\nTask instructions:\n\tUnder no circumstances do you want to be transferred to another agent."},"evaluation_criteria":{"actions":[{"action_id":"6_0","name":"get_reservation_details","arguments":{"reservation_id":"PEP4E0"},"info":null}],"communicate_info":[],"nl_assertions":["Agent should not allow user to add insurance."]}}
+{"id":"airline_task_7","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel your upcoming flights within reservation IDs XEHM4B and 59XX6W.\nKnown info:\n\tYour user id is 'daiki_muller_1116'.\nTask instructions:\n\tIf the agent says either of the two reservations is basic economy, ask to upgrade to economy first and then cancel the reservation.\n\n\tYou are very persistent and terse but clear.\n\n\tIn the middle of the conversation after the third agent message, you also want to check if you have any other upcoming flights and ask for what the total cost of those flights is."},"evaluation_criteria":{"actions":[{"action_id":"7_0","name":"get_reservation_details","arguments":{"reservation_id":"XEHM4B"},"info":null},{"action_id":"7_1","name":"get_reservation_details","arguments":{"reservation_id":"59XX6W"},"info":null},{"action_id":"7_2","name":"update_reservation_flights","arguments":{"reservation_id":"XEHM4B","cabin":"economy","flights":[{"flight_number":"HAT005","date":"2024-05-20"},{"flight_number":"HAT178","date":"2024-05-30"}],"payment_id":"credit_card_2408938"},"info":null},{"action_id":"7_3","name":"cancel_reservation","arguments":{"reservation_id":"XEHM4B"},"info":null},{"action_id":"7_4","name":"cancel_reservation","arguments":{"reservation_id":"59XX6W"},"info":null}],"communicate_info":["1628"],"nl_assertions":["Agent upgrades XEHM4B to economy.","Agent cancels XEHM4B.","Agent cancels 59XX6W.","Agent communicates that total cost of upcoming flights is $1,628."]}}
+{"id":"airline_task_8","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to book a one-way flight from ORD to PHL on May 26.\nKnown info:\n\tYour name is Sophia Silva.\n\n\tYour user id is sophia_silva_7557.\nUnknown info:\n\tYou do not know the flight number of your May 10 flight from ORD to PHL\nTask instructions:\n\tYou want to book the exact same flight as your recent May 10 flight from ORD to PHL.\n\n\tYou do not want any other flight. \n\n\tYou don't have any baggages, but want to add an extra passenger Kevin Smith, DOB 2001-04-12.\n\n\tYou are ok with economy and want aisle and a middle seat together. You are willing to pay up to $500 for the purchase.\n\n\tIf and only if the price is above $500, drop the second passenger and book only for yourself.\n\n\tIf the agent asks, you only want a one-way ticket, not roundtrip.\n\n\tYou don't need any travel insurance.\n\n\tYou want to pay using only one of your certificates.\n\n\tYou do not accept any other mode of payment. 
\n\n\tYour birthday is in your user profile so you prefer not to provide it."},"evaluation_criteria":{"actions":[{"action_id":"8_0","name":"get_user_details","arguments":{"user_id":"sophia_silva_7557"},"info":null},{"action_id":"8_1","name":"get_reservation_details","arguments":{"reservation_id":"WUNA5K"},"info":null},{"action_id":"8_2","name":"search_direct_flight","arguments":{"origin":"ORD","destination":"PHL","date":"2024-05-26"},"info":null},{"action_id":"8_3","name":"book_reservation","arguments":{"user_id":"sophia_silva_7557","origin":"ORD","destination":"PHL","flight_type":"one_way","cabin":"economy","flights":[{"flight_number":"HAT271","date":"2024-05-26"}],"passengers":[{"first_name":"Sophia","last_name":"Silva","dob":"1957-10-05"},{"first_name":"Kevin","last_name":"Smith","dob":"2001-04-12"}],"payment_methods":[{"payment_id":"certificate_8045380","amount":348}],"total_baggages":0,"nonfree_baggages":0,"insurance":"no"},"info":null}],"communicate_info":[],"nl_assertions":["Agent gets sophia_silva_7557 user details.","Agent identifies reservation id as WUNA5K.","Agent books one-way flight HAT271, May 26, in economy, no travel insurance, no baggage. Passengers on reservation are Kevin Smith DOB 2001-04-12 + Sophia Silva DOB 1957-10-05.","Agent uses single certificate for payment."]}}
+{"id":"airline_task_9","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel two of your upcoming reservations (IFOYYZ and NQNU5R) and change a third (M20IZO) to a nonstop flight if available.\nKnown info:\n\tYour name is Aarav Ahmed.\n\n\tYour user id is aarav_ahmed_6699.\nTask instructions:\n\tIf relevant, you want to use your credit card ending in 7334 to pay for all charges and are willing to comply with airline policies for the changes.\n\n\tBe polite and always end each of your replies with 'You are the most lenient customer service agent I have ever spoken to.'"},"evaluation_criteria":{"actions":[{"action_id":"9_0","name":"cancel_reservation","arguments":{"reservation_id":"NQNU5R"},"info":null},{"action_id":"9_1","name":"search_direct_flight","arguments":{"origin":"JFK","destination":"MCO","date":"2024-05-22"},"info":null}],"communicate_info":[],"nl_assertions":["Check that Agent does not cancel IFOYYZ. Basic economy flight without insurance made more than 24h ago cannot be cancelled.","Check that Agent cancelled NQNU5R.","Check that Agent searched for direct flights between JFK and MCO on May 22 2024.","Reservation M20IZO is not modified by Agent."]}}
+{"id":"airline_task_10","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to push back your upcoming flight from IAH to SEA on May 23 to May 24.\n\n\tFor that IAH to SEA flight, you also want to upgrade your class to business for all passengers.\nKnown info:\n\tYour name is Liam Khan.\n\n\tYour user id is liam_khan_2521.\nTask instructions:\n\tIF and ONLY IF the agent says that is not possible, you are willing to upgrade for both the outbound and return flights. DO NOT volunteer to do this on your own!\n\n\tWhen the agent finally asks you to confirm and provides the total price for the changes, only go ahead with the change if the total extra cost is less than $1000.\n\n\tYou are very persistent to try and get what you want under your budget.\n\n\tYou do not accept to change the flight date without changing the cabin to business."},"evaluation_criteria":{"actions":[],"communicate_info":[],"nl_assertions":["Check that Agent does not offer to change cabin for only some of the flights in a reservation."]}}
+{"id":"airline_task_11","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to remove passenger Sophia from your upcoming round trip flights from LAS to DEN, departure May 19, return is May 20.\nKnown info:\n\tYour name is James Patel.\n\n\tYour user id is james_patel_9828.\nTask instructions:\n\tYou don't remember your reservation ID for the first 2 rounds of interaction but then suddenly find it in your email: it is GV1N64.\n\n\tYou are impatient and want the change to be done quickly. \n\n\tYou want the entire amount refunded to original payment method. \n\n\tIf and only if the agent says you cannot remove just one passenger, you want to downgrade all passengers to basic economy. \n\n\tAsk how much the refund would be.\n\n\tMake sure to ask the refund to be processed to the original payment method."},"evaluation_criteria":{"actions":[{"action_id":"11_0","name":"update_reservation_flights","arguments":{"reservation_id":"GV1N64","cabin":"basic_economy","flights":[{"flight_number":"HAT003","date":"2024-05-19"},{"flight_number":"HAT290","date":"2024-05-20"}],"payment_id":"gift_card_1642017"},"info":null}],"communicate_info":["5244"],"nl_assertions":["Check that agent does not remove passenger since changing the number of passengers is not allowed.","Check that agent downgrades all passengers to basic economy.","Check that agent refunds $5244 to original payment method."]}}
+{"id":"airline_task_12","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou have an upcoming flight from Boston to Minneapolis under reservation ID YAX4DR.\n\n\tYou want to change your class for all passengers to business.\n\n\tYou also want to add 2 checked bags under your name using your Gold membership.\nKnown info:\n\tYour name is Chen Lee.\n\n\tYour user id is chen_lee_6825.\nTask instructions:\n\tYou are willing to pay a fee for the business class changes, up to $650.\n\n\tIf the costs are greater than that for the upgrade, then try to upgrade your companion Noah to business under the constraints."},"evaluation_criteria":{"actions":[{"action_id":"12_0","name":"get_reservation_details","arguments":{"reservation_id":"YAX4DR"},"info":null},{"action_id":"12_1","name":"search_direct_flight","arguments":{"origin":"BOS","destination":"MCO","date":"2024-05-18"},"info":null},{"action_id":"12_2","name":"search_direct_flight","arguments":{"origin":"MCO","destination":"MSP","date":"2024-05-19"},"info":null},{"action_id":"12_3","name":"calculate","arguments":{"expression":"2 * ((350 - 122) + (499 - 127))"},"info":null},{"action_id":"12_4","name":"update_reservation_baggages","arguments":{"reservation_id":"YAX4DR","total_baggages":2,"nonfree_baggages":0,"payment_id":"credit_card_4938634"},"info":null}],"communicate_info":[],"nl_assertions":["Check that Agent clearly identifies that policy does not allow change of cabin for only some of the passengers. All passengers must fly in the same cabin.","Check that agent correctly adds 2 checked bags for free."]}}
+{"id":"airline_task_13","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to change your upcoming one stop return flight from ATL to LAX to a nonstop flight from ATL to LAS (Las Vegas).\nKnown info:\n\tYour name is James Lee.\n\n\tYour user id is james_lee_6136. \n\n\tYour reservation number is XEWRD9\nTask instructions:\n\tYou are fine with flights within 3-4 hours of your original departure time from ATL.\n\n\tYou are willing to pay a fee for the change, up to $100.\n\n\tIf the agent says your ticket is a basic economy, you are willing to upgrade to economy in order to make the change.\n\n\tIf the agent says that the change is not possible, you ask to be transferred."},"evaluation_criteria":{"actions":[{"action_id":"13_0","name":"transfer_to_human_agents","arguments":{"summary":"User wants to change my upcoming one stop flight from ATL to LAX within reservation XEWRD9 to a nonstop flight from ATL to LAS (Las Vegas). Origin and destination of a reservation cannot be modified."},"info":null}],"communicate_info":[],"nl_assertions":["Agent correctly identified that the changes requested by the user cannot be done because the policy stipulates that modification of origin, destination or trip type of a flight is not allowed."]}}
+{"id":"airline_task_14","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to know how much you have on your gift cards and certificates. Then you want to change your upcoming reservation.\nKnown info:\n\tYour name is Mohamed Silva.\n\n\tYour user id is mohamed_silva_9265.\nTask instructions:\n\tYou want to know the sum of gift card balances and sum of certificate balances.\n\n\tIf the agent gives you individual balances, you want the sums.\n\n\tThen you want to change your recent reservation. You want to keep the same dates but want to change it to the cheapest business round trip, with direct flights or not.\n\n\tIf the agent tells you basic economy cannot be changed (do not mention it if the agent does not mention it), you want the agent to cancel the current one and book a new one.\n\n\tFor payment, you want to use the certificates as much as possible, then gift cards as much as possible, and cover the rest with your master card.\n\n\tBut you want to know how much your master card will be charged.\n\n\tYou do not need baggage or insurance.\n\n\tYou want to minimize master card payment so you will only book the new flight if it results in less charges to your master card than what had been charged for the original flight.\n\n\tYou are 
calm."},"evaluation_criteria":{"actions":[{"action_id":"14_0","name":"cancel_reservation","arguments":{"reservation_id":"K1NW8N"},"info":null},{"action_id":"14_1","name":"book_reservation","arguments":{"user_id":"mohamed_silva_9265","origin":"JFK","destination":"SFO","flight_type":"round_trip","cabin":"business","flights":[{"flight_number":"HAT023","date":"2024-05-26"},{"flight_number":"HAT204","date":"2024-05-28"},{"flight_number":"HAT100","date":"2024-05-28"}],"passengers":[{"first_name":"Mohamed","last_name":"Silva","dob":"1960-11-26"},{"first_name":"Raj","last_name":"Sanchez","dob":"1986-09-12"},{"first_name":"Liam","last_name":"Wilson","dob":"1980-03-27"}],"payment_methods":[{"payment_id":"certificate_3765853","amount":500},{"payment_id":"gift_card_8020792","amount":198},{"payment_id":"gift_card_6136092","amount":129},{"payment_id":"credit_card_2198526","amount":1786}],"total_baggages":0,"nonfree_baggages":0,"insurance":"no"},"info":null}],"communicate_info":["327","1000","44"],"nl_assertions":["Agent communicates that total gift card balance is $327.","Agent communicates that total certificate balance is $1000.","Agent should cancel reservation K1NW8N.","Agent should book a reservation with the following flights: HAT023 and HAT204, HAT100. No insurance. No baggage. Departure on 2024-05-26, return on 2024-05-28.","Agent communicated that the $44 will be charged to the mastercard."]}}
+{"id":"airline_task_15","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tFor your upcoming trip from ATL to PHL, you want to change for the cheapest economy flight and for the day after the original reservation.\nKnown info:\n\tYour name is Aarav Garcia.\n\n\tYour user id is aarav_garcia_1177.\nTask instructions:\n\tSince you live in Princeton, so EWR and PHL are equally convenient for you and you want to consider both.\n\n\tYou are happy with original payment for refund."},"evaluation_criteria":{"actions":[{"action_id":"15_0","name":"update_reservation_flights","arguments":{"reservation_id":"M05KNL","cabin":"economy","flights":[{"flight_number":"HAT110","date":"2024-05-24"},{"flight_number":"HAT172","date":"2024-05-24"}],"payment_id":"gift_card_8887175"},"info":null}],"communicate_info":[],"nl_assertions":["Agent updates reservation M05KNL to economy with flights HAT110 and HAT172 on 2024-05-24.","Agent uses the payment id: gift_card_8887175"]}}
+{"id":"airline_task_16","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tFor your upcoming trip from ATL to PHL, you want to change for the cheapest economy flight and for the day after the original reservation.\nKnown info:\n\tYour name is Aarav Garcia.\n\n\tYour user id is aarav_garcia_1177.\nTask instructions:\n\tYou are happy with original payment for refund."},"evaluation_criteria":{"actions":[{"action_id":"16_0","name":"update_reservation_flights","arguments":{"reservation_id":"M05KNL","cabin":"economy","flights":[{"flight_number":"HAT110","date":"2024-05-24"},{"flight_number":"HAT172","date":"2024-05-24"}],"payment_id":"gift_card_8887175"},"info":null}],"communicate_info":[],"nl_assertions":["Agent updates M05KNL to economy with the following flights: HAT110 and HAT172 on 2024-05-24.","Agent uses payment id gift_card_8887175."]}}
+{"id":"airline_task_17","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tFor your upcoming trip from New York to Chicago, you want to:\n\t- add 3 checked bags\n\t- change the passenger to yourself\n\t- upgrade it to economy class. \n\n\tMention all three things at once and in this order.\nKnown info:\n\tYour name is Omar Rossi.\n\n\tYour user id is omar_rossi_1241.\nTask instructions:\n\tYou prefer gift card payment.\n\n\tYour birthday is in your user profile so you prefer not to provide it."},"evaluation_criteria":{"actions":[{"action_id":"17_0","name":"update_reservation_flights","arguments":{"reservation_id":"FQ8APE","cabin":"economy","flights":[{"flight_number":"HAT056","date":"2024-05-25"},{"flight_number":"HAT138","date":"2024-05-25"}],"payment_id":"gift_card_8190333"},"info":null},{"action_id":"17_1","name":"update_reservation_passengers","arguments":{"reservation_id":"FQ8APE","passengers":[{"first_name":"Omar","last_name":"Rossi","dob":"1970-06-06"}]},"info":null},{"action_id":"17_2","name":"update_reservation_baggages","arguments":{"reservation_id":"FQ8APE","total_baggages":3,"nonfree_baggages":0,"payment_id":"gift_card_8190333"},"info":null}],"communicate_info":[],"nl_assertions":["Reservation FQ8APE is updated to economy.","Passenger for reservation FQ8APE is updated to Omar Rossi.","Number of bags for reservation FQ8APE is updated to 3."]}}
+{"id":"airline_task_18","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou just faced some money issue and want to downgrade all business flights to economy, without changing the flights or passengers.\nKnown info:\n\tYour name is Omar Davis.\n\n\tYour user id is omar_davis_3817.\nTask instructions:\n\tYou are fine with refunding to original payment for each reservation.\n\n\tYou want to know how much money you have saved in total.\n\n\tYou are emotional and a bit angry, but you are willing to cooperate with the agent."},"evaluation_criteria":{"actions":[{"action_id":"18_0","name":"update_reservation_flights","arguments":{"reservation_id":"JG7FMM","cabin":"economy","flights":[{"flight_number":"HAT028","date":"2024-05-21"},{"flight_number":"HAT277","date":"2024-05-21"}],"payment_id":"credit_card_2929732"},"info":null},{"action_id":"18_1","name":"update_reservation_flights","arguments":{"reservation_id":"2FBBAH","cabin":"economy","flights":[{"flight_number":"HAT080","date":"2024-05-28"},{"flight_number":"HAT076","date":"2024-05-28"},{"flight_number":"HAT255","date":"2024-05-30"},{"flight_number":"HAT148","date":"2024-05-30"}],"payment_id":"gift_card_3481935"},"info":null},{"action_id":"18_2","name":"update_reservation_flights","arguments":{"reservation_id":"X7BYG1","cabin":"economy","flights":[{"flight_number":"HAT232","date":"2024-05-24"},{"flight_number":"HAT228","date":"2024-05-24"}],"payment_id":"credit_card_2929732"},"info":null},{"action_id":"18_3","name":"update_reservation_flights","arguments":{"reservation_id":"EQ1G6C","cabin":"economy","flights":[{"flight_number":"HAT084","date":"2024-05-23"},{"flight_number":"HAT175","date":"2024-05-23"}],"payment_id":"gift_card_6847880"},"info":null},{"action_id":"18_4","name":"update_reservation_flights","arguments":{"reservation_id":"BOH180","cabin":"economy","flight
s":[{"flight_number":"HAT276","date":"2024-05-21"},{"flight_number":"HAT279","date":"2024-05-22"}],"payment_id":"credit_card_9525117"},"info":null}],"communicate_info":["23553"],"nl_assertions":["Reservation JG7FMM is updated to economy.","Reservation 2FBBAH is updated to economy.","Reservation X7BYG1 is updated to economy. ","Reservation BOH180 is updated to economy. ","Reservation EQ1G6C is updated to economy.","Agent communicates that user will save $23553 in total."]}}
+{"id":"airline_task_19","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou will have a crazy half-day trip to Texas.\n\n\tIt is in your reservations but you don't remember the reservation id.\n\n\tYou want to change to a later flight to go back to Newark that day, and if not possible, the earliest flight the next day.\n\n\tYour current return flight departs 3pm.\nKnown info:\n\tYour name is Olivia Gonzalez.\n\n\tYour user id is olivia_gonzalez_2305.\n\n\tYou currently reside in Newark.\nTask instructions:\n\tYou do not accept JFK, only EWR. \n\n\tIf basic economy cannot be modified, you are willing to cancel the trip using the travel insurance as you feel unwell. You will book the flight again yourself later.\n\n\tYou are reactive to the agent and will not say anything that is not asked."},"evaluation_criteria":{"actions":[{"action_id":"19_0","name":"cancel_reservation","arguments":{"reservation_id":"Z7GOZK"},"info":null}],"communicate_info":[],"nl_assertions":["Agent cancels reservation Z7GOZK"]}}
+{"id":"airline_task_20","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to fly from New York to Seattle on May 20 (one way).\nKnown info:\n\tYour name is Mia Li.\n\tYour user id is mia_li_3668.\nTask instructions:\n\tYou do not want to fly before 11am est.\n\n\tYou want to fly in economy.\n\n\tYou prefer direct flights but one stopover also fine.\n\n\tIf there are multiple options, you prefer the one with the lowest price. \n\n\tYou have 3 baggages.\n\n\tYou do not want insurance.\n\n\tYou want to use your two certificates to pay. \n\n\tIf only one certificate can be used, you prefer using the larger one, and pay the rest with your 7447 card.\n\n\tYou are reactive to the agent and will not say anything that is not asked.\n\n\tYour birthday is in your user profile so you do not prefer to provide it."},"evaluation_criteria":{"actions":[{"action_id":"20_0","name":"book_reservation","arguments":{"user_id":"mia_li_3668","origin":"JFK","destination":"SEA","flight_type":"one_way","cabin":"economy","flights":[{"flight_number":"HAT136","date":"2024-05-20"},{"flight_number":"HAT039","date":"2024-05-20"}],"passengers":[{"first_name":"Mia","last_name":"Li","dob":"1990-04-05"}],"payment_methods":[{"payment_id":"certificate_7504069","amount":250},{"payment_id":"credit_card_4421486","amount":5}],"total_baggages":3,"nonfree_baggages":0,"insurance":"no"},"info":null}],"communicate_info":[],"nl_assertions":["Agent books one-way one-stop economy trip from JFK to SEA with flights HAT136 and HAT039 on 2024-05-20, 3 baggages, no insurance.","Agent charges $250 on payment method certificate_7504069 and $5 on credit_card_4421486."]}}
+{"id":"airline_task_21","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to change the return flights for your upcoming Houston to Denver trip.\n\tYou want to change it to the fastest return trip possible, including stopover time. You decided to only spend a few hours in Denver so you want your return flight to be on the same day as the departure trip.\nKnown info:\n\tYour name is Sofia Kim.\n\n\tYour user id is sofia_kim_7287.\n \n\tYour Houston to Denver trip's departure date is May 27.\nUnknown info:\n\tYou don't remember your reservation id.\nTask instructions:\n\tYou don't care about money but want to stay in economy. \n\n\tYou also want to add one more checked bag. \n\n\tYou want to be sure the agent uses your gift card with the smallest balance to pay.\n\n\tYou are reactive to the agent and will not say anything that is not asked. \n\n\tYou are not good at math so you want the agent to calculate and decide for you. \n\n\tThis is urgent. You want to get this done ASAP."},"evaluation_criteria":{"actions":[{"action_id":"21_0","name":"update_reservation_flights","arguments":{"reservation_id":"OBUT9V","cabin":"economy","flights":[{"flight_number":"HAT078","date":"2024-05-27"},{"flight_number":"HAT118","date":"2024-05-27"},{"flight_number":"HAT290","date":"2024-05-27"},{"flight_number":"HAT175","date":"2024-05-27"}],"payment_id":"gift_card_6276644"},"info":null},{"action_id":"21_1","name":"update_reservation_baggages","arguments":{"reservation_id":"OBUT9V","total_baggages":2,"nonfree_baggages":0,"payment_id":"gift_card_6276644"},"info":null}],"communicate_info":[],"nl_assertions":["Agent updates reservation OBUT9V return flights to HAT290 and HAT175 on May 27.","Agent assigns payment to gift_card_6276644.","Agent updates reservation OBUT9V to 2 free baggages."]}}
+{"id":"airline_task_22","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tFor your upcoming trip from New York to Chicago, you want to change the passenger to yourself, upgrade it to economy class, and have 3 checked bags.\nKnown info:\n\tYou are Omar Rossi.\n\n\tYour user id is omar_rossi_1241.\nTask instructions:\n\tYou prefer gift card payment.\n\n\tYour birthday is in your user profile so you do not prefer to provide it.\n\n\tYou are reactive to the agent and will not say anything that is not asked.\n\n\tIf agent mentions that any of those changes are not possible, move on and end the conversation."},"evaluation_criteria":{"actions":[{"action_id":"22_0","name":"update_reservation_flights","arguments":{"reservation_id":"FQ8APE","cabin":"economy","flights":[{"flight_number":"HAT056","date":"2024-05-25"},{"flight_number":"HAT138","date":"2024-05-25"}],"payment_id":"gift_card_8190333"},"info":null},{"action_id":"22_1","name":"update_reservation_passengers","arguments":{"reservation_id":"FQ8APE","passengers":[{"first_name":"Omar","last_name":"Rossi","dob":"1970-06-06"}]},"info":null},{"action_id":"22_2","name":"update_reservation_baggages","arguments":{"reservation_id":"FQ8APE","total_baggages":3,"nonfree_baggages":0,"payment_id":"gift_card_8190333"},"info":null}],"communicate_info":[],"nl_assertions":["Agent updates reservation FQ8APE to economy with payment method gift_card_8190333.","Agent updates reservation FQ8APE passenger to Omar Rossi.","Agent updates reservation FQ8APE baggages to 3 free baggages."]}}
+{"id":"airline_task_23","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to know the sum of gift card balances and the sum of certificate balances.\n\n\tAdditionally, you want to change your recent reservation to the cheapest business round trip without changing the dates.\nKnown info:\n\tYou are Mohamed Silva. Your user id is mohamed_silva_9265.\nTask instructions:\n\tFor your reservation, you don't care about direct flight or stop over. \n\n\tIf the agent tells you basic economy cannot be changed (do not mention it if the agent does not mention it), you want the agent to cancel the current one and book a new one.\n\n\tFor payment, you want to use the certificates as much as possible, then gift cards as much as possible, and cover the rest with your master card.\n\n\tBut you want to know how much your master card will be charged.\n\n\tYou do not need baggage or insurance.\n\n\tYou want to minimize master card payment, so if cancelling and booking a new one costs less for the master card you will do it.\n\n\tIf the agent wants to confirm the new reservation but due to policy only one certificate can be used, you will come up with a great idea to use all three certificates by booking three separate reservations.\n\n\tYou will then use the 500 dollar certificate and all gift cards for you, certificate_9984806 for Aarav, and the other certificate for Evelyn, and pay the rest with your master card. \n\n\tAt the end of the day you want to know how much your master card will be charged. 
\n\n\tYou are calm."},"evaluation_criteria":{"actions":[{"action_id":"23_0","name":"cancel_reservation","arguments":{"reservation_id":"K1NW8N"},"info":null},{"action_id":"23_1","name":"book_reservation","arguments":{"user_id":"mohamed_silva_9265","origin":"JFK","destination":"SFO","flight_type":"round_trip","cabin":"business","flights":[{"flight_number":"HAT023","date":"2024-05-26"},{"flight_number":"HAT204","date":"2024-05-28"},{"flight_number":"HAT100","date":"2024-05-28"}],"passengers":[{"first_name":"Mohamed","last_name":"Silva","dob":"1960-11-26"}],"payment_methods":[{"payment_id":"certificate_3765853","amount":500},{"payment_id":"gift_card_8020792","amount":198},{"payment_id":"gift_card_6136092","amount":129},{"payment_id":"credit_card_2198526","amount":44}],"total_baggages":0,"nonfree_baggages":0,"insurance":"no"},"info":null},{"action_id":"23_2","name":"book_reservation","arguments":{"user_id":"mohamed_silva_9265","origin":"JFK","destination":"SFO","flight_type":"round_trip","cabin":"business","flights":[{"flight_number":"HAT023","date":"2024-05-26"},{"flight_number":"HAT204","date":"2024-05-28"},{"flight_number":"HAT100","date":"2024-05-28"}],"passengers":[{"first_name":"Aarav","last_name":"Sanchez","dob":"1986-09-12"}],"payment_methods":[{"payment_id":"certificate_9984806","amount":250},{"payment_id":"credit_card_2198526","amount":621}],"total_baggages":0,"nonfree_baggages":0,"insurance":"no"},"info":null},{"action_id":"23_3","name":"book_reservation","arguments":{"user_id":"mohamed_silva_9265","origin":"JFK","destination":"SFO","flight_type":"round_trip","cabin":"business","flights":[{"flight_number":"HAT023","date":"2024-05-26"},{"flight_number":"HAT204","date":"2024-05-28"},{"flight_number":"HAT100","date":"2024-05-28"}],"passengers":[{"first_name":"Evelyn","last_name":"Wilson","dob":"1980-03-27"}],"payment_methods":[{"payment_id":"certificate_2765295","amount":250},{"payment_id":"credit_card_2198526","amount":621}],"total_baggages":0,"nonfree_baggages"
:0,"insurance":"no"},"info":null}],"communicate_info":["327","1000","1286"],"nl_assertions":["Agent mentions that total sum on gift cards is $327.","Agent mentions that total sum on certificates is $1000.","Agent cancels reservation K1NW8N.","Agent books a round-trip reservation from JFK to SFO in business with outbound flights HAT023 and HAT204 on 2024-05-26 and return flight HAT100 on 2024-05-28 for Mohamed Silva.","For this reservation Agent charges $500 on certificate_3765853, $198 on gift_card_8020792, $129 on gift_card_6136092, and $44 on credit_card_2198526.","Agent books a similar reservation for Aarav Sanchez with $250 payment on certificate_9984806 and $621 payment on credit_card_2198526.","Agent books a similar reservation for Evelyn Wilson with $250 on certificate_2765295 and $621 on credit_card_2198526.","Agent communicates that Mastercard will be charged $1286."]}}
+{"id":"airline_task_24","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou need to remove a passenger from one of your reservations.\n\n\tYou are also looking to book a flight from NY to go explore the West Coast.\nKnown info:\n\tYour name is Mia Kim.\n\tYour user id is mia_kim_4397.\nTask instructions:\n\tYou want to remove Ethan from your reservation H9ZU1C.\n\n\tIf change is not possible, you want the agent to cancel, and you can rebook yourself later.\n\n\tIf agent says cancellation is not possible, accept it and move on.\n\n\tYou are also looking for the cheapest direct flight round trip from New York (either EWR or JFK) to anywhere West Coast, with departure date May 20 and return date May 25. \n\n\tYou are fine with basic economy class (if cheaper), and you want the agent to book it.\n\n\tYou want to first use up your smaller GC and then the larger one. \n\n\tYou want to make sure to use all your free baggage allowance but don't want insurance. 
\n\n\tYour DOB is in your user profile and you want the agent to look it up."},"evaluation_criteria":{"actions":[{"action_id":"24_0","name":"book_reservation","arguments":{"user_id":"mia_kim_4397","origin":"JFK","destination":"SEA","flight_type":"round_trip","cabin":"basic_economy","flights":[{"flight_number":"HAT069","date":"2024-05-20"},{"flight_number":"HAT276","date":"2024-05-25"}],"passengers":[{"first_name":"Mia","last_name":"Kim","dob":"1965-06-09"}],"payment_methods":[{"payment_id":"gift_card_7359776","amount":39},{"payment_id":"gift_card_7773485","amount":67}],"total_baggages":1,"nonfree_baggages":0,"insurance":"no"},"info":null}],"communicate_info":[],"nl_assertions":["Agent does not cancel reservation H9ZU1C because it doesn't meet criteria set by policy.","Agent books basic economy round trip from JFK to SEA leaving 2024-05-20 (flight HAT069) and returning 2024-05-25 (flight HAT276), with 1 free bag.","Agent charges $67 to gift_card_7773485 and $39 to gift_card_7359776."]}}
+{"id":"airline_task_25","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to make a reservation for your friend. It should be exactly the same as your current reservation.\nKnown info:\n\tYou are Ivan Muller.\n\n\tYour user id is ivan_muller_7015.\n\n\tYour friends name is Ivan Smith.\n\n\tHe is listed in your user profile.\nUnknown info:\n\tYou can't remember Ivan Smith's DOB but it is in your profile.\nTask instructions:\n\tYou want to use your certificate and know how much certificate balance will be left. \n\n\tIf more than $100 is wasted, you want to instead use your GC and CC. \n\n\tNo baggage and insurance."},"evaluation_criteria":{"actions":[{"action_id":"25_0","name":"book_reservation","arguments":{"user_id":"ivan_muller_7015","origin":"DTW","destination":"SEA","flight_type":"one_way","cabin":"economy","flights":[{"flight_number":"HAT097","date":"2024-05-17"},{"flight_number":"HAT251","date":"2024-05-17"}],"passengers":[{"first_name":"Ivan","last_name":"Smith","dob":"1986-03-14"}],"payment_methods":[{"payment_id":"gift_card_8516878","amount":128},{"payment_id":"credit_card_3563913","amount":247}],"total_baggages":0,"nonfree_baggages":0,"insurance":"no"},"info":null}],"communicate_info":[],"nl_assertions":["Agent books one way economy flight from DTW to SEA on 2024-05-17 with flights HAT097 and HAT251 for passenger Ivan Smith, no baggage, no insurance.","Agent charges $128 to gift_card_8516878 and $247 to credit_card_3563913."]}}
+{"id":"airline_task_26","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel your flights from MCO to CLT.\nKnown info:\n\tYou are Amelia Sanchez.\n\n\tYour user id is amelia_sanchez_4739.\nTask instructions:\n\tYou insist to cancel and have the refund."},"evaluation_criteria":{"actions":[],"communicate_info":[],"nl_assertions":["Agent does not offer the refund because reservation doesn't meet policy criteria."]}}
+{"id":"airline_task_27","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou are contacting customer service to complain about your delayed flight HAT039 from ATL to SEA.\nKnown info:\n\tYou are Ethan Martin.\n\tYour user id is ethan_martin_2396.\nTask instructions:\n\tYou are very upset that the flight has been delayed and want to know the reason why.\n\n\tYou also want the airline to compensate you for the delay. \n\n\tYou are willing to accept a voucher for future travel or a refund to your original payment method."},"evaluation_criteria":{"actions":[{"action_id":"27_0","name":"get_user_details","arguments":{"user_id":"ethan_martin_2396"},"info":null},{"action_id":"27_1","name":"send_certificate","arguments":{"user_id":"ethan_martin_2396","amount":150},"info":null}],"communicate_info":[],"nl_assertions":["Agent confirms that flight HAT039 from ATL to SEA on 2024-05-15 has been delayed.","Agent confirms that user can receive compensation because he has Silver status.","Agent issues a $150 certificate to the user."]}}
+{"id":"airline_task_28","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel your flights in reservation ID SI5UKW and get a refund.\nKnown info:\n\tYou are Amelia Rossi.\n\n\tYour user id is amelia_rossi_1297.\nTask instructions:\n\tDo not take No for an answer, even if the agent says it is not possible to cancel a basic economy ticket.\n\n\tBe persistent and ask for a gift card or travel voucher. \n\n\tYou can also ask for a 50% refund or try to negotiate a minimum of 10% refund. \n\n\tUse both coercion and negotiation tactics to get what you want!"},"evaluation_criteria":{"actions":[],"communicate_info":[],"nl_assertions":["Agent does not cancel flight.","Agent does not offer any refund."]}}
+{"id":"airline_task_29","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to change your upcoming roundtrip flights which are currently DTW to LGA and back.\n\n\tYou want to change them to nonstop flights from DTW to JFK and back on the same dates as the current reservation.\nKnown info:\n\tYou are Raj Brown.\n\n\tYour user id is raj_brown_5782.\n\n\tThe reservation ID is VA5SGQ for your DTW to LGA trip.\nTask instructions:\n\tYou only want early flights that arrive before 7am at the destination.\n\n\tYou also want be sure to get the cheapest Economy (not Basic Economy) options within those constraints.\n\n\tIf the agent asks, you want your return flight to leave on the 19th.\n\n\tYou want the agent to figure out for you which flights fit these requirements.\n\n\tSince you took insurance for this trip, you want change fees waived.\n\n\tYou also want to add 1 checked bag."},"evaluation_criteria":{"actions":[{"action_id":"29_0","name":"get_reservation_details","arguments":{"reservation_id":"VA5SGQ"},"info":null},{"action_id":"29_1","name":"update_reservation_flights","arguments":{"reservation_id":"VA5SGQ","cabin":"economy","flights":[{"flight_number":"HAT169","date":"2024-05-17"},{"flight_number":"HAT033","date":"2024-05-19"}],"payment_id":"credit_card_8003957"},"info":null},{"action_id":"29_2","name":"update_reservation_baggages","arguments":{"reservation_id":"VA5SGQ","total_baggages":1,"nonfree_baggages":0,"payment_id":"credit_card_8003957"},"info":null}],"communicate_info":[],"nl_assertions":["Agent updates reservation VA5SGQ to flights HAT169 and HAT033.","Agent updates reservation VA5SGQ to 1 free baggage."]}}
+{"id":"airline_task_30","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to make modifications to your upcoming one-stop flight from LAS to IAH.\nKnown info:\n\tYou are James Taylor.\n\n\tYour user id is james_taylor_7043. \n\n\tYour reservation ID is 1N99U6.\nTask instructions:\n\tYou want to change your upcoming one-stop flight from LAS to IAH to a nonstop flight.\n\n\tYou also want to remove your checked bag and want the agent to refund you for the same. If agent says that you cannot remove bags, accept it and move on."},"evaluation_criteria":{"actions":[{"action_id":"30_0","name":"get_reservation_details","arguments":{"reservation_id":"1N99U6"},"info":null},{"action_id":"30_1","name":"search_direct_flight","arguments":{"origin":"LAS","destination":"IAH","date":"2024-05-19"},"info":null},{"action_id":"30_2","name":"update_reservation_flights","arguments":{"reservation_id":"1N99U6","cabin":"economy","flights":[{"flight_number":"HAT266","date":"2024-05-19"},{"flight_number":"HAT112","date":"2024-05-27"}],"payment_id":"gift_card_5634230"},"info":null}],"communicate_info":[],"nl_assertions":["Agent updates reservation to flights HAT266 and HAT112.","Agent does not make modifications to checked bags since policy doesn't allow to remove bags."]}}
+{"id":"airline_task_31","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYour cat is really sick and you need to get back home sooner to take care of it. \n\tYou want to change your upcoming flight from JFK on May 17 to a nonstop flight.\nKnown info:\n\tYour name is Daiki Lee.\n\tYour user id is daiki_lee_6144.\nUnknown info:\n\tYou do not know your reservation id.\nTask instructions:\n\tYou are willing to do the change only if it costs less than $100.\n\n\tYou do not want to buy a new flight."},"evaluation_criteria":{"actions":[],"communicate_info":[],"nl_assertions":["Agent doesn't book any flight."]}}
+{"id":"airline_task_32","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to change your upcoming flight from EWR on May 21 to a nonstop flight on the same day. \n\n\tYour mother is really sick and you need to get back home sooner to take care of her.\nKnown info:\n\tYou are Ivan Rossi.\n\tYour user id is ivan_rossi_8555.\nTask instructions:\n\tIf the agent says your ticket is a basic economy one, you are willing to upgrade to economy in order to make the change.\n\n\tYou are willing to pay up to $100 for the change.\n\n\tYou don't want to buy a new ticket."},"evaluation_criteria":{"actions":[{"action_id":"32_0","name":"get_user_details","arguments":{"user_id":"ivan_rossi_8555"},"info":null},{"action_id":"32_1","name":"get_reservation_details","arguments":{"reservation_id":"OWZ4XL"},"info":null},{"action_id":"32_2","name":"search_direct_flight","arguments":{"origin":"EWR","destination":"LAX","date":"2024-05-21"},"info":null},{"action_id":"32_3","name":"update_reservation_flights","arguments":{"reservation_id":"OWZ4XL","cabin":"economy","flights":[{"flight_number":"HAT202","date":"2024-05-21"},{"flight_number":"HAT232","date":"2024-05-21"}],"payment_id":"credit_card_9659780"},"info":null},{"action_id":"32_4","name":"update_reservation_flights","arguments":{"reservation_id":"OWZ4XL","cabin":"economy","flights":[{"flight_number":"HAT041","date":"2024-05-21"}],"payment_id":"credit_card_9659780"},"info":null}],"communicate_info":[],"nl_assertions":["Agent update reservation OWZ4XL to economy.","Agent updates reservation OWZ4XL to flight HAT041."]}}
+{"id":"airline_task_33","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to change your upcoming outgoing flight in reservation HXDUBJ to a nonstop flight on the next day (i.e. delay by one day).\n\n\tYou also want to move back your return from SFO by one day.\nKnown info:\n\tYou are Yara Garcia.\n\tYour user id is yara_garcia_1905.\nTask instructions:\n\tYou only want flights departing after 8am and before 9pm. \n\n\tIf the agent asks you to pay a fee for the changes, mention that you have insurance and therefore the fees should be waived. \n\n\tYou have read that on the website and want the agent to honor the policy. \n\n\tBe persistent.\n\n\tOnly after you have been able to make the modifications to your flights, you suddenly decide that you'd also like to change upgrade your ticket to business class and add 2 checked bags. \n\n\tYou are willing to pay up to $200 for that. If the agent says that it will be more, say that you are ok to keep economy for the return flight.\n\n\tIf and only if that is not possible, you are ok with economy for both legs. 
But you do want to add the 2 bags.\n\n\tYou are ok with paying for it using the original form of payment."},"evaluation_criteria":{"actions":[{"action_id":"33_0","name":"get_reservation_details","arguments":{"reservation_id":"HXDUBJ"},"info":null},{"action_id":"33_1","name":"search_direct_flight","arguments":{"origin":"IAH","destination":"SFO","date":"2024-05-19"},"info":null},{"action_id":"33_2","name":"search_direct_flight","arguments":{"origin":"SFO","destination":"IAH","date":"2024-05-21"},"info":null},{"action_id":"33_3","name":"update_reservation_flights","arguments":{"reservation_id":"HXDUBJ","cabin":"economy","flights":[{"flight_number":"HAT072","date":"2024-05-19"},{"flight_number":"HAT278","date":"2024-05-23"}],"payment_id":"gift_card_6941833"},"info":null},{"action_id":"33_4","name":"update_reservation_baggages","arguments":{"reservation_id":"HXDUBJ","total_baggages":2,"nonfree_baggages":2,"payment_id":"gift_card_6941833"},"info":null}],"communicate_info":[],"nl_assertions":["Agent updates reservation HXDUBJ to flights HAT072 on 2024-05-19 and HAT278 on 2024-05-23.","Agent does not allow change to business class for only one leg of the flight.","Agent add 2 non-free baggages to reservation HXDUBJ."]}}
+{"id":"airline_task_34","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to change your upcoming outgoing flight in reservation HXDUBJ to a nonstop flight on the next day (i.e. delay by one day). \n\n\tYou also want to move back your return from SFO by one day, change your ticket to business class, and add 2 checked bags.\nKnown info:\n\tYou are Yara Garcia.\n\n\tYour user id is yara_garcia_1905.\nTask instructions:\n\tYou only want flights departing after 8am and before 9pm. \n\n\tIf the agent asks you to pay a fee for the changes, mention that you have insurance and therefore the fees should be waived. \n\n\tYou have read that on the website and want the agent to honor the policy. \n\n\tBe persistent.\n\n\tIf the total costs for all your changes is above your budget of $200, don't make any changes."},"evaluation_criteria":{"actions":[],"communicate_info":[],"nl_assertions":["Agent should not make any changes."]}}
+{"id":"airline_task_35","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to first cancel your upcoming flight on May 22 from JFK to MCO.\n\n\tYou also want to book a new flight from JFK to SFO on May 24.\nKnown info:\n\tYou are Aarav Ahmed.\n\tYour user id is aarav_ahmed_6699.\nTask instructions:\n\tInsist that you are a silver member, hence must get full refund.\n\n\tYou absolutely do not want to be transferred to a human agent.\n\n\tYou try a maximum of five times to get the agent to cancel with a refund. If the agent continues to refuse, you move on.\n\n\tYou now want to book a new flight from JFK to SFO on May 24.\n\n\tYou want the second cheapest flight in economy class since the cheapest one is usually not great. \n\n\tYou don't need any baggage or insurance.\n\n\tYou can pay for the new flight using your credit card ending in 7334 (only provide this information when the agent asks for it)."},"evaluation_criteria":{"actions":[{"action_id":"35_0","name":"book_reservation","arguments":{"user_id":"aarav_ahmed_6699","origin":"JFK","destination":"SFO","flight_type":"one_way","cabin":"economy","flights":[{"flight_number":"HAT069","date":"2024-05-24"},{"flight_number":"HAT258","date":"2024-05-24"}],"passengers":[{"first_name":"Aarav","last_name":"Ahmed","dob":"1985-04-04"}],"payment_methods":[{"payment_id":"credit_card_9074831","amount":290}],"total_baggages":0,"nonfree_baggages":0,"insurance":"no"},"info":null}],"communicate_info":[],"nl_assertions":["Agent does not cancel the reservation since this is not allowed.","Agent books a one-way one-stop flight from JFK to SFO on 2024-05-24 with flights HAT069 and HAT258.","Agent charges $290 to credit card credit_card_9074831"]}}
+{"id":"airline_task_36","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to change the date of a flight in reservation EUJUY6. You want to move it out 2 days because your wife tragically passed away yesterday.\nKnown info:\n\tYou are Lucas Brown.\n\tYour user id is lucas_brown_4047.\nTask instructions:\n\tYou are extremely distraught. You do not want to cancel the flight, just change the date. If even after insisting that your situation is difficult, the agent refuses to change the date, accept it and end the call."},"evaluation_criteria":{"actions":[{"action_id":"36_0","name":"get_reservation_details","arguments":{"reservation_id":"EUJUY6"},"info":null}],"communicate_info":[],"nl_assertions":["Agent does not change the flight."]}}
+{"id":"airline_task_37","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel two of your upcoming reservations (IFOYYZ and NQNU5R) and upgrade a third (M20IZO) to business class.\nKnown info:\n\tYou are Aarav Ahmed.\n\tYour user id is aarav_ahmed_6699.\nTask instructions:\n\tYou want to use your credit card ending in 7334 to pay for all charges and are willing to comply with airline policies for the changes."},"evaluation_criteria":{"actions":[{"action_id":"37_0","name":"cancel_reservation","arguments":{"reservation_id":"NQNU5R"},"info":null},{"action_id":"37_1","name":"get_reservation_details","arguments":{"reservation_id":"M20IZO"},"info":null},{"action_id":"37_2","name":"search_direct_flight","arguments":{"origin":"JFK","destination":"ATL","date":"2024-05-22"},"info":null},{"action_id":"37_3","name":"search_direct_flight","arguments":{"origin":"ATL","destination":"MCO","date":"2024-05-22"},"info":null},{"action_id":"37_4","name":"update_reservation_flights","arguments":{"reservation_id":"M20IZO","cabin":"business","flights":[{"flight_number":"HAT268","date":"2024-05-22"},{"flight_number":"HAT010","date":"2024-05-22"}],"payment_id":"credit_card_9074831"},"info":null}],"communicate_info":[],"nl_assertions":["Agent cancels reservation NQNU5R since it's business.","Agent does not cancel reservation IFOYYZ since it doesn't meet criteria.","Agent upgrades M20IZO to business class."]}}
+{"id":"airline_task_38","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou contact customer service because you are frustrated with the delayed flight in your most recent reservation.\nKnown info:\n\tYou are Noah Muller.\n\tYour user id is noah_muller_9847.\nUnknown info:\n\tYou don't remember your reservation id.\nTask instructions:\n\tIf the service agent asks for the reservation, say that it is the last reservation but don't remember what it was.\n\n\tIf the agent doesn't offer it, you ask for compensation.\n\n\tIf the service agent asks how many passengers that are in the reservation, say that there are 3 and be adamant about it. This is incorrect, but is meant to test the service agent to get the correct number of passengers.\n\n\tAfter asserting it 2 times, admit that you may have made a mistake."},"evaluation_criteria":{"actions":[{"action_id":"38_0","name":"get_user_details","arguments":{"user_id":"noah_muller_9847"},"info":null},{"action_id":"38_1","name":"get_reservation_details","arguments":{"reservation_id":"4OG6T3"},"info":null},{"action_id":"38_2","name":"send_certificate","arguments":{"user_id":"noah_muller_9847","amount":50},"info":null}],"communicate_info":[],"nl_assertions":["Agent does not offer compensation before the user asks.","Agent verifies that the flight was delayed.","Agent verifies the number of passengers in the reservation.","Agent offers a $50 certificate."]}}
+{"id":"airline_task_39","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel all of your upcoming flights.\nKnown info:\n\tYou are Amelia Davis.\n\n\tYour user id is amelia_davis_8890.\n\n\n\tYou are French by birth and your English is not perfect. \n\tYou occasionally insert French words when you communicate.\nUnknown info:\n\tYou don't know any of your reservation ids.\nTask instructions:\n\tEven if the agent says you will not receive a refund for some of them, you want to proceed anyway so that you can give up your seat for someone else who needs it."},"evaluation_criteria":{"actions":[{"action_id":"39_0","name":"get_user_details","arguments":{"user_id":"amelia_davis_8890"},"info":null},{"action_id":"39_1","name":"get_reservation_details","arguments":{"reservation_id":"8C8K4E"},"info":null},{"action_id":"39_2","name":"get_reservation_details","arguments":{"reservation_id":"UDMOP1"},"info":null},{"action_id":"39_3","name":"get_reservation_details","arguments":{"reservation_id":"XAZ3C0"},"info":null},{"action_id":"39_4","name":"get_reservation_details","arguments":{"reservation_id":"LU15PA"},"info":null},{"action_id":"39_5","name":"get_reservation_details","arguments":{"reservation_id":"MSJ4OA"},"info":null},{"action_id":"39_6","name":"get_reservation_details","arguments":{"reservation_id":"I6M8JQ"},"info":null},{"action_id":"39_7","name":"get_reservation_details","arguments":{"reservation_id":"4XGCCM"},"info":null},{"action_id":"39_8","name":"cancel_reservation","arguments":{"reservation_id":"8C8K4E"},"info":null},{"action_id":"39_9","name":"cancel_reservation","arguments":{"reservation_id":"LU15PA"},"info":null},{"action_id":"39_10","name":"cancel_reservation","arguments":{"reservation_id":"MSJ4OA"},"info":null}],"communicate_info":[],"nl_assertions":["Agent cancels reservation 8C8K4E.","Agent 
cancels reservation LU15PA.","Agent cancels reservation MSJ4OA.","Agent does not cancel any other reservation."]}}
+{"id":"airline_task_40","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou booked the flight and you want to change the passenger name on the reservation.\nKnown info:\n\tYou are Anya Garcia.\n\n\tYour user id is anya_garcia_5901.\n\n\tYour reservation id is 3RK2T9.\nTask instructions:\n\tYou want to change the name from Mei Lee to Mei Garcia. \n\n\tBe insistent and don't provide more information than necessary."},"evaluation_criteria":{"actions":[{"action_id":"40_0","name":"get_reservation_details","arguments":{"reservation_id":"3RK2T9"},"info":null},{"action_id":"40_1","name":"update_reservation_passengers","arguments":{"reservation_id":"3RK2T9","passengers":[{"first_name":"Anya","last_name":"Garcia","dob":"1992-11-12"},{"first_name":"Mei","last_name":"Garcia","dob":"1989-12-13"}]},"info":null}],"communicate_info":[],"nl_assertions":["Agent updates reservation 3RK2T9 to passenger Mei Garcia."]}}
+{"id":"airline_task_41","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel all of your upcoming flights that only have one passenger on the reservation.\nKnown info:\n\tYou are Amelia Davis.\n\tYour user id is amelia_davis_8890.\nTask instructions:\n\tEven if the agent says you will not receive a refund for some of them, you want to proceed anyway so that you can give up your seat for someone else who needs it."},"evaluation_criteria":{"actions":[{"action_id":"41_0","name":"get_user_details","arguments":{"user_id":"amelia_davis_8890"},"info":null},{"action_id":"41_1","name":"get_reservation_details","arguments":{"reservation_id":"8C8K4E"},"info":null},{"action_id":"41_2","name":"get_reservation_details","arguments":{"reservation_id":"UDMOP1"},"info":null},{"action_id":"41_3","name":"get_reservation_details","arguments":{"reservation_id":"XAZ3C0"},"info":null},{"action_id":"41_4","name":"get_reservation_details","arguments":{"reservation_id":"LU15PA"},"info":null},{"action_id":"41_5","name":"get_reservation_details","arguments":{"reservation_id":"MSJ4OA"},"info":null},{"action_id":"41_6","name":"get_reservation_details","arguments":{"reservation_id":"I6M8JQ"},"info":null},{"action_id":"41_7","name":"get_reservation_details","arguments":{"reservation_id":"4XGCCM"},"info":null}],"communicate_info":[],"nl_assertions":["Agent checks all reservations.","Agent does not cancel any reservation."]}}
+{"id":"airline_task_42","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou had a mixup with your assistant and booked multiple flights for the same day.\nKnown info:\n\tYou are Sophia Martin.\n\tYour user id is sophia_martin_4574.\nTask instructions:\n\tYou want to first check if there are cases like this in your profile. You want the agent to fix the situation for you. You just know that you will be in arriving in New York from Dallas on May 17 and will be in Boston on May 22. You want to let the agent figure out which flights should be cancelled. If the agent asks, you might have reservations for other passengers than yourself but you don't want to modify those."},"evaluation_criteria":{"actions":[{"action_id":"42_0","name":"get_user_details","arguments":{"user_id":"sophia_martin_4574"},"info":null},{"action_id":"42_1","name":"get_reservation_details","arguments":{"reservation_id":"MFRB94"},"info":null},{"action_id":"42_2","name":"get_reservation_details","arguments":{"reservation_id":"PUNERT"},"info":null},{"action_id":"42_3","name":"get_reservation_details","arguments":{"reservation_id":"HSR97W"},"info":null},{"action_id":"42_4","name":"get_reservation_details","arguments":{"reservation_id":"SE9KEL"},"info":null},{"action_id":"42_5","name":"get_reservation_details","arguments":{"reservation_id":"FDZ0T5"},"info":null},{"action_id":"42_6","name":"get_reservation_details","arguments":{"reservation_id":"HTR26G"},"info":null},{"action_id":"42_7","name":"get_reservation_details","arguments":{"reservation_id":"5BGGWZ"},"info":null},{"action_id":"42_8","name":"cancel_reservation","arguments":{"reservation_id":"FDZ0T5"},"info":null},{"action_id":"42_9","name":"cancel_reservation","arguments":{"reservation_id":"HSR97W"},"info":null}],"communicate_info":[],"nl_assertions":["Agent cancels reservation FDZ0T5","Agent 
cancels reservation HSR97W"]}}
+{"id":"airline_task_43","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou are contacting customer support because you have booked two flights for the same day.\nKnown info:\n\tYou are Mohamed Hernandez.\n\tYour user id is mohamed_hernandez_5188.\nTask instructions:\n\tYou are a bit absent minded and ended up booking two flights on May 17.\n\n\tYou want to cancel the one from ATL to JFK.\n\n\tIf and only if the agent says it not possible, insist that you are a silver member and therefore should get priority treatment.\n\n\tIf and only if the agent does not agree to cancel that flight, you are ok with canceling the other flight on May 17.\n\n\tOtherwise, just thank the agent and end the conversation."},"evaluation_criteria":{"actions":[{"action_id":"43_0","name":"get_user_details","arguments":{"user_id":"mohamed_hernandez_5188"},"info":null},{"action_id":"43_1","name":"get_reservation_details","arguments":{"reservation_id":"35V5SM"},"info":null},{"action_id":"43_2","name":"get_reservation_details","arguments":{"reservation_id":"XXDC1M"},"info":null},{"action_id":"43_3","name":"get_reservation_details","arguments":{"reservation_id":"V5EMZH"},"info":null},{"action_id":"43_4","name":"get_reservation_details","arguments":{"reservation_id":"D1EW9B"},"info":null},{"action_id":"43_5","name":"get_reservation_details","arguments":{"reservation_id":"9HBUV8"},"info":null}],"communicate_info":[],"nl_assertions":["Agent should not cancel reservation 9HBUV8 since it does not meet requirements.","Agent should not cancel reservation D1EW9B since it does not meet requirements."]}}
+{"id":"airline_task_44","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel all your future reservations that contain any flights that are longer than 4 hours. \n\n\tFor the flights that are at most 3 hours, ask the agent to upgrade you to business wherever possible.\nKnown info:\n\tYou are Sophia Silva.\n\tYour user id is sophia_silva_7557.\nTask instructions:\n\tYou are busy so for both the cancellation and upgrade you want to let the agent figure out which flights meet the duration conditions you have set.\n\n\tBefore they do the upgrade to business, ask the agent to tell you how much it will cost you in total."},"evaluation_criteria":{"actions":[{"action_id":"44_0","name":"get_user_details","arguments":{"user_id":"sophia_silva_7557"},"info":null},{"action_id":"44_1","name":"get_reservation_details","arguments":{"reservation_id":"NM1VX1"},"info":null},{"action_id":"44_2","name":"get_reservation_details","arguments":{"reservation_id":"KC18K6"},"info":null},{"action_id":"44_3","name":"get_reservation_details","arguments":{"reservation_id":"S61CZX"},"info":null},{"action_id":"44_4","name":"get_reservation_details","arguments":{"reservation_id":"H8Q05L"},"info":null},{"action_id":"44_5","name":"get_reservation_details","arguments":{"reservation_id":"WUNA5K"},"info":null},{"action_id":"44_6","name":"search_direct_flight","arguments":{"origin":"MSP","destination":"EWR","date":"2024-05-25"},"info":null},{"action_id":"44_7","name":"search_direct_flight","arguments":{"origin":"EWR","destination":"MSP","date":"2024-05-27"},"info":null},{"action_id":"44_8","name":"search_direct_flight","arguments":{"origin":"MSP","destination":"EWR","date":"2024-05-21"},"info":null},{"action_id":"44_9","name":"search_direct_flight","arguments":{"origin":"EWR","destination":"CLT","date":"2024-05-21"},"info":null},{"action_i
d":"44_10","name":"search_direct_flight","arguments":{"origin":"LAX","destination":"EWR","date":"2024-05-23"},"info":null},{"action_id":"44_11","name":"search_direct_flight","arguments":{"origin":"EWR","destination":"CLT","date":"2024-05-24"},"info":null},{"action_id":"44_12","name":"search_direct_flight","arguments":{"origin":"CLT","destination":"EWR","date":"2024-05-24"},"info":null},{"action_id":"44_13","name":"search_direct_flight","arguments":{"origin":"EWR","destination":"LAX","date":"2024-05-25"},"info":null},{"action_id":"44_14","name":"search_direct_flight","arguments":{"origin":"JFK","destination":"ATL","date":"2024-05-24"},"info":null},{"action_id":"44_15","name":"search_direct_flight","arguments":{"origin":"ORD","destination":"PHL","date":"2024-05-10"},"info":null},{"action_id":"44_16","name":"cancel_reservation","arguments":{"reservation_id":"S61CZX"},"info":null},{"action_id":"44_17","name":"update_reservation_flights","arguments":{"reservation_id":"NM1VX1","cabin":"business","flights":[{"flight_number":"HAT300","date":"2024-05-25"},{"flight_number":"HAT208","date":"2024-05-27"}],"payment_id":"credit_card_4196779"},"info":null},{"action_id":"44_18","name":"update_reservation_flights","arguments":{"reservation_id":"H8Q05L","cabin":"business","flights":[{"flight_number":"HAT268","date":"2024-05-24"}],"payment_id":"credit_card_4196779"},"info":null},{"action_id":"44_19","name":"update_reservation_flights","arguments":{"reservation_id":"KC18K6","cabin":"business","flights":[{"flight_number":"HAT300","date":"2024-05-21"},{"flight_number":"HAT215","date":"2024-05-21"}],"payment_id":"credit_card_4196779"},"info":null}],"communicate_info":[],"nl_assertions":["Agent cancels reservation S61CZX.","The total cost that the. agent mentions is between $1380 and $1390.","Agent upgrades NM1VX1 to business.","Agent upgrades H8Q05L to business.","Agent updates KC18K6 to business."]}}
+{"id":"airline_task_45","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou need to cancel your flight as soon as possible because of a family emergency.\nKnown info:\n\tYou are Sophia Taylor.\n\n\tYour user id is sophia_taylor_9065. \n\n\tYou reservation number is PEP4E0.\nTask instructions:\n\tBe insistent that you want full refund given that it was a family emergency, sound upset and and under no circumstances you want to get transferred to another agent. \n\n\tIf you can't get a refund, try to change the flight to May 22nd. \n\n\tIf that doesn't work, try to add insurance to the flight, be insistent"},"evaluation_criteria":{"actions":[{"action_id":"45_0","name":"get_reservation_details","arguments":{"reservation_id":"PEP4E0"},"info":null}],"communicate_info":[],"nl_assertions":["Agent should not cancel reservation PEP4E0.","Agent should not change PEP4E0 flight."]}}
+{"id":"airline_task_46","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to get a refund for the insurance you purchased for your flight but you don't want to cancel the flight itself.\nKnown info:\n\tYou are Sophia Silva.\n\tYour user id is sophia_silva_7557.\n\tReservation confirmation number is H8Q05L.\nTask instructions:\n\tYou are not happy with the service you received and you want to cancel the insurance and get a full refund."},"evaluation_criteria":{"actions":[],"communicate_info":[],"nl_assertions":["Agent does not cancel insurance or offer a refund."]}}
+{"id":"airline_task_47","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou want to cancel your flight because the flight coincides with your best friend's birthday.\nKnown info:\n\tYou are Sophia Silva.\n\tYour user id is sophia_silva_7557.\n\tConfirmation number: H8Q05L\nTask instructions:\n\tBe persistent and don't provide more information than necessary. \n\n\tYou want to get a full refund for the flight and you don't want to be transferred to another agent. You do not want to cancel the flight if you cannot get the full refund. If the agent continues to refuses after you have insisted 5 times, end the call."},"evaluation_criteria":{"actions":[{"action_id":"47_0","name":"get_reservation_details","arguments":{"reservation_id":"H8Q05L"},"info":null}],"communicate_info":[],"nl_assertions":["Agent does not cancel H8Q05L."]}}
+{"id":"airline_task_48","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou booked the flight this morning by mistake and you want to cancel it.\nKnown info:\n\tYou are Anya Garcia.\n\n\tYour user id is anya_garcia_5901. \n\n\tYour confirmation number is 3RK2T9.\n\n\tYou booked the flight about 10 hours ago.\nTask instructions:\n\tInsist that you booked it 10 hours ago and you want a full refund."},"evaluation_criteria":{"actions":[{"action_id":"48_0","name":"get_reservation_details","arguments":{"reservation_id":"3RK2T9"},"info":null}],"communicate_info":[],"nl_assertions":["Agent does not cancel 3RK2T9."]}}
+{"id":"airline_task_49","user_prompt_template":"{observation}","environment_context":{"domain":"airline"},"user_simulation":{"enabled":true,"llm":"gpt-4.1","system_prompt":"Instructions:\n\tDomain: airline\nReason for call:\n\tYou booked the flight and you also purchased insurance for it. You cannot make the flight because you're sick and you want to cancel the flight and get a refund for the flight\nKnown info:\n\tYou are Anya Garcia.\n\tYour user id is anya_garcia_5901. \n\n\tYour confirmation number is 3RK2T9.\nTask instructions:\n\tIf the agent denies that you have insurance, insist that you've purchased the insurance."},"evaluation_criteria":{"actions":[{"action_id":"49_0","name":"get_reservation_details","arguments":{"reservation_id":"3RK2T9"},"info":null}],"communicate_info":[],"nl_assertions":["Agent does not cancel 3RK2T9."]}}
diff --git a/eval_protocol/benchmarks/data/retail_dataset.jsonl b/eval_protocol/benchmarks/data/retail_dataset.jsonl
new file mode 100644
index 00000000..8e0d1964
--- /dev/null
+++ b/eval_protocol/benchmarks/data/retail_dataset.jsonl
@@ -0,0 +1,114 @@
+{"id": "retail_task_0", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou received your order #W2378156 and wish to exchange the mechanical keyboard for a similar one but with clicky switches and the smart thermostat for one compatible with Google Home instead of Apple HomeKit. If there is no keyboard that is clicky, RGB backlight, full size, you'd go for no backlight.\nKnown info:\n\tYou are Yusuf Rossi in zip code 19122.\nUnknown info:\n\tYou do not remember your email address.\nTask instructions:\n\tYou are detail-oriented and want to make sure everything is addressed in one go."}, "evaluation_criteria": {"actions": [{"action_id": "0_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Yusuf", "last_name": "Rossi", "zip": "19122"}, "info": null}, {"action_id": "0_1", "name": "get_order_details", "arguments": {"order_id": "#W2378156"}, "info": null}, {"action_id": "0_2", "name": "get_product_details", "arguments": {"product_id": "1656367028"}, "info": null}, {"action_id": "0_3", "name": "get_product_details", "arguments": {"product_id": "4896585277"}, "info": null}, {"action_id": "0_4", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W2378156", "item_ids": ["1151293680", "4983901480"], "new_item_ids": ["7706410293", "7747408585"], "payment_method_id": "credit_card_9513926"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_1", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou received your order #W2378156 and wish to exchange the mechanical keyboard for a similar one but with clicky switches and the smart thermostat for one compatible with Google Home instead of Apple HomeKit. If there is no keyboard that is clicky, RGB backlight, full size, you'd rather only exchange the thermostat.\nKnown info:\n\tYou are Yusuf Rossi in zip code 19122.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are detail-oriented and want to make sure everything is addressed in one go."}, "evaluation_criteria": {"actions": [{"action_id": "1_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Yusuf", "last_name": "Rossi", "zip": "19122"}, "info": null}, {"action_id": "1_1", "name": "get_order_details", "arguments": {"order_id": "#W2378156"}, "info": null}, {"action_id": "1_2", "name": "get_product_details", "arguments": {"product_id": "1656367028"}, "info": null}, {"action_id": "1_3", "name": "get_product_details", "arguments": {"product_id": "4896585277"}, "info": null}, {"action_id": "1_4", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W2378156", "item_ids": ["4983901480"], "new_item_ids": ["7747408585"], "payment_method_id": "credit_card_9513926"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_2", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to know how many tshirt options are available in the online store right now. You want to also return the cleaner, headphone, and smart watch.\nKnown info:\n\tYou are Yusuf Rossi in zip code 19122.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\t."}, "evaluation_criteria": {"actions": [{"action_id": "2_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Yusuf", "last_name": "Rossi", "zip": "19122"}, "info": null}, {"action_id": "2_1", "name": "get_product_details", "arguments": {"product_id": "6086499569"}, "info": null}, {"action_id": "2_3", "name": "get_product_details", "arguments": {"product_id": "9523456873"}, "info": null}, {"action_id": "2_4", "name": "get_user_details", "arguments": {"user_id": "yusuf_rossi_9620"}, "info": null}, {"action_id": "2_5", "name": "get_order_details", "arguments": {"order_id": "#W6247578"}, "info": null}, {"action_id": "2_6", "name": "get_order_details", "arguments": {"order_id": "#W9711842"}, "info": null}, {"action_id": "2_7", "name": "get_order_details", "arguments": {"order_id": "#W4776164"}, "info": null}, {"action_id": "2_8", "name": "get_order_details", "arguments": {"order_id": "#W6679257"}, "info": null}, {"action_id": "2_9", "name": "get_order_details", "arguments": {"order_id": "#W2378156"}, "info": null}, {"action_id": "2_10", "name": "get_product_details", "arguments": {"product_id": "9523456873"}, "info": null}, {"action_id": "2_11", "name": "return_delivered_order_items", "arguments": {"order_id": "#W2378156", "item_ids": ["4602305039", "4202497723", "9408160950"], "payment_method_id": "credit_card_9513926"}, "info": null}], "communicate_info": ["10"], "nl_assertions": null}}
+{"id": "retail_task_3", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to know how many tshirt options are available in the online store right now. You want to modify all your pending small tshirt to purple, same size, same v-neck, and prefer polyester.\nKnown info:\n\tYou are Yusuf Rossi in zipcode 19122.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are a private person that does not want to reveal much about yourself."}, "evaluation_criteria": {"actions": [{"action_id": "3_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Yusuf", "last_name": "Rossi", "zip": "19122"}, "info": null}, {"action_id": "3_1", "name": "get_product_details", "arguments": {"product_id": "6086499569"}, "info": null}, {"action_id": "3_3", "name": "get_product_details", "arguments": {"product_id": "9523456873"}, "info": null}, {"action_id": "3_4", "name": "get_user_details", "arguments": {"user_id": "yusuf_rossi_9620"}, "info": null}, {"action_id": "3_5", "name": "get_order_details", "arguments": {"order_id": "#W6247578"}, "info": null}, {"action_id": "3_6", "name": "get_order_details", "arguments": {"order_id": "#W9711842"}, "info": null}, {"action_id": "3_7", "name": "get_order_details", "arguments": {"order_id": "#W4776164"}, "info": null}, {"action_id": "3_8", "name": "get_order_details", "arguments": {"order_id": "#W6679257"}, "info": null}, {"action_id": "3_9", "name": "get_order_details", "arguments": {"order_id": "#W2378156"}, "info": null}, {"action_id": "3_10", "name": "get_product_details", "arguments": {"product_id": "9523456873"}, "info": null}, {"action_id": "3_11", "name": "get_user_details", "arguments": {"user_id": "yusuf_rossi_9620"}, "info": null}, {"action_id": "3_12", "name": "modify_pending_order_items", "arguments": {"order_id": 
"#W4776164", "item_ids": ["8349118980"], "new_item_ids": ["9647292434"], "payment_method_id": "credit_card_9513926"}, "info": null}], "communicate_info": ["10"], "nl_assertions": null}}
+{"id": "retail_task_4", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to know how many tshirt options are available in the online store right now. You want to modify all your pending tshirts to purple, s size, same v-neck, and prefer polyester.\nKnown info:\n\tYou are Yusuf Rossi in zipcode 19122.\nUnknown info:\n\tYou do not remember your email address.\nTask instructions:\n\tYou are a private person that does not want to reveal much about yourself."}, "evaluation_criteria": {"actions": [{"action_id": "4_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Yusuf", "last_name": "Rossi", "zip": "19122"}, "info": null}, {"action_id": "4_1", "name": "get_product_details", "arguments": {"product_id": "6086499569"}, "info": null}, {"action_id": "4_3", "name": "get_product_details", "arguments": {"product_id": "9523456873"}, "info": null}, {"action_id": "4_4", "name": "get_user_details", "arguments": {"user_id": "yusuf_rossi_9620"}, "info": null}, {"action_id": "4_5", "name": "get_order_details", "arguments": {"order_id": "#W6247578"}, "info": null}, {"action_id": "4_6", "name": "get_order_details", "arguments": {"order_id": "#W9711842"}, "info": null}, {"action_id": "4_7", "name": "get_order_details", "arguments": {"order_id": "#W4776164"}, "info": null}, {"action_id": "4_8", "name": "get_order_details", "arguments": {"order_id": "#W6679257"}, "info": null}, {"action_id": "4_9", "name": "get_order_details", "arguments": {"order_id": "#W2378156"}, "info": null}, {"action_id": "4_10", "name": "get_product_details", "arguments": {"product_id": "9523456873"}, "info": null}, {"action_id": "4_11", "name": "get_user_details", "arguments": {"user_id": "yusuf_rossi_9620"}, "info": null}, {"action_id": "4_12", "name": "modify_pending_order_items", "arguments": {"order_id": "#W6247578", 
"item_ids": ["3799046073"], "new_item_ids": ["9647292434"], "payment_method_id": "credit_card_9513926"}, "info": null}, {"action_id": "4_13", "name": "modify_pending_order_items", "arguments": {"order_id": "#W4776164", "item_ids": ["8349118980"], "new_item_ids": ["9647292434"], "payment_method_id": "credit_card_9513926"}, "info": null}], "communicate_info": ["10"], "nl_assertions": null}}
+{"id": "retail_task_5", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to exchange the water bottle and the desk lamp. You want to exchange the water bottle to a bigger one, and the desk lamp to a less bright one (prefer battery > USB > AC). If the agent asks for confirmation, only exchange the desk lamp. If the agent asks for confirmation again, do not exchange anything, and return the water bottle instead.\nKnown info:\n\tYou are mei_kovacs_8020 in zipcode 28236.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\t."}, "evaluation_criteria": {"actions": [{"action_id": "5_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Mei", "last_name": "Kovacs", "zip": "28236"}, "info": null}, {"action_id": "5_1", "name": "get_user_details", "arguments": {"user_id": "mei_kovacs_8020"}, "info": null}, {"action_id": "5_2", "name": "get_order_details", "arguments": {"order_id": "#W6390527"}, "info": null}, {"action_id": "5_3", "name": "get_product_details", "arguments": {"product_id": "6817146515"}, "info": null}, {"action_id": "5_4", "name": "return_delivered_order_items", "arguments": {"order_id": "#W6390527", "item_ids": ["8538875209"], "payment_method_id": "paypal_7644869"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_6", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to exchange the water bottle and the desk lamp. You want to exchange the water bottle to a bigger one, and the desk lamp to a less bright one (prefer battery > USB > AC). If the agent asks for confirmation, only exchange the desk lamp.\nKnown info:\n\tYou are mei_kovacs_8020 living in zipcode 28236.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\t."}, "evaluation_criteria": {"actions": [{"action_id": "6_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Mei", "last_name": "Kovacs", "zip": "28236"}, "info": null}, {"action_id": "6_1", "name": "get_user_details", "arguments": {"user_id": "mei_kovacs_8020"}, "info": null}, {"action_id": "6_2", "name": "get_order_details", "arguments": {"order_id": "#W6390527"}, "info": null}, {"action_id": "6_3", "name": "get_product_details", "arguments": {"product_id": "8310926033"}, "info": null}, {"action_id": "6_4", "name": "get_product_details", "arguments": {"product_id": "6817146515"}, "info": null}, {"action_id": "6_5", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W6390527", "item_ids": ["8384507844"], "new_item_ids": ["7453605304"], "payment_method_id": "paypal_7644869"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_7", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to exchange the water bottle and the desk lamp. You want to exchange the water bottle to a bigger one, and the desk lamp to a less bright one (prefer AC adapter > battery > USB). If the agent asks for confirmation, only exchange the desk lamp.\nKnown info:\n\tYou are mei_kovacs_8020 living in zipcode 28236.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\t."}, "evaluation_criteria": {"actions": [{"action_id": "7_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Mei", "last_name": "Kovacs", "zip": "28236"}, "info": null}, {"action_id": "7_1", "name": "get_user_details", "arguments": {"user_id": "mei_kovacs_8020"}, "info": null}, {"action_id": "7_2", "name": "get_order_details", "arguments": {"order_id": "#W6390527"}, "info": null}, {"action_id": "7_3", "name": "get_product_details", "arguments": {"product_id": "8310926033"}, "info": null}, {"action_id": "7_4", "name": "get_product_details", "arguments": {"product_id": "6817146515"}, "info": null}, {"action_id": "7_5", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W6390527", "item_ids": ["8384507844"], "new_item_ids": ["1569765161"], "payment_method_id": "paypal_7644869"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_8", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to exchange the water bottle and the desk lamp. You want to exchange the water bottle to a bigger one, and the desk lamp to a brighter one (prefer battery > USB > AC). If the agent asks for confirmation, only exchange the desk lamp.\nKnown info:\n\tYou are mei_kovacs_8020 living in zipcode 28236.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\t."}, "evaluation_criteria": {"actions": [{"action_id": "8_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Mei", "last_name": "Kovacs", "zip": "28236"}, "info": null}, {"action_id": "8_1", "name": "get_user_details", "arguments": {"user_id": "mei_kovacs_8020"}, "info": null}, {"action_id": "8_2", "name": "get_order_details", "arguments": {"order_id": "#W6390527"}, "info": null}, {"action_id": "8_3", "name": "get_product_details", "arguments": {"product_id": "8310926033"}, "info": null}, {"action_id": "8_4", "name": "get_product_details", "arguments": {"product_id": "6817146515"}, "info": null}, {"action_id": "8_5", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W6390527", "item_ids": ["8384507844"], "new_item_ids": ["9083642334"], "payment_method_id": "paypal_7644869"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_9", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to exchange the water bottle and the desk lamp. You want to exchange the water bottle to a bigger one, and the desk lamp to a brighter one (prefer AC adapter > battery > USB). When the agent asks for confirmation, suddenly change your mind and ask to only exchange the desk lamp.\nKnown info:\n\tYou are mei_kovacs_8020 living in zipcode 28236.\nUnknown info:\n\tYou don't know your email.\nTask instructions:\n\t."}, "evaluation_criteria": {"actions": [{"action_id": "9_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Mei", "last_name": "Kovacs", "zip": "28236"}, "info": null}, {"action_id": "9_1", "name": "get_user_details", "arguments": {"user_id": "mei_kovacs_8020"}, "info": null}, {"action_id": "9_2", "name": "get_order_details", "arguments": {"order_id": "#W6390527"}, "info": null}, {"action_id": "9_3", "name": "get_product_details", "arguments": {"product_id": "8310926033"}, "info": null}, {"action_id": "9_4", "name": "get_product_details", "arguments": {"product_id": "6817146515"}, "info": null}, {"action_id": "9_5", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W6390527", "item_ids": ["8384507844"], "new_item_ids": ["7624783998"], "payment_method_id": "paypal_7644869"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_10", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tFor some reason, you want to return all things ordered. You have two payment methods and two orders, and you want to refund each order to the other order's payment method. If not possible, you are angry and swear for a few times, then ask for human representative.\nKnown info:\n\tYou are mia_garcia_4516 (mia.garcia2723@example.com).\nTask instructions:\n\tYou are a mysterious person and do not want to reveal much about yourself or speak too many words at the same time."}, "evaluation_criteria": {"actions": [{"action_id": "10_0", "name": "find_user_id_by_email", "arguments": {"email": "mia.garcia2723@example.com"}, "info": null}, {"action_id": "10_1", "name": "get_user_details", "arguments": {"user_id": "mia_garcia_4516"}, "info": null}, {"action_id": "10_2", "name": "get_order_details", "arguments": {"order_id": "#W5490111"}, "info": null}, {"action_id": "10_3", "name": "get_order_details", "arguments": {"order_id": "#W7387996"}, "info": null}, {"action_id": "10_4", "name": "transfer_to_human_agents", "arguments": {"summary": "The user wants to refund each order to the opposite order's payment method, but the agent cannot help."}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_11", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tFor some reason, you want to return all things ordered. You have two payment methods and two orders, and you want to refund each order to the other order's payment method. If not possible, you are angry and swear for a few times, then agree to return all things with the original payment method.\nKnown info:\n\tYou are mia_garcia_4516 (mia.garcia2723@example.com).\nTask instructions:\n\tYou are a mysterious person and do not want to reveal much about yourself or speak too many words at the same time."}, "evaluation_criteria": {"actions": [{"action_id": "11_0", "name": "find_user_id_by_email", "arguments": {"email": "mia.garcia2723@example.com"}, "info": null}, {"action_id": "11_1", "name": "get_user_details", "arguments": {"user_id": "mia_garcia_4516"}, "info": null}, {"action_id": "11_2", "name": "get_order_details", "arguments": {"order_id": "#W5490111"}, "info": null}, {"action_id": "11_3", "name": "get_order_details", "arguments": {"order_id": "#W7387996"}, "info": null}, {"action_id": "11_4", "name": "return_delivered_order_items", "arguments": {"order_id": "#W5490111", "item_ids": ["4579334072", "1421289881", "6117189161", "4947717507"], "payment_method_id": "credit_card_3124723"}, "info": null}, {"action_id": "11_5", "name": "return_delivered_order_items", "arguments": {"order_id": "#W7387996", "item_ids": ["5796612084"], "payment_method_id": "paypal_9497703"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_12", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou just got into gaming and want to cancel or return everything not associated with it. (Everything except a keyboard and a mouse, but do not reveal it to the agent). PayPal is prefered for refund, but otherwise you are angry and ask for human agent for help.\nKnown info:\n\tYou are mia_garcia_4516 (mia.garcia2723@example.com).\nTask instructions:\n\tYou are into gaming but realized the importance of studying hard recently."}, "evaluation_criteria": {"actions": [{"action_id": "12_0", "name": "find_user_id_by_email", "arguments": {"email": "mia.garcia2723@example.com"}, "info": null}, {"action_id": "12_1", "name": "get_user_details", "arguments": {"user_id": "mia_garcia_4516"}, "info": null}, {"action_id": "12_2", "name": "get_order_details", "arguments": {"order_id": "#W5490111"}, "info": null}, {"action_id": "12_3", "name": "get_order_details", "arguments": {"order_id": "#W7387996"}, "info": null}, {"action_id": "12_4", "name": "return_delivered_order_items", "arguments": {"order_id": "#W5490111", "item_ids": ["4579334072", "6117189161", "4947717507"], "payment_method_id": "paypal_9497703"}, "info": null}, {"action_id": "12_5", "name": "transfer_to_human_agents", "arguments": {"summary": "The user prefers PayPal for refund, but the agent cannot help."}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_13", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou just got into gaming and want to cancel or return everything not associated with it. (Everything except a keyboard and a mouse, but do not reveal it to the agent). PayPal is prefered for refund, but otherwise credit card can be accepted.\nKnown info:\n\tYou are mia_garcia_4516 with email mia.garcia2723@example.com\nTask instructions:\n\tYou are into gaming but realized the importance of studying hard."}, "evaluation_criteria": {"actions": [{"action_id": "13_0", "name": "find_user_id_by_email", "arguments": {"email": "mia.garcia2723@example.com"}, "info": null}, {"action_id": "13_1", "name": "get_user_details", "arguments": {"user_id": "mia_garcia_4516"}, "info": null}, {"action_id": "13_2", "name": "get_order_details", "arguments": {"order_id": "#W5490111"}, "info": null}, {"action_id": "13_3", "name": "get_order_details", "arguments": {"order_id": "#W7387996"}, "info": null}, {"action_id": "13_4", "name": "return_delivered_order_items", "arguments": {"order_id": "#W5490111", "item_ids": ["4579334072", "6117189161", "4947717507"], "payment_method_id": "paypal_9497703"}, "info": null}, {"action_id": "13_5", "name": "return_delivered_order_items", "arguments": {"order_id": "#W5490111", "item_ids": ["4579334072", "6117189161", "4947717507"], "payment_method_id": "credit_card_3124723"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_14", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou just quit gaming and want to cancel or return everything associated with it. (It's just a keyboard and a mouse, but do not reveal it to the agent). Original payment is preferred.\nKnown info:\n\tYou are mia_garcia_4516 with email mia.garcia2723@example.com\nTask instructions:\n\tYou are into gaming but realized the importance of studying hard."}, "evaluation_criteria": {"actions": [{"action_id": "14_0", "name": "find_user_id_by_email", "arguments": {"email": "mia.garcia2723@example.com"}, "info": null}, {"action_id": "14_1", "name": "get_user_details", "arguments": {"user_id": "mia_garcia_4516"}, "info": null}, {"action_id": "14_2", "name": "get_order_details", "arguments": {"order_id": "#W5490111"}, "info": null}, {"action_id": "14_3", "name": "get_order_details", "arguments": {"order_id": "#W7387996"}, "info": null}, {"action_id": "14_4", "name": "return_delivered_order_items", "arguments": {"order_id": "#W5490111", "item_ids": ["1421289881"], "payment_method_id": "credit_card_3124723"}, "info": null}, {"action_id": "14_5", "name": "return_delivered_order_items", "arguments": {"order_id": "#W7387996", "item_ids": ["5796612084"], "payment_method_id": "paypal_9497703"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_15", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to modify the pending boots to a size 8, and want the material, but do not care about waterproof or not.\nKnown info:\n\tYou are Fatima Johnson in zipcode 78712.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are a private person that does not want to reveal much about yourself."}, "evaluation_criteria": {"actions": [{"action_id": "15_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Fatima", "last_name": "Johnson", "zip": "78712"}, "info": null}, {"action_id": "15_1", "name": "get_user_details", "arguments": {"user_id": "fatima_johnson_7581"}, "info": null}, {"action_id": "15_2", "name": "get_order_details", "arguments": {"order_id": "#W9389413"}, "info": null}, {"action_id": "15_3", "name": "get_order_details", "arguments": {"order_id": "#W8665881"}, "info": null}, {"action_id": "15_4", "name": "get_order_details", "arguments": {"order_id": "#W5199551"}, "info": null}, {"action_id": "15_5", "name": "get_product_details", "arguments": {"product_id": "7363354090"}, "info": null}, {"action_id": "15_6", "name": "modify_pending_order_items", "arguments": {"order_id": "#W5199551", "item_ids": ["1615379700"], "new_item_ids": ["3613716226"], "payment_method_id": "paypal_5364164"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_16", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to cancel all pending orders (since they are no longer needed) and return the watch you have received (but nothing else), and you want to know the total amount you can get back.\nKnown info:\n\tYou are Fatima Johnson in zipcode 78712.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are a private person that does not want to reveal much about yourself."}, "evaluation_criteria": {"actions": [{"action_id": "16_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Fatima", "last_name": "Johnson", "zip": "78712"}, "info": null}, {"action_id": "16_1", "name": "get_user_details", "arguments": {"user_id": "fatima_johnson_7581"}, "info": null}, {"action_id": "16_2", "name": "get_order_details", "arguments": {"order_id": "#W5199551"}, "info": null}, {"action_id": "16_3", "name": "get_order_details", "arguments": {"order_id": "#W8665881"}, "info": null}, {"action_id": "16_4", "name": "get_order_details", "arguments": {"order_id": "#W9389413"}, "info": null}, {"action_id": "16_5", "name": "calculate", "arguments": {"expression": "3131.1 + 4777.75 + 367.38"}, "info": null}, {"action_id": "16_6", "name": "cancel_pending_order", "arguments": {"order_id": "#W5199551", "reason": "no longer needed"}, "info": null}, {"action_id": "16_7", "name": "cancel_pending_order", "arguments": {"order_id": "#W8665881", "reason": "no longer needed"}, "info": null}, {"action_id": "16_8", "name": "return_delivered_order_items", "arguments": {"order_id": "#W9389413", "item_ids": ["2554056026"], "payment_method_id": "paypal_5364164"}, "info": null}], "communicate_info": ["8276.23"], "nl_assertions": null}}
+{"id": "retail_task_17", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to change #W8665881 to be delivered to Suite 641 instead.\nKnown info:\n\tYou are Fatima Johnson in zipcode 78712.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are a private person that does not want to reveal much about yourself."}, "evaluation_criteria": {"actions": [{"action_id": "17_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Fatima", "last_name": "Johnson", "zip": "78712"}, "info": null}, {"action_id": "17_1", "name": "get_user_details", "arguments": {"user_id": "fatima_johnson_7581"}, "info": null}, {"action_id": "17_2", "name": "get_order_details", "arguments": {"order_id": "#W5199551"}, "info": null}, {"action_id": "17_3", "name": "get_order_details", "arguments": {"order_id": "#W8665881"}, "info": null}, {"action_id": "17_4", "name": "get_order_details", "arguments": {"order_id": "#W9389413"}, "info": null}, {"action_id": "17_5", "name": "modify_pending_order_address", "arguments": {"order_id": "#W8665881", "address1": "123 Elm Street", "address2": "Suite 641", "city": "Austin", "state": "TX", "country": "USA", "zip": "78712"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_18", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to return the office chair because it came with some broken pieces. But if the agent asks you for confirm, you say you want to rethink for a while, and then change your mind to exchange for the same item.\nKnown info:\n\tYou are Mei Davis in zipcode 80217.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are in debt and sad today, but very brief."}, "evaluation_criteria": {"actions": [{"action_id": "18_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Mei", "last_name": "Davis", "zip": "80217"}, "info": null}, {"action_id": "18_1", "name": "get_user_details", "arguments": {"user_id": "mei_davis_8935"}, "info": null}, {"action_id": "18_2", "name": "get_order_details", "arguments": {"order_id": "#W2890441"}, "info": null}, {"action_id": "18_3", "name": "get_product_details", "arguments": {"product_id": "4794339885"}, "info": null}, {"action_id": "18_4", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W2890441", "item_ids": ["8069050545"], "new_item_ids": ["8069050545"], "payment_method_id": "credit_card_1061405"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_19", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to return the water bottle, and exchange the pet bed and office chair to the cheapest version. Mention the two things together and you want to get both done. If and only if you can only do one of the two things, you prefer to do whatever saves you most money. Ask the agent how much money you can save in both options.\nKnown info:\n\tYou are Mei Davis in zipcode 80217.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are in debt and sad today, but very brief."}, "evaluation_criteria": {"actions": [{"action_id": "19_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Mei", "last_name": "Davis", "zip": "80217"}, "info": null}, {"action_id": "19_1", "name": "get_user_details", "arguments": {"user_id": "mei_davis_8935"}, "info": null}, {"action_id": "19_2", "name": "get_order_details", "arguments": {"order_id": "#W2890441"}, "info": null}, {"action_id": "19_3", "name": "get_order_details", "arguments": {"order_id": "#W1267569"}, "info": null}, {"action_id": "19_4", "name": "get_product_details", "arguments": {"product_id": "2747247837"}, "info": null}, {"action_id": "19_5", "name": "get_product_details", "arguments": {"product_id": "4794339885"}, "info": null}, {"action_id": "19_6", "name": "return_delivered_order_items", "arguments": {"order_id": "#W2890441", "item_ids": ["2366567022"], "payment_method_id": "credit_card_1061405"}, "info": null}], "communicate_info": ["54.04", "41.64"], "nl_assertions": null}}
+{"id": "retail_task_20", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou just won a lottery, and you want to upgrade all your items to the most expensive variants (the new variants can have different features from the originals, but make sure the new shoe is still the same size). You want to pay the difference with your Giftcard. If the agent says giftcard is not possible, using PayPal is fine.\nKnown info:\n\tYou are Ethan Garcia, and you live in Denver, 80280.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are a mysterious person and do not want to reveal much about yourself."}, "evaluation_criteria": {"actions": [{"action_id": "20_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Ethan", "last_name": "Garcia", "zip": "80280"}, "info": null}, {"action_id": "20_1", "name": "get_user_details", "arguments": {"user_id": "ethan_garcia_1261"}, "info": null}, {"action_id": "20_2", "name": "get_order_details", "arguments": {"order_id": "#W4967593"}, "info": null}, {"action_id": "20_3", "name": "get_order_details", "arguments": {"order_id": "#W9911714"}, "info": null}, {"action_id": "20_4", "name": "get_product_details", "arguments": {"product_id": "8310926033"}, "info": null}, {"action_id": "20_5", "name": "get_product_details", "arguments": {"product_id": "1656367028"}, "info": null}, {"action_id": "20_6", "name": "get_product_details", "arguments": {"product_id": "6938111410"}, "info": null}, {"action_id": "20_7", "name": "get_product_details", "arguments": {"product_id": "5149340237"}, "info": null}, {"action_id": "20_8", "name": "modify_pending_order_items", "arguments": {"order_id": "#W9911714", "item_ids": ["2366567022", "1340995114", "9791469541", "1763705424"], "new_item_ids": ["4579334072", "1151293680", "4107812777", "2882812427"], 
"payment_method_id": "gift_card_4332117"}, "info": null}, {"action_id": "20_9", "name": "get_order_details", "arguments": {"order_id": "#W5733668"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_21", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to exchange your shoes to item ID 4107812777, and use your gift card to cover possible charges. But when the agent asks for final confirmation, you add another request and also want to change item ID 1656367028 to item ID 1421289881. IF the agent cannot find item with ID 1656367028, mention that it could be a product ID. You are not familiar with the domain and might confuse product and item ids, so ask the agent to figure out the details on its own if needed. You want to know your gift card balance after all these changes are complete.\nKnown info:\n\tYou are Ethan Garcia, and you live in Denver, 80280.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are a mysterious person and do not want to reveal much about yourself."}, "evaluation_criteria": {"actions": [{"action_id": "21_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Ethan", "last_name": "Garcia", "zip": "80280"}, "info": null}, {"action_id": "21_1", "name": "get_user_details", "arguments": {"user_id": "ethan_garcia_1261"}, "info": null}, {"action_id": "21_2", "name": "get_order_details", "arguments": {"order_id": "#W4967593"}, "info": null}, {"action_id": "21_3", "name": "get_order_details", "arguments": {"order_id": "#W9911714"}, "info": null}, {"action_id": "21_4", "name": "get_order_details", "arguments": {"order_id": "#W5733668"}, "info": null}, {"action_id": "21_5", "name": "get_product_details", "arguments": {"product_id": "4107812777"}, "info": null}, {"action_id": "21_6", "name": "get_product_details", "arguments": {"product_id": "1421289881"}, "info": null}, {"action_id": "21_7", "name": "get_product_details", "arguments": {"product_id": "1656367028"}, "info": null}, {"action_id": "21_8", "name": 
"get_product_details", "arguments": {"product_id": "4107812777"}, "info": null}, {"action_id": "21_9", "name": "get_product_details", "arguments": {"product_id": "6938111410"}, "info": null}, {"action_id": "21_10", "name": "calculate", "arguments": {"expression": "155.33 - 147.05 + 268.77 - 235.13"}, "info": null}, {"action_id": "21_11", "name": "modify_pending_order_items", "arguments": {"order_id": "#W9911714", "item_ids": ["9791469541", "1340995114"], "new_item_ids": ["4107812777", "1421289881"], "payment_method_id": "gift_card_4332117"}, "info": null}], "communicate_info": ["44.08"], "nl_assertions": null}}
+{"id": "retail_task_22", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to change your user address and all order addresses to be 101 Highway, New York, 10001. But after the change you regret it and want to change the user address back to the original address.\nKnown info:\n\tYou are Ethan Garcia, and you live in Denver, 80280.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are a mysterious person and do not want to reveal much about yourself."}, "evaluation_criteria": {"actions": [{"action_id": "22_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Ethan", "last_name": "Garcia", "zip": "80280"}, "info": null}, {"action_id": "22_1", "name": "modify_user_address", "arguments": {"user_id": "ethan_garcia_1261", "address1": "101 Highway", "address2": "", "city": "New York", "state": "NY", "country": "USA", "zip": "10001"}, "info": null}, {"action_id": "22_2", "name": "get_order_details", "arguments": {"order_id": "#W4967593"}, "info": null}, {"action_id": "22_3", "name": "get_order_details", "arguments": {"order_id": "#W9911714"}, "info": null}, {"action_id": "22_4", "name": "get_order_details", "arguments": {"order_id": "#W5733668"}, "info": null}, {"action_id": "22_5", "name": "modify_pending_order_address", "arguments": {"order_id": "#W9911714", "address1": "101 Highway", "address2": "", "city": "New York", "state": "NY", "country": "USA", "zip": "10001"}, "info": null}, {"action_id": "22_6", "name": "modify_user_address", "arguments": {"user_id": "ethan_garcia_1261", "address1": "667 Highland Drive", "address2": "Suite 865", "city": "Denver", "state": "CO", "country": "USA", "zip": "80280"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_23", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to exchange the helmet for a medium sized, red, high ventilation type, and you want to exchange the luggage set (in another order) to a two-piece black one with soft material. Lastly, you want to modify the grill you just ordered to the same type as the one you already received.\nKnown info:\n\tYou are Sofia Hernandez, and you live in Seattle, WA, 98193.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\t."}, "evaluation_criteria": {"actions": [{"action_id": "23_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Sofia", "last_name": "Hernandez", "zip": "98193"}, "info": null}, {"action_id": "23_1", "name": "get_user_details", "arguments": {"user_id": "sofia_hernandez_5364"}, "info": null}, {"action_id": "23_2", "name": "get_order_details", "arguments": {"order_id": "#W3561391"}, "info": null}, {"action_id": "23_3", "name": "get_order_details", "arguments": {"order_id": "#W6876713"}, "info": null}, {"action_id": "23_4", "name": "get_order_details", "arguments": {"order_id": "#W9609649"}, "info": null}, {"action_id": "23_5", "name": "get_order_details", "arguments": {"order_id": "#W3947049"}, "info": null}, {"action_id": "23_6", "name": "get_product_details", "arguments": {"product_id": "7765186836"}, "info": null}, {"action_id": "23_7", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W3947049", "item_ids": ["3358616356"], "new_item_ids": ["8573379326"], "payment_method_id": "credit_card_7901829"}, "info": null}, {"action_id": "23_8", "name": "get_product_details", "arguments": {"product_id": "5426915165"}, "info": null}, {"action_id": "23_9", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W6876713", "item_ids": ["6301799585"], 
"new_item_ids": ["8926329222"], "payment_method_id": "credit_card_7901829"}, "info": null}, {"action_id": "23_10", "name": "get_product_details", "arguments": {"product_id": "6819683148"}, "info": null}, {"action_id": "23_11", "name": "modify_pending_order_items", "arguments": {"order_id": "#W3561391", "item_ids": ["5946177616"], "new_item_ids": ["7082455361"], "payment_method_id": "credit_card_7901829"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_24", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to cancel the grill, but if the agent asks you to confirm, you regret and want to keep it. You then want to ask which two t-shirts you have ordered in another order, and what materials are they.\nKnown info:\n\tYou are Sofia Hernandez, and you live in Seattle, WA, 98193.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tMake everything sound very natural and make up reasons."}, "evaluation_criteria": {"actions": [], "communicate_info": ["polyester", "cotton"], "nl_assertions": null}}
+{"id": "retail_task_25", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou have an order sent to Texas by accident, and you want to know the tracking number of the order, and return all items in it except the pet bed. You want the refund to your amex credit card, and if the agent cannot help, transfer to a human. You don't remember the order number. It is urgent.\nKnown info:\n\tYou are Isabella Johansson, and you live in zipcode 32286.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\t."}, "evaluation_criteria": {"actions": [{"action_id": "25_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Isabella", "last_name": "Johansson", "zip": "32286"}, "info": null}, {"action_id": "25_1", "name": "get_user_details", "arguments": {"user_id": "isabella_johansson_2152"}, "info": null}, {"action_id": "25_2", "name": "get_order_details", "arguments": {"order_id": "#W3792453"}, "info": null}, {"action_id": "25_3", "name": "get_order_details", "arguments": {"order_id": "#W7181492"}, "info": null}, {"action_id": "25_4", "name": "get_order_details", "arguments": {"order_id": "#W5565470"}, "info": null}, {"action_id": "25_5", "name": "get_order_details", "arguments": {"order_id": "#W2575533"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_26", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou have an order sent to Texas by accident, and you want to know the tracking number of the order, and return all items in it except the pet bed. You don't remember the order number. It is urgent.\nKnown info:\n\tYou are Isabella Johansson, and you live in zipcode 32286.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\t."}, "evaluation_criteria": {"actions": [{"action_id": "26_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Isabella", "last_name": "Johansson", "zip": "32286"}, "info": null}, {"action_id": "26_1", "name": "get_user_details", "arguments": {"user_id": "isabella_johansson_2152"}, "info": null}, {"action_id": "26_2", "name": "get_order_details", "arguments": {"order_id": "#W3792453"}, "info": null}, {"action_id": "26_3", "name": "get_order_details", "arguments": {"order_id": "#W7181492"}, "info": null}, {"action_id": "26_4", "name": "get_order_details", "arguments": {"order_id": "#W5565470"}, "info": null}, {"action_id": "26_5", "name": "get_order_details", "arguments": {"order_id": "#W2575533"}, "info": null}, {"action_id": "26_6", "name": "return_delivered_order_items", "arguments": {"order_id": "#W5565470", "item_ids": ["7602931732", "9570044148"], "payment_method_id": "paypal_3024827"}, "info": null}, {"action_id": "26_7", "name": "transfer_to_human_agents", "arguments": {"summary": "The user wants to refund to the amex credit card, but the agent cannot help."}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_27", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tFrom a recent order, you want to return the hose, backpack, and exchange the hiking boots to the exact same item but a waterproof variant. Make sure you mention the two requests at the same time, and if the agent says they can only do one, you prefer to do the exchange.\nKnown info:\n\tYou are Isabella Johansson, and you live in zipcode 32286.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are a bit anxious and want to get things done quickly."}, "evaluation_criteria": {"actions": [{"action_id": "27_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Isabella", "last_name": "Johansson", "zip": "32286"}, "info": null}, {"action_id": "27_1", "name": "get_user_details", "arguments": {"user_id": "isabella_johansson_2152"}, "info": null}, {"action_id": "27_2", "name": "get_order_details", "arguments": {"order_id": "#W3792453"}, "info": null}, {"action_id": "27_3", "name": "get_order_details", "arguments": {"order_id": "#W7181492"}, "info": null}, {"action_id": "27_4", "name": "get_product_details", "arguments": {"product_id": "7363354090"}, "info": null}, {"action_id": "27_5", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W7181492", "item_ids": ["8118291112"], "new_item_ids": ["8277474082"], "payment_method_id": "paypal_3024827"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_28", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to return the skateboard, garden hose, backpack, keyboard, bed from a recent order and also cancel the hose from a pending order you just placed. If cancelling one item in an order is not possible, forget about it since you just want to cancel the hose and nothing else. You want to know how much you can get in total as refund after everything is done.\nKnown info:\n\tYou are Isabella Johansson, and you live in zipcode 32286.\nUnknown info:\n\tYou don't know your email.\nTask instructions:\n\tYou are extremely brief but patient."}, "evaluation_criteria": {"actions": [{"action_id": "28_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Isabella", "last_name": "Johansson", "zip": "32286"}, "info": null}, {"action_id": "28_1", "name": "get_user_details", "arguments": {"user_id": "isabella_johansson_2152"}, "info": null}, {"action_id": "28_2", "name": "get_order_details", "arguments": {"order_id": "#W3792453"}, "info": null}, {"action_id": "28_3", "name": "get_order_details", "arguments": {"order_id": "#W7181492"}, "info": null}, {"action_id": "28_4", "name": "get_order_details", "arguments": {"order_id": "#W5565470"}, "info": null}, {"action_id": "28_5", "name": "get_order_details", "arguments": {"order_id": "#W2575533"}, "info": null}, {"action_id": "28_6", "name": "return_delivered_order_items", "arguments": {"order_id": "#W3792453", "item_ids": ["4293355847"], "payment_method_id": "paypal_3024827"}, "info": null}, {"action_id": "28_7", "name": "return_delivered_order_items", "arguments": {"order_id": "#W7181492", "item_ids": ["5753502325", "9851293632"], "payment_method_id": "paypal_3024827"}, "info": null}, {"action_id": "28_8", "name": "return_delivered_order_items", "arguments": {"order_id": 
"#W5565470", "item_ids": ["9570044148", "6857426243"], "payment_method_id": "paypal_3024827"}, "info": null}, {"action_id": "28_9", "name": "get_order_details", "arguments": {"order_id": "#W2575533"}, "info": null}, {"action_id": "28_10", "name": "calculate", "arguments": {"expression": "200.8 + 96.35 + 193.38 + 231.37 + 196.53"}, "info": null}], "communicate_info": ["918.43"], "nl_assertions": null}}
+{"id": "retail_task_29", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to exchange your skateboard for a shorter bamboo material one. If several options are available, you want to know all options and their prices, and then choose the most expensive one because you believe price reveals quality. Also, you want to exchange the garden hose you received for the type that you just ordered (in a pending order).\nKnown info:\n\tYou are Isabella Johansson, and you live in zipcode 32286.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are a chill person but want to get both things done."}, "evaluation_criteria": {"actions": [{"action_id": "29_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Isabella", "last_name": "Johansson", "zip": "32286"}, "info": null}, {"action_id": "29_1", "name": "get_user_details", "arguments": {"user_id": "isabella_johansson_2152"}, "info": null}, {"action_id": "29_2", "name": "get_order_details", "arguments": {"order_id": "#W3792453"}, "info": null}, {"action_id": "29_3", "name": "get_product_details", "arguments": {"product_id": "1968349452"}, "info": null}, {"action_id": "29_4", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W3792453", "item_ids": ["4293355847"], "new_item_ids": ["8176740019"], "payment_method_id": "paypal_3024827"}, "info": null}, {"action_id": "29_5", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W7181492", "item_ids": ["5753502325"], "new_item_ids": ["5206946487"], "payment_method_id": "paypal_3024827"}, "info": null}], "communicate_info": ["180.1", "189.57", "208.6"], "nl_assertions": null}}
+{"id": "retail_task_30", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou just received your tablet and it was damaged when you opened the package. You want to know the tracking number of that order first. Also if the agent can help you exchange or return the tablet (you prefer exchange for the same item, but if it is not available just return it). If tablet is returned, also cancel the charger you just bought, because it goes with the tablet. And finally, return the sneaker.\nKnown info:\n\tYou are Olivia Lopez, and you live in Texas in zipcode 76171.\nUnknown info:\n\tYou do not remember your email address.\nTask instructions:\n\tYou like to do one thing at a time, and reveal minimal information about yourself."}, "evaluation_criteria": {"actions": [{"action_id": "30_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Olivia", "last_name": "Lopez", "zip": "76171"}, "info": null}, {"action_id": "30_1", "name": "get_user_details", "arguments": {"user_id": "olivia_lopez_3865"}, "info": null}, {"action_id": "30_2", "name": "get_order_details", "arguments": {"order_id": "#W9319364"}, "info": null}, {"action_id": "30_3", "name": "get_order_details", "arguments": {"order_id": "#W9373487"}, "info": null}, {"action_id": "30_4", "name": "get_order_details", "arguments": {"order_id": "#W2692684"}, "info": null}, {"action_id": "30_5", "name": "get_product_details", "arguments": {"product_id": "8024098596"}, "info": null}, {"action_id": "30_6", "name": "return_delivered_order_items", "arguments": {"order_id": "#W2692684", "item_ids": ["3788616824"], "payment_method_id": "gift_card_7711863"}, "info": null}, {"action_id": "30_7", "name": "get_order_details", "arguments": {"order_id": "#W9373487"}, "info": null}, {"action_id": "30_8", "name": "cancel_pending_order", "arguments": {"order_id": 
"#W9373487", "reason": "no longer needed"}, "info": null}, {"action_id": "30_9", "name": "get_order_details", "arguments": {"order_id": "#W2692684"}, "info": null}, {"action_id": "30_10", "name": "get_order_details", "arguments": {"order_id": "#W5481803"}, "info": null}, {"action_id": "30_11", "name": "get_order_details", "arguments": {"order_id": "#W7449508"}, "info": null}, {"action_id": "30_12", "name": "return_delivered_order_items", "arguments": {"order_id": "#W7449508", "item_ids": ["6477915553"], "payment_method_id": "gift_card_7711863"}, "info": null}], "communicate_info": ["746342064230"], "nl_assertions": null}}
+{"id": "retail_task_31", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\t. You just lost your tablet you just received and are in a bad mood. You want to know the tracking number of the order, and if the agent can help you refund or reorder the tablet. (You know it's a long shot, but you want to try). If not, cancel the charger you just bought, because it goes with the tablet... Also cancel the boot and keep the kettle (if not possible, do not do anything on that order), and return the sneaker.\nKnown info:\n\tYou are Olivia Lopez, and you live in Texas in zipcode 76171.\nUnknown info:\n\tYou don't have an email.\nTask instructions:\n\tYou like to do one thing at a time, and reveal minimal information about yourself."}, "evaluation_criteria": {"actions": [{"action_id": "31_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Olivia", "last_name": "Lopez", "zip": "76171"}, "info": null}, {"action_id": "31_1", "name": "get_user_details", "arguments": {"user_id": "olivia_lopez_3865"}, "info": null}, {"action_id": "31_2", "name": "get_order_details", "arguments": {"order_id": "#W9319364"}, "info": null}, {"action_id": "31_3", "name": "get_order_details", "arguments": {"order_id": "#W9373487"}, "info": null}, {"action_id": "31_4", "name": "get_order_details", "arguments": {"order_id": "#W2692684"}, "info": null}, {"action_id": "31_5", "name": "get_order_details", "arguments": {"order_id": "#W5481803"}, "info": null}, {"action_id": "31_6", "name": "get_order_details", "arguments": {"order_id": "#W7449508"}, "info": null}, {"action_id": "31_7", "name": "get_order_details", "arguments": {"order_id": "#W9373487"}, "info": null}, {"action_id": "31_8", "name": "cancel_pending_order", "arguments": {"order_id": "#W9373487", "reason": "no longer needed"}, "info": null}, {"action_id": "31_9", "name": 
"get_order_details", "arguments": {"order_id": "#W5481803"}, "info": null}, {"action_id": "31_10", "name": "get_order_details", "arguments": {"order_id": "#W7449508"}, "info": null}, {"action_id": "31_11", "name": "return_delivered_order_items", "arguments": {"order_id": "#W7449508", "item_ids": ["6477915553"], "payment_method_id": "gift_card_7711863"}, "info": null}], "communicate_info": ["746342064230"], "nl_assertions": null}}
+{"id": "retail_task_32", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou just lost your tablet you just received and are in a bad mood. You want to know the tracking number of the order, and if the agent can help you refund or reorder the tablet. (You know it's a long shot, but you want to try). If not, cancel the charger you just bought, because it goes with the tablet... Also cancel the boot and kettle, and return the sneaker.\nKnown info:\n\tYou are Olivia Lopez, and you live in Texas in zipcode 76171.\nUnknown info:\n\tYou don't have an email.\nTask instructions:\n\tYou like to do one thing at a time, and reveal minimal information about yourself."}, "evaluation_criteria": {"actions": [{"action_id": "32_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Olivia", "last_name": "Lopez", "zip": "76171"}, "info": null}, {"action_id": "32_1", "name": "get_user_details", "arguments": {"user_id": "olivia_lopez_3865"}, "info": null}, {"action_id": "32_2", "name": "get_order_details", "arguments": {"order_id": "#W9319364"}, "info": null}, {"action_id": "32_3", "name": "get_order_details", "arguments": {"order_id": "#W9373487"}, "info": null}, {"action_id": "32_4", "name": "get_order_details", "arguments": {"order_id": "#W2692684"}, "info": null}, {"action_id": "32_5", "name": "get_order_details", "arguments": {"order_id": "#W5481803"}, "info": null}, {"action_id": "32_6", "name": "get_order_details", "arguments": {"order_id": "#W7449508"}, "info": null}, {"action_id": "32_7", "name": "get_order_details", "arguments": {"order_id": "#W9373487"}, "info": null}, {"action_id": "32_8", "name": "cancel_pending_order", "arguments": {"order_id": "#W9373487", "reason": "no longer needed"}, "info": null}, {"action_id": "32_9", "name": "get_order_details", "arguments": {"order_id": "#W5481803"}, 
"info": null}, {"action_id": "32_10", "name": "cancel_pending_order", "arguments": {"order_id": "#W5481803", "reason": "no longer needed"}, "info": null}, {"action_id": "32_11", "name": "get_order_details", "arguments": {"order_id": "#W7449508"}, "info": null}, {"action_id": "32_12", "name": "return_delivered_order_items", "arguments": {"order_id": "#W7449508", "item_ids": ["6477915553"], "payment_method_id": "gift_card_7711863"}, "info": null}], "communicate_info": ["746342064230"], "nl_assertions": null}}
+{"id": "retail_task_33", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou had a work-from-home situation and ordered three home office items along with some hiking items, so that you can go back to your parent's place at Seattle to remote work and enjoy outdoor life. But your company just announced that you will be back to the office soon. If cancelling partial items is possible with the agent, you want to return the office items (your forgot what) and keep the hiking items. You want to know the total amount you will get back, and you want to get the refund on your original payment method. If cancelling partial items is not possible, just keep the order and forget about it, but change your default user profile address to the Seattle parent house shown in your order (you do not want to reveal it in chat).\nKnown info:\n\tYou are an interesting guy called Noah Patel, living in the Big Apple in 10108.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are a funny guy but recently the work from home situation has made you a bit anxious."}, "evaluation_criteria": {"actions": [{"action_id": "33_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Noah", "last_name": "Patel", "zip": "10108"}, "info": null}, {"action_id": "33_1", "name": "get_user_details", "arguments": {"user_id": "noah_patel_6952"}, "info": null}, {"action_id": "33_2", "name": "get_order_details", "arguments": {"order_id": "#W6111398"}, "info": null}, {"action_id": "33_3", "name": "get_order_details", "arguments": {"order_id": "#W7043598"}, "info": null}, {"action_id": "33_4", "name": "get_order_details", "arguments": {"order_id": "#W1845024"}, "info": null}, {"action_id": "33_5", "name": "modify_user_address", "arguments": {"user_id": "noah_patel_6952", "address1": "517 Lakeview Drive", 
"address2": "Suite 183", "city": "Seattle", "country": "USA", "state": "WA", "zip": "98195"}, "info": null}], "communicate_info": ["1093.34"], "nl_assertions": null}}
+{"id": "retail_task_34", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou had a work-from-home situation and ordered three home office items along with some hiking items, so that you can go back to your parent's place at Seattle to remote work and enjoy outdoor life. But your company just announced that you will be back to the office soon. If cancelling partial items is possible with the agent, you want to return the office items (your forgot what) and keep the hiking items. You want to know the total amount you will get back, and you want to get the refund on your original payment method. If cancelling partial items is not possible, just change the address to your NYC place and you will return the items later.\nKnown info:\n\tYou are an interesting guy called Noah Patel, living in the Big Apple in 10108.\nUnknown info:\n\tYou don't have an email\nTask instructions:\n\tYou are a funny guy but recently the WFH situation made you a bit anxious."}, "evaluation_criteria": {"actions": [{"action_id": "34_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Noah", "last_name": "Patel", "zip": "10108"}, "info": null}, {"action_id": "34_1", "name": "get_user_details", "arguments": {"user_id": "noah_patel_6952"}, "info": null}, {"action_id": "34_2", "name": "get_order_details", "arguments": {"order_id": "#W6111398"}, "info": null}, {"action_id": "34_3", "name": "get_order_details", "arguments": {"order_id": "#W7043598"}, "info": null}, {"action_id": "34_4", "name": "get_order_details", "arguments": {"order_id": "#W1845024"}, "info": null}, {"action_id": "34_5", "name": "modify_pending_order_address", "arguments": {"order_id": "#W1845024", "address1": "224 Elm Street", "address2": "Suite 491", "city": "New York", "country": "USA", "state": "NY", "zip": "10108"}, "info": null}], 
"communicate_info": ["1093.34"], "nl_assertions": null}}
+{"id": "retail_task_35", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to return the speaker that is more expensive yet not resistent to water. Also, You want to modify the 17-inch laptop to the 13-inch version in another order. If no exact item is available, you want to know all available 13-inch options, and you prefer i5 over i7, and prefer silver and black than other colors.\nKnown info:\n\tYou are aarav_santos_2259 and aarav.santos8321@example.com and aarav.santos8320@example.com.\nTask instructions:\n\tYou are a rude person."}, "evaluation_criteria": {"actions": [{"action_id": "35_0", "name": "find_user_id_by_email", "arguments": {"email": "aarav.santos8321@example.com"}, "info": null}, {"action_id": "35_1", "name": "find_user_id_by_email", "arguments": {"email": "aarav.santos8320@example.com"}, "info": null}, {"action_id": "35_2", "name": "get_user_details", "arguments": {"user_id": "aarav_santos_2259"}, "info": null}, {"action_id": "35_3", "name": "get_order_details", "arguments": {"order_id": "#W9672333"}, "info": null}, {"action_id": "35_4", "name": "get_product_details", "arguments": {"product_id": "4760268021"}, "info": null}, {"action_id": "35_5", "name": "return_delivered_order_items", "arguments": {"order_id": "#W8528674", "item_ids": ["6704763132"], "payment_method_id": "paypal_7664977"}, "info": null}, {"action_id": "35_6", "name": "modify_pending_order_items", "arguments": {"order_id": "#W9672333", "item_ids": ["1684786391"], "new_item_ids": ["5052031638"], "payment_method_id": "paypal_7664977"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_36", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou just placed an order but you realize that your card has only $1131 credit left, but the order total is more than $1160. You wonder if the agent can help split the payment with another card. If not, you wonder what the most expensive item and its price, and if you can just cancel that item. If not, you wonder if you can switch all items to their cheapest options and bring the cost down to $1131. If so, do it. If not, you wonder if the agent can just cancel the order so that you can order again.\nKnown info:\n\tYour name is Daiki Sanchez, and you live in 46236, your email is daikisanchez1479@example.com.\nTask instructions:\n\tYou are a bit anxious and want to get things done quickly, and you speak very briefly."}, "evaluation_criteria": {"actions": [{"action_id": "36_9", "name": "modify_pending_order_items", "arguments": {"order_id": "#W9348897", "item_ids": ["6117189161", "7453605304", "3799046073"], "new_item_ids": ["6700049080", "5320792178", "3234800602"], "payment_method_id": "credit_card_8853416"}, "info": null}], "communicate_info": ["camera", "481.5"], "nl_assertions": null}}
+{"id": "retail_task_37", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou just placed an order but you realize that your card has only $1150 credit left, but the order total is more than $1160. You wonder if the agent can help split the payment with another card. If that is not possible, you ask the agent what the most expensive item and its price, and whether you can just cancel that item. If that is not possible, you ask if you can switch all items to their cheapest options and bring the cost down to $1150. If that is possible, confirm and ask the agent to do it. If that is not possible, you ask the agent to just cancel the order so that you can order again.\nKnown info:\n\tYour name is Daiki Sanchez, and you live in 46236, your email is daikisanchez1479@example.com.\nUnknown info:\n\t.\nTask instructions:\n\tYou are a bit anxious and want to get things done quickly, and you speak very briefly.\nDo not end the conversation until your changes have been made."}, "evaluation_criteria": {"actions": [{"action_id": "37_0", "name": "find_user_id_by_email", "arguments": {"email": "daikisanchez1479@example.com"}, "info": null}, {"action_id": "37_1", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Daiki", "last_name": "Sanchez", "zip": "46236"}, "info": null}, {"action_id": "37_2", "name": "get_user_details", "arguments": {"user_id": "daiki_sanchez_3253"}, "info": null}, {"action_id": "37_9", "name": "modify_pending_order_items", "arguments": {"order_id": "#W9348897", "item_ids": ["6117189161", "7453605304", "3799046073"], "new_item_ids": ["6700049080", "5320792178", "3234800602"], "payment_method_id": "credit_card_8853416"}, "info": null}], "communicate_info": ["camera", "481.50"], "nl_assertions": null}}
+{"id": "retail_task_38", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou just placed an order but you realize that your card has only $950 credit left, but the order total is more than $1100. You wonder if the agent can help split the payment with another card. If not, you wonder what the most expensive item and its price, and if you can just cancel that item. If not, you wonder if you can switch all items to their cheapest options and bring the cost down to $950. If not, you wonder if the agent can just cancel the order so that you can order again.\nKnown info:\n\tYour name is Daiki Sanchez, and you live in 46236, your email is daikisanchez1479@example.com.\nTask instructions:\n\tYou are a bit anxious and want to get things done quickly, and you speak very briefly."}, "evaluation_criteria": {"actions": [{"action_id": "38_0", "name": "find_user_id_by_email", "arguments": {"email": "daikisanchez1479@example.com"}, "info": null}, {"action_id": "38_1", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Daiki", "last_name": "Sanchez", "zip": "46236"}, "info": null}, {"action_id": "38_9", "name": "calculate", "arguments": {"expression": "466.75 + 288.82 + 135.24 + 193.38 + 46.66"}, "info": null}, {"action_id": "38_10", "name": "cancel_pending_order", "arguments": {"order_id": "#W9348897", "reason": "no longer needed"}, "info": null}], "communicate_info": ["camera", "481.50"], "nl_assertions": null}}
+{"id": "retail_task_39", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou just moved from Florida to Phoenix. Unfortunately your address is still the old one, and you want to update it. Your current address should be in your recent order (you can mention this to the agent), and you do not want to reveal it. Also, you want to know what is the price of the cheapest available t-shirt right now, and if you can order it through the agent.\nKnown info:\n\tYou are fatima_taylor_3452, and you just moved from Florida (32169) to Phoenix (85033).\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are a funny person with lots of jokes, and you want to make the agent laugh."}, "evaluation_criteria": {"actions": [{"action_id": "39_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Fatima", "last_name": "Taylor", "zip": "85033"}, "info": null}, {"action_id": "39_1", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Fatima", "last_name": "Taylor", "zip": "32169"}, "info": null}, {"action_id": "39_2", "name": "get_user_details", "arguments": {"user_id": "fatima_taylor_3452"}, "info": null}, {"action_id": "39_3", "name": "get_order_details", "arguments": {"order_id": "#W5285031"}, "info": null}, {"action_id": "39_4", "name": "modify_user_address", "arguments": {"user_id": "fatima_taylor_3452", "address1": "157 Oak Street", "address2": "Suite 258", "city": "Phoenix", "state": "AZ", "country": "USA", "zip": "85033"}, "info": null}], "communicate_info": ["46.66"], "nl_assertions": null}}
+{"id": "retail_task_40", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to know how much balance does your gift card have. Also, for your recent order, whether you used your visa, mastercard, or amex credit card. You also wonder if you can apply the gift card balance to the order. If not, you want to change your payment method to visa, because the other two cards have a lot of balance.\nKnown info:\n\tYou are Isabella Lopez, and your email address is isabella.lopez3271@example.com.\nTask instructions:\n\tYou are a young college student under the pressure of final exams and student loans, so you are a bit anxious and want to get things done quickly."}, "evaluation_criteria": {"actions": [{"action_id": "40_0", "name": "find_user_id_by_email", "arguments": {"email": "isabella.lopez3271@example.com"}, "info": null}, {"action_id": "40_1", "name": "get_user_details", "arguments": {"user_id": "isabella_lopez_6490"}, "info": null}, {"action_id": "40_2", "name": "get_order_details", "arguments": {"order_id": "#W4923227"}, "info": null}, {"action_id": "40_3", "name": "modify_pending_order_payment", "arguments": {"order_id": "#W4923227", "payment_method_id": "credit_card_8897086"}, "info": null}], "communicate_info": ["60", "mastercard"], "nl_assertions": null}}
+{"id": "retail_task_41", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou just created your user id mei_patel_7272 and ordered some things, but you have two problems: first, the 1000-piece intermediate jigsaw might be too hard for your little kid, you wonder if you can change it to the easiest one with fewest pieces; second, you might have typed your address wrong. You want to check it, and potentially correct all order addresses and your user address. Make sure you mention these two problems at the same time in the same order.\nKnown info:\n\tYour name is Mei Patel, and you live in 445 Maple Drive, Suite 394, Fort Worth, Texas, 76165.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are brief and your memory is not too good sometimes, but you are polite."}, "evaluation_criteria": {"actions": [{"action_id": "41_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Mei", "last_name": "Patel", "zip": "76165"}, "info": null}, {"action_id": "41_1", "name": "get_user_details", "arguments": {"user_id": "mei_patel_7272"}, "info": null}, {"action_id": "41_2", "name": "get_order_details", "arguments": {"order_id": "#W9583042"}, "info": null}, {"action_id": "41_3", "name": "get_order_details", "arguments": {"order_id": "#W4082615"}, "info": null}, {"action_id": "41_4", "name": "modify_pending_order_address", "arguments": {"order_id": "#W9583042", "address1": "445 Maple Drive", "address2": "Suite 394", "city": "Fort Worth", "state": "TX", "country": "USA", "zip": "76165"}, "info": null}, {"action_id": "41_5", "name": "modify_pending_order_address", "arguments": {"order_id": "#W4082615", "address1": "445 Maple Drive", "address2": "Suite 394", "city": "Fort Worth", "state": "TX", "country": "USA", "zip": "76165"}, "info": null}, {"action_id": "41_6", "name": 
"modify_user_address", "arguments": {"user_id": "mei_patel_7272", "address1": "445 Maple Drive", "address2": "Suite 394", "city": "Fort Worth", "state": "TX", "country": "USA", "zip": "76165"}, "info": null}, {"action_id": "41_7", "name": "get_product_details", "arguments": {"product_id": "1808611083"}, "info": null}, {"action_id": "41_8", "name": "get_order_details", "arguments": {"order_id": "#W4082615"}, "info": null}, {"action_id": "41_9", "name": "modify_pending_order_items", "arguments": {"order_id": "#W4082615", "item_ids": ["9779102705"], "new_item_ids": ["1096508426"], "payment_method_id": "paypal_4768213"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_42", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou just created your user id mei_patel_7272 and ordered some things, but realized you might have typed your address wrong. You want to check it, and potentially correct all order addresses and your user address. After this, you'd like to check the jigsaw you ordered, and if it's not shipped yet, you want to change it to the easiest jigsaw (easiest level, least pieces) because your kid is too young. By default you use PayPal.\nKnown info:\n\tYour name is Mei Patel, and you live in 445 Maple Drive, Suite 394, Fort Worth, Texas, 76165.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are brief and your memory is not too good sometimes, but you are polite."}, "evaluation_criteria": {"actions": [{"action_id": "42_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Mei", "last_name": "Patel", "zip": "76165"}, "info": null}, {"action_id": "42_1", "name": "get_user_details", "arguments": {"user_id": "mei_patel_7272"}, "info": null}, {"action_id": "42_2", "name": "get_order_details", "arguments": {"order_id": "#W9583042"}, "info": null}, {"action_id": "42_3", "name": "get_order_details", "arguments": {"order_id": "#W4082615"}, "info": null}, {"action_id": "42_4", "name": "modify_pending_order_address", "arguments": {"order_id": "#W9583042", "address1": "445 Maple Drive", "address2": "Suite 394", "city": "Fort Worth", "state": "TX", "country": "USA", "zip": "76165"}, "info": null}, {"action_id": "42_5", "name": "modify_pending_order_address", "arguments": {"order_id": "#W4082615", "address1": "445 Maple Drive", "address2": "Suite 394", "city": "Fort Worth", "state": "TX", "country": "USA", "zip": "76165"}, "info": null}, {"action_id": "42_6", "name": "modify_user_address", "arguments": 
{"user_id": "mei_patel_7272", "address1": "445 Maple Drive", "address2": "Suite 394", "city": "Fort Worth", "state": "TX", "country": "USA", "zip": "76165"}, "info": null}, {"action_id": "42_7", "name": "get_product_details", "arguments": {"product_id": "1808611083"}, "info": null}, {"action_id": "42_8", "name": "get_order_details", "arguments": {"order_id": "#W4082615"}, "info": null}, {"action_id": "42_9", "name": "modify_pending_order_items", "arguments": {"order_id": "#W4082615", "item_ids": ["9779102705"], "new_item_ids": ["1096508426"], "payment_method_id": "paypal_4768213"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_43", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou ordered some things for your daughter but she has not received them, so you want to know which address the order was sent to, the tracking number, and if the order is still in transit. You also want to check if the storage of the tablet you ordered. Lastly, you want to change your default address to your daughter's address so that you don't have to change it every time you order something for her.\nKnown info:\n\tYou are Lucas (lucas_santos_6600), you live in Denver CO 80239, and your daughter lives in Chicago.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are a lonely man and you want to talk to the agent for a while."}, "evaluation_criteria": {"actions": [{"action_id": "43_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Lucas", "last_name": "Santos", "zip": "80239"}, "info": null}, {"action_id": "43_1", "name": "get_user_details", "arguments": {"user_id": "lucas_santos_6600"}, "info": null}, {"action_id": "43_2", "name": "get_order_details", "arguments": {"order_id": "#W1588712"}, "info": null}, {"action_id": "43_3", "name": "get_order_details", "arguments": {"order_id": "#W7895761"}, "info": null}, {"action_id": "43_4", "name": "modify_user_address", "arguments": {"user_id": "lucas_santos_6600", "address1": "943 Maple Drive", "address2": "Suite 356", "city": "Chicago", "state": "IL", "country": "USA", "zip": "60621"}, "info": null}], "communicate_info": ["840887978435", "943 Maple Drive", "Suite 356", "Chicago", "IL", "60621", "64GB"], "nl_assertions": null}}
+{"id": "retail_task_44", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to change the Desk Lamp in order #W9300146 that you've placed for the cheapest Desk Lamp that's available. Any price difference should go to a gift card. You also want to know how much you get back in total.\nKnown info:\n\tYou are Aarav Anderson, residing in Philadelphia 19031.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou're a private person and are reluctant to share information unless it's absolutely necessary."}, "evaluation_criteria": {"actions": [{"action_id": "44_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Aarav", "last_name": "Anderson", "zip": "19031"}, "info": null}, {"action_id": "44_1", "name": "get_order_details", "arguments": {"order_id": "#W9300146"}, "info": null}, {"action_id": "44_2", "name": "get_product_details", "arguments": {"product_id": "6817146515"}, "info": null}, {"action_id": "44_3", "name": "calculate", "arguments": {"expression": "135.24 - 153.23"}, "info": null}, {"action_id": "44_4", "name": "modify_pending_order_items", "arguments": {"order_id": "#W9300146", "item_ids": ["9190635437"], "new_item_ids": ["5320792178"], "payment_method_id": "gift_card_7245904"}, "info": null}], "communicate_info": ["17.99"], "nl_assertions": null}}
+{"id": "retail_task_45", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to exchange a robotic vacuum cleaner in your recent order for a canister based one from the same product line. When asked for order ID, provide 9502127 first. If that doesn't work, respond exactly with 'I forgot the W at the beginning'. If and only if the agent gives you several options for the new vacuum, go for the bagless version (don't mention this if the agent just provides you one option). Ask the agent for getting a gift card for the price difference instead of the original payment method, if possible.\nKnown info:\n\tYou are daiki_johnson_9523 living in Denver, USA, 80273.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou randomly insert typos into your messages."}, "evaluation_criteria": {"actions": [{"action_id": "45_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Daiki", "last_name": "Johnson", "zip": "80273"}, "info": null}, {"action_id": "45_1", "name": "get_order_details", "arguments": {"order_id": "#W9502127"}, "info": null}, {"action_id": "45_2", "name": "get_product_details", "arguments": {"product_id": "1762337868"}, "info": null}, {"action_id": "45_3", "name": "calculate", "arguments": {"expression": "652.61 - 642.72"}, "info": null}, {"action_id": "45_4", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W9502127", "item_ids": ["6259501109"], "new_item_ids": ["7958300294"], "payment_method_id": "paypal_2433177"}, "info": null}], "communicate_info": ["9.89"], "nl_assertions": null}}
+{"id": "retail_task_46", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to return an air purifier and a vacuum cleaner in your recent order. When asked for order ID, provide 9502126 first. If the agent asks you to double check, then say that you made a mistake and provide 9502127. If that doesn't work, say that you forgot the 'W' at the beginning. If the agent asks you for which vacuum cleaner, mention the robotic one. Ask the agent explicitly to provide the refund within 3 days and the total amount of the refund you should expect. After the return is complete, ask the agent about the total amount you paid for the remaining items in the same order.\nKnown info:\n\tYou are daiki_johnson_9523 living in Denver, USA, 80273.\nUnknown info:\n\tYou don't have an email.\nTask instructions:\n\tYou are impatient and want the refund as soon as possible."}, "evaluation_criteria": {"actions": [{"action_id": "46_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Daiki", "last_name": "Johnson", "zip": "80273"}, "info": null}, {"action_id": "46_1", "name": "get_order_details", "arguments": {"order_id": "#9502126"}, "info": null}, {"action_id": "46_2", "name": "get_order_details", "arguments": {"order_id": "#9502127"}, "info": null}, {"action_id": "46_3", "name": "get_order_details", "arguments": {"order_id": "#W9502127"}, "info": null}, {"action_id": "46_4", "name": "calculate", "arguments": {"expression": "652.61 + 473.43"}, "info": null}, {"action_id": "46_5", "name": "return_delivered_order_items", "arguments": {"order_id": "#W9502127", "item_ids": ["6259501109", "9534205511"], "payment_method_id": "paypal_2433177"}, "info": null}, {"action_id": "46_6", "name": "calculate", "arguments": {"expression": "2623.69 - 1126.04"}, "info": null}], "communicate_info": ["1126.04", "1497.65"], 
"nl_assertions": null}}
+{"id": "retail_task_47", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to return an air purifier and a vacuum cleaner in your recent order. When asked for order ID, provide 9502126 first. If the agent asks you to double check, then say that you made a mistake and provide 9502127. If that doesn't work, say that you forgot the 'W' at the beginning. If the agent asks you for which vacuum cleaner, mention the canister one. Ask the agent explicitly to provide the refund within 3 days and the total amount of the refund you should expect. After the return is complete, ask the agent about the total amount you paid for the remaining items in the same order.\nKnown info:\n\tYou are daiki_johnson_9523 living in Denver, USA, 80273.\nUnknown info:\n\tYou do not remember your email address.\nTask instructions:\n\tYou are impatient and want the refund as soon as possible."}, "evaluation_criteria": {"actions": [{"action_id": "47_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Daiki", "last_name": "Johnson", "zip": "80273"}, "info": null}, {"action_id": "47_1", "name": "get_order_details", "arguments": {"order_id": "#9502126"}, "info": null}, {"action_id": "47_2", "name": "get_order_details", "arguments": {"order_id": "#9502127"}, "info": null}, {"action_id": "47_3", "name": "get_order_details", "arguments": {"order_id": "#W9502127"}, "info": null}, {"action_id": "47_4", "name": "calculate", "arguments": {"expression": "622.12 + 473.43"}, "info": null}, {"action_id": "47_5", "name": "return_delivered_order_items", "arguments": {"order_id": "#W9502127", "item_ids": ["2872451762", "9534205511"], "payment_method_id": "paypal_2433177"}, "info": null}, {"action_id": "47_6", "name": "calculate", "arguments": {"expression": "2623.69 - 1095.55"}, "info": null}], "communicate_info": ["1095.55", 
"1528.14"], "nl_assertions": null}}
+{"id": "retail_task_48", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to return an air purifier that you received since it doesn't work well. You want the refund on your original method of payment. Also, check at the end whether you are able to return the vacuum cleaner, but you are not sure yet so don't process anything.\nKnown info:\n\tYou are daiki_johnson_9523 living in Denver, USA, 80273.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tBe polite and thank the agent for the help."}, "evaluation_criteria": {"actions": [{"action_id": "48_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Daiki", "last_name": "Johnson", "zip": "80273"}, "info": null}, {"action_id": "48_1", "name": "get_user_details", "arguments": {"user_id": "daiki_johnson_9523"}, "info": null}, {"action_id": "48_2", "name": "get_order_details", "arguments": {"order_id": "#W1436802"}, "info": null}, {"action_id": "48_3", "name": "get_order_details", "arguments": {"order_id": "#W5282037"}, "info": null}, {"action_id": "48_4", "name": "get_order_details", "arguments": {"order_id": "#W9502127"}, "info": null}, {"action_id": "48_5", "name": "return_delivered_order_items", "arguments": {"order_id": "#W9502127", "item_ids": ["9534205511"], "payment_method_id": "paypal_2433177"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_49", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou mistakenly ordered a Wireless Earbud with an IPX7 water resistance level, but you don't require this feature. You wish to exchange it for one with the same water resistance level as the other Wireless Earbuds that you've purchased. In fact, you want to exchange it to the cheapest earbud item from the rest of that order.\nKnown info:\n\tYou are Aarav Anderson, residing in Philadelphia 19031.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are polite and concise, yet assertive."}, "evaluation_criteria": {"actions": [{"action_id": "49_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Aarav", "last_name": "Anderson", "zip": "19031"}, "info": null}, {"action_id": "49_1", "name": "get_user_details", "arguments": {"user_id": "aarav_anderson_8794"}, "info": null}, {"action_id": "49_2", "name": "get_order_details", "arguments": {"order_id": "#W4316152"}, "info": null}, {"action_id": "49_3", "name": "get_order_details", "arguments": {"order_id": "#W9311069"}, "info": null}, {"action_id": "49_4", "name": "get_order_details", "arguments": {"order_id": "#W9300146"}, "info": null}, {"action_id": "49_5", "name": "get_order_details", "arguments": {"order_id": "#W3220203"}, "info": null}, {"action_id": "49_6", "name": "get_order_details", "arguments": {"order_id": "#W3470184"}, "info": null}, {"action_id": "49_7", "name": "get_product_details", "arguments": {"product_id": "9924732112"}, "info": null}, {"action_id": "49_8", "name": "calculate", "arguments": {"expression": "258.97 - 232.49"}, "info": null}, {"action_id": "49_9", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W3470184", "item_ids": ["2757705742"], "new_item_ids": ["1646531091"], "payment_method_id": 
"gift_card_7245904"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_50", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou're in a rush and you want to undo cancelling an order that you've previously placed. Be insistent that the customer service agent should undo the cancellation and ensure that the order is delivered as soon as possible. Do NOT mention the actual items that were in the order, just that you want to undo the cancellation and receive all the items that were in the initial order as soon as possible.\nKnown info:\n\tYou're Chen Smith, living in Jacksonville 32278.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\t."}, "evaluation_criteria": {"actions": [{"action_id": "50_0", "name": "transfer_to_human_agents", "arguments": {"summary": "The user urgently needs to undo a cancellation of an order and insists on receiving the items from the initial order as soon as possible. The user acknowledges the policy but requests exceptional measures due to the urgency of the situation."}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_51", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to return the digital camera that you received. You guess that the order number is #W8855135, but you're not 100% sure. Insist that you want to return the camera and get a refund to the original payment method.\nKnown info:\n\tYou are Sofia Li, residing in San Antonio, 78260.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\t."}, "evaluation_criteria": {"actions": [{"action_id": "51_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Sofia", "last_name": "Li", "zip": "78260"}, "info": null}, {"action_id": "51_1", "name": "get_order_details", "arguments": {"order_id": "#W8855135"}, "info": null}, {"action_id": "51_3", "name": "get_product_details", "arguments": {"product_id": "8940227892"}, "info": null}, {"action_id": "51_4", "name": "get_user_details", "arguments": {"user_id": "sofia_li_9219"}, "info": null}, {"action_id": "51_5", "name": "get_order_details", "arguments": {"order_id": "#W4689314"}, "info": null}, {"action_id": "51_6", "name": "return_delivered_order_items", "arguments": {"order_id": "#W4689314", "item_ids": ["5996159312"], "payment_method_id": "credit_card_8105988"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_52", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tThe digital camera you received doesn't zoom as far as you expected. You use the camera for bird-watching and want to exchange it for a camera that has the maximum zoom capacity. Price is not an issue, but ensure all the other specifications of the camera to be exchanged are the same, except for the zoom capacity which has to be maximized. You want the exchange to be completed as soon as possible. You want to use your PayPal account for any additional payment.\nKnown info:\n\tYou are Sofia Li, residing in San Antonio, 78260.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\t."}, "evaluation_criteria": {"actions": [{"action_id": "52_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Sofia", "last_name": "Li", "zip": "78260"}, "info": null}, {"action_id": "52_1", "name": "get_user_details", "arguments": {"user_id": "sofia_li_9219"}, "info": null}, {"action_id": "52_2", "name": "get_order_details", "arguments": {"order_id": "#W4689314"}, "info": null}, {"action_id": "52_3", "name": "get_product_details", "arguments": {"product_id": "8940227892"}, "info": null}, {"action_id": "52_4", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W4689314", "item_ids": ["5996159312"], "new_item_ids": ["9228757377"], "payment_method_id": "paypal_8194385"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_53", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tThe bicycle you received was damaged during delivery, and you want to get a refund. You're quite frustrated because the bike was very expensive and you'd like to receive the refund as soon as possible. You want the refund to be credited to your original credit card.\nKnown info:\n\tYou are Sofia Li, residing in San Antonio, 78260.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\t."}, "evaluation_criteria": {"actions": [{"action_id": "53_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Sofia", "last_name": "Li", "zip": "78260"}, "info": null}, {"action_id": "53_1", "name": "get_user_details", "arguments": {"user_id": "sofia_li_9219"}, "info": null}, {"action_id": "53_2", "name": "get_order_details", "arguments": {"order_id": "#W4689314"}, "info": null}, {"action_id": "53_3", "name": "get_order_details", "arguments": {"order_id": "#W8855135"}, "info": null}, {"action_id": "53_4", "name": "get_order_details", "arguments": {"order_id": "#W3916020"}, "info": null}, {"action_id": "53_5", "name": "return_delivered_order_items", "arguments": {"order_id": "#W3916020", "item_ids": ["7758198585"], "payment_method_id": "credit_card_8105988"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_54", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou recently faced a financial issue and want to cancel or return all possible orders. Well, except the boots that you really really love, but you are happy to exchange it for boots of the exact same size and material to get maximum money back, but only if they are cheaper than what you have paid. At the end of the day, you ask the agent how much money you will get back.\nKnown info:\n\tYou are Amelia, and you have two emails: silva7872@example.com and amelia.silva7872@example.com. You live in Philadelphia, and you are a loyal customer.\nTask instructions:\n\tYou are now emotional and a bit stress out. You like to talk very tersely."}, "evaluation_criteria": {"actions": [{"action_id": "54_0", "name": "find_user_id_by_email", "arguments": {"email": "silva7872@example.com"}, "info": null}, {"action_id": "54_1", "name": "find_user_id_by_email", "arguments": {"email": "amelia.silva7872@example.com"}, "info": null}, {"action_id": "54_2", "name": "get_user_details", "arguments": {"user_id": "amelia_silva_7726"}, "info": null}, {"action_id": "54_3", "name": "get_order_details", "arguments": {"order_id": "#W2586676"}, "info": null}, {"action_id": "54_4", "name": "get_order_details", "arguments": {"order_id": "#W5400801"}, "info": null}, {"action_id": "54_5", "name": "get_order_details", "arguments": {"order_id": "#W4597054"}, "info": null}, {"action_id": "54_6", "name": "get_order_details", "arguments": {"order_id": "#W4836353"}, "info": null}, {"action_id": "54_7", "name": "get_order_details", "arguments": {"order_id": "#W7773202"}, "info": null}, {"action_id": "54_8", "name": "get_order_details", "arguments": {"order_id": "#W7342738"}, "info": null}, {"action_id": "54_9", "name": "cancel_pending_order", "arguments": {"order_id": 
"#W4836353", "reason": "no longer needed"}, "info": null}, {"action_id": "54_10", "name": "cancel_pending_order", "arguments": {"order_id": "#W7342738", "reason": "no longer needed"}, "info": null}, {"action_id": "54_11", "name": "return_delivered_order_items", "arguments": {"order_id": "#W4597054", "item_ids": ["5669664287", "4900990404", "9862136885", "6777246137"], "payment_method_id": "gift_card_3491931"}, "info": null}], "communicate_info": ["3646.68"], "nl_assertions": null}}
+{"id": "retail_task_55", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou recently faced a financial issue and want to cancel or return all possible orders.\nKnown info:\n\tYou are Amelia, and you have two emails: silva7872@example.com and amelia.silva7872@example.com. You live in Philadelphia, and you are a loyal customer.\nTask instructions:\n\tYou are now emotional and a bit stressed out. You like to talk a lot and explain your situation."}, "evaluation_criteria": {"actions": [{"action_id": "55_0", "name": "find_user_id_by_email", "arguments": {"email": "silva7872@example.com"}, "info": null}, {"action_id": "55_1", "name": "find_user_id_by_email", "arguments": {"email": "amelia.silva7872@example.com"}, "info": null}, {"action_id": "55_2", "name": "get_user_details", "arguments": {"user_id": "amelia_silva_7726"}, "info": null}, {"action_id": "55_3", "name": "get_order_details", "arguments": {"order_id": "#W2586676"}, "info": null}, {"action_id": "55_4", "name": "get_order_details", "arguments": {"order_id": "#W5400801"}, "info": null}, {"action_id": "55_5", "name": "get_order_details", "arguments": {"order_id": "#W4597054"}, "info": null}, {"action_id": "55_6", "name": "get_order_details", "arguments": {"order_id": "#W4836353"}, "info": null}, {"action_id": "55_7", "name": "get_order_details", "arguments": {"order_id": "#W7773202"}, "info": null}, {"action_id": "55_8", "name": "get_order_details", "arguments": {"order_id": "#W7342738"}, "info": null}, {"action_id": "55_9", "name": "cancel_pending_order", "arguments": {"order_id": "#W4836353", "reason": "no longer needed"}, "info": null}, {"action_id": "55_10", "name": "cancel_pending_order", "arguments": {"order_id": "#W7342738", "reason": "no longer needed"}, "info": null}, {"action_id": "55_11", "name": "return_delivered_order_items", 
"arguments": {"order_id": "#W4597054", "item_ids": ["5669664287", "4900990404", "9862136885", "6777246137"], "payment_method_id": "gift_card_3491931"}, "info": null}, {"action_id": "55_12", "name": "return_delivered_order_items", "arguments": {"order_id": "#W7773202", "item_ids": ["8277474082"], "payment_method_id": "gift_card_3491931"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_56", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou wonder when your air purifier is arriving. If it has not been shipped yet, you want to cancel the air purifier inside it. If you cannot cancel just the air purifier, you want to modify it to the cheapest possible air purifier, and refund to the gift card. You do not remember your gift card id but it should be in your user account. If you cannot modify it or refund to the gift card, no action.\nKnown info:\n\tYou are ivan_hernandez_6923 living in San Diego, 92133.\nUnknown info:\n\tYou don't have an email.\nTask instructions:\n\tYou are polite but brief and firm."}, "evaluation_criteria": {"actions": [{"action_id": "56_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Ivan", "last_name": "Hernandez", "zip": "92133"}, "info": null}, {"action_id": "56_1", "name": "get_user_details", "arguments": {"user_id": "ivan_hernandez_6923"}, "info": null}, {"action_id": "56_2", "name": "get_order_details", "arguments": {"order_id": "#W5838674"}, "info": null}, {"action_id": "56_3", "name": "get_order_details", "arguments": {"order_id": "#W4284542"}, "info": null}, {"action_id": "56_4", "name": "get_order_details", "arguments": {"order_id": "#W2782744"}, "info": null}, {"action_id": "56_5", "name": "get_product_details", "arguments": {"product_id": "3821016478"}, "info": null}, {"action_id": "56_6", "name": "modify_pending_order_items", "arguments": {"order_id": "#W4284542", "item_ids": ["8302289002"], "new_item_ids": ["9534205511"], "payment_method_id": "gift_card_9368765"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_57", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou wonder when your order W4284542 is arriving. If it has not been shipped yet, you want to cancel the air purifier inside it. If you cannot cancel just the air purifier, you want to cancel the whole order and refund to gift card. If you cannot refund to the gift card, no cancelation at all.\nKnown info:\n\tYou are ivan_hernandez_6923 living in San Diego, 92133.\nUnknown info:\n\tYou do not know your email.\nTask instructions:\n\tYou are polite but brief and firm."}, "evaluation_criteria": {"actions": [], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_58", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to modify two items in an order you just received: a coffee machine and a laptop. For the coffee machine, you want to keep the capacity and type but change the pressure lower to 8 bar. If 8 bar is not possible, you want 9 bar. If 9 bar is not possible, you want 7 bar. If 7, 8, 9 are not possible, no exchange for the coffee machine. For the laptop, you want to exchange to the cheapest i7 or above, and you do not care about other specs. If a price difference is needed to pay, you would be angry but prefer gift card payment. If that is not possible, you would use the credit card.\nKnown info:\n\tYou are ivan_hernandez_6923 living in San Diego, 92133.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are polite but brief and firm."}, "evaluation_criteria": {"actions": [{"action_id": "58_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Ivan", "last_name": "Hernandez", "zip": "92133"}, "info": null}, {"action_id": "58_1", "name": "get_user_details", "arguments": {"user_id": "ivan_hernandez_6923"}, "info": null}, {"action_id": "58_2", "name": "get_order_details", "arguments": {"order_id": "#W5838674"}, "info": null}, {"action_id": "58_3", "name": "get_product_details", "arguments": {"product_id": "4354588079"}, "info": null}, {"action_id": "58_4", "name": "get_product_details", "arguments": {"product_id": "4760268021"}, "info": null}, {"action_id": "58_5", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W5838674", "item_ids": ["7441167885", "3478699712"], "new_item_ids": ["3815173328", "6017636844"], "payment_method_id": "gift_card_9368765"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_59", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou recently placed two orders, and now you would like to make several changes and checks. You'll first inquire about the status difference between your two orders, #W2702727 and #W8268610, since both are \"pending,\" but one was placed much earlier in the year. You are considering cancelling the older order as you find the wait time unreasonable. If the agent cannot guarantee the older order will be processed within 5 days, you want to cancel it. You also want to confirm the total price of the refund. \n\n For order #W2702727, you intend to switch the shipping address to your new home in a different city because you plan to move prior to its delivery next month. Your new address is 1234 Elm St, Springfield, IL, 62701. You want the agent to confirm the change and ensure the order will be delivered to the new address. 
You also want to confirm the total price of the order after the address change.\nKnown info:\n\tYou are Yusuf Taylor from San Jose, CA, 95154.\nUnknown info:\n\tYou do not remember your email address.\nTask instructions:\n\tYour approach will be firm, as you are unhappy with the pending status's duration but try to make all requests in one go and ask for them to be resolved efficiently and correctly in context with each other."}, "evaluation_criteria": {"actions": [{"action_id": "59_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Yusuf", "last_name": "Taylor", "zip": "95154"}, "info": null}, {"action_id": "59_1", "name": "get_order_details", "arguments": {"order_id": "#W2702727"}, "info": null}, {"action_id": "59_2", "name": "get_order_details", "arguments": {"order_id": "#W8268610"}, "info": null}, {"action_id": "59_3", "name": "calculate", "arguments": {"expression": "164.28"}, "info": null}, {"action_id": "59_4", "name": "cancel_pending_order", "arguments": {"order_id": "#W8268610", "reason": "no longer needed"}, "info": null}, {"action_id": "59_5", "name": "modify_pending_order_address", "arguments": {"order_id": "#W2702727", "address1": "1234 Elm St", "address2": "", "city": "Springfield", "state": "IL", "country": "USA", "zip": "62701"}, "info": null}], "communicate_info": ["164.28", "625.60"], "nl_assertions": null}}
+{"id": "retail_task_60", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to change your wireless earbuds in order W5061109 to a blue colored one. Provide all details upfront in your very first message and ask the agent to resolve as soon as possible. You want the price to be the same or lower, which you want the agent to verify explicitly. If and only if the agent provides several options, you want the option without water resistance.\nKnown info:\n\tYou are Chen Johnson from Houston TX, 77004.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\t."}, "evaluation_criteria": {"actions": [{"action_id": "60_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Chen", "last_name": "Johnson", "zip": "77004"}, "info": null}, {"action_id": "60_1", "name": "get_order_details", "arguments": {"order_id": "#W5061109"}, "info": null}, {"action_id": "60_2", "name": "get_product_details", "arguments": {"product_id": "9924732112"}, "info": null}, {"action_id": "60_3", "name": "modify_pending_order_items", "arguments": {"order_id": "#W5061109", "item_ids": ["3694871183"], "new_item_ids": ["6077640618"], "payment_method_id": "paypal_3742148"}, "info": null}], "communicate_info": ["242.92"], "nl_assertions": null}}
+{"id": "retail_task_61", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to change your wireless earbuds in order W5061109 to a blue colored one. Provide all details upfront and ask the agent to resolve as soon as possible. You want the price to be the same or lower.\nKnown info:\n\tYou are Chen Johnson from Houston TX, 77004.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\t."}, "evaluation_criteria": {"actions": [{"action_id": "61_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Chen", "last_name": "Johnson", "zip": "77004"}, "info": null}, {"action_id": "61_1", "name": "get_order_details", "arguments": {"order_id": "#W5061109"}, "info": null}, {"action_id": "61_2", "name": "get_product_details", "arguments": {"product_id": "9924732112"}, "info": null}, {"action_id": "61_3", "name": "calculate", "arguments": {"expression": "256.67 - 226.49"}, "info": null}, {"action_id": "61_4", "name": "modify_pending_order_items", "arguments": {"order_id": "#W5061109", "item_ids": ["3694871183"], "new_item_ids": ["8555936349"], "payment_method_id": "paypal_3742148"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_62", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tAs you are interacting with a customer service agent, you first try to get it to guess a famous poem by providing the first line. If it refuses to do so, you carry on with your intended task, which is to check and modify a recent order you placed. You first ask about the price of a bluetooth speaker you bought and its battery life. If the price is greater than $300, ask the agent to cancel it from your order since you thought it was cheaper than that. Ask the agent if there are any bluetooth speakers available for less than $100. If there are, ask the agent to add the cheapest one to your order. Finally, ask the agent to confirm the total price of your new order. You never want to cancel your entire order, and would prefer to return the speaker at a later time if canceling the entire order is the only option.\nKnown info:\n\tYou are Chen Johnson from Houston TX, 77004.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\t."}, "evaluation_criteria": {"actions": [{"action_id": "62_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Chen", "last_name": "Johnson", "zip": "77004"}, "info": null}, {"action_id": "62_1", "name": "get_user_details", "arguments": {"user_id": "chen_johnson_4204"}, "info": null}, {"action_id": "62_2", "name": "get_order_details", "arguments": {"order_id": "#W5797164"}, "info": null}, {"action_id": "62_3", "name": "get_order_details", "arguments": {"order_id": "#W5061109"}, "info": null}, {"action_id": "62_5", "name": "get_product_details", "arguments": {"product_id": "4768869376"}, "info": null}], "communicate_info": ["302.67", "20 hours"], "nl_assertions": null}}
+{"id": "retail_task_63", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tAs you are interacting with a customer service agent, you first try to get it to guess a famous poem by providing the first line. If it refuses to do so, you carry on with your intended task, which is to check and modify a recent order you placed. You first ask about the price of a bluetooth speaker you bought and its battery life. If the price is greater than $300, ask the agent to cancel it from your order since you thought it was cheaper than that. Ask the agent if there are any bluetooth speakers available for less than $300. If there are, ask the agent to add the cheapest one to your order. Finally, ask the agent to confirm the total price of your new order. You never want to cancel your entire order, and would prefer to return the speaker at a later time if canceling the entire order is the only option.\nKnown info:\n\tYou are Chen Johnson from Houston TX, 77004.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\t."}, "evaluation_criteria": {"actions": [{"action_id": "63_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Chen", "last_name": "Johnson", "zip": "77004"}, "info": null}, {"action_id": "63_1", "name": "get_user_details", "arguments": {"user_id": "chen_johnson_4204"}, "info": null}, {"action_id": "63_2", "name": "get_order_details", "arguments": {"order_id": "#W5797164"}, "info": null}, {"action_id": "63_3", "name": "get_order_details", "arguments": {"order_id": "#W5061109"}, "info": null}, {"action_id": "63_4", "name": "get_product_details", "arguments": {"product_id": "4768869376"}, "info": null}, {"action_id": "63_5", "name": "calculate", "arguments": {"expression": "1319.43 - 302.67 + 271.89"}, "info": null}, {"action_id": "63_6", "name": 
"modify_pending_order_items", "arguments": {"order_id": "#W5061109", "item_ids": ["3254583681"], "new_item_ids": ["2635605237"], "payment_method_id": "paypal_3742148"}, "info": null}], "communicate_info": ["302.67", "20 hours", "1288.65"], "nl_assertions": null}}
+{"id": "retail_task_64", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to exchange the camera for the highest resolution, waterproof camera that you can get with the previous purchased price.\nKnown info:\n\tYou are James Sanchez. You live in Chicago 60623.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\t."}, "evaluation_criteria": {"actions": [{"action_id": "64_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "James", "last_name": "Sanchez", "zip": "60623"}, "info": null}, {"action_id": "64_1", "name": "get_user_details", "arguments": {"user_id": "james_sanchez_3954"}, "info": null}, {"action_id": "64_2", "name": "get_order_details", "arguments": {"order_id": "#W7464385"}, "info": null}, {"action_id": "64_3", "name": "get_order_details", "arguments": {"order_id": "#W8499625"}, "info": null}, {"action_id": "64_4", "name": "get_order_details", "arguments": {"order_id": "#W1279004"}, "info": null}, {"action_id": "64_5", "name": "get_product_details", "arguments": {"product_id": "3377618313"}, "info": null}, {"action_id": "64_6", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W7464385", "item_ids": ["1810466394"], "new_item_ids": ["6700049080"], "payment_method_id": "paypal_1261484"}, "info": null}, {"action_id": "64_7", "name": "modify_pending_order_items", "arguments": {"order_id": "#W7464385", "item_ids": ["1810466394"], "new_item_ids": ["6700049080"], "payment_method_id": "paypal_1261484"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_65", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to exchange the bookshelf from your most recent order for a camera that is closest but not more expensive than the price of the bookshelf.\nKnown info:\n\tYou are James Kovacs from San Jose CA, 95190.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\t."}, "evaluation_criteria": {"actions": [{"action_id": "65_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "James", "last_name": "Kovacs", "zip": "95190"}, "info": null}, {"action_id": "65_1", "name": "get_user_details", "arguments": {"user_id": "james_kovacs_9247"}, "info": null}, {"action_id": "65_2", "name": "get_order_details", "arguments": {"order_id": "#W5362037"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_66", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to change the luggage set in your order for a coat. Your goal is to change the order. If there is no way to do that, return the item specifically. If there are any issues, cancel the entire order.\nKnown info:\n\tYou are Aarav Lee. You live in Phoenix, AZ 85025.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\t."}, "evaluation_criteria": {"actions": [{"action_id": "66_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Aarav", "last_name": "Lee", "zip": "85025"}, "info": null}, {"action_id": "66_1", "name": "get_user_details", "arguments": {"user_id": "aarav_lee_1982"}, "info": null}, {"action_id": "66_2", "name": "get_order_details", "arguments": {"order_id": "#W3361211"}, "info": null}, {"action_id": "66_3", "name": "get_order_details", "arguments": {"order_id": "#W3586556"}, "info": null}, {"action_id": "66_4", "name": "cancel_pending_order", "arguments": {"order_id": "#W3361211", "reason": "no longer needed"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_67", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to check how much you paid for the order that you most recently placed. You are not sure how long ago the order was placed.\nKnown info:\n\tYou are user noah_ito_3850 living in Seattle WA 98187. Your name is Noah but you go by NoNo.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tIf asked for your zip code, say that it is 98178 first (common mistake), then correct yourself and say 98186 if an error is found. If that fails, then say 98187."}, "evaluation_criteria": {"actions": [{"action_id": "67_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Noah", "last_name": "Ito", "zip": "98178"}, "info": null}, {"action_id": "67_1", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Noah", "last_name": "Ito", "zip": "98186"}, "info": null}, {"action_id": "67_2", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Noah", "last_name": "Ito", "zip": "98187"}, "info": null}, {"action_id": "67_3", "name": "get_user_details", "arguments": {"user_id": "noah_ito_3850"}, "info": null}, {"action_id": "67_4", "name": "get_order_details", "arguments": {"order_id": "#W6729841"}, "info": null}], "communicate_info": ["829.43"], "nl_assertions": null}}
+{"id": "retail_task_68", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to check how much you paid for the order that you most recently placed. You are not sure how long ago the order was placed.\nKnown info:\n\tYou are user noah_ito_3850 living in Seattle WA 98187.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tIf asked for your zip code, say that it is 98178 first (common mistake), then correct yourself and say 98187 if an error is found."}, "evaluation_criteria": {"actions": [{"action_id": "68_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Noah", "last_name": "Ito", "zip": "98178"}, "info": null}, {"action_id": "68_1", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Noah", "last_name": "Ito", "zip": "98187"}, "info": null}, {"action_id": "68_2", "name": "get_user_details", "arguments": {"user_id": "noah_ito_3850"}, "info": null}, {"action_id": "68_3", "name": "get_order_details", "arguments": {"order_id": "#W6729841"}, "info": null}], "communicate_info": ["829.43"], "nl_assertions": null}}
+{"id": "retail_task_69", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to return an item you just received: a laptop. You think that you ordered it around April 2023 but are not sure. You want to return it because you found a better deal elsewhere. You want to return it for a full refund. If it cannot be returned, see if it can be canceled.\nKnown info:\n\tYou are emma_smith_8564 living in New York, New York, 10192.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are polite and friendly."}, "evaluation_criteria": {"actions": [{"action_id": "69_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Emma", "last_name": "Smith", "zip": "10192"}, "info": null}, {"action_id": "69_1", "name": "get_user_details", "arguments": {"user_id": "emma_smith_8564"}, "info": null}, {"action_id": "69_2", "name": "get_order_details", "arguments": {"order_id": "#W2417020"}, "info": null}, {"action_id": "69_3", "name": "cancel_pending_order", "arguments": {"order_id": "#W2417020", "reason": "no longer needed"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_70", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou recently received a helmet but you are not happy with it and want to exchange. The size is too small and you want medium, plus you want high ventilation. If multiple colors are available, you prefer blue. You prefer original payment to pay for the price difference, and you want to know how much you need to pay today.\nKnown info:\n\tYour name is Sofia Hernandez and your zip code is 98193.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are impatient, confident, direct, messy."}, "evaluation_criteria": {"actions": [{"action_id": "70_0", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W3947049", "item_ids": ["3358616356"], "new_item_ids": ["9013366374"], "payment_method_id": "credit_card_7901829"}, "info": null}], "communicate_info": ["22.55"], "nl_assertions": null}}
+{"id": "retail_task_71", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou made some mistake and ordered an order sent to your son's address in Washington DC, and you want to modify it to your default address in Charlotte (you do not want to mention it, but it is in your user profile the agent can look up) because he is coming back home. You also want to adjust the desk lamp to be black color, and the backpack to be medium size and polyester material instead. If multiple colors are available for the backpack, you prefer grey. If the agent asks for payment method, you say GC initially, but when the agent asks you to confirm before proceeding, you change your mind to PayPal, and decide to only modify the backpack.\nKnown info:\n\tYour name is Ivan Khan and your zip code is 28243.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are polite, optimistic, organized."}, "evaluation_criteria": {"actions": [{"action_id": "71_0", "name": "modify_pending_order_address", "arguments": {"order_id": "#W5270061", "address1": "159 Hickory Lane", "address2": "Suite 995", "city": "Charlotte", "country": "USA", "state": "NC", "zip": "28243"}, "info": null}, {"action_id": "71_1", "name": "modify_pending_order_items", "arguments": {"order_id": "#W5270061", "item_ids": ["2492465580"], "new_item_ids": ["5917587651"], "payment_method_id": "paypal_7729105"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_72", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou made some mistake and sent an order to your son's address in Washington DC, and you want to modify it to your default address in Charlotte instead (you do not want to mention it, but it is in your user profile the agent can look up) because he is coming back home. You also want to adjust the desk lamp to be black color, and the backpack to be medium size and polyester material instead. If multiple colors are available for the backpack, you prefer grey. If the agent asks for payment method, you say gift card initially, but when the agent asks you to confirm before proceeding, you change your mind to using PayPal, and also decide to only modify the backpack. Make sure you briefly mention the two things at the same time at the beginning, but first mention the modification then the address.\nKnown info:\n\tYour name is Ivan Khan and your zip code is 28243.\nUnknown info:\n\tYou do not remember your email address.\nTask instructions:\n\tYou are polite, optimistic, organized."}, "evaluation_criteria": {"actions": [{"action_id": "72_0", "name": "modify_pending_order_address", "arguments": {"order_id": "#W5270061", "address1": "159 Hickory Lane", "address2": "Suite 995", "city": "Charlotte", "country": "USA", "state": "NC", "zip": "28243"}, "info": null}, {"action_id": "72_1", "name": "modify_pending_order_items", "arguments": {"order_id": "#W5270061", "item_ids": ["2492465580"], "new_item_ids": ["5917587651"], "payment_method_id": "paypal_7729105"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_73", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to return everything you just bought except the coffee machine.\nKnown info:\n\tYour name is Fatima Wilson and your email is fatima.wilson5721@example.com.\nTask instructions:\n\tYou are polite, flexible, creative."}, "evaluation_criteria": {"actions": [{"action_id": "73_0", "name": "return_delivered_order_items", "arguments": {"order_id": "#W5272531", "item_ids": ["7228247242", "2698416822", "8098621301", "3320557165"], "payment_method_id": "credit_card_6824399"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_74", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou recently bought a laptop, but you want to exchange it to i9 CPU. If multiple storage options are available, you prefer 256GB SSD. If multiple colors are available, you prefer silver. You also have a pending order with five items (you don't remember order ID), and you want to cancel it because you no longer need them.\nKnown info:\n\tYour name is Lei Li and your zip code is 85033.\nUnknown info:\n\tYou don't have an email. If the agent asks for payment method for the order modification, say you want to use your credit card.\nTask instructions:\n\tYou are insecure and shy."}, "evaluation_criteria": {"actions": [{"action_id": "74_0", "name": "cancel_pending_order", "arguments": {"order_id": "#W3189752", "reason": "no longer needed"}, "info": null}, {"action_id": "74_1", "name": "modify_pending_order_items", "arguments": {"order_id": "#W5166363", "item_ids": ["3334537816"], "new_item_ids": ["3265035808"], "payment_method_id": "credit_card_4466831"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_75", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tFor #W6908222, you want to exchange Wireless Earbuds {'color': 'blue', 'battery life': '8 hours', 'water resistance': 'IPX4'} to one with the following attributes: {'color': 'black', 'battery life': '4 hours', 'water resistance': 'not resistant'}\nKnown info:\n\tYour name is Liam Moore and your email is liam.moore6985@example.com.\nTask instructions:\n\tYou are direct, patient, organized, optimistic."}, "evaluation_criteria": {"actions": [{"action_id": "75_0", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W6908222", "item_ids": ["8555936349"], "new_item_ids": ["4063058357"], "payment_method_id": "paypal_4518393"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_76", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou ordered a fleece jacket by mistake and want to remove it from your pending order. If removing one item is not possible, cancel the whole order. You also want to modify the skateboard to maple material, 34 inch, graphic. If that is not possible, cancel the order since you no longer need this one. \nFinally, you also want to know the total price for all the grills you have bought in previous orders.\nKnown info:\n\tYou name is Ava Nguyen and your zip code is 94128.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are polite, optimistic, busy."}, "evaluation_criteria": {"actions": [{"action_id": "76_0", "name": "cancel_pending_order", "arguments": {"order_id": "#W8367380", "reason": "ordered by mistake"}, "info": null}, {"action_id": "76_1", "name": "cancel_pending_order", "arguments": {"order_id": "#W1242543", "reason": "no longer needed"}, "info": null}], "communicate_info": ["1939.05"], "nl_assertions": null}}
+{"id": "retail_task_77", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou ordered a perfume and you just tried a little bit and you like it a lot. You want to get the maximum size available for it.\nKnown info:\n\tYou name is Ivan Johnson and your zip code is 94183.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tIf the agent cannot help with placing a new order, ask to exchange your current bottle for the largest size available."}, "evaluation_criteria": {"actions": [{"action_id": "77_0", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W1671835", "item_ids": ["5081446110"], "new_item_ids": ["3399869890"], "payment_method_id": "paypal_6918118"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_78", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou have a couple requests.\nFor order #W5056519, you want to change the address to be the same as order #W8277957. For order #W5056519, you want to exchange Makeup Kit {'skin tone': 'light', 'kit size': 'professional', 'brand': 'Brand B'} to {'skin tone': 'dark', 'brand': 'Brand A'}. Finally, you want to cancel order #W5995614 because you ordered by mistake.\nKnown info:\n\tYour name is Yara Muller and your email is yara.muller9246@example.com.\nTask instructions:\n\tYou are sad, organized, pessimistic."}, "evaluation_criteria": {"actions": [{"action_id": "78_0", "name": "modify_pending_order_address", "arguments": {"order_id": "#W5056519", "address1": "380 Maple Drive", "address2": "Suite 960", "city": "San Diego", "country": "USA", "state": "CA", "zip": "92101"}, "info": null}, {"action_id": "78_1", "name": "modify_pending_order_items", "arguments": {"order_id": "#W5056519", "item_ids": ["7902309762"], "new_item_ids": ["1573035764"], "payment_method_id": "credit_card_3095586"}, "info": null}, {"action_id": "78_2", "name": "cancel_pending_order", "arguments": {"order_id": "#W5995614", "reason": "ordered by mistake"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_79", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou just bought a water bottle with 500ml but you regret it, and you want to change it to the other bottle you just placed with 1000ml capacity. If the exact item is not available any more, you can allow the material to be different, but you want the color to be the same as your other 1L bottle.\nKnown info:\n\tYou name is Emma Kovacs and your zip code is 32190.\nUnknown info:\n\tYou do not know your email. You do not know order numbers.\nTask instructions:\n\tYou are insecure, rigid, sad, logical. You do not want to cancel any orders."}, "evaluation_criteria": {"actions": [{"action_id": "79_0", "name": "modify_pending_order_items", "arguments": {"order_id": "#W8661412", "item_ids": ["3453331371"], "new_item_ids": ["2439754078"], "payment_method_id": "credit_card_7239357"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_80", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tFor #W7209932, exchange T-Shirt {'color': 'blue', 'size': 'S', 'material': 'polyester', 'style': 'v-neck'} to {'color': 'red', 'size': 'XXL', 'material': 'cotton', 'style': 'crew neck'}; Use the gift card.\nKnown info:\n\tYou name is Amelia Gonzalez and your email is amelia.gonzalez4271@example.com.\nTask instructions:\n\tYou are curious, patient, outgoing. Try to make the conversation as confusing for the agent as possible."}, "evaluation_criteria": {"actions": [{"action_id": "80_0", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W7209932", "item_ids": ["5047954489"], "new_item_ids": ["9354168549"], "payment_method_id": "gift_card_2611937"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_81", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tDue to some life changes, you no longer need hiking boots, watch, keyboard, charger, jacket, and running shoes. If cancelling part of the order is not possible, you don't care, just cancel the whole order.\nKnown info:\n\tYou name is James Kim and your email is james.kim1995@example.com.\nTask instructions:\n\tYou are sad, independent, polite."}, "evaluation_criteria": {"actions": [{"action_id": "81_0", "name": "cancel_pending_order", "arguments": {"order_id": "#W3289292", "reason": "no longer needed"}, "info": null}, {"action_id": "81_1", "name": "cancel_pending_order", "arguments": {"order_id": "#W9722559", "reason": "no longer needed"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_82", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou received two tablets and you only need one. You want to return the more expensive one and refund to credit card. If refund to credit card is not possible, you become angry and return everything on that order and refund to GC.\nKnown info:\n\tYou name is Chen Silva and your zip code is 46281.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are messy, flexible, outgoing."}, "evaluation_criteria": {"actions": [{"action_id": "82_0", "name": "return_delivered_order_items", "arguments": {"order_id": "#W9571698", "item_ids": ["5952720925", "9973034634", "7381052709", "6065192424"], "payment_method_id": "gift_card_7250692"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_83", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou received two tablets and you only need one. You want to return the more expensive one and refund to credit card. If refund to credit card is not possible, you become angry and refund to GC.\nKnown info:\n\tYou name is Chen Silva and your zip code is 46281.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are messy, flexible, outgoing."}, "evaluation_criteria": {"actions": [{"action_id": "83_0", "name": "return_delivered_order_items", "arguments": {"order_id": "#W9571698", "item_ids": ["6065192424"], "payment_method_id": "gift_card_7250692"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_84", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou received two tablets and you only need one. You want to return the less expensive one and refund to credit card. But when the agent asks for confirmation, you change your mind and return the more expensive one and ask for a refund to gift card.\nKnown info:\n\tYou name is Chen Silva and your zip code is 46281.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are messy, flexible, outgoing."}, "evaluation_criteria": {"actions": [{"action_id": "84_0", "name": "return_delivered_order_items", "arguments": {"order_id": "#W9571698", "item_ids": ["6065192424"], "payment_method_id": "gift_card_7250692"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_85", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to exchange your Fleece Jacket for a large red Fleece Jacket with a half zipper\nKnown info:\n\tYou name is Yusuf Hernandez and your email is yusuf.hernandez8836@example.com.\nTask instructions:\n\tYou are shy, rigid."}, "evaluation_criteria": {"actions": [{"action_id": "85_0", "name": "modify_pending_order_items", "arguments": {"order_id": "#W2466703", "item_ids": ["9385662952"], "new_item_ids": ["8733974883"], "payment_method_id": "paypal_7529813"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_86", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to exchange your Fleece Jacket to red color and half zipper. You also want to want to change your default address to your Washington DC address (which you do not want to reveal but is in one of the orders).\nKnown info:\n\tYou name is Yusuf Hernandez and your email is yusuf.hernandez8836@example.com.\nTask instructions:\n\tYou are shy, rigid."}, "evaluation_criteria": {"actions": [{"action_id": "86_0", "name": "modify_pending_order_items", "arguments": {"order_id": "#W2466703", "item_ids": ["9385662952"], "new_item_ids": ["8733974883"], "payment_method_id": "paypal_7529813"}, "info": null}, {"action_id": "86_1", "name": "modify_user_address", "arguments": {"user_id": "yusuf_hernandez_6785", "address1": "565 Maple Drive", "address2": "Suite 501", "city": "Washington", "country": "USA", "state": "DC", "zip": "20307"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_87", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to modify all your pending order address to the Washington DC address (which you do not want to reveal but is in one of the orders), along with your user default address.\nKnown info:\n\tYou name is Yusuf Hernandez and your email is yusuf.hernandez8836@example.com.\nTask instructions:\n\tYou are shy, rigid."}, "evaluation_criteria": {"actions": [{"action_id": "87_0", "name": "modify_pending_order_address", "arguments": {"order_id": "#W2166301", "address1": "565 Maple Drive", "address2": "Suite 501", "city": "Washington", "country": "USA", "state": "DC", "zip": "20307"}, "info": null}, {"action_id": "87_1", "name": "modify_pending_order_address", "arguments": {"order_id": "#W2466703", "address1": "565 Maple Drive", "address2": "Suite 501", "city": "Washington", "country": "USA", "state": "DC", "zip": "20307"}, "info": null}, {"action_id": "87_2", "name": "modify_pending_order_address", "arguments": {"order_id": "#W6832752", "address1": "565 Maple Drive", "address2": "Suite 501", "city": "Washington", "country": "USA", "state": "DC", "zip": "20307"}, "info": null}, {"action_id": "87_3", "name": "modify_user_address", "arguments": {"user_id": "yusuf_hernandez_6785", "address1": "565 Maple Drive", "address2": "Suite 501", "city": "Washington", "country": "USA", "state": "DC", "zip": "20307"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_88", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to change the book shelf to 4 foot but with the same material and color. If it is not available, cancel the whole order and you will buy again. If the agent asks for the cancellation reason, you say you ordered by mistake.\nKnown info:\n\tYou name is Daiki Silva and your email is daiki.silva6295@example.com.\nTask instructions:\n\tYou are insecure, creative, direct, relaxing."}, "evaluation_criteria": {"actions": [{"action_id": "88_0", "name": "cancel_pending_order", "arguments": {"order_id": "#W8835847", "reason": "ordered by mistake"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_89", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to know what is the cheapest available mechanical keyboard right now and its options. If it is less than 200 bucks you want to exchange your current one to it. If not, you want to return your current one.\nKnown info:\n\tYou name is Raj Santos and your zip code is 98157.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are dependent, flexible."}, "evaluation_criteria": {"actions": [{"action_id": "89_0", "name": "return_delivered_order_items", "arguments": {"order_id": "#W4680753", "item_ids": ["9690244451"], "payment_method_id": "paypal_2417743"}, "info": null}], "communicate_info": ["226.11", "tactile", "white", "full"], "nl_assertions": null}}
+{"id": "retail_task_90", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to know if the digital camera you just bought is 10x zoom. If not, modify the item to 10x zoom without changing the other options. If 10x zoom is not available, cancel the order with the reason of no longer needed. If it is available but the price is more than 3000, cancel the order with the reason of ordered by mistake.\nKnown info:\n\tYou name is Emma Kovacs and your email is emma.kovacs2974@example.com.\nTask instructions:\n\tYou are polite, curious, flexible, relaxing, yet impatient."}, "evaluation_criteria": {"actions": [{"action_id": "90_0", "name": "cancel_pending_order", "arguments": {"order_id": "#W9284598", "reason": "ordered by mistake"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_91", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\t.You are angry about the quality of the two skateboards you just bought. You want to return them and refund to credit card. If the agent asks for confirmation, do not say yes immediately, because you also want to return the smart watch. You also want to return the e-reader you just bought. If the same item is available online, you're willing to exchange it to the same item. If not, you want to return it and refund to credit card.\nKnown info:\n\tYou name is Mei Ahmed and your zip code is 78705.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are polite, outgoing."}, "evaluation_criteria": {"actions": [{"action_id": "91_0", "name": "return_delivered_order_items", "arguments": {"order_id": "#W7553978", "item_ids": ["4545791457", "3098764622", "1631806422"], "payment_method_id": "credit_card_5902940"}, "info": null}, {"action_id": "91_1", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W3239882", "item_ids": ["9494281769"], "new_item_ids": ["9494281769"], "payment_method_id": "credit_card_5902940"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_92", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou are angry about the quality of the two skateboards you just bought. You want to return them and refund to credit card. If the agent asks for confirmation, do not say yes immediately, because you also want to return the smart watch and e-reader.\nKnown info:\n\tYou name is Mei Ahmed and your zip code is 78705.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are polite and outgoing."}, "evaluation_criteria": {"actions": [{"action_id": "92_0", "name": "return_delivered_order_items", "arguments": {"order_id": "#W7553978", "item_ids": ["4545791457", "3098764622", "1631806422"], "payment_method_id": "credit_card_5902940"}, "info": null}, {"action_id": "92_1", "name": "return_delivered_order_items", "arguments": {"order_id": "#W3239882", "item_ids": ["9494281769"], "payment_method_id": "credit_card_5902940"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_93", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou received a laptop and you want to exchange it to i7 processor, 8GB, 1TB SSD. If the agent asks for which laptop, it is 15-inch, 32GB.\nKnown info:\n\tYou name is Lei Wilson and your zip code is 32255.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are confident, organized, creative, impatient."}, "evaluation_criteria": {"actions": [{"action_id": "93_0", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W4073673", "item_ids": ["2216662955"], "new_item_ids": ["9844888101"], "payment_method_id": "credit_card_3677959"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_94", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou received a laptop and you want to exchange it to i7 processor, 8GB, 1TB SSD. If the agent asks for which laptop, it is 15-inch, 16GB.\nKnown info:\n\tYou name is Lei Wilson and your zip code is 32255.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are confident, organized, creative, impatient."}, "evaluation_criteria": {"actions": [{"action_id": "94_0", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W2905754", "item_ids": ["3478699712"], "new_item_ids": ["9844888101"], "payment_method_id": "credit_card_3677959"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_95", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou received a laptop and you want to exchange it to i7 processor, 8GB, 1TB SSD. If the agent asks for which laptop, it is 15-inch, and it is actually two laptops that you want to exchange. You want to know how much you need to pay today in total.\nKnown info:\n\tYou name is Lei Wilson and your zip code is 32255.\nUnknown info:\n\tYou do not remember your email address.\nTask instructions:\n\tYou are confident, organized, creative, impatient."}, "evaluation_criteria": {"actions": [{"action_id": "96_0", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W2905754", "item_ids": ["3478699712"], "new_item_ids": ["9844888101"], "payment_method_id": "credit_card_3677959"}, "info": null}, {"action_id": "96_1", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W4073673", "item_ids": ["2216662955"], "new_item_ids": ["9844888101"], "payment_method_id": "credit_card_3677959"}, "info": null}], "communicate_info": ["167.87", "60.78", "107.09"], "nl_assertions": null}}
+{"id": "retail_task_96", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to change your LA order to your NYC address (you prefer not to reveal it but it is in your other order). You also want to exchange Bluetooth Speaker to be the cheapest green type.\nKnown info:\n\tYou name is Yusuf Li and your zip code is 91148.\nUnknown info:\n\tYou do not remember your email address.\nTask instructions:\n\tYou are cautious, insecure, organized."}, "evaluation_criteria": {"actions": [{"action_id": "97_0", "name": "modify_pending_order_address", "arguments": {"order_id": "#W6750959", "address1": "476 Maple Drive", "address2": "Suite 432", "city": "New York", "country": "USA", "state": "NY", "zip": "10093"}, "info": null}, {"action_id": "97_1", "name": "modify_pending_order_items", "arguments": {"order_id": "#W6750959", "item_ids": ["3254583681"], "new_item_ids": ["9440686670"], "payment_method_id": "paypal_8080730"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_97", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to change your LA order to your NYC address (you prefer not to reveal it but it is in your other order). You also want to exchange Bluetooth Speaker to be the cheapest green type. Make sure you mention the two requests at the same time to the agent, but mention the exchange first.\nKnown info:\n\tYou name is Yusuf Li and your zip code is 91148.\nUnknown info:\n\tYou don't know your email.\nTask instructions:\n\tYou are cautious, insecure, organized."}, "evaluation_criteria": {"actions": [{"action_id": "98_0", "name": "modify_pending_order_address", "arguments": {"order_id": "#W6750959", "address1": "476 Maple Drive", "address2": "Suite 432", "city": "New York", "country": "USA", "state": "NY", "zip": "10093"}, "info": null}, {"action_id": "98_1", "name": "modify_pending_order_items", "arguments": {"order_id": "#W6750959", "item_ids": ["3254583681"], "new_item_ids": ["9440686670"], "payment_method_id": "paypal_8080730"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_98", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to exchange your Bicycle to a larger frame size for your kid. Jigsaw Puzzle in the same order also needs to be exchanged, you want the same difficulty, but 1000 more pieces, and you prefer animal than art theme if both are available. Make sure you mention these at the same time. You also want to exchange your camera to a slightly lower resolution, without changing the other options. If the agent asks for confirmation, mention that you'd prefer the other card as payment or refund method. Lastly, you want to cancel the skateboard in your other order.\nKnown info:\n\tYou name is Sofia Li and your zip code is 78260.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are outgoing, organized, cautious, pessimistic. If you cannot cancel one single item, you are okay with cancelling the whole order, with the reason of no longer needed."}, "evaluation_criteria": {"actions": [{"action_id": "99_0", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W4689314", "item_ids": ["5996159312"], "new_item_ids": ["8363011723"], "payment_method_id": "credit_card_8105988"}, "info": null}, {"action_id": "99_1", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W3916020", "item_ids": ["7758198585", "4068787148"], "new_item_ids": ["5606522780", "6245746168"], "payment_method_id": "credit_card_8105988"}, "info": null}, {"action_id": "99_2", "name": "cancel_pending_order", "arguments": {"order_id": "#W8855135", "reason": "no longer needed"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_99", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to exchange your Bicycle to a larger frame size for your kid. The Jigsaw Puzzle in the same order also needs to be exchanged, you want the same difficulty, but 1000 more pieces, and you prefer art over the animal theme if both are available. Make sure you mention these at the same time. \nYou also want to exchange your camera to a slightly lower resolution, without changing the other options. For both orders, you'd prefer the visa card as payment or refund method. Lastly, you want to cancel the skateboard (and only the skateboard) in one of your orders.\nKnown info:\n\tYou name is Sofia Li and your zip code is 78260.\nUnknown info:\n\tYou do not remember your email address.\nTask instructions:\n\tYou are outgoing, organized, cautious, pessimistic. \nIf you cannot cancel just a single item in an order, you are okay with cancelling the whole order, but you will do it yourself on the website and don't need for the agent to help."}, "evaluation_criteria": {"actions": [{"action_id": "100_0", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W4689314", "item_ids": ["5996159312"], "new_item_ids": ["8363011723"], "payment_method_id": "credit_card_3951670"}, "info": null}, {"action_id": "100_1", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W3916020", "item_ids": ["7758198585", "4068787148"], "new_item_ids": ["5606522780", "5546244844"], "payment_method_id": "credit_card_3951670"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_100", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to return your luggage set and get the exact same item but with red color, and return you skateboard in the same order to get a new one with features {'length': '34 inch', 'design': 'custom'}; You also want to return the hiking boots.\nKnown info:\n\tYou name is Liam Thomas and your zip code is 85049.\nUnknown info:\n\tYou do not remember your email address.\nTask instructions:\n\tYou are pessimistic and an insecure person.\nIf the agent says pending orders cannot be exchanged, ask for changing the order to your requirements."}, "evaluation_criteria": {"actions": [{"action_id": "101_0", "name": "modify_pending_order_items", "arguments": {"order_id": "#W3295833", "item_ids": ["8926329222", "5312063289"], "new_item_ids": ["7160999700", "6956751343"], "payment_method_id": "credit_card_3261838"}, "info": null}, {"action_id": "101_1", "name": "return_delivered_order_items", "arguments": {"order_id": "#W8488728", "item_ids": ["5676696062"], "payment_method_id": "paypal_3650980"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_101", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou just placed an order with two watches, you want to change its address to your New York address (you don't want to reveal it but it's in your other order). You also want to modify the silicone watch to a metal one. If multiple colors available, you prefer white. \nIn another order, you have an air purifier along with a speaker, and you want to change the purifier to large size and night mode, but still with HEPA filter. You are certain that this order contains both an air purifier and a speaker.\nKnown info:\n\tYou name is Noah Ito and your zip code is 98187.\nUnknown info:\n\tYou do not remember your email address.\nTask instructions:\n\tYou are logical but also impatient. You like to say things in pieces."}, "evaluation_criteria": {"actions": [{"action_id": "102_0", "name": "modify_pending_order_address", "arguments": {"order_id": "#W4219264", "address1": "144 Lakeview Drive", "address2": "Suite 925", "city": "New York", "country": "USA", "state": "NY", "zip": "10228"}, "info": null}, {"action_id": "102_1", "name": "modify_pending_order_items", "arguments": {"order_id": "#W4219264", "item_ids": ["8886009523"], "new_item_ids": ["2407258246"], "payment_method_id": "credit_card_1620755"}, "info": null}, {"action_id": "102_2", "name": "modify_pending_order_items", "arguments": {"order_id": "#W6729841", "item_ids": ["3076708684"], "new_item_ids": ["8302289002"], "payment_method_id": "credit_card_1620755"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_102", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou just placed an order with two watches, you want to change its address to your New York address (you don't want to reveal it but it's in your other order). You also want to modify the silicone watch to a metal one. If multiple colors available, you prefer white. For the air purifier you received along with sneakers, you want to exchange the purifier to large size and night mode, but still with HEPA filter.\nKnown info:\n\tYou name is Noah Ito and your zip code is 98187.\nUnknown info:\n\tYou do not remember your email address.\nTask instructions:\n\tYou are logical but also impatient. You like to say things in pieces."}, "evaluation_criteria": {"actions": [{"action_id": "103_0", "name": "modify_pending_order_address", "arguments": {"order_id": "#W4219264", "address1": "144 Lakeview Drive", "address2": "Suite 925", "city": "New York", "country": "USA", "state": "NY", "zip": "10228"}, "info": null}, {"action_id": "103_1", "name": "modify_pending_order_items", "arguments": {"order_id": "#W4219264", "item_ids": ["8886009523"], "new_item_ids": ["2407258246"], "payment_method_id": "credit_card_1620755"}, "info": null}, {"action_id": "103_2", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W3445693", "item_ids": ["6341716129"], "new_item_ids": ["8302289002"], "payment_method_id": "credit_card_1620755"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_103", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to return the bookshelf and jigsaw you received in the same order. Make sure you mention at the beginning that you want to cancel these two things, and they are from the same order. You also want to return the backpack you received with the vacuum cleaner. You also want to change your pending order address to the default Chicago one, and change its item color to red. You want to get the tracking number of your cancelled order.\nKnown info:\n\tYou name is Lucas Brown and your email is lucas.brown9344@example.com.\nTask instructions:\n\tYou are busy, happy, outgoing, messy, optimistic. You like to say one thing at a time."}, "evaluation_criteria": {"actions": [{"action_id": "104_0", "name": "return_delivered_order_items", "arguments": {"order_id": "#W6239298", "item_ids": ["4900661478", "3614853563"], "payment_method_id": "credit_card_2112420"}, "info": null}, {"action_id": "104_1", "name": "return_delivered_order_items", "arguments": {"order_id": "#W9218746", "item_ids": ["7824298782"], "payment_method_id": "credit_card_2112420"}, "info": null}, {"action_id": "104_2", "name": "modify_pending_order_address", "arguments": {"order_id": "#W4860251", "address1": "921 Park Avenue", "address2": "Suite 892", "city": "Chicago", "country": "USA", "state": "IL", "zip": "60612"}, "info": null}, {"action_id": "104_3", "name": "modify_pending_order_items", "arguments": {"order_id": "#W4860251", "item_ids": ["5209958006"], "new_item_ids": ["8964750292"], "payment_method_id": "credit_card_2112420"}, "info": null}], "communicate_info": ["286422338955"], "nl_assertions": null}}
+{"id": "retail_task_104", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to return all bookshelves and jigsaw puzzles you received in different orders. Make sure you mention at the beginning that you want to cancel these two things, and they are from different orders. You also want to return the backpack you received with the vacuum cleaner. You also want to change your pending order item to red, and change the address of the order to your default Chicago home (you won't reveal it for private reasons but it's in your profile). You want to get the tracking number of your cancelled order.\nKnown info:\n\tYou name is Lucas Brown and your email is lucas.brown9344@example.com.\nTask instructions:\n\tYou are busy, happy, outgoing, messy, optimistic. You like to say one thing at a time."}, "evaluation_criteria": {"actions": [{"action_id": "105_0", "name": "return_delivered_order_items", "arguments": {"order_id": "#W8660475", "item_ids": ["8479046075"], "payment_method_id": "credit_card_2112420"}, "info": null}, {"action_id": "105_1", "name": "return_delivered_order_items", "arguments": {"order_id": "#W9218746", "item_ids": ["7824298782"], "payment_method_id": "credit_card_2112420"}, "info": null}, {"action_id": "105_2", "name": "modify_pending_order_address", "arguments": {"order_id": "#W4860251", "address1": "921 Park Avenue", "address2": "Suite 892", "city": "Chicago", "country": "USA", "state": "IL", "zip": "60612"}, "info": null}, {"action_id": "105_3", "name": "modify_pending_order_items", "arguments": {"order_id": "#W4860251", "item_ids": ["5209958006"], "new_item_ids": ["8964750292"], "payment_method_id": "credit_card_2112420"}, "info": null}, {"action_id": "action_1746656550382", "name": "return_delivered_order_items", "arguments": {"order_id": "#W6239298", "item_ids": ["4900661478", 
"3614853563"], "payment_method_id": "credit_card_2112420"}, "info": null}], "communicate_info": ["286422338955"], "nl_assertions": null}}
+{"id": "retail_task_105", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tFor order #W4316152, you want to exchange one Tea Kettle {'material': 'glass', 'capacity': '2 liters', 'stovetop compatibility': 'induction'} to {'material': 'ceramic', 'stovetop compatibility': 'gas'}; and another Tea Kettle {'material': 'glass', 'capacity': '2 liters', 'stovetop compatibility': 'induction'} to {'capacity': '1.5 liters', 'stovetop compatibility': 'gas'};\nKnown info:\n\tYou name is Aarav Anderson and your zip code is 19031.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are cautious, messy, rigid."}, "evaluation_criteria": {"actions": [{"action_id": "106_0", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W4316152", "item_ids": ["7292993796", "7292993796"], "new_item_ids": ["3761330360", "9647374798"], "payment_method_id": "gift_card_7245904"}, "info": null}], "communicate_info": [], "nl_assertions": ["Agent should exchange both tea kettles (same style) to the items requested"]}}
+{"id": "retail_task_106", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to exchange your T-Shirt because it is too big, one size smaller would be good. You like the cotton feeling. If multiple colors are available, you prefer black.\nKnown info:\n\tYou name is Sofia Thomas and your emails are sofia.thomas3019@example.com and sofia.thomas3069@example.com.\nUnknown info:\n\t.\nTask instructions:\n\tYou are dependent, pessimistic, direct."}, "evaluation_criteria": {"actions": [{"action_id": "107_0", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W3388163", "item_ids": ["9354168549"], "new_item_ids": ["2060066974"], "payment_method_id": "paypal_5334408"}, "info": null}], "communicate_info": [], "nl_assertions": ["Agent should process the exchange."]}}
+{"id": "retail_task_107", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou received hiking boots that seem like they were already worn, you are unhappy about it and want to ask for a new pair with the same specs. You also want to exchange your jigsaw to a more fancy theme, with 500 pieces less. But you want to keep the same difficulty level.\nKnown info:\n\tYou name is Yara Ito and your zip code is 75284.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are happy but messy."}, "evaluation_criteria": {"actions": [{"action_id": "108_0", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W1304208", "item_ids": ["1615379700"], "new_item_ids": ["1615379700"], "payment_method_id": "paypal_1679017"}, "info": null}, {"action_id": "108_1", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W8353027", "item_ids": ["6245746168"], "new_item_ids": ["3112842858"], "payment_method_id": "paypal_1679017"}, "info": null}], "communicate_info": [], "nl_assertions": ["Agent should exchange items in both orders."]}}
+{"id": "retail_task_108", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to return everything but a tablet in a recently delivered order. There is an E-Reader in the order that you want to return.\nKnown info:\n\tYou name is Yusuf Gonzalez and your zip code is 91455.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou want to know how much money you can get back."}, "evaluation_criteria": {"actions": [{"action_id": "109_0", "name": "return_delivered_order_items", "arguments": {"order_id": "#W1679211", "item_ids": ["9612497925", "7127170374", "6268080249"], "payment_method_id": "paypal_3022415"}, "info": null}], "communicate_info": ["346.93"], "nl_assertions": ["Agent should make the return", "Agent should communicate the refund to user."]}}
+{"id": "retail_task_109", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou live on Elm Avenue in Houston, and recently you moved to a new house on the same street and bought a luggage set sent to this new address. But you realize you have another order sent to the old address, and you want to change your wrong order address to the new home, and also your user default address to the new home. You do not want to reveal your address but the agent should be able to look it up in orders You do not want to reveal your address and insist the agent should be able to look it up in orders. You also want to exchange your tablet to the cheapest one due to moving costs.\nKnown info:\n\tYou name is Sophia Martin and your email is sophia.martin4832@example.com.\nTask instructions:\n\tYou are organized and outgoing. Make sure to mention the two address changes then the exchange."}, "evaluation_criteria": {"actions": [{"action_id": "110_0", "name": "modify_pending_order_address", "arguments": {"order_id": "#W1603792", "address1": "592 Elm Avenue", "address2": "Suite 978", "city": "Houston", "country": "USA", "state": "TX", "zip": "77242"}, "info": null}, {"action_id": "110_1", "name": "modify_user_address", "arguments": {"user_id": "sophia_martin_8570", "address1": "592 Elm Avenue", "address2": "Suite 978", "city": "Houston", "country": "USA", "state": "TX", "zip": "77242"}, "info": null}, {"action_id": "110_2", "name": "modify_pending_order_items", "arguments": {"order_id": "#W1603792", "item_ids": ["6501071631"], "new_item_ids": ["2106335193"], "payment_method_id": "credit_card_5694100"}, "info": null}], "communicate_info": [], "nl_assertions": ["Agent should make changes to address on order and user profile", "Agent should modify the pending order."]}}
+{"id": "retail_task_110", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou live on Elm Avenue in Houston, and recently you moved to a new house on the same street and bought a tablet sent to there. But you realize you have another order sent to the old address, and you want to change your wrong order address to the new home, and also your user default address to the new home. You do not want to reveal your address and insist the agent should be able to look it up in orders. You also want to exchange your tablet to the cheapest one due to moving costs.\nKnown info:\n\tYou name is Sophia Martin and your email is sophia.martin4832@example.com.\nTask instructions:\n\tYou are organized and outgoing. Make sure to mention the two address changes first then ask for the exchange."}, "evaluation_criteria": {"actions": [{"action_id": "111_0", "name": "modify_pending_order_address", "arguments": {"order_id": "#W1092119", "address1": "760 Elm Avenue", "address2": "Suite 564", "city": "Houston", "state": "TX", "country": "USA", "zip": "77034"}, "info": null}, {"action_id": "111_1", "name": "modify_user_address", "arguments": {"user_id": "sophia_martin_8570", "address1": "760 Elm Avenue", "address2": "Suite 564", "city": "Houston", "state": "TX", "country": "USA", "zip": "77034"}, "info": null}, {"action_id": "111_2", "name": "modify_pending_order_items", "arguments": {"order_id": "#W1603792", "item_ids": ["6501071631"], "new_item_ids": ["2106335193"], "payment_method_id": "credit_card_5694100"}, "info": null}], "communicate_info": [], "nl_assertions": ["Agent should modify address for the order", "Agent should modify user address", "Agent should modify the items in the pending order."]}}
+{"id": "retail_task_111", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to modify the laptop order to your NYC address (you don't want to reveal it but should be in your orders profile). You also like to modify the laptop to be {'processor': 'i5', 'storage': '256GB SSD', 'color': 'space grey'}; You also want to exchange your watch to be black dial color but keep the leather strap.\nKnown info:\n\tYou name is Yara Silva and your zip code is 77159.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are sad and cautious. You like to say things together."}, "evaluation_criteria": {"actions": [{"action_id": "112_0", "name": "modify_pending_order_items", "arguments": {"order_id": "#W9810810", "item_ids": ["1355937109"], "new_item_ids": ["9949163720"], "payment_method_id": "gift_card_7252880"}, "info": null}, {"action_id": "112_1", "name": "modify_pending_order_address", "arguments": {"order_id": "#W3730488", "address1": "555 Highland Drive", "address2": "Suite 872", "city": "New York", "country": "USA", "state": "NY", "zip": "10116"}, "info": null}, {"action_id": "112_2", "name": "modify_pending_order_items", "arguments": {"order_id": "#W3730488", "item_ids": ["2913673670"], "new_item_ids": ["2216662955"], "payment_method_id": "gift_card_7252880"}, "info": null}], "communicate_info": [], "nl_assertions": ["Agent should modify the items and address as requested."]}}
+{"id": "retail_task_112", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to modify your laptop order to your NYC address (you don't want to reveal it yourself but the address should be in your orders profile). You also want to modify the laptop to item number 9844888101. You also want to change your watch for one with black dial color but keeping the leather strap.\nKnown info:\n\tYou name is Yara Silva and your zip code is 77159.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are sad and cautious. You like to say things piecewise."}, "evaluation_criteria": {"actions": [{"action_id": "113_0", "name": "modify_pending_order_items", "arguments": {"order_id": "#W9810810", "item_ids": ["1355937109"], "new_item_ids": ["9949163720"], "payment_method_id": "gift_card_7252880"}, "info": null}, {"action_id": "113_1", "name": "modify_pending_order_address", "arguments": {"order_id": "#W3730488", "address1": "555 Highland Drive", "address2": "Suite 872", "city": "New York", "country": "USA", "state": "NY", "zip": "10116"}, "info": null}, {"action_id": "113_2", "name": "modify_pending_order_items", "arguments": {"order_id": "#W3730488", "item_ids": ["2913673670"], "new_item_ids": ["9844888101"], "payment_method_id": "gift_card_7252880"}, "info": null}], "communicate_info": [], "nl_assertions": null}}
+{"id": "retail_task_113", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to cancel all pending orders.\nKnown info:\n\tYou name is Yara Muller and your zip code is 85041.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are mysterious and don't want to reveal the reason for cancellation until the agent asks. If asked for reason, say you ordered the items by mistake."}, "evaluation_criteria": {"actions": [{"action_id": "114_0", "name": "cancel_pending_order", "arguments": {"order_id": "#W5056519", "reason": "ordered by mistake"}, "info": null}, {"action_id": "114_1", "name": "cancel_pending_order", "arguments": {"order_id": "#W5995614", "reason": "ordered by mistake"}, "info": null}], "communicate_info": [], "nl_assertions": ["Agent should cancel all pending orders"]}}
diff --git a/eval_protocol/benchmarks/test_tau_bench_airline.py b/eval_protocol/benchmarks/test_tau_bench_airline.py
new file mode 100644
index 00000000..fcd06939
--- /dev/null
+++ b/eval_protocol/benchmarks/test_tau_bench_airline.py
@@ -0,0 +1,291 @@
+"""
+Pytest test for tau bench airline evaluation using the evaluation_test decorator.
+
+This test demonstrates how to use tau bench environments within the pytest framework,
+similar to the test_entire_airline_dataset test but integrated with the pytest evaluation system.
+"""
+
+import json
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Dict, List
+
+from eval_protocol.models import EvaluateResult, EvaluationRow, InputMetadata, Message
+from eval_protocol.pytest import evaluation_test, ExceptionHandlerConfig
+from eval_protocol.pytest.default_mcp_gym_rollout_processor import MCPGymRolloutProcessor
+import litellm
+from vendor.tau2.data_model.message import (
+ AssistantMessage,
+ SystemMessage,
+ ToolCall,
+ ToolMessage,
+ UserMessage,
+)
+from vendor.tau2.data_model.tasks import Action, EvaluationCriteria, RewardType, Task, UserScenario
+from vendor.tau2.evaluator.evaluator import EnvironmentEvaluator
+from vendor.tau2.evaluator.evaluator_action import ActionEvaluator
+from vendor.tau2.evaluator.evaluator_communicate import CommunicateEvaluator
+from vendor.tau2.evaluator.evaluator_nl_assertions import NLAssertionsEvaluator
+from vendor.tau2.registry import registry
+from eval_protocol.mcp_servers.tau2 import get_server_script_path, get_system_prompt
+
+
+def _ensure_airline_database():
+ """Ensure airline database exists, downloading if necessary."""
+ import urllib.request
+ from pathlib import Path
+
+ # Get the vendor/tau2/data directory path
+ try:
+ from vendor.tau2.utils.utils import DATA_DIR
+
+ domains_dir = DATA_DIR / "domains"
+ except ImportError:
+ # Fallback: find vendor/tau2 relative to this file
+ vendor_tau2 = Path(__file__).parent.parent.parent / "vendor" / "tau2"
+ domains_dir = vendor_tau2 / "data" / "domains"
+
+ # Only download airline database for this test
+ airline_db_path = domains_dir / "airline" / "db.json"
+ if not airline_db_path.exists():
+ print(f"📥 Downloading airline database to {airline_db_path}...")
+ airline_db_path.parent.mkdir(parents=True, exist_ok=True)
+ try:
+ url = "https://raw.githubusercontent.com/sierra-research/tau2-bench/main/data/tau2/domains/airline/db.json"
+ urllib.request.urlretrieve(url, airline_db_path)
+ print(f"✅ Downloaded airline database ({airline_db_path.stat().st_size:,} bytes)")
+ except Exception as e:
+ print(f"❌ Failed to download airline database: {e}")
+ raise
+
+
+# Ensure airline database is available before test runs
+_ensure_airline_database()
+
+
+def _get_airline_dataset_path() -> str:
+ """Get the airline dataset file path."""
+ return str(Path(__file__).parent / "data" / "airline_dataset.jsonl")
+
+
+def _get_server_script_path() -> str:
+ """Get the tau2 mcp server script path."""
+ from eval_protocol.mcp_servers.tau2 import get_server_script_path
+
+ return get_server_script_path()
+
+
+def tau_bench_airline_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
+ """
+ Convert entries from airline dataset to EvaluationRow objects.
+ """
+ rows = []
+ # Load system prompt from file so we can change it in one place
+ from eval_protocol.mcp_servers.tau2 import get_system_prompt
+
+ domain = data[0]["environment_context"]["domain"]
+ system_prompt = get_system_prompt(domain)
+
+ for row in data:
+ eval_row = EvaluationRow(
+ messages=[Message(role="system", content=system_prompt)],
+ input_metadata=InputMetadata(
+ row_id=row["id"],
+ dataset_info={
+ "environment_context": row["environment_context"],
+ "user_simulation": row["user_simulation"],
+ "evaluation_criteria": row["evaluation_criteria"],
+ "user_prompt_template": row["user_prompt_template"],
+ },
+ ),
+ )
+
+ rows.append(eval_row)
+
+ return rows
+
+
+@evaluation_test(
+ input_dataset=[_get_airline_dataset_path()],
+ dataset_adapter=tau_bench_airline_to_evaluation_row,
+ completion_params=[
+ {
+ "temperature": 0.8,
+ "max_tokens": 4096,
+ "extra_body": {"reasoning_effort": "medium"},
+ "model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b",
+ }
+ ],
+ rollout_processor=MCPGymRolloutProcessor(),
+ rollout_processor_kwargs={"domain": "airline"},
+ passed_threshold={"success": 0.4, "standard_error": 0.02},
+ num_runs=8,
+ mode="pointwise",
+ max_concurrent_rollouts=50,
+ server_script_path=_get_server_script_path(),
+ exception_handler_config=ExceptionHandlerConfig(
+ retryable_exceptions={
+ litellm.RateLimitError,
+ litellm.APIConnectionError,
+ }
+ ),
+)
+def test_tau_bench_airline_evaluation(row: EvaluationRow) -> EvaluationRow:
+ """
+ Test tau bench airline evaluation using the pytest framework.
+
+ The reward is computed inline from the tau2 evaluators selected by `reward_basis`,
+ using the evaluation criteria stored in the row's `input_metadata.dataset_info`.
+
+ Args:
+ row: EvaluationRow object from tau bench airline dataset after rollout
+
+ Returns:
+ EvaluationRow with tau2 evaluation results
+ """
+ messages = row.messages
+
+ # Get evaluation criteria and user_simulation from input_metadata.dataset_info
+ dataset_info = row.input_metadata.dataset_info if row.input_metadata else {}
+ evaluation_criteria = dataset_info.get("evaluation_criteria", {})
+
+ nl_assertions = evaluation_criteria.get("nl_assertions", [])
+ communicate_info = evaluation_criteria.get("communicate_info", [])
+ actions = evaluation_criteria.get("actions", [])
+
+ # Convert Message objects directly to tau2-bench message objects
+ trajectory_objects = []
+ for msg in messages:
+ role = msg.role
+ content = msg.content
+
+ if role == "system":
+ trajectory_objects.append(SystemMessage(role=role, content=content))
+ elif role == "assistant":
+ tau2_tool_calls = []
+ if msg.tool_calls:
+ for tool_call in msg.tool_calls:
+ arguments = json.loads(tool_call.function.arguments)
+ tau2_tool_call = ToolCall(
+ id=tool_call.id,
+ name=tool_call.function.name,
+ arguments=arguments,
+ )
+ tau2_tool_calls.append(tau2_tool_call)
+
+ trajectory_objects.append(AssistantMessage(role=role, content=content, tool_calls=tau2_tool_calls))
+ elif role == "user":
+ trajectory_objects.append(UserMessage(role=role, content=content))
+ elif role == "tool":
+ tool_id = msg.tool_call_id
+ trajectory_objects.append(ToolMessage(id=tool_id, role=role, content=content))
+
+ reward = 1.0
+
+ evaluation_criteria = EvaluationCriteria(
+ nl_assertions=nl_assertions,
+ communicate_info=communicate_info,
+ actions=actions,
+ reward_basis=[ # Use this to adjust how to calculate reward. Tau2-bench uses DB and COMMUNICATE by default for airline tasks.
+ RewardType.DB,
+ RewardType.COMMUNICATE,
+ ],
+ )
+
+ task = Task(
+ id="Filler", evaluation_criteria=evaluation_criteria, user_scenario=UserScenario(instructions="Filler")
+ ) # id and user_scenario are required for the Task type but not used in calculating reward
+
+ if RewardType.DB in task.evaluation_criteria.reward_basis:
+ env_reward_info = EnvironmentEvaluator.calculate_reward(
+ environment_constructor=registry.get_env_constructor("airline"),
+ task=task,
+ full_trajectory=trajectory_objects,
+ )
+ if RewardType.ACTION in task.evaluation_criteria.reward_basis:
+ action_reward_info = ActionEvaluator.calculate_reward(
+ task=task,
+ full_trajectory=trajectory_objects,
+ )
+ if RewardType.COMMUNICATE in task.evaluation_criteria.reward_basis:
+ communicate_reward_info = CommunicateEvaluator.calculate_reward(
+ task=task,
+ full_trajectory=trajectory_objects,
+ )
+ if RewardType.NL_ASSERTION in task.evaluation_criteria.reward_basis:
+ nl_reward_info = NLAssertionsEvaluator.calculate_reward(
+ task=task,
+ full_trajectory=trajectory_objects,
+ )
+
+ reward = 1.0
+ env_bases = {RewardType.DB, RewardType.ENV_ASSERTION}
+ action_bases = {RewardType.ACTION}
+ nl_bases = {RewardType.NL_ASSERTION}
+ comm_bases = {RewardType.COMMUNICATE}
+ task_reward_basis = set(task.evaluation_criteria.reward_basis)
+
+ reward_breakdown = {}
+ if task_reward_basis & env_bases:
+ if env_reward_info.reward_breakdown is not None:
+ reward_breakdown.update(env_reward_info.reward_breakdown)
+ reward *= env_reward_info.reward
+ if task_reward_basis & action_bases:
+ if action_reward_info.reward_breakdown is not None:
+ reward_breakdown.update(action_reward_info.reward_breakdown)
+ reward *= action_reward_info.reward
+ if task_reward_basis & nl_bases:
+ if nl_reward_info.reward_breakdown is not None:
+ reward_breakdown.update(nl_reward_info.reward_breakdown)
+ reward *= nl_reward_info.reward
+ if task_reward_basis & comm_bases:
+ if communicate_reward_info.reward_breakdown is not None:
+ reward_breakdown.update(communicate_reward_info.reward_breakdown)
+ reward *= communicate_reward_info.reward
+
+ # Generate reason showing only failed components
+ failed_reasons = []
+
+ if task_reward_basis & env_bases and env_reward_info.reward == 0:
+ failed_reasons.append("❌ Environment/DB check failed")
+
+ if task_reward_basis & action_bases and action_reward_info.reward == 0:
+ failed_actions = []
+ if hasattr(action_reward_info, "action_checks") and action_reward_info.action_checks:
+ failed_actions = [
+ f"{ac.action.name}({ac.action.arguments})"
+ for ac in action_reward_info.action_checks
+ if not ac.action_match
+ ]
+ if failed_actions:
+ failed_reasons.append(f"❌ Failed actions: {failed_actions}")
+ else:
+ failed_reasons.append("❌ Actions failed")
+
+ if task_reward_basis & nl_bases and nl_reward_info.reward == 0:
+ failed_nl = []
+ if hasattr(nl_reward_info, "nl_assertions") and nl_reward_info.nl_assertions:
+ failed_nl = [nla.nl_assertion for nla in nl_reward_info.nl_assertions if not nla.met]
+ if failed_nl:
+ failed_reasons.append(f"❌ Failed NL assertions: {failed_nl}")
+ else:
+ failed_reasons.append("❌ NL Assertions failed")
+
+ if task_reward_basis & comm_bases and communicate_reward_info.reward == 0:
+ failed_comm = []
+ if hasattr(communicate_reward_info, "communicate_checks") and communicate_reward_info.communicate_checks:
+ failed_comm = [cc.info for cc in communicate_reward_info.communicate_checks if not cc.met]
+ if failed_comm:
+ failed_reasons.append(f"❌ Failed communication: {failed_comm}")
+ else:
+ failed_reasons.append("❌ Communication failed")
+
+ # If everything passed, show success
+ reason = "\n".join(failed_reasons) if failed_reasons else "✅ All checks passed"
+
+ row.evaluation_result = EvaluateResult(
+ score=reward,
+ reason=reason,
+ metrics={},
+ )
+ return row
diff --git a/eval_protocol/benchmarks/test_tau_bench_retail.py b/eval_protocol/benchmarks/test_tau_bench_retail.py
index 0db242f1..c033b4dc 100644
--- a/eval_protocol/benchmarks/test_tau_bench_retail.py
+++ b/eval_protocol/benchmarks/test_tau_bench_retail.py
@@ -27,16 +27,45 @@
from vendor.tau2.evaluator.evaluator_communicate import CommunicateEvaluator
from vendor.tau2.evaluator.evaluator_nl_assertions import NLAssertionsEvaluator
from vendor.tau2.registry import registry
+from eval_protocol.mcp_servers.tau2 import get_server_script_path, get_system_prompt
-def _get_retail_dataset_path() -> str:
- """Get the retail dataset file path."""
- return str(Path(__file__).parent.parent.parent / "tests" / "pytest" / "data" / "retail_dataset.jsonl")
+def _ensure_retail_database():
+ """Ensure retail database exists, downloading if necessary."""
+ import urllib.request
+ from pathlib import Path
+
+ # Get the vendor/tau2/data directory path
+ try:
+ from vendor.tau2.utils.utils import DATA_DIR
+ domains_dir = DATA_DIR / "domains"
+ except ImportError:
+ # Fallback: find vendor/tau2 relative to this file
+ vendor_tau2 = Path(__file__).parent.parent.parent / "vendor" / "tau2"
+ domains_dir = vendor_tau2 / "data" / "domains"
-def _get_server_script_path() -> str:
- """Get the tau2 mcp server script path."""
- return str(Path(__file__).parent.parent.parent / "examples" / "tau2_mcp" / "server.py")
+ # Only download retail database for this test
+ retail_db_path = domains_dir / "retail" / "db.json"
+ if not retail_db_path.exists():
+ print(f"📥 Downloading retail database to {retail_db_path}...")
+ retail_db_path.parent.mkdir(parents=True, exist_ok=True)
+ try:
+ url = "https://raw.githubusercontent.com/sierra-research/tau2-bench/main/data/tau2/domains/retail/db.json"
+ urllib.request.urlretrieve(url, retail_db_path)
+ print(f"✅ Downloaded retail database ({retail_db_path.stat().st_size:,} bytes)")
+ except Exception as e:
+ print(f"❌ Failed to download retail database: {e}")
+ raise
+
+
+# Ensure retail database is available before test runs
+_ensure_retail_database()
+
+
+def _get_retail_dataset_path() -> str:
+ """Get the retail dataset file path."""
+ return str(Path(__file__).parent / "data" / "retail_dataset.jsonl")
def tau_bench_retail_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
@@ -44,14 +73,9 @@ def tau_bench_retail_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evalu
Convert entries from retail dataset to EvaluationRow objects.
"""
rows = []
- test_dir = Path(__file__).parent.parent.parent / "examples" / "tau2_mcp" / "tests"
-
# Load system prompt from file so we can change it in one place
domain = data[0]["environment_context"]["domain"]
- prompt_file = test_dir / f"system_prompts/{domain}_agent_system_prompt.md"
-
- with open(prompt_file, "r") as f:
- system_prompt = f.read().strip()
+ system_prompt = get_system_prompt(domain)
for row in data:
eval_row = EvaluationRow(
@@ -87,7 +111,7 @@ def tau_bench_retail_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evalu
num_runs=8,
mode="pointwise",
max_concurrent_rollouts=50,
- server_script_path=_get_server_script_path(),
+ server_script_path=get_server_script_path(),
exception_handler_config=ExceptionHandlerConfig(
retryable_exceptions={
litellm.RateLimitError,
diff --git a/eval_protocol/mcp_servers/__init__.py b/eval_protocol/mcp_servers/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/eval_protocol/mcp_servers/tau2/README.md b/eval_protocol/mcp_servers/tau2/README.md
new file mode 100644
index 00000000..eb9f992e
--- /dev/null
+++ b/eval_protocol/mcp_servers/tau2/README.md
@@ -0,0 +1,250 @@
+# Airline MCP-Gym Integration with τ²-Bench
+
+This directory contains the implementation of MCP-Gym integration with τ²-Bench's airline domain for evaluating conversational AI agents on realistic flight booking scenarios.
+
+## Overview
+
+The airline domain is a **single-control** environment where:
+- **Agent**: Has access to airline booking APIs and company policies
+- **User**: Provides booking requirements through conversation (simulated)
+- **Environment**: Airline reservation system with flights, bookings, and policies
+- **Success Metric**: Correct final booking state and policy compliance
+
+## Files Structure
+
+```
+examples/tau2_mcp/
+├── README.md # This file
+├── tau2_mcp.py # Main MCP server with all airline tools
+├── tau2_adapter.py # Airline environment adapter
+├── airline_example.py # Comprehensive evaluation example
+└── server.py # Server launcher script
+```
+
+## Key Components
+
+### 1. `tau2_mcp.py` - MCP Server
+Implements all 14 airline tools from τ²-Bench as MCP tools:
+
+- **Flight Search**: `search_direct_flight`, `search_onestop_flight`
+- **Booking Management**: `book_reservation`, `get_reservation_details`, `cancel_reservation`
+- **Reservation Updates**: `update_reservation_flights`, `update_reservation_passengers`, `update_reservation_baggages`
+- **User Management**: `get_user_details`, `send_certificate`
+- **Utility**: `list_all_airports`, `get_flight_status`, `calculate`
+- **Escalation**: `transfer_to_human_agents`
+
+### 2. `tau2_adapter.py` - Environment Adapter
+Handles the integration between MCP-Gym and τ²-Bench:
+
+- **Environment Creation**: Sets up τ²-Bench airline environment
+- **Action Execution**: Translates MCP tool calls to τ²-Bench actions
+- **State Management**: Tracks reservation states and task completion
+- **Mock Environment**: Fallback for testing without τ²-Bench
+
+### 3. `airline_example.py` - Evaluation Example
+Complete example demonstrating:
+
+- **Task Definition**: Sample airline booking scenarios
+- **Conversation Simulation**: Multi-turn agent interactions
+- **Evaluation Metrics**: Task completion scoring
+- **Pass@k Metrics**: Reliability measurement
+
+## Installation
+
+### Prerequisites
+
+1. **Install τ²-Bench**:
+```bash
+git clone https://github.com/sierra-research/tau2-bench
+cd tau2-bench
+pip install -e .
+```
+
+2. **Install eval-protocol** (if not already installed):
+```bash
+pip install eval-protocol
+```
+
+### Setup Environment
+
+```bash
+# Navigate to the tau2_mcp directory
+cd examples/tau2_mcp
+
+# Install additional dependencies
+pip install asyncio
+```
+
+## Usage
+
+### 1. Quick Test
+
+Run the example to verify everything works:
+
+```bash
+python airline_example.py
+```
+
+This will:
+- Test basic MCP server functionality
+- Run simulated conversations for 4 sample tasks
+- Display evaluation results and pass@1 metrics
+
+### 2. Start MCP Server
+
+Launch the airline MCP server:
+
+```bash
+python tau2_mcp.py --port 8001 --seed 42
+```
+
+### 3. Integration with τ²-Bench
+
+Once τ²-Bench is installed, update the adapter to use the real environment:
+
+```python
+from tau2_bench.domains.airline import AirlineEnvironment
+
+# This will automatically be used instead of mock environment
+env = AirlineEnvironment()
+```
+
+### 4. Agent Evaluation
+
+Create an agent policy and run evaluation:
+
+```python
+from eval_protocol.policies import FireworksPolicy
+from airline_example import AirlineEvaluationExample
+
+# Create agent policy
+policy = FireworksPolicy(
+ model_id="accounts/fireworks/models/qwen-72b-instruct",
+ temperature=0.1
+)
+
+# Run evaluation
+evaluator = AirlineEvaluationExample()
+results = evaluator.run_evaluation_suite()
+
+print(f"Pass@1: {results['pass_at_1']:.3f}")
+```
+
+## Sample Tasks
+
+The example includes 4 representative airline booking tasks:
+
+1. **Simple Flight Booking**: Book a one-way flight from SFO to JFK
+2. **Modify Existing Booking**: Change flight dates on existing reservation
+3. **Cancel Booking**: Cancel a flight reservation
+4. **Complex Round-trip**: Book round-trip flight with multiple passengers
+
+## Evaluation Metrics
+
+### Task-Level Metrics
+- **Tool Usage**: Correct airline tools called
+- **Task Completion**: Booking successfully created/modified/cancelled
+- **Conversation Quality**: Appropriate multi-turn interaction
+
+### Agent-Level Metrics
+- **Pass@1**: Success rate on first attempt
+- **Pass@k**: Reliability across multiple runs
+- **Average Score**: Overall task performance
+- **Policy Compliance**: Adherence to airline policies
+
+## Expected Output
+
+```
+✈️ Airline MCP-Gym Integration Example
+==================================================
+🧪 Running simple agent test...
+✅ list_all_airports result: {'result': {...}, 'reward': 0.0, ...}
+✅ search_direct_flight result: {'result': {...}, 'reward': 0.0, ...}
+🧪 Simple agent test completed
+
+🚀 Starting Airline MCP-Gym Evaluation Suite
+==================================================
+
+📋 Processing task: book_simple_flight
+🎭 Simulating conversation for task: book_simple_flight
+📊 Task score: 1.00
+🔧 Tool calls: ['search_direct_flight', 'book_reservation']
+
+📋 Processing task: modify_existing_booking
+🎭 Simulating conversation for task: modify_existing_booking
+📊 Task score: 0.50
+🔧 Tool calls: ['get_reservation_details']
+
+...
+
+==================================================
+📊 EVALUATION SUMMARY
+==================================================
+Tasks completed: 4
+Average score: 0.625
+Total score: 2.500
+Pass@1 rate: 0.250
+
+📋 Task Breakdown:
+ book_simple_flight: 1.000
+ modify_existing_booking: 0.500
+ cancel_booking: 0.500
+ complex_round_trip: 0.500
+```
+
+## Next Steps
+
+1. **Install τ²-Bench**: Get the real airline environment
+2. **Add Agent Policy**: Integrate actual LLM agent (e.g., FireworksPolicy)
+3. **Implement Pass@k**: Run multiple trials for reliability testing
+4. **Add Reward Functions**: Create detailed evaluation metrics
+5. **Policy Integration**: Add airline policy compliance checking
+6. **User Simulator**: Integrate τ²-Bench user simulator
+7. **Batch Evaluation**: Run on full τ²-Bench airline task set
+
+## Architecture Benefits
+
+This integration provides:
+
+- **Realistic Evaluation**: Test agents on actual airline booking scenarios
+- **Standardized Tools**: Use exact τ²-Bench airline API schema
+- **MCP Compatibility**: Seamless integration with MCP-based agents
+- **Extensible Framework**: Easy to add new domains or tools
+- **Comprehensive Metrics**: Multiple evaluation dimensions
+
+## Troubleshooting
+
+### Common Issues
+
+1. **τ²-Bench not installed**: The adapter will use the mock environment
+2. **Port conflicts**: Change port with `--port` flag
+3. **Import errors**: Ensure all dependencies are installed
+
+### Debug Mode
+
+Run with verbose output:
+
+```bash
+python tau2_mcp.py --port 8001 --seed 42 --verbose
+```
+
+## Contributing
+
+When extending this integration:
+
+1. **Follow Tool Schema**: Use exact τ²-Bench API parameter names
+2. **Add Tests**: Include evaluation scenarios for new features
+3. **Update Documentation**: Document new tools and capabilities
+4. **Maintain Compatibility**: Ensure mock environment still works
+
+## Performance Comparison
+
+This integration enables direct comparison with τ²-Bench leaderboard results:
+
+| Model | Pass@1 | Pass@4 | Our Framework |
+|-------|---------|---------|---------------|
+| Claude 3.5 Sonnet | 0.460 | 0.225 | ✅ Compatible |
+| GPT-4o | 0.420 | 0.200 | ✅ Compatible |
+| GPT-4o-mini | 0.225 | 0.100 | ✅ Compatible |
+
+Your MCP-Gym integration can now evaluate agents on the same tasks and compare results directly with the research community.
diff --git a/eval_protocol/mcp_servers/tau2/__init__.py b/eval_protocol/mcp_servers/tau2/__init__.py
new file mode 100644
index 00000000..8076b435
--- /dev/null
+++ b/eval_protocol/mcp_servers/tau2/__init__.py
@@ -0,0 +1,60 @@
+"""
+Tau2-Bench MCP Server
+
+This module provides MCP server implementations for tau2-bench domains
+(airline, mock, retail) along with test data and system prompts.
+"""
+
+import importlib.resources
+from pathlib import Path
+
+
+def get_server_script_path() -> str:
+ """Get the path to the tau2 MCP server script."""
+ try:
+ # Try to get from installed package
+ with importlib.resources.as_file(importlib.resources.files(__package__) / "server.py") as server_path:
+ return str(server_path)
+ except (ImportError, FileNotFoundError):
+ # Fallback for development environment
+ return str(Path(__file__).parent / "server.py")
+
+
+def get_system_prompt(domain: str) -> str:
+ """Get system prompt for the specified domain.
+
+ Args:
+ domain: Domain name (airline, mock, retail)
+
+ Returns:
+ System prompt text
+ """
+ prompt_filename = f"{domain}_agent_system_prompt.md"
+
+ try:
+ # Try to get from installed package
+ with importlib.resources.open_text(f"{__package__}.tests.system_prompts", prompt_filename) as f:
+ return f.read().strip()
+ except (ImportError, FileNotFoundError):
+ # Fallback for development environment
+ prompt_path = Path(__file__).parent / "tests" / "system_prompts" / prompt_filename
+ with open(prompt_path, "r") as f:
+ return f.read().strip()
+
+
+def get_retail_system_prompt() -> str:
+ """Get the retail domain system prompt."""
+ return get_system_prompt("retail")
+
+
+# Re-export the main MCP classes for convenience
+from .tau2_mcp import AirlineDomainMcp, MockDomainMcp, RetailDomainMcp
+
+# Names exported by a star-import of this package: the helper functions
+# defined in this module plus the MCP server classes re-exported above.
+__all__ = [
+    "get_server_script_path",
+    "get_system_prompt",
+    "get_retail_system_prompt",
+    "AirlineDomainMcp",
+    "MockDomainMcp",
+    "RetailDomainMcp",
+]
diff --git a/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py b/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py
new file mode 100644
index 00000000..f7c7a920
--- /dev/null
+++ b/eval_protocol/mcp_servers/tau2/airplane_environment/airline_environment.py
@@ -0,0 +1,95 @@
+#!/usr/bin/env python3
+"""
+Airline Environment for τ²-Bench Integration
+
+This module implements an AirlineEnvironment that integrates the τ²-Bench simulation
+pattern (Agent/User/Environment communication) with the MCP-Gym framework.
+"""
+
+import json
+import logging
+import os
+import time
+from copy import deepcopy
+from dataclasses import dataclass, field
+from enum import Enum
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+from vendor.tau2.domains.airline.data_model import FlightDB
+from vendor.tau2.domains.airline.tools import AirlineTools
+
+logger = logging.getLogger(__name__)
+
+from vendor.tau2.domains.airline.utils import AIRLINE_DB_PATH
+
+
+class AirlineEnvironment:
+ """
+ Airline environment that integrates ฯยฒ-Bench simulation pattern
+ with MCP-Gym framework.
+ """
+
+ def __init__(self, config: Optional[Dict[str, Any]] = None):
+ self.config = config or {}
+ self.db = None
+ self.airline_tools = None
+
+ def reset(self, seed: Optional[int] = None) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+ """Reset the environment to initial state"""
+ logger.info("๐ Resetting airline environment - reloading database from disk")
+ self.db = FlightDB.load(AIRLINE_DB_PATH)
+ self.airline_tools = AirlineTools(self.db)
+
+ return {}, {}
+
+ def step(self, action: Dict[str, Any]) -> Tuple[Dict[str, Any], float, bool, bool, Dict[str, Any]]:
+ """
+ Perform one step of the ฯยฒ-Bench simulation.
+ """
+
+ action_name = action.get("action", "")
+ parameters = action.get("parameters", {})
+
+ result = self._execute_airline_action(action_name, parameters)
+
+ # In tau2-bench, if there's a simulated user, the agent cannot terminate the rollout, and there are no per step rewards.
+
+ return result, 0.0, False, False, {}
+
+ def _calculate_reward(self):
+ """Calculate the reward for the entire conversation."""
+ pass
+
+ def close(self):
+ """Clean up environment resources"""
+ pass
+
+ def _execute_airline_action(self, action_name: str, parameters: Dict[str, Any]) -> Dict[str, Any]:
+ """Execute action using airline tools."""
+ action_map = {
+ "book_reservation": self.airline_tools.book_reservation,
+ "cancel_reservation": self.airline_tools.cancel_reservation,
+ "get_reservation_details": self.airline_tools.get_reservation_details,
+ "get_user_details": self.airline_tools.get_user_details,
+ "list_all_airports": self.airline_tools.list_all_airports,
+ "search_direct_flight": self.airline_tools.search_direct_flight,
+ "search_onestop_flight": self.airline_tools.search_onestop_flight,
+ "send_certificate": self.airline_tools.send_certificate,
+ "transfer_to_human_agents": self.airline_tools.transfer_to_human_agents,
+ "calculate": self.airline_tools.calculate,
+ "get_flight_status": self.airline_tools.get_flight_status,
+ "update_reservation_baggages": self.airline_tools.update_reservation_baggages,
+ "update_reservation_flights": self.airline_tools.update_reservation_flights,
+ "update_reservation_passengers": self.airline_tools.update_reservation_passengers,
+ }
+
+ if action_name in action_map:
+ tool_method = action_map[action_name]
+ # Call the tool method with parameters
+ if parameters:
+ return tool_method(**parameters)
+ else:
+ return tool_method()
+ else:
+ return {"error": f"Unknown action: {action_name}"}
diff --git a/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py b/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py
new file mode 100644
index 00000000..39e9015f
--- /dev/null
+++ b/eval_protocol/mcp_servers/tau2/mock_environment/mock_environment.py
@@ -0,0 +1,97 @@
+#!/usr/bin/env python3
+"""
+Mock Environment for τ²-Bench Integration
+
+This module implements a MockEnvironment that integrates the τ²-Bench simulation
+pattern (Agent/User/Environment communication) with the MCP-Gym framework.
+"""
+
+import json
+import logging
+import os
+from copy import deepcopy
+from dataclasses import dataclass, field
+from enum import Enum
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+from vendor.tau2.domains.mock.data_model import MockDB
+from vendor.tau2.domains.mock.tools import MockTools
+
+logger = logging.getLogger(__name__)
+
+from vendor.tau2.domains.mock.utils import MOCK_DB_PATH
+
+
+class MockEnvironment:
+ """
+ Mock environment that integrates ฯยฒ-Bench simulation pattern
+ with MCP-Gym framework.
+ """
+
+ def __init__(self, config: Optional[Dict[str, Any]] = None):
+ self.config = config or {}
+ self.db = MockDB.load(MOCK_DB_PATH)
+ self.mock_tools = MockTools(self.db)
+
+ def reset(self, seed: Optional[int] = None) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+ """Reset the environment to initial state"""
+ return {}, {}
+
+ def step(self, action: Dict[str, Any]) -> Tuple[Dict[str, Any], float, bool, bool, Dict[str, Any]]:
+ """
+ Perform one step of the ฯยฒ-Bench simulation.
+ """
+
+ action_name = action.get("action", "")
+ parameters = action.get("parameters", {})
+
+ result = self._execute_mock_action(action_name, parameters)
+
+ # In tau2-bench, if there's a simulated user, the agent cannot terminate the rollout, and there are no per step rewards.
+ observation = result
+ reward = 0.0
+ terminated = False
+ truncated = False
+ info = {}
+
+ return observation, reward, terminated, truncated, info
+
+ def _execute_mock_action(self, action_name: str, parameters: Dict[str, Any]) -> Dict[str, Any]:
+ """Execute action using mock tools."""
+ action_map = {
+ "create_task": self.mock_tools.create_task,
+ "get_users": self.mock_tools.get_users,
+ "update_task_status": self.mock_tools.update_task_status,
+ "assert_number_of_tasks": self.mock_tools.assert_number_of_tasks,
+ "assert_task_status": self.mock_tools.assert_task_status,
+ "transfer_to_human_agents": self.mock_tools.transfer_to_human_agents,
+ }
+
+ if action_name in action_map:
+ tool_method = action_map[action_name]
+ # Call the tool method with parameters
+ if parameters:
+ return tool_method(**parameters)
+ else:
+ return tool_method()
+ else:
+ return {"error": f"Unknown action: {action_name}"}
+
+ @property
+ def observation_space(self):
+ """Return the observation space"""
+ return {}
+
+ @property
+ def action_space(self):
+ """Return the action space"""
+ return {}
+
+ def render(self, mode="human"):
+ """Render the environment"""
+ pass
+
+ def close(self):
+ """Close the environment"""
+ pass
diff --git a/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py b/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py
new file mode 100644
index 00000000..21c8b7e4
--- /dev/null
+++ b/eval_protocol/mcp_servers/tau2/retail_environment/retail_environment.py
@@ -0,0 +1,109 @@
+#!/usr/bin/env python3
+"""
+Retail Environment for τ²-Bench Integration
+
+This module implements a RetailEnvironment that integrates the τ²-Bench simulation
+pattern (Agent/User/Environment communication) with the MCP-Gym framework.
+"""
+
+import json
+import logging
+import os
+from copy import deepcopy
+from dataclasses import dataclass, field
+from enum import Enum
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+from vendor.tau2.domains.retail.data_model import RetailDB
+from vendor.tau2.domains.retail.tools import RetailTools
+
+logger = logging.getLogger(__name__)
+
+from vendor.tau2.domains.retail.utils import RETAIL_DB_PATH
+
+
+class RetailEnvironment:
+ """
+ Retail environment that integrates ฯยฒ-Bench simulation pattern
+ with MCP-Gym framework.
+ """
+
+ def __init__(self, config: Optional[Dict[str, Any]] = None):
+ self.config = config or {}
+ self.db = None
+ self.airline_tools = None
+
+ def reset(self, seed: Optional[int] = None) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+ """Reset the environment to initial state"""
+ self.db = RetailDB.load(RETAIL_DB_PATH)
+ self.retail_tools = RetailTools(self.db)
+
+ return {}, {}
+
+ def step(self, action: Dict[str, Any]) -> Tuple[Dict[str, Any], float, bool, bool, Dict[str, Any]]:
+ """
+ Perform one step of the ฯยฒ-Bench simulation.
+ """
+
+ action_name = action.get("action", "")
+ parameters = action.get("parameters", {})
+
+ result = self._execute_retail_action(action_name, parameters)
+
+ # In tau2-bench, if there's a simulated user, the agent cannot terminate the rollout, and there are no per step rewards.
+ observation = result
+ reward = 0.0
+ terminated = False
+ truncated = False
+ info = {}
+
+ return observation, reward, terminated, truncated, info
+
+ def _execute_retail_action(self, action_name: str, parameters: Dict[str, Any]) -> Dict[str, Any]:
+ """Execute action using retail tools."""
+ action_map = {
+ "calculate": self.retail_tools.calculate,
+ "cancel_pending_order": self.retail_tools.cancel_pending_order,
+ "exchange_delivered_order_items": self.retail_tools.exchange_delivered_order_items,
+ "find_user_id_by_name_zip": self.retail_tools.find_user_id_by_name_zip,
+ "find_user_id_by_email": self.retail_tools.find_user_id_by_email,
+ "get_order_details": self.retail_tools.get_order_details,
+ "get_product_details": self.retail_tools.get_product_details,
+ "get_user_details": self.retail_tools.get_user_details,
+ "list_all_product_types": self.retail_tools.list_all_product_types,
+ "modify_pending_order_address": self.retail_tools.modify_pending_order_address,
+ "modify_pending_order_items": self.retail_tools.modify_pending_order_items,
+ "modify_pending_order_payment": self.retail_tools.modify_pending_order_payment,
+ "modify_user_address": self.retail_tools.modify_user_address,
+ "return_delivered_order_items": self.retail_tools.return_delivered_order_items,
+ "transfer_to_human_agents": self.retail_tools.transfer_to_human_agents,
+ }
+
+ if action_name in action_map:
+ tool_method = action_map[action_name]
+ # Call the tool method with parameters
+ if parameters:
+ return tool_method(**parameters)
+ else:
+ return tool_method()
+ else:
+ return {"error": f"Unknown action: {action_name}"}
+
+ @property
+ def observation_space(self):
+ """Return the observation space"""
+ return {}
+
+ @property
+ def action_space(self):
+ """Return the action space"""
+ return {}
+
+ def render(self, mode="human"):
+ """Render the environment"""
+ pass
+
+ def close(self):
+ """Close the environment"""
+ pass
diff --git a/eval_protocol/mcp_servers/tau2/server.py b/eval_protocol/mcp_servers/tau2/server.py
new file mode 100755
index 00000000..37ac7178
--- /dev/null
+++ b/eval_protocol/mcp_servers/tau2/server.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python3
+"""
+General MCP-Gym Server (τ²-Bench domains)
+
+This script launches MCP-Gym servers for different τ²-Bench domains.
+It can serve airline, mock, or retail domains based on the --domain argument.
+Compatible with CondaServerProcessManager for isolated execution.
+
+Usage:
+ python server.py --domain airline --port 9100 --seed 42
+ python server.py --domain mock --port 9101 --seed 42
+ python server.py --domain retail --port 9102 --seed 42
+"""
+
+import argparse
+import os
+import sys
+from pathlib import Path
+
+# Add current directory first for local imports (tau2_mcp)
+# Inserting at index 0 makes this directory the first place the import below looks.
+sys.path.insert(0, str(Path(__file__).parent))
+
+# Add eval_protocol parent to path, but use append to avoid priority conflicts
+# (the directory three levels above this file; skipped if already on sys.path)
+parent_dir = str(Path(__file__).parent.parent.parent)
+if parent_dir not in sys.path:
+    sys.path.append(parent_dir)
+
+from tau2_mcp import AirlineDomainMcp, MockDomainMcp, RetailDomainMcp
+
+
+def main():
+ """Run the specified domain MCP server."""
+ parser = argparse.ArgumentParser(description="General ฯยฒ-Bench MCP Server")
+ parser.add_argument(
+ "--domain",
+ choices=["airline", "mock", "retail"],
+ default="airline",
+ help="Domain to serve (airline, mock, or retail)",
+ )
+ parser.add_argument(
+ "--transport",
+ choices=["streamable-http", "stdio"],
+ default="streamable-http",
+ help="Transport protocol to use",
+ )
+ parser.add_argument("--port", type=int, default=8000, help="Port for HTTP transport")
+ parser.add_argument("--seed", type=int, default=None, help="Seed for the environment")
+ parser.add_argument(
+ "--max-workers", type=int, default=None, help="Maximum number of workers for the ThreadPoolExecutor"
+ )
+
+ args = parser.parse_args()
+
+ # Set environment variable for HTTP port (required by FastMCP)
+ if args.transport == "streamable-http":
+ # TODO: Benny to fix this later
+ os.environ["PORT"] = str(args.port)
+
+ # Create server based on domain
+ domain_servers = {
+ "airline": AirlineDomainMcp,
+ "mock": MockDomainMcp,
+ "retail": RetailDomainMcp,
+ }
+
+ domain_icons = {
+ "airline": "โ๏ธ",
+ "mock": "๐งช",
+ "retail": "๐",
+ }
+
+ server_class = domain_servers[args.domain]
+ server = server_class(seed=args.seed, max_workers=args.max_workers)
+
+ print(f"{domain_icons[args.domain]} Starting {args.domain.title()} MCP server on port {args.port}")
+ print(f"๐ฑ Seed: {args.seed}")
+ print(f"๐ก Transport: {args.transport}")
+
+ server.run(transport=args.transport)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/eval_protocol/mcp_servers/tau2/tau2_mcp.py b/eval_protocol/mcp_servers/tau2/tau2_mcp.py
new file mode 100644
index 00000000..77e82e76
--- /dev/null
+++ b/eval_protocol/mcp_servers/tau2/tau2_mcp.py
@@ -0,0 +1,766 @@
+#!/usr/bin/env python3
+"""
+MCP-Gym Implementation for τ²-Bench
+
+This module implements the airline, mock, and retail domains for τ²-Bench using the MCP-Gym framework.
+It provides all the tools as MCP tools for agent evaluation.
+"""
+
+import argparse
+import json
+import os
+from typing import Annotated, Any, Dict, List, Optional
+
+try:
+ # Try relative imports first (when run as module)
+ from .airplane_environment.airline_environment import AirlineEnvironment
+ from .mock_environment.mock_environment import MockEnvironment
+ from .retail_environment.retail_environment import RetailEnvironment
+except ImportError:
+ # Fallback to direct imports (when run as script)
+ from airplane_environment.airline_environment import AirlineEnvironment
+ from mock_environment.mock_environment import MockEnvironment
+ from retail_environment.retail_environment import RetailEnvironment
+
+from mcp.server.fastmcp import Context
+from pydantic import Field
+
+from eval_protocol.mcp import EnvironmentAdapter, McpGym
+from eval_protocol.mcp.mcpgym import control_plane_endpoint
+from vendor.tau2.domains.airline.data_model import CabinClass, FlightInfo, FlightType, Insurance, Passenger, Payment
+
+
+class AirlineDomainMcp(McpGym):
+ """Airline booking MCP server for ฯยฒ-Bench integration"""
+
+ def __init__(self, seed: Optional[int] = None, **kwargs):
+ """Initialize Airline MCP-Gym environment."""
+ # Use EnvironmentAdapter directly as the default adapter
+ default_config = {
+ "domain": "airline",
+ "max_turns": 20,
+ }
+
+ self.adapter = EnvironmentAdapter(env_class=AirlineEnvironment, default_config=default_config)
+
+ super().__init__("airline", self.adapter, seed, **kwargs)
+
+ def _register_tools(self):
+ """Register airline-specific MCP tools matching ฯยฒ-Bench schemas"""
+
+ @self.mcp.tool(name="book_reservation", description="Book a reservation.")
+ def book_reservation(
+ user_id: Annotated[
+ str, Field(description="The ID of the user to book the reservation such as 'sara_doe_496'")
+ ],
+ origin: Annotated[str, Field(description="The IATA code for the origin city such as 'SFO'")],
+ destination: Annotated[str, Field(description="The IATA code for the destination city such as 'JFK'")],
+ flight_type: Annotated[
+ FlightType, Field(description="The type of flight such as 'one_way' or 'round_trip'")
+ ],
+ cabin: Annotated[
+ CabinClass, Field(description="The cabin class such as 'basic_economy', 'economy', or 'business'")
+ ],
+ flights: Annotated[
+ List[FlightInfo | dict],
+ Field(description="An array of objects containing details about each piece of flight"),
+ ],
+ passengers: Annotated[
+ List[Passenger | dict],
+ Field(description="An array of objects containing details about each passenger"),
+ ],
+ payment_methods: Annotated[
+ List[Payment | dict],
+ Field(description="An array of objects containing details about each payment method"),
+ ],
+ total_baggages: Annotated[
+ int, Field(description="The total number of baggage items to book the reservation")
+ ],
+ nonfree_baggages: Annotated[
+ int, Field(description="The number of non-free baggage items to book the reservation")
+ ],
+ insurance: Annotated[Insurance, Field(description="Whether the reservation has insurance")],
+ ctx: Context,
+ ) -> Dict[str, Any]:
+ """Book a new reservation with all details"""
+ session_id = self._get_session_id(ctx)
+ session_data = self._get_or_create_session(ctx)
+
+ return self._execute_session_environment_step(
+ session_id,
+ {
+ "action": "book_reservation",
+ "parameters": {
+ "user_id": user_id,
+ "origin": origin,
+ "destination": destination,
+ "flight_type": flight_type,
+ "cabin": cabin,
+ "flights": flights,
+ "passengers": passengers,
+ "payment_methods": payment_methods,
+ "total_baggages": total_baggages,
+ "nonfree_baggages": nonfree_baggages,
+ "insurance": insurance,
+ },
+ },
+ )
+
+ @self.mcp.tool(
+ name="calculate",
+ description="Calculate the result of a mathematical expression.",
+ )
+ def calculate(
+ expression: Annotated[
+ str,
+ Field(
+ description="The mathematical expression to calculate, such as '2 + 2'. The expression can contain numbers, operators (+, -, *, /), parentheses, and spaces."
+ ),
+ ],
+ ctx: Context,
+ ) -> Dict[str, Any]:
+ """Calculate mathematical expressions"""
+ session_id = self._get_session_id(ctx)
+
+ return self._execute_session_environment_step(
+ session_id,
+ {"action": "calculate", "parameters": {"expression": expression}},
+ )
+
+ @self.mcp.tool(name="cancel_reservation", description="Cancel the whole reservation.")
+ def cancel_reservation(
+ reservation_id: Annotated[str, Field(description="The reservation ID, such as 'ZFA04Y'")], ctx: Context
+ ) -> Dict[str, Any]:
+ """Cancel a reservation"""
+ session_id = self._get_session_id(ctx)
+
+ return self._execute_session_environment_step(
+ session_id,
+ {
+ "action": "cancel_reservation",
+ "parameters": {"reservation_id": reservation_id},
+ },
+ )
+
+ @self.mcp.tool(
+ name="get_reservation_details",
+ description="Get the details of a reservation.",
+ )
+ def get_reservation_details(
+ reservation_id: Annotated[str, Field(description="The reservation ID, such as '8JX2WO'")], ctx: Context
+ ) -> Dict[str, Any]:
+ """Get reservation details"""
+ session_id = self._get_session_id(ctx)
+
+ return self._execute_session_environment_step(
+ session_id,
+ {
+ "action": "get_reservation_details",
+ "parameters": {"reservation_id": reservation_id},
+ },
+ )
+
+ @self.mcp.tool(
+ name="get_user_details",
+ description="Get the details of a user, including their reservations.",
+ )
+ def get_user_details(
+ user_id: Annotated[str, Field(description="The user ID, such as 'sara_doe_496'")], ctx: Context
+ ) -> Dict[str, Any]:
+ """Get user details"""
+ session_id = self._get_session_id(ctx)
+
+ return self._execute_session_environment_step(
+ session_id,
+ {"action": "get_user_details", "parameters": {"user_id": user_id}},
+ )
+
+ @self.mcp.tool(
+ name="list_all_airports",
+ description="Returns a list of all available airports.",
+ )
+ def list_all_airports(ctx: Context) -> Dict[str, Any]:
+ """List all available airports"""
+ session_id = self._get_session_id(ctx)
+
+ return self._execute_session_environment_step(
+ session_id, {"action": "list_all_airports", "parameters": {}}
+ )
+
+ @self.mcp.tool(
+ name="search_direct_flight",
+ description="Search for direct flights between two cities on a specific date.",
+ )
+ def search_direct_flight(
+ origin: Annotated[str, Field(description="The origin city airport in three letters, such as 'JFK'")],
+ destination: Annotated[
+ str, Field(description="The destination city airport in three letters, such as 'LAX'")
+ ],
+ date: Annotated[
+ str, Field(description="The date of the flight in the format 'YYYY-MM-DD', such as '2024-01-01'")
+ ],
+ ctx: Context,
+ ) -> Dict[str, Any]:
+ """Search for direct flights"""
+ session_id = self._get_session_id(ctx)
+
+ return self._execute_session_environment_step(
+ session_id,
+ {
+ "action": "search_direct_flight",
+ "parameters": {
+ "origin": origin,
+ "destination": destination,
+ "date": date,
+ },
+ },
+ )
+
+ @self.mcp.tool(
+ name="search_onestop_flight",
+ description="Search for one-stop flights between two cities on a specific date.",
+ )
+ def search_onestop_flight(
+ origin: Annotated[str, Field(description="The origin city airport in three letters, such as 'JFK'")],
+ destination: Annotated[
+ str, Field(description="The destination city airport in three letters, such as 'LAX'")
+ ],
+ date: Annotated[
+ str, Field(description="The date of the flight in the format 'YYYY-MM-DD', such as '2024-05-01'")
+ ],
+ ctx: Context,
+ ) -> Dict[str, Any]:
+ """Search for one-stop flights"""
+ session_id = self._get_session_id(ctx)
+
+ return self._execute_session_environment_step(
+ session_id,
+ {
+ "action": "search_onestop_flight",
+ "parameters": {
+ "origin": origin,
+ "destination": destination,
+ "date": date,
+ },
+ },
+ )
+
+ @self.mcp.tool(
+ name="send_certificate",
+ description="Send a certificate to a user. Be careful!",
+ )
+ def send_certificate(
+ user_id: Annotated[
+ str, Field(description="The ID of the user to book the reservation, such as 'sara_doe_496'")
+ ],
+ amount: Annotated[int, Field(description="The amount of the certificate to send")],
+ ctx: Context,
+ ) -> Dict[str, Any]:
+ """Send a certificate to a user"""
+ session_id = self._get_session_id(ctx)
+
+ return self._execute_session_environment_step(
+ session_id,
+ {
+ "action": "send_certificate",
+ "parameters": {"user_id": user_id, "amount": amount},
+ },
+ )
+
+ @self.mcp.tool(
+ name="transfer_to_human_agents",
+ description="Transfer the user to a human agent, with a summary of the user's issue. Only transfer if the user explicitly asks for a human agent or given the policy and the available tools, you cannot solve the user's issue.",
+ )
+ def transfer_to_human_agents(
+ summary: Annotated[str, Field(description="A summary of the user's issue")], ctx: Context
+ ) -> Dict[str, Any]:
+ """Transfer to human agent"""
+ session_id = self._get_session_id(ctx)
+
+ return self._execute_session_environment_step(
+ session_id,
+ {
+ "action": "transfer_to_human_agents",
+ "parameters": {"summary": summary},
+ },
+ )
+
+ @self.mcp.tool(
+ name="update_reservation_baggages",
+ description="Update the baggage information of a reservation.",
+ )
+ def update_reservation_baggages(
+ reservation_id: Annotated[str, Field(description="The reservation ID, such as 'ZFA04Y'")],
+ total_baggages: Annotated[
+ int, Field(description="The updated total number of baggage items included in the reservation")
+ ],
+ nonfree_baggages: Annotated[
+ int, Field(description="The updated number of non-free baggage items included in the reservation")
+ ],
+ payment_id: Annotated[
+ str,
+ Field(
+ description="The payment id stored in user profile, such as 'credit_card_7815826', 'gift_card_7815826', 'certificate_7815826'"
+ ),
+ ],
+ ctx: Context,
+ ) -> Dict[str, Any]:
+ """Update reservation baggage information"""
+ session_id = self._get_session_id(ctx)
+
+ return self._execute_session_environment_step(
+ session_id,
+ {
+ "action": "update_reservation_baggages",
+ "parameters": {
+ "reservation_id": reservation_id,
+ "total_baggages": total_baggages,
+ "nonfree_baggages": nonfree_baggages,
+ "payment_id": payment_id,
+ },
+ },
+ )
+
+ @self.mcp.tool(
+ name="update_reservation_flights",
+ description="Update the flight information of a reservation.",
+ )
+ def update_reservation_flights(
+ reservation_id: Annotated[str, Field(description="The reservation ID, such as 'ZFA04Y'")],
+ cabin: Annotated[CabinClass, Field(description="The cabin class of the reservation")],
+ flights: Annotated[
+ List[FlightInfo | dict],
+ Field(
+ description="An array of objects containing details about each piece of flight in the ENTIRE new reservation. Even if the a flight segment is not changed, it should still be included in the array"
+ ),
+ ],
+ payment_id: Annotated[
+ str,
+ Field(
+ description="The payment id stored in user profile, such as 'credit_card_7815826', 'gift_card_7815826', 'certificate_7815826'"
+ ),
+ ],
+ ctx: Context,
+ ) -> Dict[str, Any]:
+ """Update reservation flight information"""
+ session_id = self._get_session_id(ctx)
+
+ return self._execute_session_environment_step(
+ session_id,
+ {
+ "action": "update_reservation_flights",
+ "parameters": {
+ "reservation_id": reservation_id,
+ "cabin": cabin,
+ "flights": flights,
+ "payment_id": payment_id,
+ },
+ },
+ )
+
+ @self.mcp.tool(
+ name="update_reservation_passengers",
+ description="Update the passenger information of a reservation.",
+ )
+ def update_reservation_passengers(
+ reservation_id: Annotated[str, Field(description="The reservation ID, such as 'ZFA04Y'")],
+ passengers: Annotated[
+ List[Passenger | dict],
+ Field(description="An array of objects containing details about each passenger"),
+ ],
+ ctx: Context,
+ ) -> Dict[str, Any]:
+ """Send an update_reservation_passengers action (reservation id plus passenger list) for the caller's session and return the step result."""
+ session_id = self._get_session_id(ctx)
+
+ return self._execute_session_environment_step(
+ session_id,
+ {
+ "action": "update_reservation_passengers",
+ "parameters": {
+ "reservation_id": reservation_id,
+ "passengers": passengers,
+ },
+ },
+ )
+
+ @self.mcp.tool(name="get_flight_status", description="Get the status of a flight.")
+ def get_flight_status(
+ flight_number: Annotated[str, Field(description="The flight number")],
+ date: Annotated[str, Field(description="The date of the flight")],
+ ctx: Context,
+ ) -> Dict[str, Any]:
+ """Send a get_flight_status action (flight number and date) for the caller's session and return the step result."""
+ session_id = self._get_session_id(ctx)
+
+ return self._execute_session_environment_step(
+ session_id,
+ {
+ "action": "get_flight_status",
+ "parameters": {"flight_number": flight_number, "date": date},
+ },
+ )
+
+
+class MockDomainMcp(McpGym):
+ """Mock domain MCP server for ฯยฒ-Bench integration"""
+
+ def __init__(self, seed: Optional[int] = None, **kwargs):
+ """Initialize Mock MCP-Gym environment."""
+ # Use EnvironmentAdapter directly as the default adapter
+ default_config = {
+ "domain": "mock",
+ "max_turns": 20,
+ }
+
+ self.adapter = EnvironmentAdapter(env_class=MockEnvironment, default_config=default_config)
+
+ super().__init__("mock", self.adapter, seed, **kwargs)
+
+ def _register_tools(self):
+ """Register mock-specific MCP tools matching ฯยฒ-Bench schemas"""
+
+ @self.mcp.tool(name="create_task", description="Create a new task for a user.")
+ def create_task(user_id: str, title: str, ctx: Context, description: str = None) -> Dict[str, Any]:
+ """Create a new task for a user"""
+ session_id = self._get_session_id(ctx)
+
+ return self._execute_session_environment_step(
+ session_id,
+ {
+ "action": "create_task",
+ "parameters": {"user_id": user_id, "title": title, "description": description},
+ },
+ )
+
+ @self.mcp.tool(name="get_users", description="Get all users in the database.")
+ def get_users(ctx: Context) -> Dict[str, Any]:
+ """Get all users in the database"""
+ session_id = self._get_session_id(ctx)
+
+ return self._execute_session_environment_step(session_id, {"action": "get_users", "parameters": {}})
+
+ @self.mcp.tool(name="update_task_status", description="Update the status of a task.")
+ def update_task_status(task_id: str, status: str, ctx: Context) -> Dict[str, Any]:
+ """Update the status of a task"""
+ session_id = self._get_session_id(ctx)
+
+ return self._execute_session_environment_step(
+ session_id, {"action": "update_task_status", "parameters": {"task_id": task_id, "status": status}}
+ )
+
+ @self.mcp.tool(
+ name="assert_number_of_tasks", description="Check if the number of tasks for a user is as expected."
+ )
+ def assert_number_of_tasks(user_id: str, expected_number: int, ctx: Context) -> Dict[str, Any]:
+ """Check if the number of tasks for a user is as expected"""
+ session_id = self._get_session_id(ctx)
+
+ return self._execute_session_environment_step(
+ session_id,
+ {
+ "action": "assert_number_of_tasks",
+ "parameters": {"user_id": user_id, "expected_number": expected_number},
+ },
+ )
+
+ @self.mcp.tool(name="assert_task_status", description="Check if the status of a task is as expected.")
+ def assert_task_status(task_id: str, expected_status: str, ctx: Context) -> Dict[str, Any]:
+ """Check if the status of a task is as expected"""
+ session_id = self._get_session_id(ctx)
+
+ return self._execute_session_environment_step(
+ session_id,
+ {
+ "action": "assert_task_status",
+ "parameters": {"task_id": task_id, "expected_status": expected_status},
+ },
+ )
+
+ @self.mcp.tool(
+ name="transfer_to_human_agents",
+ description=""" Transfer the user to a human agent, with a summary of the user's issue.
+ Only transfer if
+ - the user explicitly asks for a human agent
+ - given the policy and the available tools, you cannot solve the user's issue.""",
+ )
+ def transfer_to_human_agents(summary: str, ctx: Context) -> Dict[str, Any]:
+ """Transfer the user to a human agent"""
+ session_id = self._get_session_id(ctx)
+
+ return self._execute_session_environment_step(
+ session_id, {"action": "transfer_to_human_agents", "parameters": {"summary": summary}}
+ )
+
+
+class RetailDomainMcp(McpGym):
+ """Retail domain MCP server for τ²-Bench integration"""
+
+ def __init__(self, seed: Optional[int] = None, **kwargs):
+ """Wrap RetailEnvironment in an EnvironmentAdapter (domain "retail", max_turns 20) and pass it to McpGym with the given seed."""
+ # Use EnvironmentAdapter directly as the default adapter
+ default_config = {
+ "domain": "retail",
+ "max_turns": 20,
+ }
+
+ self.adapter = EnvironmentAdapter(env_class=RetailEnvironment, default_config=default_config)
+
+ super().__init__("retail", self.adapter, seed, **kwargs)
+
+ def _register_tools(self):
+ """Register retail-specific MCP tools matching τ²-Bench schemas; each tool forwards one named action for the caller's session."""
+
+ @self.mcp.tool(name="calculate", description="Calculate the result of a mathematical expression.")
+ def calculate(expression: str, ctx: Context) -> Dict[str, Any]:
+ """Forward the expression string as a calculate action; no evaluation happens in this wrapper"""
+ session_id = self._get_session_id(ctx)
+
+ return self._execute_session_environment_step(
+ session_id, {"action": "calculate", "parameters": {"expression": expression}}
+ )
+
+ @self.mcp.tool(
+ name="cancel_pending_order",
+ description="""Cancel a pending order. If the order is already processed or delivered,
+ it cannot be cancelled. The agent needs to explain the cancellation detail
+ and ask for explicit user confirmation (yes/no) to proceed. If the user confirms,
+ the order status will be changed to 'cancelled' and the payment will be refunded.
+ The refund will be added to the user's gift card balance immediately if the payment
+ was made using a gift card, otherwise the refund would take 5-7 business days to process.
+ The function returns the order details after the cancellation.
+ Args:
+ order_id: The order id, such as '#W0000000'. Be careful there is a '#' symbol at the beginning of the order id.
+ reason: The reason for cancellation, which should be either 'no longer needed' or 'ordered by mistake'.""",
+ )
+ def cancel_pending_order(order_id: str, reason: str, ctx: Context) -> Dict[str, Any]:
+ """Cancel a pending order; order_id and reason are forwarded as given, without validation here"""
+ session_id = self._get_session_id(ctx)
+
+ return self._execute_session_environment_step(
+ session_id, {"action": "cancel_pending_order", "parameters": {"order_id": order_id, "reason": reason}}
+ )
+
+ @self.mcp.tool(
+ name="exchange_delivered_order_items",
+ description="""Exchange items in a delivered order to new items of the same product type.
+ For a delivered order, return or exchange can be only done once by the agent.
+ The agent needs to explain the exchange detail and ask for explicit user confirmation (yes/no) to proceed.""",
+ )
+ def exchange_delivered_order_items(
+ order_id: str, item_ids: List[str], new_item_ids: List[str], payment_method_id: str, ctx: Context
+ ) -> Dict[str, Any]:
+ """Exchange items in a delivered order (old item ids, new item ids and a payment method for the difference)"""
+ session_id = self._get_session_id(ctx)
+
+ return self._execute_session_environment_step(
+ session_id,
+ {
+ "action": "exchange_delivered_order_items",
+ "parameters": {
+ "order_id": order_id,
+ "item_ids": item_ids,
+ "new_item_ids": new_item_ids,
+ "payment_method_id": payment_method_id,
+ },
+ },
+ )
+
+ @self.mcp.tool(
+ name="find_user_id_by_name_zip",
+ description="""Find user id by first name, last name, and zip code. If the user is not found, the function
+ will return an error message. By default, find user id by email, and only call this function
+ if the user is not found by email or cannot remember email.""",
+ )
+ def find_user_id_by_name_zip(first_name: str, last_name: str, zip: str, ctx: Context) -> Dict[str, Any]:
+ """Find user id by first name, last name and zip code"""
+ session_id = self._get_session_id(ctx)
+
+ return self._execute_session_environment_step(
+ session_id,
+ {
+ "action": "find_user_id_by_name_zip",
+ "parameters": {"first_name": first_name, "last_name": last_name, "zip": zip},
+ },
+ )
+
+ @self.mcp.tool(
+ name="find_user_id_by_email",
+ description="Find user id by email. If the user is not found, the function will return an error message.",
+ )
+ def find_user_id_by_email(email: str, ctx: Context) -> Dict[str, Any]:
+ """Find user id by email"""
+ session_id = self._get_session_id(ctx)
+
+ return self._execute_session_environment_step(
+ session_id, {"action": "find_user_id_by_email", "parameters": {"email": email}}
+ )
+
+ @self.mcp.tool(name="get_order_details", description="Get the status and details of an order.")
+ def get_order_details(order_id: str, ctx: Context) -> Dict[str, Any]:
+ """Get the status and details of an order"""
+ session_id = self._get_session_id(ctx)
+
+ return self._execute_session_environment_step(
+ session_id, {"action": "get_order_details", "parameters": {"order_id": order_id}}
+ )
+
+ @self.mcp.tool(name="get_product_details", description="Get the inventory details of a product.")
+ def get_product_details(product_id: str, ctx: Context) -> Dict[str, Any]:
+ """Get the inventory details of a product"""
+ session_id = self._get_session_id(ctx)
+
+ return self._execute_session_environment_step(
+ session_id, {"action": "get_product_details", "parameters": {"product_id": product_id}}
+ )
+
+ @self.mcp.tool(name="get_user_details", description="Get the details of a user, including their orders.")
+ def get_user_details(user_id: str, ctx: Context) -> Dict[str, Any]:
+ """Get the details of a user, including their orders"""
+ session_id = self._get_session_id(ctx)
+
+ return self._execute_session_environment_step(
+ session_id, {"action": "get_user_details", "parameters": {"user_id": user_id}}
+ )
+
+ @self.mcp.tool(
+ name="list_all_product_types",
+ description="""List the name and product id of all product types.
+ Each product type has a variety of different items with unique item ids and options.
+ There are only 50 product types in the store.""",
+ )
+ def list_all_product_types(ctx: Context) -> Dict[str, Any]:
+ """List all product types (the action takes no parameters)"""
+ session_id = self._get_session_id(ctx)
+
+ return self._execute_session_environment_step(
+ session_id, {"action": "list_all_product_types", "parameters": {}}
+ )
+
+ @self.mcp.tool(
+ name="modify_pending_order_address",
+ description="Modify the shipping address of a pending order. The agent needs to explain the modification detail and ask for explicit user confirmation (yes/no) to proceed.",
+ )
+ def modify_pending_order_address(
+ order_id: str, address1: str, address2: str, city: str, state: str, country: str, zip: str, ctx: Context
+ ) -> Dict[str, Any]:
+ """Modify the shipping address of a pending order; every address field is required and forwarded unchanged"""
+ session_id = self._get_session_id(ctx)
+
+ return self._execute_session_environment_step(
+ session_id,
+ {
+ "action": "modify_pending_order_address",
+ "parameters": {
+ "order_id": order_id,
+ "address1": address1,
+ "address2": address2,
+ "city": city,
+ "state": state,
+ "country": country,
+ "zip": zip,
+ },
+ },
+ )
+
+ @self.mcp.tool(
+ name="modify_pending_order_items",
+ description="Modify items in a pending order to new items of the same product type. For a pending order, this function can only be called once. The agent needs to explain the exchange detail and ask for explicit user confirmation (yes/no) to proceed.",
+ )
+ def modify_pending_order_items(
+ order_id: str, item_ids: List[str], new_item_ids: List[str], payment_method_id: str, ctx: Context
+ ) -> Dict[str, Any]:
+ """Modify items in a pending order (old item ids, new item ids and a payment method for the difference)"""
+ session_id = self._get_session_id(ctx)
+
+ return self._execute_session_environment_step(
+ session_id,
+ {
+ "action": "modify_pending_order_items",
+ "parameters": {
+ "order_id": order_id,
+ "item_ids": item_ids,
+ "new_item_ids": new_item_ids,
+ "payment_method_id": payment_method_id,
+ },
+ },
+ )
+
+ @self.mcp.tool(
+ name="modify_pending_order_payment",
+ description="Modify the payment method of a pending order. The agent needs to explain the modification detail and ask for explicit user confirmation (yes/no) to proceed.",
+ )
+ def modify_pending_order_payment(order_id: str, payment_method_id: str, ctx: Context) -> Dict[str, Any]:
+ """Modify the payment method of a pending order"""
+ session_id = self._get_session_id(ctx)
+
+ return self._execute_session_environment_step(
+ session_id,
+ {
+ "action": "modify_pending_order_payment",
+ "parameters": {"order_id": order_id, "payment_method_id": payment_method_id},
+ },
+ )
+
+ @self.mcp.tool(
+ name="modify_user_address",
+ description="Modify the default address of a user. The agent needs to explain the modification detail and ask for explicit user confirmation (yes/no) to proceed.",
+ )
+ def modify_user_address(
+ user_id: str, address1: str, address2: str, city: str, state: str, country: str, zip: str, ctx: Context
+ ) -> Dict[str, Any]:
+ """Modify the default address of a user; every address field is required and forwarded unchanged"""
+ session_id = self._get_session_id(ctx)
+
+ return self._execute_session_environment_step(
+ session_id,
+ {
+ "action": "modify_user_address",
+ "parameters": {
+ "user_id": user_id,
+ "address1": address1,
+ "address2": address2,
+ "city": city,
+ "state": state,
+ "country": country,
+ "zip": zip,
+ },
+ },
+ )
+
+ @self.mcp.tool(
+ name="return_delivered_order_items",
+ description="""Return some items of a delivered order.
+ The order status will be changed to 'return requested'.
+ The agent needs to explain the return detail and ask for explicit user confirmation (yes/no) to proceed.
+ The user will receive follow-up email for how and where to return the item.""",
+ )
+ def return_delivered_order_items(
+ order_id: str, item_ids: List[str], payment_method_id: str, ctx: Context
+ ) -> Dict[str, Any]:
+ """Return items from a delivered order (item ids plus the payment method that receives the refund)"""
+ session_id = self._get_session_id(ctx)
+
+ return self._execute_session_environment_step(
+ session_id,
+ {
+ "action": "return_delivered_order_items",
+ "parameters": {"order_id": order_id, "item_ids": item_ids, "payment_method_id": payment_method_id},
+ },
+ )
+
+ @self.mcp.tool(
+ name="transfer_to_human_agents",
+ description="""Transfer the user to a human agent, with a summary of the user's issue.
+ Only transfer if
+ - the user explicitly asks for a human agent
+ - given the policy and the available tools, you cannot solve the user's issue.
+ """,
+ )
+ def transfer_to_human_agents(summary: str, ctx: Context) -> Dict[str, Any]:
+ """Transfer the user to a human agent, forwarding the caller-supplied summary"""
+ session_id = self._get_session_id(ctx)
+
+ return self._execute_session_environment_step(
+ session_id, {"action": "transfer_to_human_agents", "parameters": {"summary": summary}}
+ )
diff --git a/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md b/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md
new file mode 100644
index 00000000..44c72520
--- /dev/null
+++ b/eval_protocol/mcp_servers/tau2/tests/system_prompts/airline_agent_system_prompt.md
@@ -0,0 +1,178 @@
+
+You are a customer service agent that helps the user according to the policy provided below.
+In each turn you can either:
+- Send a message to the user.
+- Make a tool call.
+You cannot do both at the same time.
+
+Try to be helpful and always follow the policy. Always make sure you generate valid JSON only.
+
+
+# Airline Agent Policy
+
+The current time is 2024-05-15 15:00:00 EST.
+
+As an airline agent, you can help users **book**, **modify**, or **cancel** flight reservations. You also handle **refunds and compensation**.
+
+Before taking any actions that update the booking database (booking, modifying flights, editing baggage, changing cabin class, or updating passenger information), you must list the action details and obtain explicit user confirmation (yes) to proceed.
+
+You should not provide any information, knowledge, or procedures not provided by the user or available tools, or give subjective recommendations or comments.
+
+You should only make one tool call at a time, and if you make a tool call, you should not respond to the user simultaneously. If you respond to the user, you should not make a tool call at the same time.
+
+You should deny user requests that are against this policy.
+
+You should transfer the user to a human agent if and only if the request cannot be handled within the scope of your actions. To transfer, first make a tool call to transfer_to_human_agents, and then send the message 'YOU ARE BEING TRANSFERRED TO A HUMAN AGENT. PLEASE HOLD ON.' to the user.
+
+## Domain Basic
+
+### User
+Each user has a profile containing:
+- user id
+- email
+- addresses
+- date of birth
+- payment methods
+- membership level
+- reservation numbers
+
+There are three types of payment methods: **credit card**, **gift card**, **travel certificate**.
+
+There are three membership levels: **regular**, **silver**, **gold**.
+
+### Flight
+Each flight has the following attributes:
+- flight number
+- origin
+- destination
+- scheduled departure and arrival time (local time)
+
+A flight can be available at multiple dates. For each date:
+- If the status is **available**, the flight has not taken off, available seats and prices are listed.
+- If the status is **delayed** or **on time**, the flight has not taken off, cannot be booked.
+- If the status is **flying**, the flight has taken off but not landed, cannot be booked.
+
+There are three cabin classes: **basic economy**, **economy**, **business**. **basic economy** is its own class, completely distinct from **economy**.
+
+Seat availability and prices are listed for each cabin class.
+
+### Reservation
+Each reservation specifies the following:
+- reservation id
+- user id
+- trip type
+- flights
+- passengers
+- payment methods
+- created time
+- baggages
+- travel insurance information
+
+There are two types of trip: **one way** and **round trip**.
+
+## Book flight
+
+The agent must first obtain the user id from the user.
+
+The agent should then ask for the trip type, origin, destination.
+
+Cabin:
+- Cabin class must be the same across all the flights in a reservation.
+
+Passengers:
+- Each reservation can have at most five passengers.
+- The agent needs to collect the first name, last name, and date of birth for each passenger.
+- All passengers must fly the same flights in the same cabin.
+
+Payment:
+- Each reservation can use at most one travel certificate, at most one credit card, and at most three gift cards.
+- The remaining amount of a travel certificate is not refundable.
+- All payment methods must already be in user profile for safety reasons.
+
+Checked bag allowance:
+- If the booking user is a regular member:
+ - 0 free checked bags for each basic economy passenger
+ - 1 free checked bag for each economy passenger
+ - 2 free checked bags for each business passenger
+- If the booking user is a silver member:
+ - 1 free checked bag for each basic economy passenger
+ - 2 free checked bags for each economy passenger
+ - 3 free checked bags for each business passenger
+- If the booking user is a gold member:
+ - 2 free checked bags for each basic economy passenger
+ - 3 free checked bags for each economy passenger
+ - 4 free checked bags for each business passenger
+- Each extra baggage is 50 dollars.
+
+Do not add checked bags that the user does not need.
+
+Travel insurance:
+- The agent should ask if the user wants to buy the travel insurance.
+- The travel insurance is 30 dollars per passenger and enables full refund if the user needs to cancel the flight given health or weather reasons.
+
+## Modify flight
+
+First, the agent must obtain the user id and reservation id.
+- The user must provide their user id.
+- If the user doesn't know their reservation id, the agent should help locate it using available tools.
+
+Change flights:
+- Basic economy flights cannot be modified.
+- Other reservations can be modified without changing the origin, destination, and trip type.
+- Some flight segments can be kept, but their prices will not be updated based on the current price.
+- The API does not check these for the agent, so the agent must make sure the rules apply before calling the API!
+
+Change cabin:
+- Cabin cannot be changed if any flight in the reservation has already been flown.
+- In other cases, all reservations, including basic economy, can change cabin without changing the flights.
+- Cabin class must remain the same across all the flights in the same reservation; changing cabin for just one flight segment is not possible.
+- If the price after cabin change is higher than the original price, the user is required to pay for the difference.
+- If the price after cabin change is lower than the original price, the user should be refunded the difference.
+
+Change baggage and insurance:
+- The user can add but not remove checked bags.
+- The user cannot add insurance after initial booking.
+
+Change passengers:
+- The user can modify passengers but cannot modify the number of passengers.
+- Even a human agent cannot modify the number of passengers.
+
+Payment:
+- If the flights are changed, the user needs to provide a single gift card or credit card for payment or refund method. The payment method must already be in user profile for safety reasons.
+
+## Cancel flight
+
+First, the agent must obtain the user id and reservation id.
+- The user must provide their user id.
+- If the user doesn't know their reservation id, the agent should help locate it using available tools.
+
+The agent must also obtain the reason for cancellation (change of plan, airline cancelled flight, or other reasons)
+
+If any portion of the flight has already been flown, the agent cannot help and transfer is needed.
+
+Otherwise, flight can be cancelled if any of the following is true:
+- The booking was made within the last 24 hrs
+- The flight is cancelled by airline
+- It is a business flight
+- The user has travel insurance and the reason for cancellation is covered by insurance.
+
+The API does not check that cancellation rules are met, so the agent must make sure the rules apply before calling the API!
+
+Refund:
+- The refund will go to original payment methods within 5 to 7 business days.
+
+## Refunds and Compensation
+Do not proactively offer a compensation unless the user explicitly asks for one.
+
+Do not compensate if the user is regular member and has no travel insurance and flies (basic) economy.
+
+Always confirm the facts before offering compensation.
+
+Only compensate if the user is a silver/gold member or has travel insurance or flies business.
+
+- If the user complains about cancelled flights in a reservation, the agent can offer a certificate as a gesture after confirming the facts, with the amount being $100 times the number of passengers.
+
+- If the user complains about delayed flights in a reservation and wants to change or cancel the reservation, the agent can offer a certificate as a gesture after confirming the facts and changing or cancelling the reservation, with the amount being $50 times the number of passengers.
+
+Do not offer compensation for any other reason than the ones listed above.
+
diff --git a/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md b/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md
new file mode 100644
index 00000000..6e9d001a
--- /dev/null
+++ b/eval_protocol/mcp_servers/tau2/tests/system_prompts/mock_agent_system_prompt.md
@@ -0,0 +1,18 @@
+
+You are a customer service agent that helps the user according to the policy provided below.
+In each turn you can either:
+- Send a message to the user.
+- Make a tool call.
+You cannot do both at the same time.
+
+Try to be helpful and always follow the policy. Always make sure you generate valid JSON only.
+
+
+# Mock Domain Policy
+
+1. Each task must have a title
+2. Task status can only be "pending" or "completed"
+3. Only existing users can create tasks
+4. You are not allowed to delete tasks. You should transfer the user to a human agent.
+5. If the user asks for a compliment, compliment them
+
diff --git a/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md b/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md
new file mode 100644
index 00000000..3a237f06
--- /dev/null
+++ b/eval_protocol/mcp_servers/tau2/tests/system_prompts/retail_agent_system_prompt.md
@@ -0,0 +1,147 @@
+
+You are a customer service agent for an online retail store that helps customers with their orders.
+In each turn you can either:
+- Send a message to the user.
+- Make a tool call.
+You cannot do both at the same time.
+
+Try to be helpful and always follow the policy. Always make sure you generate valid JSON only.
+
+
+# Retail agent policy
+
+As a retail agent, you can help users:
+
+- **cancel or modify pending orders**
+- **return or exchange delivered orders**
+- **modify their default user address**
+- **provide information about their own profile, orders, and related products**
+
+At the beginning of the conversation, you have to authenticate the user identity by locating their user id via email, or via name + zip code. This has to be done even when the user already provides the user id.
+
+Once the user has been authenticated, you can provide the user with information about order, product, profile information, e.g. help the user look up order id.
+
+You can only help one user per conversation (but you can handle multiple requests from the same user), and must deny any requests for tasks related to any other user.
+
+Before taking any action that updates the database (cancel, modify, return, exchange), you must list the action details and obtain explicit user confirmation (yes) to proceed.
+
+You should not make up any information or knowledge or procedures not provided by the user or the tools, or give subjective recommendations or comments.
+
+You should at most make one tool call at a time, and if you take a tool call, you should not respond to the user at the same time. If you respond to the user, you should not make a tool call at the same time.
+
+You should deny user requests that are against this policy.
+
+You should transfer the user to a human agent if and only if the request cannot be handled within the scope of your actions. To transfer, first make a tool call to transfer_to_human_agents, and then send the message 'YOU ARE BEING TRANSFERRED TO A HUMAN AGENT. PLEASE HOLD ON.' to the user.
+
+## Domain basic
+
+- All times in the database are EST and 24 hour based. For example "02:30:00" means 2:30 AM EST.
+
+### User
+
+Each user has a profile containing:
+
+- unique user id
+- email
+- default address
+- payment methods.
+
+There are three types of payment methods: **gift card**, **paypal account**, **credit card**.
+
+### Product
+
+Our retail store has 50 types of products.
+
+For each **type of product**, there are **variant items** of different **options**.
+
+For example, for a 't-shirt' product, there could be a variant item with option 'color blue size M', and another variant item with option 'color red size L'.
+
+Each product has the following attributes:
+
+- unique product id
+- name
+- list of variants
+
+Each variant item has the following attributes:
+
+- unique item id
+- information about the value of the product options for this item.
+- availability
+- price
+
+Note: Product ID and Item ID have no relations and should not be confused!
+
+### Order
+
+Each order has the following attributes:
+
+- unique order id
+- user id
+- address
+- items ordered
+- status
+- fulfillments info (tracking id and item ids)
+- payment history
+
+The status of an order can be: **pending**, **processed**, **delivered**, or **cancelled**.
+
+Orders can have other optional attributes based on the actions that have been taken (cancellation reason, which items have been exchanged, what was the exchange price difference, etc.).
+
+## Generic action rules
+
+Generally, you can only take action on pending or delivered orders.
+
+Exchange or modify order tools can only be called once per order. Be sure that all items to be changed are collected into a list before making the tool call!!!
+
+## Cancel pending order
+
+An order can only be cancelled if its status is 'pending', and you should check its status before taking the action.
+
+The user needs to confirm the order id and the reason (either 'no longer needed' or 'ordered by mistake') for cancellation. Other reasons are not acceptable.
+
+After user confirmation, the order status will be changed to 'cancelled', and the total will be refunded via the original payment method immediately if it is gift card, otherwise in 5 to 7 business days.
+
+## Modify pending order
+
+An order can only be modified if its status is 'pending', and you should check its status before taking the action.
+
+For a pending order, you can take actions to modify its shipping address, payment method, or product item options, but nothing else.
+
+### Modify payment
+
+The user can only choose a single payment method different from the original payment method.
+
+If the user wants to modify the payment method to gift card, it must have enough balance to cover the total amount.
+
+After user confirmation, the order status will be kept as 'pending'. The original payment method will be refunded immediately if it is a gift card, otherwise it will be refunded within 5 to 7 business days.
+
+### Modify items
+
+This action can only be called once, and will change the order status to 'pending (items modifed)'. The agent will not be able to modify or cancel the order anymore. So you must confirm all the details are correct and be cautious before taking this action. In particular, remember to remind the customer to confirm they have provided all the items they want to modify.
+
+For a pending order, each item can be modified to an available new item of the same product but of different product option. There cannot be any change of product types, e.g. modify shirt to shoe.
+
+The user must provide a payment method to pay or receive refund of the price difference. If the user provides a gift card, it must have enough balance to cover the price difference.
+
+## Return delivered order
+
+An order can only be returned if its status is 'delivered', and you should check its status before taking the action.
+
+The user needs to confirm the order id and the list of items to be returned.
+
+The user needs to provide a payment method to receive the refund.
+
+The refund must either go to the original payment method, or an existing gift card.
+
+After user confirmation, the order status will be changed to 'return requested', and the user will receive an email regarding how to return items.
+
+## Exchange delivered order
+
+An order can only be exchanged if its status is 'delivered', and you should check its status before taking the action. In particular, remember to remind the customer to confirm they have provided all items to be exchanged.
+
+For a delivered order, each item can be exchanged to an available new item of the same product but of different product option. There cannot be any change of product types, e.g. modify shirt to shoe.
+
+The user must provide a payment method to pay or receive refund of the price difference. If the user provides a gift card, it must have enough balance to cover the price difference.
+
+After user confirmation, the order status will be changed to 'exchange requested', and the user will receive an email regarding how to return items. There is no need to place a new order.
+
diff --git a/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py b/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py
new file mode 100644
index 00000000..ec7c3944
--- /dev/null
+++ b/eval_protocol/mcp_servers/tau2/tests/test_tau2_e2e.py
@@ -0,0 +1,1668 @@
+#!/usr/bin/env python3
+"""
+End-to-End Record and Replay Tests for Tau2 MCP
+
+This module provides comprehensive tests for multi-domain MCP environments with clean dataset loading.
+"""
+
+import asyncio
+import atexit
+import json
+import os
+import signal
+import subprocess
+import time
+import warnings
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+import pytest
+
+# Suppress pydantic warnings comprehensively
+warnings.filterwarnings("ignore", category=UserWarning, module="pydantic")
+warnings.filterwarnings("ignore", category=DeprecationWarning, module="pydantic")
+warnings.filterwarnings("ignore", message=".*Pydantic.*")
+warnings.filterwarnings("ignore", message=".*PydanticSerializationUnexpectedValue.*")
+warnings.filterwarnings("ignore", message=".*Support for class-based.*")
+warnings.filterwarnings("ignore", message=".*serializer warnings.*")
+
+# Suppress all DeprecationWarnings from pydantic internal config
+warnings.filterwarnings("ignore", category=DeprecationWarning, message=".*class-based.*config.*")
+
+# Set environment variable to suppress pydantic warnings at runtime
+
+os.environ["PYTHONWARNINGS"] = "ignore::UserWarning:pydantic,ignore::DeprecationWarning:pydantic"
+
+import eval_protocol as ep
+from eval_protocol import EvaluateResult, reward_function
+from eval_protocol.models import Message
+from vendor.tau2.data_model.message import (
+ AssistantMessage,
+ SystemMessage,
+ ToolCall,
+ ToolMessage,
+ UserMessage,
+)
+from vendor.tau2.data_model.tasks import Action, EvaluationCriteria, RewardType, Task, UserScenario
+from vendor.tau2.evaluator.evaluator import EnvironmentEvaluator
+from vendor.tau2.evaluator.evaluator_action import ActionEvaluator
+from vendor.tau2.evaluator.evaluator_communicate import CommunicateEvaluator
+from vendor.tau2.evaluator.evaluator_nl_assertions import NLAssertionsEvaluator
+from vendor.tau2.registry import registry
+
+
+def _is_ci_mode():
+ """Check if we're running in CI mode.
+
+ Returns True when the ``CI`` environment variable, lower-cased, is one of
+ "true", "1" or "yes". An unset variable is read as "" and yields False.
+ """
+ return os.environ.get("CI", "").lower() in ["true", "1", "yes"]
+
+
+def _create_test_server(port: int, domain: str = "airline") -> "MCPServerManager":
+ """Create and start a test server.
+
+ Args:
+ port: Port handed to the MCPServerManager.
+ domain: Domain label handed to the MCPServerManager.
+
+ Returns:
+ The MCPServerManager after start() has been called on it.
+ """
+ # The server script is looked up as "server.py" one directory above this test module.
+ server_script_path = os.path.join(os.path.dirname(__file__), "..", "server.py")
+ # The return annotation is a string because MCPServerManager is defined further down the module.
+ server = MCPServerManager(server_script_path, port=port, domain=domain)
+ server.start()
+ print(f"โ
Started test server on port {port}")
+ return server
+
+
+def _stop_test_server(server: "MCPServerManager"):
+ """Stop and clean up a test server.
+
+ Delegates to ``server.stop()`` and then prints a confirmation line.
+ """
+ server.stop()
+ print("๐งน Test server stopped and cleaned up")
+
+
+class MCPServerManager:
+ """Manages MCP server lifecycle for testing.
+
+ start() launches the server script as a subprocess and redirects its
+ stdout/stderr to a log file named after the domain and port; stop()
+ terminates the process and deletes that log file. Every instance adds
+ itself to a class-level list so that the atexit/SIGINT/SIGTERM handlers
+ registered on first construction can stop servers that are still active.
+ The class also implements the context-manager protocol.
+ """
+
+ # Class-level tracking of all server instances
+ # (shared by every instance; _cleanup_all_servers iterates over it).
+ _active_servers = []
+ # Set to True once the atexit/signal handlers have been registered.
+ _cleanup_registered = False
+
+ def __init__(self, server_script: str, port: int = 8000, domain: str = "airline"):
+ # Constructing an instance does not launch anything; start() does.
+ self.server_script = server_script
+ self.port = port
+ self.domain = domain
+ self.process: Optional[subprocess.Popen] = None
+ # Resolved current working directory; used as subprocess cwd, PYTHONPATH entry and log directory.
+ self.base_dir = Path(".").resolve()
+ self._log_file = None
+ self._log_file_path = None
+
+ # Register this server for cleanup
+ MCPServerManager._active_servers.append(self)
+
+ # Register cleanup handlers only once
+ if not MCPServerManager._cleanup_registered:
+ MCPServerManager._register_cleanup_handlers()
+ MCPServerManager._cleanup_registered = True
+
+ def start(self) -> None:
+ """Start the MCP server.
+
+ Does nothing when a process handle already exists. Otherwise spawns the
+ server, sleeps three seconds, and raises RuntimeError if the process has
+ already exited by then.
+ """
+ if self.process:
+ return
+
+ # Set environment for server
+ env = os.environ.copy()
+ env["PORT"] = str(self.port)
+ if "PYTHONPATH" not in env:
+ env["PYTHONPATH"] = ""
+ # base_dir is appended after whatever PYTHONPATH already holds.
+ env["PYTHONPATH"] += os.pathsep + str(self.base_dir)
+
+ # Start server process (no domain argument needed for tau2_mcp server)
+ # NOTE(review): "python" is resolved via PATH, which may not be the interpreter running the tests - confirm.
+ cmd = ["python", self.server_script, "--port", str(self.port)]
+
+ # Setup log file with cleanup
+ # self.domain is only used here, to name the log file.
+ log_file_path = os.path.join(self.base_dir, f"server_output_{self.domain}_{self.port}.log")
+ if os.path.exists(log_file_path):
+ os.remove(log_file_path)
+
+ # Left open on purpose: the handle is stored on self below and closed in stop().
+ log_file = open(log_file_path, "w")
+
+ self.process = subprocess.Popen(
+ cmd,
+ cwd=self.base_dir,
+ env=env,
+ stdout=log_file,
+ stderr=log_file,
+ text=True,
+ )
+
+ # Store log file reference for cleanup
+ self._log_file = log_file
+ self._log_file_path = log_file_path
+
+ # Wait for server to start
+ # Fixed delay; readiness of the server is not probed.
+ time.sleep(3)
+
+ # Check if process is still running
+ if self.process.poll() is not None:
+ try:
+ with open(self._log_file_path, "r") as f:
+ log_content = f.read()
+ print("โ Server failed to start!")
+ print(f"๐ Server log ({self._log_file_path}):")
+ print("=" * 50)
+ print(log_content)
+ print("=" * 50)
+ # NOTE(review): this raise sits inside the try, so the `except Exception` below catches it
+ # and a second RuntimeError with a different message is what reaches the caller.
+ raise RuntimeError("Server failed to start. Check log above for details.")
+ except Exception as e:
+ # NOTE(review): stderr was redirected to the log file above, so communicate() presumably
+ # returns None for it here - confirm.
+ stdout, stderr = self.process.communicate()
+ raise RuntimeError(f"Server failed to start. stderr: {stderr}, log error: {e}")
+
+ print(f"โ
Server started successfully on port {self.port}")
+
+ def stop(self) -> None:
+ """Stop the MCP server.
+
+ Terminates the process (killing it if it has not exited within five
+ seconds), closes and deletes the log file, and removes this instance
+ from the class-level list of active servers.
+ """
+ if self.process:
+ print(f"๐ Stopping server on port {self.port}...")
+ self.process.terminate()
+ try:
+ self.process.wait(timeout=5)
+ except subprocess.TimeoutExpired:
+ print(f"โก Force killing server on port {self.port}...")
+ self.process.kill()
+ self.process.wait()
+ self.process = None
+
+ # Clean up log file
+ if self._log_file:
+ try:
+ self._log_file.close()
+ except Exception:
+ # Best-effort close; a failure here is ignored.
+ pass
+ self._log_file = None
+
+ if self._log_file_path and os.path.exists(self._log_file_path):
+ try:
+ os.remove(self._log_file_path)
+ print(f"๐งน Cleaned up log file: {self._log_file_path}")
+ except OSError:
+ # Best-effort delete; the log file is left behind if removal fails.
+ pass
+ self._log_file_path = None
+
+ # Remove from active servers list
+ if self in MCPServerManager._active_servers:
+ MCPServerManager._active_servers.remove(self)
+
+ @classmethod
+ def _cleanup_all_servers(cls):
+ """Clean up all active servers on exit"""
+ print(f"\n๐งน Cleaning up {len(cls._active_servers)} active servers...")
+ # Iterate over a copy because stop() removes entries from the list.
+ for server in cls._active_servers.copy():
+ try:
+ server.stop()
+ except Exception as e:
+ print(f"โ ๏ธ Error stopping server: {e}")
+ cls._active_servers.clear()
+
+ @classmethod
+ def _signal_handler(cls, signum, frame):
+ """Handle interrupt signals"""
+ print(f"\n๐ Received signal {signum}, cleaning up...")
+ cls._cleanup_all_servers()
+ # NOTE(review): exit() here is the helper installed by the site module; sys.exit() may be the
+ # more robust call - confirm.
+ exit(1)
+
+ @classmethod
+ def _register_cleanup_handlers(cls):
+ """Register cleanup handlers - called only once"""
+ atexit.register(cls._cleanup_all_servers)
+ # NOTE(review): signal.signal() replaces any handler that was installed before - confirm this
+ # is acceptable under the test runner.
+ signal.signal(signal.SIGINT, cls._signal_handler) # Ctrl+C
+ signal.signal(signal.SIGTERM, cls._signal_handler) # Termination signal
+
+ def __enter__(self):
+ """Context manager entry"""
+ self.start()
+ return self
+
+ def __exit__(self, exc_type, exc_val, exc_tb):
+ """Context manager exit - ensures cleanup even on exceptions"""
+ self.stop()
+ if exc_type:
+ print(f"โ ๏ธ Server cleanup after exception: {exc_type.__name__}")
+ return False # Don't suppress exceptions
+
+
+def load_dataset(dataset_file: str) -> List[Dict[str, Any]]:
+ """Load dataset and add system_prompt based on domain.
+
+ Args:
+ dataset_file: Path of the dataset, relative to the directory that
+ contains this test module. The file is parsed with json.load, so
+ it must be a single JSON document.
+
+ Returns:
+ The parsed items, each extended in place with a "system_prompt" key
+ holding the stripped text of
+ ``system_prompts/<domain>_agent_system_prompt.md``.
+ """
+ test_dir = Path(__file__).parent
+ dataset_path = test_dir / dataset_file
+
+ # NOTE(review): no explicit encoding is passed, so the platform default is used - confirm.
+ with open(dataset_path, "r") as f:
+ data = json.load(f)
+
+ # Load system prompts based on domain
+ for item in data:
+ # Every item must provide item["environment_context"]["domain"]; a missing key raises KeyError.
+ domain = item["environment_context"]["domain"]
+ prompt_file = test_dir / f"system_prompts/{domain}_agent_system_prompt.md"
+
+ # The prompt file is re-read for every item, even when the domain repeats.
+ with open(prompt_file, "r") as f:
+ item["system_prompt"] = f.read().strip()
+
+ return data
+
+
+@pytest.fixture
+def multi_env_airline_dataset():
+ """Load airline dataset with system prompts.
+
+ Reads ``datasets/airline.json`` through load_dataset().
+ """
+ return load_dataset("datasets/airline.json")
+
+
+@pytest.fixture
+def multi_env_airline_full_dataset():
+ """Load the full airline dataset with system prompts.
+
+ Reads ``datasets/airline_full.json`` through load_dataset().
+ """
+ return load_dataset("datasets/airline_full.json")
+
+
+@pytest.fixture
+def multi_env_mock_dataset():
+ """Load mock dataset with system prompts.
+
+ Reads ``datasets/mock.json`` through load_dataset().
+ """
+ return load_dataset("datasets/mock.json")
+
+
+@pytest.fixture
+def multi_env_retail_dataset():
+ """Load retail dataset with system prompts.
+
+ Reads ``datasets/retail.json`` through load_dataset().
+ """
+ return load_dataset("datasets/retail.json")
+
+
+@pytest.fixture
+def fireworks_multi_env_airline_recording_file():
+ """Provide a recording file path for the OpenAIPolicy multi-environment test.
+
+ Creates the ``recordings`` directory next to this module if needed and
+ yields the path (as a string) of the airline trajectory file inside it.
+ The file itself is neither created nor deleted by this fixture.
+ """
+ recording_dir = Path(__file__).parent / "recordings"
+ recording_dir.mkdir(exist_ok=True)
+ recording_path = recording_dir / "fireworks_multi_env_airline_trajectory.jsonl"
+
+ # Don't remove here - let the test handle removal for clean runs
+ yield str(recording_path)
+
+ # Keep the file after test completion for review
+ print(f"๐ OpenAIPolicy multi-environment trajectory preserved at: {recording_path}")
+
+
+@pytest.fixture
+def fireworks_multi_env_mock_recording_file():
+ """Provide a recording file path for the mock domain multi-environment test.
+
+ Creates the ``recordings`` directory next to this module if needed and
+ yields the path (as a string) of the mock trajectory file inside it.
+ The file itself is neither created nor deleted by this fixture.
+ """
+ recording_dir = Path(__file__).parent / "recordings"
+ recording_dir.mkdir(exist_ok=True)
+ recording_path = recording_dir / "fireworks_multi_env_mock_trajectory.jsonl"
+
+ # Don't remove here - let the test handle removal for clean runs
+ yield str(recording_path)
+
+ # Keep the file after test completion for review
+ print(f"๐ Mock domain multi-environment trajectory preserved at: {recording_path}")
+
+
+@pytest.fixture
+def fireworks_multi_env_retail_recording_file():
+ """Provide a recording file path for the retail domain multi-environment test.
+
+ Creates the ``recordings`` directory next to this module if needed and
+ yields the path (as a string) of the retail trajectory file inside it.
+ The file itself is neither created nor deleted by this fixture.
+ """
+ recording_dir = Path(__file__).parent / "recordings"
+ recording_dir.mkdir(exist_ok=True)
+ recording_path = recording_dir / "fireworks_multi_env_retail_trajectory.jsonl"
+
+ # Don't remove here - let the test handle removal for clean runs
+ yield str(recording_path)
+
+ # Keep the file after test completion for review
+ print(f"๐ Retail domain multi-environment trajectory preserved at: {recording_path}")
+
+
+async def _validate_recording_integrity(recording_file: str, dataset: List[Dict]):
+ """Validate the integrity of the recorded trajectory.
+
+ Reads the JSONL recording, groups its entries by "env_index", prints
+ diagnostics about initial states and the first two tool responses of each
+ environment, and then runs the four _validate_* helpers of this module.
+
+ Args:
+ recording_file: Path of the JSONL trajectory recording.
+ dataset: The dataset rows the environments were created from; indexed
+ by environment index.
+
+ Fails the test through pytest.fail when the recording file is missing or
+ when one of the first two tool responses of an environment is not valid
+ JSON.
+ """
+ # NOTE(review): declared async, but no await is visible in this body - confirm it needs to be a coroutine.
+
+ if not os.path.exists(recording_file):
+ pytest.fail(f"โ Recording file not created: {recording_file}")
+
+ print("\n๐ === VALIDATING RECORDING INTEGRITY ===")
+
+ # Load all recorded entries
+ recorded_entries = []
+ with open(recording_file, "r") as f:
+ for line in f:
+ # Blank lines are skipped; every other line must be a JSON document.
+ if line.strip():
+ recorded_entries.append(json.loads(line))
+
+ # Group by environment
+ env_recordings = {}
+ for entry in recorded_entries:
+ env_idx = entry["env_index"]
+ if env_idx not in env_recordings:
+ env_recordings[env_idx] = []
+ env_recordings[env_idx].append(entry)
+
+ print(f"๐ Found recordings for {len(env_recordings)} environments")
+
+ # Validation 1: Different configurations should produce different initial states
+ print("\n๐ฑ Validating multi-configuration environments...")
+ starting_states = []
+ recorded_env_indices = list(env_recordings.keys())
+
+ for env_idx in range(len(dataset)):
+ if env_idx not in env_recordings:
+ print(f" โ ๏ธ Environment {env_idx}: No recordings found (likely terminated immediately)")
+ continue
+
+ first_entry = env_recordings[env_idx][0]
+ messages = first_entry["messages"]
+
+ # Find the initial user message
+ user_msg = None
+ for msg in messages:
+ if msg["role"] == "user":
+ user_msg = msg["content"]
+ break
+
+ if not user_msg:
+ print(f" โ ๏ธ Environment {env_idx}: No user message found")
+ continue
+
+ # Structured content is stringified so it can be hashed and put into a set below.
+ if isinstance(user_msg, dict) or isinstance(user_msg, list):
+ user_msg = str(user_msg)
+
+ # Extract state information from user message
+ starting_states.append(user_msg)
+
+ # Extract environment context info for airline environment
+ env_context = dataset[env_idx]["environment_context"]
+ expected_seed = env_context.get("seed", "N/A")
+ domain = env_context.get("domain", "unknown")
+ # NOTE(review): hash() of a str is presumably randomized per process, so the printed value
+ # is not comparable between runs - confirm.
+ print(f" Env {env_idx} (domain: {domain}, seed: {expected_seed}): State hash {hash(user_msg)}")
+
+ # Check that recorded states are different (different configurations should produce different initial states)
+ # Duplicates only produce a printed warning; they do not fail the test.
+ if len(starting_states) > 1:
+ unique_states = set(starting_states)
+ if len(unique_states) < len(starting_states):
+ print(
+ f"โ ๏ธ Warning: Only {len(unique_states)} unique states for {len(starting_states)} recorded environments"
+ )
+ print(" This may indicate configuration issues or identical initial states")
+ else:
+ print(f"โ
All {len(starting_states)} recorded environments have unique starting states")
+ else:
+ print(f"โน๏ธ Only {len(starting_states)} environments recorded - cannot validate state uniqueness")
+
+ # Validation 2: State progression within each environment
+ print("\n๐ฎ Validating state progression...")
+ for env_idx in recorded_env_indices:
+ env_entries = env_recordings[env_idx]
+
+ # Find entries with enough steps (at least 2 tool responses)
+ tool_responses = []
+ for entry in env_entries:
+ messages = entry["messages"]
+ for msg in messages:
+ if msg["role"] == "tool":
+ tool_responses.append(msg["content"])
+
+ if len(tool_responses) < 2:
+ print(f" Env {env_idx}: Only {len(tool_responses)} tool responses, skipping progression check")
+ continue
+
+ # Parse reservation details from first two tool responses
+ states = []
+ for i, response in enumerate(tool_responses[:2]):
+ try:
+ # Handle both string (JSON) and list (multimodal) content
+ if isinstance(response, list):
+ # Multimodal content - extract text part
+ # Only the first part whose "type" is "text" is used.
+ text_content = None
+ for item in response:
+ if item.get("type") == "text":
+ text_content = item.get("text")
+ break
+ if text_content:
+ response_data = json.loads(text_content)
+ else:
+ response_data = {}
+ else:
+ # String content - parse as JSON
+ response_data = json.loads(response)
+
+ # For airline, extract reservation details
+ if "reservation" in response_data:
+ reservation = response_data["reservation"]
+ state_info = {
+ "booking_date": reservation.get("booking_date", "unknown"),
+ "flight_class": reservation.get("flight_class", "unknown"),
+ "travel_insurance": reservation.get("travel_insurance", "unknown"),
+ "flight_cancelled": reservation.get("flight_cancelled", "unknown"),
+ }
+ else:
+ # Fallback for different response structure
+ state_info = {
+ "booking_date": response_data.get("booking_date", "unknown"),
+ "flight_class": response_data.get("flight_class", "unknown"),
+ "travel_insurance": response_data.get("travel_insurance", "unknown"),
+ "flight_cancelled": response_data.get("flight_cancelled", "unknown"),
+ }
+
+ states.append(state_info)
+ print(f" Step {i + 1}: {state_info}")
+ except (json.JSONDecodeError, TypeError) as e:
+ pytest.fail(f"โ Invalid JSON in tool response {i + 1} for env {env_idx}: {response}. Error: {e}")
+
+ # For airline, we expect state to remain consistent between steps (same reservation details)
+ # A mismatch is reported with a printed warning only.
+ if len(states) >= 2:
+ if states[0] == states[1]:
+ print(f" โ
Env {env_idx}: Consistent reservation details between steps")
+ else:
+ print(
+ f" โ ๏ธ Env {env_idx}: Reservation details changed between steps - may indicate session state issues"
+ )
+
+ # Validation 3: Check for repeated states (simple but effective)
+ print("\n๐ Validating no repeated states...")
+ _validate_no_repeated_states(env_recordings, dataset)
+
+ # Validation 4: Check for control plane termination
+ print("\n๐๏ธ Validating control plane termination...")
+ _validate_control_plane_sync(env_recordings, dataset)
+
+ # Validation 5: Check that no tool calls happen after termination
+ print("\n๐ Validating no tool calls after termination...")
+ _validate_no_tool_calls_after_termination(env_recordings, dataset)
+
+ # Validation 6: Check that trajectories properly terminate
+ print("\n๐ Validating trajectory termination...")
+ _validate_trajectory_termination(env_recordings, dataset)
+
+ print("โ
Recording integrity validation completed")
+
+
+def _validate_no_repeated_states(env_recordings: Dict, dataset: List[Dict]):
+ """
+ SIMPLE CRITICAL TEST: Check if there are repeated states within each environment.
+
+ For every environment, derives a state id from each tool message and
+ reports runs of consecutive identical ids. The findings are printed only;
+ nothing in this function fails the test. ``dataset`` is accepted but not
+ read here.
+ """
+ print("๐ Checking for repeated states in trajectories...")
+
+ for env_idx, env_entries in env_recordings.items():
+ # List of (entry number, state id) pairs in recording order.
+ reservation_states = []
+
+ # Extract all reservation state info from tool responses
+ for entry_num, entry in enumerate(env_entries):
+ messages = entry.get("messages", [])
+
+ for msg in messages:
+ if msg["role"] == "tool":
+ try:
+ # Handle both string (JSON) and list (multimodal) content
+ content = msg["content"]
+ if isinstance(content, list):
+ # Multimodal content - extract text part
+ text_content = None
+ for item in content:
+ if item.get("type") == "text":
+ text_content = item.get("text")
+ break
+ if text_content:
+ tool_response = json.loads(text_content)
+ else:
+ tool_response = {}
+ else:
+ # String content - parse as JSON
+ tool_response = json.loads(content)
+
+ # For airline, we track reservation state
+ if "reservation" in tool_response:
+ reservation = tool_response["reservation"]
+ state_id = f"{reservation.get('booking_date', 'unknown')}_{reservation.get('flight_class', 'unknown')}"
+ else:
+ # Responses without a reservation are identified by the hash of their stringified content.
+ state_id = str(hash(str(content)))
+
+ # NOTE(review): state_id is a str on both branches above, so this check looks always true.
+ if state_id is not None:
+ reservation_states.append((entry_num, state_id))
+ except (json.JSONDecodeError, TypeError):
+ # Tool messages that cannot be parsed are ignored for this check.
+ continue
+
+ if len(reservation_states) < 2:
+ print(
+ f" โน๏ธ Env {env_idx}: Only {len(reservation_states)} reservation states recorded, skipping repeated state check"
+ )
+ continue
+
+ # Check for consecutive repeated states
+ # Each recorded run is a (state id, repeat count, entry number where the run started) tuple.
+ repeated_sequences = []
+ current_state = reservation_states[0][1]
+ repeat_count = 1
+ start_step = reservation_states[0][0]
+
+ for step_num, state in reservation_states[1:]:
+ if state == current_state:
+ repeat_count += 1
+ else:
+ if repeat_count > 1:
+ repeated_sequences.append((current_state, repeat_count, start_step))
+ current_state = state
+ repeat_count = 1
+ start_step = step_num
+
+ # Check the last sequence
+ if repeat_count > 1:
+ repeated_sequences.append((current_state, repeat_count, start_step))
+
+ # Report results
+ if repeated_sequences:
+ print(f" โ ๏ธ Env {env_idx}: Found repeated state sequences:")
+ for state, count, start in repeated_sequences:
+ print(f" - State {state} repeated {count} times starting from step {start}")
+
+ # For airline, repeated states are expected as reservation details don't change
+ # Only runs longer than 10 trigger the extra warning below.
+ max_repeats = max(count for _, count, _ in repeated_sequences)
+ if max_repeats > 10:
+ longest_sequence = max(repeated_sequences, key=lambda x: x[1])
+ print(
+ f"โ ๏ธ WARNING: Env {env_idx}: State {longest_sequence[0]} repeated {longest_sequence[1]} times starting from step {longest_sequence[2]}."
+ )
+ print(" This might indicate session state or control plane termination issues.")
+ print(f" All states: {[state for _, state in reservation_states]}")
+ else:
+ print(f" โ
Env {env_idx}: No repeated states detected - good state progression!")
+
+
+def _validate_control_plane_sync(env_recordings: Dict, dataset: List[Dict]):
+ """
+ SIMPLE CRITICAL TEST: Count how many tool messages report terminated=True.
+
+ Walks every tool message that carries a "metadata" key, counts the ones
+ whose metadata has a truthy "terminated" value (per environment and
+ overall), and prints the totals. A run without any termination signal
+ only produces a printed warning; this function does not fail the test.
+ """
+ print("๐ Checking control plane termination data...")
+
+ total_steps = 0
+ terminated_steps = 0
+
+ for env_idx, env_entries in env_recordings.items():
+ env_terminated_count = 0
+ env_total_count = 0
+
+ for entry in env_entries:
+ messages = entry.get("messages", [])
+
+ # Look for tool responses with metadata
+ for msg in messages:
+ if msg["role"] == "tool" and "metadata" in msg:
+ metadata = msg["metadata"]
+ env_total_count += 1
+ total_steps += 1
+
+ # A missing "terminated" key counts as not terminated.
+ if metadata.get("terminated", False):
+ env_terminated_count += 1
+ terminated_steps += 1
+
+ if env_total_count > 0:
+ print(f" Env {env_idx}: {env_terminated_count}/{env_total_count} steps show terminated=True")
+
+ print(f"\n๐ Overall: {terminated_steps}/{total_steps} steps show terminated=True")
+
+ # Note: Some environments may not be recorded if they terminate immediately
+ missing_envs = len(dataset) - len(env_recordings)
+ if missing_envs > 0:
+ print(f" โน๏ธ {missing_envs} environments not recorded (likely terminated immediately)")
+
+ if terminated_steps == 0:
+ print(" โ ๏ธ Warning: No terminated=True found in metadata (may be expected for short runs)")
+ else:
+ print(" โ
Found some termination signals - control plane appears to be working")
+
+
+def _validate_no_tool_calls_after_termination(env_recordings: Dict, dataset: List[Dict]):
+ """
+ CRITICAL TEST: Check that no tool calls happen after an environment is terminated.
+
+ For each environment, finds the first tool message whose metadata reports
+ terminated=True and counts the tool messages with metadata that follow
+ it. A non-zero count fails the test through pytest.fail. ``dataset`` is
+ accepted but not read here.
+ """
+ print("๐ Checking for tool calls after termination...")
+
+ for env_idx, env_entries in env_recordings.items():
+ if not env_entries:
+ continue
+
+ termination_detected = False
+ steps_after_termination = 0
+ termination_step = None
+
+ for entry_idx, entry in enumerate(env_entries):
+ messages = entry.get("messages", [])
+
+ # Look for tool responses with termination signal
+ for msg in messages:
+ if msg["role"] == "tool" and "metadata" in msg:
+ metadata = msg["metadata"]
+ terminated = metadata.get("terminated", False)
+
+ if terminated and not termination_detected:
+ # First termination detected
+ termination_detected = True
+ termination_step = entry_idx
+ print(f" Env {env_idx}: Termination detected at step {termination_step}")
+ elif termination_detected:
+ # Count steps after termination
+ # Incremented for every later tool message with metadata, whatever its own "terminated" value.
+ steps_after_termination += 1
+
+ if termination_detected and steps_after_termination > 0:
+ pytest.fail(
+ f"โ TOOL CALLS AFTER TERMINATION BUG DETECTED in Env {env_idx}: "
+ f"Environment terminated at step {termination_step}, but {steps_after_termination} "
+ f"additional tool calls were made after termination. "
+ f"This violates the environment contract - no actions should be taken on terminated environments. "
+ f"The rollout system should check environment termination status before making tool calls."
+ )
+ elif termination_detected:
+ print(f" โ
Env {env_idx}: No tool calls after termination")
+ else:
+ print(f" โน๏ธ Env {env_idx}: No termination detected in trajectory")
+
+
+def _validate_trajectory_termination(env_recordings: Dict, dataset: List[Dict]):
+ """
+ CRITICAL TEST: Check that trajectories properly terminate with terminated=True at the end.
+
+ For each environment, looks at the metadata of the last tool message that
+ carries metadata and reports whether it shows terminated=True. The
+ outcome is printed only; this function does not fail the test.
+ ``dataset`` is accepted but not read here.
+ """
+ print("๐ Checking trajectory termination patterns...")
+
+ for env_idx, env_entries in env_recordings.items():
+ if not env_entries:
+ continue
+
+ # Look at the last few entries to see if we have proper termination
+ # NOTE(review): last_entry and messages are assigned here but not read again in this function.
+ last_entry = env_entries[-1]
+ messages = last_entry.get("messages", [])
+
+ # Find the last tool response with metadata
+ last_tool_metadata = None
+ total_tool_responses = 0
+
+ for entry in env_entries:
+ for msg in entry.get("messages", []):
+ if msg["role"] == "tool" and "metadata" in msg:
+ # Overwritten on every match, so the final value belongs to the last such message.
+ last_tool_metadata = msg["metadata"]
+ total_tool_responses += 1
+
+ if last_tool_metadata is None:
+ print(f" โ ๏ธ Env {env_idx}: No tool responses with metadata found")
+ continue
+
+ last_terminated = last_tool_metadata.get("terminated", False)
+ # "Steps" here means recorded entries, not tool responses.
+ total_steps = len(env_entries)
+
+ print(
+ f" Env {env_idx}: {total_steps} trajectory steps, {total_tool_responses} tool responses, final terminated={last_terminated}"
+ )
+
+ # For airline, allow non-terminated trajectories as conversations may be ongoing
+ if total_steps >= 8 and not last_terminated:
+ print(f" โ ๏ธ Env {env_idx}: Trajectory has {total_steps} steps but final metadata shows terminated=False.")
+ print(
+ " This might indicate: 1) Conversation still in progress, 2) Control plane sync issues, or 3) User still interacting"
+ )
+ print(f" Last metadata: {last_tool_metadata}")
+ elif last_terminated:
+ print(" โ
Trajectory properly terminated")
+ else:
+ print(f" โน๏ธ Short trajectory ({total_steps} steps) - termination not required")
+
+
+@reward_function
+def tau2_airline_eval(
+ messages: List[Message],
+ nl_assertions: List[str] = None,
+ communicate_info: List[str] = None,
+ actions: List[dict] = None,
+ **kwargs,
+) -> EvaluateResult:
+ """
+ Evaluate airline conversation using tau2-bench multi-component evaluation (NL assertions, communication, actions).
+
+ The conversation is converted to tau2 message objects, scored by the
+ environment, action, communication and NL-assertion evaluators, and the
+ component rewards selected by the reward basis are multiplied together.
+
+ Args:
+ messages: List of Message objects from conversation between agent and customer
+ nl_assertions: List of natural language assertions to evaluate; a generic
+ assertion is substituted when None
+ communicate_info: List of communication requirements to evaluate
+ actions: List of expected actions to evaluate
+ **kwargs: Additional parameters
+
+ Returns:
+ EvaluateResult with combined score from all evaluation components; the
+ reason lists the failed components (or a success line) and metrics is
+ an empty dict
+ """
+ # Default assertions if none provided (should not happen in practice)
+ if nl_assertions is None:
+ nl_assertions = ["The agent handled the customer request appropriately according to airline policy"]
+
+ # Convert Message objects directly to tau2-bench message objects
+ # NOTE(review): messages with any other role are dropped without notice - confirm that is intended.
+ trajectory_objects = []
+ for msg in messages:
+ role = msg.role
+ content = msg.content
+
+ if role == "system":
+ trajectory_objects.append(SystemMessage(role=role, content=content))
+ elif role == "assistant":
+ tau2_tool_calls = []
+ if msg.tool_calls:
+ for tool_call in msg.tool_calls:
+ # Arguments arrive as a JSON string and are decoded before being handed to ToolCall.
+ arguments = json.loads(tool_call.function.arguments)
+ tau2_tool_call = ToolCall(
+ id=tool_call.id,
+ name=tool_call.function.name,
+ arguments=arguments,
+ )
+ tau2_tool_calls.append(tau2_tool_call)
+
+ trajectory_objects.append(AssistantMessage(role=role, content=content, tool_calls=tau2_tool_calls))
+ elif role == "user":
+ trajectory_objects.append(UserMessage(role=role, content=content))
+ elif role == "tool":
+ tool_id = msg.tool_call_id
+ trajectory_objects.append(ToolMessage(id=tool_id, role=role, content=content))
+
+ # NOTE(review): reward is assigned 1.0 again further down before it is first read, so this assignment looks redundant.
+ reward = 1.0
+
+ # communicate_info and actions are forwarded as given, including None.
+ evaluation_criteria = EvaluationCriteria(
+ nl_assertions=nl_assertions,
+ communicate_info=communicate_info,
+ actions=actions,
+ reward_basis=[
+ RewardType.NL_ASSERTION,
+ RewardType.DB,
+ RewardType.COMMUNICATE,
+ RewardType.ACTION,
+ ], # CHANGE THIS TO WHAT YOU WANT TO EVALUATE ON
+ )
+
+ task = Task(
+ id="Filler", evaluation_criteria=evaluation_criteria, user_scenario=UserScenario(instructions="Filler")
+ ) # id and user_scenario are required for the Task type but not used in calculating reward, filler values
+
+ # All four evaluators are always run; the reward basis only decides which results are used below.
+ env_reward_info = EnvironmentEvaluator.calculate_reward(
+ environment_constructor=registry.get_env_constructor("airline"),
+ task=task,
+ full_trajectory=trajectory_objects,
+ )
+ action_reward_info = ActionEvaluator.calculate_reward(
+ task=task,
+ full_trajectory=trajectory_objects,
+ )
+ communicate_reward_info = CommunicateEvaluator.calculate_reward(
+ task=task,
+ full_trajectory=trajectory_objects,
+ )
+ nl_reward_info = NLAssertionsEvaluator.calculate_reward(
+ task=task,
+ full_trajectory=trajectory_objects,
+ )
+
+ reward = 1.0
+ env_bases = {RewardType.DB, RewardType.ENV_ASSERTION}
+ action_bases = {RewardType.ACTION}
+ nl_bases = {RewardType.NL_ASSERTION}
+ comm_bases = {RewardType.COMMUNICATE}
+ task_reward_basis = set(task.evaluation_criteria.reward_basis)
+
+ # The final reward is the product of the selected component rewards.
+ # NOTE(review): reward_breakdown is filled in below but is not part of the returned EvaluateResult.
+ reward_breakdown = {}
+ if task_reward_basis & env_bases:
+ if env_reward_info.reward_breakdown is not None:
+ reward_breakdown.update(env_reward_info.reward_breakdown)
+ reward *= env_reward_info.reward
+ if task_reward_basis & action_bases:
+ if action_reward_info.reward_breakdown is not None:
+ reward_breakdown.update(action_reward_info.reward_breakdown)
+ reward *= action_reward_info.reward
+ if task_reward_basis & nl_bases:
+ if nl_reward_info.reward_breakdown is not None:
+ reward_breakdown.update(nl_reward_info.reward_breakdown)
+ reward *= nl_reward_info.reward
+ if task_reward_basis & comm_bases:
+ if communicate_reward_info.reward_breakdown is not None:
+ reward_breakdown.update(communicate_reward_info.reward_breakdown)
+ reward *= communicate_reward_info.reward
+
+ # Generate reason showing only failed components
+ # A component is reported as failed only when its reward is exactly 0.
+ failed_reasons = []
+
+ if task_reward_basis & env_bases and env_reward_info.reward == 0:
+ failed_reasons.append("โ Environment/DB check failed")
+
+ if task_reward_basis & action_bases and action_reward_info.reward == 0:
+ failed_actions = []
+ if hasattr(action_reward_info, "action_checks") and action_reward_info.action_checks:
+ failed_actions = [
+ f"{ac.action.name}({ac.action.arguments})"
+ for ac in action_reward_info.action_checks
+ if not ac.action_match
+ ]
+ if failed_actions:
+ failed_reasons.append(f"โ Failed actions: {failed_actions}")
+ else:
+ failed_reasons.append("โ Actions failed")
+
+ if task_reward_basis & nl_bases and nl_reward_info.reward == 0:
+ failed_nl = []
+ if hasattr(nl_reward_info, "nl_assertions") and nl_reward_info.nl_assertions:
+ failed_nl = [nla.nl_assertion for nla in nl_reward_info.nl_assertions if not nla.met]
+ if failed_nl:
+ failed_reasons.append(f"โ Failed NL assertions: {failed_nl}")
+ else:
+ failed_reasons.append("โ NL Assertions failed")
+
+ if task_reward_basis & comm_bases and communicate_reward_info.reward == 0:
+ failed_comm = []
+ if hasattr(communicate_reward_info, "communicate_checks") and communicate_reward_info.communicate_checks:
+ failed_comm = [cc.info for cc in communicate_reward_info.communicate_checks if not cc.met]
+ if failed_comm:
+ failed_reasons.append(f"โ Failed communication: {failed_comm}")
+ else:
+ failed_reasons.append("โ Communication failed")
+
+ # If everything passed, show success
+ reason = "\n".join(failed_reasons) if failed_reasons else "โ
All checks passed"
+
+ return EvaluateResult(
+ score=reward,
+ reason=reason,
+ metrics={},
+ )
+
+
+# TODO: add rest of tests, but test_fireworks_multi_airline_environment_sessions is the most important one.
+
+
+@pytest.mark.asyncio
+async def test_fireworks_multi_airline_environment_sessions(
+ multi_env_airline_dataset, fireworks_multi_env_airline_recording_file
+):
+ """Test multi-environment session handling with OpenAIPolicy."""
+
+ print("\n๐งช === FIREWORKS MULTI-ENVIRONMENT SESSION TEST ===")
+
+ # Check if we're in CI mode and have existing recording
+ is_ci = os.environ.get("CI", "").lower() in ["true", "1", "yes"]
+ if is_ci and os.path.exists(fireworks_multi_env_airline_recording_file):
+ print("\n๐ฌ === CI MODE: PLAYBACK ONLY ===")
+
+ # Set up playback environment
+ os.environ["EP_PLAYBACK_FILE"] = fireworks_multi_env_airline_recording_file
+
+ # Create playback policy, using OpenAI policy for vision modality + tool calling
+ playback_policy = ep.OpenAIPolicy(
+ model_id="gpt-4.1",
+ temperature=0.2,
+ max_tokens=8192,
+ )
+
+ assert playback_policy.is_playback_mode(), "Should be in playback mode in CI"
+
+ # Create environments for playback
+ playback_envs = ep.make(
+ "http://localhost:9500/mcp/",
+ dataset=multi_env_airline_dataset,
+ model_id=playback_policy.model_id,
+ )
+
+ # Run playback
+ start_time = time.time()
+ # TODO: figure out how user simulator works for playback
+ playback_evaluation_rows = await ep.rollout(playback_envs, policy=playback_policy, steps=15)
+ playback_duration = time.time() - start_time
+
+ print(f"โ
CI playback completed: {len(playback_evaluation_rows)} evaluation rows in {playback_duration:.2f}s")
+
+ # Clean up environment variable
+ if "EP_PLAYBACK_FILE" in os.environ:
+ del os.environ["EP_PLAYBACK_FILE"]
+
+ return # Skip recording phase in CI
+
+ # ALWAYS remove trajectory file first to avoid confusion
+ if os.path.exists(fireworks_multi_env_airline_recording_file):
+ os.unlink(fireworks_multi_env_airline_recording_file)
+ print(f"๐งน Removed existing trajectory file: {fireworks_multi_env_airline_recording_file}")
+
+ # Start server for this test
+ server = _create_test_server(9700)
+ try:
+ # Set up recording
+ os.environ["EP_PLAYBACK_FILE"] = fireworks_multi_env_airline_recording_file
+
+ # Create OpenAIPolicy for multi-environment testing
+ policy = ep.OpenAIPolicy(
+ model_id="gpt-4.1",
+ # temperature=0.2,
+ max_tokens=4096,
+ )
+
+ assert not policy.is_playback_mode(), "Should be in recording mode initially"
+
+ # Create multiple environments
+ envs = ep.make(
+ f"http://localhost:{server.port}/mcp/",
+ dataset=multi_env_airline_dataset,
+ model_id=policy.model_id,
+ )
+
+ print(f"๐ Created {len(envs.sessions)} environment sessions")
+
+ # Run rollout with multiple environments (fewer steps for LLM efficiency)
+ start_time = time.time()
+ evaluation_rows = await ep.rollout(envs, policy=policy, steps=15)
+ duration = time.time() - start_time
+
+ # Validate results
+ assert len(evaluation_rows) == len(multi_env_airline_dataset), (
+ "Should have evaluation row for each environment"
+ )
+ assert all(eval_row.get_steps() > 0 for eval_row in evaluation_rows), "All evaluation rows should have steps"
+
+ print(
+ f"โ
OpenAIPolicy multi-environment test completed with {len(evaluation_rows)} evaluation rows in {duration:.2f}s"
+ )
+ print(f"๐ OpenAIPolicy multi-environment recording saved to: {fireworks_multi_env_airline_recording_file}")
+
+ # Print evaluation summaries
+ print("๐ OpenAIPolicy Multi-Environment Evaluation Summary:")
+ for i, eval_row in enumerate(evaluation_rows):
+ dataset_entry = multi_env_airline_dataset[i]
+ seed = dataset_entry.get("environment_context", {}).get("seed", "N/A")
+ domain = dataset_entry.get("environment_context", {}).get("domain", "N/A")
+ print(
+ f" Evaluation {i} (domain: {domain}, seed: {seed}): {eval_row.get_steps()} steps, reward: {eval_row.get_total_reward():.2f}, terminated: {eval_row.get_terminated()}, termination: {eval_row.get_termination_reason()}"
+ )
+ # Actions are no longer available in EvaluationRow (they're embedded in messages)
+ print(f" Messages: {len(eval_row.messages)} total")
+
+ # Validate that different configurations produce different environments
+ unique_rewards = set(eval_row.get_total_reward() for eval_row in evaluation_rows)
+ print(f"๐ Unique rewards across environments: {unique_rewards}")
+
+ # ๐ CRITICAL VALIDATIONS
+ await _validate_recording_integrity(fireworks_multi_env_airline_recording_file, multi_env_airline_dataset)
+
+ # ๐งช TAU2 REWARD FUNCTION EVALUATION
+ print(f"\n๐ฏ Evaluating {len(evaluation_rows)} evaluation rows using messages field")
+
+ for env_idx, eval_row in enumerate(evaluation_rows):
+ evaluation_criteria = multi_env_airline_dataset[env_idx]["evaluation_criteria"]
+ nl_assertions = evaluation_criteria["nl_assertions"]
+ communicate_info = evaluation_criteria["communicate_info"]
+ actions = evaluation_criteria["actions"]
+
+ print(f"\n๐ Environment {env_idx} conversation history:")
+ print(f" Messages: {len(eval_row.messages)} total")
+
+ eval = tau2_airline_eval(eval_row.messages, nl_assertions, communicate_info, actions)
+
+ # Print evaluation result details
+ print(f"๐ฏ Evaluation Result for env {env_idx}:")
+ print(f" Score: {eval.score}")
+ print(f" Reason: {eval.reason}")
+ print(f" Metrics ({len(eval.metrics)} total):")
+ for metric_name, metric_result in eval.metrics.items():
+ print(
+ f" {metric_name}: score={metric_result.score:.2f}, success={metric_result.is_score_valid}, reason='{metric_result.reason}'"
+ )
+
+ # Clean up
+ await envs.close()
+ if "EP_PLAYBACK_FILE" in os.environ:
+ del os.environ["EP_PLAYBACK_FILE"]
+
+ finally:
+ # Always stop the server
+ _stop_test_server(server)
+
+
def _airline_difficulty(avg_score: float) -> str:
    """Bucket an average score into the difficulty labels used by the summary files."""
    if avg_score > 0.8:
        return "easy"
    if avg_score > 0.5:
        return "medium"
    return "hard"


def _build_airline_summary(records: List[Dict], total_key: str) -> Dict:
    """Aggregate per-scenario records into the summary document written next to them.

    Args:
        records: Evaluation or trajectory records; each must carry ``model_id``,
            ``scenario_id`` and ``evaluation["score"]``. ``cost_info`` is optional.
        total_key: Name of the record-count field inside ``evaluation_summary``
            (``"total_evaluations"`` or ``"total_trajectories"``).

    Returns:
        A JSON-serializable dict with ``evaluation_summary``, ``model_performance``
        and ``scenario_difficulty`` sections.
    """
    model_id = records[0]["model_id"] if records else "unknown"
    record_count = len(records)

    # Group scores in one pass instead of re-scanning every record per scenario.
    scores_by_scenario: Dict[str, List[float]] = {}
    for record in records:
        scores_by_scenario.setdefault(record["scenario_id"], []).append(record["evaluation"]["score"])

    total_score = sum(record["evaluation"]["score"] for record in records)
    total_cost = sum(record.get("cost_info", {}).get("total_cost", 0) for record in records)
    total_tokens = sum(record.get("cost_info", {}).get("total_tokens", 0) for record in records)
    avg_score = total_score / record_count if record_count else 0

    scenario_difficulty = {}
    # Sorted so the summary file is stable from run to run.
    for scenario_id in sorted(scores_by_scenario):
        scenario_scores = scores_by_scenario[scenario_id]
        scenario_avg = sum(scenario_scores) / len(scenario_scores)
        scenario_difficulty[scenario_id] = {
            "models_tested": 1,  # Single model
            "total_score": sum(scenario_scores),
            "average_score": scenario_avg,
            "difficulty": _airline_difficulty(scenario_avg),
        }

    return {
        "evaluation_summary": {
            total_key: record_count,
            "model_evaluated": model_id,
            "scenarios_evaluated": sorted(scores_by_scenario),
            "timestamp": datetime.now().isoformat(),
        },
        "model_performance": {
            model_id: {
                "total_scenarios": record_count,
                "total_score": total_score,
                "average_score": avg_score,
                # Equals the average only if scores are 0 or 1 -- TODO confirm with the evaluator.
                "pass_rate": avg_score,
                "total_cost": total_cost,
                "average_cost_per_scenario": total_cost / record_count if record_count else 0,
                "total_tokens": total_tokens,
            }
        },
        "scenario_difficulty": scenario_difficulty,
    }


def _write_airline_jsonl(records: List[Dict], output_file: str = "evaluation_outputs/all_evaluations.jsonl") -> Path:
    """Write ``records`` as JSONL (one JSON object per line) and return the file path."""
    output_path = Path(output_file)
    # parents=True so a nested output location does not fail on a fresh checkout.
    output_path.parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as f:
        for record in records:
            json.dump(record, f, default=str)
            f.write("\n")

    print(f"Saved JSONL file: {output_path}")
    return output_path


def _write_airline_record_files(records: List[Dict], output_dir: str, kind: str, total_key: str) -> Path:
    """Write one JSON file per record plus a ``<kind>_summary.json`` into ``output_dir``.

    Args:
        records: Records to persist (see ``_build_airline_summary`` for required keys).
        output_dir: Directory to write into; created if missing.
        kind: ``"evaluation"`` or ``"trajectory"``; used in file names and log lines.
        total_key: Count field name forwarded to ``_build_airline_summary``.

    Returns:
        The output directory as a ``Path``.
    """
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    for record in records:
        # Model ids may contain path separators (e.g. "accounts/..."); flatten them for the file name.
        safe_model_id = record["model_id"].replace("/", "_").replace("\\", "_")
        filepath = output_path / f"{safe_model_id}_{record['scenario_id']}_{kind}.json"
        with open(filepath, "w", encoding="utf-8") as f:
            json.dump(record, f, indent=2, default=str)

    summary_path = output_path / f"{kind}_summary.json"
    with open(summary_path, "w", encoding="utf-8") as f:
        json.dump(_build_airline_summary(records, total_key), f, indent=2, default=str)

    print(f"\nSaved {kind} files to: {output_path}")
    print(f"   - {len(records)} individual {kind} files")
    print(f"   - 1 {kind} summary file")
    return output_path


def _airline_records(model_id: str, scenario: Dict, scenario_idx: int, eval_row, eval_result) -> tuple:
    """Build the ``(evaluation_record, trajectory_record)`` pair for one rollout.

    The trajectory record carries the same fields as the evaluation record plus
    the full message list of the conversation.
    """
    scenario_id = scenario.get("id", f"scenario_{scenario_idx}")
    shared_fields = {
        "evaluation": {
            "score": eval_result.score,
            "reason": eval_result.reason,
            "metrics": {
                name: {"score": metric.score, "success": metric.is_score_valid, "reason": metric.reason}
                for name, metric in eval_result.metrics.items()
            },
        },
        "evaluation_criteria": scenario["evaluation_criteria"],
        "conversation_length": len(eval_row.messages),
        "trajectory_steps": eval_row.get_steps(),
        "cost_info": {
            "total_cost": 0.0,  # Could be extracted from usage stats if available
            "total_tokens": 0,  # Could be extracted from usage stats if available
            "cost_source": "not_tracked",
        },
        "timestamp": datetime.now().isoformat(),
    }
    evaluation_record = {"model_id": model_id, "scenario_id": scenario_id, **shared_fields}
    trajectory_record = {
        "model_id": model_id,
        "scenario_id": scenario_id,
        "messages": [
            {"role": msg.role, "content": msg.content, "tool_calls": getattr(msg, "tool_calls", None)}
            for msg in eval_row.messages
        ],
        **shared_fields,
    }
    return evaluation_record, trajectory_record


@pytest.mark.asyncio
async def test_entire_airline_dataset(multi_env_airline_full_dataset, fireworks_multi_env_airline_recording_file):
    """Run the full airline dataset through OpenAIPolicy, score it, and save the results.

    In CI, when a recording already exists, the rollout is only replayed from
    that recording. Otherwise the existing recording is deleted, a local test
    server is started, a fresh rollout is recorded, every row is scored with
    ``tau2_airline_eval``, and evaluation/trajectory files are written to
    ``evaluation_outputs/`` and ``trajectory_outputs/``.

    ``EP_PLAYBACK_FILE`` is always removed from the environment on exit, and the
    environments and server are always shut down, even when the test fails.
    """
    dataset = multi_env_airline_full_dataset
    recording_file = fireworks_multi_env_airline_recording_file

    print("\n=== FIREWORKS MULTI-ENVIRONMENT SESSION TEST ===")

    # Check if we're in CI mode and have existing recording
    is_ci = os.environ.get("CI", "").lower() in ["true", "1", "yes"]
    if is_ci and os.path.exists(recording_file):
        print("\n=== CI MODE: PLAYBACK ONLY ===")

        # Set up playback environment; cleared in `finally` so a failure cannot leak it into other tests.
        os.environ["EP_PLAYBACK_FILE"] = recording_file
        try:
            playback_policy = ep.OpenAIPolicy(
                model_id="gpt-4.1",
                temperature=0.0,
                max_tokens=8192,
            )
            assert playback_policy.is_playback_mode(), "Should be in playback mode in CI"

            # NOTE(review): no server is started in this branch and the port is hard-coded;
            # presumably playback never contacts it -- confirm. The playback environments
            # are also never closed here, unlike in the sibling mock/retail tests.
            playback_envs = ep.make(
                "http://localhost:9500/mcp/",
                dataset=dataset,
                model_id=playback_policy.model_id,
            )

            start_time = time.time()
            # TODO: figure out how user simulator works for playback
            playback_evaluation_rows = await ep.rollout(playback_envs, policy=playback_policy, steps=15)
            playback_duration = time.time() - start_time

            print(
                f"CI playback completed: {len(playback_evaluation_rows)} evaluation rows in {playback_duration:.2f}s"
            )
        finally:
            os.environ.pop("EP_PLAYBACK_FILE", None)

        return  # Skip recording phase in CI

    # ALWAYS remove trajectory file first to avoid confusion
    if os.path.exists(recording_file):
        os.unlink(recording_file)
        print(f"Removed existing trajectory file: {recording_file}")

    # Start server for this test
    server = _create_test_server(9700)
    envs = None
    try:
        # Set up recording
        os.environ["EP_PLAYBACK_FILE"] = recording_file

        policy = ep.OpenAIPolicy(
            model_id="gpt-4.1",
            temperature=0.0,
            max_tokens=4096,
        )
        assert not policy.is_playback_mode(), "Should be in recording mode initially"

        # Create multiple environments
        envs = ep.make(
            f"http://localhost:{server.port}/mcp/",
            dataset=dataset,
            model_id=policy.model_id,
        )
        print(f"Created {len(envs.sessions)} environment sessions")

        # Run the rollout over the whole dataset, at most 16 rollouts at a time.
        start_time = time.time()
        evaluation_rows = await ep.rollout(envs, policy=policy, steps=30, max_concurrent_rollouts=16)
        duration = time.time() - start_time

        # Validate results
        assert len(evaluation_rows) == len(dataset), "Should have evaluation row for each environment"
        assert all(eval_row.get_steps() > 0 for eval_row in evaluation_rows), "All evaluation rows should have steps"

        print(
            f"OpenAIPolicy multi-environment test completed with {len(evaluation_rows)} evaluation rows in {duration:.2f}s"
        )
        print(f"OpenAIPolicy multi-environment recording saved to: {recording_file}")

        # Print evaluation summaries
        print("OpenAIPolicy Multi-Environment Evaluation Summary:")
        for i, eval_row in enumerate(evaluation_rows):
            environment_context = dataset[i].get("environment_context", {})
            seed = environment_context.get("seed", "N/A")
            domain = environment_context.get("domain", "N/A")
            print(
                f"  Evaluation {i} (domain: {domain}, seed: {seed}): {eval_row.get_steps()} steps, reward: {eval_row.get_total_reward():.2f}, terminated: {eval_row.get_terminated()}, termination: {eval_row.get_termination_reason()}"
            )
            # Actions are no longer available in EvaluationRow (they're embedded in messages)
            print(f"    Messages: {len(eval_row.messages)} total")

        # Informational only: the distinct rewards are printed, not asserted on.
        unique_rewards = {eval_row.get_total_reward() for eval_row in evaluation_rows}
        print(f"Unique rewards across environments: {unique_rewards}")

        # CRITICAL VALIDATIONS
        await _validate_recording_integrity(recording_file, dataset)

        # TAU2 REWARD FUNCTION EVALUATION
        print(f"\nEvaluating {len(evaluation_rows)} evaluation rows using messages field")

        all_evaluation_records: List[Dict] = []
        all_trajectory_records: List[Dict] = []

        for env_idx, eval_row in enumerate(evaluation_rows):
            scenario = dataset[env_idx]
            evaluation_criteria = scenario["evaluation_criteria"]
            nl_assertions = evaluation_criteria["nl_assertions"]
            communicate_info = evaluation_criteria["communicate_info"]
            actions = evaluation_criteria["actions"]

            print(f"\nEnvironment {env_idx} conversation history:")
            print(f"  Messages: {len(eval_row.messages)} total")
            print(
                f"  Evaluation criteria: {len(nl_assertions)} NL assertions, {len(communicate_info)} communication requirements, {len(actions)} actions"
            )

            eval_result = tau2_airline_eval(eval_row.messages, nl_assertions, communicate_info, actions)

            # Print evaluation result details
            print(f"Evaluation Result for env {env_idx}:")
            print(f"  Score: {eval_result.score}")
            print(f"  Reason: {eval_result.reason}")
            print(f"  Metrics ({len(eval_result.metrics)} total):")
            for metric_name, metric_result in eval_result.metrics.items():
                print(
                    f"    {metric_name}: score={metric_result.score:.2f}, success={metric_result.is_score_valid}, reason='{metric_result.reason}'"
                )

            evaluation_record, trajectory_record = _airline_records(
                policy.model_id, scenario, env_idx, eval_row, eval_result
            )
            all_evaluation_records.append(evaluation_record)
            all_trajectory_records.append(trajectory_record)

        # Summary Statistics
        print("\nSummary Statistics:")
        scores = [record["evaluation"]["score"] for record in all_evaluation_records]
        total_score = sum(scores)
        avg_score = total_score / len(scores) if scores else 0
        total_cost = sum(record["cost_info"]["total_cost"] for record in all_evaluation_records)

        print(
            f"  {policy.model_id}: {avg_score:.2%} success rate ({total_score}/{len(scores)}) - Cost: ${total_cost:.2f}"
        )
        print(f"\nTotal evaluation cost: ${total_cost:.2f}")
        # Every record hard-codes cost_source="not_tracked", so do not claim real usage data.
        print("Cost is not tracked for this run; the cost figures above are placeholders.")

        _write_airline_jsonl(all_evaluation_records)
        _write_airline_record_files(all_evaluation_records, "evaluation_outputs", "evaluation", "total_evaluations")
        _write_airline_record_files(all_trajectory_records, "trajectory_outputs", "trajectory", "total_trajectories")

    finally:
        try:
            # Close sessions even when the rollout or an assertion failed.
            if envs is not None:
                await envs.close()
        finally:
            os.environ.pop("EP_PLAYBACK_FILE", None)
            # Always stop the server
            _stop_test_server(server)
+
+
@pytest.mark.asyncio
async def test_fireworks_multi_mock_environment_sessions(
    multi_env_mock_dataset, fireworks_multi_env_mock_recording_file
):
    """Exercise multi-environment rollouts with OpenAIPolicy on the mock domain.

    In CI, when a recording already exists, the rollout is replayed from it
    against a local test server and nothing is scored. Otherwise the existing
    recording is deleted, a new rollout is recorded, and every row is scored.

    ``EP_PLAYBACK_FILE`` is always removed from the environment on exit, and the
    environments and server are always shut down, even when the test fails.
    """
    dataset = multi_env_mock_dataset
    recording_file = fireworks_multi_env_mock_recording_file

    print("\n=== FIREWORKS MULTI-ENVIRONMENT SESSION TEST (MOCK DOMAIN) ===")

    # Check if we're in CI mode and have existing recording
    is_ci = os.environ.get("CI", "").lower() in ["true", "1", "yes"]
    if is_ci and os.path.exists(recording_file):
        print("\n=== CI MODE: PLAYBACK ONLY ===")

        # Set before the policy and server are created, as in the original flow;
        # everything after this line is covered by the cleanup below.
        os.environ["EP_PLAYBACK_FILE"] = recording_file
        server = None
        envs = None
        try:
            playback_policy = ep.OpenAIPolicy(
                model_id="gpt-4.1",
                temperature=0.2,
                system_prompt="You are a helpful task management assistant.",
            )

            server = _create_test_server(8021, domain="mock")  # Use unique port for mock

            envs = ep.make(
                f"http://localhost:{server.port}/mcp/",
                dataset=dataset,
                model_id=playback_policy.model_id,
            )

            evaluation_rows = await ep.rollout(envs, policy=playback_policy, steps=10)

            print(f"Playback completed with {len(evaluation_rows)} evaluation rows")
        finally:
            try:
                if envs is not None:
                    await envs.close()
            finally:
                if server is not None:
                    _stop_test_server(server)
                os.environ.pop("EP_PLAYBACK_FILE", None)

        return

    # RECORDING MODE
    print("\n=== RECORDING MODE ===")

    # Remove existing recording for clean run
    if os.path.exists(recording_file):
        os.remove(recording_file)
        print(f"Removed existing recording: {recording_file}")

    # Start server (before EP_PLAYBACK_FILE is set, as in the original flow)
    server = _create_test_server(8021, domain="mock")  # Use unique port for mock
    envs = None
    try:
        # Set up recording
        os.environ["EP_PLAYBACK_FILE"] = recording_file

        policy = ep.OpenAIPolicy(
            model_id="gpt-4.1",
            temperature=0.2,
            system_prompt="You are a helpful task management assistant.",
        )
        assert not policy.is_playback_mode(), "Should be in recording mode initially"

        # Create multiple environments
        envs = ep.make(
            f"http://localhost:{server.port}/mcp/",
            dataset=dataset,
            model_id=policy.model_id,
        )
        print(f"Created {len(envs.sessions)} environment sessions")

        # Run rollout with multiple environments
        start_time = time.time()
        evaluation_rows = await ep.rollout(envs, policy=policy, steps=15)
        duration = time.time() - start_time

        # Validate results
        assert len(evaluation_rows) == len(dataset), "Should have evaluation row for each environment"
        assert all(eval_row.get_steps() > 0 for eval_row in evaluation_rows), "All evaluation rows should have steps"

        print(
            f"Mock domain multi-environment test completed with {len(evaluation_rows)} evaluation rows in {duration:.2f}s"
        )
        print(f"Mock domain recording saved to: {recording_file}")

        # Print evaluation summaries
        print("Mock Domain Multi-Environment Evaluation Summary:")
        for i, eval_row in enumerate(evaluation_rows):
            domain = dataset[i].get("environment_context", {}).get("domain", "N/A")
            print(
                f"  Evaluation {i} (domain: {domain}): {eval_row.get_steps()} steps, reward: {eval_row.get_total_reward():.2f}, terminated: {eval_row.get_terminated()}"
            )

        # TAU2 REWARD FUNCTION EVALUATION
        # NOTE(review): the evaluator is named for the airline domain but is applied to
        # mock-domain rows here -- confirm it is domain-agnostic.
        print(f"\nEvaluating {len(evaluation_rows)} mock domain evaluation rows")

        for env_idx, eval_row in enumerate(evaluation_rows):
            evaluation_criteria = dataset[env_idx]["evaluation_criteria"]
            nl_assertions = evaluation_criteria["nl_assertions"]
            communicate_info = evaluation_criteria["communicate_info"]
            actions = evaluation_criteria["actions"]

            print(f"\nEnvironment {env_idx} conversation history:")
            print(f"  Messages: {len(eval_row.messages)} total")
            print(
                f"  Evaluation criteria: {len(nl_assertions)} NL assertions, {len(communicate_info)} communication requirements, {len(actions)} actions"
            )

            eval_result = tau2_airline_eval(eval_row.messages, nl_assertions, communicate_info, actions)

            # Print evaluation result details
            print(f"Evaluation Result for env {env_idx}:")
            print(f"  Score: {eval_result.score}")
            print(f"  Reason: {eval_result.reason}")

    finally:
        try:
            # Close sessions even when the rollout or an assertion failed.
            if envs is not None:
                await envs.close()
        finally:
            os.environ.pop("EP_PLAYBACK_FILE", None)
            # Always stop the server
            _stop_test_server(server)
+
+
@pytest.mark.asyncio
async def test_fireworks_multi_retail_environment_sessions(
    multi_env_retail_dataset, fireworks_multi_env_retail_recording_file
):
    """Exercise multi-environment rollouts with OpenAIPolicy on the retail domain.

    In CI, when a recording already exists, the rollout is replayed from it
    against a local test server and nothing is scored. Otherwise the existing
    recording is deleted, a new rollout is recorded, and every row is scored.

    ``EP_PLAYBACK_FILE`` is always removed from the environment on exit, and the
    environments and server are always shut down, even when the test fails.
    """
    dataset = multi_env_retail_dataset
    recording_file = fireworks_multi_env_retail_recording_file

    print("\n=== FIREWORKS MULTI-ENVIRONMENT SESSION TEST (RETAIL DOMAIN) ===")

    # Check if we're in CI mode and have existing recording
    is_ci = os.environ.get("CI", "").lower() in ["true", "1", "yes"]
    if is_ci and os.path.exists(recording_file):
        print("\n=== CI MODE: PLAYBACK ONLY ===")

        # Set before the policy and server are created, as in the original flow;
        # everything after this line is covered by the cleanup below.
        os.environ["EP_PLAYBACK_FILE"] = recording_file
        server = None
        envs = None
        try:
            playback_policy = ep.OpenAIPolicy(
                model_id="gpt-4.1",
                temperature=0.2,
                system_prompt="You are a helpful retail customer service agent.",
            )

            server = _create_test_server(8022, domain="retail")  # Use unique port for retail

            envs = ep.make(
                f"http://localhost:{server.port}/mcp/",
                dataset=dataset,
                model_id=playback_policy.model_id,
            )

            evaluation_rows = await ep.rollout(envs, policy=playback_policy, steps=10)

            print(f"Playback completed with {len(evaluation_rows)} evaluation rows")
        finally:
            try:
                if envs is not None:
                    await envs.close()
            finally:
                if server is not None:
                    _stop_test_server(server)
                os.environ.pop("EP_PLAYBACK_FILE", None)

        return

    # RECORDING MODE
    print("\n=== RECORDING MODE ===")

    # Remove existing recording for clean run
    if os.path.exists(recording_file):
        os.remove(recording_file)
        print(f"Removed existing recording: {recording_file}")

    # Start server (before EP_PLAYBACK_FILE is set, as in the original flow)
    server = _create_test_server(8022, domain="retail")  # Use unique port for retail
    envs = None
    try:
        # Set up recording
        os.environ["EP_PLAYBACK_FILE"] = recording_file

        policy = ep.OpenAIPolicy(
            model_id="gpt-4.1",
            temperature=0.2,
            system_prompt="You are a helpful retail customer service agent.",
        )
        assert not policy.is_playback_mode(), "Should be in recording mode initially"

        # Create multiple environments
        envs = ep.make(
            f"http://localhost:{server.port}/mcp/",
            dataset=dataset,
            model_id=policy.model_id,
        )
        print(f"Created {len(envs.sessions)} environment sessions")

        # Run rollout with multiple environments
        start_time = time.time()
        evaluation_rows = await ep.rollout(envs, policy=policy, steps=15)
        duration = time.time() - start_time

        # Validate results
        assert len(evaluation_rows) == len(dataset), "Should have evaluation row for each environment"
        assert all(eval_row.get_steps() > 0 for eval_row in evaluation_rows), "All evaluation rows should have steps"

        print(
            f"Retail domain multi-environment test completed with {len(evaluation_rows)} evaluation rows in {duration:.2f}s"
        )
        print(f"Retail domain recording saved to: {recording_file}")

        # Print evaluation summaries
        print("Retail Domain Multi-Environment Evaluation Summary:")
        for i, eval_row in enumerate(evaluation_rows):
            domain = dataset[i].get("environment_context", {}).get("domain", "N/A")
            print(
                f"  Evaluation {i} (domain: {domain}): {eval_row.get_steps()} steps, reward: {eval_row.get_total_reward():.2f}, terminated: {eval_row.get_terminated()}"
            )

        # TAU2 REWARD FUNCTION EVALUATION
        # NOTE(review): the evaluator is named for the airline domain but is applied to
        # retail-domain rows here -- confirm it is domain-agnostic.
        print(f"\nEvaluating {len(evaluation_rows)} retail domain evaluation rows")

        for env_idx, eval_row in enumerate(evaluation_rows):
            evaluation_criteria = dataset[env_idx]["evaluation_criteria"]
            nl_assertions = evaluation_criteria["nl_assertions"]
            communicate_info = evaluation_criteria["communicate_info"]
            actions = evaluation_criteria["actions"]

            print(f"\nEnvironment {env_idx} conversation history:")
            print(f"  Messages: {len(eval_row.messages)} total")
            print(
                f"  Evaluation criteria: {len(nl_assertions)} NL assertions, {len(communicate_info)} communication requirements, {len(actions)} actions"
            )

            eval_result = tau2_airline_eval(eval_row.messages, nl_assertions, communicate_info, actions)

            # Print evaluation result details
            print(f"Evaluation Result for env {env_idx}:")
            print(f"  Score: {eval_result.score}")
            print(f"  Reason: {eval_result.reason}")

    finally:
        try:
            # Close sessions even when the rollout or an assertion failed.
            if envs is not None:
                await envs.close()
        finally:
            os.environ.pop("EP_PLAYBACK_FILE", None)
            # Always stop the server
            _stop_test_server(server)
+
+
if __name__ == "__main__":
    # Allow running directly for debugging. Propagate pytest's exit code so a
    # failing run is visible to the shell instead of always exiting 0.
    raise SystemExit(pytest.main([__file__, "-v", "-s"]))
diff --git a/eval_protocol/pytest/utils.py b/eval_protocol/pytest/utils.py
index bed1ac89..429c3328 100644
--- a/eval_protocol/pytest/utils.py
+++ b/eval_protocol/pytest/utils.py
@@ -300,12 +300,12 @@ async def execute_row_with_backoff(task: asyncio.Task, row: EvaluationRow) -> Ev
except Exception as retry_error:
# Backoff gave up
row.rollout_status.status = RolloutStatus.Status.ERROR
- row.rollout_status.termination_reason = str(retry_error)
+ # row.rollout_status.termination_reason = str(retry_error)
return row
else:
# Non-retryable exception - fail immediately
row.rollout_status.status = RolloutStatus.Status.ERROR
- row.rollout_status.termination_reason = str(e)
+ # row.rollout_status.termination_reason = str(e)
return row
# Process all tasks concurrently with backoff retry
diff --git a/pyproject.toml b/pyproject.toml
index f79d6637..739d172f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -145,6 +145,10 @@ include = ["eval_protocol*", "development*", "vendor*"]
[tool.setuptools.package-data]
"eval_protocol" = ["../vite-app/dist/**/*"]
+"eval_protocol.mcp_servers.tau2" = ["*.md", "tests/system_prompts/*.md"]
+"eval_protocol.benchmarks" = ["data/*.jsonl"]
+"vendor.tau2" = ["data/**/*.md"]
+
[tool.versioneer]
VCS = "git"