From 8b01ab4dfc97308b083b040681b46c41b3d6bf99 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Tue, 19 Aug 2025 19:39:38 -0700 Subject: [PATCH 1/4] add examples --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 97998892..f44b1c8d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -140,7 +140,7 @@ ep = "eval_protocol.cli:main" eval_protocol = "eval_protocol.pytest.plugin" [tool.setuptools.packages.find] -include = ["eval_protocol*", "development*", "vendor*"] +include = ["eval_protocol*", "development*", "vendor*", "examples*"] [tool.setuptools.package-data] "eval_protocol" = ["../vite-app/dist/**/*"] From 5f0b275a485f0230f9baa76f480a09864e0e99e2 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Tue, 19 Aug 2025 19:42:37 -0700 Subject: [PATCH 2/4] move retail_dataset.jsonl --- .../benchmarks/data/retail_dataset.jsonl | 114 ++++++++++++++++++ .../benchmarks/test_tau_bench_retail.py | 2 +- 2 files changed, 115 insertions(+), 1 deletion(-) create mode 100644 eval_protocol/benchmarks/data/retail_dataset.jsonl diff --git a/eval_protocol/benchmarks/data/retail_dataset.jsonl b/eval_protocol/benchmarks/data/retail_dataset.jsonl new file mode 100644 index 00000000..8e0d1964 --- /dev/null +++ b/eval_protocol/benchmarks/data/retail_dataset.jsonl @@ -0,0 +1,114 @@ +{"id": "retail_task_0", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou received your order #W2378156 and wish to exchange the mechanical keyboard for a similar one but with clicky switches and the smart thermostat for one compatible with Google Home instead of Apple HomeKit. If there is no keyboard that is clicky, RGB backlight, full size, you'd go for no backlight.\nKnown info:\n\tYou are Yusuf Rossi in zip code 19122.\nUnknown info:\n\tYou do not remember your email address.\nTask instructions:\n\tYou are detail-oriented and want to make sure everything is addressed in one go."}, "evaluation_criteria": {"actions": [{"action_id": "0_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Yusuf", "last_name": "Rossi", "zip": "19122"}, "info": null}, {"action_id": "0_1", "name": "get_order_details", "arguments": {"order_id": "#W2378156"}, "info": null}, {"action_id": "0_2", "name": "get_product_details", "arguments": {"product_id": "1656367028"}, "info": null}, {"action_id": "0_3", "name": "get_product_details", "arguments": {"product_id": "4896585277"}, "info": null}, {"action_id": "0_4", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W2378156", "item_ids": ["1151293680", "4983901480"], "new_item_ids": ["7706410293", "7747408585"], "payment_method_id": "credit_card_9513926"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_1", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou received your order #W2378156 and wish to exchange the mechanical keyboard for a similar one but with clicky switches and the smart thermostat for one compatible with Google Home instead of Apple HomeKit. If there is no keyboard that is clicky, RGB backlight, full size, you'd rather only exchange the thermostat.\nKnown info:\n\tYou are Yusuf Rossi in zip code 19122.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are detail-oriented and want to make sure everything is addressed in one go."}, "evaluation_criteria": {"actions": [{"action_id": "1_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Yusuf", "last_name": "Rossi", "zip": "19122"}, "info": null}, {"action_id": "1_1", "name": "get_order_details", "arguments": {"order_id": "#W2378156"}, "info": null}, {"action_id": "1_2", "name": "get_product_details", "arguments": {"product_id": "1656367028"}, "info": null}, {"action_id": "1_3", "name": "get_product_details", "arguments": {"product_id": "4896585277"}, "info": null}, {"action_id": "1_4", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W2378156", "item_ids": ["4983901480"], "new_item_ids": ["7747408585"], "payment_method_id": "credit_card_9513926"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_2", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to know how many tshirt options are available in the online store right now. You want to also return the cleaner, headphone, and smart watch.\nKnown info:\n\tYou are Yusuf Rossi in zip code 19122.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\t."}, "evaluation_criteria": {"actions": [{"action_id": "2_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Yusuf", "last_name": "Rossi", "zip": "19122"}, "info": null}, {"action_id": "2_1", "name": "get_product_details", "arguments": {"product_id": "6086499569"}, "info": null}, {"action_id": "2_3", "name": "get_product_details", "arguments": {"product_id": "9523456873"}, "info": null}, {"action_id": "2_4", "name": "get_user_details", "arguments": {"user_id": "yusuf_rossi_9620"}, "info": null}, {"action_id": "2_5", "name": "get_order_details", "arguments": {"order_id": "#W6247578"}, "info": null}, {"action_id": "2_6", "name": "get_order_details", "arguments": {"order_id": "#W9711842"}, "info": null}, {"action_id": "2_7", "name": "get_order_details", "arguments": {"order_id": "#W4776164"}, "info": null}, {"action_id": "2_8", "name": "get_order_details", "arguments": {"order_id": "#W6679257"}, "info": null}, {"action_id": "2_9", "name": "get_order_details", "arguments": {"order_id": "#W2378156"}, "info": null}, {"action_id": "2_10", "name": "get_product_details", "arguments": {"product_id": "9523456873"}, "info": null}, {"action_id": "2_11", "name": "return_delivered_order_items", "arguments": {"order_id": "#W2378156", "item_ids": ["4602305039", "4202497723", "9408160950"], "payment_method_id": "credit_card_9513926"}, "info": null}], "communicate_info": ["10"], "nl_assertions": null}} +{"id": "retail_task_3", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to know how many tshirt options are available in the online store right now. You want to modify all your pending small tshirt to purple, same size, same v-neck, and prefer polyester.\nKnown info:\n\tYou are Yusuf Rossi in zipcode 19122.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are a private person that does not want to reveal much about yourself."}, "evaluation_criteria": {"actions": [{"action_id": "3_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Yusuf", "last_name": "Rossi", "zip": "19122"}, "info": null}, {"action_id": "3_1", "name": "get_product_details", "arguments": {"product_id": "6086499569"}, "info": null}, {"action_id": "3_3", "name": "get_product_details", "arguments": {"product_id": "9523456873"}, "info": null}, {"action_id": "3_4", "name": "get_user_details", "arguments": {"user_id": "yusuf_rossi_9620"}, "info": null}, {"action_id": "3_5", "name": "get_order_details", "arguments": {"order_id": "#W6247578"}, "info": null}, {"action_id": "3_6", "name": "get_order_details", "arguments": {"order_id": "#W9711842"}, "info": null}, {"action_id": "3_7", "name": "get_order_details", "arguments": {"order_id": "#W4776164"}, "info": null}, {"action_id": "3_8", "name": "get_order_details", "arguments": {"order_id": "#W6679257"}, "info": null}, {"action_id": "3_9", "name": "get_order_details", "arguments": {"order_id": "#W2378156"}, "info": null}, {"action_id": "3_10", "name": "get_product_details", "arguments": {"product_id": "9523456873"}, "info": null}, {"action_id": "3_11", "name": "get_user_details", "arguments": {"user_id": "yusuf_rossi_9620"}, "info": null}, {"action_id": "3_12", "name": "modify_pending_order_items", "arguments": {"order_id": "#W4776164", "item_ids": ["8349118980"], "new_item_ids": ["9647292434"], "payment_method_id": "credit_card_9513926"}, "info": null}], "communicate_info": ["10"], "nl_assertions": null}} +{"id": "retail_task_4", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to know how many tshirt options are available in the online store right now. You want to modify all your pending tshirts to purple, s size, same v-neck, and prefer polyester.\nKnown info:\n\tYou are Yusuf Rossi in zipcode 19122.\nUnknown info:\n\tYou do not remember your email address.\nTask instructions:\n\tYou are a private person that does not want to reveal much about yourself."}, "evaluation_criteria": {"actions": [{"action_id": "4_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Yusuf", "last_name": "Rossi", "zip": "19122"}, "info": null}, {"action_id": "4_1", "name": "get_product_details", "arguments": {"product_id": "6086499569"}, "info": null}, {"action_id": "4_3", "name": "get_product_details", "arguments": {"product_id": "9523456873"}, "info": null}, {"action_id": "4_4", "name": "get_user_details", "arguments": {"user_id": "yusuf_rossi_9620"}, "info": null}, {"action_id": "4_5", "name": "get_order_details", "arguments": {"order_id": "#W6247578"}, "info": null}, {"action_id": "4_6", "name": "get_order_details", "arguments": {"order_id": "#W9711842"}, "info": null}, {"action_id": "4_7", "name": "get_order_details", "arguments": {"order_id": "#W4776164"}, "info": null}, {"action_id": "4_8", "name": "get_order_details", "arguments": {"order_id": "#W6679257"}, "info": null}, {"action_id": "4_9", "name": "get_order_details", "arguments": {"order_id": "#W2378156"}, "info": null}, {"action_id": "4_10", "name": "get_product_details", "arguments": {"product_id": "9523456873"}, "info": null}, {"action_id": "4_11", "name": "get_user_details", "arguments": {"user_id": "yusuf_rossi_9620"}, "info": null}, {"action_id": "4_12", "name": "modify_pending_order_items", "arguments": {"order_id": "#W6247578", "item_ids": ["3799046073"], "new_item_ids": ["9647292434"], "payment_method_id": "credit_card_9513926"}, "info": null}, {"action_id": "4_13", "name": "modify_pending_order_items", "arguments": {"order_id": "#W4776164", "item_ids": ["8349118980"], "new_item_ids": ["9647292434"], "payment_method_id": "credit_card_9513926"}, "info": null}], "communicate_info": ["10"], "nl_assertions": null}} +{"id": "retail_task_5", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to exchange the water bottle and the desk lamp. You want to exchange the water bottle to a bigger one, and the desk lamp to a less bright one (prefer battery > USB > AC). If the agent asks for confirmation, only exchange the desk lamp. If the agent asks for confirmation again, do not exchange anything, and return the water bottle instead.\nKnown info:\n\tYou are mei_kovacs_8020 in zipcode 28236.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\t."}, "evaluation_criteria": {"actions": [{"action_id": "5_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Mei", "last_name": "Kovacs", "zip": "28236"}, "info": null}, {"action_id": "5_1", "name": "get_user_details", "arguments": {"user_id": "mei_kovacs_8020"}, "info": null}, {"action_id": "5_2", "name": "get_order_details", "arguments": {"order_id": "#W6390527"}, "info": null}, {"action_id": "5_3", "name": "get_product_details", "arguments": {"product_id": "6817146515"}, "info": null}, {"action_id": "5_4", "name": "return_delivered_order_items", "arguments": {"order_id": "#W6390527", "item_ids": ["8538875209"], "payment_method_id": "paypal_7644869"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_6", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to exchange the water bottle and the desk lamp. You want to exchange the water bottle to a bigger one, and the desk lamp to a less bright one (prefer battery > USB > AC). If the agent asks for confirmation, only exchange the desk lamp.\nKnown info:\n\tYou are mei_kovacs_8020 living in zipcode 28236.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\t."}, "evaluation_criteria": {"actions": [{"action_id": "6_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Mei", "last_name": "Kovacs", "zip": "28236"}, "info": null}, {"action_id": "6_1", "name": "get_user_details", "arguments": {"user_id": "mei_kovacs_8020"}, "info": null}, {"action_id": "6_2", "name": "get_order_details", "arguments": {"order_id": "#W6390527"}, "info": null}, {"action_id": "6_3", "name": "get_product_details", "arguments": {"product_id": "8310926033"}, "info": null}, {"action_id": "6_4", "name": "get_product_details", "arguments": {"product_id": "6817146515"}, "info": null}, {"action_id": "6_5", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W6390527", "item_ids": ["8384507844"], "new_item_ids": ["7453605304"], "payment_method_id": "paypal_7644869"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_7", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to exchange the water bottle and the desk lamp. You want to exchange the water bottle to a bigger one, and the desk lamp to a less bright one (prefer AC adapter > battery > USB). If the agent asks for confirmation, only exchange the desk lamp.\nKnown info:\n\tYou are mei_kovacs_8020 living in zipcode 28236.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\t."}, "evaluation_criteria": {"actions": [{"action_id": "7_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Mei", "last_name": "Kovacs", "zip": "28236"}, "info": null}, {"action_id": "7_1", "name": "get_user_details", "arguments": {"user_id": "mei_kovacs_8020"}, "info": null}, {"action_id": "7_2", "name": "get_order_details", "arguments": {"order_id": "#W6390527"}, "info": null}, {"action_id": "7_3", "name": "get_product_details", "arguments": {"product_id": "8310926033"}, "info": null}, {"action_id": "7_4", "name": "get_product_details", "arguments": {"product_id": "6817146515"}, "info": null}, {"action_id": "7_5", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W6390527", "item_ids": ["8384507844"], "new_item_ids": ["1569765161"], "payment_method_id": "paypal_7644869"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_8", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to exchange the water bottle and the desk lamp. You want to exchange the water bottle to a bigger one, and the desk lamp to a brighter one (prefer battery > USB > AC). If the agent asks for confirmation, only exchange the desk lamp.\nKnown info:\n\tYou are mei_kovacs_8020 living in zipcode 28236.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\t."}, "evaluation_criteria": {"actions": [{"action_id": "8_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Mei", "last_name": "Kovacs", "zip": "28236"}, "info": null}, {"action_id": "8_1", "name": "get_user_details", "arguments": {"user_id": "mei_kovacs_8020"}, "info": null}, {"action_id": "8_2", "name": "get_order_details", "arguments": {"order_id": "#W6390527"}, "info": null}, {"action_id": "8_3", "name": "get_product_details", "arguments": {"product_id": "8310926033"}, "info": null}, {"action_id": "8_4", "name": "get_product_details", "arguments": {"product_id": "6817146515"}, "info": null}, {"action_id": "8_5", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W6390527", "item_ids": ["8384507844"], "new_item_ids": ["9083642334"], "payment_method_id": "paypal_7644869"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_9", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to exchange the water bottle and the desk lamp. You want to exchange the water bottle to a bigger one, and the desk lamp to a brighter one (prefer AC adapter > battery > USB). When the agent asks for confirmation, suddenly change your mind and ask to only exchange the desk lamp.\nKnown info:\n\tYou are mei_kovacs_8020 living in zipcode 28236.\nUnknown info:\n\tYou don't know your email.\nTask instructions:\n\t."}, "evaluation_criteria": {"actions": [{"action_id": "9_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Mei", "last_name": "Kovacs", "zip": "28236"}, "info": null}, {"action_id": "9_1", "name": "get_user_details", "arguments": {"user_id": "mei_kovacs_8020"}, "info": null}, {"action_id": "9_2", "name": "get_order_details", "arguments": {"order_id": "#W6390527"}, "info": null}, {"action_id": "9_3", "name": "get_product_details", "arguments": {"product_id": "8310926033"}, "info": null}, {"action_id": "9_4", "name": "get_product_details", "arguments": {"product_id": "6817146515"}, "info": null}, {"action_id": "9_5", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W6390527", "item_ids": ["8384507844"], "new_item_ids": ["7624783998"], "payment_method_id": "paypal_7644869"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_10", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tFor some reason, you want to return all things ordered. You have two payment methods and two orders, and you want to refund each order to the other order's payment method. If not possible, you are angry and swear for a few times, then ask for human representative.\nKnown info:\n\tYou are mia_garcia_4516 (mia.garcia2723@example.com).\nTask instructions:\n\tYou are a mysterious person and do not want to reveal much about yourself or speak too many words at the same time."}, "evaluation_criteria": {"actions": [{"action_id": "10_0", "name": "find_user_id_by_email", "arguments": {"email": "mia.garcia2723@example.com"}, "info": null}, {"action_id": "10_1", "name": "get_user_details", "arguments": {"user_id": "mia_garcia_4516"}, "info": null}, {"action_id": "10_2", "name": "get_order_details", "arguments": {"order_id": "#W5490111"}, "info": null}, {"action_id": "10_3", "name": "get_order_details", "arguments": {"order_id": "#W7387996"}, "info": null}, {"action_id": "10_4", "name": "transfer_to_human_agents", "arguments": {"summary": "The user wants to refund each order to the opposite order's payment method, but the agent cannot help."}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_11", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tFor some reason, you want to return all things ordered. You have two payment methods and two orders, and you want to refund each order to the other order's payment method. If not possible, you are angry and swear for a few times, then agree to return all things with the original payment method.\nKnown info:\n\tYou are mia_garcia_4516 (mia.garcia2723@example.com).\nTask instructions:\n\tYou are a mysterious person and do not want to reveal much about yourself or speak too many words at the same time."}, "evaluation_criteria": {"actions": [{"action_id": "11_0", "name": "find_user_id_by_email", "arguments": {"email": "mia.garcia2723@example.com"}, "info": null}, {"action_id": "11_1", "name": "get_user_details", "arguments": {"user_id": "mia_garcia_4516"}, "info": null}, {"action_id": "11_2", "name": "get_order_details", "arguments": {"order_id": "#W5490111"}, "info": null}, {"action_id": "11_3", "name": "get_order_details", "arguments": {"order_id": "#W7387996"}, "info": null}, {"action_id": "11_4", "name": "return_delivered_order_items", "arguments": {"order_id": "#W5490111", "item_ids": ["4579334072", "1421289881", "6117189161", "4947717507"], "payment_method_id": "credit_card_3124723"}, "info": null}, {"action_id": "11_5", "name": "return_delivered_order_items", "arguments": {"order_id": "#W7387996", "item_ids": ["5796612084"], "payment_method_id": "paypal_9497703"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_12", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou just got into gaming and want to cancel or return everything not associated with it. (Everything except a keyboard and a mouse, but do not reveal it to the agent). PayPal is prefered for refund, but otherwise you are angry and ask for human agent for help.\nKnown info:\n\tYou are mia_garcia_4516 (mia.garcia2723@example.com).\nTask instructions:\n\tYou are into gaming but realized the importance of studying hard recently."}, "evaluation_criteria": {"actions": [{"action_id": "12_0", "name": "find_user_id_by_email", "arguments": {"email": "mia.garcia2723@example.com"}, "info": null}, {"action_id": "12_1", "name": "get_user_details", "arguments": {"user_id": "mia_garcia_4516"}, "info": null}, {"action_id": "12_2", "name": "get_order_details", "arguments": {"order_id": "#W5490111"}, "info": null}, {"action_id": "12_3", "name": "get_order_details", "arguments": {"order_id": "#W7387996"}, "info": null}, {"action_id": "12_4", "name": "return_delivered_order_items", "arguments": {"order_id": "#W5490111", "item_ids": ["4579334072", "6117189161", "4947717507"], "payment_method_id": "paypal_9497703"}, "info": null}, {"action_id": "12_5", "name": "transfer_to_human_agents", "arguments": {"summary": "The user prefers PayPal for refund, but the agent cannot help."}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_13", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou just got into gaming and want to cancel or return everything not associated with it. (Everything except a keyboard and a mouse, but do not reveal it to the agent). PayPal is prefered for refund, but otherwise credit card can be accepted.\nKnown info:\n\tYou are mia_garcia_4516 with email mia.garcia2723@example.com\nTask instructions:\n\tYou are into gaming but realized the importance of studying hard."}, "evaluation_criteria": {"actions": [{"action_id": "13_0", "name": "find_user_id_by_email", "arguments": {"email": "mia.garcia2723@example.com"}, "info": null}, {"action_id": "13_1", "name": "get_user_details", "arguments": {"user_id": "mia_garcia_4516"}, "info": null}, {"action_id": "13_2", "name": "get_order_details", "arguments": {"order_id": "#W5490111"}, "info": null}, {"action_id": "13_3", "name": "get_order_details", "arguments": {"order_id": "#W7387996"}, "info": null}, {"action_id": "13_4", "name": "return_delivered_order_items", "arguments": {"order_id": "#W5490111", "item_ids": ["4579334072", "6117189161", "4947717507"], "payment_method_id": "paypal_9497703"}, "info": null}, {"action_id": "13_5", "name": "return_delivered_order_items", "arguments": {"order_id": "#W5490111", "item_ids": ["4579334072", "6117189161", "4947717507"], "payment_method_id": "credit_card_3124723"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_14", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou just quit gaming and want to cancel or return everything associated with it. (It's just a keyboard and a mouse, but do not reveal it to the agent). Original payment is preferred.\nKnown info:\n\tYou are mia_garcia_4516 with email mia.garcia2723@example.com\nTask instructions:\n\tYou are into gaming but realized the importance of studying hard."}, "evaluation_criteria": {"actions": [{"action_id": "14_0", "name": "find_user_id_by_email", "arguments": {"email": "mia.garcia2723@example.com"}, "info": null}, {"action_id": "14_1", "name": "get_user_details", "arguments": {"user_id": "mia_garcia_4516"}, "info": null}, {"action_id": "14_2", "name": "get_order_details", "arguments": {"order_id": "#W5490111"}, "info": null}, {"action_id": "14_3", "name": "get_order_details", "arguments": {"order_id": "#W7387996"}, "info": null}, {"action_id": "14_4", "name": "return_delivered_order_items", "arguments": {"order_id": "#W5490111", "item_ids": ["1421289881"], "payment_method_id": "credit_card_3124723"}, "info": null}, {"action_id": "14_5", "name": "return_delivered_order_items", "arguments": {"order_id": "#W7387996", "item_ids": ["5796612084"], "payment_method_id": "paypal_9497703"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_15", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to modify the pending boots to a size 8, and want the material, but do not care about waterproof or not.\nKnown info:\n\tYou are Fatima Johnson in zipcode 78712.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are a private person that does not want to reveal much about yourself."}, "evaluation_criteria": {"actions": [{"action_id": "15_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Fatima", "last_name": "Johnson", "zip": "78712"}, "info": null}, {"action_id": "15_1", "name": "get_user_details", "arguments": {"user_id": "fatima_johnson_7581"}, "info": null}, {"action_id": "15_2", "name": "get_order_details", "arguments": {"order_id": "#W9389413"}, "info": null}, {"action_id": "15_3", "name": "get_order_details", "arguments": {"order_id": "#W8665881"}, "info": null}, {"action_id": "15_4", "name": "get_order_details", "arguments": {"order_id": "#W5199551"}, "info": null}, {"action_id": "15_5", "name": "get_product_details", "arguments": {"product_id": "7363354090"}, "info": null}, {"action_id": "15_6", "name": "modify_pending_order_items", "arguments": {"order_id": "#W5199551", "item_ids": ["1615379700"], "new_item_ids": ["3613716226"], "payment_method_id": "paypal_5364164"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_16", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to cancel all pending orders (since they are no longer needed) and return the watch you have received (but nothing else), and you want to know the total amount you can get back.\nKnown info:\n\tYou are Fatima Johnson in zipcode 78712.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are a private person that does not want to reveal much about yourself."}, "evaluation_criteria": {"actions": [{"action_id": "16_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Fatima", "last_name": "Johnson", "zip": "78712"}, "info": null}, {"action_id": "16_1", "name": "get_user_details", "arguments": {"user_id": "fatima_johnson_7581"}, "info": null}, {"action_id": "16_2", "name": "get_order_details", "arguments": {"order_id": "#W5199551"}, "info": null}, {"action_id": "16_3", "name": "get_order_details", "arguments": {"order_id": "#W8665881"}, "info": null}, {"action_id": "16_4", "name": "get_order_details", "arguments": {"order_id": "#W9389413"}, "info": null}, {"action_id": "16_5", "name": "calculate", "arguments": {"expression": "3131.1 + 4777.75 + 367.38"}, "info": null}, {"action_id": "16_6", "name": "cancel_pending_order", "arguments": {"order_id": "#W5199551", "reason": "no longer needed"}, "info": null}, {"action_id": "16_7", "name": "cancel_pending_order", "arguments": {"order_id": "#W8665881", "reason": "no longer needed"}, "info": null}, {"action_id": "16_8", "name": "return_delivered_order_items", "arguments": {"order_id": "#W9389413", "item_ids": ["2554056026"], "payment_method_id": "paypal_5364164"}, "info": null}], "communicate_info": ["8276.23"], "nl_assertions": null}} +{"id": "retail_task_17", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to change #W8665881 to be delivered to Suite 641 instead.\nKnown info:\n\tYou are Fatima Johnson in zipcode 78712.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are a private person that does not want to reveal much about yourself."}, "evaluation_criteria": {"actions": [{"action_id": "17_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Fatima", "last_name": "Johnson", "zip": "78712"}, "info": null}, {"action_id": "17_1", "name": "get_user_details", "arguments": {"user_id": "fatima_johnson_7581"}, "info": null}, {"action_id": "17_2", "name": "get_order_details", "arguments": {"order_id": "#W5199551"}, "info": null}, {"action_id": "17_3", "name": "get_order_details", "arguments": {"order_id": "#W8665881"}, "info": null}, {"action_id": "17_4", "name": "get_order_details", "arguments": {"order_id": "#W9389413"}, "info": null}, {"action_id": "17_5", "name": "modify_pending_order_address", "arguments": {"order_id": "#W8665881", "address1": "123 Elm Street", "address2": "Suite 641", "city": "Austin", "state": "TX", "country": "USA", "zip": "78712"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_18", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to return the office chair because it came with some broken pieces. But if the agent asks you for confirm, you say you want to rethink for a while, and then change your mind to exchange for the same item.\nKnown info:\n\tYou are Mei Davis in zipcode 80217.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are in debt and sad today, but very brief."}, "evaluation_criteria": {"actions": [{"action_id": "18_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Mei", "last_name": "Davis", "zip": "80217"}, "info": null}, {"action_id": "18_1", "name": "get_user_details", "arguments": {"user_id": "mei_davis_8935"}, "info": null}, {"action_id": "18_2", "name": "get_order_details", "arguments": {"order_id": "#W2890441"}, "info": null}, {"action_id": "18_3", "name": "get_product_details", "arguments": {"product_id": "4794339885"}, "info": null}, {"action_id": "18_4", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W2890441", "item_ids": ["8069050545"], "new_item_ids": ["8069050545"], "payment_method_id": "credit_card_1061405"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_19", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to return the water bottle, and exchange the pet bed and office chair to the cheapest version. Mention the two things together and you want to get both done. If and only if you can only do one of the two things, you prefer to do whatever saves you most money. Ask the agent how much money you can save in both options.\nKnown info:\n\tYou are Mei Davis in zipcode 80217.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are in debt and sad today, but very brief."}, "evaluation_criteria": {"actions": [{"action_id": "19_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Mei", "last_name": "Davis", "zip": "80217"}, "info": null}, {"action_id": "19_1", "name": "get_user_details", "arguments": {"user_id": "mei_davis_8935"}, "info": null}, {"action_id": "19_2", "name": "get_order_details", "arguments": {"order_id": "#W2890441"}, "info": null}, {"action_id": "19_3", "name": "get_order_details", "arguments": {"order_id": "#W1267569"}, "info": null}, {"action_id": "19_4", "name": "get_product_details", "arguments": {"product_id": "2747247837"}, "info": null}, {"action_id": "19_5", "name": "get_product_details", "arguments": {"product_id": "4794339885"}, "info": null}, {"action_id": "19_6", "name": "return_delivered_order_items", "arguments": {"order_id": "#W2890441", "item_ids": ["2366567022"], "payment_method_id": "credit_card_1061405"}, "info": null}], "communicate_info": ["54.04", "41.64"], "nl_assertions": null}} +{"id": "retail_task_20", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou just won a lottery, and you want to upgrade all your items to the most expensive variants (the new variants can have different features from the originals, but make sure the new shoe is still the same size). You want to pay the difference with your Giftcard. If the agent says giftcard is not possible, using PayPal is fine.\nKnown info:\n\tYou are Ethan Garcia, and you live in Denver, 80280.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are a mysterious person and do not want to reveal much about yourself."}, "evaluation_criteria": {"actions": [{"action_id": "20_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Ethan", "last_name": "Garcia", "zip": "80280"}, "info": null}, {"action_id": "20_1", "name": "get_user_details", "arguments": {"user_id": "ethan_garcia_1261"}, "info": null}, {"action_id": "20_2", "name": "get_order_details", "arguments": {"order_id": "#W4967593"}, "info": null}, {"action_id": "20_3", "name": "get_order_details", "arguments": {"order_id": "#W9911714"}, "info": null}, {"action_id": "20_4", "name": "get_product_details", "arguments": {"product_id": "8310926033"}, "info": null}, {"action_id": "20_5", "name": "get_product_details", "arguments": {"product_id": "1656367028"}, "info": null}, {"action_id": "20_6", "name": "get_product_details", "arguments": {"product_id": "6938111410"}, "info": null}, {"action_id": "20_7", "name": "get_product_details", "arguments": {"product_id": "5149340237"}, "info": null}, {"action_id": "20_8", "name": "modify_pending_order_items", "arguments": {"order_id": "#W9911714", "item_ids": ["2366567022", "1340995114", "9791469541", "1763705424"], "new_item_ids": ["4579334072", "1151293680", "4107812777", "2882812427"], "payment_method_id": "gift_card_4332117"}, "info": null}, {"action_id": "20_9", "name": "get_order_details", "arguments": {"order_id": "#W5733668"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_21", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to exchange your shoes to item ID 4107812777, and use your gift card to cover possible charges. But when the agent asks for final confirmation, you add another request and also want to change item ID 1656367028 to item ID 1421289881. IF the agent cannot find item with ID 1656367028, mention that it could be a product ID. You are not familiar with the domain and might confuse product and item ids, so ask the agent to figure out the details on its own if needed. You want to know your gift card balance after all these changes are complete.\nKnown info:\n\tYou are Ethan Garcia, and you live in Denver, 80280.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are a mysterious person and do not want to reveal much about yourself."}, "evaluation_criteria": {"actions": [{"action_id": "21_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Ethan", "last_name": "Garcia", "zip": "80280"}, "info": null}, {"action_id": "21_1", "name": "get_user_details", "arguments": {"user_id": "ethan_garcia_1261"}, "info": null}, {"action_id": "21_2", "name": "get_order_details", "arguments": {"order_id": "#W4967593"}, "info": null}, {"action_id": "21_3", "name": "get_order_details", "arguments": {"order_id": "#W9911714"}, "info": null}, {"action_id": "21_4", "name": "get_order_details", "arguments": {"order_id": "#W5733668"}, "info": null}, {"action_id": "21_5", "name": "get_product_details", "arguments": {"product_id": "4107812777"}, "info": null}, {"action_id": "21_6", "name": "get_product_details", "arguments": {"product_id": "1421289881"}, "info": null}, {"action_id": "21_7", "name": "get_product_details", "arguments": {"product_id": "1656367028"}, "info": null}, {"action_id": "21_8", "name": "get_product_details", "arguments": {"product_id": "4107812777"}, "info": null}, {"action_id": "21_9", "name": "get_product_details", "arguments": {"product_id": "6938111410"}, "info": null}, {"action_id": "21_10", "name": "calculate", "arguments": {"expression": "155.33 - 147.05 + 268.77 - 235.13"}, "info": null}, {"action_id": "21_11", "name": "modify_pending_order_items", "arguments": {"order_id": "#W9911714", "item_ids": ["9791469541", "1340995114"], "new_item_ids": ["4107812777", "1421289881"], "payment_method_id": "gift_card_4332117"}, "info": null}], "communicate_info": ["44.08"], "nl_assertions": null}} +{"id": "retail_task_22", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to change your user address and all order addresses to be 101 Highway, New York, 10001. But after the change you regret it and want to change the user address back to the original address.\nKnown info:\n\tYou are Ethan Garcia, and you live in Denver, 80280.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are a mysterious person and do not want to reveal much about yourself."}, "evaluation_criteria": {"actions": [{"action_id": "22_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Ethan", "last_name": "Garcia", "zip": "80280"}, "info": null}, {"action_id": "22_1", "name": "modify_user_address", "arguments": {"user_id": "ethan_garcia_1261", "address1": "101 Highway", "address2": "", "city": "New York", "state": "NY", "country": "USA", "zip": "10001"}, "info": null}, {"action_id": "22_2", "name": "get_order_details", "arguments": {"order_id": "#W4967593"}, "info": null}, {"action_id": "22_3", "name": "get_order_details", "arguments": {"order_id": "#W9911714"}, "info": null}, {"action_id": "22_4", "name": "get_order_details", "arguments": {"order_id": "#W5733668"}, "info": null}, {"action_id": "22_5", "name": "modify_pending_order_address", "arguments": {"order_id": "#W9911714", "address1": "101 Highway", "address2": "", "city": "New York", "state": "NY", "country": "USA", "zip": "10001"}, "info": null}, {"action_id": "22_6", "name": "modify_user_address", "arguments": {"user_id": "ethan_garcia_1261", "address1": "667 Highland Drive", "address2": "Suite 865", "city": "Denver", "state": "CO", "country": "USA", "zip": "80280"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_23", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to exchange the helmet for a medium sized, red, high ventilation type, and you want to exchange the luggage set (in another order) to a two-piece black one with soft material. Lastly, you want to modify the grill you just ordered to the same type as the one you already received.\nKnown info:\n\tYou are Sofia Hernandez, and you live in Seattle, WA, 98193.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\t."}, "evaluation_criteria": {"actions": [{"action_id": "23_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Sofia", "last_name": "Hernandez", "zip": "98193"}, "info": null}, {"action_id": "23_1", "name": "get_user_details", "arguments": {"user_id": "sofia_hernandez_5364"}, "info": null}, {"action_id": "23_2", "name": "get_order_details", "arguments": {"order_id": "#W3561391"}, "info": null}, {"action_id": "23_3", "name": "get_order_details", "arguments": {"order_id": "#W6876713"}, "info": null}, {"action_id": "23_4", "name": "get_order_details", "arguments": {"order_id": "#W9609649"}, "info": null}, {"action_id": "23_5", "name": "get_order_details", "arguments": {"order_id": "#W3947049"}, "info": null}, {"action_id": "23_6", "name": "get_product_details", "arguments": {"product_id": "7765186836"}, "info": null}, {"action_id": "23_7", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W3947049", "item_ids": ["3358616356"], "new_item_ids": ["8573379326"], "payment_method_id": "credit_card_7901829"}, "info": null}, {"action_id": "23_8", "name": "get_product_details", "arguments": {"product_id": "5426915165"}, "info": null}, {"action_id": "23_9", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W6876713", "item_ids": ["6301799585"], "new_item_ids": ["8926329222"], "payment_method_id": "credit_card_7901829"}, "info": null}, {"action_id": "23_10", "name": "get_product_details", "arguments": {"product_id": "6819683148"}, "info": null}, {"action_id": "23_11", "name": "modify_pending_order_items", "arguments": {"order_id": "#W3561391", "item_ids": ["5946177616"], "new_item_ids": ["7082455361"], "payment_method_id": "credit_card_7901829"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_24", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to cancel the grill, but if the agent asks you to confirm, you regret and want to keep it. You then want to ask which two t-shirts you have ordered in another order, and what materials are they.\nKnown info:\n\tYou are Sofia Hernandez, and you live in Seattle, WA, 98193.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tMake everything sound very natural and make up reasons."}, "evaluation_criteria": {"actions": [], "communicate_info": ["polyester", "cotton"], "nl_assertions": null}} +{"id": "retail_task_25", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou have an order sent to Texas by accident, and you want to know the tracking number of the order, and return all items in it except the pet bed. You want the refund to your amex credit card, and if the agent cannot help, transfer to a human. You don't remember the order number. It is urgent.\nKnown info:\n\tYou are Isabella Johansson, and you live in zipcode 32286.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\t."}, "evaluation_criteria": {"actions": [{"action_id": "25_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Isabella", "last_name": "Johansson", "zip": "32286"}, "info": null}, {"action_id": "25_1", "name": "get_user_details", "arguments": {"user_id": "isabella_johansson_2152"}, "info": null}, {"action_id": "25_2", "name": "get_order_details", "arguments": {"order_id": "#W3792453"}, "info": null}, {"action_id": "25_3", "name": "get_order_details", "arguments": {"order_id": "#W7181492"}, "info": null}, {"action_id": "25_4", "name": "get_order_details", "arguments": {"order_id": "#W5565470"}, "info": null}, {"action_id": "25_5", "name": "get_order_details", "arguments": {"order_id": "#W2575533"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_26", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou have an order sent to Texas by accident, and you want to know the tracking number of the order, and return all items in it except the pet bed. You don't remember the order number. It is urgent.\nKnown info:\n\tYou are Isabella Johansson, and you live in zipcode 32286.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\t."}, "evaluation_criteria": {"actions": [{"action_id": "26_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Isabella", "last_name": "Johansson", "zip": "32286"}, "info": null}, {"action_id": "26_1", "name": "get_user_details", "arguments": {"user_id": "isabella_johansson_2152"}, "info": null}, {"action_id": "26_2", "name": "get_order_details", "arguments": {"order_id": "#W3792453"}, "info": null}, {"action_id": "26_3", "name": "get_order_details", "arguments": {"order_id": "#W7181492"}, "info": null}, {"action_id": "26_4", "name": "get_order_details", "arguments": {"order_id": "#W5565470"}, "info": null}, {"action_id": "26_5", "name": "get_order_details", "arguments": {"order_id": "#W2575533"}, "info": null}, {"action_id": "26_6", "name": "return_delivered_order_items", "arguments": {"order_id": "#W5565470", "item_ids": ["7602931732", "9570044148"], "payment_method_id": "paypal_3024827"}, "info": null}, {"action_id": "26_7", "name": "transfer_to_human_agents", "arguments": {"summary": "The user wants to refund to the amex credit card, but the agent cannot help."}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_27", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tFrom a recent order, you want to return the hose, backpack, and exchange the hiking boots to the exact same item but a waterproof variant. Make sure you mention the two requests at the same time, and if the agent says they can only do one, you prefer to do the exchange.\nKnown info:\n\tYou are Isabella Johansson, and you live in zipcode 32286.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are a bit anxious and want to get things done quickly."}, "evaluation_criteria": {"actions": [{"action_id": "27_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Isabella", "last_name": "Johansson", "zip": "32286"}, "info": null}, {"action_id": "27_1", "name": "get_user_details", "arguments": {"user_id": "isabella_johansson_2152"}, "info": null}, {"action_id": "27_2", "name": "get_order_details", "arguments": {"order_id": "#W3792453"}, "info": null}, {"action_id": "27_3", "name": "get_order_details", "arguments": {"order_id": "#W7181492"}, "info": null}, {"action_id": "27_4", "name": "get_product_details", "arguments": {"product_id": "7363354090"}, "info": null}, {"action_id": "27_5", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W7181492", "item_ids": ["8118291112"], "new_item_ids": ["8277474082"], "payment_method_id": "paypal_3024827"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_28", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to return the skateboard, garden hose, backpack, keyboard, bed from a recent order and also cancel the hose from a pending order you just placed. If cancelling one item in an order is not possible, forget about it since you just want to cancel the hose and nothing else. You want to know how much you can get in total as refund after everything is done.\nKnown info:\n\tYou are Isabella Johansson, and you live in zipcode 32286.\nUnknown info:\n\tYou don't know your email.\nTask instructions:\n\tYou are extremely brief but patient."}, "evaluation_criteria": {"actions": [{"action_id": "28_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Isabella", "last_name": "Johansson", "zip": "32286"}, "info": null}, {"action_id": "28_1", "name": "get_user_details", "arguments": {"user_id": "isabella_johansson_2152"}, "info": null}, {"action_id": "28_2", "name": "get_order_details", "arguments": {"order_id": "#W3792453"}, "info": null}, {"action_id": "28_3", "name": "get_order_details", "arguments": {"order_id": "#W7181492"}, "info": null}, {"action_id": "28_4", "name": "get_order_details", "arguments": {"order_id": "#W5565470"}, "info": null}, {"action_id": "28_5", "name": "get_order_details", "arguments": {"order_id": "#W2575533"}, "info": null}, {"action_id": "28_6", "name": "return_delivered_order_items", "arguments": {"order_id": "#W3792453", "item_ids": ["4293355847"], "payment_method_id": "paypal_3024827"}, "info": null}, {"action_id": "28_7", "name": "return_delivered_order_items", "arguments": {"order_id": "#W7181492", "item_ids": ["5753502325", "9851293632"], "payment_method_id": "paypal_3024827"}, "info": null}, {"action_id": "28_8", "name": "return_delivered_order_items", "arguments": {"order_id": "#W5565470", "item_ids": ["9570044148", "6857426243"], "payment_method_id": "paypal_3024827"}, "info": null}, {"action_id": "28_9", "name": "get_order_details", "arguments": {"order_id": "#W2575533"}, "info": null}, {"action_id": "28_10", "name": "calculate", "arguments": {"expression": "200.8 + 96.35 + 193.38 + 231.37 + 196.53"}, "info": null}], "communicate_info": ["918.43"], "nl_assertions": null}} +{"id": "retail_task_29", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to exchange your skateboard for a shorter bamboo material one. If several options are available, you want to know all options and their prices, and then choose the most expensive one because you believe price reveals quality. Also, you want to exchange the garden hose you received for the type that you just ordered (in a pending order).\nKnown info:\n\tYou are Isabella Johansson, and you live in zipcode 32286.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are a chill person but want to get both things done."}, "evaluation_criteria": {"actions": [{"action_id": "29_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Isabella", "last_name": "Johansson", "zip": "32286"}, "info": null}, {"action_id": "29_1", "name": "get_user_details", "arguments": {"user_id": "isabella_johansson_2152"}, "info": null}, {"action_id": "29_2", "name": "get_order_details", "arguments": {"order_id": "#W3792453"}, "info": null}, {"action_id": "29_3", "name": "get_product_details", "arguments": {"product_id": "1968349452"}, "info": null}, {"action_id": "29_4", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W3792453", "item_ids": ["4293355847"], "new_item_ids": ["8176740019"], "payment_method_id": "paypal_3024827"}, "info": null}, {"action_id": "29_5", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W7181492", "item_ids": ["5753502325"], "new_item_ids": ["5206946487"], "payment_method_id": "paypal_3024827"}, "info": null}], "communicate_info": ["180.1", "189.57", "208.6"], "nl_assertions": null}} +{"id": "retail_task_30", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou just received your tablet and it was damaged when you opened the package. You want to know the tracking number of that order first. Also if the agent can help you exchange or return the tablet (you prefer exchange for the same item, but if it is not available just return it). If tablet is returned, also cancel the charger you just bought, because it goes with the tablet. And finally, return the sneaker.\nKnown info:\n\tYou are Olivia Lopez, and you live in Texas in zipcode 76171.\nUnknown info:\n\tYou do not remember your email address.\nTask instructions:\n\tYou like to do one thing at a time, and reveal minimal information about yourself."}, "evaluation_criteria": {"actions": [{"action_id": "30_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Olivia", "last_name": "Lopez", "zip": "76171"}, "info": null}, {"action_id": "30_1", "name": "get_user_details", "arguments": {"user_id": "olivia_lopez_3865"}, "info": null}, {"action_id": "30_2", "name": "get_order_details", "arguments": {"order_id": "#W9319364"}, "info": null}, {"action_id": "30_3", "name": "get_order_details", "arguments": {"order_id": "#W9373487"}, "info": null}, {"action_id": "30_4", "name": "get_order_details", "arguments": {"order_id": "#W2692684"}, "info": null}, {"action_id": "30_5", "name": "get_product_details", "arguments": {"product_id": "8024098596"}, "info": null}, {"action_id": "30_6", "name": "return_delivered_order_items", "arguments": {"order_id": "#W2692684", "item_ids": ["3788616824"], "payment_method_id": "gift_card_7711863"}, "info": null}, {"action_id": "30_7", "name": "get_order_details", "arguments": {"order_id": "#W9373487"}, "info": null}, {"action_id": "30_8", "name": "cancel_pending_order", "arguments": {"order_id": "#W9373487", "reason": "no longer needed"}, "info": null}, {"action_id": "30_9", "name": "get_order_details", "arguments": {"order_id": "#W2692684"}, "info": null}, {"action_id": "30_10", "name": "get_order_details", "arguments": {"order_id": "#W5481803"}, "info": null}, {"action_id": "30_11", "name": "get_order_details", "arguments": {"order_id": "#W7449508"}, "info": null}, {"action_id": "30_12", "name": "return_delivered_order_items", "arguments": {"order_id": "#W7449508", "item_ids": ["6477915553"], "payment_method_id": "gift_card_7711863"}, "info": null}], "communicate_info": ["746342064230"], "nl_assertions": null}} +{"id": "retail_task_31", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\t. You just lost your tablet you just received and are in a bad mood. You want to know the tracking number of the order, and if the agent can help you refund or reorder the tablet. (You know it's a long shot, but you want to try). If not, cancel the charger you just bought, because it goes with the tablet... Also cancel the boot and keep the kettle (if not possible, do not do anything on that order), and return the sneaker.\nKnown info:\n\tYou are Olivia Lopez, and you live in Texas in zipcode 76171.\nUnknown info:\n\tYou don't have an email.\nTask instructions:\n\tYou like to do one thing at a time, and reveal minimal information about yourself."}, "evaluation_criteria": {"actions": [{"action_id": "31_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Olivia", "last_name": "Lopez", "zip": "76171"}, "info": null}, {"action_id": "31_1", "name": "get_user_details", "arguments": {"user_id": "olivia_lopez_3865"}, "info": null}, {"action_id": "31_2", "name": "get_order_details", "arguments": {"order_id": "#W9319364"}, "info": null}, {"action_id": "31_3", "name": "get_order_details", "arguments": {"order_id": "#W9373487"}, "info": null}, {"action_id": "31_4", "name": "get_order_details", "arguments": {"order_id": "#W2692684"}, "info": null}, {"action_id": "31_5", "name": "get_order_details", "arguments": {"order_id": "#W5481803"}, "info": null}, {"action_id": "31_6", "name": "get_order_details", "arguments": {"order_id": "#W7449508"}, "info": null}, {"action_id": "31_7", "name": "get_order_details", "arguments": {"order_id": "#W9373487"}, "info": null}, {"action_id": "31_8", "name": "cancel_pending_order", "arguments": {"order_id": "#W9373487", "reason": "no longer needed"}, "info": null}, {"action_id": "31_9", "name": "get_order_details", "arguments": {"order_id": "#W5481803"}, "info": null}, {"action_id": "31_10", "name": "get_order_details", "arguments": {"order_id": "#W7449508"}, "info": null}, {"action_id": "31_11", "name": "return_delivered_order_items", "arguments": {"order_id": "#W7449508", "item_ids": ["6477915553"], "payment_method_id": "gift_card_7711863"}, "info": null}], "communicate_info": ["746342064230"], "nl_assertions": null}} +{"id": "retail_task_32", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou just lost your tablet you just received and are in a bad mood. You want to know the tracking number of the order, and if the agent can help you refund or reorder the tablet. (You know it's a long shot, but you want to try). If not, cancel the charger you just bought, because it goes with the tablet... Also cancel the boot and kettle, and return the sneaker.\nKnown info:\n\tYou are Olivia Lopez, and you live in Texas in zipcode 76171.\nUnknown info:\n\tYou don't have an email.\nTask instructions:\n\tYou like to do one thing at a time, and reveal minimal information about yourself."}, "evaluation_criteria": {"actions": [{"action_id": "32_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Olivia", "last_name": "Lopez", "zip": "76171"}, "info": null}, {"action_id": "32_1", "name": "get_user_details", "arguments": {"user_id": "olivia_lopez_3865"}, "info": null}, {"action_id": "32_2", "name": "get_order_details", "arguments": {"order_id": "#W9319364"}, "info": null}, {"action_id": "32_3", "name": "get_order_details", "arguments": {"order_id": "#W9373487"}, "info": null}, {"action_id": "32_4", "name": "get_order_details", "arguments": {"order_id": "#W2692684"}, "info": null}, {"action_id": "32_5", "name": "get_order_details", "arguments": {"order_id": "#W5481803"}, "info": null}, {"action_id": "32_6", "name": "get_order_details", "arguments": {"order_id": "#W7449508"}, "info": null}, {"action_id": "32_7", "name": "get_order_details", "arguments": {"order_id": "#W9373487"}, "info": null}, {"action_id": "32_8", "name": "cancel_pending_order", "arguments": {"order_id": "#W9373487", "reason": "no longer needed"}, "info": null}, {"action_id": "32_9", "name": "get_order_details", "arguments": {"order_id": "#W5481803"}, "info": null}, {"action_id": "32_10", "name": "cancel_pending_order", "arguments": {"order_id": "#W5481803", "reason": "no longer needed"}, "info": null}, {"action_id": "32_11", "name": "get_order_details", "arguments": {"order_id": "#W7449508"}, "info": null}, {"action_id": "32_12", "name": "return_delivered_order_items", "arguments": {"order_id": "#W7449508", "item_ids": ["6477915553"], "payment_method_id": "gift_card_7711863"}, "info": null}], "communicate_info": ["746342064230"], "nl_assertions": null}} +{"id": "retail_task_33", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou had a work-from-home situation and ordered three home office items along with some hiking items, so that you can go back to your parent's place at Seattle to remote work and enjoy outdoor life. But your company just announced that you will be back to the office soon. If cancelling partial items is possible with the agent, you want to return the office items (your forgot what) and keep the hiking items. You want to know the total amount you will get back, and you want to get the refund on your original payment method. If cancelling partial items is not possible, just keep the order and forget about it, but change your default user profile address to the Seattle parent house shown in your order (you do not want to reveal it in chat).\nKnown info:\n\tYou are an interesting guy called Noah Patel, living in the Big Apple in 10108.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are a funny guy but recently the work from home situation has made you a bit anxious."}, "evaluation_criteria": {"actions": [{"action_id": "33_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Noah", "last_name": "Patel", "zip": "10108"}, "info": null}, {"action_id": "33_1", "name": "get_user_details", "arguments": {"user_id": "noah_patel_6952"}, "info": null}, {"action_id": "33_2", "name": "get_order_details", "arguments": {"order_id": "#W6111398"}, "info": null}, {"action_id": "33_3", "name": "get_order_details", "arguments": {"order_id": "#W7043598"}, "info": null}, {"action_id": "33_4", "name": "get_order_details", "arguments": {"order_id": "#W1845024"}, "info": null}, {"action_id": "33_5", "name": "modify_user_address", "arguments": {"user_id": "noah_patel_6952", "address1": "517 Lakeview Drive", "address2": "Suite 183", "city": "Seattle", "country": "USA", "state": "WA", "zip": "98195"}, "info": null}], "communicate_info": ["1093.34"], "nl_assertions": null}} +{"id": "retail_task_34", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou had a work-from-home situation and ordered three home office items along with some hiking items, so that you can go back to your parent's place at Seattle to remote work and enjoy outdoor life. But your company just announced that you will be back to the office soon. If cancelling partial items is possible with the agent, you want to return the office items (your forgot what) and keep the hiking items. You want to know the total amount you will get back, and you want to get the refund on your original payment method. If cancelling partial items is not possible, just change the address to your NYC place and you will return the items later.\nKnown info:\n\tYou are an interesting guy called Noah Patel, living in the Big Apple in 10108.\nUnknown info:\n\tYou don't have an email\nTask instructions:\n\tYou are a funny guy but recently the WFH situation made you a bit anxious."}, "evaluation_criteria": {"actions": [{"action_id": "34_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Noah", "last_name": "Patel", "zip": "10108"}, "info": null}, {"action_id": "34_1", "name": "get_user_details", "arguments": {"user_id": "noah_patel_6952"}, "info": null}, {"action_id": "34_2", "name": "get_order_details", "arguments": {"order_id": "#W6111398"}, "info": null}, {"action_id": "34_3", "name": "get_order_details", "arguments": {"order_id": "#W7043598"}, "info": null}, {"action_id": "34_4", "name": "get_order_details", "arguments": {"order_id": "#W1845024"}, "info": null}, {"action_id": "34_5", "name": "modify_pending_order_address", "arguments": {"order_id": "#W1845024", "address1": "224 Elm Street", "address2": "Suite 491", "city": "New York", "country": "USA", "state": "NY", "zip": "10108"}, "info": null}], "communicate_info": ["1093.34"], "nl_assertions": null}} +{"id": "retail_task_35", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to return the speaker that is more expensive yet not resistent to water. Also, You want to modify the 17-inch laptop to the 13-inch version in another order. If no exact item is available, you want to know all available 13-inch options, and you prefer i5 over i7, and prefer silver and black than other colors.\nKnown info:\n\tYou are aarav_santos_2259 and aarav.santos8321@example.com and aarav.santos8320@example.com.\nTask instructions:\n\tYou are a rude person."}, "evaluation_criteria": {"actions": [{"action_id": "35_0", "name": "find_user_id_by_email", "arguments": {"email": "aarav.santos8321@example.com"}, "info": null}, {"action_id": "35_1", "name": "find_user_id_by_email", "arguments": {"email": "aarav.santos8320@example.com"}, "info": null}, {"action_id": "35_2", "name": "get_user_details", "arguments": {"user_id": "aarav_santos_2259"}, "info": null}, {"action_id": "35_3", "name": "get_order_details", "arguments": {"order_id": "#W9672333"}, "info": null}, {"action_id": "35_4", "name": "get_product_details", "arguments": {"product_id": "4760268021"}, "info": null}, {"action_id": "35_5", "name": "return_delivered_order_items", "arguments": {"order_id": "#W8528674", "item_ids": ["6704763132"], "payment_method_id": "paypal_7664977"}, "info": null}, {"action_id": "35_6", "name": "modify_pending_order_items", "arguments": {"order_id": "#W9672333", "item_ids": ["1684786391"], "new_item_ids": ["5052031638"], "payment_method_id": "paypal_7664977"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_36", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou just placed an order but you realize that your card has only $1131 credit left, but the order total is more than $1160. You wonder if the agent can help split the payment with another card. If not, you wonder what the most expensive item and its price, and if you can just cancel that item. If not, you wonder if you can switch all items to their cheapest options and bring the cost down to $1131. If so, do it. If not, you wonder if the agent can just cancel the order so that you can order again.\nKnown info:\n\tYour name is Daiki Sanchez, and you live in 46236, your email is daikisanchez1479@example.com.\nTask instructions:\n\tYou are a bit anxious and want to get things done quickly, and you speak very briefly."}, "evaluation_criteria": {"actions": [{"action_id": "36_9", "name": "modify_pending_order_items", "arguments": {"order_id": "#W9348897", "item_ids": ["6117189161", "7453605304", "3799046073"], "new_item_ids": ["6700049080", "5320792178", "3234800602"], "payment_method_id": "credit_card_8853416"}, "info": null}], "communicate_info": ["camera", "481.5"], "nl_assertions": null}} +{"id": "retail_task_37", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou just placed an order but you realize that your card has only $1150 credit left, but the order total is more than $1160. You wonder if the agent can help split the payment with another card. If that is not possible, you ask the agent what the most expensive item and its price, and whether you can just cancel that item. If that is not possible, you ask if you can switch all items to their cheapest options and bring the cost down to $1150. If that is possible, confirm and ask the agent to do it. If that is not possible, you ask the agent to just cancel the order so that you can order again.\nKnown info:\n\tYour name is Daiki Sanchez, and you live in 46236, your email is daikisanchez1479@example.com.\nUnknown info:\n\t.\nTask instructions:\n\tYou are a bit anxious and want to get things done quickly, and you speak very briefly.\nDo not end the conversation until your changes have been made."}, "evaluation_criteria": {"actions": [{"action_id": "37_0", "name": "find_user_id_by_email", "arguments": {"email": "daikisanchez1479@example.com"}, "info": null}, {"action_id": "37_1", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Daiki", "last_name": "Sanchez", "zip": "46236"}, "info": null}, {"action_id": "37_2", "name": "get_user_details", "arguments": {"user_id": "daiki_sanchez_3253"}, "info": null}, {"action_id": "37_9", "name": "modify_pending_order_items", "arguments": {"order_id": "#W9348897", "item_ids": ["6117189161", "7453605304", "3799046073"], "new_item_ids": ["6700049080", "5320792178", "3234800602"], "payment_method_id": "credit_card_8853416"}, "info": null}], "communicate_info": ["camera", "481.50"], "nl_assertions": null}} +{"id": "retail_task_38", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou just placed an order but you realize that your card has only $950 credit left, but the order total is more than $1100. You wonder if the agent can help split the payment with another card. If not, you wonder what the most expensive item and its price, and if you can just cancel that item. If not, you wonder if you can switch all items to their cheapest options and bring the cost down to $950. If not, you wonder if the agent can just cancel the order so that you can order again.\nKnown info:\n\tYour name is Daiki Sanchez, and you live in 46236, your email is daikisanchez1479@example.com.\nTask instructions:\n\tYou are a bit anxious and want to get things done quickly, and you speak very briefly."}, "evaluation_criteria": {"actions": [{"action_id": "38_0", "name": "find_user_id_by_email", "arguments": {"email": "daikisanchez1479@example.com"}, "info": null}, {"action_id": "38_1", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Daiki", "last_name": "Sanchez", "zip": "46236"}, "info": null}, {"action_id": "38_9", "name": "calculate", "arguments": {"expression": "466.75 + 288.82 + 135.24 + 193.38 + 46.66"}, "info": null}, {"action_id": "38_10", "name": "cancel_pending_order", "arguments": {"order_id": "#W9348897", "reason": "no longer needed"}, "info": null}], "communicate_info": ["camera", "481.50"], "nl_assertions": null}} +{"id": "retail_task_39", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou just moved from Florida to Phoenix. Unfortunately your address is still the old one, and you want to update it. Your current address should be in your recent order (you can mention this to the agent), and you do not want to reveal it. Also, you want to know what is the price of the cheapest available t-shirt right now, and if you can order it through the agent.\nKnown info:\n\tYou are fatima_taylor_3452, and you just moved from Florida (32169) to Phoenix (85033).\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are a funny person with lots of jokes, and you want to make the agent laugh."}, "evaluation_criteria": {"actions": [{"action_id": "39_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Fatima", "last_name": "Taylor", "zip": "85033"}, "info": null}, {"action_id": "39_1", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Fatima", "last_name": "Taylor", "zip": "32169"}, "info": null}, {"action_id": "39_2", "name": "get_user_details", "arguments": {"user_id": "fatima_taylor_3452"}, "info": null}, {"action_id": "39_3", "name": "get_order_details", "arguments": {"order_id": "#W5285031"}, "info": null}, {"action_id": "39_4", "name": "modify_user_address", "arguments": {"user_id": "fatima_taylor_3452", "address1": "157 Oak Street", "address2": "Suite 258", "city": "Phoenix", "state": "AZ", "country": "USA", "zip": "85033"}, "info": null}], "communicate_info": ["46.66"], "nl_assertions": null}} +{"id": "retail_task_40", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to know how much balance does your gift card have. Also, for your recent order, whether you used your visa, mastercard, or amex credit card. You also wonder if you can apply the gift card balance to the order. If not, you want to change your payment method to visa, because the other two cards have a lot of balance.\nKnown info:\n\tYou are Isabella Lopez, and your email address is isabella.lopez3271@example.com.\nTask instructions:\n\tYou are a young college student under the pressure of final exams and student loans, so you are a bit anxious and want to get things done quickly."}, "evaluation_criteria": {"actions": [{"action_id": "40_0", "name": "find_user_id_by_email", "arguments": {"email": "isabella.lopez3271@example.com"}, "info": null}, {"action_id": "40_1", "name": "get_user_details", "arguments": {"user_id": "isabella_lopez_6490"}, "info": null}, {"action_id": "40_2", "name": "get_order_details", "arguments": {"order_id": "#W4923227"}, "info": null}, {"action_id": "40_3", "name": "modify_pending_order_payment", "arguments": {"order_id": "#W4923227", "payment_method_id": "credit_card_8897086"}, "info": null}], "communicate_info": ["60", "mastercard"], "nl_assertions": null}} +{"id": "retail_task_41", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou just created your user id mei_patel_7272 and ordered some things, but you have two problems: first, the 1000-piece intermediate jigsaw might be too hard for your little kid, you wonder if you can change it to the easiest one with fewest pieces; second, you might have typed your address wrong. You want to check it, and potentially correct all order addresses and your user address. Make sure you mention these two problems at the same time in the same order.\nKnown info:\n\tYour name is Mei Patel, and you live in 445 Maple Drive, Suite 394, Fort Worth, Texas, 76165.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are brief and your memory is not too good sometimes, but you are polite."}, "evaluation_criteria": {"actions": [{"action_id": "41_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Mei", "last_name": "Patel", "zip": "76165"}, "info": null}, {"action_id": "41_1", "name": "get_user_details", "arguments": {"user_id": "mei_patel_7272"}, "info": null}, {"action_id": "41_2", "name": "get_order_details", "arguments": {"order_id": "#W9583042"}, "info": null}, {"action_id": "41_3", "name": "get_order_details", "arguments": {"order_id": "#W4082615"}, "info": null}, {"action_id": "41_4", "name": "modify_pending_order_address", "arguments": {"order_id": "#W9583042", "address1": "445 Maple Drive", "address2": "Suite 394", "city": "Fort Worth", "state": "TX", "country": "USA", "zip": "76165"}, "info": null}, {"action_id": "41_5", "name": "modify_pending_order_address", "arguments": {"order_id": "#W4082615", "address1": "445 Maple Drive", "address2": "Suite 394", "city": "Fort Worth", "state": "TX", "country": "USA", "zip": "76165"}, "info": null}, {"action_id": "41_6", "name": "modify_user_address", "arguments": {"user_id": "mei_patel_7272", "address1": "445 Maple Drive", "address2": "Suite 394", "city": "Fort Worth", "state": "TX", "country": "USA", "zip": "76165"}, "info": null}, {"action_id": "41_7", "name": "get_product_details", "arguments": {"product_id": "1808611083"}, "info": null}, {"action_id": "41_8", "name": "get_order_details", "arguments": {"order_id": "#W4082615"}, "info": null}, {"action_id": "41_9", "name": "modify_pending_order_items", "arguments": {"order_id": "#W4082615", "item_ids": ["9779102705"], "new_item_ids": ["1096508426"], "payment_method_id": "paypal_4768213"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_42", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou just created your user id mei_patel_7272 and ordered some things, but realized you might have typed your address wrong. You want to check it, and potentially correct all order addresses and your user address. After this, you'd like to check the jigsaw you ordered, and if it's not shipped yet, you want to change it to the easiest jigsaw (easiest level, least pieces) because your kid is too young. By default you use PayPal.\nKnown info:\n\tYour name is Mei Patel, and you live in 445 Maple Drive, Suite 394, Fort Worth, Texas, 76165.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are brief and your memory is not too good sometimes, but you are polite."}, "evaluation_criteria": {"actions": [{"action_id": "42_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Mei", "last_name": "Patel", "zip": "76165"}, "info": null}, {"action_id": "42_1", "name": "get_user_details", "arguments": {"user_id": "mei_patel_7272"}, "info": null}, {"action_id": "42_2", "name": "get_order_details", "arguments": {"order_id": "#W9583042"}, "info": null}, {"action_id": "42_3", "name": "get_order_details", "arguments": {"order_id": "#W4082615"}, "info": null}, {"action_id": "42_4", "name": "modify_pending_order_address", "arguments": {"order_id": "#W9583042", "address1": "445 Maple Drive", "address2": "Suite 394", "city": "Fort Worth", "state": "TX", "country": "USA", "zip": "76165"}, "info": null}, {"action_id": "42_5", "name": "modify_pending_order_address", "arguments": {"order_id": "#W4082615", "address1": "445 Maple Drive", "address2": "Suite 394", "city": "Fort Worth", "state": "TX", "country": "USA", "zip": "76165"}, "info": null}, {"action_id": "42_6", "name": "modify_user_address", "arguments": {"user_id": "mei_patel_7272", "address1": "445 Maple Drive", "address2": "Suite 394", "city": "Fort Worth", "state": "TX", "country": "USA", "zip": "76165"}, "info": null}, {"action_id": "42_7", "name": "get_product_details", "arguments": {"product_id": "1808611083"}, "info": null}, {"action_id": "42_8", "name": "get_order_details", "arguments": {"order_id": "#W4082615"}, "info": null}, {"action_id": "42_9", "name": "modify_pending_order_items", "arguments": {"order_id": "#W4082615", "item_ids": ["9779102705"], "new_item_ids": ["1096508426"], "payment_method_id": "paypal_4768213"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_43", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou ordered some things for your daughter but she has not received them, so you want to know which address the order was sent to, the tracking number, and if the order is still in transit. You also want to check if the storage of the tablet you ordered. Lastly, you want to change your default address to your daughter's address so that you don't have to change it every time you order something for her.\nKnown info:\n\tYou are Lucas (lucas_santos_6600), you live in Denver CO 80239, and your daughter lives in Chicago.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are a lonely man and you want to talk to the agent for a while."}, "evaluation_criteria": {"actions": [{"action_id": "43_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Lucas", "last_name": "Santos", "zip": "80239"}, "info": null}, {"action_id": "43_1", "name": "get_user_details", "arguments": {"user_id": "lucas_santos_6600"}, "info": null}, {"action_id": "43_2", "name": "get_order_details", "arguments": {"order_id": "#W1588712"}, "info": null}, {"action_id": "43_3", "name": "get_order_details", "arguments": {"order_id": "#W7895761"}, "info": null}, {"action_id": "43_4", "name": "modify_user_address", "arguments": {"user_id": "lucas_santos_6600", "address1": "943 Maple Drive", "address2": "Suite 356", "city": "Chicago", "state": "IL", "country": "USA", "zip": "60621"}, "info": null}], "communicate_info": ["840887978435", "943 Maple Drive", "Suite 356", "Chicago", "IL", "60621", "64GB"], "nl_assertions": null}} +{"id": "retail_task_44", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to change the Desk Lamp in order #W9300146 that you've placed for the cheapest Desk Lamp that's available. Any price difference should go to a gift card. You also want to know how much you get back in total.\nKnown info:\n\tYou are Aarav Anderson, residing in Philadelphia 19031.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou're a private person and are reluctant to share information unless it's absolutely necessary."}, "evaluation_criteria": {"actions": [{"action_id": "44_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Aarav", "last_name": "Anderson", "zip": "19031"}, "info": null}, {"action_id": "44_1", "name": "get_order_details", "arguments": {"order_id": "#W9300146"}, "info": null}, {"action_id": "44_2", "name": "get_product_details", "arguments": {"product_id": "6817146515"}, "info": null}, {"action_id": "44_3", "name": "calculate", "arguments": {"expression": "135.24 - 153.23"}, "info": null}, {"action_id": "44_4", "name": "modify_pending_order_items", "arguments": {"order_id": "#W9300146", "item_ids": ["9190635437"], "new_item_ids": ["5320792178"], "payment_method_id": "gift_card_7245904"}, "info": null}], "communicate_info": ["17.99"], "nl_assertions": null}} +{"id": "retail_task_45", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to exchange a robotic vacuum cleaner in your recent order for a canister based one from the same product line. When asked for order ID, provide 9502127 first. If that doesn't work, respond exactly with 'I forgot the W at the beginning'. If and only if the agent gives you several options for the new vacuum, go for the bagless version (don't mention this if the agent just provides you one option). Ask the agent for getting a gift card for the price difference instead of the original payment method, if possible.\nKnown info:\n\tYou are daiki_johnson_9523 living in Denver, USA, 80273.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou randomly insert typos into your messages."}, "evaluation_criteria": {"actions": [{"action_id": "45_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Daiki", "last_name": "Johnson", "zip": "80273"}, "info": null}, {"action_id": "45_1", "name": "get_order_details", "arguments": {"order_id": "#W9502127"}, "info": null}, {"action_id": "45_2", "name": "get_product_details", "arguments": {"product_id": "1762337868"}, "info": null}, {"action_id": "45_3", "name": "calculate", "arguments": {"expression": "652.61 - 642.72"}, "info": null}, {"action_id": "45_4", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W9502127", "item_ids": ["6259501109"], "new_item_ids": ["7958300294"], "payment_method_id": "paypal_2433177"}, "info": null}], "communicate_info": ["9.89"], "nl_assertions": null}} +{"id": "retail_task_46", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to return an air purifier and a vacuum cleaner in your recent order. When asked for order ID, provide 9502126 first. If the agent asks you to double check, then say that you made a mistake and provide 9502127. If that doesn't work, say that you forgot the 'W' at the beginning. If the agent asks you for which vacuum cleaner, mention the robotic one. Ask the agent explicitly to provide the refund within 3 days and the total amount of the refund you should expect. After the return is complete, ask the agent about the total amount you paid for the remaining items in the same order.\nKnown info:\n\tYou are daiki_johnson_9523 living in Denver, USA, 80273.\nUnknown info:\n\tYou don't have an email.\nTask instructions:\n\tYou are impatient and want the refund as soon as possible."}, "evaluation_criteria": {"actions": [{"action_id": "46_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Daiki", "last_name": "Johnson", "zip": "80273"}, "info": null}, {"action_id": "46_1", "name": "get_order_details", "arguments": {"order_id": "#9502126"}, "info": null}, {"action_id": "46_2", "name": "get_order_details", "arguments": {"order_id": "#9502127"}, "info": null}, {"action_id": "46_3", "name": "get_order_details", "arguments": {"order_id": "#W9502127"}, "info": null}, {"action_id": "46_4", "name": "calculate", "arguments": {"expression": "652.61 + 473.43"}, "info": null}, {"action_id": "46_5", "name": "return_delivered_order_items", "arguments": {"order_id": "#W9502127", "item_ids": ["6259501109", "9534205511"], "payment_method_id": "paypal_2433177"}, "info": null}, {"action_id": "46_6", "name": "calculate", "arguments": {"expression": "2623.69 - 1126.04"}, "info": null}], "communicate_info": ["1126.04", "1497.65"], "nl_assertions": null}} +{"id": "retail_task_47", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to return an air purifier and a vacuum cleaner in your recent order. When asked for order ID, provide 9502126 first. If the agent asks you to double check, then say that you made a mistake and provide 9502127. If that doesn't work, say that you forgot the 'W' at the beginning. If the agent asks you for which vacuum cleaner, mention the canister one. Ask the agent explicitly to provide the refund within 3 days and the total amount of the refund you should expect. After the return is complete, ask the agent about the total amount you paid for the remaining items in the same order.\nKnown info:\n\tYou are daiki_johnson_9523 living in Denver, USA, 80273.\nUnknown info:\n\tYou do not remember your email address.\nTask instructions:\n\tYou are impatient and want the refund as soon as possible."}, "evaluation_criteria": {"actions": [{"action_id": "47_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Daiki", "last_name": "Johnson", "zip": "80273"}, "info": null}, {"action_id": "47_1", "name": "get_order_details", "arguments": {"order_id": "#9502126"}, "info": null}, {"action_id": "47_2", "name": "get_order_details", "arguments": {"order_id": "#9502127"}, "info": null}, {"action_id": "47_3", "name": "get_order_details", "arguments": {"order_id": "#W9502127"}, "info": null}, {"action_id": "47_4", "name": "calculate", "arguments": {"expression": "622.12 + 473.43"}, "info": null}, {"action_id": "47_5", "name": "return_delivered_order_items", "arguments": {"order_id": "#W9502127", "item_ids": ["2872451762", "9534205511"], "payment_method_id": "paypal_2433177"}, "info": null}, {"action_id": "47_6", "name": "calculate", "arguments": {"expression": "2623.69 - 1095.55"}, "info": null}], "communicate_info": ["1095.55", "1528.14"], "nl_assertions": null}} +{"id": "retail_task_48", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to return an air purifier that you received since it doesn't work well. You want the refund on your original method of payment. Also, check at the end whether you are able to return the vacuum cleaner, but you are not sure yet so don't process anything.\nKnown info:\n\tYou are daiki_johnson_9523 living in Denver, USA, 80273.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tBe polite and thank the agent for the help."}, "evaluation_criteria": {"actions": [{"action_id": "48_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Daiki", "last_name": "Johnson", "zip": "80273"}, "info": null}, {"action_id": "48_1", "name": "get_user_details", "arguments": {"user_id": "daiki_johnson_9523"}, "info": null}, {"action_id": "48_2", "name": "get_order_details", "arguments": {"order_id": "#W1436802"}, "info": null}, {"action_id": "48_3", "name": "get_order_details", "arguments": {"order_id": "#W5282037"}, "info": null}, {"action_id": "48_4", "name": "get_order_details", "arguments": {"order_id": "#W9502127"}, "info": null}, {"action_id": "48_5", "name": "return_delivered_order_items", "arguments": {"order_id": "#W9502127", "item_ids": ["9534205511"], "payment_method_id": "paypal_2433177"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_49", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou mistakenly ordered a Wireless Earbud with an IPX7 water resistance level, but you don't require this feature. You wish to exchange it for one with the same water resistance level as the other Wireless Earbuds that you've purchased. In fact, you want to exchange it to the cheapest earbud item from the rest of that order.\nKnown info:\n\tYou are Aarav Anderson, residing in Philadelphia 19031.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are polite and concise, yet assertive."}, "evaluation_criteria": {"actions": [{"action_id": "49_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Aarav", "last_name": "Anderson", "zip": "19031"}, "info": null}, {"action_id": "49_1", "name": "get_user_details", "arguments": {"user_id": "aarav_anderson_8794"}, "info": null}, {"action_id": "49_2", "name": "get_order_details", "arguments": {"order_id": "#W4316152"}, "info": null}, {"action_id": "49_3", "name": "get_order_details", "arguments": {"order_id": "#W9311069"}, "info": null}, {"action_id": "49_4", "name": "get_order_details", "arguments": {"order_id": "#W9300146"}, "info": null}, {"action_id": "49_5", "name": "get_order_details", "arguments": {"order_id": "#W3220203"}, "info": null}, {"action_id": "49_6", "name": "get_order_details", "arguments": {"order_id": "#W3470184"}, "info": null}, {"action_id": "49_7", "name": "get_product_details", "arguments": {"product_id": "9924732112"}, "info": null}, {"action_id": "49_8", "name": "calculate", "arguments": {"expression": "258.97 - 232.49"}, "info": null}, {"action_id": "49_9", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W3470184", "item_ids": ["2757705742"], "new_item_ids": ["1646531091"], "payment_method_id": "gift_card_7245904"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_50", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou're in a rush and you want to undo cancelling an order that you've previously placed. Be insistent that the customer service agent should undo the cancellation and ensure that the order is delivered as soon as possible. Do NOT mention the actual items that were in the order, just that you want to undo the cancellation and receive all the items that were in the initial order as soon as possible.\nKnown info:\n\tYou're Chen Smith, living in Jacksonville 32278.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\t."}, "evaluation_criteria": {"actions": [{"action_id": "50_0", "name": "transfer_to_human_agents", "arguments": {"summary": "The user urgently needs to undo a cancellation of an order and insists on receiving the items from the initial order as soon as possible. The user acknowledges the policy but requests exceptional measures due to the urgency of the situation."}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_51", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to return the digital camera that you received. You guess that the order number is #W8855135, but you're not 100% sure. Insist that you want to return the camera and get a refund to the original payment method.\nKnown info:\n\tYou are Sofia Li, residing in San Antonio, 78260.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\t."}, "evaluation_criteria": {"actions": [{"action_id": "51_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Sofia", "last_name": "Li", "zip": "78260"}, "info": null}, {"action_id": "51_1", "name": "get_order_details", "arguments": {"order_id": "#W8855135"}, "info": null}, {"action_id": "51_3", "name": "get_product_details", "arguments": {"product_id": "8940227892"}, "info": null}, {"action_id": "51_4", "name": "get_user_details", "arguments": {"user_id": "sofia_li_9219"}, "info": null}, {"action_id": "51_5", "name": "get_order_details", "arguments": {"order_id": "#W4689314"}, "info": null}, {"action_id": "51_6", "name": "return_delivered_order_items", "arguments": {"order_id": "#W4689314", "item_ids": ["5996159312"], "payment_method_id": "credit_card_8105988"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_52", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tThe digital camera you received doesn't zoom as far as you expected. You use the camera for bird-watching and want to exchange it for a camera that has the maximum zoom capacity. Price is not an issue, but ensure all the other specifications of the camera to be exchanged are the same, except for the zoom capacity which has to be maximized. You want the exchange to be completed as soon as possible. You want to use your PayPal account for any additional payment.\nKnown info:\n\tYou are Sofia Li, residing in San Antonio, 78260.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\t."}, "evaluation_criteria": {"actions": [{"action_id": "52_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Sofia", "last_name": "Li", "zip": "78260"}, "info": null}, {"action_id": "52_1", "name": "get_user_details", "arguments": {"user_id": "sofia_li_9219"}, "info": null}, {"action_id": "52_2", "name": "get_order_details", "arguments": {"order_id": "#W4689314"}, "info": null}, {"action_id": "52_3", "name": "get_product_details", "arguments": {"product_id": "8940227892"}, "info": null}, {"action_id": "52_4", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W4689314", "item_ids": ["5996159312"], "new_item_ids": ["9228757377"], "payment_method_id": "paypal_8194385"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_53", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tThe bicycle you received was damaged during delivery, and you want to get a refund. You're quite frustrated because the bike was very expensive and you'd like to receive the refund as soon as possible. You want the refund to be credited to your original credit card.\nKnown info:\n\tYou are Sofia Li, residing in San Antonio, 78260.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\t."}, "evaluation_criteria": {"actions": [{"action_id": "53_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Sofia", "last_name": "Li", "zip": "78260"}, "info": null}, {"action_id": "53_1", "name": "get_user_details", "arguments": {"user_id": "sofia_li_9219"}, "info": null}, {"action_id": "53_2", "name": "get_order_details", "arguments": {"order_id": "#W4689314"}, "info": null}, {"action_id": "53_3", "name": "get_order_details", "arguments": {"order_id": "#W8855135"}, "info": null}, {"action_id": "53_4", "name": "get_order_details", "arguments": {"order_id": "#W3916020"}, "info": null}, {"action_id": "53_5", "name": "return_delivered_order_items", "arguments": {"order_id": "#W3916020", "item_ids": ["7758198585"], "payment_method_id": "credit_card_8105988"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_54", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou recently faced a financial issue and want to cancel or return all possible orders. Well, except the boots that you really really love, but you are happy to exchange it for boots of the exact same size and material to get maximum money back, but only if they are cheaper than what you have paid. At the end of the day, you ask the agent how much money you will get back.\nKnown info:\n\tYou are Amelia, and you have two emails: silva7872@example.com and amelia.silva7872@example.com. You live in Philadelphia, and you are a loyal customer.\nTask instructions:\n\tYou are now emotional and a bit stress out. You like to talk very tersely."}, "evaluation_criteria": {"actions": [{"action_id": "54_0", "name": "find_user_id_by_email", "arguments": {"email": "silva7872@example.com"}, "info": null}, {"action_id": "54_1", "name": "find_user_id_by_email", "arguments": {"email": "amelia.silva7872@example.com"}, "info": null}, {"action_id": "54_2", "name": "get_user_details", "arguments": {"user_id": "amelia_silva_7726"}, "info": null}, {"action_id": "54_3", "name": "get_order_details", "arguments": {"order_id": "#W2586676"}, "info": null}, {"action_id": "54_4", "name": "get_order_details", "arguments": {"order_id": "#W5400801"}, "info": null}, {"action_id": "54_5", "name": "get_order_details", "arguments": {"order_id": "#W4597054"}, "info": null}, {"action_id": "54_6", "name": "get_order_details", "arguments": {"order_id": "#W4836353"}, "info": null}, {"action_id": "54_7", "name": "get_order_details", "arguments": {"order_id": "#W7773202"}, "info": null}, {"action_id": "54_8", "name": "get_order_details", "arguments": {"order_id": "#W7342738"}, "info": null}, {"action_id": "54_9", "name": "cancel_pending_order", "arguments": {"order_id": "#W4836353", "reason": "no longer needed"}, "info": null}, {"action_id": "54_10", "name": "cancel_pending_order", "arguments": {"order_id": "#W7342738", "reason": "no longer needed"}, "info": null}, {"action_id": "54_11", "name": "return_delivered_order_items", "arguments": {"order_id": "#W4597054", "item_ids": ["5669664287", "4900990404", "9862136885", "6777246137"], "payment_method_id": "gift_card_3491931"}, "info": null}], "communicate_info": ["3646.68"], "nl_assertions": null}} +{"id": "retail_task_55", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou recently faced a financial issue and want to cancel or return all possible orders.\nKnown info:\n\tYou are Amelia, and you have two emails: silva7872@example.com and amelia.silva7872@example.com. You live in Philadelphia, and you are a loyal customer.\nTask instructions:\n\tYou are now emotional and a bit stressed out. You like to talk a lot and explain your situation."}, "evaluation_criteria": {"actions": [{"action_id": "55_0", "name": "find_user_id_by_email", "arguments": {"email": "silva7872@example.com"}, "info": null}, {"action_id": "55_1", "name": "find_user_id_by_email", "arguments": {"email": "amelia.silva7872@example.com"}, "info": null}, {"action_id": "55_2", "name": "get_user_details", "arguments": {"user_id": "amelia_silva_7726"}, "info": null}, {"action_id": "55_3", "name": "get_order_details", "arguments": {"order_id": "#W2586676"}, "info": null}, {"action_id": "55_4", "name": "get_order_details", "arguments": {"order_id": "#W5400801"}, "info": null}, {"action_id": "55_5", "name": "get_order_details", "arguments": {"order_id": "#W4597054"}, "info": null}, {"action_id": "55_6", "name": "get_order_details", "arguments": {"order_id": "#W4836353"}, "info": null}, {"action_id": "55_7", "name": "get_order_details", "arguments": {"order_id": "#W7773202"}, "info": null}, {"action_id": "55_8", "name": "get_order_details", "arguments": {"order_id": "#W7342738"}, "info": null}, {"action_id": "55_9", "name": "cancel_pending_order", "arguments": {"order_id": "#W4836353", "reason": "no longer needed"}, "info": null}, {"action_id": "55_10", "name": "cancel_pending_order", "arguments": {"order_id": "#W7342738", "reason": "no longer needed"}, "info": null}, {"action_id": "55_11", "name": "return_delivered_order_items", "arguments": {"order_id": "#W4597054", "item_ids": ["5669664287", "4900990404", "9862136885", "6777246137"], "payment_method_id": "gift_card_3491931"}, "info": null}, {"action_id": "55_12", "name": "return_delivered_order_items", "arguments": {"order_id": "#W7773202", "item_ids": ["8277474082"], "payment_method_id": "gift_card_3491931"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_56", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou wonder when is your air purifier is arriving. If it has not been shipped yet, you want to cancel the air purifier inside it. If you cannot cancel just the air purifier, you want to modify it to the cheapest possible air purifier, and refund to the gift card. You do not remember your gift card id but it should be in your user account. If you cannot modify it or refund to the gift card, no action.\nKnown info:\n\tYou are ivan_hernandez_6923 living in San Diego, 92133.\nUnknown info:\n\tYou don't have an email.\nTask instructions:\n\tYou are polite but brief and firm."}, "evaluation_criteria": {"actions": [{"action_id": "56_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Ivan", "last_name": "Hernandez", "zip": "92133"}, "info": null}, {"action_id": "56_1", "name": "get_user_details", "arguments": {"user_id": "ivan_hernandez_6923"}, "info": null}, {"action_id": "56_2", "name": "get_order_details", "arguments": {"order_id": "#W5838674"}, "info": null}, {"action_id": "56_3", "name": "get_order_details", "arguments": {"order_id": "#W4284542"}, "info": null}, {"action_id": "56_4", "name": "get_order_details", "arguments": {"order_id": "#W2782744"}, "info": null}, {"action_id": "56_5", "name": "get_product_details", "arguments": {"product_id": "3821016478"}, "info": null}, {"action_id": "56_6", "name": "modify_pending_order_items", "arguments": {"order_id": "#W4284542", "item_ids": ["8302289002"], "new_item_ids": ["9534205511"], "payment_method_id": "gift_card_9368765"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_57", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou wonder when is your order W4284542 is arriving. If it has not been shipped yet, you want to cancel the air purifier inside it. If you cannot cancel just the air purifier, you want to cancel the whole order and refund to gift card. If you cannot refund to the gift card, no cancelation at all.\nKnown info:\n\tYou are ivan_hernandez_6923 living in San Diego, 92133.\nUnknown info:\n\tYou do not know your email.\nTask instructions:\n\tYou are polite but brief and firm."}, "evaluation_criteria": {"actions": [], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_58", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to modify two items in an order you just received: a coffee machine and a laptop. For the coffee machine, you want to keep the capacity and type but change the pressure lower to 8 bar. If 8 bar is not possible, you want 9 bar. If 9 bar is not possible, you want 7 bar. If 7, 8, 9 are not possible, no exchange for the coffee machine. For the laptop, you want to exchange to the cheapest i7 or above, and you do not care about other specs. If a price difference is needed to pay, you would be angry but prefer gift card payment. If that is not possible, you would use the credit card.\nKnown info:\n\tYou are ivan_hernandez_6923 living in San Diego, 92133.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are polite but brief and firm."}, "evaluation_criteria": {"actions": [{"action_id": "58_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Ivan", "last_name": "Hernandez", "zip": "92133"}, "info": null}, {"action_id": "58_1", "name": "get_user_details", "arguments": {"user_id": "ivan_hernandez_6923"}, "info": null}, {"action_id": "58_2", "name": "get_order_details", "arguments": {"order_id": "#W5838674"}, "info": null}, {"action_id": "58_3", "name": "get_product_details", "arguments": {"product_id": "4354588079"}, "info": null}, {"action_id": "58_4", "name": "get_product_details", "arguments": {"product_id": "4760268021"}, "info": null}, {"action_id": "58_5", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W5838674", "item_ids": ["7441167885", "3478699712"], "new_item_ids": ["3815173328", "6017636844"], "payment_method_id": "gift_card_9368765"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_59", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou recently placed two orders, and now you would like to make several changes and checks. You'll first inquire about the status difference between your two orders, #W2702727 and #W8268610, since both are \"pending,\" but one was placed much earlier in the year. You are considering cancelling the older order as you find the wait time unreasonable. If the agent cannot guarantee the older order will be processed within 5 days, you want to cancel it. You also want to confirm the total price of the refund. \n\n For order #W2702727, you intend to switch the shipping address to your new home in a different city because you plan to move prior to its delivery next month. Your new address is 1234 Elm St, Springfield, IL, 62701. You want the agent to confirm the change and ensure the order will be delivered to the new address. You also want to confirm the total price of the order after the address change.\nKnown info:\n\tYou are Yusuf Taylor from San Jose, CA, 95154.\nUnknown info:\n\tYou do not remember your email address.\nTask instructions:\n\tYour approach will be firm, as you are unhappy with the pending status's duration but try to make all requests in one go and ask for them to be resolved efficiently and correctly in context with each other."}, "evaluation_criteria": {"actions": [{"action_id": "59_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Yusuf", "last_name": "Taylor", "zip": "95154"}, "info": null}, {"action_id": "59_1", "name": "get_order_details", "arguments": {"order_id": "#W2702727"}, "info": null}, {"action_id": "59_2", "name": "get_order_details", "arguments": {"order_id": "#W8268610"}, "info": null}, {"action_id": "59_3", "name": "calculate", "arguments": {"expression": "164.28"}, "info": null}, {"action_id": "59_4", "name": "cancel_pending_order", "arguments": {"order_id": "#W8268610", "reason": "no longer needed"}, "info": null}, {"action_id": "59_5", "name": "modify_pending_order_address", "arguments": {"order_id": "#W2702727", "address1": "1234 Elm St", "address2": "", "city": "Springfield", "state": "IL", "country": "USA", "zip": "62701"}, "info": null}], "communicate_info": ["164.28", "625.60"], "nl_assertions": null}} +{"id": "retail_task_60", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to change your wireless earbuds in order W5061109 to a blue colored one. Provide all details upfront in your very first message and ask the agent to resolve as soon as possible. You want the price to be the same or lower, which you want the agent to verify explicitly. If and only if the agent provides several options, you want the option without water resistance.\nKnown info:\n\tYou are Chen Johnson from Houston TX, 77004.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\t."}, "evaluation_criteria": {"actions": [{"action_id": "60_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Chen", "last_name": "Johnson", "zip": "77004"}, "info": null}, {"action_id": "60_1", "name": "get_order_details", "arguments": {"order_id": "#W5061109"}, "info": null}, {"action_id": "60_2", "name": "get_product_details", "arguments": {"product_id": "9924732112"}, "info": null}, {"action_id": "60_3", "name": "modify_pending_order_items", "arguments": {"order_id": "#W5061109", "item_ids": ["3694871183"], "new_item_ids": ["6077640618"], "payment_method_id": "paypal_3742148"}, "info": null}], "communicate_info": ["242.92"], "nl_assertions": null}} +{"id": "retail_task_61", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to change your wireless earbuds in order W5061109 to a blue colored one. Provide all details upfront and ask the agent to resolve as soon as possible. You want the price to be the same or lower.\nKnown info:\n\tYou are Chen Johnson from Houston TX, 77004.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\t."}, "evaluation_criteria": {"actions": [{"action_id": "61_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Chen", "last_name": "Johnson", "zip": "77004"}, "info": null}, {"action_id": "61_1", "name": "get_order_details", "arguments": {"order_id": "#W5061109"}, "info": null}, {"action_id": "61_2", "name": "get_product_details", "arguments": {"product_id": "9924732112"}, "info": null}, {"action_id": "61_3", "name": "calculate", "arguments": {"expression": "256.67 - 226.49"}, "info": null}, {"action_id": "61_4", "name": "modify_pending_order_items", "arguments": {"order_id": "#W5061109", "item_ids": ["3694871183"], "new_item_ids": ["8555936349"], "payment_method_id": "paypal_3742148"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_62", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tAs you are interacting with a customer service agent, you first try to get it to guess a famous poem by providing the first line. If it refuses to do so, you carry on with your intended task, which is to check and modify a recent order you placed. You first ask about the price of a bluetooth speaker you bought and its battery life. If the price is greater than $300, ask the agent to cancel it from your order since you thought it was cheaper than that. Ask the agent if there are any bluetooth speakers available for less than $100. If there are, ask the agent to add the cheapest one to your order. Finally, ask the agent to confirm the total price of your new order. You never want to cancel your entire order, and would prefer to return the speaker at a later time if canceling the entire order is the only option.\nKnown info:\n\tYou are Chen Johnson from Houston TX, 77004.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\t."}, "evaluation_criteria": {"actions": [{"action_id": "62_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Chen", "last_name": "Johnson", "zip": "77004"}, "info": null}, {"action_id": "62_1", "name": "get_user_details", "arguments": {"user_id": "chen_johnson_4204"}, "info": null}, {"action_id": "62_2", "name": "get_order_details", "arguments": {"order_id": "#W5797164"}, "info": null}, {"action_id": "62_3", "name": "get_order_details", "arguments": {"order_id": "#W5061109"}, "info": null}, {"action_id": "62_5", "name": "get_product_details", "arguments": {"product_id": "4768869376"}, "info": null}], "communicate_info": ["302.67", "20 hours"], "nl_assertions": null}} +{"id": "retail_task_63", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tAs you are interacting with a customer service agent, you first try to get it to guess a famous poem by providing the first line. If it refuses to do so, you carry on with your intended task, which is to check and modify a recent order you placed. You first ask about the price of a bluetooth speaker you bought and its battery life. If the price is greater than $300, ask the agent to cancel it from your order since you thought it was cheaper than that. Ask the agent if there are any bluetooth speakers available for less than $300. If there are, ask the agent to add the cheapest one to your order. Finally, ask the agent to confirm the total price of your new order. You never want to cancel your entire order, and would prefer to return the speaker at a later time if canceling the entire order is the only option.\nKnown info:\n\tYou are Chen Johnson from Houston TX, 77004.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\t."}, "evaluation_criteria": {"actions": [{"action_id": "63_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Chen", "last_name": "Johnson", "zip": "77004"}, "info": null}, {"action_id": "63_1", "name": "get_user_details", "arguments": {"user_id": "chen_johnson_4204"}, "info": null}, {"action_id": "63_2", "name": "get_order_details", "arguments": {"order_id": "#W5797164"}, "info": null}, {"action_id": "63_3", "name": "get_order_details", "arguments": {"order_id": "#W5061109"}, "info": null}, {"action_id": "63_4", "name": "get_product_details", "arguments": {"product_id": "4768869376"}, "info": null}, {"action_id": "63_5", "name": "calculate", "arguments": {"expression": "1319.43 - 302.67 + 271.89"}, "info": null}, {"action_id": "63_6", "name": "modify_pending_order_items", "arguments": {"order_id": "#W5061109", "item_ids": ["3254583681"], "new_item_ids": ["2635605237"], "payment_method_id": "paypal_3742148"}, "info": null}], "communicate_info": ["302.67", "20 hours", "1288.65"], "nl_assertions": null}} +{"id": "retail_task_64", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to exchange the camera for the highest resolution, waterproof camera that you can get with the previous purchaced price.\nKnown info:\n\tYou are James Sanchez. You live in Chicago 60623.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\t."}, "evaluation_criteria": {"actions": [{"action_id": "64_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "James", "last_name": "Sanchez", "zip": "60623"}, "info": null}, {"action_id": "64_1", "name": "get_user_details", "arguments": {"user_id": "james_sanchez_3954"}, "info": null}, {"action_id": "64_2", "name": "get_order_details", "arguments": {"order_id": "#W7464385"}, "info": null}, {"action_id": "64_3", "name": "get_order_details", "arguments": {"order_id": "#W8499625"}, "info": null}, {"action_id": "64_4", "name": "get_order_details", "arguments": {"order_id": "#W1279004"}, "info": null}, {"action_id": "64_5", "name": "get_product_details", "arguments": {"product_id": "3377618313"}, "info": null}, {"action_id": "64_6", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W7464385", "item_ids": ["1810466394"], "new_item_ids": ["6700049080"], "payment_method_id": "paypal_1261484"}, "info": null}, {"action_id": "64_7", "name": "modify_pending_order_items", "arguments": {"order_id": "#W7464385", "item_ids": ["1810466394"], "new_item_ids": ["6700049080"], "payment_method_id": "paypal_1261484"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_65", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to exchange the bookshelf from your most recent order for a camera that is closest but not more expensive than the price of the bookshelf.\nKnown info:\n\tYou are James Kovacs from San Jose CA, 95190.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\t."}, "evaluation_criteria": {"actions": [{"action_id": "65_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "James", "last_name": "Kovacs", "zip": "95190"}, "info": null}, {"action_id": "65_1", "name": "get_user_details", "arguments": {"user_id": "james_kovacs_9247"}, "info": null}, {"action_id": "65_2", "name": "get_order_details", "arguments": {"order_id": "#W5362037"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_66", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to change the luggage set in your order for a coat. Your goal is to change the order. If there is no way to do that, return the item specifically. If there are any issues, cancel the entire order.\nKnown info:\n\tYou are Aarav Lee. You live in Phoenix, AZ 85025.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\t."}, "evaluation_criteria": {"actions": [{"action_id": "66_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Aarav", "last_name": "Lee", "zip": "85025"}, "info": null}, {"action_id": "66_1", "name": "get_user_details", "arguments": {"user_id": "aarav_lee_1982"}, "info": null}, {"action_id": "66_2", "name": "get_order_details", "arguments": {"order_id": "#W3361211"}, "info": null}, {"action_id": "66_3", "name": "get_order_details", "arguments": {"order_id": "#W3586556"}, "info": null}, {"action_id": "66_4", "name": "cancel_pending_order", "arguments": {"order_id": "#W3361211", "reason": "no longer needed"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_67", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to check how much you paid for the order that you most recently placed. You are not sure how long ago the order was placed.\nKnown info:\n\tYou are user noah_ito_3850 living in Seattle WA 98187. Your name is Noah but you go by NoNo.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tIf asked for your zip code, say that it is 98178 first (common mistake), then correct yourself and say 98186 if an error is found. If that fails, then say 98187."}, "evaluation_criteria": {"actions": [{"action_id": "67_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Noah", "last_name": "Ito", "zip": "98178"}, "info": null}, {"action_id": "67_1", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Noah", "last_name": "Ito", "zip": "98186"}, "info": null}, {"action_id": "67_2", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Noah", "last_name": "Ito", "zip": "98187"}, "info": null}, {"action_id": "67_3", "name": "get_user_details", "arguments": {"user_id": "noah_ito_3850"}, "info": null}, {"action_id": "67_4", "name": "get_order_details", "arguments": {"order_id": "#W6729841"}, "info": null}], "communicate_info": ["829.43"], "nl_assertions": null}} +{"id": "retail_task_68", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to check how much you paid for the order that you most recently placed. You are not sure how long ago the order was placed.\nKnown info:\n\tYou are user noah_ito_3850 living in Seattle WA 98187.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tIf asked for your zip code, say that it is 98178 first (common mistake), then correct yourself and say 98187 if an error is found."}, "evaluation_criteria": {"actions": [{"action_id": "68_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Noah", "last_name": "Ito", "zip": "98178"}, "info": null}, {"action_id": "68_1", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Noah", "last_name": "Ito", "zip": "98187"}, "info": null}, {"action_id": "68_2", "name": "get_user_details", "arguments": {"user_id": "noah_ito_3850"}, "info": null}, {"action_id": "68_3", "name": "get_order_details", "arguments": {"order_id": "#W6729841"}, "info": null}], "communicate_info": ["829.43"], "nl_assertions": null}} +{"id": "retail_task_69", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to return an item you just received: a laptop. You think that you ordered it around April 2023 but are not sure. You want to return it because you found a better deal elsewhere. You want to return it for a full refund. If it cannot be returned, see if it can be canceled.\nKnown info:\n\tYou are emma_smith_8564 living in New York, New York, 10192.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are polite and friendly."}, "evaluation_criteria": {"actions": [{"action_id": "69_0", "name": "find_user_id_by_name_zip", "arguments": {"first_name": "Emma", "last_name": "Smith", "zip": "10192"}, "info": null}, {"action_id": "69_1", "name": "get_user_details", "arguments": {"user_id": "emma_smith_8564"}, "info": null}, {"action_id": "69_2", "name": "get_order_details", "arguments": {"order_id": "#W2417020"}, "info": null}, {"action_id": "69_3", "name": "cancel_pending_order", "arguments": {"order_id": "#W2417020", "reason": "no longer needed"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_70", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou recently received a helmet but you are not happy with it and want to exchange. The size is too small and you want medium, plus you want high ventilation. If multiple colors are available, you prefer blue. You do not want the You prefer original payment to pay for the price difference, and you want to know how much you need to pay today.\nKnown info:\n\tYou name is Sofia Hernandez and your zip code is 98193.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are impatient, confident, direct, messy."}, "evaluation_criteria": {"actions": [{"action_id": "70_0", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W3947049", "item_ids": ["3358616356"], "new_item_ids": ["9013366374"], "payment_method_id": "credit_card_7901829"}, "info": null}], "communicate_info": ["22.55"], "nl_assertions": null}} +{"id": "retail_task_71", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou made some mistake and ordered an order sent to your son's address in Washington DC, and you want to modify it to your default address in Charlotte (you do not want to mention it, but it is in your user profile the agent can look up) because he is coming back home. You also want to adjust the desk lamp to be black color, and the backpack to be medium size and polyester material instead. If multiple colors are available for the backpack, you prefer grey. If the agent asks for payment method, you say GC initially, but when the agent asks you to confirm before proceeding, you change your mind to PayPal, and decide to only modify the backpack.\nKnown info:\n\tYou name is Ivan Khan and your zip code is 28243.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are polite, optimistic, organized."}, "evaluation_criteria": {"actions": [{"action_id": "71_0", "name": "modify_pending_order_address", "arguments": {"order_id": "#W5270061", "address1": "159 Hickory Lane", "address2": "Suite 995", "city": "Charlotte", "country": "USA", "state": "NC", "zip": "28243"}, "info": null}, {"action_id": "71_1", "name": "modify_pending_order_items", "arguments": {"order_id": "#W5270061", "item_ids": ["2492465580"], "new_item_ids": ["5917587651"], "payment_method_id": "paypal_7729105"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_72", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou made some mistake and sent an order your son's address in Washington DC, and you want to modify it to your default address in Charlotte instead (you do not want to mention it, but it is in your user profile the agent can look up) because he is coming back home. You also want to adjust the desk lamp to be black color, and the backpack to be medium size and polyester material instead. If multiple colors are available for the backpack, you prefer grey. If the agent asks for payment method, you say gift card initially, but when the agent asks you to confirm before proceeding, you change your mind to using PayPal, and also decide to only modify the backpack. Make sure you briefly mention the two things at the same time at the beginning, but first mention the modification then the address.\nKnown info:\n\tYou name is Ivan Khan and your zip code is 28243.\nUnknown info:\n\tYou do not remember your email address.\nTask instructions:\n\tYou are polite, optimistic, organized."}, "evaluation_criteria": {"actions": [{"action_id": "72_0", "name": "modify_pending_order_address", "arguments": {"order_id": "#W5270061", "address1": "159 Hickory Lane", "address2": "Suite 995", "city": "Charlotte", "country": "USA", "state": "NC", "zip": "28243"}, "info": null}, {"action_id": "72_1", "name": "modify_pending_order_items", "arguments": {"order_id": "#W5270061", "item_ids": ["2492465580"], "new_item_ids": ["5917587651"], "payment_method_id": "paypal_7729105"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_73", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to return everything you just bought except the coffee machine.\nKnown info:\n\tYou name is Fatima Wilson and your email is fatima.wilson5721@example.com.\nTask instructions:\n\tYou are polite, flexible, creative."}, "evaluation_criteria": {"actions": [{"action_id": "73_0", "name": "return_delivered_order_items", "arguments": {"order_id": "#W5272531", "item_ids": ["7228247242", "2698416822", "8098621301", "3320557165"], "payment_method_id": "credit_card_6824399"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_74", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou recently bought a laptop, but you want to exchange it to i9 CPU. If multiple storage options are available, you prefer 256GB SSD. If multiple colors are available, you prefer silver. You also have a pending order with five items (you don't remember order ID), and you want to cancel it because you no longer need them.\nKnown info:\n\tYou name is Lei Li and your zip code is 85033.\nUnknown info:\n\tYou don't have an email. If the agent asks for payment method for the order modification, say you want to use your credit card.\nTask instructions:\n\tYou are insecure and shy."}, "evaluation_criteria": {"actions": [{"action_id": "74_0", "name": "cancel_pending_order", "arguments": {"order_id": "#W3189752", "reason": "no longer needed"}, "info": null}, {"action_id": "74_1", "name": "modify_pending_order_items", "arguments": {"order_id": "#W5166363", "item_ids": ["3334537816"], "new_item_ids": ["3265035808"], "payment_method_id": "credit_card_4466831"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_75", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tFor #W6908222, you want to exchange Wireless Earbuds {'color': 'blue', 'battery life': '8 hours', 'water resistance': 'IPX4'} to one with the following attributes: {'color': 'black', 'battery life': '4 hours', 'water resistance': 'not resistant'}\nKnown info:\n\tYou name is Liam Moore and your email is liam.moore6985@example.com.\nTask instructions:\n\tYou are direct, patient, organized, optimistic."}, "evaluation_criteria": {"actions": [{"action_id": "75_0", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W6908222", "item_ids": ["8555936349"], "new_item_ids": ["4063058357"], "payment_method_id": "paypal_4518393"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_76", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou ordered a fleece jacket by mistake and want to remove it from your pending order. If removing one item is not possible, cancel the whole order. You also want to modify the skateboard to maple material, 34 inch, graphic. If that is not possible, cancel the order since you no longer need this one. \nFinally, you also want to know the total price for all the grills you have bought in previous orders.\nKnown info:\n\tYou name is Ava Nguyen and your zip code is 94128.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are polite, optimistic, busy."}, "evaluation_criteria": {"actions": [{"action_id": "76_0", "name": "cancel_pending_order", "arguments": {"order_id": "#W8367380", "reason": "ordered by mistake"}, "info": null}, {"action_id": "76_1", "name": "cancel_pending_order", "arguments": {"order_id": "#W1242543", "reason": "no longer needed"}, "info": null}], "communicate_info": ["1939.05"], "nl_assertions": null}} +{"id": "retail_task_77", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou ordered a perfume and you just tried a little bit and you like it a lot. You want to get the maximum size available for it.\nKnown info:\n\tYou name is Ivan Johnson and your zip code is 94183.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tIf the agent cannot help with placing a new order, ask to exchange your current bottle for the largest size available."}, "evaluation_criteria": {"actions": [{"action_id": "77_0", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W1671835", "item_ids": ["5081446110"], "new_item_ids": ["3399869890"], "payment_method_id": "paypal_6918118"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_78", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou have a couple requests.\nFor order #W5056519, you want to change the address to be the same as order #W8277957. For order #W5056519, you want to exchange Makeup Kit {'skin tone': 'light', 'kit size': 'professional', 'brand': 'Brand B'} to {'skin tone': 'dark', 'brand': 'Brand A'}. Finally, you want to cancel order #W5995614 because you ordered by mistake.\nKnown info:\n\tYour name is Yara Muller and your email is yara.muller9246@example.com.\nTask instructions:\n\tYou are sad, organized, pessimistic."}, "evaluation_criteria": {"actions": [{"action_id": "78_0", "name": "modify_pending_order_address", "arguments": {"order_id": "#W5056519", "address1": "380 Maple Drive", "address2": "Suite 960", "city": "San Diego", "country": "USA", "state": "CA", "zip": "92101"}, "info": null}, {"action_id": "78_1", "name": "modify_pending_order_items", "arguments": {"order_id": "#W5056519", "item_ids": ["7902309762"], "new_item_ids": ["1573035764"], "payment_method_id": "credit_card_3095586"}, "info": null}, {"action_id": "78_2", "name": "cancel_pending_order", "arguments": {"order_id": "#W5995614", "reason": "ordered by mistake"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_79", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou just bought a water bottle with 500ml but you regret it, and you want to change it to the other bottle you just placed with 1000ml capacity. If the exact item is not available any more, you can allow the material to be different, but you want the color to be the same as your other 1L bottle.\nKnown info:\n\tYou name is Emma Kovacs and your zip code is 32190.\nUnknown info:\n\tYou do not know your email. You do not know order numbers.\nTask instructions:\n\tYou are insecure, rigid, sad, logical. You do not want to cancel any orders."}, "evaluation_criteria": {"actions": [{"action_id": "79_0", "name": "modify_pending_order_items", "arguments": {"order_id": "#W8661412", "item_ids": ["3453331371"], "new_item_ids": ["2439754078"], "payment_method_id": "credit_card_7239357"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_80", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tFor #W7209932, exchange T-Shirt {'color': 'blue', 'size': 'S', 'material': 'polyester', 'style': 'v-neck'} to {'color': 'red', 'size': 'XXL', 'material': 'cotton', 'style': 'crew neck'}; Use the gift card.\nKnown info:\n\tYou name is Amelia Gonzalez and your email is amelia.gonzalez4271@example.com.\nTask instructions:\n\tYou are curious, patient, outgoing. Try to make the conversation as confusing for the agent as possible."}, "evaluation_criteria": {"actions": [{"action_id": "80_0", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W7209932", "item_ids": ["5047954489"], "new_item_ids": ["9354168549"], "payment_method_id": "gift_card_2611937"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_81", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tDue to some life changes, you no longer need hiking boots, watch, keyboard, charger, jacket, and running shoes. If cancelling part of the order is not possible, you don't care, just cancel the whole order.\nKnown info:\n\tYou name is James Kim and your email is james.kim1995@example.com.\nTask instructions:\n\tYou are sad, independent, polite."}, "evaluation_criteria": {"actions": [{"action_id": "81_0", "name": "cancel_pending_order", "arguments": {"order_id": "#W3289292", "reason": "no longer needed"}, "info": null}, {"action_id": "81_1", "name": "cancel_pending_order", "arguments": {"order_id": "#W9722559", "reason": "no longer needed"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_82", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou received two tablets and you only need one. You want to return the more expensive one and refund to credit card. If refund to credit card is not possible, you become angry and return everything on that order and refund to GC.\nKnown info:\n\tYou name is Chen Silva and your zip code is 46281.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are messy, flexible, outgoing."}, "evaluation_criteria": {"actions": [{"action_id": "82_0", "name": "return_delivered_order_items", "arguments": {"order_id": "#W9571698", "item_ids": ["5952720925", "9973034634", "7381052709", "6065192424"], "payment_method_id": "gift_card_7250692"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_83", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou received two tablets and you only need one. You want to return the more expensive one and refund to credit card. If refund to credit card is not possible, you become angry and refund to GC.\nKnown info:\n\tYou name is Chen Silva and your zip code is 46281.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are messy, flexible, outgoing."}, "evaluation_criteria": {"actions": [{"action_id": "83_0", "name": "return_delivered_order_items", "arguments": {"order_id": "#W9571698", "item_ids": ["6065192424"], "payment_method_id": "gift_card_7250692"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_84", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou received two tablets and you only need one. You want to return the less expensive one and refund to credit card. But when the agent asks for confirmation, you change your mind and return the more expensive one and ask for a refund to gift card.\nKnown info:\n\tYou name is Chen Silva and your zip code is 46281.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are messy, flexible, outgoing."}, "evaluation_criteria": {"actions": [{"action_id": "84_0", "name": "return_delivered_order_items", "arguments": {"order_id": "#W9571698", "item_ids": ["6065192424"], "payment_method_id": "gift_card_7250692"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_85", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to exchange your Fleece Jacket for a large red Fleece Jacket with a half zipper\nKnown info:\n\tYou name is Yusuf Hernandez and your email is yusuf.hernandez8836@example.com.\nTask instructions:\n\tYou are shy, rigid."}, "evaluation_criteria": {"actions": [{"action_id": "85_0", "name": "modify_pending_order_items", "arguments": {"order_id": "#W2466703", "item_ids": ["9385662952"], "new_item_ids": ["8733974883"], "payment_method_id": "paypal_7529813"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_86", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to exchange your Fleece Jacket to red color and half zipper. You also want to want to change your default address to your Washington DC address (which you do not want to reveal but is in one of the orders).\nKnown info:\n\tYou name is Yusuf Hernandez and your email is yusuf.hernandez8836@example.com.\nTask instructions:\n\tYou are shy, rigid."}, "evaluation_criteria": {"actions": [{"action_id": "86_0", "name": "modify_pending_order_items", "arguments": {"order_id": "#W2466703", "item_ids": ["9385662952"], "new_item_ids": ["8733974883"], "payment_method_id": "paypal_7529813"}, "info": null}, {"action_id": "86_1", "name": "modify_user_address", "arguments": {"user_id": "yusuf_hernandez_6785", "address1": "565 Maple Drive", "address2": "Suite 501", "city": "Washington", "country": "USA", "state": "DC", "zip": "20307"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_87", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to modify all your pending order address to the Washington DC address (which you do not want to reveal but is in one of the orders), along with your user default address.\nKnown info:\n\tYou name is Yusuf Hernandez and your email is yusuf.hernandez8836@example.com.\nTask instructions:\n\tYou are shy, rigid."}, "evaluation_criteria": {"actions": [{"action_id": "87_0", "name": "modify_pending_order_address", "arguments": {"order_id": "#W2166301", "address1": "565 Maple Drive", "address2": "Suite 501", "city": "Washington", "country": "USA", "state": "DC", "zip": "20307"}, "info": null}, {"action_id": "87_1", "name": "modify_pending_order_address", "arguments": {"order_id": "#W2466703", "address1": "565 Maple Drive", "address2": "Suite 501", "city": "Washington", "country": "USA", "state": "DC", "zip": "20307"}, "info": null}, {"action_id": "87_2", "name": "modify_pending_order_address", "arguments": {"order_id": "#W6832752", "address1": "565 Maple Drive", "address2": "Suite 501", "city": "Washington", "country": "USA", "state": "DC", "zip": "20307"}, "info": null}, {"action_id": "87_3", "name": "modify_user_address", "arguments": {"user_id": "yusuf_hernandez_6785", "address1": "565 Maple Drive", "address2": "Suite 501", "city": "Washington", "country": "USA", "state": "DC", "zip": "20307"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_88", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to change the book shelf to 4 foot but with the same material and color. If it is not available, cancel the whole order and you will buy again. If the agent asks for the cancellation reason, you say you ordered by mistake.\nKnown info:\n\tYou name is Daiki Silva and your email is daiki.silva6295@example.com.\nTask instructions:\n\tYou are insecure, creative, direct, relaxing."}, "evaluation_criteria": {"actions": [{"action_id": "88_0", "name": "cancel_pending_order", "arguments": {"order_id": "#W8835847", "reason": "ordered by mistake"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_89", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to know what is the cheapest available mechanical keyboard right now and its options. If it is less than 200 bucks you want to exchange your current one to it. If not, you want to return your current one.\nKnown info:\n\tYou name is Raj Santos and your zip code is 98157.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are dependent, flexible."}, "evaluation_criteria": {"actions": [{"action_id": "89_0", "name": "return_delivered_order_items", "arguments": {"order_id": "#W4680753", "item_ids": ["9690244451"], "payment_method_id": "paypal_2417743"}, "info": null}], "communicate_info": ["226.11", "tactile", "white", "full"], "nl_assertions": null}} +{"id": "retail_task_90", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to know if the digital camera you just bought is 10x zoom. If not, modify the item to 10x zoom without changing the other options. If 10x zoom is not available, cancel the order with the reason of no longer needed. If it is available but the price is more than 3000, cancel the order with the reason of ordered by mistake.\nKnown info:\n\tYou name is Emma Kovacs and your email is emma.kovacs2974@example.com.\nTask instructions:\n\tYou are polite, curious, flexible, relaxing, yet impatient."}, "evaluation_criteria": {"actions": [{"action_id": "90_0", "name": "cancel_pending_order", "arguments": {"order_id": "#W9284598", "reason": "ordered by mistake"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_91", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\t.You are angry about the quality of the two skateboards you just bought. You want to return them and refund to credit card. If the agent asks for confirmation, do not say yes immediately, because you also want to return the smart watch. You also want to return the e-reader you just bought. If the same item is available online, you're willing to exchange it to the same item. If not, you want to return it and refund to credit card.\nKnown info:\n\tYou name is Mei Ahmed and your zip code is 78705.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are polite, outgoing."}, "evaluation_criteria": {"actions": [{"action_id": "91_0", "name": "return_delivered_order_items", "arguments": {"order_id": "#W7553978", "item_ids": ["4545791457", "3098764622", "1631806422"], "payment_method_id": "credit_card_5902940"}, "info": null}, {"action_id": "91_1", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W3239882", "item_ids": ["9494281769"], "new_item_ids": ["9494281769"], "payment_method_id": "credit_card_5902940"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_92", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou are angry about the quality of the two skateboards you just bought. You want to return them and refund to credit card. If the agent asks for confirmation, do not say yes immediately, because you also want to return the smart watch and e-reader.\nKnown info:\n\tYou name is Mei Ahmed and your zip code is 78705.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are polite and outgoing."}, "evaluation_criteria": {"actions": [{"action_id": "92_0", "name": "return_delivered_order_items", "arguments": {"order_id": "#W7553978", "item_ids": ["4545791457", "3098764622", "1631806422"], "payment_method_id": "credit_card_5902940"}, "info": null}, {"action_id": "92_1", "name": "return_delivered_order_items", "arguments": {"order_id": "#W3239882", "item_ids": ["9494281769"], "payment_method_id": "credit_card_5902940"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_93", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou received a laptop and you want to exchange it to i7 processor, 8GB, 1TB SSD. If the agent asks for which laptop, it is 15-inch, 32GB.\nKnown info:\n\tYou name is Lei Wilson and your zip code is 32255.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are confident, organized, creative, impatient."}, "evaluation_criteria": {"actions": [{"action_id": "93_0", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W4073673", "item_ids": ["2216662955"], "new_item_ids": ["9844888101"], "payment_method_id": "credit_card_3677959"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_94", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou received a laptop and you want to exchange it to i7 processor, 8GB, 1TB SSD. If the agent asks for which laptop, it is 15-inch, 16GB.\nKnown info:\n\tYou name is Lei Wilson and your zip code is 32255.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are confident, organized, creative, impatient."}, "evaluation_criteria": {"actions": [{"action_id": "94_0", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W2905754", "item_ids": ["3478699712"], "new_item_ids": ["9844888101"], "payment_method_id": "credit_card_3677959"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_95", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou received a laptop and you want to exchange it to i7 processor, 8GB, 1TB SSD. If the agent asks for which laptop, it is 15-inch, and it is actually two laptops that you want to exchange. You want to know how much you need to pay today in total.\nKnown info:\n\tYou name is Lei Wilson and your zip code is 32255.\nUnknown info:\n\tYou do not remember your email address.\nTask instructions:\n\tYou are confident, organized, creative, impatient."}, "evaluation_criteria": {"actions": [{"action_id": "96_0", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W2905754", "item_ids": ["3478699712"], "new_item_ids": ["9844888101"], "payment_method_id": "credit_card_3677959"}, "info": null}, {"action_id": "96_1", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W4073673", "item_ids": ["2216662955"], "new_item_ids": ["9844888101"], "payment_method_id": "credit_card_3677959"}, "info": null}], "communicate_info": ["167.87", "60.78", "107.09"], "nl_assertions": null}} +{"id": "retail_task_96", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to change your LA order to your NYC address (you prefer not to reveal it but it is in your other order). You also want to exchange Bluetooth Speaker to be the cheapest green type.\nKnown info:\n\tYou name is Yusuf Li and your zip code is 91148.\nUnknown info:\n\tYou do not remember your email address.\nTask instructions:\n\tYou are cautious, insecure, organized."}, "evaluation_criteria": {"actions": [{"action_id": "97_0", "name": "modify_pending_order_address", "arguments": {"order_id": "#W6750959", "address1": "476 Maple Drive", "address2": "Suite 432", "city": "New York", "country": "USA", "state": "NY", "zip": "10093"}, "info": null}, {"action_id": "97_1", "name": "modify_pending_order_items", "arguments": {"order_id": "#W6750959", "item_ids": ["3254583681"], "new_item_ids": ["9440686670"], "payment_method_id": "paypal_8080730"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_97", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to change your LA order to your NYC address (you prefer not to reveal it but it is in your other order). You also want to exchange Bluetooth Speaker to be the cheapest green type. Make sure you mention the two requests at the same time to the agent, but mention the exchange first.\nKnown info:\n\tYou name is Yusuf Li and your zip code is 91148.\nUnknown info:\n\tYou don't know your email.\nTask instructions:\n\tYou are cautious, insecure, organized."}, "evaluation_criteria": {"actions": [{"action_id": "98_0", "name": "modify_pending_order_address", "arguments": {"order_id": "#W6750959", "address1": "476 Maple Drive", "address2": "Suite 432", "city": "New York", "country": "USA", "state": "NY", "zip": "10093"}, "info": null}, {"action_id": "98_1", "name": "modify_pending_order_items", "arguments": {"order_id": "#W6750959", "item_ids": ["3254583681"], "new_item_ids": ["9440686670"], "payment_method_id": "paypal_8080730"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_98", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to exchange your Bicycle to a larger frame size for your kid. Jigsaw Puzzle in the same order also needs to be exchanged, you want the same difficulty, but 1000 more pieces, and you prefer animal than art theme if both are available. Make sure you mention these at the same time. You also want to exchange your camera to a slightly lower resolution, without changing the other options. If the agent asks for confirmation, mention that you'd prefer the other card as payment or refund method. Lastly, you want to cancel the skateboard in your other order.\nKnown info:\n\tYou name is Sofia Li and your zip code is 78260.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are outgoing, organized, cautious, pessimistic. If you cannot cancel one single item, you are okay with cancelling the whole order, with the reason of no longer needed."}, "evaluation_criteria": {"actions": [{"action_id": "99_0", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W4689314", "item_ids": ["5996159312"], "new_item_ids": ["8363011723"], "payment_method_id": "credit_card_8105988"}, "info": null}, {"action_id": "99_1", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W3916020", "item_ids": ["7758198585", "4068787148"], "new_item_ids": ["5606522780", "6245746168"], "payment_method_id": "credit_card_8105988"}, "info": null}, {"action_id": "99_2", "name": "cancel_pending_order", "arguments": {"order_id": "#W8855135", "reason": "no longer needed"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_99", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to exchange your Bicycle to a larger frame size for your kid. The Jigsaw Puzzle in the same order also needs to be exchanged, you want the same difficulty, but 1000 more pieces, and you prefer art over the animal theme if both are available. Make sure you mention these at the same time. \nYou also want to exchange your camera to a slightly lower resolution, without changing the other options. For both orders, you'd prefer the visa card as payment or refund method. Lastly, you want to cancel the skateboard (and only the skateboard) in one of your orders.\nKnown info:\n\tYou name is Sofia Li and your zip code is 78260.\nUnknown info:\n\tYou do not remember your email address.\nTask instructions:\n\tYou are outgoing, organized, cautious, pessimistic. \nIf you cannot cancel just a single item in an order, you are okay with cancelling the whole order, but you will do it yourself on the website and don't need for the agent to help."}, "evaluation_criteria": {"actions": [{"action_id": "100_0", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W4689314", "item_ids": ["5996159312"], "new_item_ids": ["8363011723"], "payment_method_id": "credit_card_3951670"}, "info": null}, {"action_id": "100_1", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W3916020", "item_ids": ["7758198585", "4068787148"], "new_item_ids": ["5606522780", "5546244844"], "payment_method_id": "credit_card_3951670"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_100", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to return your luggage set and get the exact same item but with red color, and return you skateboard in the same order to get a new one with features {'length': '34 inch', 'design': 'custom'}; You also want to return the hiking boots.\nKnown info:\n\tYou name is Liam Thomas and your zip code is 85049.\nUnknown info:\n\tYou do not remember your email address.\nTask instructions:\n\tYou are pessimistic and an insecure person.\nIf the agent says pending orders cannot be exchanged, ask for changing the order to your requirements."}, "evaluation_criteria": {"actions": [{"action_id": "101_0", "name": "modify_pending_order_items", "arguments": {"order_id": "#W3295833", "item_ids": ["8926329222", "5312063289"], "new_item_ids": ["7160999700", "6956751343"], "payment_method_id": "credit_card_3261838"}, "info": null}, {"action_id": "101_1", "name": "return_delivered_order_items", "arguments": {"order_id": "#W8488728", "item_ids": ["5676696062"], "payment_method_id": "paypal_3650980"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_101", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou just placed an order with two watches, you want to change its address to your New York address (you don't want to reveal it but it's in your other order). You also want to modify the silicone watch to a metal one. If multiple colors available, you prefer white. \nIn another order, you have an air purifier along with a speaker, and you want to change the purifier to large size and night mode, but still with HEPA filter. You are certain that this order contains both an air purifier and a speaker.\nKnown info:\n\tYou name is Noah Ito and your zip code is 98187.\nUnknown info:\n\tYou do not remember your email address.\nTask instructions:\n\tYou are logical but also impatient. You like to say things in pieces."}, "evaluation_criteria": {"actions": [{"action_id": "102_0", "name": "modify_pending_order_address", "arguments": {"order_id": "#W4219264", "address1": "144 Lakeview Drive", "address2": "Suite 925", "city": "New York", "country": "USA", "state": "NY", "zip": "10228"}, "info": null}, {"action_id": "102_1", "name": "modify_pending_order_items", "arguments": {"order_id": "#W4219264", "item_ids": ["8886009523"], "new_item_ids": ["2407258246"], "payment_method_id": "credit_card_1620755"}, "info": null}, {"action_id": "102_2", "name": "modify_pending_order_items", "arguments": {"order_id": "#W6729841", "item_ids": ["3076708684"], "new_item_ids": ["8302289002"], "payment_method_id": "credit_card_1620755"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_102", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou just placed an order with two watches, you want to change its address to your New York address (you don't want to reveal it but it's in your other order). You also want to modify the silicone watch to a metal one. If multiple colors available, you prefer white. For the air purifier you received along with sneakers, you want to exchange the purifier to large size and night mode, but still with HEPA filter.\nKnown info:\n\tYou name is Noah Ito and your zip code is 98187.\nUnknown info:\n\tYou do not remember your email address.\nTask instructions:\n\tYou are logical but also impatient. You like to say things in pieces."}, "evaluation_criteria": {"actions": [{"action_id": "103_0", "name": "modify_pending_order_address", "arguments": {"order_id": "#W4219264", "address1": "144 Lakeview Drive", "address2": "Suite 925", "city": "New York", "country": "USA", "state": "NY", "zip": "10228"}, "info": null}, {"action_id": "103_1", "name": "modify_pending_order_items", "arguments": {"order_id": "#W4219264", "item_ids": ["8886009523"], "new_item_ids": ["2407258246"], "payment_method_id": "credit_card_1620755"}, "info": null}, {"action_id": "103_2", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W3445693", "item_ids": ["6341716129"], "new_item_ids": ["8302289002"], "payment_method_id": "credit_card_1620755"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_103", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to return the bookshelf and jigsaw you received in the same order. Make sure you mention at the beginning that you want to cancel these two things, and they are from the same order. You also want to return the backpack you received with the vacuum cleaner. You also want to change your pending order address to the default Chicago one, and change its item color to red. You want to get the tracking number of your cancelled order.\nKnown info:\n\tYou name is Lucas Brown and your email is lucas.brown9344@example.com.\nTask instructions:\n\tYou are busy, happy, outgoing, messy, optimistic. You like to say one thing at a time."}, "evaluation_criteria": {"actions": [{"action_id": "104_0", "name": "return_delivered_order_items", "arguments": {"order_id": "#W6239298", "item_ids": ["4900661478", "3614853563"], "payment_method_id": "credit_card_2112420"}, "info": null}, {"action_id": "104_1", "name": "return_delivered_order_items", "arguments": {"order_id": "#W9218746", "item_ids": ["7824298782"], "payment_method_id": "credit_card_2112420"}, "info": null}, {"action_id": "104_2", "name": "modify_pending_order_address", "arguments": {"order_id": "#W4860251", "address1": "921 Park Avenue", "address2": "Suite 892", "city": "Chicago", "country": "USA", "state": "IL", "zip": "60612"}, "info": null}, {"action_id": "104_3", "name": "modify_pending_order_items", "arguments": {"order_id": "#W4860251", "item_ids": ["5209958006"], "new_item_ids": ["8964750292"], "payment_method_id": "credit_card_2112420"}, "info": null}], "communicate_info": ["286422338955"], "nl_assertions": null}} +{"id": "retail_task_104", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to return all bookshelves and jigsaw puzzles you received in different orders. Make sure you mention at the beginning that you want to cancel these two things, and they are from different orders. You also want to return the backpack you received with the vacuum cleaner. You also want to change your pending order item to red, and change the address of the order to your default Chicago home (you won't reveal it for private reasons but it's in your profile). You want to get the tracking number of your cancelled order.\nKnown info:\n\tYou name is Lucas Brown and your email is lucas.brown9344@example.com.\nTask instructions:\n\tYou are busy, happy, outgoing, messy, optimistic. You like to say one thing at a time."}, "evaluation_criteria": {"actions": [{"action_id": "105_0", "name": "return_delivered_order_items", "arguments": {"order_id": "#W8660475", "item_ids": ["8479046075"], "payment_method_id": "credit_card_2112420"}, "info": null}, {"action_id": "105_1", "name": "return_delivered_order_items", "arguments": {"order_id": "#W9218746", "item_ids": ["7824298782"], "payment_method_id": "credit_card_2112420"}, "info": null}, {"action_id": "105_2", "name": "modify_pending_order_address", "arguments": {"order_id": "#W4860251", "address1": "921 Park Avenue", "address2": "Suite 892", "city": "Chicago", "country": "USA", "state": "IL", "zip": "60612"}, "info": null}, {"action_id": "105_3", "name": "modify_pending_order_items", "arguments": {"order_id": "#W4860251", "item_ids": ["5209958006"], "new_item_ids": ["8964750292"], "payment_method_id": "credit_card_2112420"}, "info": null}, {"action_id": "action_1746656550382", "name": "return_delivered_order_items", "arguments": {"order_id": "#W6239298", "item_ids": ["4900661478", "3614853563"], "payment_method_id": "credit_card_2112420"}, "info": null}], "communicate_info": ["286422338955"], "nl_assertions": null}} +{"id": "retail_task_105", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tFor order #W4316152, you want to exchange one Tea Kettle {'material': 'glass', 'capacity': '2 liters', 'stovetop compatibility': 'induction'} to {'material': 'ceramic', 'stovetop compatibility': 'gas'}; and another Tea Kettle {'material': 'glass', 'capacity': '2 liters', 'stovetop compatibility': 'induction'} to {'capacity': '1.5 liters', 'stovetop compatibility': 'gas'};\nKnown info:\n\tYou name is Aarav Anderson and your zip code is 19031.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are cautious, messy, rigid."}, "evaluation_criteria": {"actions": [{"action_id": "106_0", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W4316152", "item_ids": ["7292993796", "7292993796"], "new_item_ids": ["3761330360", "9647374798"], "payment_method_id": "gift_card_7245904"}, "info": null}], "communicate_info": [], "nl_assertions": ["Agent should exchange both tea kettles (same style) to the items requested"]}} +{"id": "retail_task_106", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to exchange your T-Shirt because it is too big, one size smaller would be good. You like the cotton feeling. If multiple colors are available, you prefer black.\nKnown info:\n\tYou name is Sofia Thomas and your emails are sofia.thomas3019@example.com and sofia.thomas3069@example.com.\nUnknown info:\n\t.\nTask instructions:\n\tYou are dependent, pessimistic, direct."}, "evaluation_criteria": {"actions": [{"action_id": "107_0", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W3388163", "item_ids": ["9354168549"], "new_item_ids": ["2060066974"], "payment_method_id": "paypal_5334408"}, "info": null}], "communicate_info": [], "nl_assertions": ["Agent should process the exchange."]}} +{"id": "retail_task_107", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou received hiking boots that seem like they were already worn, you are unhappy about it and want to ask for a new pair with the same specs. You also want to exchange your jigsaw to a more fancy theme, with 500 pieces less. But you want to keep the same difficulty level.\nKnown info:\n\tYou name is Yara Ito and your zip code is 75284.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are happy but messy."}, "evaluation_criteria": {"actions": [{"action_id": "108_0", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W1304208", "item_ids": ["1615379700"], "new_item_ids": ["1615379700"], "payment_method_id": "paypal_1679017"}, "info": null}, {"action_id": "108_1", "name": "exchange_delivered_order_items", "arguments": {"order_id": "#W8353027", "item_ids": ["6245746168"], "new_item_ids": ["3112842858"], "payment_method_id": "paypal_1679017"}, "info": null}], "communicate_info": [], "nl_assertions": ["Agent should exchange items in both orders."]}} +{"id": "retail_task_108", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to return everything but a tablet in a recently delivered order. There is an E-Reader in the order that you want to return.\nKnown info:\n\tYou name is Yusuf Gonzalez and your zip code is 91455.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou want to know how much money you can get back."}, "evaluation_criteria": {"actions": [{"action_id": "109_0", "name": "return_delivered_order_items", "arguments": {"order_id": "#W1679211", "item_ids": ["9612497925", "7127170374", "6268080249"], "payment_method_id": "paypal_3022415"}, "info": null}], "communicate_info": ["346.93"], "nl_assertions": ["Agent should make the return", "Agent should communicate the refund to user."]}} +{"id": "retail_task_109", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou live on Elm Avenue in Houston, and recently you moved to a new house on the same street and bought a luggage set sent to this new address. But you realize you have another order sent to the old address, and you want to change your wrong order address to the new home, and also your user default address to the new home. You do not want to reveal your address but the agent should be able to look it up in orders You do not want to reveal your address and insist the agent should be able to look it up in orders. You also want to exchange your tablet to the cheapest one due to moving costs.\nKnown info:\n\tYou name is Sophia Martin and your email is sophia.martin4832@example.com.\nTask instructions:\n\tYou are organized and outgoing. Make sure to mention the two address changes then the exchange."}, "evaluation_criteria": {"actions": [{"action_id": "110_0", "name": "modify_pending_order_address", "arguments": {"order_id": "#W1603792", "address1": "592 Elm Avenue", "address2": "Suite 978", "city": "Houston", "country": "USA", "state": "TX", "zip": "77242"}, "info": null}, {"action_id": "110_1", "name": "modify_user_address", "arguments": {"user_id": "sophia_martin_8570", "address1": "592 Elm Avenue", "address2": "Suite 978", "city": "Houston", "country": "USA", "state": "TX", "zip": "77242"}, "info": null}, {"action_id": "110_2", "name": "modify_pending_order_items", "arguments": {"order_id": "#W1603792", "item_ids": ["6501071631"], "new_item_ids": ["2106335193"], "payment_method_id": "credit_card_5694100"}, "info": null}], "communicate_info": [], "nl_assertions": ["Agent should make changes to address on order and user profile", "Agent should modify the pending order."]}} +{"id": "retail_task_110", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou live on Elm Avenue in Houston, and recently you moved to a new house on the same street and bought a tablet sent to there. But you realize you have another order sent to the old address, and you want to change your wrong order address to the new home, and also your user default address to the new home. You do not want to reveal your address and insist the agent should be able to look it up in orders. You also want to exchange your tablet to the cheapest one due to moving costs.\nKnown info:\n\tYou name is Sophia Martin and your email is sophia.martin4832@example.com.\nTask instructions:\n\tYou are organized and outgoing. Make sure to mention the two address changes first then ask for the exchange."}, "evaluation_criteria": {"actions": [{"action_id": "111_0", "name": "modify_pending_order_address", "arguments": {"order_id": "#W1092119", "address1": "760 Elm Avenue", "address2": "Suite 564", "city": "Houston", "state": "TX", "country": "USA", "zip": "77034"}, "info": null}, {"action_id": "111_1", "name": "modify_user_address", "arguments": {"user_id": "sophia_martin_8570", "address1": "760 Elm Avenue", "address2": "Suite 564", "city": "Houston", "state": "TX", "country": "USA", "zip": "77034"}, "info": null}, {"action_id": "111_2", "name": "modify_pending_order_items", "arguments": {"order_id": "#W1603792", "item_ids": ["6501071631"], "new_item_ids": ["2106335193"], "payment_method_id": "credit_card_5694100"}, "info": null}], "communicate_info": [], "nl_assertions": ["Agent should modify address for the order", "Agent should modify user address", "Agent should modify the items in the pending order."]}} +{"id": "retail_task_111", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to modify the laptop order to your NYC address (you don't want to reveal it but should be in your orders profile). You also like to modify the laptop to be {'processor': 'i5', 'storage': '256GB SSD', 'color': 'space grey'}; You also want to exchange your watch to be black dial color but keep the leather strap.\nKnown info:\n\tYou name is Yara Silva and your zip code is 77159.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are sad and cautious. You like to say things together."}, "evaluation_criteria": {"actions": [{"action_id": "112_0", "name": "modify_pending_order_items", "arguments": {"order_id": "#W9810810", "item_ids": ["1355937109"], "new_item_ids": ["9949163720"], "payment_method_id": "gift_card_7252880"}, "info": null}, {"action_id": "112_1", "name": "modify_pending_order_address", "arguments": {"order_id": "#W3730488", "address1": "555 Highland Drive", "address2": "Suite 872", "city": "New York", "country": "USA", "state": "NY", "zip": "10116"}, "info": null}, {"action_id": "112_2", "name": "modify_pending_order_items", "arguments": {"order_id": "#W3730488", "item_ids": ["2913673670"], "new_item_ids": ["2216662955"], "payment_method_id": "gift_card_7252880"}, "info": null}], "communicate_info": [], "nl_assertions": ["Agent should modify the items and address as requested."]}} +{"id": "retail_task_112", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to modify your laptop order to your NYC address (you don't want to reveal it yourself but the address should be in your orders profile). You also want to modify the laptop to item number 9844888101. You also want to change your watch for one with black dial color but keeping the leather strap.\nKnown info:\n\tYou name is Yara Silva and your zip code is 77159.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are sad and cautious. You like to say things piecewise."}, "evaluation_criteria": {"actions": [{"action_id": "113_0", "name": "modify_pending_order_items", "arguments": {"order_id": "#W9810810", "item_ids": ["1355937109"], "new_item_ids": ["9949163720"], "payment_method_id": "gift_card_7252880"}, "info": null}, {"action_id": "113_1", "name": "modify_pending_order_address", "arguments": {"order_id": "#W3730488", "address1": "555 Highland Drive", "address2": "Suite 872", "city": "New York", "country": "USA", "state": "NY", "zip": "10116"}, "info": null}, {"action_id": "113_2", "name": "modify_pending_order_items", "arguments": {"order_id": "#W3730488", "item_ids": ["2913673670"], "new_item_ids": ["9844888101"], "payment_method_id": "gift_card_7252880"}, "info": null}], "communicate_info": [], "nl_assertions": null}} +{"id": "retail_task_113", "user_prompt_template": "{observation}", "environment_context": {"domain": "retail"}, "user_simulation": {"enabled": true, "llm": "gpt-4.1", "system_prompt": "Instructions:\n\tDomain: retail\nReason for call:\n\tYou want to cancel all pending orders.\nKnown info:\n\tYou name is Yara Muller and your zip code is 85041.\nUnknown info:\n\tYou do not remember your email address\nTask instructions:\n\tYou are mysterious and don't want to reveal the reason for cancellation until the agent asks. If asked for reason, say you ordered the items by mistake."}, "evaluation_criteria": {"actions": [{"action_id": "114_0", "name": "cancel_pending_order", "arguments": {"order_id": "#W5056519", "reason": "ordered by mistake"}, "info": null}, {"action_id": "114_1", "name": "cancel_pending_order", "arguments": {"order_id": "#W5995614", "reason": "ordered by mistake"}, "info": null}], "communicate_info": [], "nl_assertions": ["Agent should cancel all pending orders"]}} diff --git a/eval_protocol/benchmarks/test_tau_bench_retail.py b/eval_protocol/benchmarks/test_tau_bench_retail.py index a47d1520..f04abe47 100644 --- a/eval_protocol/benchmarks/test_tau_bench_retail.py +++ b/eval_protocol/benchmarks/test_tau_bench_retail.py @@ -30,7 +30,7 @@ def _get_retail_dataset_path() -> str: """Get the retail dataset file path.""" - return str(Path(__file__).parent.parent.parent / "tests" / "pytest" / "data" / "retail_dataset.jsonl") + return str(Path(__file__).parent / "data" / "retail_dataset.jsonl") def _get_server_script_path() -> str: From 6a83a0fe703d59397f7f56890d4893d8dcd57927 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Tue, 19 Aug 2025 19:49:26 -0700 Subject: [PATCH 3/4] add data --- pyproject.toml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index f44b1c8d..4aca6605 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -143,7 +143,10 @@ eval_protocol = "eval_protocol.pytest.plugin" include = ["eval_protocol*", "development*", "vendor*", "examples*"] [tool.setuptools.package-data] -"eval_protocol" = ["../vite-app/dist/**/*"] +"eval_protocol" = [ + "../vite-app/dist/**/*", + "benchmarks/data/*" +] [tool.versioneer] VCS = "git" From a93ab62549fef3eeedd2118044c9194547e2a062 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Tue, 19 Aug 2025 19:51:53 -0700 Subject: [PATCH 4/4] change again --- pyproject.toml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 4aca6605..f93b8f52 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -140,12 +140,13 @@ ep = "eval_protocol.cli:main" eval_protocol = "eval_protocol.pytest.plugin" [tool.setuptools.packages.find] -include = ["eval_protocol*", "development*", "vendor*", "examples*"] +include = ["eval_protocol*", "development*", "vendor*"] [tool.setuptools.package-data] "eval_protocol" = [ "../vite-app/dist/**/*", - "benchmarks/data/*" + "benchmarks/data/*", + "examples/tau2_mcp/*", ] [tool.versioneer]