From 1f1e9fd47bd59d88b00e85acd27ca4f2ce0090dd Mon Sep 17 00:00:00 2001 From: DeepakAkkil Date: Wed, 2 Jul 2025 15:04:15 +0300 Subject: [PATCH 1/8] update task for clarity --- tasks/webvoyager_template.json | 164 ++++++++++++++++----------------- 1 file changed, 82 insertions(+), 82 deletions(-) diff --git a/tasks/webvoyager_template.json b/tasks/webvoyager_template.json index cb5c4a6..c504e09 100644 --- a/tasks/webvoyager_template.json +++ b/tasks/webvoyager_template.json @@ -242,7 +242,7 @@ { "intent_template": "Using the website {{website}}, find a {{rating}} rated {{recipe}} recipe that takes less than {{time}} to make. Note how many reviews the recipe has and the main ingredients required.", "instantiation_dict": { - "rating": "five-star", + "rating": "perfect 5.0 star", "recipe": "chocolate chip cookie", "time": "1 hour", "website": "https://www.allrecipes.com/" @@ -1317,7 +1317,7 @@ "Does the agent's answer contain details of a book in fiction genre?", "Was the book released in {{year}}?", "Does the book have a minimum of 50 customer reviews?", - "Is the book the highest-rated among fiction books released in {{year}}?", + "Is the book the highest-rated among fiction books released in {{year}} (Hint: There are typically books with 4.9 stars.)?", "Was the task completed using the website mentioned in the prompt: {{website}}" ], "reference_answers": "The Women Library Binding – Large Print, March 1, 2024, 4.8 stars", @@ -1808,7 +1808,7 @@ "domain": "Apple" }, { - "intent_template": "Using the website {{website}}, check if there are trade-in offers for {{item}}", + "intent_template": "Using the website {{website}}, check if there are trade-in offers when purchasing {{item}}", "instantiation_dict": { "item": "iPhone 16 Pro", "website": "https://www.apple.com/" @@ -2362,10 +2362,10 @@ "domain": "ArXiv" }, { - "intent_template": "Using the website {{website}}, find {{item}} in {{section}}, tell me how many styles of {{product}} are available", + "intent_template": "Using the website {{website}}, find {{item}} in {{section}}. Tell me how many styles of {{product}} are available", "instantiation_dict": { - "item": "store", - "section": "arXiv Help", + "item": "Arxiv store", + "section": "arXiv Help section", "product": "arXiv Logo Shirt", "website": "https://arxiv.org/" }, @@ -3030,11 +3030,11 @@ "intent_template": "Using the website {{website}}, Find out which {{profile}} made the headlines in {{section}}.", "instantiation_dict": { "profile": "musician", - "section": "Music News", + "section": "Music section", "website": "https://www.bbc.com/" }, "questions_for_annotation": [ - "Does the agent correctly access the Music News section on BBC?", + "Does the agent correctly access the Music section on BBC?", "Does the agent's answer correctly mention name of atleast one musician who is mentioned in the Music News section?" 
], "reference_answers": "Taylor Swift", @@ -3152,7 +3152,7 @@ "domain": "BBC News" }, { - "intent_template": "Using the website {{website}}, find how many {{category}} related sections are present in the News page.", + "intent_template": "Using the website {{website}}, find how many {{category}} related sections/tabs are present in the News page.", "instantiation_dict": { "category": "War", "website": "https://www.bbc.com/" @@ -6983,7 +6983,7 @@ "destination_city": "Manchester", "date": "@eval:(now() + timedelta(days=40)).strftime('%B %d %Y')", "price_option": "lowest", - "website": "https://www.google.com/travel/flights/" + "website": "https://flights.google.com/" }, "questions_for_annotation": [ "Does the agent search for flight options from Edinburg to Manchester with return on the same day for the provided date?", @@ -7004,7 +7004,7 @@ "date": "today", "departure_city": "Chicago", "destination_city": "Paris", - "website": "https://www.google.com/travel/flights/" + "website": "https://flights.google.com/" }, "questions_for_annotation": [ "Does the agent search for one-way flight options from Chicago to Paris for today? (Note: Today refers to the day the agent performed the task)", @@ -7025,7 +7025,7 @@ "departure": "JFK", "destination": "Heathrow", "date": "@eval:(now() + timedelta(days=27)).strftime('%B %d')", - "website": "https://www.google.com/travel/flights/" + "website": "https://flights.google.com/" }, "questions_for_annotation": [ "Does the agent search for one-way flight options from JFK to Heathrow for 1 passenger for the provided date?", @@ -7047,7 +7047,7 @@ "destination_city": "New York", "date": "@eval:(now() + timedelta(days=32)).strftime('%B %d')", "criteria": "lowest carbon dioxide emissions", - "website": "https://www.google.com/travel/flights/" + "website": "https://flights.google.com/" }, "questions_for_annotation": [ "Does the agent search for a one-way flight from Calgary to New York for the provided date?", @@ -7069,7 +7069,7 @@ "destination_city": "London", "departure_date": "@eval:(now() + timedelta(days=47)).strftime('%B %d')", "flight_type": "non-stop", - "website": "https://www.google.com/travel/flights/" + "website": "https://flights.google.com/" }, "questions_for_annotation": [ "Does the agent search for a one-way flight from New York to London for the provided date?", @@ -7091,7 +7091,7 @@ "departure_city": "Tel Aviv", "destination_city": "Venice", "class_type": "First Class", - "website": "https://www.google.com/travel/flights/" + "website": "https://flights.google.com/" }, "questions_for_annotation": [ "Does the agent search for a round trip flight from Tel Aviv to Venice for the provided date?", @@ -7114,7 +7114,7 @@ "return_date": "@eval:(now() + timedelta(days=63)).strftime('%B %d %Y')", "class_type": "First Class", "price": "$1320", - "website": "https://www.google.com/travel/flights/" + "website": "https://flights.google.com/" }, "questions_for_annotation": [ "Does the agent search for a round trip flight from Phoenix to Miami for the provided date?", @@ -7138,7 +7138,7 @@ "passengers": "1 Adult", "departure_date": "@eval:(now() + timedelta(days=95)).strftime('%B %d %Y')", "duration": "2 months", - "website": "https://www.google.com/travel/flights/" + "website": "https://flights.google.com/" }, "questions_for_annotation": [ "Does the agent search for a one-way flight from Dublin to Athens Greece for 1 adult for the provided date?", @@ -7160,7 +7160,7 @@ "departure_city": "Pune", "destination_city": "New York", "date": "@eval:(now() + 
timedelta(days=10)).strftime('%B %d %Y')", - "website": "https://www.google.com/travel/flights/" + "website": "https://flights.google.com/" }, "questions_for_annotation": [ "Does the agent search for a one-way economy flight from Pune to New York for the provided date?", @@ -7181,7 +7181,7 @@ "destination_city": "Tokyo", "departure_date": "@eval:(now() + timedelta(days=123)).strftime('%B %d %Y')", "return_date": "@eval:(now() + timedelta(days=184)).strftime('%B %d %Y')", - "website": "https://www.google.com/travel/flights/" + "website": "https://flights.google.com/" }, "questions_for_annotation": [ "Does the agent search for a round trip flight from New York to Tokyo for the provided dates?", @@ -7203,7 +7203,7 @@ "destination_city": "Tokyo", "departure_date": "@eval:(now() + timedelta(days=50)).strftime('%B %d %Y')", "return_date": "@eval:(now() + timedelta(days=64)).strftime('%B %d %Y')", - "website": "https://www.google.com/travel/flights/" + "website": "https://flights.google.com/" }, "questions_for_annotation": [ "Does the agent search for a round trip flight from New York to Tokyo for the provided dates?", @@ -7224,7 +7224,7 @@ "destination_city": "London", "departure_date": "@eval:(now() + timedelta(days=12)).strftime('%B %d %Y')", "return_date": "@eval:(now() + timedelta(days=19)).strftime('%B %d %Y')", - "website": "https://www.google.com/travel/flights/" + "website": "https://flights.google.com/" }, "questions_for_annotation": [ "Does the agent search for a round trip flight from New York to London for the provided dates?", @@ -7245,7 +7245,7 @@ "destination_airport": "Tokyo Narita Airport", "departure_date": "@eval:(now() + timedelta(days=22)).strftime('%B %d %Y')", "return_date": "@eval:(now() + timedelta(days=36)).strftime('%B %d %Y')", - "website": "https://www.google.com/travel/flights/" + "website": "https://flights.google.com/" }, "questions_for_annotation": [ "Does the agent search for a round trip flight from New York to Tokyo Narita Airport for the provided dates?", @@ -7266,7 +7266,7 @@ "departure_city": "New York", "destination_city": "Tokyo", "departure_date": "@eval:(now() + timedelta(days=100)).strftime('%B %d %Y')", - "website": "https://www.google.com/travel/flights/" + "website": "https://flights.google.com/" }, "questions_for_annotation": [ "Does the agent search for a one-way flight from New York to Tokyo for the provided date?", @@ -7290,7 +7290,7 @@ "return_date": "@eval:(now() + timedelta(days=44)).strftime('%B %d %Y')", "passenger_count": "one adult", "priority": "shortest travel time", - "website": "https://www.google.com/travel/flights/" + "website": "https://flights.google.com/" }, "questions_for_annotation": [ "Does the agent search for a round trip flight for 1 adult passenger from New York to Tokyo for the provided date?", @@ -7312,7 +7312,7 @@ "departure_date": "@eval:(now() + timedelta(days=18)).strftime('%B %d %Y')", "return_date": "@eval:(now() + timedelta(days=25)).strftime('%B %d %Y')", "preference": "shortest total travel time", - "website": "https://www.google.com/travel/flights/" + "website": "https://flights.google.com/" }, "questions_for_annotation": [ "Does the agent search for a round trip flight from San Francisco to Berlin for the provided date?", @@ -7334,7 +7334,7 @@ "destination_city": "Sydney", "passenger_type": "adult", "departure_date": "@eval:(now() + timedelta(days=31)).strftime('%B %d %Y')", - "website": "https://www.google.com/travel/flights/" + "website": "https://flights.google.com/" }, "questions_for_annotation": [ "Does 
the agent search for a one-way flight from Tokyo to Sydney for 1 adult for the provided date?", @@ -7356,7 +7356,7 @@ "destination_city": "Los Angeles", "departure_date": "@eval:(now() + timedelta(days=16)).strftime('%B %d %Y')", "return_date": "@eval:(now() + timedelta(days=23)).strftime('%B %d %Y')", - "website": "https://www.google.com/travel/flights/" + "website": "https://flights.google.com/" }, "questions_for_annotation": [ "Does the agent search for a round trip flight from Rio de Janeiro to Los Angeles for the provided date?", @@ -7378,7 +7378,7 @@ "destination_city": "Vancouver", "departure_date": "@eval:(now() + timedelta(days=27)).strftime('%B %d %Y')", "stops": "1-stop", - "website": "https://www.google.com/travel/flights/" + "website": "https://flights.google.com/" }, "questions_for_annotation": [ "Does the agent search for a one-way flight from Mumbai to Vancouver for the provided date?", @@ -7400,7 +7400,7 @@ "arrival_city": "Amsterdam", "date": "@eval:(now() + timedelta(days=54)).strftime('%B %d %Y')", "duration": "shortest duration", - "website": "https://www.google.com/travel/flights/" + "website": "https://flights.google.com/" }, "questions_for_annotation": [ "Does the agent search for a one-way flight from Buenos Aires to Amsterdam for the provided date?", @@ -7424,7 +7424,7 @@ "departure_date": "@eval:(now() + timedelta(days=28)).strftime('%B %d %Y')", "return_date": "@eval:(now() + timedelta(days=30)).strftime('%B %d %Y')", "budget": "$1000", - "website": "https://www.google.com/travel/flights/" + "website": "https://flights.google.com/" }, "questions_for_annotation": [ "Does the agent search for a round trip flight from Bangkok to Madrid for the provided date?", @@ -7446,7 +7446,7 @@ "destination_city": "Toronto", "departure_date": "@eval:(now() + timedelta(days=61)).strftime('%B %d %Y')", "passenger_count": "one adult", - "website": "https://www.google.com/travel/flights/" + "website": "https://flights.google.com/" }, "questions_for_annotation": [ "Does the agent search for a one-way trip flight from Johannesburg to Toronto for the provided date for 1 adult?", @@ -7469,7 +7469,7 @@ "departure_date": "@eval:(now() + timedelta(days=82)).strftime('%B %d %Y')", "return_date": "@eval:(now() + timedelta(days=85)).strftime('%B %d %Y')", "stops": "one", - "website": "https://www.google.com/travel/flights/" + "website": "https://flights.google.com/" }, "questions_for_annotation": [ "Does the agent search for a round trip flight from Seattle to Paris for the provided date?", @@ -7491,7 +7491,7 @@ "destination_city": "Frankfurt", "departure_date": "@eval:(now() + timedelta(days=61)).strftime('%B %d %Y')", "return_date": "@eval:(now() + timedelta(days=71)).strftime('%B %d %Y')", - "website": "https://www.google.com/travel/flights/" + "website": "https://flights.google.com/" }, "questions_for_annotation": [ "Does the agent search for a round trip flight from Mexico City to Frankfurt for the provided date?", @@ -7511,7 +7511,7 @@ "departure_city": "Cape Town", "destination_city": "Singapore", "departure_date": "@eval:(now() + timedelta(days=37)).strftime('%B %d %Y')", - "website": "https://www.google.com/travel/flights/" + "website": "https://flights.google.com/" }, "questions_for_annotation": [ "Does the agent search for a one-way trip flight from Cape Town to Singapore for the provided date?", @@ -7533,7 +7533,7 @@ "departure": "Auckland", "destination": "Honolulu", "date": "@eval:(now() + timedelta(days=55)).strftime('%B %d %Y')", - "website": 
"https://www.google.com/travel/flights/" + "website": "https://flights.google.com/" }, "questions_for_annotation": [ "Does the agent search for a one-way flight from Auckland to Honolulu for the provided date?", @@ -7556,7 +7556,7 @@ "destination_city": "Toronto", "departure_date": "@eval:(now() + timedelta(days=73)).strftime('%B %d %Y')", "return_date": "@eval:(now() + timedelta(days=133)).strftime('%B %d %Y')", - "website": "https://www.google.com/travel/flights/" + "website": "https://flights.google.com/" }, "questions_for_annotation": [ "Does the agent search for a round trip flight from Stockholm to Toronto for the provided date?", @@ -7578,7 +7578,7 @@ "departure": "Shanghai", "destination": "Vancouver", "date": "@eval:(now() + timedelta(days=106)).strftime('%B %d %Y')", - "website": "https://www.google.com/travel/flights/" + "website": "https://flights.google.com/" }, "questions_for_annotation": [ "Does the agent search for a one-way flight from Shanghai to Vancouver for the provided date?", @@ -7600,7 +7600,7 @@ "destination": "Singapore", "trip_type": "one-way", "date": "@eval:(now() + timedelta(days=58)).strftime('%B %d %Y')", - "website": "https://www.google.com/travel/flights/" + "website": "https://flights.google.com/" }, "questions_for_annotation": [ "Does the agent search for a one-way flight from Lisbon to Singapore for the provided date?", @@ -7622,7 +7622,7 @@ "departure_city": "Cairo", "destination_city": "Montreal", "date": "@eval:(now() + timedelta(days=34)).strftime('%B %d %Y')", - "website": "https://www.google.com/travel/flights/" + "website": "https://flights.google.com/" }, "questions_for_annotation": [ "Does the agent search for a one-way flight from Cairo to Montreal for the provided date?", @@ -7646,7 +7646,7 @@ "departure_date": "@eval:(now() + timedelta(days=116)).strftime('%B %d %Y')", "return_date": "@eval:(now() + timedelta(days=183)).strftime('%B %d %Y')", "price": "$1000", - "website": "https://www.google.com/travel/flights/" + "website": "https://flights.google.com/" }, "questions_for_annotation": [ "Does the agent search for a round trip flight from Helsinki to New Delhi for the provided date?", @@ -7667,7 +7667,7 @@ "destination_city": "Beijing", "departure_date": "@eval:(now() + timedelta(days=43)).strftime('%B %d %Y')", "return_date": "@eval:(now() + timedelta(days=49)).strftime('%B %d %Y')", - "website": "https://www.google.com/travel/flights/" + "website": "https://flights.google.com/" }, "questions_for_annotation": [ "Does the agent search for a round trip flight from Buenos Aires to Beijing for the provided date?", @@ -7689,7 +7689,7 @@ "date": "@eval:(now() + timedelta(days=5)).strftime('%B %d %Y')", "city1": "Tokyo", "region": "Hokkaido", - "website": "https://www.google.com/travel/flights/" + "website": "https://flights.google.com/" }, "questions_for_annotation": [ "Does the agent search for a one-way flight from Prague to Tokyo for the provided date?", @@ -7708,7 +7708,7 @@ "intent_template": "Explore destinations from {{departure}} on the homepage of {{website}}, look at destinations on a map, and recommend some famous places to travel that are within a reasonable distance and price.", "instantiation_dict": { "departure": "Seattle", - "website": "https://www.google.com/travel/flights/" + "website": "https://flights.google.com/" }, "questions_for_annotation": [ "Does the agent navigate to the Explore option on Google Flights? 
(https://www.google.com/travel/explore)", @@ -7729,7 +7729,7 @@ "destination": "Glacier National Park", "date": "@eval:(now() + timedelta(days=25)).strftime('%B %d %Y')", "stops": "1", - "website": "https://www.google.com/travel/flights/" + "website": "https://flights.google.com/" }, "questions_for_annotation": [ "Does the agent search for a one-way flight from Hong Kong to Glacier National Park for the provided date?", @@ -7750,7 +7750,7 @@ "number": "5", "rating": "4.8", "location": "Seattle, WA", - "website": "https://www.google.com/maps/" + "website": "https://maps.google.com/" }, "questions_for_annotation": [ "Does the agent's answer contain details of 5 beauty saloons in Seattle, WA?", @@ -7770,7 +7770,7 @@ "street1": "main street", "street2": "Amherst street", "city": "Altavista", - "website": "https://www.google.com/maps/" + "website": "https://maps.google.com/" }, "questions_for_annotation": [ "Does the agent's answer correctly state a bus stop nearest to the intersection of main street and Amherst street in Altavista?(Answer: Amherst and 7th or Main Street Middle)", @@ -7788,7 +7788,7 @@ "instantiation_dict": { "store": "Apple Stores", "zip_code": "90028", - "website": "https://www.google.com/maps/" + "website": "https://maps.google.com/" }, "questions_for_annotation": [ "Does the agent's answer state an apple store near the zip code 90028? (Example answers: Apple The Grove, Apple Beverly Center)", @@ -7807,7 +7807,7 @@ "start_location": "Central Park Zoo", "end_location": "Broadway Theater", "city": "New York", - "website": "https://www.google.com/maps/" + "website": "https://maps.google.com/" }, "questions_for_annotation": [ "Does the agent correctly use google map to find walking time between Central Park Zoo and Broadway Theater?", @@ -7825,7 +7825,7 @@ "instantiation_dict": { "departure": "Boston Logan Airport", "destination": "North Station", - "website": "https://www.google.com/maps/" + "website": "https://maps.google.com/" }, "questions_for_annotation": [ "Does the agent correctly use google map to search how to travel between Boston Logan Airport and North Station?", @@ -7844,7 +7844,7 @@ "type": "parking garage", "location": "Thalia Hall in Chicago", "hours": "24 hours", - "website": "https://www.google.com/maps/" + "website": "https://maps.google.com/" }, "questions_for_annotation": [ "Does the agent perform a search for parking garage near Thalia Hall in Chicago?", @@ -7864,7 +7864,7 @@ "store": "Uniqlo", "city": "Greater Chicago Metropolitan Area", "state": "IL", - "website": "https://www.google.com/maps/" + "website": "https://maps.google.com/" }, "questions_for_annotation": [ "Does the agent perform a search for Uniqlo stores in Chicago?", @@ -7883,7 +7883,7 @@ "instantiation_dict": { "item": "bus stops", "location": "Alanson, MI", - "website": "https://www.google.com/maps/" + "website": "https://maps.google.com/" }, "questions_for_annotation": [ "Does the agent perform a search for bus stops in Alanson, MI?", @@ -7902,7 +7902,7 @@ "instantiation_dict": { "distance": "2", "zip_code": "90028", - "website": "https://www.google.com/maps/" + "website": "https://maps.google.com/" }, "questions_for_annotation": [ "Does the agent perform a search for climbing places near 90028?", @@ -7921,7 +7921,7 @@ "instantiation_dict": { "item": "art gallery", "location": "Los Angeles Hindu Temple", - "website": "https://www.google.com/maps/" + "website": "https://maps.google.com/" }, "questions_for_annotation": [ "Does the agent's answer correctly provide details of art gallery near 
Los Angeles Hindu Temple? (Answer: Honor Fraser Gallery or Walter Maciel Gallery, both within 100m)", @@ -7940,7 +7940,7 @@ "state": "California", "park_name": "Castle Mountains National Monument", "information_type": "Basic Information", - "website": "https://www.google.com/maps/" + "website": "https://maps.google.com/" }, "questions_for_annotation": [ "Does the agent's answer correctly provide basic information about the Castle Mountains National Monument in California?", @@ -7959,7 +7959,7 @@ "location": "Washington", "products": "kids' and maternity products", "facility": "parking lot", - "website": "https://www.google.com/maps/" + "website": "https://maps.google.com/" }, "questions_for_annotation": [ "Does the agent's answer correctly provide information about a kids' and maternity product store in Washington?", @@ -7978,7 +7978,7 @@ "instantiation_dict": { "location": "44012 zip code", "sort_by": "highest rating", - "website": "https://www.google.com/maps/" + "website": "https://maps.google.com/" }, "questions_for_annotation": [ "Does the agent's answer correctly state 5 places in 44012 that serve burger?", @@ -7997,7 +7997,7 @@ "instantiation_dict": { "location": "Gloucester", "destination": "North Plymouth", - "website": "https://www.google.com/maps/" + "website": "https://maps.google.com/" }, "questions_for_annotation": [ "Does the agent's answer correctly state a parking lot in Gloucester?", @@ -8017,7 +8017,7 @@ "instantiation_dict": { "item": "exclusive motorcycle parking", "location": "Radio City Music Hall", - "website": "https://www.google.com/maps/" + "website": "https://maps.google.com/" }, "questions_for_annotation": [ "Does the agent's answer correctly state a parking lot that is exclusively for Motorcycles and near Radio City Music Hall? (Example answer: Rising Wolf Garage)", @@ -8035,7 +8035,7 @@ "instantiation_dict": { "item": "EV charging supported parking", "location": "Smithsonian American Art Museum", - "website": "https://www.google.com/maps/" + "website": "https://maps.google.com/" }, "questions_for_annotation": [ "Does the agent's answer provide details of a EV charging parking lot near Smithsonian American Art Museum in Washington?", @@ -8055,7 +8055,7 @@ "instantiation_dict": { "service": "locksmiths", "location": "Texas City", - "website": "https://www.google.com/maps/" + "website": "https://maps.google.com/" }, "questions_for_annotation": [ "Does the agent's answer provide info about a locksmiths in Texas city that is open at 10.00AM but not open 24hrs?", @@ -8073,7 +8073,7 @@ "instantiation_dict": { "start_city": "Chicago", "end_city": "Los Angeles", - "website": "https://www.google.com/maps/" + "website": "https://maps.google.com/" }, "questions_for_annotation": [ "Does the agent search for a driving route from Chicago to Los Angeles on Google Maps?", @@ -8092,7 +8092,7 @@ "location": "Pittsburgh Airport", "hotel_brand": "Hilton", "place": "supermarket", - "website": "https://www.google.com/maps/" + "website": "https://maps.google.com/" }, "questions_for_annotation": [ "Does the agent's answer correctly provide details of the hilton hotel nearest to Pittsburgh Airport? (Answer:Hilton Garden Inn Pittsburgh Airport)", @@ -8111,7 +8111,7 @@ "instantiation_dict": { "item": "Tesla Destination Charger", "location": "Smithsonian National Air and Space Museum", - "website": "https://www.google.com/maps/" + "website": "https://maps.google.com/" }, "questions_for_annotation": [ "Does the agent's answer correctly find a Tesla destination Charger near the museum? 
(Answer:Tesla Destination Charger, 1330 Maryland Ave SW, Washington, DC 20024, 0.8miles)", @@ -8130,7 +8130,7 @@ "street1": "Elm Street", "street2": "Oak Street", "location": "Amesbury, Massachusetts", - "website": "https://www.google.com/maps/" + "website": "https://maps.google.com/" }, "questions_for_annotation": [ "Does the agent correctly provide the nearest bus stop at the intersection of Elm Street and Oak Street? (Answer:Elm Street & Oak Street, 18 Bay St, Amesbury, MA 01913)", @@ -8149,7 +8149,7 @@ "start_location": "The Metropolitan Museum of Art", "end_location": "Times Square", "city": "New York", - "website": "https://www.google.com/maps/" + "website": "https://maps.google.com/" }, "questions_for_annotation": [ "Does the agent's answer state the walking route FROM The Metropolitan Museum of Art To Times Square? (Note: Metropolitan Museum of Art MUST be the starting point, and not destination since travel times can differ based on direction due to uphills/downhills) Example answer: around 42 min (1.9 miles) via 7th Ave)", @@ -8167,7 +8167,7 @@ "instantiation_dict": { "departure": "San Francisco International Airport", "destination": "Union Square", - "website": "https://www.google.com/maps/" + "website": "https://maps.google.com/" }, "questions_for_annotation": [ "Does the agent search for driving options from San Francisco International Airport to Union Square on Google maps?", @@ -8186,7 +8186,7 @@ "facility": "parking facility", "location": "Fox Theater in Detroit", "time": "night", - "website": "https://www.google.com/maps/" + "website": "https://maps.google.com/" }, "questions_for_annotation": [ "Does the agent find a parking facility near Fox Theater in Detroit that is not open 24hrs?", @@ -8205,7 +8205,7 @@ "store": "Target", "location": "Atlanta, GA", "platform": "map", - "website": "https://www.google.com/maps/" + "website": "https://maps.google.com/" }, "questions_for_annotation": [ "Does the agent search for Target stores in Atlanta, GA?", @@ -8225,7 +8225,7 @@ "item": "bus stops", "location": "Ypsilanti, MI", "number": "three", - "website": "https://www.google.com/maps/" + "website": "https://maps.google.com/" }, "questions_for_annotation": [ "Does the agent search for bus stops in Ypsilanti, MI?", @@ -8246,7 +8246,7 @@ "landmark": "Brooklyn Bridge", "hours": "24 hours", "aspect": "user comments", - "website": "https://www.google.com/maps/" + "website": "https://maps.google.com/" }, "questions_for_annotation": [ "Does the agent successfully search for parking lots near Brooklyn Bridge?", @@ -8264,7 +8264,7 @@ "intent_template": "First search {{location}} on {{website}}, and then find the way to share the map. 
Return the generated sharing link in your response.", "instantiation_dict": { "location": "New York's Central Park Zoo", - "website": "https://www.google.com/maps/" + "website": "https://maps.google.com/" }, "questions_for_annotation": [ "Was New York's Central Park Zoo successfully located on Google Map?", @@ -8285,7 +8285,7 @@ "location": "Denver International Airport", "aspect1": "Accessibility", "aspect2": "Amenities", - "website": "https://www.google.com/maps/" + "website": "https://maps.google.com/" }, "questions_for_annotation": [ "Does the agent's answer correctly locate Denver International Airport in google maps ?", @@ -8306,7 +8306,7 @@ "location": "Texas", "reserve_name": "Big Bend National Park", "information_type": "Basic Information", - "website": "https://www.google.com/maps/" + "website": "https://maps.google.com/" }, "questions_for_annotation": [ "Does the agent correctly locate Big Bend National Park on Google Maps?", @@ -8326,7 +8326,7 @@ "food": "pizza", "zip_code": "30309", "criteria": "ratings", - "website": "https://www.google.com/maps/" + "website": "https://maps.google.com/" }, "questions_for_annotation": [ "Does the agent search for restaurants near zip code 30309 in google maps?", @@ -8345,7 +8345,7 @@ "instantiation_dict": { "location": " Salem, Massachusetts", "destination": "Marblehead", - "website": "https://www.google.com/maps/" + "website": "https://maps.google.com/" }, "questions_for_annotation": [ "Does the agent successfully find a parking area in Salem, Massachusetts?", @@ -8365,7 +8365,7 @@ "instantiation_dict": { "item": "bicycle parking", "location": "the Empire State Building", - "website": "https://www.google.com/maps/" + "website": "https://maps.google.com/" }, "questions_for_annotation": [ "Does the agent successfully search for a bicycle parking spot near Empire State Building?", @@ -8383,7 +8383,7 @@ "instantiation_dict": { "start_location": "Miami", "end_location": "New Orleans", - "website": "https://www.google.com/maps/" + "website": "https://maps.google.com/" }, "questions_for_annotation": [ "Does the agent successfully search for a driving route from Miami to New Orleans?", @@ -8404,7 +8404,7 @@ "dish": "'Boston lobster'", "rating": "4.6", "review_type": "one-star", - "website": "https://www.google.com/maps/" + "website": "https://maps.google.com/" }, "questions_for_annotation": [ "Does the agent successfully find a restaurant in Boston that serves Boston lobster?", @@ -8672,15 +8672,15 @@ "domain": "Search Engine" }, { - "intent_template": "Using {{website}}, Navigate to the English wikipedia page on {{topic}}, What is the 32nd reference used in the article. Provide all information available in the reference, including a link to the source if available.", + "intent_template": "Using {{website}}, Navigate to the English wikipedia page on {{topic}}, What is the reference listed as 32nd in the references section? Provide all information available in the reference, including a link to the source if available.", "instantiation_dict": { "topic": "Mahatma Gandhi", "website": "any search engine" }, "questions_for_annotation": [ "Does the agent successfully navigate to English Wikipedia page on Mahatma Gandhi?", - "Does the agent's answer correctly provide information on the 32nd reference used in the article?", - "Does the agent answer provide all available information in the reference (including link if available)?" 
+ "Does the agent's answer correctly provide information about the article/book that is listed as 32nd in the references section of the page?", + "Does the agent answer provide all available information in the reference (including link if available in wikipedia)?" ], "reference_answers": "Fischer, Louis (1982). Gandhi, his life and message for the world. New American Library. p. 96. ISBN 978-0-451-62142-9. ", "reference_answer_type": "possible", From c108eff40542fa2eeea527e2d91b2f8d6d731687 Mon Sep 17 00:00:00 2001 From: DeepakAkkil Date: Thu, 3 Jul 2025 14:42:20 +0300 Subject: [PATCH 2/8] Updated golden answers --- tasks/webvoyager_template.json | 142 ++++++++++++++++----------------- 1 file changed, 71 insertions(+), 71 deletions(-) diff --git a/tasks/webvoyager_template.json b/tasks/webvoyager_template.json index c504e09..e2dc410 100644 --- a/tasks/webvoyager_template.json +++ b/tasks/webvoyager_template.json @@ -251,8 +251,8 @@ "Manually check if there is a chocolate chip cookie recipe on allrecipe with perfect 5-star rating and takes a total time of less than 60 minutes. (e.g. Chocolate Chip Cookie Cups https://www.allrecipes.com/recipe/273824/chocolate-chip-cookie-cups/)", "if recipe exists that meets the conditions, does agent's answer correctly provide the number of reviews or ratings and the main ingredient. If recipe does not exist, does agent's answer state that it could not find such a recipe?" ], - "reference_answers": "'Chocolate Chip Cookie Cups', 5.0-star, 3 reviews, total time 45 mins, ", - "reference_answer_type": "possible", + "reference_answers": "There are two possible answers: 1. 'Chocolate Chip Cookie Cups', 5.0-star, 3 reviews, total time 45 mins, Ingredients: all-purpose flour, baking soda, salt, butter, white sugar, brown sugar, vanilla extract, eggs, chocolate chips. 2. 'Easy Chocolate Chip Cookie Cake' 5.0-star, 3 reviews, total time 50 mins, Ingredients: flour, baking soda, butter, white sugar, vanilla extract, eggs, chocolate chips cookies, milk, ground cinnamon.", + "reference_answer_type": "golden", "id": 12, "task": "", "status": "active", @@ -1836,7 +1836,7 @@ "Does the agent's answer correctly state the slogan for Macbook Pro (Hello, Apple intelligence)?", "Was the task completed using the website mentioned in the prompt: {{website}}" ], - "reference_answers": "If you can dream it, Mac can do it; Hello, Apple Intelligence", + "reference_answers": "Slogan for Mac: If you can dream it, Mac can do it; Slogan for Macbook Pro: Hello, Apple Intelligence", "reference_answer_type": "golden", "id": 85, "task": "", @@ -1996,7 +1996,7 @@ "Does the agent's answer correctly state that Mac Mini can be configured with more than 16 core GPUs", "Was the task completed using the website mentioned in the prompt: {{website}}" ], - "reference_answers": "Yes. Mac mini Apple M2 Pro chip, Configurable to: 19-core GPU", + "reference_answers": "Yes. Mac mini Apple M2 Pro chip, Configurable to: 20-core GPU", "reference_answer_type": "golden", "id": 94, "task": "", @@ -2374,7 +2374,7 @@ "Does agent's answer state the number of styles of arxiv logo shirt available in their Bonfire store? 
", "Was the task completed using the website mentioned in the prompt: {{website}}" ], - "reference_answers": "3", + "reference_answers": "3 variants of arXiv Logo Shirt are available in the Arxiv store.", "reference_answer_type": "golden", "id": 114, "task": "", @@ -2409,7 +2409,7 @@ "Does the agent correctly state the number of articles with SimCSE keyword in any field that was originally announced in October 2023 (Hint: 4 papers)?", "Was the task completed using the website mentioned in the prompt: {{website}}" ], - "reference_answers": "3", + "reference_answers": "There are 4 papers with SinCSE keyword in the paper originally announced in Oct 2023. They are 1. Improving Contrastive Learning of Sentence Embeddings with Focal-InfoNCE, 2. Large Language Models can Contrastively Refine their Generation for Better Sentence Representation Learning, 3. Non-contrastive sentence representations via self-supervision, 4. Japanese SimCSE Technical Report", "reference_answer_type": "golden", "id": 116, "task": "", @@ -2465,7 +2465,7 @@ "Does the agent's answer correctly which one of the formula is for loss function? (Answer: Second)", "Was the task completed using the website mentioned in the prompt: {{website}}" ], - "reference_answers": "2 formulas, the second one is loss function", + "reference_answers": "There are five formulas, the second one is loss function", "reference_answer_type": "golden", "id": 119, "task": "", @@ -2483,8 +2483,8 @@ "Does the agent correctly use the link from Arxiv to navigate to the university's website?", "Does the agent's answer correctly state the number of undergraduate students currently at Cornell sourced from the University website? (info available in the About page)" ], - "reference_answers": "Cornell University, 16071 UNDERGRADUATE STUDENTS", - "reference_answer_type": "possible", + "reference_answers": "Cornell University, 16,128 UNDERGRADUATE STUDENTS as of 3 July 2025. This number is slightly subject to change. ", + "reference_answer_type": "golden", "id": 120, "task": "", "status": "active", @@ -2598,7 +2598,7 @@ "Does the agent correctly navigate to {{section}} from the Arxiv info section without using external search engines?", "Does the agent's answer correctly state the different category of merchanidises available?" ], - "reference_answers": "5, arXiv Logo Shirt, arXiv Logo Mug, arXiv is Open Science, arXiv Morning Mug, arXiv Forever", + "reference_answers": "There are 6 categories of mechandises. They are arxiv baseball cap, arXiv Logo Shirt, arXiv Logo Mug, arXiv is Open Science, arXiv Morning Mug, arXiv Forever", "reference_answer_type": "golden", "id": 126, "task": "", @@ -2636,7 +2636,7 @@ "Does the agent's answer include their abbreviations? (Answer: Econometrics (econ.EM), General Economics (econ.GN), and Theoretical Economics (econ.TH))", "Was the task completed using the website mentioned in the prompt: {{website}}" ], - "reference_answers": "Econometrics (econ.EM), General Economics (econ.GN), and Theoretical Economics (econ.TH)", + "reference_answers": "There are 3 sub-categories for Economics: Econometrics (econ.EM), General Economics (econ.GN), and Theoretical Economics (econ.TH)", "reference_answer_type": "golden", "id": 128, "task": "", @@ -2853,7 +2853,7 @@ "Does the agent add XL size of arXiv Forever short sleeve to cart (if it is available for sale) from arxiv non-profit store?" 
], "reference_answers": "QR code image, Action: add to chart", - "reference_answer_type": "golden", + "reference_answer_type": "possible", "id": 139, "task": "", "status": "active", @@ -2984,7 +2984,7 @@ "Does the article answer correctly answer the question on human activities are causing climate change? (Answer: Widespread use of fossil fuels - coal, oil and gas - in homes, factories and transport)" ], "reference_answers": "This recent climate change has been caused by human activity, mainly the widespread use of fossil fuels - coal, oil and gas - in homes, factories and transport.", - "reference_answer_type": "golden", + "reference_answer_type": "possible", "id": 146, "task": "", "status": "active", @@ -4390,7 +4390,7 @@ "Does the agent's answer correctly state the translation of 'sustainability' in French?", "Was the task completed using the website mentioned in the prompt: {{website}}" ], - "reference_answers": "可持续性; durabilité , viabilité", + "reference_answers": "Translation of the word sustainability to Chinese, and French: 可持续性; durabilité/viabilité", "reference_answer_type": "golden", "id": 214, "task": "", @@ -4520,7 +4520,7 @@ "Does the agent's answer correctly state the number of meanings of the word 'unblemished'? (Answer:2)", "Was the task completed using the website mentioned in the prompt: {{website}}" ], - "reference_answers": "2", + "reference_answers": "2 Different meanings of the word 'unblemished': 1. An unblemished reputation, character; 2. (of skin) without any marks that spoil its appearance", "reference_answer_type": "golden", "id": 221, "task": "", @@ -4538,7 +4538,7 @@ "Does the agent's answer correctly state atleast two synonyms for the phrase?", "Was the task completed using the website mentioned in the prompt: {{website}}" ], - "reference_answers": "behaves themselves; be on their best behaviour", + "reference_answers": "The agent should state atleast two synonyms for the phrase to behave well: E.g. behaves themselves; be on their best behaviour", "reference_answer_type": "golden", "id": 222, "task": "", @@ -4665,7 +4665,7 @@ "Does the agent's answer provide the correct translation of the word 'nostalgia' into Chinese?", "Was the task completed using the website mentioned in the prompt: {{website}}" ], - "reference_answers": "怀旧", + "reference_answers": "Chinese translation of the word Nostalgia: 怀旧", "reference_answer_type": "golden", "id": 229, "task": "", @@ -4948,7 +4948,7 @@ "Does the agent successfully change the UI language from English to Deutsch in the Cambridge Dictionary Website?" 
], "reference_answers": "Action: Click English (UK), change language to: Deutsch", - "reference_answer_type": "golden", + "reference_answer_type": "possible", "id": 245, "task": "", "status": "active", @@ -5225,7 +5225,7 @@ "Does the agent's answer correctly state other courses taught by the instructor?", "Was the task completed using the website mentioned in the prompt: {{website}}" ], - "reference_answers": "Xi Yang; Introduction to Finance: The Role of Financial Markets", + "reference_answers": "Xi Yang; Another course they teach is: Introduction to Finance: The Role of Financial Markets", "reference_answer_type": "golden", "id": 259, "task": "", @@ -5603,7 +5603,7 @@ "Does the agent's answer correctly state three universities/companies from Australia are partners of Coursera?", "Was the task completed using the website mentioned in the prompt: {{website}}" ], - "reference_answers": "Macquarie University; The University of Melbourne; The University of Sydney; University of Western Australia; UNSW Sydney (The University of New South Wales)", + "reference_answers": "Coursera partners in Australia are: Macquarie University; The University of Melbourne; The University of Sydney; University of Western Australia; UNSW Sydney (The University of New South Wales)", "reference_answer_type": "golden", "id": 278, "task": "", @@ -5624,7 +5624,7 @@ "Does the agent's answer correctly state the name of each video in module 2? (Answer: Introduction; Space Debris; Mitigation; Measurements; Protection; Atmospheric Re-entry)", "Was the task completed using the website mentioned in the prompt: {{website}}" ], - "reference_answers": "6 videos; Introduction; Space Debris; Mitigation; Measurements; Protection; Atmospheric Re-entry", + "reference_answers": "Module 2 contains 6 videos; Introduction; Space Debris; Mitigation; Measurements; Protection; Atmospheric Re-entry", "reference_answer_type": "golden", "id": 279, "task": "", @@ -5842,7 +5842,7 @@ "Does the agent's answer correctly state how many of the NBA teams have 'New' in their name? (Answer: 2, New York Knicks; New Orleans Pelicans)", "Was the task completed using the website mentioned in the prompt: {{website}}" ], - "reference_answers": "30; New York Knicks; New Orleans Pelicans", + "reference_answers": "Total 30 teams are in NBA; There are two teams with 'New'in their name, they are New York Knicks; New Orleans Pelicans", "reference_answer_type": "golden", "id": 291, "task": "", @@ -5933,7 +5933,7 @@ "Does the agent's answer correctly state who is the player with the highest salary on the Boston Celtics roster for the 2024-25 season?", "Is the salary information accurate and sourced from ESPN (e.g. https://www.espn.com/nba/team/roster/_/name/bos/boston-celtics)?" ], - "reference_answers": "Jrue Holiday", + "reference_answers": "Jaylen Brown", "reference_answer_type": "golden", "id": 296, "task": "", @@ -6063,11 +6063,11 @@ "website": "https://www.espn.com/" }, "questions_for_annotation": [ - "Does the agent's answer correctly state the number of teams with 'Golden' in their name listed on ESPN? (Answer: 31)", + "Does the agent's answer correctly state the number of teams with 'Golden' in their name listed on ESPN? (Answer: 32)", "Does agent's answer correctly state number of teams with 'Golden' in their name that are in the NHL? 
(Answer: 1, Vegas Golden Knights)", "Was the task completed using the website mentioned in the prompt: {{website}}" ], - "reference_answers": "31 teams in search results, 1 team Vegas Golden Knights (NHL)", + "reference_answers": "32 teams in search results, 1 team in NHL: Vegas Golden Knights (NHL)", "reference_answer_type": "golden", "id": 303, "task": "", @@ -6086,7 +6086,7 @@ "Which MLB teams have 'City' in their name? (Answer: only team is Kansas City Royals)", "Was the task completed using the website mentioned in the prompt: {{website}}" ], - "reference_answers": "30 teams in search results, Kansas City Royals", + "reference_answers": "30 teams in total in MLB. Only one team has 'City' in its name: Kansas City Royals", "reference_answer_type": "golden", "id": 304, "task": "", @@ -6126,7 +6126,7 @@ "Does the agent's answer correctly state the name of the player in New York Yankees who plays as an 'infielder' and has the heaviest body weight? (e.g. from https://www.espn.com/mlb/team/roster/_/name/nyy/new-york-yankees)", "Was the task completed using the website mentioned in the prompt: {{website}}" ], - "reference_answers": "Carlos Rodon, 255 lbs", + "reference_answers": "Ben Rice, 228 lbs", "reference_answer_type": "golden", "id": 306, "task": "", @@ -6219,7 +6219,7 @@ "Does the agent's answer correctly state if there are other players on the Los Angeles Lakers with the same games played percentage as Anthony Davis? (Answer: One other player: D'Angelo Russell) ", "Was the task completed using the website mentioned in the prompt: {{website}}" ], - "reference_answers": "76/82 = 92.6%, D'Angelo Russell has same, https://www.espn.com/nba/team/stats/_/name/lal/los-angeles-lakers", + "reference_answers": "Games played % by Anthony Davis in 23-24 Regular Season: 76/82 = 92.6%, D'Angelo Russell has same, https://www.espn.com/nba/team/stats/_/name/lal/los-angeles-lakers", "reference_answer_type": "golden", "id": 311, "task": "", @@ -6248,7 +6248,7 @@ "domain": "ESPN" }, { - "intent_template": "Using the website {{website}}, find out which four teams the {{division}} contains in the {{league}}.", + "intent_template": "Using the website {{website}}, find out which four teams play in the {{division}} division in the {{league}}.", "instantiation_dict": { "division": "NFC North", "league": "NFL", @@ -6258,7 +6258,7 @@ "Does the agent's answer correctly state the four teams in NFC North in NFL? (Answer:Chicago Bears, Detroit Lions, Green Bay Packers, and Minnesota Vikings)", "Was the task completed using the website mentioned in the prompt: {{website}}" ], - "reference_answers": "Chicago Bears, Detroit Lions, Green Bay Packers, and Minnesota Vikings", + "reference_answers": "The 4 teams in the NFC North division are: Chicago Bears, Detroit Lions, Green Bay Packers, and Minnesota Vikings", "reference_answer_type": "golden", "id": 313, "task": "", @@ -6313,7 +6313,7 @@ "Was the task completed using the website mentioned in the prompt: {{website}}" ], "reference_answers": "resource-watch/resource-watch", - "reference_answer_type": "golden", + "reference_answer_type": "possible", "id": 316, "task": "", "status": "active", @@ -6431,7 +6431,7 @@ "Does the agent's answer correctly state the files that were changed in the most recent commit of the ALBERT repository?
(Answer: classifier_utils.py and squad_utils.py)", "Was the task completed using the website mentioned in the prompt: {{website}}" ], - "reference_answers": "classifier_utils.py and squad_utils.py", + "reference_answers": "classifier_utils.py and squad_utils.py are the two files that were changed in the most recent commit of the ALBERT repository on Apr 13, 2023.", "reference_answer_type": "golden", "id": 322, "task": "", @@ -6578,17 +6578,17 @@ { "intent_template": "Using the website {{website}}, identify and report the most popular (in terms of stars) {{type}} on {{platform}}", "instantiation_dict": { - "type": "open-source image processing tool", + "type": "open-source image-processing tool", "platform": "GitHub", "website": "https://github.com/" }, "questions_for_annotation": [ - "Does the agent search for repository for open-source image processing tool?", + "Does the agent search for repository for open-source image-processing tool?", "Does the identified repo have the highest stars in the search result?", "Does the agent's answer report some information about the repository including its name?", "Was the task completed using the website mentioned in the prompt: {{website}}" ], - "reference_answers": "OpenCV", + "reference_answers": "OpenCV: Open Source Computer Vision Library has the highest stars in the search result.", "reference_answer_type": "golden", "id": 330, "task": "", @@ -6761,7 +6761,7 @@ "Does the agent's answer correctly state the maximum number of private repositories allowed in the Pro Team? (Answer: Unlimited)", "Was the task completed using the website mentioned in the prompt: {{website}}" ], - "reference_answers": "Unlimited", + "reference_answers": "Both Free plan and Pro Team allow Unlimited private repositories.", "reference_answer_type": "golden", "id": 339, "task": "", @@ -6935,7 +6935,7 @@ "Does the agent's answer provide details of the most starred project from the search results?", "Was the task completed using the website mentioned in the prompt: {{website}}" ], - "reference_answers": "WerWolv/ImHex", + "reference_answers": "sherlock-project/sherlock", "reference_answer_type": "golden", "id": 348, "task": "", @@ -6969,7 +6969,7 @@ "Does the agent attempt to sign up for a new account on github using the email test123@gmail.com", "Does the agent's answer correctly state that the emailid 'test123@gmail.com' is already associated with an account on Github?" ], - "reference_answers": "Perform Action. email 'test123@gmail.com' already exists", + "reference_answers": "email 'test123@gmail.com' already exists", "reference_answer_type": "golden", "id": 350, "task": "", @@ -7776,7 +7776,7 @@ "Does the agent's answer correctly state a bus stop nearest to the intersection of main street and Amherst street in Altavista?(Answer: Amherst and 7th or Main Street Middle)", "Was the task completed using the website mentioned in the prompt: {{website}}" ], - "reference_answers": "'Amherst and 7th' or 'Main Street Middle'", + "reference_answers": "'Amherst and 7th' or 'Main Street Middle' both are correct answers", "reference_answer_type": "golden", "id": 387, "task": "", @@ -7871,7 +7871,7 @@ "Does the agent's answer correctly state all the stores? 
(Answer: 2 stores, UNIQLO State Street and Uniqlo Woodfield Mall)", "Was the task completed using the website mentioned in the prompt: {{website}}" ], - "reference_answers": "UNIQLO State Street", + "reference_answers": "2 stores, UNIQLO State Street and Uniqlo Woodfield Mall", "reference_answer_type": "golden", "id": 392, "task": "", @@ -7890,7 +7890,7 @@ "Does the agent's answer correctly provide details of the bus stops? (Answer: only 1 bus stop, Alanson, MI (EZ-Mart) Bus Stop)", "Was the task completed using the website mentioned in the prompt: {{website}}" ], - "reference_answers": "Alanson, MI (EZ-Mart) Bus Stop", + "reference_answers": "only 1 bus stop, Alanson, MI (EZ-Mart) Bus Stop", "reference_answer_type": "golden", "id": 393, "task": "", @@ -7909,7 +7909,7 @@ "Does the agent's answer correctly provide details of a climbing place within 2 miles from 90028? (Example Answer: Hollywood Boulders)", "Was the task completed using the website mentioned in the prompt: {{website}}" ], - "reference_answers": "Hollywood Boulders", + "reference_answers": "Hollywood Boulders is the only climbing place within 2 miles from 90028", "reference_answer_type": "golden", "id": 394, "task": "", @@ -7927,7 +7927,7 @@ "Does the agent's answer correctly provide details of art gallery near Los Angeles Hindu Temple? (Answer: Honor Fraser Gallery or Walter Maciel Gallery, both within 100m)", "Was the task completed using the website mentioned in the prompt: {{website}}" ], - "reference_answers": "'Honor Fraser Gallery' or 'Walter Maciel Gallery'.", + "reference_answers": "Honor Fraser Gallery or Walter Maciel Gallery, both within 100m.", "reference_answer_type": "golden", "id": 395, "task": "", @@ -8429,8 +8429,8 @@ "Does the agent's answer correctly state the latest press in the category of Retail?", "Does the agent's answer correctly state the latest press in the category of Prime Video?" ], - "reference_answers": "May 5, 2023", - "reference_answer_type": "golden", + "reference_answers": ", <date> for Retail; <title>, <date> for Prime Video", + "reference_answer_type": "possible", "id": 421, "task": "<will be generated based on instantion_dict by running the instantiation script>", "status": "active", @@ -8646,7 +8646,7 @@ "questions_for_annotation": [ "Does the agent's answer correctly state which year Tom Brady had most touchdowns? (Answer: 2007)" ], - "reference_answers": "2007", + "reference_answers": "Tom Brady had most touchdowns in 2007", "reference_answer_type": "golden", "id": 433, "task": "<will be generated based on instantion_dict by running the instantiation script>", @@ -8718,7 +8718,7 @@ "Does the agent's answer correctly state when and where the last soccer world cup was hosted? (Answer: Qatar, 2022)", "Does the agent's answer correctly state the winner of the last soccer world cup? (Answer: Argentina)" ], - "reference_answers": "Qatar; 2022; Argentina", + "reference_answers": "Qatar; 2022; Argentina won", "reference_answer_type": "golden", "id": 437, "task": "<will be generated based on instantion_dict by running the instantiation script>", @@ -8737,7 +8737,7 @@ "Does the agent's answer correctly state what are the first 7 bits of the SHA obtained from Bert's latest commit? (Answer: eedf571)", "Does the agent's answer correctly state changes were made in Bert's latest commit? 
(Answer: Readme changes that adds links to 24 smaller BERT Model)" ], - "reference_answers": "eedf571, Smaller BERT Models", + "reference_answers": "SHA for the last commit: eedf571, Readme changes that adds links to 24 smaller BERT Model", "reference_answer_type": "golden", "id": 438, "task": "<will be generated based on instantion_dict by running the instantiation script>", @@ -8773,7 +8773,7 @@ "Are the movies sorted by box office earning?" ], "reference_answers": "The Lion King (2019); Frozen II (2019); The Super Mario Bros. Movie (2023); Frozen (2013); Incredibles 2 (2018)", - "reference_answer_type": "golden", + "reference_answer_type": "possible", "id": 440, "task": "<will be generated based on instantion_dict by running the instantiation script>", "status": "active", @@ -8829,7 +8829,7 @@ "Does the agent navigate to Country Profile of Nepal? (Hint:https://data.un.org/en/iso/np.html)", "Does the agent correctly find the % of seats held by women as of Jan 1 2021?(Answer: 32.7%)" ], - "reference_answers": "32.7", + "reference_answers": "32.7%", "reference_answer_type": "golden", "id": 443, "task": "<will be generated based on instantion_dict by running the instantiation script>", @@ -8942,7 +8942,7 @@ "Does the agent's answer correctly state the review scores of the movie Inception in Imdb and Metacritic?" ], "reference_answers": "IMDb score 8.8, Metacritic score 74%.", - "reference_answer_type": "golden", + "reference_answer_type": "possible", "id": 449, "task": "<will be generated based on instantion_dict by running the instantiation script>", "status": "active", @@ -8990,9 +8990,9 @@ "website": "any search engine" }, "questions_for_annotation": [ - "Does the agent's answer state in which year Cristiano Ronaldo scored the most goals in a single club season? (Answer 2014–15 season, 61 goals) " + "Does the agent's answer state in which year Cristiano Ronaldo scored the most goals in a single club season? (Answer 2014-15 season, 61 goals) " ], - "reference_answers": "2014-15 season", + "reference_answers": "2014-15 season, 61 goals", "reference_answer_type": "golden", "id": 452, "task": "<will be generated based on instantion_dict by running the instantiation script>", @@ -9009,7 +9009,7 @@ "Does the agent's answer correctly state when was the last mens UEFA Champions League final held and who won?(Answer: UEFA final 2024 was held in London, between Borussia Dortmund and Real Madrid)", "Does the agent's answer state which team won the most recent UEFA Champions League final? 
(Answer: Real Madrid)" ], - "reference_answers": "Manchester City Football Club; June 10, 2023; Atatürk Olympic Stadium, Istanbul, Turkey", + "reference_answers": "UEFA final 2024 was held in London, between Borussia Dortmund and Real Madrid; Real Madrid won", "reference_answer_type": "possible", "id": 453, "task": "<will be generated based on instantion_dict by running the instantiation script>", @@ -9423,7 +9423,7 @@ "Does the agent's answer explicitly state the type of the parameter spaces_between_special_tokens?(Answer: Boolean)", "Does the agent's answer explicitly state the default value of the parameter spaces_between_special_tokens?(Answer: False)" ], - "reference_answers": "bool, defaults to False", + "reference_answers": "Boolean, defaults to False", "reference_answer_type": "golden", "id": 475, "task": "<will be generated based on instantion_dict by running the instantiation script>", @@ -9628,8 +9628,8 @@ "Does the agent correctly return the most relevant and downloaded translation model for (en-zh) from the search result? (Answer: Helsinki-NLP/opus-mt-en-zh)", "Does the agent's answer report the latest performance metrics and usage guidelines of the model?" ], - "reference_answers": "Helsinki-NLP/opus-mt-en-zh; testset, BLEU, chr-F: Tatoeba-test.eng.zho, 31.4, 0.268", - "reference_answer_type": "golden", + "reference_answers": "Helsinki-NLP/opus-mt-en-zh, Performance: Tatoeba-test.eng.zho (testset), 31.4 (BLEU), 0.268 (chr-F)", + "reference_answer_type": "possible", "id": 486, "task": "<will be generated based on instantion_dict by running the instantiation script>", "status": "active", @@ -9667,7 +9667,7 @@ "Does the agent's answer correctly state the default value of temperature used? (Answer: 1.0)", "Was the task completed using the website mentioned in the prompt: {{website}}" ], - "reference_answers": "\"temperature\": 1.0", + "reference_answers": "temperature: 1.0", "reference_answer_type": "golden", "id": 488, "task": "<will be generated based on instantion_dict by running the instantiation script>", @@ -9775,7 +9775,7 @@ "Does the agent correctly locate the documentation page of Transformers in Hugging Face?(https://huggingface.co/docs/transformers/index)", "Does the agent's answer correctly state how to add new tokens to the tokenizer component? (Answer: using the add_tokens method in tokenizer)" ], - "reference_answers": "use add_tokens method", + "reference_answers": "using the add_tokens method in tokenizer", "reference_answer_type": "golden", "id": 494, "task": "<will be generated based on instantion_dict by running the instantiation script>", @@ -9813,7 +9813,7 @@ "Does the agent successfully locate the docs on Text Embeddings Inference? (https://huggingface.co/docs/text-embeddings-inference/index)", "Does the agent's answer provide a summary of some of the strengths of Text Embeddings Inference? (Answer: Streamlined Deployment; Efficient Resource Utilization; Dynamic Batching, Optimised Inference, Production Ready etc)" ], - "reference_answers": "Streamlined Deployment; Efficient Resource Utilization; Dynamic Batching ...", + "reference_answers": "Streamlined Deployment; Efficient Resource Utilization; Dynamic Batching, Optimised Inference, Production Ready etc", "reference_answer_type": "golden", "id": 496, "task": "<will be generated based on instantion_dict by running the instantiation script>", @@ -9834,7 +9834,7 @@ "Does the agent open the model with highest number of downloads? 
(https://huggingface.co/openai/shap-e)", "Does the agent's answer correctly state if the model is being used in any hugging face Spaces? (Answer: Yes, used in 50+ spaces)" ], - "reference_answers": "openai/shap-e; there are Spaces like hysts/Shap-E ...", + "reference_answers": "openai/shap-e; there are Spaces like hysts/Shap-E. Overall used in 90+ spaces ...", "reference_answer_type": "golden", "id": 497, "task": "<will be generated based on instantion_dict by running the instantiation script>", @@ -9874,7 +9874,7 @@ "Does the agent's answer correctly state the total number of rows in the dataset? (Answer: 17584)", "Was the task completed using the website mentioned in the prompt: {{website}}" ], - "reference_answers": "content: Please provide a reasonable subgoal-based plan to solve the given task.\\nTask: What was the opening date of the museum dedicated to the war that, after it occurred, Boston became one of the wealthiest international ports?; Initial Environment Description: None.", + "reference_answers": "Format: Parquet; Number of rows: 17584", "reference_answer_type": "golden", "id": 499, "task": "<will be generated based on instantion_dict by running the instantiation script>", @@ -9891,7 +9891,7 @@ "Does the agent's answer correctly state the number of models uploaded by Google? (Correct Answer available: https://huggingface.co/google)", "Was the task completed using the website mentioned in the prompt: {{website}}" ], - "reference_answers": "990 and 53", + "reference_answers": "1005 and 56", "reference_answer_type": "possible", "id": 500, "task": "<will be generated based on instantion_dict by running the instantiation script>", @@ -9992,7 +9992,7 @@ "Does the agent's answer correctly state the final length after 6s? (Answer: 0.252m or 25.21cm or 9.925 inch)", "Was the task completed using the website mentioned in the prompt: {{website}}" ], - "reference_answers": "-73.26° from vertical; 0.252 m", + "reference_answers": "-73.26° from vertical; 0.252m or 25.21cm or 9.925 inch length after 6s", "reference_answer_type": "golden", "id": 505, "task": "<will be generated based on instantion_dict by running the instantiation script>", @@ -10064,8 +10064,8 @@ "website": "https://www.wolframalpha.com/" }, "questions_for_annotation": [ - "Does the agent's answer state the electrical resistivity of UNS A92024 at 20 degrees Celsius? (4.9×10^-6 Ω cm (ohm centimeters) (at 20 °C))", - "Does the agent's answer state the electrical resistivity of UNS G10800 at 20 degrees Celsius? (1.8×10^-5 Ω cm (ohm centimeters) (at 20 °C))", + "Does the agent's answer state the electrical resistivity of UNS A92024 at 20 degrees Celsius? (4.9x10^-6 Ω cm (ohm centimeters) (at 20 °C))", + "Does the agent's answer state the electrical resistivity of UNS G10800 at 20 degrees Celsius? 
(1.8x10^-5 Ω cm (ohm centimeters) (at 20 °C))", "Was the task completed using the website mentioned in the prompt: {{website}}" ], "reference_answers": "UNS A92024: 4.9×10^-6 Ω cm (ohm centimeters) (at 20 °C); UNS G10800: 1.8×10^-5 Ω cm (ohm centimeters)", @@ -10166,7 +10166,7 @@ "Does the agent's answer correctly state the average movie ticket price in Boise?", "Was the task completed using the website mentioned in the prompt: {{website}}" ], - "reference_answers": "Providence $13.81; Nashville $12.65; Boise $12.65", + "reference_answers": "Providence $14.37; Nashville $13.30; Boise $11.60 in the year 2023", "reference_answer_type": "golden", "id": 514, "task": "<will be generated based on instantion_dict by running the instantiation script>", @@ -10396,7 +10396,7 @@ "Does the agent's answer state all prime numbers between 1000 and 1200? (Answer: 28 numbers, some examples: 1009, 1013, 1031, 1039, 1051, 1069, 1129, 1153, 1187, 1193)", "Was the task completed using the website mentioned in the prompt: {{website}}" ], - "reference_answers": "1009, 1013, 1019, 1021, 1031, 1033, 1039, 1049, 1051, 1061, 1063, 1069, 1087, 1091, 1093, 1097, 1103, 1109, 1117, 1123, 1129, 1151, 1153, 1163, 1171, 1181, 1187, 1193.", + "reference_answers": "28 Numbers: 1009, 1013, 1019, 1021, 1031, 1033, 1039, 1049, 1051, 1061, 1063, 1069, 1087, 1091, 1093, 1097, 1103, 1109, 1117, 1123, 1129, 1151, 1153, 1163, 1171, 1181, 1187, 1193.", "reference_answer_type": "golden", "id": 526, "task": "<will be generated based on instantion_dict by running the instantiation script>", @@ -10453,7 +10453,7 @@ "Was the task completed using the website mentioned in the prompt: {{website}}" ], "reference_answers": "x^2(\\sin(\frac{2π}{15}) - 2) + 2xy \\cos(\frac{2π}{15}) + 4 = y^2(2 + \\sin(\frac{2π}{15}))", - "reference_answer_type": "golden", + "reference_answer_type": "possible", "id": 529, "task": "<will be generated based on instantion_dict by running the instantiation script>", "status": "active", @@ -10531,7 +10531,7 @@ "Does the agent's answer state how many of them have only 2 rows? 
(Answer: 12)", "Was the task completed using the website mentioned in the prompt: {{website}}" ], - "reference_answers": "35; 12", + "reference_answers": "(2-sided) combinations of polyominoes: 35; 12 of them have only 2 rows", "reference_answer_type": "golden", "id": 533, "task": "<will be generated based on instantion_dict by running the instantiation script>", @@ -10571,7 +10571,7 @@ "Does the agent's answer state the metabolic properties associated with the action?(Answer: energy expenditure = 2720 kJ (kilojoules), average energy expenditure per step = 1.1 kJ/step (kilojoules per step), fat burned = 0.042 kg (kilograms), oxygen consumption = 129.9.8 L (liters), metabolic equivalent = 7 metabolic equivalents (all estimates based on CDC standards))", "Was the task completed using the website mentioned in the prompt: {{website}}" ], - "reference_answers": "energy expenditure | 2720 kJ (kilojoules); average energy expenditure per step | 1.1 kJ/step (kilojoules per step); fat burned | 0.0842 kg (kilograms); oxygen consumption | 129.9 L (liters); metabolic equivalent | 7 metabolic equivalents", + "reference_answers": "energy expenditure = 2720 kJ (kilojoules), average energy expenditure per step = 1.1 kJ/step (kilojoules per step), fat burned = 0.042 kg (kilograms), oxygen consumption = 129.9.8 L (liters), metabolic equivalent = 7 metabolic equivalents (all estimates based on CDC standards)", "reference_answer_type": "golden", "id": 535, "task": "<will be generated based on instantion_dict by running the instantiation script>", From dcd37a7c25a57c90a233c5be17fc3e37b02bd702 Mon Sep 17 00:00:00 2001 From: DeepakAkkil <deepak.akkil@merlyn.org> Date: Thu, 3 Jul 2025 14:48:50 +0300 Subject: [PATCH 3/8] Include golden answer in the instantiated script --- tasks/instantiate_tasks.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tasks/instantiate_tasks.py b/tasks/instantiate_tasks.py index 09b6e0e..2f1c279 100644 --- a/tasks/instantiate_tasks.py +++ b/tasks/instantiate_tasks.py @@ -72,10 +72,15 @@ def generate_dynamic_data(evaluation_string): question = question.replace(f"{{{{{key}}}}}", str(value)) questions_for_annotators[q_index] = question item['task'] = intent_updated + item['answer_type'] = item.get('reference_answer_type', 'possible') + item['golden_answer'] = item.get('reference_answers', '') if item.get('status') and "remove" in item.get('status'): continue domain=item['domain'] - updated_data.append({"index":updated_task_index,"domain":domain,"task":intent_updated, "questions_for_annotation":item['questions_for_annotation']}) + if item['answer_type'] == 'golden': + updated_data.append({"index":updated_task_index,"domain":domain,"task":intent_updated, "questions_for_annotation":item['questions_for_annotation'], "golden_answer": item['golden_answer']}) + else: + updated_data.append({"index":updated_task_index,"domain":domain,"task":intent_updated, "questions_for_annotation":item['questions_for_annotation']}) updated_task_index+=1 today_date = datetime.now().strftime("%d_%m_%Y") From 0c08cb8766bb51ef26c917ed034e17ec62fd4837 Mon Sep 17 00:00:00 2001 From: DeepakAkkil <deepak.akkil@merlyn.org> Date: Sun, 13 Jul 2025 09:53:39 +0300 Subject: [PATCH 4/8] minor update --- tasks/webvoyager_template.json | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tasks/webvoyager_template.json b/tasks/webvoyager_template.json index e2dc410..085ae7a 100644 --- a/tasks/webvoyager_template.json +++ b/tasks/webvoyager_template.json @@ -6119,7 +6119,7 @@ 
"instantiation_dict": { "position": "infielders", "team": "New York Yankees", - "season": "@eval:(datetime.now().year - 1)", + "season": "for the latest season", "website": "https://www.espn.com/" }, "questions_for_annotation": [ @@ -8956,10 +8956,10 @@ }, "questions_for_annotation": [ "Does the agent correctly provide a SIGCHI conference that is still accepting full paper submission?", - "Does the agent's answer correctly state the soonest full paper submission deadline? (Note: This would require checking a bunch of upcoming conferences and their submission deadlines)" + "Does the agent's answer correctly state the soonest full paper submission deadline? (Note: This would require checking a bunch of upcoming SIGCHI conferences, (example SIGCHI conferences: CHI, Ubicomp, IUI, CSCW etc) and their submission deadlines)" ], - "reference_answers": "9.58s held by Usain Bolt of Jamaica", - "reference_answer_type": "golden", + "reference_answers": "", + "reference_answer_type": "possible", "id": 450, "task": "<will be generated based on instantion_dict by running the instantiation script>", "status": "active", @@ -10275,7 +10275,7 @@ "website": "https://www.wolframalpha.com/" }, "questions_for_annotation": [ - "Does the agent's answer correctly state the population growth of Canada between 2020 and 2023? (Answer: mean growth of 0.9998% per year)", + "Does the agent's answer correctly state the population growth of Canada between 2020 and 2023? (Answer: mean growth of 0.9886% per year based on updated WolframAlpha data)", "Was the task completed using the website mentioned in the prompt: {{website}}" ], "reference_answers": "mean population growth rate of Canada from 2020 to 2023 is 0.9998% per year", From a88d10c0fb41e8f48c2d205164d8664e9bda8cd2 Mon Sep 17 00:00:00 2001 From: DeepakAkkil <deepak.akkil@merlyn.org> Date: Mon, 21 Jul 2025 14:01:52 +0300 Subject: [PATCH 5/8] minor updates before release --- leaderboard/home.html | 162 ++++++++++++++-------------------------- leaderboard/script.js | 2 +- leaderboard/styles.css | 8 +- leaderboard/viewer.html | 135 --------------------------------- 4 files changed, 64 insertions(+), 243 deletions(-) delete mode 100644 leaderboard/viewer.html diff --git a/leaderboard/home.html b/leaderboard/home.html index 594d5f5..b3c1209 100644 --- a/leaderboard/home.html +++ b/leaderboard/home.html @@ -1,3 +1,4 @@ +<!-- MODIFIED HTML --> <!DOCTYPE html> <html lang="en"> <head> @@ -20,7 +21,7 @@ <h1>Emergence WebVoyager</h1> <nav class="main-nav"> <ul> <li><a href="home.html" class="active">Leaderboard</a></li> - <li><a href="viewer.html">Execution Viewer</a></li> + <li><a href="viewer.html" class="disabled-link">Execution Viewer</a></li> </ul> </nav> </header> @@ -38,12 +39,10 @@ <h2>WebAgent Performance Leaderboard</h2> </tr> </thead> <tbody> - <!-- Agent 1 --> + <!-- Operator (Placeholder now) --> <tr> <td>Operator <button class="expand-btn" data-agent="operator"><i class="fas fa-plus"></i></button></td> - <td>68.79%</td> - <td>93s</td> - <td>175s</td> + <td colspan="3" class="coming-soon">Coming Soon</td> </tr> <tr id="operator-details" class="domain-details"> <td colspan="4"> @@ -57,113 +56,64 @@ <h2>WebAgent Performance Leaderboard</h2> </tr> </thead> <tbody> - <tr class="domain-row success"> - <td>Apple</td> - <td>100%</td> - <td>137s</td> - <td>NA</td> - </tr> - <tr class="domain-row success"> - <td>Cambridge Dictionary</td> - <td>88.6%</td> - <td>65s</td> - <td>174s</td> - </tr> - <tr class="domain-row success"> - <td>Coursera</td> - <td>88.6%</td> - 
<td>80s</td> - <td>93s</td> - </tr> - <tr class="domain-row success"> - <td>BBC News</td> - <td>85.7%</td> - <td>98s</td> - <td>227s</td> - </tr> - <tr class="domain-row medium"> - <td>Allrecipes</td> - <td>74.3%</td> - <td>69s</td> - <td>88s</td> - </tr> - <tr class="domain-row medium"> - <td>GitHub</td> - <td>74.3%</td> - <td>73s</td> - <td>143s</td> - </tr> - <tr class="domain-row medium"> - <td>Wolfram Alpha</td> - <td>74.3%</td> - <td>49s</td> - <td>76s</td> - </tr> - <tr class="domain-row medium"> - <td>Google Maps</td> - <td>71.4%</td> - <td>88s</td> - <td>175s</td> - </tr> - <tr class="domain-row medium"> - <td>Search Engine</td> - <td>71.1%</td> - <td>83s</td> - <td>243s</td> - </tr> - <tr class="domain-row medium"> - <td>Google Flights</td> - <td>62.9%</td> - <td>86s</td> - <td>73s</td> - </tr> - <tr class="domain-row medium"> - <td>Hugging Face</td> - <td>60.0%</td> - <td>68s</td> - <td>99s</td> - </tr> - <tr class="domain-row low"> - <td>ESPN</td> - <td>57.1%</td> - <td>122s</td> - <td>236s</td> - </tr> - <tr class="domain-row low"> - <td>Arxiv</td> - <td>54.3%</td> - <td>69s</td> - <td>88s</td> - </tr> - <tr class="domain-row low"> - <td>Amazon</td> - <td>40.0%</td> - <td>169s</td> - <td>208s</td> - </tr> - <tr class="domain-row low"> - <td>Booking</td> - <td>28.6%</td> - <td>133s</td> - <td>287s</td> + <tr> + <td colspan="4" class="coming-soon">Coming Soon</td> </tr> - </tbody> </table> </td> </tr> - <!-- Agent 2 --> - <tr> - <td>Computer Use</td> - <td colspan="3" class="coming-soon">Coming Soon</td> - </tr> + <!-- Anthropic Computer Use --> + <tr> + <td>Anthropic Computer Use <button class="expand-btn" data-agent="computer-use"><i class="fas fa-plus"></i></button></td> + <td colspan="3" class="coming-soon">Coming Soon</td> + </tr> + <tr id="computer-use-details" class="domain-details"> + <td colspan="4"> + <table class="domain-table"> + <thead> + <tr> + <th>Domain</th> + <th>Success Rate</th> + <th>Avg Completion Time (Success)</th> + <th>Avg Completion Time (Failure)</th> + </tr> + </thead> + <tbody> + <tr> + <td colspan="4" class="coming-soon">Coming Soon</td> + </tr> + </tbody> + </table> + </td> + </tr> + + <!-- Emergence WebAgent --> + <tr> + <td>Emergence WebAgent <button class="expand-btn" data-agent="emergence"><i class="fas fa-plus"></i></button></td> + <td colspan="3" class="coming-soon">Coming Soon</td> + </tr> + <tr id="emergence-details" class="domain-details"> + <td colspan="4"> + <table class="domain-table"> + <thead> + <tr> + <th>Domain</th> + <th>Success Rate</th> + <th>Avg Completion Time (Success)</th> + <th>Avg Completion Time (Failure)</th> + </tr> + </thead> + <tbody> + <tr> + <td colspan="4" class="coming-soon">Coming Soon</td> + </tr> + </tbody> + </table> + </td> + </tr> - <!-- Agent 3 --> - <tr> - <td>Emergence WebAgent</td> - <td colspan="3" class="coming-soon">Coming Soon</td> - </tr> </tbody> </table> @@ -223,4 +173,4 @@ <h2>WebAgent Performance Leaderboard</h2> }); </script> </body> -</html> \ No newline at end of file +</html> diff --git a/leaderboard/script.js b/leaderboard/script.js index 1b9be47..68a3bae 100644 --- a/leaderboard/script.js +++ b/leaderboard/script.js @@ -117,7 +117,7 @@ document.addEventListener('DOMContentLoaded', function() { contentContainer.classList.add('hidden'); loadingContainer.classList.remove('hidden'); - const url = `https://storage.googleapis.com/emergence_webvoyager_annotations/${agent}/${index}.json`; + const url = `emergence_webvoyager_annotations/${agent}/${index}.json`; fetch(url) .then(response => { if 
(!response.ok) { diff --git a/leaderboard/styles.css b/leaderboard/styles.css index 5200817..a939567 100644 --- a/leaderboard/styles.css +++ b/leaderboard/styles.css @@ -13,7 +13,13 @@ --box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1); --transition: all 0.3s ease; } - +.disabled-link { + color: gray; + cursor: not-allowed; + pointer-events: none; + text-decoration: none; + opacity: 0.6; +} /* Leaderboard Specific Styles */ .leaderboard-container { padding: 20px; diff --git a/leaderboard/viewer.html b/leaderboard/viewer.html deleted file mode 100644 index 1f0a2fe..0000000 --- a/leaderboard/viewer.html +++ /dev/null @@ -1,135 +0,0 @@ -<!DOCTYPE html> -<html lang="en"> -<head> - <meta charset="UTF-8"> - <meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0"> [the rest of the deleted viewer.html markup (135 lines total) was garbled during extraction; the recoverable text includes the page title "Execution Viewer - Emergence WebVoyager Viewer", an "Emergence WebVoyager" page header, and the footer "Got feedback or found a bug? Hop into our Discord!"]
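Side note on the script.js change above: the annotation fetch now uses a relative path instead of the public GCS bucket URL, so the per-task JSON files are expected to be hosted alongside the leaderboard pages. Below is a minimal sketch of how that relative path resolves; the agent folder name, task index, and GitHub Pages URL are illustrative assumptions, not values asserted by the patch.

```js
// Hypothetical illustration of the relative-path resolution used by the
// updated fetch in leaderboard/script.js (values below are assumptions).
const agent = "operator"; // example agent folder
const index = 1;          // example task index

// When home.html is served from the leaderboard/ directory, the relative
// path resolves against that directory rather than the old GCS bucket:
const url = new URL(
  `emergence_webvoyager_annotations/${agent}/${index}.json`,
  "https://emergenceai.github.io/EmergenceWebVoyager/leaderboard/home.html"
);
console.log(url.href);
// -> https://emergenceai.github.io/EmergenceWebVoyager/leaderboard/emergence_webvoyager_annotations/operator/1.json
```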
From 074b1f732bb41291b41d6e75c428a3309c6da280 Mon Sep 17 00:00:00 2001 From: DeepakAkkil Date: Mon, 21 Jul 2025 14:06:31 +0300 Subject: [PATCH 6/8] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 74d6481..5234c42 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ An enhanced version of webvoyager with methodological consideration ## Leaderboard - [Leaderboard Home](https://emergenceai.github.io/EmergenceWebVoyager/leaderboard/home.html) -- [Leaderboard Viewer](https://emergenceai.github.io/EmergenceWebVoyager/leaderboard/viewer.html) +- [Leaderboard Viewer]Coming Soon ## Execution Videos Check out the execution videos on the leaderboard pages linked above. From a7dd9e47264a6910aff7c0e7d510256b39c4a1e5 Mon Sep 17 00:00:00 2001 From: DeepakAkkil Date: Mon, 21 Jul 2025 14:07:06 +0300 Subject: [PATCH 7/8] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5234c42..145b306 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ An enhanced version of webvoyager with methodological consideration ## Leaderboard - [Leaderboard Home](https://emergenceai.github.io/EmergenceWebVoyager/leaderboard/home.html) -- [Leaderboard Viewer]Coming Soon +- Leaderboard Viewer : Coming Soon ## Execution Videos Check out the execution videos on the leaderboard pages linked above. From 32547e1f0094f8343c45659aeca023e38e938ace Mon Sep 17 00:00:00 2001 From: DeepakAkkil Date: Mon, 21 Jul 2025 14:13:31 +0300 Subject: [PATCH 8/8] minor update to readme --- README.md | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/README.md b/README.md index 145b306..8b59d7e 100644 --- a/README.md +++ b/README.md @@ -3,10 +3,7 @@ An enhanced version of webvoyager with methodological consideration ## Leaderboard - [Leaderboard Home](https://emergenceai.github.io/EmergenceWebVoyager/leaderboard/home.html) -- Leaderboard Viewer : Coming Soon - -## Execution Videos -Check out the execution videos on the leaderboard pages linked above. +- Leaderboard Viewer : Coming Soon ## Discord Got feedback or found a bug? Hop into our [Discord](https://discord.com/invite/wgNfmFuqJF)! \ No newline at end of file