252 lines
104 KiB
JSON
252 lines
104 KiB
JSON
{"dataset": "test_task_0", "task_id": "8f6261cf-d665-4e61-93af-f50f0d366245", "annotation_id": "8f6261cf-d665-4e61-93af-f50f0d366245", "instruction": "Find all events taking place in New York City during the month of September.", "strict_f1": 0.378, "loose_f1": 0.541, "semantic_score": 0.318, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_0", "task_id": "918d7ef3-a6ec-458a-88f1-1c2726fd2883", "annotation_id": "918d7ef3-a6ec-458a-88f1-1c2726fd2883", "instruction": "Find help page about buying tickets.", "strict_f1": 0.308, "loose_f1": 0.308, "semantic_score": 0.5, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "9c3cba90-742a-4f3b-a2e7-767b565fae96", "annotation_id": "9c3cba90-742a-4f3b-a2e7-767b565fae96", "instruction": "check two ticket with best seat that has promo code first show happening in Hamilton New York on April.", "strict_f1": 0.341, "loose_f1": 0.537, "semantic_score": 0.336, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "failed", "agent_success": false, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "1d73ad40-f7f8-435e-a83d-8b38534427fd", "annotation_id": "1d73ad40-f7f8-435e-a83d-8b38534427fd", "instruction": "Find the cheapest women's plus size brown color loungewear in 3xl size.", "strict_f1": 0.233, "loose_f1": 0.419, "semantic_score": 0.344, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_0", "task_id": "91695df8-f256-47c9-8c37-06e8d0fc758f", "annotation_id": "91695df8-f256-47c9-8c37-06e8d0fc758f", "instruction": "Rent a truck with the lowest rent with two dozen furniture pads for 100 miles at zip 08817 on April 12 at 2:30 pm rented truck is to be returned to the exact location and date, and the pickup and drop off will be at the nearest location.", "strict_f1": 0.151, "loose_f1": 0.377, "semantic_score": 0.221, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_0", "task_id": "923fd4e0-1862-45b0-9bba-d57b956844da", "annotation_id": "923fd4e0-1862-45b0-9bba-d57b956844da", "instruction": "search gas pickup truck in Fremont with 2010 and 2017 with less than 80000 mile", "strict_f1": 0.412, "loose_f1": 0.412, "semantic_score": 0.28, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "0572899e-7c07-4a2f-a77b-bba4f432a7ad", "annotation_id": "0572899e-7c07-4a2f-a77b-bba4f432a7ad", "instruction": "find my trip with confirmation number SFTBAO including first and last name Joe Lukeman", "strict_f1": 0.261, "loose_f1": 0.348, "semantic_score": 0.314, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "failed", "agent_success": false, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "1b4859f4-6951-4f6a-8a74-1c9647900eb1", "annotation_id": "1b4859f4-6951-4f6a-8a74-1c9647900eb1", "instruction": "Find the status of March 25 flights from New York airports to Columbus in Ohio.", "strict_f1": 0.143, "loose_f1": 0.321, "semantic_score": 0.282, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_0", "task_id": "9223f1b4-43ad-4636-9541-99ff9e6ad918", "annotation_id": "9223f1b4-43ad-4636-9541-99ff9e6ad918", "instruction": "Browse the venues that are playing the Wicked show from Oct 5 to Oct 24 2023", "strict_f1": 0.148, "loose_f1": 0.148, "semantic_score": 0.158, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "failed", "agent_success": false, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "a52fcf7a-50aa-4256-8796-654b3dc3adac", "annotation_id": "a52fcf7a-50aa-4256-8796-654b3dc3adac", "instruction": "Buy a diamond pass in New York's, Great escape park, add one meal dining plan to it, and select the flexible payment plan for Jame Jones. The email address is jame_jones@hotmail.com, zip code 10005 and age is 35.", "strict_f1": 0.053, "loose_f1": 0.211, "semantic_score": 0.068, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "failed", "agent_success": false, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "a4f3beb3-422a-4049-8d17-44eade56fed1", "annotation_id": "a4f3beb3-422a-4049-8d17-44eade56fed1", "instruction": "Add to my wish list the highest rated activity in Amsterdam.", "strict_f1": 0.429, "loose_f1": 0.714, "semantic_score": 0.34, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "e39333ef-a5a5-4117-9af2-3bb243b364f2", "annotation_id": "e39333ef-a5a5-4117-9af2-3bb243b364f2", "instruction": "Find the movie Donnie Darko and show its complete cast.", "strict_f1": 0.167, "loose_f1": 0.167, "semantic_score": 0.233, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "e8b1cc02-1143-47cf-a7bb-9a16d08e155a", "annotation_id": "e8b1cc02-1143-47cf-a7bb-9a16d08e155a", "instruction": "Browse the list of top 250 movies and add the first one to my watchlist.", "strict_f1": 0.182, "loose_f1": 0.545, "semantic_score": 0.233, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "failed", "agent_success": false, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "b59b1997-884f-42f0-b083-fc93d6ce64fe", "annotation_id": "b59b1997-884f-42f0-b083-fc93d6ce64fe", "instruction": "see Nissan and Honda cars for sale near Kentwood, MI 49512", "strict_f1": 0.222, "loose_f1": 0.222, "semantic_score": 0.156, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "e483a49f-029d-446c-892c-c56b92fc463b", "annotation_id": "e483a49f-029d-446c-892c-c56b92fc463b", "instruction": "Build an entry-level pc with an windows 11 64 bit intel i7 CPU with a256gb ssd drive + 4gb ram and adding cheapest component and accessories available.", "strict_f1": 0.4, "loose_f1": 0.494, "semantic_score": 0.238, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_0", "task_id": "14d50319-3f81-4aa6-8ee8-d1b66e4d5d64", "annotation_id": "14d50319-3f81-4aa6-8ee8-d1b66e4d5d64", "instruction": "Find 32\" Curved monitor and add the third one to the wish list.", "strict_f1": 0.286, "loose_f1": 0.286, "semantic_score": 0.167, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "f9a882f7-826e-469a-ad69-0d5f912734c9", "annotation_id": "f9a882f7-826e-469a-ad69-0d5f912734c9", "instruction": "Search the cheapest Curry brand unisex athletic shoes with the number 5.5, add to cart and checkout.", "strict_f1": 0.481, "loose_f1": 0.556, "semantic_score": 0.447, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_0", "task_id": "a5dd5729-415a-4fe2-a840-4935bf9428d4", "annotation_id": "a5dd5729-415a-4fe2-a840-4935bf9428d4", "instruction": "Browse spider-man toys for kids and sort by lowest price.", "strict_f1": 0.0, "loose_f1": 0.0, "semantic_score": 0.0, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "a2500e0b-9244-4f0e-b686-fa290c32b829", "annotation_id": "a2500e0b-9244-4f0e-b686-fa290c32b829", "instruction": "Find the store location and hours of the closest Gamestop to zip code 90028 and set as home store", "strict_f1": 0.0, "loose_f1": 0.25, "semantic_score": 0.1, "judge_verdict": "fail", "judge_score": 0.5, "agent_status": "failed", "agent_success": false, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "942666cb-147a-4033-be7e-d56ccca47506", "annotation_id": "942666cb-147a-4033-be7e-d56ccca47506", "instruction": "Find and view the biography for the Host of the Price is Right.", "strict_f1": 0.222, "loose_f1": 0.222, "semantic_score": 0.175, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "4fa7cab9-8448-4cdb-842f-dba109b3a13e", "annotation_id": "4fa7cab9-8448-4cdb-842f-dba109b3a13e", "instruction": "Find a private room in New York for 1 April and checkout on 2 April for 2 adults", "strict_f1": 0.636, "loose_f1": 0.636, "semantic_score": 0.345, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "961e4feb-2b5b-4372-a5db-c7d3222aac21", "annotation_id": "961e4feb-2b5b-4372-a5db-c7d3222aac21", "instruction": "Check the status of train S92 for any disruptions.", "strict_f1": 0.0, "loose_f1": 0.182, "semantic_score": 0.067, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "a29533ef-86ae-42fd-a7d2-a6a62ffe689d", "annotation_id": "a29533ef-86ae-42fd-a7d2-a6a62ffe689d", "instruction": "Get the report from the final environmental impact statement for the Jamaica Bus Depot expansion.", "strict_f1": 0.3, "loose_f1": 0.6, "semantic_score": 0.243, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "c52fcdf7-1f23-4074-91bb-1a121af02a80", "annotation_id": "c52fcdf7-1f23-4074-91bb-1a121af02a80", "instruction": "Plan a trip to reach JFK airport from central park by 11am on April 12", "strict_f1": 0.204, "loose_f1": 0.367, "semantic_score": 0.229, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_0", "task_id": "1a53fb39-4d08-4722-addd-f04b0025ef81", "annotation_id": "1a53fb39-4d08-4722-addd-f04b0025ef81", "instruction": "Rent \" The Whale \" movie on demand with format \"high definition\".", "strict_f1": 0.125, "loose_f1": 0.188, "semantic_score": 0.25, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_0", "task_id": "453ebdd8-0989-455e-87ba-ebad183c0a04", "annotation_id": "453ebdd8-0989-455e-87ba-ebad183c0a04", "instruction": "Browse the page with event planning tips.", "strict_f1": 0.0, "loose_f1": 0.0, "semantic_score": 0.0, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_0", "task_id": "4b8fb0aa-7d7c-4a22-bfd5-f09316a050c3", "annotation_id": "4b8fb0aa-7d7c-4a22-bfd5-f09316a050c3", "instruction": "Search for a paid fishing class event on chicago", "strict_f1": 0.0, "loose_f1": 0.0, "semantic_score": 0.0, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_0", "task_id": "d6545454-33e8-4a35-988e-fa6cc0eb5873", "annotation_id": "d6545454-33e8-4a35-988e-fa6cc0eb5873", "instruction": "check available hotels with one room for two adult in Harlem less than $200 to check in on Mar 17th and check out on Mar 20th", "strict_f1": 0.278, "loose_f1": 0.444, "semantic_score": 0.29, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_0", "task_id": "c4380ce9-af36-4025-936d-354bf768c8b9", "annotation_id": "c4380ce9-af36-4025-936d-354bf768c8b9", "instruction": "Look for a business class flight to Paris from Salt Lake City on June 2, with a return on June 7, and checkout", "strict_f1": 0.237, "loose_f1": 0.373, "semantic_score": 0.269, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_0", "task_id": "81b4816a-9107-4951-ae3a-6587f28b49e8", "annotation_id": "81b4816a-9107-4951-ae3a-6587f28b49e8", "instruction": "Open my likes list.", "strict_f1": 0.4, "loose_f1": 0.4, "semantic_score": 0.35, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "105d3ad2-9a1b-4eef-9215-30d432a47e73", "annotation_id": "105d3ad2-9a1b-4eef-9215-30d432a47e73", "instruction": "Find out what popular events are being held this weekend in the category performing and visual arts near Chester, UK", "strict_f1": 0.194, "loose_f1": 0.387, "semantic_score": 0.243, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "aecaba3f-5ba7-44ba-8c05-e06c5d39a3c3", "annotation_id": "aecaba3f-5ba7-44ba-8c05-e06c5d39a3c3", "instruction": "Plan a bus trip going from the Boston Logan Airport to South Station", "strict_f1": 0.138, "loose_f1": 0.552, "semantic_score": 0.145, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "c094948f-afc6-415c-968a-9e105e2db118", "annotation_id": "c094948f-afc6-415c-968a-9e105e2db118", "instruction": "View the latest job openings in safety with a salary above 100k per annum, check the details, and apply.", "strict_f1": 0.621, "loose_f1": 0.621, "semantic_score": 0.35, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "4357a1ab-c012-47bd-94a8-720150cb8775", "annotation_id": "4357a1ab-c012-47bd-94a8-720150cb8775", "instruction": "Add The Wire to the watchlist.", "strict_f1": 0.0, "loose_f1": 0.222, "semantic_score": 0.0, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "failed", "agent_success": false, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "18fc60d7-aa69-4c07-9bf1-64543eae52c9", "annotation_id": "18fc60d7-aa69-4c07-9bf1-64543eae52c9", "instruction": "Add a e-gift card to bag of $100 for recipient John and email address abc@test.com from buckeye.foobar@gmail.com with message gift card.", "strict_f1": 0.0, "loose_f1": 0.296, "semantic_score": 0.044, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "2daa15a5-649e-43fa-912f-00111b163fb6", "annotation_id": "2daa15a5-649e-43fa-912f-00111b163fb6", "instruction": "Add formula 1 to my followed sports.", "strict_f1": 0.143, "loose_f1": 0.571, "semantic_score": 0.14, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "failed", "agent_success": false, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "691c18cc-b1ad-44e5-a506-584198162ae1", "annotation_id": "691c18cc-b1ad-44e5-a506-584198162ae1", "instruction": "add WWE superstar ALIYAH to your favorite by following her.", "strict_f1": 0.286, "loose_f1": 0.571, "semantic_score": 0.317, "judge_verdict": "fail", "judge_score": 0.5, "agent_status": "failed", "agent_success": false, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "ed60077a-1853-4b0d-8174-b339d08de32e", "annotation_id": "ed60077a-1853-4b0d-8174-b339d08de32e", "instruction": "Search the latest story about NFL and share it on facebook.", "strict_f1": 0.08, "loose_f1": 0.32, "semantic_score": 0.225, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "failed", "agent_success": false, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "7f90a191-9dbe-478a-8ae2-8aa45b790158", "annotation_id": "7f90a191-9dbe-478a-8ae2-8aa45b790158", "instruction": "Find more films from the director of Smile.", "strict_f1": 0.087, "loose_f1": 0.261, "semantic_score": 0.175, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "790ba0ec-4e7d-4df0-ac86-ea52b3a73532", "annotation_id": "790ba0ec-4e7d-4df0-ac86-ea52b3a73532", "instruction": "Add my birthday detail, January 5, 1980, and dairy and peanut allergy in my profile, also add love ramen noodles to my bio, and save.", "strict_f1": 0.4, "loose_f1": 0.5, "semantic_score": 0.222, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "failed", "agent_success": false, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "8dcf6423-262a-439b-9ee7-279a920468fa", "annotation_id": "8dcf6423-262a-439b-9ee7-279a920468fa", "instruction": "Tell me more about the Adirondack route.", "strict_f1": 0.074, "loose_f1": 0.222, "semantic_score": 0.3, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "6bf5cdf6-abc8-4425-b813-1a0b51ed16bb", "annotation_id": "6bf5cdf6-abc8-4425-b813-1a0b51ed16bb", "instruction": "Find tickets between $200-300 for next Amy Grant concert", "strict_f1": 0.176, "loose_f1": 0.412, "semantic_score": 0.19, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "7dfdeddd-b449-44cb-a0b0-1fde889219e2", "annotation_id": "7dfdeddd-b449-44cb-a0b0-1fde889219e2", "instruction": "Find the mobile delivery tickets for the up coming event on march 23th at Columbus with low cast.", "strict_f1": 0.069, "loose_f1": 0.276, "semantic_score": 0.225, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "failed", "agent_success": false, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "94f88670-09a4-4926-9353-f5eed2d81c01", "annotation_id": "94f88670-09a4-4926-9353-f5eed2d81c01", "instruction": "Get the cheapest hotel room for 1 adult with a free wifi in Seoul from 10th to 12th April for work purposes with no prepayment and a review score of 7+.", "strict_f1": 0.345, "loose_f1": 0.586, "semantic_score": 0.274, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_0", "task_id": "ef3a7151-b9cd-4ddb-aa76-e7e66815c23d", "annotation_id": "ef3a7151-b9cd-4ddb-aa76-e7e66815c23d", "instruction": "Show me the list of Men's Blazers, Black, Size M.", "strict_f1": 0.333, "loose_f1": 0.333, "semantic_score": 0.237, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "b674adf9-8950-4d27-b888-b789ccd49479", "annotation_id": "b674adf9-8950-4d27-b888-b789ccd49479", "instruction": "Look for an intern career within Newegg in City of Industry, California, USA, and bookmark it.", "strict_f1": 0.25, "loose_f1": 0.333, "semantic_score": 0.189, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "failed", "agent_success": false, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "7ac28815-ae2a-4eef-afed-d75ec2d11b2b", "annotation_id": "7ac28815-ae2a-4eef-afed-d75ec2d11b2b", "instruction": "Browse the used inventory of the Model Y with performance all-wheel drive.", "strict_f1": 0.154, "loose_f1": 0.462, "semantic_score": 0.233, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "4aa42fe7-cbef-48ed-95e9-fc041762ea65", "annotation_id": "4aa42fe7-cbef-48ed-95e9-fc041762ea65", "instruction": "Book the cheapest parking spot near Bradley Airport", "strict_f1": 0.0, "loose_f1": 0.154, "semantic_score": 0.05, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "failed", "agent_success": false, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "02aaea66-b7f4-4fac-8849-92480bf9b542", "annotation_id": "02aaea66-b7f4-4fac-8849-92480bf9b542", "instruction": "Find a place to stay near Great Escape New Park from April 21 to April 24 for 2 adults and 1 kid, and book the cheapest themed room.", "strict_f1": 0.5, "loose_f1": 0.583, "semantic_score": 0.443, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_0", "task_id": "19b955ba-fdcd-4345-b33a-fc6a88b5a85d", "annotation_id": "19b955ba-fdcd-4345-b33a-fc6a88b5a85d", "instruction": "Apply for a job on the Six Flags White Water park", "strict_f1": 0.167, "loose_f1": 0.333, "semantic_score": 0.35, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "374f69aa-bbd8-4202-9026-afc84f197421", "annotation_id": "374f69aa-bbd8-4202-9026-afc84f197421", "instruction": "What are the family rides available at frontier city?", "strict_f1": 0.364, "loose_f1": 0.455, "semantic_score": 0.4, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "e37f9fa1-70bc-4f55-86c4-8b7593e8fb48", "annotation_id": "e37f9fa1-70bc-4f55-86c4-8b7593e8fb48", "instruction": "Book a one-way, fastest, and most flexible direct flight ticket for two from Banglore to Goa on March 29 evening.", "strict_f1": 0.667, "loose_f1": 0.756, "semantic_score": 0.375, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_0", "task_id": "4dceb921-f7ff-4069-b860-47cf663072ee", "annotation_id": "4dceb921-f7ff-4069-b860-47cf663072ee", "instruction": "Show me reviews for Elden Ring, best to worst.", "strict_f1": 0.273, "loose_f1": 0.364, "semantic_score": 0.17, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "9d8f230b-433f-4aa1-ac42-4a8cf2e0a5a6", "annotation_id": "9d8f230b-433f-4aa1-ac42-4a8cf2e0a5a6", "instruction": "Browse newly released movies and filter by genre to find a comedy to rent.", "strict_f1": 0.071, "loose_f1": 0.214, "semantic_score": 0.167, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "e77dc7d4-59c9-48ad-a36e-fcd999a4e5e1", "annotation_id": "e77dc7d4-59c9-48ad-a36e-fcd999a4e5e1", "instruction": "Find the top rated indie rock music track and set the track as current obsession and after that watch all other obsessions too.", "strict_f1": 0.286, "loose_f1": 0.571, "semantic_score": 0.283, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "01c6e863-be94-4c6b-b9c1-55c203332f50", "annotation_id": "01c6e863-be94-4c6b-b9c1-55c203332f50", "instruction": "Find an editor's choice review with a score of 10 in the boardgame category.", "strict_f1": 0.071, "loose_f1": 0.143, "semantic_score": 0.125, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_0", "task_id": "36b6bb33-de92-489a-baa1-a286a7b2dabd", "annotation_id": "36b6bb33-de92-489a-baa1-a286a7b2dabd", "instruction": "Find a walkthrough for the game \"The Legend of Zelda: Breath of the Wild\".", "strict_f1": 0.0, "loose_f1": 0.095, "semantic_score": 0.05, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_0", "task_id": "50d8cbaa-d4e5-48fa-8733-c5621de7ab7e", "annotation_id": "50d8cbaa-d4e5-48fa-8733-c5621de7ab7e", "instruction": "Find the most wanted contemporary Jazz album in the US from the last decade.", "strict_f1": 0.48, "loose_f1": 0.64, "semantic_score": 0.356, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "7eecbe7f-58d9-4b2f-b1ec-27e31562ea77", "annotation_id": "7eecbe7f-58d9-4b2f-b1ec-27e31562ea77", "instruction": "Find the name of Depeche mode's latest album and add it to my wantlist.", "strict_f1": 0.138, "loose_f1": 0.345, "semantic_score": 0.2, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "8251e820-4b8a-4221-b2d9-8158cada3dcf", "annotation_id": "8251e820-4b8a-4221-b2d9-8158cada3dcf", "instruction": "Open the page with a overview about the submission of releases.", "strict_f1": 0.5, "loose_f1": 0.5, "semantic_score": 0.333, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "a8a7ecb3-3b3d-4643-a4c1-2a899b6a0465", "annotation_id": "a8a7ecb3-3b3d-4643-a4c1-2a899b6a0465", "instruction": "Browse a list of rock vinyls shipped from Germany.", "strict_f1": 0.182, "loose_f1": 0.364, "semantic_score": 0.225, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "774f0c2f-6caa-447f-a338-2df822882cbe", "annotation_id": "774f0c2f-6caa-447f-a338-2df822882cbe", "instruction": "Add five of red 50-dollar gift carts to the cart, during checkout, check if coupon 100OFF is working or not.", "strict_f1": 0.108, "loose_f1": 0.432, "semantic_score": 0.16, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_0", "task_id": "79d9203e-dc14-430c-81e2-ac6bcf002a07", "annotation_id": "79d9203e-dc14-430c-81e2-ac6bcf002a07", "instruction": "View all of the Most Popular TV on RT.", "strict_f1": 0.0, "loose_f1": 0.2, "semantic_score": 0.2, "judge_verdict": "error", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "c1f584e2-e353-4298-b98b-fb21cbf2c16c", "annotation_id": "c1f584e2-e353-4298-b98b-fb21cbf2c16c", "instruction": "Find the latest Game of Thrones toys for English speakers and add the top two result to the cart and checkout.", "strict_f1": 0.235, "loose_f1": 0.353, "semantic_score": 0.167, "judge_verdict": "pass", "judge_score": 0.5, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "c13d245a-4f01-4b5c-b721-b1dd989f5f13", "annotation_id": "c13d245a-4f01-4b5c-b721-b1dd989f5f13", "instruction": "Check travel requirements for trips between Tokyo and Guangzhou.", "strict_f1": 0.103, "loose_f1": 0.256, "semantic_score": 0.243, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_0", "task_id": "b0bb4740-9b8d-4144-b443-c6e3d7db59f8", "annotation_id": "b0bb4740-9b8d-4144-b443-c6e3d7db59f8", "instruction": "Find a store in Tempe, Arizona, make it my store, and then visit the store page and see whats happening in store.", "strict_f1": 0.074, "loose_f1": 0.296, "semantic_score": 0.18, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "failed", "agent_success": false, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "ccb397da-ddee-4b7f-8d41-249b6f37e963", "annotation_id": "ccb397da-ddee-4b7f-8d41-249b6f37e963", "instruction": "Find a bedroom nightstand made from solid wood and compare the top three highest-rated products, and show only the differences.", "strict_f1": 0.244, "loose_f1": 0.39, "semantic_score": 0.169, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "63693bf5-2b77-486e-868e-4a5d0b98f83b", "annotation_id": "63693bf5-2b77-486e-868e-4a5d0b98f83b", "instruction": "Choose a car which is closest to 10001 New York City for 1 day.", "strict_f1": 0.128, "loose_f1": 0.255, "semantic_score": 0.214, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_0", "task_id": "e74c6528-0ec1-4169-90a9-970e66fabc0c", "annotation_id": "e74c6528-0ec1-4169-90a9-970e66fabc0c", "instruction": "Find a large car with lowest price from Mar 28 to Apr 1 in Zurich.", "strict_f1": 0.159, "loose_f1": 0.222, "semantic_score": 0.312, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_0", "task_id": "96016e7b-4097-49ac-9b72-97b061a886a6", "annotation_id": "96016e7b-4097-49ac-9b72-97b061a886a6", "instruction": "Explore the trending playlists,filter by tag #Electronics, play Countdown by Adrien Brunelat and like it.", "strict_f1": 0.0, "loose_f1": 0.0, "semantic_score": 0.033, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_0", "task_id": "0d56ec88-1203-4c38-8d2c-90b3a3d12c25", "annotation_id": "0d56ec88-1203-4c38-8d2c-90b3a3d12c25", "instruction": "Find me the cheapest external HD for an Xbox One.", "strict_f1": 0.25, "loose_f1": 0.25, "semantic_score": 0.188, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "3ca5cfbb-1416-457b-8c13-94a2ed0002bf", "annotation_id": "3ca5cfbb-1416-457b-8c13-94a2ed0002bf", "instruction": "Find the top number one track by the artist \"Alan Tam\" of all time.", "strict_f1": 0.4, "loose_f1": 0.4, "semantic_score": 0.34, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "12cbd3a8-1087-413d-8114-3d754caaa362", "annotation_id": "12cbd3a8-1087-413d-8114-3d754caaa362", "instruction": "Add the pre made College Supply List to my list.", "strict_f1": 0.087, "loose_f1": 0.348, "semantic_score": 0.225, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_0", "task_id": "845fbfa9-1b98-4df4-b7c5-4c71ef3e5b1b", "annotation_id": "845fbfa9-1b98-4df4-b7c5-4c71ef3e5b1b", "instruction": "check the national cafe menu", "strict_f1": 0.0, "loose_f1": 0.323, "semantic_score": 0.2, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "925a2307-b2b7-4189-bf25-e3f463c24e1c", "annotation_id": "925a2307-b2b7-4189-bf25-e3f463c24e1c", "instruction": "Check my AMC gift card balance with gift card number 87654321 and pin number 9753.", "strict_f1": 0.1, "loose_f1": 0.3, "semantic_score": 0.1, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "failed", "agent_success": false, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "ee1e95ab-4c5d-44c6-b302-783fd13a471e", "annotation_id": "ee1e95ab-4c5d-44c6-b302-783fd13a471e", "instruction": "Add two Maryla\nnd park all-season go-cart passes add-on in the cart, and choose the digital ticket option.", "strict_f1": 0.085, "loose_f1": 0.426, "semantic_score": 0.18, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "failed", "agent_success": false, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "38093911-e502-4333-9819-19e130be4056", "annotation_id": "38093911-e502-4333-9819-19e130be4056", "instruction": "Give a 10 rating to The Terminator II: Judgement Day.", "strict_f1": 0.167, "loose_f1": 0.333, "semantic_score": 0.24, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "failed", "agent_success": false, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "f38789fb-477a-4b16-92bf-f71e36fe262d", "annotation_id": "f38789fb-477a-4b16-92bf-f71e36fe262d", "instruction": "Check travel restrictions when traveling from US to Cambodia.", "strict_f1": 0.167, "loose_f1": 0.5, "semantic_score": 0.1, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "6cf8ca9c-672d-426e-ab6c-c865475edcd4", "annotation_id": "6cf8ca9c-672d-426e-ab6c-c865475edcd4", "instruction": "Search for a 10 day cruise to Alaska from San Francisco in June 2023.", "strict_f1": 0.636, "loose_f1": 0.818, "semantic_score": 0.433, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "01815816-53e8-43b4-8923-b0f4390a9a15", "annotation_id": "01815816-53e8-43b4-8923-b0f4390a9a15", "instruction": "Find technical specs for the latest Macbook Air.", "strict_f1": 0.133, "loose_f1": 0.133, "semantic_score": 0.175, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "1a13b675-36a8-4b42-b246-db49797e5d2b", "annotation_id": "1a13b675-36a8-4b42-b246-db49797e5d2b", "instruction": "Add a set of queen-sized bed sheets with at least a 4-star rating to the cart.", "strict_f1": 0.5, "loose_f1": 0.5, "semantic_score": 0.25, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "1c2baca4-8c20-4e04-b6f6-90db4f565a72", "annotation_id": "1c2baca4-8c20-4e04-b6f6-90db4f565a72", "instruction": "Find the 24 count cheapest septic safe mega-size toiler paper in buy-and-save offers and add to the cart for pick-up.", "strict_f1": 0.394, "loose_f1": 0.606, "semantic_score": 0.288, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_0", "task_id": "1140f858-b580-4c96-a06c-08fa9d020131", "annotation_id": "1140f858-b580-4c96-a06c-08fa9d020131", "instruction": "Search for \"you are Amazing\" congrats gift card which can be printed and folded at home, priced between 50 and 100, and add two 50-dollar cards to the cart.", "strict_f1": 0.263, "loose_f1": 0.474, "semantic_score": 0.32, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_0", "task_id": "181e41bd-db3d-4313-a5dc-e7c79c9f27eb", "annotation_id": "181e41bd-db3d-4313-a5dc-e7c79c9f27eb", "instruction": "Enter the bargain cave to buy a 9 mm pistol from the clearance sale and that has an average rating of 4 and 5.", "strict_f1": 0.17, "loose_f1": 0.426, "semantic_score": 0.24, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_0", "task_id": "f9c80513-c1c2-42ef-b768-9a6d0bb5a9a5", "annotation_id": "f9c80513-c1c2-42ef-b768-9a6d0bb5a9a5", "instruction": "Find discussions of community and open one with most replies.", "strict_f1": 0.267, "loose_f1": 0.533, "semantic_score": 0.3, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "a7000718-a44d-447f-9e3f-b4b855c508ef", "annotation_id": "a7000718-a44d-447f-9e3f-b4b855c508ef", "instruction": "Set the first recommended song on the homepage as a current obsession", "strict_f1": 0.133, "loose_f1": 0.4, "semantic_score": 0.233, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "failed", "agent_success": false, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "8c82b107-9cf0-4b0f-aa23-433aaba5a8b5", "annotation_id": "8c82b107-9cf0-4b0f-aa23-433aaba5a8b5", "instruction": "Find newest farming & crafting simulation games that are free.", "strict_f1": 0.182, "loose_f1": 0.545, "semantic_score": 0.233, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "c472a4fe-33a0-4b6f-8d42-adcc067ba4ed", "annotation_id": "c472a4fe-33a0-4b6f-8d42-adcc067ba4ed", "instruction": "Browse books by author Stephen King in German.", "strict_f1": 0.154, "loose_f1": 0.308, "semantic_score": 0.175, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "bf7321eb-fb52-4440-8a5f-157114d935e9", "annotation_id": "bf7321eb-fb52-4440-8a5f-157114d935e9", "instruction": "Find upcoming NBA games of the Miami Heat and book the 5 level 320 tickets for Sunday, April 16.", "strict_f1": 0.143, "loose_f1": 0.5, "semantic_score": 0.257, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_0", "task_id": "7f90189b-f824-4bab-b764-c6fba29cff7a", "annotation_id": "7f90189b-f824-4bab-b764-c6fba29cff7a", "instruction": "Book the cheapest economy flight between Miami and Nairobi for two adults on May 4 with a return option on May 8.", "strict_f1": 0.455, "loose_f1": 0.515, "semantic_score": 0.416, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_0", "task_id": "070e63b9-e341-485d-ad10-82bf9ea45715", "annotation_id": "070e63b9-e341-485d-ad10-82bf9ea45715", "instruction": "Contact the support service.", "strict_f1": 0.267, "loose_f1": 0.4, "semantic_score": 0.333, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "2e78c6c2-d807-4d7a-8099-70b04d367d57", "annotation_id": "2e78c6c2-d807-4d7a-8099-70b04d367d57", "instruction": "Search for Monthly Parking Starting april 12 near Florida Keys", "strict_f1": 0.111, "loose_f1": 0.278, "semantic_score": 0.167, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "f02ef5ca-8ebc-451b-bf6d-d67463a467b3", "annotation_id": "f02ef5ca-8ebc-451b-bf6d-d67463a467b3", "instruction": "Check the availability of 2 adult and 2 child tickets for most popular FamilyFriendly Event in Denmark on 25th march.", "strict_f1": 0.629, "loose_f1": 0.686, "semantic_score": 0.356, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "failed", "agent_success": false, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "08840c76-eabb-4537-ab41-f62d1e7c94c3", "annotation_id": "08840c76-eabb-4537-ab41-f62d1e7c94c3", "instruction": "Add a front load washing machine under $800 to the cart.", "strict_f1": 0.174, "loose_f1": 0.261, "semantic_score": 0.2, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "d2171cc3-2514-4271-b255-d026f0563b2d", "annotation_id": "d2171cc3-2514-4271-b255-d026f0563b2d", "instruction": "Add the #1 and #2 ranked boardgame to a collection.", "strict_f1": 0.552, "loose_f1": 0.69, "semantic_score": 0.286, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "32174c75-e2f4-4a98-9233-3758cf4b0860", "annotation_id": "32174c75-e2f4-4a98-9233-3758cf4b0860", "instruction": "Get the nearest attorney in West Hollywood, Florida, who can handle drug charges, accepts credit cards, and speaks Spanish.", "strict_f1": 0.195, "loose_f1": 0.39, "semantic_score": 0.185, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_0", "task_id": "34c474ef-389c-421d-acbf-de5531437083", "annotation_id": "34c474ef-389c-421d-acbf-de5531437083", "instruction": "Look for hair salon coupons in San Diego.", "strict_f1": 0.154, "loose_f1": 0.308, "semantic_score": 0.125, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "ba5c335f-42ac-4955-92d2-96a636e4cbee", "annotation_id": "ba5c335f-42ac-4955-92d2-96a636e4cbee", "instruction": "Find Chevrolet Silverado with black exterior color and maximum value of $30,000.", "strict_f1": 0.421, "loose_f1": 0.421, "semantic_score": 0.222, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "d496bbbe-44f9-48cc-a11b-9540196dcf60", "annotation_id": "d496bbbe-44f9-48cc-a11b-9540196dcf60", "instruction": "Search for a full-time job in sales in Springfield and apply for the most recent job.", "strict_f1": 0.233, "loose_f1": 0.419, "semantic_score": 0.27, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "failed", "agent_success": false, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "dd8a2207-a5b0-4116-a63d-b62835d68b4e", "annotation_id": "dd8a2207-a5b0-4116-a63d-b62835d68b4e", "instruction": "Find only SUVs & Wagons that can be picked up at SFO on April 20, 2023 and returned on April 27, 2023.", "strict_f1": 0.471, "loose_f1": 0.588, "semantic_score": 0.275, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "cdf4d2ec-202c-465c-b09c-de3790c109f6", "annotation_id": "cdf4d2ec-202c-465c-b09c-de3790c109f6", "instruction": "Find morning sports experiences in english for one adult and 2 children in portugal on may 2", "strict_f1": 0.473, "loose_f1": 0.545, "semantic_score": 0.425, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_1", "task_id": "826e2b3e-0711-4ee7-848d-52cbf88f0b58", "annotation_id": "826e2b3e-0711-4ee7-848d-52cbf88f0b58", "instruction": "Add a pair of large Yaktrax traction cleats to the cart.", "strict_f1": 0.154, "loose_f1": 0.154, "semantic_score": 0.117, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "24bbf21c-e433-46d6-8a3b-896b0015c0e5", "annotation_id": "24bbf21c-e433-46d6-8a3b-896b0015c0e5", "instruction": "Find the list of all neighborhood maps for Brooklyn", "strict_f1": 0.111, "loose_f1": 0.333, "semantic_score": 0.3, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "7f78da3a-cfef-4667-8b05-cef9f836280d", "annotation_id": "7f78da3a-cfef-4667-8b05-cef9f836280d", "instruction": "check all jobs position available by department in Downtown, Manhattan.", "strict_f1": 0.133, "loose_f1": 0.4, "semantic_score": 0.24, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_1", "task_id": "a34a5ed4-b475-45b6-a4f9-bfdc774d80e5", "annotation_id": "a34a5ed4-b475-45b6-a4f9-bfdc774d80e5", "instruction": "Find the closest in-store Gamestop location within 50 miles of 21122.", "strict_f1": 0.0, "loose_f1": 0.167, "semantic_score": 0.04, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_1", "task_id": "5fe49ab4-30cc-4f2b-be9f-4cf174b1584d", "annotation_id": "5fe49ab4-30cc-4f2b-be9f-4cf174b1584d", "instruction": "Show crazy credits for the movie \" Prometheus\".", "strict_f1": 0.118, "loose_f1": 0.235, "semantic_score": 0.233, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "73782df1-0d1a-4ffb-a718-60110e25a3f3", "annotation_id": "73782df1-0d1a-4ffb-a718-60110e25a3f3", "instruction": "Find the most popular movies and showcase those with the highest IMDb ratings.", "strict_f1": 0.0, "loose_f1": 0.0, "semantic_score": 0.1, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "012446b3-ee30-480b-86ec-3a3cdeaba9dc", "annotation_id": "012446b3-ee30-480b-86ec-3a3cdeaba9dc", "instruction": "Show me cooking online experiences for 4 people on may 10", "strict_f1": 0.136, "loose_f1": 0.364, "semantic_score": 0.188, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_1", "task_id": "87e3b392-e1de-4ca0-aee6-54a466b138c7", "annotation_id": "87e3b392-e1de-4ca0-aee6-54a466b138c7", "instruction": "Find the statistics of the player Cristiano Ronaldo for the national team of Portugal in all the world cups.", "strict_f1": 0.045, "loose_f1": 0.136, "semantic_score": 0.117, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_1", "task_id": "88a8cd3b-3d01-4b4c-89bd-956fead2ce41", "annotation_id": "88a8cd3b-3d01-4b4c-89bd-956fead2ce41", "instruction": "Check the Italian Serie A Schedule fixtures on April 8th 2023", "strict_f1": 0.2, "loose_f1": 0.4, "semantic_score": 0.214, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_1", "task_id": "03fd47fc-8412-487a-b895-26c80a95c398", "annotation_id": "03fd47fc-8412-487a-b895-26c80a95c398", "instruction": "Make a build list with AI for a gaming pc with ryzen 9,rtx 6950xt and price $2000-$2500.", "strict_f1": 0.053, "loose_f1": 0.158, "semantic_score": 0.125, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_1", "task_id": "96cf7de0-e64f-466f-a9b9-99485461bc45", "annotation_id": "96cf7de0-e64f-466f-a9b9-99485461bc45", "instruction": "View top-rated new restaurants in Toronto and book a table for two for lunch on April 22, am in any Malaysian restaurant after 9 pm for dinner services.", "strict_f1": 0.368, "loose_f1": 0.579, "semantic_score": 0.318, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "d6a40526-d2ac-4367-b0c2-00ca99667e6a", "annotation_id": "d6a40526-d2ac-4367-b0c2-00ca99667e6a", "instruction": "Get the lowest priced women's plus size one piece swimsuit in color black with customer rating of atleat 5.", "strict_f1": 0.242, "loose_f1": 0.424, "semantic_score": 0.28, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "b60cf528-e446-4f46-b1de-c8ea262a6121", "annotation_id": "b60cf528-e446-4f46-b1de-c8ea262a6121", "instruction": "Find Auto train going from Moynihan to Chicago IL leaving on April 22nd to 29th for 2 adult.", "strict_f1": 0.259, "loose_f1": 0.519, "semantic_score": 0.247, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_1", "task_id": "96f184d8-cc70-4cce-8c7f-9ab60a27665c", "annotation_id": "96f184d8-cc70-4cce-8c7f-9ab60a27665c", "instruction": "Get the most reviewed shopping store that accepts apple pay in Central New York.", "strict_f1": 0.333, "loose_f1": 0.5, "semantic_score": 0.275, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "d5b1ca5f-a11f-4374-833f-d1a9ef711698", "annotation_id": "d5b1ca5f-a11f-4374-833f-d1a9ef711698", "instruction": "Add Pro Display XDR with nano texture to bag with all the accessories and check total.", "strict_f1": 0.205, "loose_f1": 0.41, "semantic_score": 0.25, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_1", "task_id": "c1b8361d-8be0-4eee-94f5-be115a4da8fc", "annotation_id": "c1b8361d-8be0-4eee-94f5-be115a4da8fc", "instruction": "Book an interior room for over 10 days on a cruise for two people from Brisbane to Papua New Guinea leaving November 2024. Check out with default options and pay in full.", "strict_f1": 0.714, "loose_f1": 0.75, "semantic_score": 0.495, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_1", "task_id": "5535f398-45ef-42e6-9d72-179b0a3d0ad8", "annotation_id": "5535f398-45ef-42e6-9d72-179b0a3d0ad8", "instruction": "Browse used Audi cars made before 2015 and sort by lowest price", "strict_f1": 0.062, "loose_f1": 0.125, "semantic_score": 0.175, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "failed", "agent_success": false, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "68c320f1-cf4e-416d-b375-c6ffe37ed145", "annotation_id": "68c320f1-cf4e-416d-b375-c6ffe37ed145", "instruction": "Browse used Mercedes cars made between 2004 to 2012 and sort by highest price", "strict_f1": 0.136, "loose_f1": 0.182, "semantic_score": 0.243, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_1", "task_id": "f485ddf4-695b-428f-b455-dc13901b1e46", "annotation_id": "f485ddf4-695b-428f-b455-dc13901b1e46", "instruction": "Find a 2022 Tesla Model 3.", "strict_f1": 0.276, "loose_f1": 0.414, "semantic_score": 0.286, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_1", "task_id": "a0b55038-7337-4c6b-9778-e0c43aa5d47f", "annotation_id": "a0b55038-7337-4c6b-9778-e0c43aa5d47f", "instruction": "Find a dentist for teeth whitening in Chicago.", "strict_f1": 0.0, "loose_f1": 0.143, "semantic_score": 0.15, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "failed", "agent_success": false, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "e56ef366-a861-4d96-a1ad-8891d11a0950", "annotation_id": "e56ef366-a861-4d96-a1ad-8891d11a0950", "instruction": "check all restaurant available in Buchanan, MI to book a dine in reservation with Spanish cuisine for 2 guest at 8pm on 18th of March.", "strict_f1": 0.357, "loose_f1": 0.571, "semantic_score": 0.233, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "failed", "agent_success": false, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "7b99ca15-508d-4a80-8138-bcb93b909f5f", "annotation_id": "7b99ca15-508d-4a80-8138-bcb93b909f5f", "instruction": "Add a Swivel vacuum under $150 to my cart.", "strict_f1": 0.133, "loose_f1": 0.4, "semantic_score": 0.1, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "5099fc8a-f0cd-488e-b7ab-5858a0c672e0", "annotation_id": "5099fc8a-f0cd-488e-b7ab-5858a0c672e0", "instruction": "Sign up for a REI Co-Op membership.", "strict_f1": 0.087, "loose_f1": 0.174, "semantic_score": 0.35, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "615b59c3-4bef-4d46-8df6-7224b22bd27c", "annotation_id": "615b59c3-4bef-4d46-8df6-7224b22bd27c", "instruction": "Select a E-Gift card with Congrats design and add to cart with Best Wishes as a message. Send it to James Smith with email abc@abc.com", "strict_f1": 0.0, "loose_f1": 0.32, "semantic_score": 0.018, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "35c34724-ea09-4443-887d-7e63a2abd24d", "annotation_id": "35c34724-ea09-4443-887d-7e63a2abd24d", "instruction": "Show me the options for a roundtrip leaving from las vegas on flexile dates on the interactive map", "strict_f1": 0.25, "loose_f1": 0.375, "semantic_score": 0.188, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "5be61758-82c4-4d00-b074-c25dfbb767f4", "annotation_id": "5be61758-82c4-4d00-b074-c25dfbb767f4", "instruction": "Search Cantonese food near Chicago, IL that are open now and priced $$.", "strict_f1": 0.105, "loose_f1": 0.421, "semantic_score": 0.114, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "failed", "agent_success": false, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "cb24d074-173c-4893-bb00-1acbb27a81a6", "annotation_id": "cb24d074-173c-4893-bb00-1acbb27a81a6", "instruction": "Browse throw pillows and filter by black and decorative sets.", "strict_f1": 0.143, "loose_f1": 0.143, "semantic_score": 0.083, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "e617d6c6-28ee-46cf-965e-ee1448910ce8", "annotation_id": "e617d6c6-28ee-46cf-965e-ee1448910ce8", "instruction": "Search for job in Miami Florida in Human resources.", "strict_f1": 0.125, "loose_f1": 0.125, "semantic_score": 0.117, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "62a611ff-1ef2-4777-8bb6-494b0ea7bbd8", "annotation_id": "62a611ff-1ef2-4777-8bb6-494b0ea7bbd8", "instruction": "Find 2 tickets to see the New York Yankees play on March 30, 2023 and filter to show price including fees.", "strict_f1": 0.19, "loose_f1": 0.571, "semantic_score": 0.2, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "failed", "agent_success": false, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "20b51cb9-9f6f-4ddc-b5d1-edb1cc629c38", "annotation_id": "20b51cb9-9f6f-4ddc-b5d1-edb1cc629c38", "instruction": "Check the current standings for MLS.", "strict_f1": 0.333, "loose_f1": 0.333, "semantic_score": 0.25, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "ae088c6f-2c7a-466c-9974-33aff8761414", "annotation_id": "ae088c6f-2c7a-466c-9974-33aff8761414", "instruction": "Find the current NFL standings for the AFC East division and determine which team is in first place.", "strict_f1": 0.0, "loose_f1": 0.0, "semantic_score": 0.0, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "4056a671-65f7-4b8d-8cb9-46e789130a19", "annotation_id": "4056a671-65f7-4b8d-8cb9-46e789130a19", "instruction": "Search for the lowest millage used Honda Crosstour 2012 to 2013 near 49102 less than $25000.", "strict_f1": 0.167, "loose_f1": 0.333, "semantic_score": 0.178, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_1", "task_id": "3c10b271-ae24-4289-9504-e36ab8565243", "annotation_id": "3c10b271-ae24-4289-9504-e36ab8565243", "instruction": "Find a list of Tours that contain visits to the Louvre rated 5 stars", "strict_f1": 0.133, "loose_f1": 0.533, "semantic_score": 0.225, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "380d2539-703d-4e2c-b3c0-b608b34733db", "annotation_id": "380d2539-703d-4e2c-b3c0-b608b34733db", "instruction": "Find cars that can be picked up at SFO on April 20, 2023 and returned on April 27, 2023.", "strict_f1": 0.154, "loose_f1": 0.308, "semantic_score": 0.286, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_1", "task_id": "3c76cc80-ddcb-48ed-941d-74cffecfc33f", "annotation_id": "3c76cc80-ddcb-48ed-941d-74cffecfc33f", "instruction": "Download the e-receipt with the last name Smith and confirmation number X123456989.", "strict_f1": 0.129, "loose_f1": 0.194, "semantic_score": 0.2, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_1", "task_id": "44678c96-50bf-42b2-b3d9-1ec94c0a0a61", "annotation_id": "44678c96-50bf-42b2-b3d9-1ec94c0a0a61", "instruction": "Book a Standard SUV to pick up on 10001 on april 10, 2023 for one day", "strict_f1": 0.522, "loose_f1": 0.783, "semantic_score": 0.34, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "failed", "agent_success": false, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "a3f650df-02e8-409a-987c-3acddf0ad1f5", "annotation_id": "a3f650df-02e8-409a-987c-3acddf0ad1f5", "instruction": "Find and select the car with the most number of seats to pick up in a top airport location in alabama on april 10", "strict_f1": 0.286, "loose_f1": 0.357, "semantic_score": 0.44, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_1", "task_id": "984775e2-6c07-4bb6-becc-669938487d0b", "annotation_id": "984775e2-6c07-4bb6-becc-669938487d0b", "instruction": "Find the cheapest Disney theme parks with water rides and reserve the deal.", "strict_f1": 0.37, "loose_f1": 0.593, "semantic_score": 0.311, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "87ed4a0a-3a5f-454a-9721-55069db484e2", "annotation_id": "87ed4a0a-3a5f-454a-9721-55069db484e2", "instruction": "Find a new drone priced between 25 to 50 dollar and ships from USA with average customer rating of 4 and upwards and save the search at the end.", "strict_f1": 0.261, "loose_f1": 0.261, "semantic_score": 0.18, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "3b0a3ed2-e48c-4e9d-a954-d9cc7730c9fa", "annotation_id": "3b0a3ed2-e48c-4e9d-a954-d9cc7730c9fa", "instruction": "search the cheapest small car rental deals from Little Ferry, New Jersey, United States on 23th March to same location dropoff on april 4th 2pm.", "strict_f1": 0.175, "loose_f1": 0.316, "semantic_score": 0.25, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_1", "task_id": "4e065fe4-3bb7-4e5a-afa0-eaef36770772", "annotation_id": "4e065fe4-3bb7-4e5a-afa0-eaef36770772", "instruction": "Find a resort near Patong beach in Phuket, Thailand nearest to the airport from May 9 to May 12 for two with a pay-later option.", "strict_f1": 0.4, "loose_f1": 0.629, "semantic_score": 0.318, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_1", "task_id": "513f4cef-feaa-45be-818d-75c876830af0", "annotation_id": "513f4cef-feaa-45be-818d-75c876830af0", "instruction": "Browse the clearance section and filter for women's dresses in size small.", "strict_f1": 0.286, "loose_f1": 0.457, "semantic_score": 0.312, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "a3b7ff6e-246b-48f0-901e-64b8d8223ad3", "annotation_id": "a3b7ff6e-246b-48f0-901e-64b8d8223ad3", "instruction": "Find information about what I should do when I lose an item on a bus.", "strict_f1": 0.182, "loose_f1": 0.364, "semantic_score": 0.35, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "c59043d8-eef8-42f7-b883-6bdf51476883", "annotation_id": "c59043d8-eef8-42f7-b883-6bdf51476883", "instruction": "Book a place on the next available mountain biking class running within 100 miles of New York City for Joe Bloggs. The email address is joe@joebloggs.com and the phone number is 1111111111. Emergency contact's name is June and phone number is 2222222222. The gear sizing is 6 ft height\n.", "strict_f1": 0.163, "loose_f1": 0.327, "semantic_score": 0.136, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_1", "task_id": "38fb3573-356d-4750-a2b8-1df305d88431", "annotation_id": "38fb3573-356d-4750-a2b8-1df305d88431", "instruction": "Find thrill rides in Six Flags Great America, Chicago, IL.", "strict_f1": 0.545, "loose_f1": 0.545, "semantic_score": 0.34, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "4d42f600-80da-41ae-bfdc-cd6ba9aedec0", "annotation_id": "4d42f600-80da-41ae-bfdc-cd6ba9aedec0", "instruction": "Create a new Collection Car Wash and Find the most reviewed car wash in San Francisco and save it in the new ccllection", "strict_f1": 0.098, "loose_f1": 0.328, "semantic_score": 0.175, "judge_verdict": "fail", "judge_score": 0.5, "agent_status": "failed", "agent_success": false, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "c88962c1-5c55-4e48-949d-920d4bd525fc", "annotation_id": "c88962c1-5c55-4e48-949d-920d4bd525fc", "instruction": "Find devices that support Live TV on demand streaming.", "strict_f1": 0.0, "loose_f1": 0.0, "semantic_score": 0.0, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_1", "task_id": "cfcc4105-6ff5-4119-b069-3c2673588df5", "annotation_id": "cfcc4105-6ff5-4119-b069-3c2673588df5", "instruction": "Add the movie \"The Shawshank Redemption\" to your wishlist.", "strict_f1": 0.2, "loose_f1": 0.6, "semantic_score": 0.125, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "0de53aa5-cb8f-4789-b621-14262a4d5aae", "annotation_id": "0de53aa5-cb8f-4789-b621-14262a4d5aae", "instruction": "On April 21, look up a flight from Mumbai to Stockholm.", "strict_f1": 0.286, "loose_f1": 0.429, "semantic_score": 0.25, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "2f14b18e-9ddc-4bc8-b8eb-21658a7e7192", "annotation_id": "2f14b18e-9ddc-4bc8-b8eb-21658a7e7192", "instruction": "Check in with confirmation number 10987654 for my flight with last name Lewis.", "strict_f1": 0.182, "loose_f1": 0.364, "semantic_score": 0.125, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "failed", "agent_success": false, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "50692ff0-03cc-473f-8db8-2c9af48b30a5", "annotation_id": "50692ff0-03cc-473f-8db8-2c9af48b30a5", "instruction": "Show the highest-rated boxing gym with a personal training facility and Monthly membership options in Dallas, Texas", "strict_f1": 0.163, "loose_f1": 0.327, "semantic_score": 0.244, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_1", "task_id": "7acc430a-9d18-4b33-82fc-449748f80660", "annotation_id": "7acc430a-9d18-4b33-82fc-449748f80660", "instruction": "Signup for virtual healthcare visit.", "strict_f1": 0.085, "loose_f1": 0.254, "semantic_score": 0.1, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_1", "task_id": "e4bb7e5b-6ca2-4f43-96ba-53970d104df2", "annotation_id": "e4bb7e5b-6ca2-4f43-96ba-53970d104df2", "instruction": "Find the schedule for upcoming MLB games for the New York Yankees.", "strict_f1": 0.0, "loose_f1": 0.0, "semantic_score": 0.2, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "4ffca7c8-576a-4e54-a1b5-558b3f829766", "annotation_id": "4ffca7c8-576a-4e54-a1b5-558b3f829766", "instruction": "Find solutions for Airport and fill the contact form with message to \"Send Brochure\". Contact information John Smith. Email: abc@abc.com. Phone #: 88889999", "strict_f1": 0.0, "loose_f1": 0.0, "semantic_score": 0.0, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_1", "task_id": "8a39a66d-cd0c-4741-b616-d2d7be631abf", "annotation_id": "8a39a66d-cd0c-4741-b616-d2d7be631abf", "instruction": "Compare available plans for the AeroAPI.", "strict_f1": 0.0, "loose_f1": 0.182, "semantic_score": 0.133, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_1", "task_id": "cb1fecfb-4eb5-4f9c-8067-a8c8d9977d7f", "annotation_id": "cb1fecfb-4eb5-4f9c-8067-a8c8d9977d7f", "instruction": "Find flights going from Indira Gandhi to Los Cabos.", "strict_f1": 0.25, "loose_f1": 0.35, "semantic_score": 0.386, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "00199892-39ab-41ad-a4fb-4c6bf8b94366", "annotation_id": "00199892-39ab-41ad-a4fb-4c6bf8b94366", "instruction": "Find grocery deals at Costco", "strict_f1": 0.444, "loose_f1": 0.444, "semantic_score": 0.4, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "2cc71f04-851c-4a75-8728-a80783984a32", "annotation_id": "2cc71f04-851c-4a75-8728-a80783984a32", "instruction": "Play spiking track which is at number 1.", "strict_f1": 0.222, "loose_f1": 0.296, "semantic_score": 0.425, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "34e047fe-2011-4b6d-820c-4a3d49475678", "annotation_id": "34e047fe-2011-4b6d-820c-4a3d49475678", "instruction": "Share a Blackpink event on Twitter.", "strict_f1": 0.095, "loose_f1": 0.381, "semantic_score": 0.229, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "failed", "agent_success": false, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "640e0425-bceb-45ff-ba4d-dbc5b62e31d5", "annotation_id": "640e0425-bceb-45ff-ba4d-dbc5b62e31d5", "instruction": "Find the \"Rock And Roll Over\" reviews", "strict_f1": 0.5, "loose_f1": 0.667, "semantic_score": 0.3, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "97219f72-ed3f-41f9-8b15-f8e1ce10b925", "annotation_id": "97219f72-ed3f-41f9-8b15-f8e1ce10b925", "instruction": "Browse hot deals near zip code 10019.", "strict_f1": 0.0, "loose_f1": 0.273, "semantic_score": 0.1, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "3d731c1d-56f4-4ab4-84ed-5f6b55b4c647", "annotation_id": "3d731c1d-56f4-4ab4-84ed-5f6b55b4c647", "instruction": "Browse Myrtle Beach sites with a hot tub for three nights starting on June 20th.", "strict_f1": 0.333, "loose_f1": 0.417, "semantic_score": 0.244, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "6de5b415-0670-459a-a921-4a1b51b01110", "annotation_id": "6de5b415-0670-459a-a921-4a1b51b01110", "instruction": "Find a campground in Orlando for 2 adults to check-in in mar 29 and check out in mar 30", "strict_f1": 0.345, "loose_f1": 0.483, "semantic_score": 0.278, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "eb03aa9f-bb6d-42e6-8fff-9897176d6db9", "annotation_id": "eb03aa9f-bb6d-42e6-8fff-9897176d6db9", "instruction": "Track flight with number D145 of Qatar Airways on April 18t\nh.", "strict_f1": 0.053, "loose_f1": 0.158, "semantic_score": 0.1, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_1", "task_id": "295380ed-c6f4-406a-bd53-c7d216ad9bbf", "annotation_id": "295380ed-c6f4-406a-bd53-c7d216ad9bbf", "instruction": "Check all available one way flights for a single passenger from Manhattan to Philadelphia on May 23rd in first class.", "strict_f1": 0.341, "loose_f1": 0.634, "semantic_score": 0.313, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "610fe8c0-6598-4bd5-9864-8e41378db276", "annotation_id": "610fe8c0-6598-4bd5-9864-8e41378db276", "instruction": "Find Toyota Corolla from the year 2018 to 2023 in red color.", "strict_f1": 0.435, "loose_f1": 0.696, "semantic_score": 0.322, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "ba40d2fb-c57f-48a8-b7a4-3fdac15f6bd4", "annotation_id": "ba40d2fb-c57f-48a8-b7a4-3fdac15f6bd4", "instruction": "Show me the options for vans to rent in Portugal Cove in USD", "strict_f1": 0.129, "loose_f1": 0.194, "semantic_score": 0.357, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_1", "task_id": "c4848d71-d72f-41d3-b871-d9dd22e291db", "annotation_id": "c4848d71-d72f-41d3-b871-d9dd22e291db", "instruction": "Find one way flights for one person from Sacramento to Houston IAH on June 2, 2023 that can be booked using miles.", "strict_f1": 0.195, "loose_f1": 0.341, "semantic_score": 0.256, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_1", "task_id": "3fdbadc8-9a87-4435-b8c9-7835a381f774", "annotation_id": "3fdbadc8-9a87-4435-b8c9-7835a381f774", "instruction": "Find the top-rated bar and grill restaurant in Miami, Florida that has live music and is good for groups.", "strict_f1": 0.476, "loose_f1": 0.476, "semantic_score": 0.31, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "0343d8d9-87ca-4bfa-b065-547b3aa9572d", "annotation_id": "0343d8d9-87ca-4bfa-b065-547b3aa9572d", "instruction": "What are the upcoming soccer events on ESPN2?", "strict_f1": 0.0, "loose_f1": 0.171, "semantic_score": 0.04, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "0435ac39-8a76-42db-b504-31ec210f6f66", "annotation_id": "0435ac39-8a76-42db-b504-31ec210f6f66", "instruction": "Find the team schedule of the Brooklyn Nets", "strict_f1": 0.444, "loose_f1": 0.667, "semantic_score": 0.4, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "23cbeb29-1d7a-4a90-bb28-41e8a08c840a", "annotation_id": "23cbeb29-1d7a-4a90-bb28-41e8a08c840a", "instruction": "Find an Indian classical track longer than 30 minutes which has been added in the past month and add to playlist named Meditation Music, if playlist not found create one.", "strict_f1": 0.256, "loose_f1": 0.462, "semantic_score": 0.282, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_1", "task_id": "7216606d-2de6-4f80-a7b0-b860957bf07a", "annotation_id": "7216606d-2de6-4f80-a7b0-b860957bf07a", "instruction": "Find travel requirements from Amsterdam to Cairo.", "strict_f1": 0.044, "loose_f1": 0.222, "semantic_score": 0.129, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_1", "task_id": "0b342c57-78f4-4b70-a819-0c038a9f7839", "annotation_id": "0b342c57-78f4-4b70-a819-0c038a9f7839", "instruction": "Search for parking near restaurants in Boston and find a parking spot near Hard Rock Cafe for an oversized car from 9 to 11 pm on May 11, and ensure that the pre-purchase and open gate with phone option is available.", "strict_f1": 0.286, "loose_f1": 0.5, "semantic_score": 0.327, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_1", "task_id": "1c0acb0e-a343-447d-9981-445ef30625af", "annotation_id": "1c0acb0e-a343-447d-9981-445ef30625af", "instruction": "Look for outdoor parking in Arlington, Virgina near 5601 Chapin Ave, Alexandria, where I can pay with my phone and charge my car on April 27 from 10 am to April 28, 10 am.", "strict_f1": 0.475, "loose_f1": 0.678, "semantic_score": 0.362, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "292749f0-e8c5-4776-b1dd-11dbe16633a2", "annotation_id": "292749f0-e8c5-4776-b1dd-11dbe16633a2", "instruction": "Find the cheapest parking spot in 10001.", "strict_f1": 0.286, "loose_f1": 0.476, "semantic_score": 0.317, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "b3baca22-7184-4bd3-a9c7-647ff0153ae8", "annotation_id": "b3baca22-7184-4bd3-a9c7-647ff0153ae8", "instruction": "Find 5 star rated saltwater rods.", "strict_f1": 0.0, "loose_f1": 0.308, "semantic_score": 0.1, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "c200352a-28d2-436c-8707-aa695e04a397", "annotation_id": "c200352a-28d2-436c-8707-aa695e04a397", "instruction": "What are the food festivals happening in Colorado This weekend?", "strict_f1": 0.182, "loose_f1": 0.364, "semantic_score": 0.1, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "d075a906-d264-4e45-8158-ec6b539e8f8b", "annotation_id": "d075a906-d264-4e45-8158-ec6b539e8f8b", "instruction": "Find if any free fashion conventions are happening in San Fransisco and if found save the top three results.", "strict_f1": 0.37, "loose_f1": 0.519, "semantic_score": 0.227, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "834dda72-269e-404c-bef8-f446de0965bf", "annotation_id": "834dda72-269e-404c-bef8-f446de0965bf", "instruction": "Browse through the Las Vegas city guide and find message services nearest to Henderson, the service provider should have a BBB rating of A+.", "strict_f1": 0.24, "loose_f1": 0.4, "semantic_score": 0.211, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "fdf322bd-95b6-44e3-8962-dc041d614419", "annotation_id": "fdf322bd-95b6-44e3-8962-dc041d614419", "instruction": "Find a vacation package including the cheapest flight, hotel, and car with basic economy fares between New York and Las Vegas from May 16 to May 24, and book 2 rooms for 4 adults in 4 and 5-star hotels with the casino.", "strict_f1": 0.489, "loose_f1": 0.556, "semantic_score": 0.422, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_1", "task_id": "5b888855-b921-4c61-8f79-73902ee0eafa", "annotation_id": "5b888855-b921-4c61-8f79-73902ee0eafa", "instruction": "Check balance of gift card no. 1234567 with pin 0001", "strict_f1": 0.074, "loose_f1": 0.37, "semantic_score": 0.138, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_1", "task_id": "6679ce8e-6061-4880-954a-12c56158543e", "annotation_id": "6679ce8e-6061-4880-954a-12c56158543e", "instruction": "Find the cheapest single room in Dubai Marriott or Sheraton for two adults and a child under one between May 7 and May 10", "strict_f1": 0.418, "loose_f1": 0.478, "semantic_score": 0.435, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_1", "task_id": "5eacf1eb-971f-4907-9e1c-cd8a6120b1e4", "annotation_id": "5eacf1eb-971f-4907-9e1c-cd8a6120b1e4", "instruction": "Find direction to a store in Ohio, Cincinnati which does't have Sephora in it.", "strict_f1": 0.15, "loose_f1": 0.3, "semantic_score": 0.317, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "failed", "agent_success": false, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "69ed8097-741e-4243-a26c-034a2ea4737b", "annotation_id": "69ed8097-741e-4243-a26c-034a2ea4737b", "instruction": "Browse coffee makers that are rated 5 stars.", "strict_f1": 0.0, "loose_f1": 0.0, "semantic_score": 0.0, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "628e5ce1-d601-4faa-9d32-a58af9100993", "annotation_id": "628e5ce1-d601-4faa-9d32-a58af9100993", "instruction": "Browse medium cars for rent for a week in Las Vegas starting on Jun 5.", "strict_f1": 0.323, "loose_f1": 0.452, "semantic_score": 0.35, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "5c29c805-388d-471a-80e9-ca0fbaf820be", "annotation_id": "5c29c805-388d-471a-80e9-ca0fbaf820be", "instruction": "Find Hotel in Chicago with lowest price for 2 adults checking in on Apr 14 for 3 days.", "strict_f1": 0.516, "loose_f1": 0.71, "semantic_score": 0.333, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_1", "task_id": "2fc80069-bdba-4064-82d0-1123a85f2651", "annotation_id": "2fc80069-bdba-4064-82d0-1123a85f2651", "instruction": "Compare Apple watches and learn more about the ultra version.", "strict_f1": 0.091, "loose_f1": 0.273, "semantic_score": 0.167, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_1", "task_id": "74992300-b61b-4679-b0ba-3b1ab7454358", "annotation_id": "74992300-b61b-4679-b0ba-3b1ab7454358", "instruction": "Check the status of your iPhone repair.", "strict_f1": 0.133, "loose_f1": 0.2, "semantic_score": 0.333, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_1", "task_id": "f2b311ad-246c-4464-beb6-9178cfffb50b", "annotation_id": "f2b311ad-246c-4464-beb6-9178cfffb50b", "instruction": "Find cheapest flight from New York to Toronto, Canada on 29 April.", "strict_f1": 0.364, "loose_f1": 0.591, "semantic_score": 0.28, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_1", "task_id": "9c14e1b7-2015-434a-a8a1-b1e782d0a2e2", "annotation_id": "9c14e1b7-2015-434a-a8a1-b1e782d0a2e2", "instruction": "Show me the scores for the 2019 super bowl", "strict_f1": 0.0, "loose_f1": 0.333, "semantic_score": 0.1, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "490dc61c-873d-47b6-9050-369cd18e1253", "annotation_id": "490dc61c-873d-47b6-9050-369cd18e1253", "instruction": "Find a flight from Dresden to anywhere under $100", "strict_f1": 0.303, "loose_f1": 0.485, "semantic_score": 0.278, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "323f1c41-b4c7-41b6-821e-2fed2fe09922", "annotation_id": "323f1c41-b4c7-41b6-821e-2fed2fe09922", "instruction": "Find black shorts in XXL or 3XL sizes for sale under men department, add the top product to the cart.", "strict_f1": 0.364, "loose_f1": 0.364, "semantic_score": 0.343, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "e6f6e6c8-f1e6-42bb-a3af-696ed8de571b", "annotation_id": "e6f6e6c8-f1e6-42bb-a3af-696ed8de571b", "instruction": "Open the baggage fee calculator.", "strict_f1": 0.571, "loose_f1": 0.571, "semantic_score": 0.4, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "b50ce7bf-50ee-4228-9510-efd60b867ab1", "annotation_id": "b50ce7bf-50ee-4228-9510-efd60b867ab1", "instruction": "Find a person by address Nice st - 1234, Good, FL.", "strict_f1": 0.077, "loose_f1": 0.231, "semantic_score": 0.167, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "failed", "agent_success": false, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "c5a20fb8-8351-4c74-986d-4792612df340", "annotation_id": "c5a20fb8-8351-4c74-986d-4792612df340", "instruction": "Find medicaid covered dentists sorted by distance.", "strict_f1": 0.182, "loose_f1": 0.424, "semantic_score": 0.243, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_1", "task_id": "cc5908a9-263b-4dd2-96ac-405fda7240e9", "annotation_id": "cc5908a9-263b-4dd2-96ac-405fda7240e9", "instruction": "Show list of popular businesses in Cleveland.", "strict_f1": 0.182, "loose_f1": 0.545, "semantic_score": 0.3, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_1", "task_id": "2667b421-cad6-44fe-8187-4309632f59b0", "annotation_id": "2667b421-cad6-44fe-8187-4309632f59b0", "instruction": "upvote a comment on the most relevant kiss chords & tabs", "strict_f1": 0.111, "loose_f1": 0.222, "semantic_score": 0.2, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_1", "task_id": "82094208-02a3-46de-a55f-4c48924cb16c", "annotation_id": "82094208-02a3-46de-a55f-4c48924cb16c", "instruction": "Find the weight of baggage allowance for economy class on a flight from Japan to Iran.", "strict_f1": 0.154, "loose_f1": 0.231, "semantic_score": 0.4, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_2", "task_id": "13b786ab-36d0-481c-b018-68906fee5de5", "annotation_id": "13b786ab-36d0-481c-b018-68906fee5de5", "instruction": "Book the lowest-priced train or bus from Alexandria, Washington to Penn Station, New York, on May 3 for a single traveler.", "strict_f1": 0.235, "loose_f1": 0.471, "semantic_score": 0.214, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_2", "task_id": "9a6622b0-e58e-40e2-ace6-5e3344afac44", "annotation_id": "9a6622b0-e58e-40e2-ace6-5e3344afac44", "instruction": "Get the frozen vegan cheese pizza between 5 to 10 usd.", "strict_f1": 0.16, "loose_f1": 0.16, "semantic_score": 0.2, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_2", "task_id": "d494ed27-0eee-48e0-9778-ff34ea7b58a6", "annotation_id": "d494ed27-0eee-48e0-9778-ff34ea7b58a6", "instruction": "Start the process of buying a gift card with a beach theme.", "strict_f1": 0.08, "loose_f1": 0.32, "semantic_score": 0.225, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_2", "task_id": "b4aeab35-a6d9-4524-a1df-337432ff2800", "annotation_id": "b4aeab35-a6d9-4524-a1df-337432ff2800", "instruction": "Search for the playlists \"Pop Workout mix\" and filtered by tag #Dance & edm.", "strict_f1": 0.2, "loose_f1": 0.2, "semantic_score": 0.225, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_2", "task_id": "81b23d7b-e0e9-47f5-92f1-a4fc7ac077fe", "annotation_id": "81b23d7b-e0e9-47f5-92f1-a4fc7ac077fe", "instruction": "Book two rooms for four adults in a 5-star hotel with pool, minibar, bathtub, and spa facilities, that are nearest to the city center in Dubai from June 13 to 16.", "strict_f1": 0.275, "loose_f1": 0.431, "semantic_score": 0.181, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_2", "task_id": "4ff37796-bb8a-4bcf-9202-ecf1f1811f7f", "annotation_id": "4ff37796-bb8a-4bcf-9202-ecf1f1811f7f", "instruction": "Show me schedules for the Blue water route on Wednesdays", "strict_f1": 0.19, "loose_f1": 0.286, "semantic_score": 0.143, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_2", "task_id": "c8d65dd7-62aa-4555-880d-9dbeed1b9143", "annotation_id": "c8d65dd7-62aa-4555-880d-9dbeed1b9143", "instruction": "Book the lowest-priced and quickest flight for 5 adults and 1 child on May 20 from Mumbai to any airport near Washington.", "strict_f1": 0.316, "loose_f1": 0.474, "semantic_score": 0.308, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_2", "task_id": "fe471e16-d33f-4f87-accf-dcf7e91617cb", "annotation_id": "fe471e16-d33f-4f87-accf-dcf7e91617cb", "instruction": "Find a flight for three adults from New York to New Orleans on April 14, return on April 16.", "strict_f1": 0.163, "loose_f1": 0.286, "semantic_score": 0.24, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_2", "task_id": "31dc6f28-9dd4-4ec7-ab4e-cd4e914b599e", "annotation_id": "31dc6f28-9dd4-4ec7-ab4e-cd4e914b599e", "instruction": "Look up round trip flights for 1 person to Japan at the Narita airport departing on 6/16 and returning on 6/23.", "strict_f1": 0.242, "loose_f1": 0.303, "semantic_score": 0.333, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_2", "task_id": "f968b06b-b3cd-4285-8867-224773fc39eb", "annotation_id": "f968b06b-b3cd-4285-8867-224773fc39eb", "instruction": "Find me the deals available for the Great escape park", "strict_f1": 0.2, "loose_f1": 0.5, "semantic_score": 0.28, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_2", "task_id": "9ceab2a3-7919-4f15-871a-21638fd93b24", "annotation_id": "9ceab2a3-7919-4f15-871a-21638fd93b24", "instruction": "Find a car to rent in Detroit, MI for pick up on April 2 and drop-off on April 3, both at 11AM, only show cars from Enterprise and Avis", "strict_f1": 0.195, "loose_f1": 0.39, "semantic_score": 0.182, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_2", "task_id": "c86ddda7-8d34-4029-ae11-d8229d5e1d9a", "annotation_id": "c86ddda7-8d34-4029-ae11-d8229d5e1d9a", "instruction": "Rent a small air-conditioned car with collision damage coverage from the Liverpool Airport car rental center for pick-up on May 5, 11:30 am, and drop-off on May 7, 11 am.", "strict_f1": 0.275, "loose_f1": 0.392, "semantic_score": 0.269, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_2", "task_id": "ca83e6a2-8217-4e49-a102-2ed62d125a89", "annotation_id": "ca83e6a2-8217-4e49-a102-2ed62d125a89", "instruction": "Get cheapest medium car with like for like fuel policy from the Boston airport shuttle at 4 pm on April 16th and drop it off the next day at 3:30 pm,if booking is not available on that day, move the booking a day further and try to book again\n.", "strict_f1": 0.531, "loose_f1": 0.653, "semantic_score": 0.372, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_2", "task_id": "e6789e88-535e-4d79-83e9-d92e625b77d6", "annotation_id": "e6789e88-535e-4d79-83e9-d92e625b77d6", "instruction": "Book the lowest priced auto train ticket for Mayank Raheja who is deep an need seat in lower level, is carrying a bike but has no service animal,needs assistance from Lorton to Sanford on June 10 between 12 to 6 pm.", "strict_f1": 0.259, "loose_f1": 0.481, "semantic_score": 0.176, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_2", "task_id": "6b215dbb-a2c4-451c-9c34-9bafe6660c14", "annotation_id": "6b215dbb-a2c4-451c-9c34-9bafe6660c14", "instruction": "Find a manufacturer-certified Chevrolet truck for sale within 10 miles of zip 42701, if found schedule a test drive for James Smith on April 17 at 4 pm.", "strict_f1": 0.286, "loose_f1": 0.49, "semantic_score": 0.217, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "failed", "agent_success": false, "agent_error": null}
|
|
{"dataset": "test_task_2", "task_id": "88475d27-7c58-4abf-86fc-bf459e9606e1", "annotation_id": "88475d27-7c58-4abf-86fc-bf459e9606e1", "instruction": "Find a vacation home in Orlando with a private pool, at least three bedrooms, and availability for the first week of December.", "strict_f1": 0.429, "loose_f1": 0.486, "semantic_score": 0.417, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_2", "task_id": "e3f2ad4a-65bd-4393-94ac-3140c91d4a07", "annotation_id": "e3f2ad4a-65bd-4393-94ac-3140c91d4a07", "instruction": "Search for groups about rap and join the second one.", "strict_f1": 0.211, "loose_f1": 0.421, "semantic_score": 0.2, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "failed", "agent_success": false, "agent_error": null}
|
|
{"dataset": "test_task_2", "task_id": "eb1e79b0-f94d-4568-bd32-072e84d82a7e", "annotation_id": "eb1e79b0-f94d-4568-bd32-072e84d82a7e", "instruction": "Show me a list of electronic music dvds in very good condition.", "strict_f1": 0.118, "loose_f1": 0.471, "semantic_score": 0.225, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_2", "task_id": "f1986a5b-c074-40fe-9c89-d083389f0cc4", "annotation_id": "f1986a5b-c074-40fe-9c89-d083389f0cc4", "instruction": "Find bluetooth vertical mouse with most reviews and add two to my shopping cart..", "strict_f1": 0.1, "loose_f1": 0.25, "semantic_score": 0.186, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_2", "task_id": "b11eded9-34ab-4340-91a9-948de79057b9", "annotation_id": "b11eded9-34ab-4340-91a9-948de79057b9", "instruction": "Find games that are availble and free with ING plus rewards.", "strict_f1": 0.071, "loose_f1": 0.286, "semantic_score": 0.083, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_2", "task_id": "a5c7b71c-56c5-432f-a98b-bcd523129680", "annotation_id": "a5c7b71c-56c5-432f-a98b-bcd523129680", "instruction": "Schedule an appointment to apply for transportation access pass in the Charlie Card store on May 8, 10:15 am, fill in my details (James Smith, james.smith@gmail.com), and book.", "strict_f1": 0.258, "loose_f1": 0.516, "semantic_score": 0.26, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "failed", "agent_success": false, "agent_error": null}
|
|
{"dataset": "test_task_2", "task_id": "a4a06039-add9-4726-8bfc-a4831b467576", "annotation_id": "a4a06039-add9-4726-8bfc-a4831b467576", "instruction": "Find the weekly ad for the store closest to zip code 82718", "strict_f1": 0.231, "loose_f1": 0.385, "semantic_score": 0.25, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_2", "task_id": "a690af04-9013-4ca0-b8fb-47bc4dd722ac", "annotation_id": "a690af04-9013-4ca0-b8fb-47bc4dd722ac", "instruction": "Create a new list called Bathroom Remodeling", "strict_f1": 0.143, "loose_f1": 0.238, "semantic_score": 0.25, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_2", "task_id": "757a129b-68b2-4799-80f1-db1add10f92c", "annotation_id": "757a129b-68b2-4799-80f1-db1add10f92c", "instruction": "Find Playstation 5 digital edition", "strict_f1": 0.286, "loose_f1": 0.286, "semantic_score": 0.167, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_2", "task_id": "3e472e41-99d6-4215-8880-9c3a48d3c33a", "annotation_id": "3e472e41-99d6-4215-8880-9c3a48d3c33a", "instruction": "Book a dine in reserveration in New Delhi for 5 Guests on 22nd March, 7:00 pm, terrace seating arrangement preferred.", "strict_f1": 0.082, "loose_f1": 0.329, "semantic_score": 0.121, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_2", "task_id": "a55e347d-b445-4a45-ae02-bc4c31f12660", "annotation_id": "a55e347d-b445-4a45-ae02-bc4c31f12660", "instruction": "What are the cheapest hotel options for a luxury trip in Chicago during summer", "strict_f1": 0.643, "loose_f1": 0.786, "semantic_score": 0.45, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_2", "task_id": "c0a27903-5c43-48ce-b385-4379c6a25561", "annotation_id": "c0a27903-5c43-48ce-b385-4379c6a25561", "instruction": "Find a apartment in New Delhi for 5 days starting April 11.", "strict_f1": 0.103, "loose_f1": 0.172, "semantic_score": 0.317, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_2", "task_id": "c614be5b-a32a-4d67-9fb9-d0702257e83a", "annotation_id": "c614be5b-a32a-4d67-9fb9-d0702257e83a", "instruction": "Book a flight for 2 adults from Belo Horizonte to Buenos Aires on April 24", "strict_f1": 0.107, "loose_f1": 0.214, "semantic_score": 0.188, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_2", "task_id": "e3143d13-bc33-4b5a-9fd0-6a493cf5997b", "annotation_id": "e3143d13-bc33-4b5a-9fd0-6a493cf5997b", "instruction": "Add a set of wireless headphones to your cart with a budget of $100 or less, that has an active noise-cancelling feature.", "strict_f1": 0.429, "loose_f1": 0.571, "semantic_score": 0.214, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_2", "task_id": "e1b0fb9c-fa7a-4fa1-8107-8e4b5320f6f2", "annotation_id": "e1b0fb9c-fa7a-4fa1-8107-8e4b5320f6f2", "instruction": "Find flights from Seattle to New York on June 5th and only show those that can be purchased with miles.", "strict_f1": 0.286, "loose_f1": 0.571, "semantic_score": 0.257, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_2", "task_id": "bf866e4d-2307-4d30-aebe-8265b5654498", "annotation_id": "bf866e4d-2307-4d30-aebe-8265b5654498", "instruction": "Book a room from Apr 30 to Jun 5 for two adults in Tallahassee.", "strict_f1": 0.492, "loose_f1": 0.525, "semantic_score": 0.461, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_2", "task_id": "333e1892-c879-4ac9-a6a8-fbceda53ae08", "annotation_id": "333e1892-c879-4ac9-a6a8-fbceda53ae08", "instruction": "Find a hotel in Dublin, Ireland for May 2nd to May 8th for 2 adults and 1 room.", "strict_f1": 0.522, "loose_f1": 0.609, "semantic_score": 0.273, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_2", "task_id": "3594153c-7dbb-40bc-bad9-8c730840ab1a", "annotation_id": "3594153c-7dbb-40bc-bad9-8c730840ab1a", "instruction": "Show me tickets for food and drink attractions in Ireland from April 11 to April 12", "strict_f1": 0.5, "loose_f1": 0.5, "semantic_score": 0.25, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_2", "task_id": "400c291f-6a0c-46fb-874e-d5c174fdedfc", "annotation_id": "400c291f-6a0c-46fb-874e-d5c174fdedfc", "instruction": "Search for a one way flight from Dublin to Malta on April 22, 2023 for 2 adults.", "strict_f1": 0.304, "loose_f1": 0.435, "semantic_score": 0.35, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_2", "task_id": "66d49483-19c8-4d13-af4c-a6639e8b3b81", "annotation_id": "66d49483-19c8-4d13-af4c-a6639e8b3b81", "instruction": "Look for the lowest priced new aluminum fish boat within 100 miles of Iowa state zip 52554 and call for price.", "strict_f1": 0.471, "loose_f1": 0.588, "semantic_score": 0.427, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_2", "task_id": "9334211a-8ae8-43fc-9581-a0427950e82f", "annotation_id": "9334211a-8ae8-43fc-9581-a0427950e82f", "instruction": "show the Life Vests priced between $100 and $200.", "strict_f1": 0.4, "loose_f1": 0.4, "semantic_score": 0.35, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_2", "task_id": "afcc3ff5-c043-4787-8608-cb21dab9dc42", "annotation_id": "afcc3ff5-c043-4787-8608-cb21dab9dc42", "instruction": "Locate a store in New York 10007 and view store timings.", "strict_f1": 0.061, "loose_f1": 0.242, "semantic_score": 0.275, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_2", "task_id": "29c19896-2459-46b3-a488-77e5368c9651", "annotation_id": "29c19896-2459-46b3-a488-77e5368c9651", "instruction": "Find the page with instructions on how to return orders online.", "strict_f1": 0.222, "loose_f1": 0.444, "semantic_score": 0.35, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_2", "task_id": "3592a015-b7d3-4651-a30c-5cd1c0e96ea2", "annotation_id": "3592a015-b7d3-4651-a30c-5cd1c0e96ea2", "instruction": "Search for the best-rated multi-city 10+ night European cruise deal under 999 dollars and book.", "strict_f1": 0.275, "loose_f1": 0.314, "semantic_score": 0.463, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_2", "task_id": "0f9382d3-d73b-48db-aff4-55482fbb16c8", "annotation_id": "0f9382d3-d73b-48db-aff4-55482fbb16c8", "instruction": "Show schedule for East Boston Ferry outbound Lewis Wharf and Long Wharf (North) stop.", "strict_f1": 0.333, "loose_f1": 0.5, "semantic_score": 0.233, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_2", "task_id": "4ce1633f-0f30-4bfc-a06c-2c7da8a920dc", "annotation_id": "4ce1633f-0f30-4bfc-a06c-2c7da8a920dc", "instruction": "Find lowest price SUV for a partner AARP SUV deal discount for pickup Roanoke Regional Airport, ROA Sat, Apr 05, 12:00 PM and drop off on April 10th.", "strict_f1": 0.182, "loose_f1": 0.436, "semantic_score": 0.277, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_2", "task_id": "4d0de000-9bff-43c0-8c4f-d67efa003e2e", "annotation_id": "4d0de000-9bff-43c0-8c4f-d67efa003e2e", "instruction": "Find the cheapest free SUV upgrade deal with a pay-later option for an Australian for a pick up on June 2, 11 am, and drop off on June 6 at 4:30 pm from Ontario International Airport.", "strict_f1": 0.385, "loose_f1": 0.5, "semantic_score": 0.306, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_2", "task_id": "9ba4e237-43dc-4c22-b334-0ebea2afb326", "annotation_id": "9ba4e237-43dc-4c22-b334-0ebea2afb326", "instruction": "Browse Marriott Bonvoy credit cards.", "strict_f1": 0.1, "loose_f1": 0.2, "semantic_score": 0.35, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_2", "task_id": "63c65d96-371e-4c2c-87fc-ec96cbc02002", "annotation_id": "63c65d96-371e-4c2c-87fc-ec96cbc02002", "instruction": "Show me health-related online classes.", "strict_f1": 0.0, "loose_f1": 0.0, "semantic_score": 0.2, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_2", "task_id": "620185b2-3fe5-4692-b3e7-39e3beacd5c7", "annotation_id": "620185b2-3fe5-4692-b3e7-39e3beacd5c7", "instruction": "Find a walkthrough guide for Assassin's Creed Valhalla.", "strict_f1": 0.154, "loose_f1": 0.462, "semantic_score": 0.14, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_2", "task_id": "c37a7e97-8378-4382-b630-21d353888307", "annotation_id": "c37a7e97-8378-4382-b630-21d353888307", "instruction": "Browse men's winter coats in size large that is on clearance.", "strict_f1": 0.211, "loose_f1": 0.526, "semantic_score": 0.167, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_2", "task_id": "9b125b87-61b4-4457-b7c2-4b51056fa1a4", "annotation_id": "9b125b87-61b4-4457-b7c2-4b51056fa1a4", "instruction": "Tell me information about what identification I need to bring on my trip.", "strict_f1": 0.444, "loose_f1": 0.444, "semantic_score": 0.35, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_2", "task_id": "e5fdff20-d4a2-4c3f-adbb-da97b470db65", "annotation_id": "e5fdff20-d4a2-4c3f-adbb-da97b470db65", "instruction": "Show me the best city tours.", "strict_f1": 0.0, "loose_f1": 0.444, "semantic_score": 0.1, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_2", "task_id": "353ab375-d3e0-4842-9ae9-b7502b95c53a", "annotation_id": "353ab375-d3e0-4842-9ae9-b7502b95c53a", "instruction": "Find the last game of the season for the Toronto Raptors.", "strict_f1": 0.0, "loose_f1": 0.0, "semantic_score": 0.1, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_2", "task_id": "150146b2-ae80-4760-b2a9-1c7475ce5de2", "annotation_id": "150146b2-ae80-4760-b2a9-1c7475ce5de2", "instruction": "Show me products from Calvin Klein brand menswear list.", "strict_f1": 0.222, "loose_f1": 0.222, "semantic_score": 0.225, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_2", "task_id": "1570a4b7-176f-4250-8c55-08f43dc032ad", "annotation_id": "1570a4b7-176f-4250-8c55-08f43dc032ad", "instruction": "Find adventure cruises under $999 for Alaska", "strict_f1": 0.316, "loose_f1": 0.421, "semantic_score": 0.378, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_2", "task_id": "75294fc6-f203-4fd3-bc1a-44bd83d1c69d", "annotation_id": "75294fc6-f203-4fd3-bc1a-44bd83d1c69d", "instruction": "Add the cheapest SSD to my cart", "strict_f1": 0.167, "loose_f1": 0.333, "semantic_score": 0.2, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|