45 lines
19 KiB
JSON
45 lines
19 KiB
JSON
{"dataset": "test_task_0", "task_id": "8f6261cf-d665-4e61-93af-f50f0d366245", "annotation_id": "8f6261cf-d665-4e61-93af-f50f0d366245", "instruction": "Find all events taking place in New York City during the month of September.", "strict_f1": 0.378, "loose_f1": 0.541, "semantic_score": 0.318, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_0", "task_id": "918d7ef3-a6ec-458a-88f1-1c2726fd2883", "annotation_id": "918d7ef3-a6ec-458a-88f1-1c2726fd2883", "instruction": "Find help page about buying tickets.", "strict_f1": 0.308, "loose_f1": 0.308, "semantic_score": 0.5, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "9c3cba90-742a-4f3b-a2e7-767b565fae96", "annotation_id": "9c3cba90-742a-4f3b-a2e7-767b565fae96", "instruction": "check two ticket with best seat that has promo code first show happening in Hamilton New York on April.", "strict_f1": 0.341, "loose_f1": 0.537, "semantic_score": 0.336, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "failed", "agent_success": false, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "1d73ad40-f7f8-435e-a83d-8b38534427fd", "annotation_id": "1d73ad40-f7f8-435e-a83d-8b38534427fd", "instruction": "Find the cheapest women's plus size brown color loungewear in 3xl size.", "strict_f1": 0.233, "loose_f1": 0.419, "semantic_score": 0.344, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_0", "task_id": "91695df8-f256-47c9-8c37-06e8d0fc758f", "annotation_id": "91695df8-f256-47c9-8c37-06e8d0fc758f", "instruction": "Rent a truck with the lowest rent with two dozen furniture pads for 100 miles at zip 08817 on April 12 at 2:30 pm rented truck is to be returned to the exact location and date, and the pickup and drop off will be at the nearest location.", "strict_f1": 0.151, "loose_f1": 0.377, "semantic_score": 0.221, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_0", "task_id": "923fd4e0-1862-45b0-9bba-d57b956844da", "annotation_id": "923fd4e0-1862-45b0-9bba-d57b956844da", "instruction": "search gas pickup truck in Fremont with 2010 and 2017 with less than 80000 mile", "strict_f1": 0.412, "loose_f1": 0.412, "semantic_score": 0.28, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "0572899e-7c07-4a2f-a77b-bba4f432a7ad", "annotation_id": "0572899e-7c07-4a2f-a77b-bba4f432a7ad", "instruction": "find my trip with confirmation number SFTBAO including first and last name Joe Lukeman", "strict_f1": 0.261, "loose_f1": 0.348, "semantic_score": 0.314, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "failed", "agent_success": false, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "1b4859f4-6951-4f6a-8a74-1c9647900eb1", "annotation_id": "1b4859f4-6951-4f6a-8a74-1c9647900eb1", "instruction": "Find the status of March 25 flights from New York airports to Columbus in Ohio.", "strict_f1": 0.143, "loose_f1": 0.321, "semantic_score": 0.282, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_0", "task_id": "9223f1b4-43ad-4636-9541-99ff9e6ad918", "annotation_id": "9223f1b4-43ad-4636-9541-99ff9e6ad918", "instruction": "Browse the venues that are playing the Wicked show from Oct 5 to Oct 24 2023", "strict_f1": 0.148, "loose_f1": 0.148, "semantic_score": 0.158, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "failed", "agent_success": false, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "a52fcf7a-50aa-4256-8796-654b3dc3adac", "annotation_id": "a52fcf7a-50aa-4256-8796-654b3dc3adac", "instruction": "Buy a diamond pass in New York's, Great escape park, add one meal dining plan to it, and select the flexible payment plan for Jame Jones. The email address is jame_jones@hotmail.com, zip code 10005 and age is 35.", "strict_f1": 0.053, "loose_f1": 0.211, "semantic_score": 0.068, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "failed", "agent_success": false, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "a4f3beb3-422a-4049-8d17-44eade56fed1", "annotation_id": "a4f3beb3-422a-4049-8d17-44eade56fed1", "instruction": "Add to my wish list the highest rated activity in Amsterdam.", "strict_f1": 0.429, "loose_f1": 0.714, "semantic_score": 0.34, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "e39333ef-a5a5-4117-9af2-3bb243b364f2", "annotation_id": "e39333ef-a5a5-4117-9af2-3bb243b364f2", "instruction": "Find the movie Donnie Darko and show its complete cast.", "strict_f1": 0.167, "loose_f1": 0.167, "semantic_score": 0.233, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "e8b1cc02-1143-47cf-a7bb-9a16d08e155a", "annotation_id": "e8b1cc02-1143-47cf-a7bb-9a16d08e155a", "instruction": "Browse the list of top 250 movies and add the first one to my watchlist.", "strict_f1": 0.182, "loose_f1": 0.545, "semantic_score": 0.233, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "failed", "agent_success": false, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "b59b1997-884f-42f0-b083-fc93d6ce64fe", "annotation_id": "b59b1997-884f-42f0-b083-fc93d6ce64fe", "instruction": "see Nissan and Honda cars for sale near Kentwood, MI 49512", "strict_f1": 0.222, "loose_f1": 0.222, "semantic_score": 0.156, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "e483a49f-029d-446c-892c-c56b92fc463b", "annotation_id": "e483a49f-029d-446c-892c-c56b92fc463b", "instruction": "Build an entry-level pc with an windows 11 64 bit intel i7 CPU with a256gb ssd drive + 4gb ram and adding cheapest component and accessories available.", "strict_f1": 0.4, "loose_f1": 0.494, "semantic_score": 0.238, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_0", "task_id": "14d50319-3f81-4aa6-8ee8-d1b66e4d5d64", "annotation_id": "14d50319-3f81-4aa6-8ee8-d1b66e4d5d64", "instruction": "Find 32\" Curved monitor and add the third one to the wish list.", "strict_f1": 0.286, "loose_f1": 0.286, "semantic_score": 0.167, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "f9a882f7-826e-469a-ad69-0d5f912734c9", "annotation_id": "f9a882f7-826e-469a-ad69-0d5f912734c9", "instruction": "Search the cheapest Curry brand unisex athletic shoes with the number 5.5, add to cart and checkout.", "strict_f1": 0.481, "loose_f1": 0.556, "semantic_score": 0.447, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_0", "task_id": "a5dd5729-415a-4fe2-a840-4935bf9428d4", "annotation_id": "a5dd5729-415a-4fe2-a840-4935bf9428d4", "instruction": "Browse spider-man toys for kids and sort by lowest price.", "strict_f1": 0.0, "loose_f1": 0.0, "semantic_score": 0.0, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "a2500e0b-9244-4f0e-b686-fa290c32b829", "annotation_id": "a2500e0b-9244-4f0e-b686-fa290c32b829", "instruction": "Find the store location and hours of the closest Gamestop to zip code 90028 and set as home store", "strict_f1": 0.0, "loose_f1": 0.25, "semantic_score": 0.1, "judge_verdict": "fail", "judge_score": 0.5, "agent_status": "failed", "agent_success": false, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "942666cb-147a-4033-be7e-d56ccca47506", "annotation_id": "942666cb-147a-4033-be7e-d56ccca47506", "instruction": "Find and view the biography for the Host of the Price is Right.", "strict_f1": 0.222, "loose_f1": 0.222, "semantic_score": 0.175, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "4fa7cab9-8448-4cdb-842f-dba109b3a13e", "annotation_id": "4fa7cab9-8448-4cdb-842f-dba109b3a13e", "instruction": "Find a private room in New York for 1 April and checkout on 2 April for 2 adults", "strict_f1": 0.636, "loose_f1": 0.636, "semantic_score": 0.345, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "961e4feb-2b5b-4372-a5db-c7d3222aac21", "annotation_id": "961e4feb-2b5b-4372-a5db-c7d3222aac21", "instruction": "Check the status of train S92 for any disruptions.", "strict_f1": 0.0, "loose_f1": 0.182, "semantic_score": 0.067, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "a29533ef-86ae-42fd-a7d2-a6a62ffe689d", "annotation_id": "a29533ef-86ae-42fd-a7d2-a6a62ffe689d", "instruction": "Get the report from the final environmental impact statement for the Jamaica Bus Depot expansion.", "strict_f1": 0.3, "loose_f1": 0.6, "semantic_score": 0.243, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "c52fcdf7-1f23-4074-91bb-1a121af02a80", "annotation_id": "c52fcdf7-1f23-4074-91bb-1a121af02a80", "instruction": "Plan a trip to reach JFK airport from central park by 11am on April 12", "strict_f1": 0.204, "loose_f1": 0.367, "semantic_score": 0.229, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_0", "task_id": "1a53fb39-4d08-4722-addd-f04b0025ef81", "annotation_id": "1a53fb39-4d08-4722-addd-f04b0025ef81", "instruction": "Rent \" The Whale \" movie on demand with format \"high definition\".", "strict_f1": 0.125, "loose_f1": 0.188, "semantic_score": 0.25, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_0", "task_id": "453ebdd8-0989-455e-87ba-ebad183c0a04", "annotation_id": "453ebdd8-0989-455e-87ba-ebad183c0a04", "instruction": "Browse the page with event planning tips.", "strict_f1": 0.0, "loose_f1": 0.0, "semantic_score": 0.0, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_0", "task_id": "4b8fb0aa-7d7c-4a22-bfd5-f09316a050c3", "annotation_id": "4b8fb0aa-7d7c-4a22-bfd5-f09316a050c3", "instruction": "Search for a paid fishing class event on chicago", "strict_f1": 0.0, "loose_f1": 0.0, "semantic_score": 0.0, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_0", "task_id": "d6545454-33e8-4a35-988e-fa6cc0eb5873", "annotation_id": "d6545454-33e8-4a35-988e-fa6cc0eb5873", "instruction": "check available hotels with one room for two adult in Harlem less than $200 to check in on Mar 17th and check out on Mar 20th", "strict_f1": 0.278, "loose_f1": 0.444, "semantic_score": 0.29, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_0", "task_id": "c4380ce9-af36-4025-936d-354bf768c8b9", "annotation_id": "c4380ce9-af36-4025-936d-354bf768c8b9", "instruction": "Look for a business class flight to Paris from Salt Lake City on June 2, with a return on June 7, and checkout", "strict_f1": 0.237, "loose_f1": 0.373, "semantic_score": 0.269, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_0", "task_id": "81b4816a-9107-4951-ae3a-6587f28b49e8", "annotation_id": "81b4816a-9107-4951-ae3a-6587f28b49e8", "instruction": "Open my likes list.", "strict_f1": 0.4, "loose_f1": 0.4, "semantic_score": 0.35, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "105d3ad2-9a1b-4eef-9215-30d432a47e73", "annotation_id": "105d3ad2-9a1b-4eef-9215-30d432a47e73", "instruction": "Find out what popular events are being held this weekend in the category performing and visual arts near Chester, UK", "strict_f1": 0.194, "loose_f1": 0.387, "semantic_score": 0.243, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "aecaba3f-5ba7-44ba-8c05-e06c5d39a3c3", "annotation_id": "aecaba3f-5ba7-44ba-8c05-e06c5d39a3c3", "instruction": "Plan a bus trip going from the Boston Logan Airport to South Station", "strict_f1": 0.138, "loose_f1": 0.552, "semantic_score": 0.145, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "c094948f-afc6-415c-968a-9e105e2db118", "annotation_id": "c094948f-afc6-415c-968a-9e105e2db118", "instruction": "View the latest job openings in safety with a salary above 100k per annum, check the details, and apply.", "strict_f1": 0.621, "loose_f1": 0.621, "semantic_score": 0.35, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "4357a1ab-c012-47bd-94a8-720150cb8775", "annotation_id": "4357a1ab-c012-47bd-94a8-720150cb8775", "instruction": "Add The Wire to the watchlist.", "strict_f1": 0.0, "loose_f1": 0.222, "semantic_score": 0.0, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "failed", "agent_success": false, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "18fc60d7-aa69-4c07-9bf1-64543eae52c9", "annotation_id": "18fc60d7-aa69-4c07-9bf1-64543eae52c9", "instruction": "Add a e-gift card to bag of $100 for recipient John and email address abc@test.com from buckeye.foobar@gmail.com with message gift card.", "strict_f1": 0.0, "loose_f1": 0.296, "semantic_score": 0.044, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "2daa15a5-649e-43fa-912f-00111b163fb6", "annotation_id": "2daa15a5-649e-43fa-912f-00111b163fb6", "instruction": "Add formula 1 to my followed sports.", "strict_f1": 0.143, "loose_f1": 0.571, "semantic_score": 0.14, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "failed", "agent_success": false, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "691c18cc-b1ad-44e5-a506-584198162ae1", "annotation_id": "691c18cc-b1ad-44e5-a506-584198162ae1", "instruction": "add WWE superstar ALIYAH to your favorite by following her.", "strict_f1": 0.286, "loose_f1": 0.571, "semantic_score": 0.317, "judge_verdict": "fail", "judge_score": 0.5, "agent_status": "failed", "agent_success": false, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "ed60077a-1853-4b0d-8174-b339d08de32e", "annotation_id": "ed60077a-1853-4b0d-8174-b339d08de32e", "instruction": "Search the latest story about NFL and share it on facebook.", "strict_f1": 0.08, "loose_f1": 0.32, "semantic_score": 0.225, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "failed", "agent_success": false, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "7f90a191-9dbe-478a-8ae2-8aa45b790158", "annotation_id": "7f90a191-9dbe-478a-8ae2-8aa45b790158", "instruction": "Find more films from the director of Smile.", "strict_f1": 0.087, "loose_f1": 0.261, "semantic_score": 0.175, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "790ba0ec-4e7d-4df0-ac86-ea52b3a73532", "annotation_id": "790ba0ec-4e7d-4df0-ac86-ea52b3a73532", "instruction": "Add my birthday detail, January 5, 1980, and dairy and peanut allergy in my profile, also add love ramen noodles to my bio, and save.", "strict_f1": 0.4, "loose_f1": 0.5, "semantic_score": 0.222, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "failed", "agent_success": false, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "8dcf6423-262a-439b-9ee7-279a920468fa", "annotation_id": "8dcf6423-262a-439b-9ee7-279a920468fa", "instruction": "Tell me more about the Adirondack route.", "strict_f1": 0.074, "loose_f1": 0.222, "semantic_score": 0.3, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "6bf5cdf6-abc8-4425-b813-1a0b51ed16bb", "annotation_id": "6bf5cdf6-abc8-4425-b813-1a0b51ed16bb", "instruction": "Find tickets between $200-300 for next Amy Grant concert", "strict_f1": 0.176, "loose_f1": 0.412, "semantic_score": 0.19, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "7dfdeddd-b449-44cb-a0b0-1fde889219e2", "annotation_id": "7dfdeddd-b449-44cb-a0b0-1fde889219e2", "instruction": "Find the mobile delivery tickets for the up coming event on march 23th at Columbus with low cast.", "strict_f1": 0.069, "loose_f1": 0.276, "semantic_score": 0.225, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "failed", "agent_success": false, "agent_error": null}
|
|
{"dataset": "test_task_0", "task_id": "94f88670-09a4-4926-9353-f5eed2d81c01", "annotation_id": "94f88670-09a4-4926-9353-f5eed2d81c01", "instruction": "Get the cheapest hotel room for 1 adult with a free wifi in Seoul from 10th to 12th April for work purposes with no prepayment and a review score of 7+.", "strict_f1": 0.345, "loose_f1": 0.586, "semantic_score": 0.274, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
|
{"dataset": "test_task_0", "task_id": "ef3a7151-b9cd-4ddb-aa76-e7e66815c23d", "annotation_id": "ef3a7151-b9cd-4ddb-aa76-e7e66815c23d", "instruction": "Show me the list of Men's Blazers, Black, Size M.", "strict_f1": 0.333, "loose_f1": 0.333, "semantic_score": 0.237, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|