This commit is contained in:
Aleksandr Dubchak 2026-04-23 01:21:29 +03:00
parent 98d5e90894
commit 1dd92ab887
107 changed files with 419184 additions and 7726 deletions

View file

@ -25,3 +25,21 @@
{"dataset": "test_task_0", "task_id": "1a53fb39-4d08-4722-addd-f04b0025ef81", "annotation_id": "1a53fb39-4d08-4722-addd-f04b0025ef81", "instruction": "Rent \" The Whale \" movie on demand with format \"high definition\".", "strict_f1": 0.125, "loose_f1": 0.188, "semantic_score": 0.25, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
{"dataset": "test_task_0", "task_id": "453ebdd8-0989-455e-87ba-ebad183c0a04", "annotation_id": "453ebdd8-0989-455e-87ba-ebad183c0a04", "instruction": "Browse the page with event planning tips.", "strict_f1": 0.0, "loose_f1": 0.0, "semantic_score": 0.0, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
{"dataset": "test_task_0", "task_id": "4b8fb0aa-7d7c-4a22-bfd5-f09316a050c3", "annotation_id": "4b8fb0aa-7d7c-4a22-bfd5-f09316a050c3", "instruction": "Search for a paid fishing class event on chicago", "strict_f1": 0.0, "loose_f1": 0.0, "semantic_score": 0.0, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
{"dataset": "test_task_0", "task_id": "d6545454-33e8-4a35-988e-fa6cc0eb5873", "annotation_id": "d6545454-33e8-4a35-988e-fa6cc0eb5873", "instruction": "check available hotels with one room for two adult in Harlem less than $200 to check in on Mar 17th and check out on Mar 20th", "strict_f1": 0.278, "loose_f1": 0.444, "semantic_score": 0.29, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
{"dataset": "test_task_0", "task_id": "c4380ce9-af36-4025-936d-354bf768c8b9", "annotation_id": "c4380ce9-af36-4025-936d-354bf768c8b9", "instruction": "Look for a business class flight to Paris from Salt Lake City on June 2, with a return on June 7, and checkout", "strict_f1": 0.237, "loose_f1": 0.373, "semantic_score": 0.269, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
{"dataset": "test_task_0", "task_id": "81b4816a-9107-4951-ae3a-6587f28b49e8", "annotation_id": "81b4816a-9107-4951-ae3a-6587f28b49e8", "instruction": "Open my likes list.", "strict_f1": 0.4, "loose_f1": 0.4, "semantic_score": 0.35, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
{"dataset": "test_task_0", "task_id": "105d3ad2-9a1b-4eef-9215-30d432a47e73", "annotation_id": "105d3ad2-9a1b-4eef-9215-30d432a47e73", "instruction": "Find out what popular events are being held this weekend in the category performing and visual arts near Chester, UK", "strict_f1": 0.194, "loose_f1": 0.387, "semantic_score": 0.243, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
{"dataset": "test_task_0", "task_id": "aecaba3f-5ba7-44ba-8c05-e06c5d39a3c3", "annotation_id": "aecaba3f-5ba7-44ba-8c05-e06c5d39a3c3", "instruction": "Plan a bus trip going from the Boston Logan Airport to South Station", "strict_f1": 0.138, "loose_f1": 0.552, "semantic_score": 0.145, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
{"dataset": "test_task_0", "task_id": "c094948f-afc6-415c-968a-9e105e2db118", "annotation_id": "c094948f-afc6-415c-968a-9e105e2db118", "instruction": "View the latest job openings in safety with a salary above 100k per annum, check the details, and apply.", "strict_f1": 0.621, "loose_f1": 0.621, "semantic_score": 0.35, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
{"dataset": "test_task_0", "task_id": "4357a1ab-c012-47bd-94a8-720150cb8775", "annotation_id": "4357a1ab-c012-47bd-94a8-720150cb8775", "instruction": "Add The Wire to the watchlist.", "strict_f1": 0.0, "loose_f1": 0.222, "semantic_score": 0.0, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "failed", "agent_success": false, "agent_error": null}
{"dataset": "test_task_0", "task_id": "18fc60d7-aa69-4c07-9bf1-64543eae52c9", "annotation_id": "18fc60d7-aa69-4c07-9bf1-64543eae52c9", "instruction": "Add a e-gift card to bag of $100 for recipient John and email address abc@test.com from buckeye.foobar@gmail.com with message gift card.", "strict_f1": 0.0, "loose_f1": 0.296, "semantic_score": 0.044, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
{"dataset": "test_task_0", "task_id": "2daa15a5-649e-43fa-912f-00111b163fb6", "annotation_id": "2daa15a5-649e-43fa-912f-00111b163fb6", "instruction": "Add formula 1 to my followed sports.", "strict_f1": 0.143, "loose_f1": 0.571, "semantic_score": 0.14, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "failed", "agent_success": false, "agent_error": null}
{"dataset": "test_task_0", "task_id": "691c18cc-b1ad-44e5-a506-584198162ae1", "annotation_id": "691c18cc-b1ad-44e5-a506-584198162ae1", "instruction": "add WWE superstar ALIYAH to your favorite by following her.", "strict_f1": 0.286, "loose_f1": 0.571, "semantic_score": 0.317, "judge_verdict": "fail", "judge_score": 0.5, "agent_status": "failed", "agent_success": false, "agent_error": null}
{"dataset": "test_task_0", "task_id": "ed60077a-1853-4b0d-8174-b339d08de32e", "annotation_id": "ed60077a-1853-4b0d-8174-b339d08de32e", "instruction": "Search the latest story about NFL and share it on facebook.", "strict_f1": 0.08, "loose_f1": 0.32, "semantic_score": 0.225, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "failed", "agent_success": false, "agent_error": null}
{"dataset": "test_task_0", "task_id": "7f90a191-9dbe-478a-8ae2-8aa45b790158", "annotation_id": "7f90a191-9dbe-478a-8ae2-8aa45b790158", "instruction": "Find more films from the director of Smile.", "strict_f1": 0.087, "loose_f1": 0.261, "semantic_score": 0.175, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
{"dataset": "test_task_0", "task_id": "790ba0ec-4e7d-4df0-ac86-ea52b3a73532", "annotation_id": "790ba0ec-4e7d-4df0-ac86-ea52b3a73532", "instruction": "Add my birthday detail, January 5, 1980, and dairy and peanut allergy in my profile, also add love ramen noodles to my bio, and save.", "strict_f1": 0.4, "loose_f1": 0.5, "semantic_score": 0.222, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "failed", "agent_success": false, "agent_error": null}
{"dataset": "test_task_0", "task_id": "8dcf6423-262a-439b-9ee7-279a920468fa", "annotation_id": "8dcf6423-262a-439b-9ee7-279a920468fa", "instruction": "Tell me more about the Adirondack route.", "strict_f1": 0.074, "loose_f1": 0.222, "semantic_score": 0.3, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
{"dataset": "test_task_0", "task_id": "6bf5cdf6-abc8-4425-b813-1a0b51ed16bb", "annotation_id": "6bf5cdf6-abc8-4425-b813-1a0b51ed16bb", "instruction": "Find tickets between $200-300 for next Amy Grant concert", "strict_f1": 0.176, "loose_f1": 0.412, "semantic_score": 0.19, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
{"dataset": "test_task_0", "task_id": "7dfdeddd-b449-44cb-a0b0-1fde889219e2", "annotation_id": "7dfdeddd-b449-44cb-a0b0-1fde889219e2", "instruction": "Find the mobile delivery tickets for the up coming event on march 23th at Columbus with low cast.", "strict_f1": 0.069, "loose_f1": 0.276, "semantic_score": 0.225, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "failed", "agent_success": false, "agent_error": null}
{"dataset": "test_task_0", "task_id": "94f88670-09a4-4926-9353-f5eed2d81c01", "annotation_id": "94f88670-09a4-4926-9353-f5eed2d81c01", "instruction": "Get the cheapest hotel room for 1 adult with a free wifi in Seoul from 10th to 12th April for work purposes with no prepayment and a review score of 7+.", "strict_f1": 0.345, "loose_f1": 0.586, "semantic_score": 0.274, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
{"dataset": "test_task_0", "task_id": "ef3a7151-b9cd-4ddb-aa76-e7e66815c23d", "annotation_id": "ef3a7151-b9cd-4ddb-aa76-e7e66815c23d", "instruction": "Show me the list of Men's Blazers, Black, Size M.", "strict_f1": 0.333, "loose_f1": 0.333, "semantic_score": 0.237, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}