mind2web
This commit is contained in:
parent
2b5d923f63
commit
98d5e90894
754 changed files with 1175740 additions and 142424 deletions
27
Mind2Web/eval_v2/summaries/results.jsonl
Normal file
27
Mind2Web/eval_v2/summaries/results.jsonl
Normal file
|
|
@ -0,0 +1,27 @@
|
|||
{"dataset": "test_task_0", "task_id": "8f6261cf-d665-4e61-93af-f50f0d366245", "annotation_id": "8f6261cf-d665-4e61-93af-f50f0d366245", "instruction": "Find all events taking place in New York City during the month of September.", "strict_f1": 0.378, "loose_f1": 0.541, "semantic_score": 0.318, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
||||
{"dataset": "test_task_0", "task_id": "918d7ef3-a6ec-458a-88f1-1c2726fd2883", "annotation_id": "918d7ef3-a6ec-458a-88f1-1c2726fd2883", "instruction": "Find help page about buying tickets.", "strict_f1": 0.308, "loose_f1": 0.308, "semantic_score": 0.5, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
||||
{"dataset": "test_task_0", "task_id": "9c3cba90-742a-4f3b-a2e7-767b565fae96", "annotation_id": "9c3cba90-742a-4f3b-a2e7-767b565fae96", "instruction": "check two ticket with best seat that has promo code first show happening in Hamilton New York on April.", "strict_f1": 0.341, "loose_f1": 0.537, "semantic_score": 0.336, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "failed", "agent_success": false, "agent_error": null}
|
||||
{"dataset": "test_task_0", "task_id": "1d73ad40-f7f8-435e-a83d-8b38534427fd", "annotation_id": "1d73ad40-f7f8-435e-a83d-8b38534427fd", "instruction": "Find the cheapest women's plus size brown color loungewear in 3xl size.", "strict_f1": 0.233, "loose_f1": 0.419, "semantic_score": 0.344, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
||||
{"dataset": "test_task_0", "task_id": "91695df8-f256-47c9-8c37-06e8d0fc758f", "annotation_id": "91695df8-f256-47c9-8c37-06e8d0fc758f", "instruction": "Rent a truck with the lowest rent with two dozen furniture pads for 100 miles at zip 08817 on April 12 at 2:30 pm rented truck is to be returned to the exact location and date, and the pickup and drop off will be at the nearest location.", "strict_f1": 0.151, "loose_f1": 0.377, "semantic_score": 0.221, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
||||
{"dataset": "test_task_0", "task_id": "923fd4e0-1862-45b0-9bba-d57b956844da", "annotation_id": "923fd4e0-1862-45b0-9bba-d57b956844da", "instruction": "search gas pickup truck in Fremont with 2010 and 2017 with less than 80000 mile", "strict_f1": 0.412, "loose_f1": 0.412, "semantic_score": 0.28, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
||||
{"dataset": "test_task_0", "task_id": "0572899e-7c07-4a2f-a77b-bba4f432a7ad", "annotation_id": "0572899e-7c07-4a2f-a77b-bba4f432a7ad", "instruction": "find my trip with confirmation number SFTBAO including first and last name Joe Lukeman", "strict_f1": 0.261, "loose_f1": 0.348, "semantic_score": 0.314, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "failed", "agent_success": false, "agent_error": null}
|
||||
{"dataset": "test_task_0", "task_id": "1b4859f4-6951-4f6a-8a74-1c9647900eb1", "annotation_id": "1b4859f4-6951-4f6a-8a74-1c9647900eb1", "instruction": "Find the status of March 25 flights from New York airports to Columbus in Ohio.", "strict_f1": 0.143, "loose_f1": 0.321, "semantic_score": 0.282, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
||||
{"dataset": "test_task_0", "task_id": "9223f1b4-43ad-4636-9541-99ff9e6ad918", "annotation_id": "9223f1b4-43ad-4636-9541-99ff9e6ad918", "instruction": "Browse the venues that are playing the Wicked show from Oct 5 to Oct 24 2023", "strict_f1": 0.148, "loose_f1": 0.148, "semantic_score": 0.158, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "failed", "agent_success": false, "agent_error": null}
|
||||
{"dataset": "test_task_0", "task_id": "a52fcf7a-50aa-4256-8796-654b3dc3adac", "annotation_id": "a52fcf7a-50aa-4256-8796-654b3dc3adac", "instruction": "Buy a diamond pass in New York's, Great escape park, add one meal dining plan to it, and select the flexible payment plan for Jame Jones. The email address is jame_jones@hotmail.com, zip code 10005 and age is 35.", "strict_f1": 0.053, "loose_f1": 0.211, "semantic_score": 0.068, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "failed", "agent_success": false, "agent_error": null}
|
||||
{"dataset": "test_task_0", "task_id": "a4f3beb3-422a-4049-8d17-44eade56fed1", "annotation_id": "a4f3beb3-422a-4049-8d17-44eade56fed1", "instruction": "Add to my wish list the highest rated activity in Amsterdam.", "strict_f1": 0.429, "loose_f1": 0.714, "semantic_score": 0.34, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
||||
{"dataset": "test_task_0", "task_id": "e39333ef-a5a5-4117-9af2-3bb243b364f2", "annotation_id": "e39333ef-a5a5-4117-9af2-3bb243b364f2", "instruction": "Find the movie Donnie Darko and show its complete cast.", "strict_f1": 0.167, "loose_f1": 0.167, "semantic_score": 0.233, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
||||
{"dataset": "test_task_0", "task_id": "e8b1cc02-1143-47cf-a7bb-9a16d08e155a", "annotation_id": "e8b1cc02-1143-47cf-a7bb-9a16d08e155a", "instruction": "Browse the list of top 250 movies and add the first one to my watchlist.", "strict_f1": 0.182, "loose_f1": 0.545, "semantic_score": 0.233, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "failed", "agent_success": false, "agent_error": null}
|
||||
{"dataset": "test_task_0", "task_id": "b59b1997-884f-42f0-b083-fc93d6ce64fe", "annotation_id": "b59b1997-884f-42f0-b083-fc93d6ce64fe", "instruction": "see Nissan and Honda cars for sale near Kentwood, MI 49512", "strict_f1": 0.222, "loose_f1": 0.222, "semantic_score": 0.156, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
||||
{"dataset": "test_task_0", "task_id": "e483a49f-029d-446c-892c-c56b92fc463b", "annotation_id": "e483a49f-029d-446c-892c-c56b92fc463b", "instruction": "Build an entry-level pc with an windows 11 64 bit intel i7 CPU with a256gb ssd drive + 4gb ram and adding cheapest component and accessories available.", "strict_f1": 0.4, "loose_f1": 0.494, "semantic_score": 0.238, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
||||
{"dataset": "test_task_0", "task_id": "14d50319-3f81-4aa6-8ee8-d1b66e4d5d64", "annotation_id": "14d50319-3f81-4aa6-8ee8-d1b66e4d5d64", "instruction": "Find 32\" Curved monitor and add the third one to the wish list.", "strict_f1": 0.286, "loose_f1": 0.286, "semantic_score": 0.167, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
||||
{"dataset": "test_task_0", "task_id": "f9a882f7-826e-469a-ad69-0d5f912734c9", "annotation_id": "f9a882f7-826e-469a-ad69-0d5f912734c9", "instruction": "Search the cheapest Curry brand unisex athletic shoes with the number 5.5, add to cart and checkout.", "strict_f1": 0.481, "loose_f1": 0.556, "semantic_score": 0.447, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
||||
{"dataset": "test_task_0", "task_id": "a5dd5729-415a-4fe2-a840-4935bf9428d4", "annotation_id": "a5dd5729-415a-4fe2-a840-4935bf9428d4", "instruction": "Browse spider-man toys for kids and sort by lowest price.", "strict_f1": 0.0, "loose_f1": 0.0, "semantic_score": 0.0, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
||||
{"dataset": "test_task_0", "task_id": "a2500e0b-9244-4f0e-b686-fa290c32b829", "annotation_id": "a2500e0b-9244-4f0e-b686-fa290c32b829", "instruction": "Find the store location and hours of the closest Gamestop to zip code 90028 and set as home store", "strict_f1": 0.0, "loose_f1": 0.25, "semantic_score": 0.1, "judge_verdict": "fail", "judge_score": 0.5, "agent_status": "failed", "agent_success": false, "agent_error": null}
|
||||
{"dataset": "test_task_0", "task_id": "942666cb-147a-4033-be7e-d56ccca47506", "annotation_id": "942666cb-147a-4033-be7e-d56ccca47506", "instruction": "Find and view the biography for the Host of the Price is Right.", "strict_f1": 0.222, "loose_f1": 0.222, "semantic_score": 0.175, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
||||
{"dataset": "test_task_0", "task_id": "4fa7cab9-8448-4cdb-842f-dba109b3a13e", "annotation_id": "4fa7cab9-8448-4cdb-842f-dba109b3a13e", "instruction": "Find a private room in New York for 1 April and checkout on 2 April for 2 adults", "strict_f1": 0.636, "loose_f1": 0.636, "semantic_score": 0.345, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
||||
{"dataset": "test_task_0", "task_id": "961e4feb-2b5b-4372-a5db-c7d3222aac21", "annotation_id": "961e4feb-2b5b-4372-a5db-c7d3222aac21", "instruction": "Check the status of train S92 for any disruptions.", "strict_f1": 0.0, "loose_f1": 0.182, "semantic_score": 0.067, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
||||
{"dataset": "test_task_0", "task_id": "a29533ef-86ae-42fd-a7d2-a6a62ffe689d", "annotation_id": "a29533ef-86ae-42fd-a7d2-a6a62ffe689d", "instruction": "Get the report from the final environmental impact statement for the Jamaica Bus Depot expansion.", "strict_f1": 0.3, "loose_f1": 0.6, "semantic_score": 0.243, "judge_verdict": "pass", "judge_score": 1.0, "agent_status": "success", "agent_success": true, "agent_error": null}
|
||||
{"dataset": "test_task_0", "task_id": "c52fcdf7-1f23-4074-91bb-1a121af02a80", "annotation_id": "c52fcdf7-1f23-4074-91bb-1a121af02a80", "instruction": "Plan a trip to reach JFK airport from central park by 11am on April 12", "strict_f1": 0.204, "loose_f1": 0.367, "semantic_score": 0.229, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
||||
{"dataset": "test_task_0", "task_id": "1a53fb39-4d08-4722-addd-f04b0025ef81", "annotation_id": "1a53fb39-4d08-4722-addd-f04b0025ef81", "instruction": "Rent \" The Whale \" movie on demand with format \"high definition\".", "strict_f1": 0.125, "loose_f1": 0.188, "semantic_score": 0.25, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
||||
{"dataset": "test_task_0", "task_id": "453ebdd8-0989-455e-87ba-ebad183c0a04", "annotation_id": "453ebdd8-0989-455e-87ba-ebad183c0a04", "instruction": "Browse the page with event planning tips.", "strict_f1": 0.0, "loose_f1": 0.0, "semantic_score": 0.0, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
||||
{"dataset": "test_task_0", "task_id": "4b8fb0aa-7d7c-4a22-bfd5-f09316a050c3", "annotation_id": "4b8fb0aa-7d7c-4a22-bfd5-f09316a050c3", "instruction": "Search for a paid fishing class event on chicago", "strict_f1": 0.0, "loose_f1": 0.0, "semantic_score": 0.0, "judge_verdict": "fail", "judge_score": 0.0, "agent_status": "timeout", "agent_success": false, "agent_error": "timeout after 300 seconds"}
|
||||
Loading…
Add table
Add a link
Reference in a new issue