Quality_evaluation/Mind2Web/run_dataset.py
Aleksandr Dubchak 98d5e90894 mind2web
2026-04-23 00:04:11 +03:00

106 lines
2.8 KiB
Python

import json
import time
from pathlib import Path
import urllib.request
from mind2web_runner import run_task
# Task definition files to run, in order. The paths are relative, so they
# resolve against the current working directory.
TASK_FILES = [
    "test_1_task_0.json",
    "test_1_task_1.json",
    "test_1_task_2.json",
]

# JSONL file that receives one result row per executed task.
OUTPUT_PATH = "../results_small.jsonl"

# Per-task time budget in seconds, passed to run_task() as timeout_sec.
TIMEOUT_SEC = 10 * 60
def wait_browser_ready(
    url: str = "http://localhost:9222/json/version",
    timeout: float = 120,
    poll_interval: float = 2.0,
) -> bool:
    """Poll *url* until the browser endpoint looks ready or *timeout* elapses.

    The endpoint counts as ready when it answers with HTTP status 200 and the
    response body contains the text ``webSocketDebuggerUrl``.

    Args:
        url: HTTP endpoint to probe.
        timeout: Overall time budget in seconds. With a value <= 0 no probe
            is attempted and the function returns False.
        poll_interval: Seconds to pause between two probes.

    Returns:
        True as soon as one probe succeeds, False once the budget is spent.
    """
    # Use the monotonic clock: wall-clock adjustments must not stretch or
    # shorten the wait.
    deadline = time.monotonic() + timeout
    last_error = None

    while time.monotonic() < deadline:
        try:
            with urllib.request.urlopen(url, timeout=5) as resp:
                body = resp.read().decode("utf-8", errors="ignore")
                if resp.status == 200 and "webSocketDebuggerUrl" in body:
                    print("browser ready")
                    return True
                # The endpoint answered but did not look ready; keep a reason
                # so the final report is not just "None".
                last_error = f"endpoint answered (HTTP {resp.status}) but did not look ready"
        except Exception as exc:
            # Deliberately broad: any failure of a single probe is remembered
            # for the final report and the probe is simply retried.
            last_error = exc

        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        # Never sleep past the deadline.
        time.sleep(max(0.0, min(poll_interval, remaining)))

    print("browser not ready in time")
    print("last error:", last_error)
    return False
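# NOTE: reset_browser() is disabled (kept for reference only). Re-enabling it
# needs `import subprocess`, which is not among the imports at the top of this
# file, and a `cwd` that exists on the machine running the script.
#
# def reset_browser() -> None:
#     print("resetting browser container...")
#     subprocess.run(
#         ["docker", "compose", "restart", "browser"],
#         check=True,
#         cwd="/Users/aleksandr/Desktop/Quality_evaluation/BrowserUse_and_ComputerUse_skills",
#     )
#
#     ready = wait_browser_ready()
#     if not ready:
#         raise RuntimeError("Browser did not become ready after restart")
#
#     time.sleep(3)
#     print("browser restarted")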
def load_single_task(path: str) -> dict:
    """Load one task definition from a UTF-8 encoded JSON file.

    Args:
        path: Location of the JSON file.

    Returns:
        The parsed top-level JSON object.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
        TypeError: If the top-level JSON value is not an object.
    """
    with open(path, "r", encoding="utf-8") as f:
        task = json.load(f)
    # Enforce the declared return type here, so a malformed file fails with
    # a message naming the file instead of a later attribute error.
    if not isinstance(task, dict):
        raise TypeError(
            f"{path}: expected a JSON object at top level, got {type(task).__name__}"
        )
    return task
def append_jsonl(path: str, row: dict) -> None:
    """Append *row* to the file at *path* as one line of JSON.

    The file is opened in append mode with UTF-8 encoding, so it is created
    if missing and existing content is kept. Non-ASCII characters are written
    as-is rather than as ``\\uXXXX`` escapes.

    Args:
        path: Location of the JSONL file.
        row: JSON-serializable mapping to write.
    """
    with open(path, "a", encoding="utf-8") as f:
        f.write(json.dumps(row, ensure_ascii=False) + "\n")
def main():
    """Run every task in TASK_FILES and record the outcome.

    Removes any existing OUTPUT_PATH first, then for each task file: loads
    it, passes it to run_task() with TIMEOUT_SEC, adds ``source_file`` and
    ``elapsed_sec`` to the result, appends the result to OUTPUT_PATH as one
    JSONL row and prints a short report. A per-status tally is printed at
    the end.
    """
    out_path = Path(OUTPUT_PATH)
    # Start from an empty results file. Try-and-handle instead of
    # exists() + unlink(), so a file that disappears in between cannot raise.
    try:
        out_path.unlink()
    except FileNotFoundError:
        pass

    summary = {"success": 0, "timeout": 0, "error": 0, "failed": 0}
    total = len(TASK_FILES)

    for i, task_file in enumerate(TASK_FILES, start=1):
        print(f"\n===== TASK {i}/{total} =====")
        print("file:", task_file)

        task = load_single_task(task_file)
        print("annotation_id:", task.get("annotation_id"))
        print("instruction:", task.get("confirmed_task"))

        # reset_browser()  # browser restart between tasks is currently disabled

        # Monotonic clock, so wall-clock adjustments cannot distort elapsed time.
        started = time.monotonic()
        result = run_task(task, timeout_sec=TIMEOUT_SEC)
        elapsed = round(time.monotonic() - started, 2)

        result["source_file"] = task_file
        result["elapsed_sec"] = elapsed
        append_jsonl(OUTPUT_PATH, result)

        # Any status outside the known buckets (including a missing one) is
        # tallied as "failed".
        status = result.get("status")
        bucket = status if status in summary else "failed"
        summary[bucket] += 1

        # The row is already on disk; use .get() so a missing key in the
        # result cannot abort the remaining tasks while printing diagnostics.
        print("status:", status)
        print("success:", result.get("success"))
        print("elapsed:", elapsed)
        print("error:", result.get("error"))
        print("browser_view:", result.get("browser_view"))

    print("\n===== SUMMARY =====")
    print(summary)
# Run the evaluation only when this file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    main()