mind2web

2026-04-23 00:04:11 +03:00 · 2026-04-23 00:04:11 +03:00 · 98d5e90894
commit 98d5e90894
parent 2b5d923f63
754 changed files with 1175740 additions and 142424 deletions
--- a/Mind2Web/eval_v2/dataset_loader.py
+++ b/Mind2Web/eval_v2/dataset_loader.py
@ -0,0 +1,120 @@
+# This module should:
+# 	•	read the task JSON
+# 	•	extract the instruction
+# 	•	extract the gold actions
+# 	•	return everything in a normalized form
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from typing import Any
+
+
+def load_task_json(path: str | Path) -> dict[str, Any]:
+    """Read a UTF-8 JSON file and return its top-level object.
+
+    Args:
+        path: Location of the JSON file, as a string or ``Path``.
+
+    Returns:
+        The decoded top-level JSON value, which must be a dict.
+
+    Raises:
+        ValueError: If the decoded top-level value is not a dict.
+
+    Errors raised while opening or decoding the file are not caught here
+    and propagate to the caller.
+    """
+    source = Path(path)
+    with source.open("r", encoding="utf-8") as handle:
+        payload = json.load(handle)
+
+    if isinstance(payload, dict):
+        return payload
+
+    raise ValueError(f"Expected dict in {source}, got {type(payload).__name__}")
+
+
+def extract_instruction(task: dict[str, Any]) -> str:
+    """Return the first non-blank instruction string found in ``task``.
+
+    The keys ``confirmed_task``, ``task``, ``instruction`` and ``intent``
+    are checked in that order. Values that are not strings, or that are
+    empty after stripping whitespace, are skipped.
+
+    Args:
+        task: Task dictionary to inspect.
+
+    Returns:
+        The stripped text of the first usable value, or ``""`` if none of
+        the keys holds one.
+    """
+    for key in ("confirmed_task", "task", "instruction", "intent"):
+        text = task.get(key)
+        if not isinstance(text, str):
+            continue
+        stripped = text.strip()
+        if stripped:
+            return stripped
+    return ""
+
+
+def extract_annotation_id(task: dict[str, Any]) -> str:
+    """Return the first non-blank identifier string found in ``task``.
+
+    The keys ``annotation_id``, ``id`` and ``task_id`` are checked in that
+    order. Only string values count; anything else (including numbers) is
+    skipped, as are strings that are empty after stripping whitespace.
+
+    Args:
+        task: Task dictionary to inspect.
+
+    Returns:
+        The stripped identifier, or ``""`` if no key holds a usable string.
+    """
+    usable_ids = (
+        value.strip()
+        for value in map(task.get, ("annotation_id", "id", "task_id"))
+        if isinstance(value, str) and value.strip()
+    )
+    return next(usable_ids, "")
+
+
+def extract_gold_actions(task: dict[str, Any]) -> list[dict[str, Any]]:
+    """Return the first list-valued action field found in ``task``.
+
+    Mind2Web variants may store actions under different keys, so the keys
+    below are tried in a fixed order and the first one whose value is a
+    list wins. An empty list counts as a match. The stored list object is
+    returned as-is, not copied.
+
+    Args:
+        task: Task dictionary to inspect.
+
+    Returns:
+        The matching list, or a new empty list if no key holds a list.
+    """
+    lookup_order = (
+        "actions",
+        "action_reprs",
+        "operation",
+        "operations",
+        "action_uid",
+        # Keys used by some datasets that keep the actions elsewhere.
+        "trace",
+        "gold_actions",
+    )
+
+    for key in lookup_order:
+        found = task.get(key)
+        if isinstance(found, list):
+            return found
+
+    return []
+
+
+def summarize_task(task: dict[str, Any]) -> dict[str, Any]:
+    """Build a compact overview of a task dictionary.
+
+    Args:
+        task: Task dictionary to summarize.
+
+    Returns:
+        A dict with four entries: ``annotation_id`` and ``instruction``
+        (from ``extract_annotation_id`` / ``extract_instruction``),
+        ``gold_actions_count`` (length of the list returned by
+        ``extract_gold_actions``) and ``gold_actions_preview`` (at most the
+        first two items of that list).
+    """
+    instruction_text = extract_instruction(task)
+    task_id = extract_annotation_id(task)
+    actions = extract_gold_actions(task)
+
+    return dict(
+        annotation_id=task_id,
+        instruction=instruction_text,
+        gold_actions_count=len(actions),
+        gold_actions_preview=actions[:2],
+    )
+
+
+def normalize_gold_actions(gold_actions: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Convert raw gold actions into uniform ``{"type", "raw"}`` records.
+
+    For each item of ``gold_actions``:
+
+    * a non-dict item becomes ``{"type": "unknown", "raw": item}``;
+    * a dict with an ``"operation"`` entry that is itself a dict is typed
+      from that entry's ``"op"``, ``"operation"`` or ``"type"`` value (the
+      first truthy one, else ``"unknown"``), with the entry as ``raw``;
+    * a dict with a non-dict ``"operation"`` entry becomes
+      ``{"type": "unknown", "raw": <that entry>}``;
+    * a dict with no ``"operation"`` key is treated as a flat action: the
+      type is read from the dict itself and the dict is kept as ``raw``.
+
+    Args:
+        gold_actions: Raw actions, e.g. as returned by
+            ``extract_gold_actions``.
+
+    Returns:
+        A new list with one record per input item, in input order.
+    """
+    normalized: list[dict[str, Any]] = []
+
+    for action in gold_actions:
+        if not isinstance(action, dict):
+            normalized.append({"type": "unknown", "raw": action})
+            continue
+
+        # Fall back to the action itself when there is no "operation"
+        # wrapper; defaulting to {} would discard the action's payload and
+        # always report the type as "unknown".
+        op = action.get("operation", action)
+
+        if not isinstance(op, dict):
+            normalized.append({"type": "unknown", "raw": op})
+            continue
+
+        action_type = op.get("op") or op.get("operation") or op.get("type") or "unknown"
+        normalized.append({"type": action_type, "raw": op})
+
+    return normalized