120 lines
3.1 KiB
Python
120 lines
3.1 KiB
Python
# This module should:
# • read the task JSON
# • extract the instruction
# • extract the gold actions
# • return everything in a normalized form
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
|
|
def load_task_json(path: str | Path) -> dict[str, Any]:
    """Load a task description from a UTF-8 encoded JSON file.

    Args:
        path: Location of the JSON file, given as a string or ``Path``.

    Returns:
        The decoded top-level JSON object.

    Raises:
        ValueError: If the top-level JSON value is not an object (dict).
    """
    file_path = Path(path)
    data = json.loads(file_path.read_text(encoding="utf-8"))

    # Only a JSON object is accepted as a task; anything else is rejected.
    if isinstance(data, dict):
        return data
    raise ValueError(f"Expected dict in {file_path}, got {type(data).__name__}")
|
|
|
|
|
|
def extract_instruction(task: dict[str, Any]) -> str:
    """Return the task's instruction text, or ``""`` when none is present.

    The keys ``confirmed_task``, ``task``, ``instruction`` and ``intent`` are
    checked in that order. The first one holding a non-blank string wins and
    is returned with surrounding whitespace removed.
    """
    for key in ("confirmed_task", "task", "instruction", "intent"):
        text = task.get(key)
        if not isinstance(text, str):
            continue
        stripped = text.strip()
        if stripped:
            return stripped
    return ""
|
|
|
|
|
|
def extract_annotation_id(task: dict[str, Any]) -> str:
    """Return the task's identifier, or ``""`` when none is present.

    Looks at ``annotation_id``, ``id`` and ``task_id`` in that order and
    returns the first value that is a non-blank string, with surrounding
    whitespace removed. Non-string values are ignored.
    """
    id_fields = ("annotation_id", "id", "task_id")
    raw_values = (task.get(field) for field in id_fields)
    trimmed = (value.strip() for value in raw_values if isinstance(value, str))
    # Lazily pick the first non-empty candidate; fall back to "".
    return next((value for value in trimmed if value), "")
|
|
|
|
|
|
def extract_gold_actions(task: dict[str, Any]) -> list[dict[str, Any]]:
    """Return the list of gold actions stored in ``task``.

    Mind2Web variants may keep the actions under different keys, so the
    keys below are probed in a fixed order and the first value that is a
    list is returned as-is (the same list object, not a copy).

    Returns:
        The first list found, or an empty list when no key holds a list.
    """
    lookup_order = (
        "actions",
        "action_reprs",
        "operation",
        "operations",
        "action_uid",
        # Alternative top-level keys, consulted only after the ones above.
        "trace",
        "gold_actions",
    )
    for key in lookup_order:
        found = task.get(key)
        if isinstance(found, list):
            return found
    return []
|
|
|
|
|
|
def summarize_task(task: dict[str, Any]) -> dict[str, Any]:
    """Build a compact overview of a task.

    Returns:
        A dict with the keys ``annotation_id``, ``instruction``,
        ``gold_actions_count`` (number of gold actions) and
        ``gold_actions_preview`` (at most the first two gold actions).
    """
    instruction_text = extract_instruction(task)
    task_id = extract_annotation_id(task)
    actions = extract_gold_actions(task)

    summary: dict[str, Any] = {
        "annotation_id": task_id,
        "instruction": instruction_text,
    }
    summary["gold_actions_count"] = len(actions)
    # The preview is limited to the leading two actions.
    summary["gold_actions_preview"] = actions[:2]
    return summary
|
|
|
|
|
|
def normalize_gold_actions(gold_actions: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Convert raw gold actions into uniform ``{"type", "raw"}`` records.

    For each entry of ``gold_actions``:

    * A non-dict entry becomes ``{"type": "unknown", "raw": entry}``.
    * A dict entry with an ``"operation"`` key uses that value as the
      operation. If it is a dict, the type is the first truthy value among
      its ``"op"``, ``"operation"`` and ``"type"`` keys (``"unknown"`` if
      none); otherwise the type is ``"unknown"``. ``raw`` is the operation
      value.
    * A dict entry without an ``"operation"`` key is treated as a flat
      operation: the type is read from the entry itself and ``raw`` is the
      entry. Previously such entries were reduced to
      ``{"type": "unknown", "raw": {}}``, silently discarding the action.

    Args:
        gold_actions: Raw actions, e.g. as returned by a task loader.

    Returns:
        One normalized record per input entry, in the same order.
    """
    normalized = []

    for action in gold_actions:
        if not isinstance(action, dict):
            normalized.append({"type": "unknown", "raw": action})
            continue

        # Fall back to the action itself when there is no nested operation,
        # so flat actions keep their payload instead of collapsing to {}.
        op = action["operation"] if "operation" in action else action

        if isinstance(op, dict):
            action_type = (
                op.get("op")
                or op.get("operation")
                or op.get("type")
                or "unknown"
            )
        else:
            action_type = "unknown"

        normalized.append({"type": action_type, "raw": op})

    return normalized
|