42 lines
1.1 KiB
Python
42 lines
1.1 KiB
Python
import json
|
|
from typing import List
|
|
from one_Task_class import Task
|
|
|
|
|
|
def load_mind2web_tasks(path: str, limit: int | None = None) -> List[Task]:
    """Load tasks from a Mind2Web-style JSON file into ``Task`` objects.

    The file must hold a JSON array. Every element that is a dict with
    truthy ``annotation_id`` and ``confirmed_task`` values becomes one
    ``Task``; any other element is silently skipped.

    Args:
        path: Path to a UTF-8 encoded JSON file.
        limit: Maximum number of tasks to return. ``None`` means no cap;
            zero or a negative value yields an empty list.

    Returns:
        The tasks in file order, at most ``limit`` of them.

    Raises:
        ValueError: If the top-level JSON value is not a list.

    Errors from ``open`` and ``json.load`` are not caught here and
    propagate to the caller.
    """
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    if not isinstance(data, list):
        raise ValueError(f"Expected list of tasks in {path}, got {type(data).__name__}")

    tasks: List[Task] = []

    for item in data:
        # Check the cap *before* building a task so that limit <= 0 returns
        # nothing (checking after the append would always let one through).
        if limit is not None and len(tasks) >= limit:
            break

        if not isinstance(item, dict):
            continue

        annotation_id = item.get("annotation_id")
        confirmed_task = item.get("confirmed_task")
        website = item.get("website")

        # Skip broken records: both the id and the instruction are required.
        if not annotation_id or not confirmed_task:
            continue

        tasks.append(
            Task(
                id=annotation_id,
                dataset="mind2web",
                website=website,  # may be None when the record has no "website" key
                instruction=confirmed_task,
                start_url=None,
                expected=None,
                raw=item,  # keep the untouched source record alongside the parsed fields
            )
        )

    return tasks
|