mind2web
This commit is contained in:
parent
2b5d923f63
commit
98d5e90894
754 changed files with 1175740 additions and 142424 deletions
42
stuff/loaders.py
Normal file
42
stuff/loaders.py
Normal file
|
|
@ -0,0 +1,42 @@
|
|||
import json
|
||||
from typing import List
|
||||
from one_Task_class import Task
|
||||
|
||||
|
||||
def load_mind2web_tasks(path: str, limit: int | None = None) -> List[Task]:
    """Load Mind2Web task records from a JSON file into ``Task`` objects.

    The file must contain a top-level JSON list. Each dict entry with a
    truthy ``annotation_id`` and ``confirmed_task`` becomes one ``Task``;
    any other entry (non-dict, or missing/empty required field) is skipped
    silently.

    Args:
        path: Path to the UTF-8 encoded JSON file.
        limit: Maximum number of tasks to return. ``None`` means no cap.
            Zero or a negative value yields an empty list.

    Returns:
        The tasks in file order, at most ``limit`` of them.

    Raises:
        ValueError: If the top-level JSON value is not a list.

    Errors from ``open`` and ``json.load`` (e.g. ``OSError``,
    ``json.JSONDecodeError``) are not caught here and propagate to the caller.
    """
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    if not isinstance(data, list):
        raise ValueError(f"Expected list of tasks in {path}, got {type(data).__name__}")

    tasks: List[Task] = []

    for item in data:
        # Enforce the cap *before* building a task: checking only after the
        # append would let limit=0 (or a negative limit) return one task.
        if limit is not None and len(tasks) >= limit:
            break

        if not isinstance(item, dict):
            continue

        annotation_id = item.get("annotation_id")
        confirmed_task = item.get("confirmed_task")
        website = item.get("website")

        # Skip broken records (missing or empty id / instruction).
        if not annotation_id or not confirmed_task:
            continue

        task = Task(
            id=annotation_id,
            dataset="mind2web",
            website=website,
            instruction=confirmed_task,
            # This loader does not derive a start URL or an expected result
            # from the record; the untouched record is kept in ``raw``.
            start_url=None,
            expected=None,
            raw=item,
        )

        tasks.append(task)

    return tasks
|
||||
Loading…
Add table
Add a link
Reference in a new issue