mind2web
This commit is contained in:
parent
2b5d923f63
commit
98d5e90894
754 changed files with 1175740 additions and 142424 deletions
32
stuff/RunTrace.py
Normal file
32
stuff/RunTrace.py
Normal file
|
|
@ -0,0 +1,32 @@
|
|||
from dataclasses import dataclass, field
|
||||
from typing import List, Optional
|
||||
from StepTrace import StepTrace
|
||||
|
||||
|
||||
|
||||
@dataclass
class RunTrace:
    """Record of a single evaluation run of an agent on one task.

    All fields except ``steps`` are required by the constructor, in the
    order they are declared below.
    """

    # --- identity of the run and of the task it executed ---
    run_id: str
    task_id: str
    dataset: str
    instruction: str

    # --- model used for the run, when known ---
    model_name: Optional[str]

    # --- wall-clock timestamps; finished_at may be None until filled in ---
    started_at: float
    finished_at: Optional[float]

    # --- outcome ---
    success: bool
    final_url: Optional[str]
    final_answer: Optional[str]

    # --- failure diagnostics ---
    error: Optional[str]
    fail_reason: Optional[str]

    # --- aggregate counters ---
    total_steps: int
    total_tokens: Optional[int]
    total_latency_sec: Optional[float]

    # --- artifacts ---
    screenshots_dir: Optional[str]

    # Per-step records; each instance gets its own fresh list by default.
    steps: List[StepTrace] = field(default_factory=list)
|
||||
23
stuff/StepTrace.py
Normal file
23
stuff/StepTrace.py
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
|
||||
@dataclass
class StepTrace:
    """Record of one step (action) taken during a run.

    Every field except ``error`` is required by the constructor, in the
    order declared below.
    """

    # Position of the step within its run, and when it was recorded.
    step_no: int
    timestamp: float

    # Free-form reasoning attached to the step, if any.
    thought: Optional[str]

    # What was done: the kind of action plus its optional target and value.
    action_type: str
    action_target: Optional[str]
    action_value: Optional[str]

    # URLs before and after the action, when known.
    url_before: Optional[str]
    url_after: Optional[str]

    # Screenshot references before and after the action, when available.
    screenshot_before: Optional[str]
    screenshot_after: Optional[str]

    # Outcome of the step; ``error`` defaults to None.
    success: bool
    error: Optional[str] = None
|
||||
47
stuff/debug_browser_tool.py
Normal file
47
stuff/debug_browser_tool.py
Normal file
|
|
@ -0,0 +1,47 @@
|
|||
import os
|
||||
import sys
|
||||
import asyncio
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# Directory that contains this script; the sibling skills checkout is
# located relative to it.
PROJECT_ROOT = os.path.abspath(os.path.dirname(__file__))

SKILLS_DIR = os.path.join(PROJECT_ROOT, "../BrowserUse_and_ComputerUse_skills")
HERMES_CODE_DIR = os.path.join(SKILLS_DIR, "hermes_code")
ENV_PATH = os.path.join(SKILLS_DIR, ".env")

# Make the project, the skills checkout and hermes_code importable.
for path in (PROJECT_ROOT, SKILLS_DIR, HERMES_CODE_DIR):
    if path not in sys.path:
        sys.path.append(path)

# Values from the .env file replace anything already in the environment.
load_dotenv(ENV_PATH, override=True)

# Non-secret debug overrides: these deliberately win over the .env file.
os.environ["MODEL"] = "qwen3.5-122b"
os.environ["MODEL_DEFAULT"] = "qwen3.5-122b"
os.environ["BASE_URL"] = "https://llm.lambda.coredump.ru/v1"
os.environ["OPENAI_BASE_URL"] = "https://llm.lambda.coredump.ru/v1"
os.environ["PROVIDER"] = "custom"
os.environ["BROWSER_URL"] = "http://localhost:9222"
os.environ["BROWSER_VIEW_URL"] = "http://localhost:6080"

# The API key is a secret and must never live in source control: take it
# from the environment / .env file and mirror it under both variable names.
# NOTE(review): a key was previously committed in this file; it should be
# treated as leaked and rotated.
_api_key = os.environ.get("API_KEY") or os.environ.get("OPENAI_API_KEY")
if not _api_key:
    raise SystemExit(
        f"API_KEY / OPENAI_API_KEY is not set; define it in {ENV_PATH} "
        "or in the environment"
    )
os.environ["API_KEY"] = _api_key
os.environ["OPENAI_API_KEY"] = _api_key

print("MODEL:", os.getenv("MODEL"), flush=True)
print("BROWSER_URL:", os.getenv("BROWSER_URL"), flush=True)
|
||||
|
||||
from hermes_code.tools.browser_use_tool import run_browser_task
|
||||
|
||||
|
||||
async def main():
    """Run one hard-coded browser task and print its result.

    The task is given at most 60 seconds; ``asyncio.wait_for`` raises a
    timeout error if it does not finish in time.
    """
    print("DEBUG: before run_browser_task", flush=True)

    pending = run_browser_task("Открой nba.com")
    outcome = await asyncio.wait_for(pending, timeout=60)

    print("DEBUG: after run_browser_task", flush=True)
    print(outcome, flush=True)


if __name__ == "__main__":
    asyncio.run(main())
|
||||
14
stuff/eval_t.py
Normal file
14
stuff/eval_t.py
Normal file
|
|
@ -0,0 +1,14 @@
|
|||
import mind2web_runner

# Debug aid: show which file the runner module was actually imported from.
# The module imported above is ``mind2web_runner``; the previous reference
# to an undefined ``eval_agent`` name raised NameError at start-up.
print("IMPORTED MIND2WEB_RUNNER =", mind2web_runner.__file__)
|
||||
|
||||
from mind2web_runner import run_agent_task
|
||||
|
||||
|
||||
def main():
    """Run one hard-coded instruction through the runner and print the result."""
    outcome = run_agent_task("открой nba.com")
    # Banner and result are printed on separate lines.
    print("==== RESULT ====", outcome, sep="\n")


if __name__ == "__main__":
    main()
|
||||
31
stuff/extracting.py
Normal file
31
stuff/extracting.py
Normal file
|
|
@ -0,0 +1,31 @@
|
|||
import json
|
||||
|
||||
# One-off helper: extract the first task from a Mind2Web task file and save
# it as a standalone JSON document.
INPUT_PATH = "../Mind2Web/test_task/test_task_2.json"
OUTPUT_PATH = "../Mind2Web/test_1_task_2.json"

# Read explicitly as UTF-8 so the result does not depend on the platform's
# default encoding.
with open(INPUT_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)

# --- guard against malformed data ---
if isinstance(data, dict):
    # The file already holds a single task rather than a list of tasks.
    print("⚠️ Файл уже обрезан (dict вместо list)")
    task = data
elif isinstance(data, list):
    print(f"✅ Найден список задач: {len(data)}")
    if not data:
        # An empty list has no first task; fail with a clear message
        # instead of a bare IndexError.
        raise ValueError("❌ Список задач пуст")
    task = data[0]
else:
    raise ValueError("❌ Неизвестный формат JSON")

# --- structure check ---
print("Ключи задачи:", task.keys())

if "confirmed_task" not in task:
    print("❌ ВНИМАНИЕ: нет confirmed_task — файл уже обрезан или не тот")
else:
    print("✅ Цель:", task["confirmed_task"])

# --- save ---
# ensure_ascii=False emits raw non-ASCII characters, so the output file must
# be opened as UTF-8 explicitly.
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    json.dump(task, f, indent=2, ensure_ascii=False)

print(f"💾 Сохранено в {OUTPUT_PATH}")
|
||||
42
stuff/loaders.py
Normal file
42
stuff/loaders.py
Normal file
|
|
@ -0,0 +1,42 @@
|
|||
import json
|
||||
from typing import List
|
||||
from one_Task_class import Task
|
||||
|
||||
|
||||
def load_mind2web_tasks(path: str, limit: int | None = None) -> List[Task]:
    """Load tasks from a Mind2Web JSON file.

    The file must contain a JSON list. Entries that are not objects, or that
    lack a non-empty ``annotation_id`` or ``confirmed_task``, are skipped.

    Args:
        path: Path to the JSON file (read as UTF-8).
        limit: Maximum number of tasks to return. ``None`` means no limit;
            zero or a negative value yields an empty list.

    Returns:
        The loaded tasks, in file order.

    Raises:
        ValueError: If the top-level JSON value is not a list.
    """
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    if not isinstance(data, list):
        raise ValueError(f"Expected list of tasks in {path}, got {type(data).__name__}")

    tasks: List[Task] = []

    # The in-loop check only fires after an append, so a non-positive limit
    # has to be handled up front or it would still return one task.
    if limit is not None and limit <= 0:
        return tasks

    for item in data:
        if not isinstance(item, dict):
            continue

        annotation_id = item.get("annotation_id")
        confirmed_task = item.get("confirmed_task")

        # Skip broken records.
        if not annotation_id or not confirmed_task:
            continue

        tasks.append(
            Task(
                id=annotation_id,
                dataset="mind2web",
                website=item.get("website"),
                instruction=confirmed_task,
                start_url=None,
                expected=None,
                raw=item,
            )
        )

        if limit is not None and len(tasks) >= limit:
            break

    return tasks
|
||||
22
stuff/main.py
Normal file
22
stuff/main.py
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
from loaders import load_mind2web_tasks
|
||||
from runner import run_many_tasks
|
||||
|
||||
# your existing agent instance (already defined in the agent module)
|
||||
from agent import agent
|
||||
|
||||
|
||||
def main():
    """Load up to two Mind2Web tasks, run them with the agent, print totals."""
    task_file = "../Mind2Web/test_task/test_task_2.json"
    traces = run_many_tasks(load_mind2web_tasks(task_file, limit=2), agent)

    print("\n=== FINAL RESULTS ===")
    total = len(traces)
    print(f"Total: {total}")
    # Booleans sum as 0/1, so this counts the successful runs.
    print(f"Success: {sum(trace.success for trace in traces)}")


if __name__ == "__main__":
    main()
|
||||
13
stuff/one_Task_class.py
Normal file
13
stuff/one_Task_class.py
Normal file
|
|
@ -0,0 +1,13 @@
|
|||
from dataclasses import dataclass
|
||||
from typing import Optional, Any
|
||||
|
||||
|
||||
@dataclass
class Task:
    """A single benchmark task to hand to an agent.

    All fields are required by the constructor, in the order declared below.
    """

    # Task identifier and the name of the dataset it belongs to.
    id: str
    dataset: str

    # Website associated with the task, when known.
    website: Optional[str]

    # Natural-language instruction for the agent.
    instruction: str

    # Optional starting URL and optional expected outcome (any type).
    start_url: Optional[str]
    expected: Optional[Any]

    # The original record this task was built from.
    raw: dict
|
||||
99
stuff/run_agent_task.py
Normal file
99
stuff/run_agent_task.py
Normal file
|
|
@ -0,0 +1,99 @@
|
|||
import json
|
||||
import time
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
|
||||
async def run_agent_task(
    agent,
    instruction: str,
    session_id: str,
    history: Optional[list] = None,
) -> Dict[str, Any]:
    """Run one instruction through ``agent`` and normalize the result.

    Calls ``agent.run_conversation(instruction, conversation_history=...,
    task_id=session_id)`` and condenses the returned dict, together with the
    JSON payloads of its ``role == "tool"`` messages, into a flat dict.

    Args:
        agent: Object exposing ``run_conversation``.
        instruction: Instruction passed to the agent.
        session_id: Passed to the agent as ``task_id``.
        history: Prior conversation; ``None`` or empty becomes ``[]``.

    Returns:
        A dict with keys ``success``, ``final_url``, ``final_answer``,
        ``fail_reason``, ``error``, ``total_tokens``, ``input_tokens``,
        ``output_tokens``, ``model_name``, ``screenshots_dir``, ``steps``,
        ``started_at``, ``finished_at`` and ``raw_result``. ``success`` and
        ``final_url`` reflect the last tool message with a JSON-object
        payload.
    """
    history = history or []

    started_at = time.time()
    result = agent.run_conversation(
        instruction,
        conversation_history=history,
        task_id=session_id,
    )
    finished_at = time.time()

    normalized: Dict[str, Any] = {
        "success": False,
        "final_url": None,
        "final_answer": result.get("final_response"),
        "fail_reason": None,
        "error": None,
        "total_tokens": result.get("total_tokens"),
        "input_tokens": result.get("input_tokens"),
        "output_tokens": result.get("output_tokens"),
        "model_name": result.get("model"),
        "screenshots_dir": None,
        "steps": [],
        "started_at": started_at,
        "finished_at": finished_at,
        "raw_result": result,
    }

    # Top-level diagnostics; "interrupted" takes precedence over
    # "not_completed" when both apply.
    if not result.get("completed", False):
        normalized["fail_reason"] = "not_completed"
    if result.get("interrupted", False):
        normalized["fail_reason"] = "interrupted"

    # Walk the tool outputs.
    for msg in result.get("messages", []):
        if not isinstance(msg, dict) or msg.get("role") != "tool":
            continue

        content = msg.get("content")
        if not content:
            continue

        try:
            tool_payload = json.loads(content)
        except (TypeError, ValueError):
            # Not a JSON string (JSONDecodeError is a ValueError): skip it.
            continue
        if not isinstance(tool_payload, dict):
            # Valid JSON that is not an object carries none of the fields
            # read below.
            continue

        step_ok = bool(tool_payload.get("success", False))
        browser_view = tool_payload.get("browser_view")

        # The most recent tool output decides the overall success flag and
        # the browser-view link reported as the final URL.
        normalized["success"] = step_ok
        normalized["final_url"] = browser_view

        # Keep the tool's own result text when the agent gave no answer.
        if not normalized["final_answer"]:
            normalized["final_answer"] = tool_payload.get("result")

        # Record the step at tool-call granularity.
        normalized["steps"].append(
            {
                "timestamp": finished_at,
                "thought": None,
                "action_type": "TOOL_CALL",
                "action_target": msg.get("tool_call_id"),
                "action_value": None,
                "url_before": None,
                "url_after": browser_view,
                "screenshot_before": None,
                "screenshot_after": None,
                "success": step_ok,
                "error": None if step_ok else tool_payload.get("result"),
            }
        )

    # Decide "tool_failed" from the final success state only, so an early
    # failure followed by a successful tool call does not leave a stale
    # failure reason next to success=True.
    if (
        normalized["steps"]
        and not normalized["success"]
        and normalized["fail_reason"] is None
    ):
        normalized["fail_reason"] = "tool_failed"

    # Completed without any usable tool output: a separate failure class.
    if not normalized["steps"] and result.get("completed", False):
        normalized["fail_reason"] = normalized["fail_reason"] or "no_tool_output"

    # Safety net so a failed run always carries some reason.
    if (
        not normalized["success"]
        and normalized["error"] is None
        and normalized["fail_reason"] is None
    ):
        normalized["fail_reason"] = "unknown_failure"

    return normalized
|
||||
78
stuff/run_task.py
Normal file
78
stuff/run_task.py
Normal file
|
|
@ -0,0 +1,78 @@
|
|||
import time
|
||||
import uuid
|
||||
from one_Task_class import Task
|
||||
from RunTrace import RunTrace
|
||||
from StepTrace import StepTrace
|
||||
from run_agent_task import run_agent_task
|
||||
|
||||
|
||||
async def run_task(task: Task, agent, model_name: str = None) -> RunTrace:
    """Run ``task`` with ``agent`` and return the resulting ``RunTrace``.

    The agent is invoked through ``run_agent_task`` with a session id of the
    form ``eval-<dataset>-<task id>`` and an empty history. Any exception
    raised while running or converting the result is captured on the trace
    (``error`` and ``fail_reason="runtime_exception"``) instead of
    propagating.

    Args:
        task: Task to execute.
        agent: Agent object forwarded to ``run_agent_task``.
        model_name: Model name to record; when omitted, the name reported in
            the agent result (if any) is used.

    Returns:
        The populated trace; ``finished_at`` and ``total_latency_sec`` are
        always set, even on failure.
    """
    started_at = time.time()

    trace = RunTrace(
        run_id=str(uuid.uuid4()),
        task_id=task.id,
        dataset=task.dataset,
        instruction=task.instruction,
        model_name=model_name,
        started_at=started_at,
        finished_at=None,
        success=False,
        final_url=None,
        final_answer=None,
        error=None,
        fail_reason=None,
        total_steps=0,
        total_tokens=None,
        total_latency_sec=None,
        screenshots_dir=None,
        steps=[],
    )

    try:
        result = await run_agent_task(
            agent=agent,
            instruction=task.instruction,
            session_id=f"eval-{task.dataset}-{task.id}",
            history=[],
        )

        trace.success = bool(result.get("success", False))
        trace.final_url = result.get("final_url")
        trace.final_answer = result.get("final_answer")
        trace.error = result.get("error")
        trace.fail_reason = result.get("fail_reason")
        trace.total_tokens = result.get("total_tokens")
        trace.screenshots_dir = result.get("screenshots_dir")
        # Prefer the caller's model name; otherwise keep the one the agent
        # reported rather than leaving the field empty.
        trace.model_name = model_name or result.get("model_name")

        for i, step in enumerate(result.get("steps", [])):
            trace.steps.append(
                StepTrace(
                    step_no=i,
                    timestamp=step.get("timestamp", time.time()),
                    thought=step.get("thought"),
                    action_type=step.get("action_type", "unknown"),
                    action_target=step.get("action_target"),
                    action_value=step.get("action_value"),
                    url_before=step.get("url_before"),
                    url_after=step.get("url_after"),
                    screenshot_before=step.get("screenshot_before"),
                    screenshot_after=step.get("screenshot_after"),
                    success=step.get("success", True),
                    error=step.get("error"),
                )
            )

    except Exception as e:
        # Evaluation boundary: record the failure on the trace so a batch
        # run can continue with the next task.
        trace.error = str(e)
        trace.fail_reason = "runtime_exception"

    finally:
        finished_at = time.time()
        trace.finished_at = finished_at
        trace.total_latency_sec = finished_at - started_at
        # Count the steps actually recorded so the counter and the list
        # agree even when conversion stopped partway through.
        trace.total_steps = len(trace.steps)

    return trace
|
||||
23
stuff/runner.py
Normal file
23
stuff/runner.py
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
from typing import List
|
||||
|
||||
from one_Task_class import Task
|
||||
from RunTrace import RunTrace
|
||||
from run_task import run_task
|
||||
|
||||
|
||||
def run_many_tasks(tasks: List[Task], agent) -> List[RunTrace]:
    """Run every task sequentially with ``agent`` and collect the traces.

    Progress, the success flag and any error are printed per task. This is a
    synchronous entry point: each ``run_task`` coroutine is driven to
    completion with ``asyncio.run``, so it must not be called from inside an
    already running event loop.

    Args:
        tasks: Tasks to execute, in order.
        agent: Agent object forwarded to ``run_task``.

    Returns:
        One trace per task, in input order.
    """
    # Local import keeps this fix self-contained within the function.
    import asyncio

    results: List[RunTrace] = []
    total = len(tasks)

    for index, task in enumerate(tasks, start=1):
        print(f"\n=== Running task {index}/{total} ===")
        print(f"Instruction: {task.instruction}\n")

        # run_task is a coroutine function; calling it without awaiting
        # only creates a coroutine object, which has no .success/.error.
        trace = asyncio.run(run_task(task, agent))
        results.append(trace)

        print("Success:", trace.success)
        if trace.error:
            print("Error:", trace.error)

    return results
|
||||
Loading…
Add table
Add a link
Reference in a new issue