Quality_evaluation/stuff/run_agent_task.py
Aleksandr Dubchak 98d5e90894 mind2web
2026-04-23 00:04:11 +03:00

99 lines
3.4 KiB
Python

import json
import time
from typing import Any, Dict, Optional
async def run_agent_task(
    agent,
    instruction: str,
    session_id: str,
    history: Optional[list] = None,
) -> Dict[str, Any]:
    """Run one agent conversation and flatten its result into a report dict.

    Args:
        agent: Object exposing ``run_conversation(instruction,
            conversation_history=..., task_id=...)`` that returns a
            mapping read here via ``.get``.
        instruction: Task text passed to the agent as-is.
        session_id: Forwarded to the agent as ``task_id``.
        history: Prior conversation messages; ``None`` or an empty list
            means an empty history.

    Returns:
        A dict with the keys ``success``, ``final_url``, ``final_answer``,
        ``fail_reason``, ``error``, ``total_tokens``, ``input_tokens``,
        ``output_tokens``, ``model_name``, ``screenshots_dir``, ``steps``,
        ``started_at``, ``finished_at`` and ``raw_result``.

        ``success`` mirrors the ``success`` flag of the last tool message
        whose content decodes to a JSON object. ``final_url`` is the last
        non-empty ``browser_view`` seen in such messages. ``fail_reason``
        is one of ``None``, ``"not_completed"``, ``"interrupted"``,
        ``"tool_failed"``, ``"no_tool_output"`` or ``"unknown_failure"``.

    Raises:
        Whatever ``agent.run_conversation`` raises; it is not caught here.
    """
    history = history or []

    # NOTE: the call is made synchronously (its return value is used as a
    # mapping right away), so the coroutine does not yield while it runs.
    started_at = time.time()
    result = agent.run_conversation(
        instruction,
        conversation_history=history,
        task_id=session_id,
    )
    finished_at = time.time()

    completed = bool(result.get("completed", False))

    normalized: Dict[str, Any] = {
        "success": False,
        "final_url": None,
        "final_answer": result.get("final_response"),
        "fail_reason": None,
        "error": None,
        "total_tokens": result.get("total_tokens"),
        "input_tokens": result.get("input_tokens"),
        "output_tokens": result.get("output_tokens"),
        "model_name": result.get("model"),
        "screenshots_dir": None,
        "steps": [],
        "started_at": started_at,
        "finished_at": finished_at,
        "raw_result": result,
    }

    # Top-level diagnostics; "interrupted" takes precedence over
    # "not_completed" when both flags are set.
    if not completed:
        normalized["fail_reason"] = "not_completed"
    if result.get("interrupted", False):
        normalized["fail_reason"] = "interrupted"

    # Walk the tool outputs. ``or []`` also covers an explicit
    # ``"messages": None`` in the result.
    for msg in result.get("messages") or []:
        if not isinstance(msg, dict) or msg.get("role") != "tool":
            continue

        tool_payload = _parse_tool_payload(msg.get("content"))
        if tool_payload is None:
            continue

        step_ok = bool(tool_payload.get("success", False))
        browser_view = tool_payload.get("browser_view")

        # The most recent tool step decides the overall success flag.
        normalized["success"] = step_ok

        # Keep the last known browser view instead of erasing it when a
        # later tool payload carries no ``browser_view``.
        if browser_view:
            normalized["final_url"] = browser_view

        # Fall back to the tool's own result text when the agent gave no
        # final response.
        if not normalized["final_answer"]:
            normalized["final_answer"] = tool_payload.get("result")

        # Record the step at tool-call granularity. No per-step timing is
        # read here, so every step is stamped with the run's finish time.
        normalized["steps"].append(
            {
                "timestamp": finished_at,
                "thought": None,
                "action_type": "TOOL_CALL",
                "action_target": msg.get("tool_call_id"),
                "action_value": None,
                "url_before": None,
                "url_after": browser_view,
                "screenshot_before": None,
                "screenshot_after": None,
                "success": step_ok,
                "error": None if step_ok else tool_payload.get("result"),
            }
        )

    if normalized["steps"]:
        # Decided after the loop so that an early failed step followed by
        # a successful one does not leave a stale "tool_failed" behind.
        if not normalized["success"] and normalized["fail_reason"] is None:
            normalized["fail_reason"] = "tool_failed"
    elif completed:
        # Completed but produced no usable tool output: a separate
        # failure class.
        normalized["fail_reason"] = normalized["fail_reason"] or "no_tool_output"

    # Safety net: never report a failure without some reason attached.
    if (
        not normalized["success"]
        and normalized["error"] is None
        and normalized["fail_reason"] is None
    ):
        normalized["fail_reason"] = "unknown_failure"

    return normalized


def _parse_tool_payload(content: Any) -> Optional[Dict[str, Any]]:
    """Decode a tool message body; return it only if it is a JSON object."""
    if not content:
        return None
    try:
        payload = json.loads(content)
    except (TypeError, ValueError, RecursionError):
        # Non-string content or text that is not valid JSON is skipped.
        return None
    # Scalars, lists and ``null`` are valid JSON but carry no fields to read.
    return payload if isinstance(payload, dict) else None