# Quality_evaluation/stuff/run_agent_task.py

import inspect
import json
import time
from typing import Any, Dict, Optional


def _parse_tool_payload(content: Any) -> Optional[Dict[str, Any]]:
    """Decode a tool message body; return the JSON object or ``None``."""
    if not content:
        return None
    try:
        payload = json.loads(content)
    except (TypeError, ValueError, RecursionError):
        # Plain-text or non-string tool output carries no structured outcome.
        return None
    # Valid JSON that is not an object (list, string, number) has no
    # "success"/"browser_view"/"result" fields to read.
    return payload if isinstance(payload, dict) else None


async def run_agent_task(
        agent,
        instruction: str,
        session_id: str,
        history: Optional[list] = None,
) -> Dict[str, Any]:
    """Run one agent conversation and normalize its result into a flat dict.

    Args:
        agent: Object exposing ``run_conversation(instruction,
            conversation_history=..., task_id=...)``. The call may return
            the result mapping directly or an awaitable resolving to it.
        instruction: Text passed to the agent as the task.
        session_id: Forwarded to the agent as ``task_id``.
        history: Prior conversation messages; an empty list is used when
            omitted or empty.

    Returns:
        A dict with the keys ``success``, ``final_url``, ``final_answer``,
        ``fail_reason``, ``error``, token counters, ``model_name``,
        ``screenshots_dir``, ``steps``, ``started_at``, ``finished_at`` and
        ``raw_result``. ``success`` reflects the last tool message whose
        content is a JSON object; ``final_url`` is the last ``browser_view``
        reported by any such message.

    Raises:
        Whatever ``agent.run_conversation`` raises; exceptions are not
        caught here.
    """
    history = history or []

    started_at = time.time()

    # NOTE(review): the call is made directly on the event loop; if
    # run_conversation is a long blocking synchronous call it will stall
    # other tasks - confirm against the agent implementation.
    result = agent.run_conversation(
        instruction,
        conversation_history=history,
        task_id=session_id,
    )
    if inspect.isawaitable(result):
        result = await result

    finished_at = time.time()

    normalized: Dict[str, Any] = {
        "success": False,
        "final_url": None,
        "final_answer": result.get("final_response"),
        "fail_reason": None,
        "error": None,
        "total_tokens": result.get("total_tokens"),
        "input_tokens": result.get("input_tokens"),
        "output_tokens": result.get("output_tokens"),
        "model_name": result.get("model"),
        "screenshots_dir": None,
        "steps": [],
        "started_at": started_at,
        "finished_at": finished_at,
        "raw_result": result,
    }

    # Top-level diagnostics; "interrupted" takes precedence over
    # "not_completed" when both apply.
    if not result.get("completed", False):
        normalized["fail_reason"] = "not_completed"

    if result.get("interrupted", False):
        normalized["fail_reason"] = "interrupted"

    # Scan tool outputs; `or []` also covers an explicit "messages": None.
    for msg in result.get("messages") or []:
        if msg.get("role") != "tool":
            continue

        tool_payload = _parse_tool_payload(msg.get("content"))
        if tool_payload is None:
            continue

        step_ok = bool(tool_payload.get("success", False))
        browser_view = tool_payload.get("browser_view")

        # The most recent structured tool result decides overall success.
        normalized["success"] = step_ok

        # Public browser-view link: keep the last known one instead of
        # letting a later payload without the field reset it to None.
        if browser_view is not None:
            normalized["final_url"] = browser_view

        # If the agent gave no final response, fall back to the tool's text.
        if not normalized["final_answer"]:
            normalized["final_answer"] = tool_payload.get("result")

        # Record the step at tool-call granularity. No per-step timing is
        # available, so every step carries the conversation end time.
        normalized["steps"].append(
            {
                "timestamp": finished_at,
                "thought": None,
                "action_type": "TOOL_CALL",
                "action_target": msg.get("tool_call_id"),
                "action_value": None,
                "url_before": None,
                "url_after": browser_view,
                "screenshot_before": None,
                "screenshot_after": None,
                "success": step_ok,
                "error": None if step_ok else tool_payload.get("result"),
            }
        )

    if normalized["steps"]:
        # Decide "tool_failed" from the final outcome only, so an early
        # failed step followed by a successful one is not reported as failed.
        if not normalized["success"] and normalized["fail_reason"] is None:
            normalized["fail_reason"] = "tool_failed"
    elif result.get("completed", False):
        # Completed without any structured tool output: a separate class
        # of failure.
        normalized["fail_reason"] = normalized["fail_reason"] or "no_tool_output"

    # Defensive fallback so a failed run never leaves without a reason.
    if not normalized["success"] and normalized["error"] is None and normalized["fail_reason"] is None:
        normalized["fail_reason"] = "unknown_failure"
    return normalized