# Quality_evaluation/stuff/run_task.py

import time
import uuid
from typing import Optional

from one_Task_class import Task
from RunTrace import RunTrace
from StepTrace import StepTrace
from run_agent_task import run_agent_task


async def run_task(task: Task, agent, model_name: Optional[str] = None) -> RunTrace:
    """Run one evaluation task through the agent and record it as a RunTrace.

    A fresh ``RunTrace`` (with a random UUID4 ``run_id``) is created up front
    with pessimistic defaults, then filled in from whatever
    ``run_agent_task`` returns. Any ``Exception`` raised while running the
    agent or while copying its result is captured on the trace instead of
    being propagated, so the caller always receives a trace back.

    Args:
        task: The task to run; ``id``, ``dataset`` and ``instruction`` are read.
        agent: The agent object, passed through to ``run_agent_task`` untouched.
        model_name: Optional label stored on the trace; not otherwise used here.

    Returns:
        The populated ``RunTrace``. ``finished_at`` and ``total_latency_sec``
        are always set, including on failure.
    """
    started_at = time.time()

    trace = RunTrace(
        run_id=str(uuid.uuid4()),
        task_id=task.id,
        dataset=task.dataset,
        instruction=task.instruction,
        model_name=model_name,
        started_at=started_at,
        finished_at=None,
        success=False,
        final_url=None,
        final_answer=None,
        error=None,
        fail_reason=None,
        total_steps=0,
        total_tokens=None,
        total_latency_sec=None,
        screenshots_dir=None,
        steps=[],
    )

    try:
        # NOTE(review): the result is accessed only via ``.get``, so it is
        # assumed to be a dict-like mapping -- confirm against run_agent_task.
        result = await run_agent_task(
            agent=agent,
            instruction=task.instruction,
            session_id=f"eval-{task.dataset}-{task.id}",
            history=[],
        )

        trace.success = bool(result.get("success", False))
        trace.final_url = result.get("final_url")
        trace.final_answer = result.get("final_answer")
        trace.error = result.get("error")
        trace.fail_reason = result.get("fail_reason")
        trace.total_tokens = result.get("total_tokens")
        trace.screenshots_dir = result.get("screenshots_dir")

        # ``or []`` also covers a result that carries ``"steps": None``; a
        # plain ``.get("steps", [])`` would hand None to ``len`` and turn an
        # otherwise valid run into a runtime_exception.
        raw_steps = result.get("steps") or []
        trace.total_steps = len(raw_steps)

        # Steps are appended one at a time so that the ones converted before
        # a malformed entry are still kept on the trace.
        for i, step in enumerate(raw_steps):
            trace.steps.append(
                StepTrace(
                    step_no=i,
                    timestamp=step.get("timestamp", time.time()),
                    thought=step.get("thought"),
                    action_type=step.get("action_type", "unknown"),
                    action_target=step.get("action_target"),
                    action_value=step.get("action_value"),
                    url_before=step.get("url_before"),
                    url_after=step.get("url_after"),
                    screenshot_before=step.get("screenshot_before"),
                    screenshot_after=step.get("screenshot_after"),
                    success=step.get("success", True),
                    error=step.get("error"),
                )
            )

    except Exception as e:
        # The failure may happen after ``success`` was already copied from the
        # result (e.g. while converting steps); reset it so the trace never
        # reports success together with a runtime_exception.
        trace.success = False
        # Message-less exceptions (a bare TimeoutError, for instance) have an
        # empty str(); fall back to repr() so the error is never blank.
        trace.error = str(e) or repr(e)
        trace.fail_reason = "runtime_exception"

    finally:
        # Runs on every exit path, so timing is stamped even when something
        # that is not an ``Exception`` subclass propagates out.
        finished_at = time.time()
        trace.finished_at = finished_at
        trace.total_latency_sec = finished_at - started_at

    return trace