Quality_evaluation/Mind2Web/eval_v2/llm_judge.py
Aleksandr Dubchak 98d5e90894 mind2web
2026-04-23 00:04:11 +03:00

111 lines
2.8 KiB
Python

from __future__ import annotations
import json
import os
import time
from typing import Any
from openai import OpenAI

# SECURITY: an API key was previously hardcoded here. A credential committed
# to source control must be considered leaked -- rotate it and provide the
# replacement via the environment variable below instead.
client = OpenAI(
    base_url=os.environ.get(
        "FOUNDATION_MODELS_BASE_URL",
        "https://foundation-models.api.cloud.ru/v1",
    ),
    api_key=os.environ.get("FOUNDATION_MODELS_API_KEY", ""),
)

# Model used as the judge for agent final answers.
JUDGE_MODEL = "t-tech/T-lite-it-2.1"
# Prompt template for the LLM judge. Placeholders {instruction} and
# {final_answer} are filled via str.format(); the doubled braces {{ }} render
# as literal braces so the model is shown the exact JSON shape to return.
JUDGE_PROMPT = """
You are an evaluator of web automation agents.
TASK:
{instruction}
AGENT FINAL ANSWER:
{final_answer}
Evaluate whether the final answer satisfies the task.
Rules:
1. Exact task requirements must be preserved.
2. If the agent changes city/date/entity/item/destination in a way that violates the task, verdict must be fail.
3. If the final answer explicitly says the task was not completed, blocked, failed, timed out, or only partially completed, verdict must be fail.
4. If the final answer appears to satisfy the task, verdict may be pass.
Scoring:
- 1.0 = fully correct
- 0.5 = partially correct
- 0.0 = incorrect
Return valid JSON only:
{{
"verdict": "pass" or "fail",
"score": a number between 0.0 and 1.0,
"reason": "short reason"
}}
"""
def llm_judge(instruction: str, final_answer: str) -> dict[str, Any]:
text = (final_answer or "").strip()
text_lower = text.lower()
explicit_fail_markers = [
"task not completed",
"partial result",
"could not complete the task",
"agent did not complete the task",
"timed out",
"timeout after",
"captcha blocked",
"task failed:",
"failed to complete",
"not completed",
]
opening = text_lower[:300]
if any(marker in opening for marker in explicit_fail_markers):
return {
"verdict": "fail",
"score": 0.0,
"reason": "explicit failure in final answer",
}
prompt = JUDGE_PROMPT.format(
instruction=instruction,
final_answer=final_answer,
)
last_error: Exception | None = None
for attempt in range(3):
try:
resp = client.chat.completions.create(
model=JUDGE_MODEL,
messages=[{"role": "user", "content": prompt}],
temperature=0,
)
content = (resp.choices[0].message.content or "").strip()
parsed = json.loads(content)
verdict = parsed.get("verdict", "fail")
score = parsed.get("score", None)
reason = parsed.get("reason", "no reason")
if score is None:
score = 1.0 if verdict == "pass" else 0.0
return {
"verdict": verdict,
"score": float(score),
"reason": reason,
}
except Exception as e:
last_error = e
time.sleep(3 * (attempt + 1))
return {
"verdict": "error",
"score": 0.0,
"reason": f"judge_failed: {last_error}",
}