Quality_evaluation/Mind2Web/eval_v2/llm_judge.py
Aleksandr Dubchak 98d5e90894 mind2web
2026-04-23 00:04:11 +03:00

111 lines
2.8 KiB
Python

from __future__ import annotations
import json
import os
import time
from typing import Any
from openai import OpenAI

# SECURITY: an API key was previously hardcoded here. A credential committed
# to source control must be considered leaked -- rotate it and provide the
# replacement via the environment variable below instead.
client = OpenAI(
    base_url=os.environ.get(
        "FOUNDATION_MODELS_BASE_URL",
        "https://foundation-models.api.cloud.ru/v1",
    ),
    api_key=os.environ.get("FOUNDATION_MODELS_API_KEY", ""),
)

# Model used as the judge for agent final answers.
JUDGE_MODEL = "t-tech/T-lite-it-2.1"
# Prompt template for the LLM judge. Placeholders {instruction} and
# {final_answer} are filled via str.format(); the doubled braces {{ }} render
# as literal braces so the model is shown the exact JSON shape to return.
JUDGE_PROMPT = """
You are an evaluator of web automation agents.
TASK:
{instruction}
AGENT FINAL ANSWER:
{final_answer}
Evaluate whether the final answer satisfies the task.
Rules:
1. Exact task requirements must be preserved.
2. If the agent changes city/date/entity/item/destination in a way that violates the task, verdict must be fail.
3. If the final answer explicitly says the task was not completed, blocked, failed, timed out, or only partially completed, verdict must be fail.
4. If the final answer appears to satisfy the task, verdict may be pass.
Scoring:
- 1.0 = fully correct
- 0.5 = partially correct
- 0.0 = incorrect
Return valid JSON only:
{{
"verdict": "pass" or "fail",
"score": a number between 0.0 and 1.0,
"reason": "short reason"
}}
"""
def llm_judge(instruction: str, final_answer: str) -> dict[str, Any]:
text = (final_answer or "").strip()
text_lower = text.lower()
explicit_fail_markers = [
"task not completed",
"partial result",
"could not complete the task",
"agent did not complete the task",
"timed out",
"timeout after",
"captcha blocked",
"task failed:",
"failed to complete",
"not completed",
]
opening = text_lower[:300]
if any(marker in opening for marker in explicit_fail_markers):
return {
"verdict": "fail",
"score": 0.0,
"reason": "explicit failure in final answer",
}
prompt = JUDGE_PROMPT.format(
instruction=instruction,
final_answer=final_answer,
)
last_error: Exception | None = None
for attempt in range(3):
try:
resp = client.chat.completions.create(
model=JUDGE_MODEL,
messages=[{"role": "user", "content": prompt}],
temperature=0,
)
content = (resp.choices[0].message.content or "").strip()
parsed = json.loads(content)
verdict = parsed.get("verdict", "fail")
score = parsed.get("score", None)
reason = parsed.get("reason", "no reason")
if score is None:
score = 1.0 if verdict == "pass" else 0.0
return {
"verdict": verdict,
"score": float(score),
"reason": reason,
}
except Exception as e:
last_error = e
time.sleep(3 * (attempt + 1))
return {
"verdict": "error",
"score": 0.0,
"reason": f"judge_failed: {last_error}",
}