mind2web
This commit is contained in:
parent
2b5d923f63
commit
98d5e90894
754 changed files with 1175740 additions and 142424 deletions
111
Mind2Web/eval_v2/llm_judge.py
Normal file
111
Mind2Web/eval_v2/llm_judge.py
Normal file
|
|
@ -0,0 +1,111 @@
|
|||
from __future__ import annotations

import json
import os
import time
from typing import Any

from openai import OpenAI
|
||||
|
||||
# Credentials come from the environment so that no secret is stored in the
# repository. NOTE(review): the key that was previously committed here must be
# treated as leaked and revoked/rotated at the provider.
_JUDGE_API_KEY = os.environ.get("LLM_JUDGE_API_KEY")
if not _JUDGE_API_KEY:
    # Fail loudly with an actionable message instead of letting the SDK fall
    # back to an unrelated key (e.g. OPENAI_API_KEY) for this endpoint.
    raise RuntimeError(
        "LLM_JUDGE_API_KEY is not set; export the judge endpoint API key "
        "before importing this module."
    )

# Shared OpenAI-compatible client used by llm_judge(). The endpoint keeps its
# previous default and can be overridden with LLM_JUDGE_BASE_URL.
client = OpenAI(
    base_url=os.environ.get(
        "LLM_JUDGE_BASE_URL",
        "https://foundation-models.api.cloud.ru/v1",
    ),
    api_key=_JUDGE_API_KEY,
)
|
||||
|
||||
# Model identifier passed as `model=` to the chat-completions call in
# llm_judge().
JUDGE_MODEL = "t-tech/T-lite-it-2.1"

# Prompt template rendered with str.format(); it expects the named fields
# `instruction` and `final_answer`. The doubled braces ({{ and }}) are
# str.format escapes and render as single literal braces in the JSON skeleton.
# NOTE(review): whitespace inside the literal is reproduced as it appears in
# this view of the file -- confirm indentation against the original source.
JUDGE_PROMPT = """
You are an evaluator of web automation agents.

TASK:
{instruction}

AGENT FINAL ANSWER:
{final_answer}

Evaluate whether the final answer satisfies the task.

Rules:
1. Exact task requirements must be preserved.
2. If the agent changes city/date/entity/item/destination in a way that violates the task, verdict must be fail.
3. If the final answer explicitly says the task was not completed, blocked, failed, timed out, or only partially completed, verdict must be fail.
4. If the final answer appears to satisfy the task, verdict may be pass.

Scoring:
- 1.0 = fully correct
- 0.5 = partially correct
- 0.0 = incorrect

Return valid JSON only:
{{
"verdict": "pass" or "fail",
"score": a number between 0.0 and 1.0,
"reason": "short reason"
}}
"""
|
||||
|
||||
|
||||
# Phrases that, when found near the start of the agent's answer, mean the
# agent itself reported a failure; no model call is needed in that case.
_EXPLICIT_FAIL_MARKERS = (
    "task not completed",
    "partial result",
    "could not complete the task",
    "agent did not complete the task",
    "timed out",
    "timeout after",
    "captcha blocked",
    "task failed:",
    "failed to complete",
    "not completed",
)

# Only the first characters of the answer are scanned for failure markers, so
# a failure phrase quoted deep inside a long answer does not trigger a fail.
_OPENING_CHARS = 300

# Number of judge-model calls attempted before giving up.
_MAX_ATTEMPTS = 3


def _parse_judge_reply(content: str) -> dict[str, Any]:
    """Convert the judge model's raw reply into a normalized result dict.

    Raises ValueError (including json.JSONDecodeError) when no usable JSON
    object with a numeric score can be extracted.
    """
    # Models frequently wrap the JSON in markdown fences or add a sentence
    # around it; keep only the outermost {...} span before decoding.
    start = content.find("{")
    end = content.rfind("}")
    if start == -1 or end < start:
        raise ValueError(f"no JSON object in judge reply: {content[:200]!r}")
    parsed = json.loads(content[start:end + 1])

    # Anything that is not an unambiguous "pass" is treated as "fail".
    raw_verdict = str(parsed.get("verdict", "fail")).strip().lower()
    verdict = "pass" if raw_verdict == "pass" else "fail"

    score = parsed.get("score")
    if score is None:
        score = 1.0 if verdict == "pass" else 0.0
    # Clamp into the documented [0.0, 1.0] range; with this argument order a
    # NaN score collapses to 0.0 rather than propagating.
    score = min(1.0, max(0.0, float(score)))

    return {
        "verdict": verdict,
        "score": score,
        "reason": parsed.get("reason", "no reason"),
    }


def llm_judge(instruction: str, final_answer: str) -> dict[str, Any]:
    """Grade an agent's final answer against the task instruction.

    The answer is first checked for explicit self-reported failure phrases in
    its opening characters; if one is present the function returns a "fail"
    result immediately without calling the judge model. Otherwise the judge
    prompt is sent to the model (up to three attempts with a growing pause
    between attempts) and the JSON reply is parsed.

    Args:
        instruction: The task the agent was asked to perform.
        final_answer: The agent's final answer; None or empty is treated as
            an empty string.

    Returns:
        A dict with keys "verdict" ("pass", "fail", or "error" when every
        attempt failed), "score" (float in [0.0, 1.0]) and "reason".
    """
    text = (final_answer or "").strip()

    opening = text.lower()[:_OPENING_CHARS]
    if any(marker in opening for marker in _EXPLICIT_FAIL_MARKERS):
        return {
            "verdict": "fail",
            "score": 0.0,
            "reason": "explicit failure in final answer",
        }

    # Use the normalized text so a None answer is not rendered as "None".
    prompt = JUDGE_PROMPT.format(instruction=instruction, final_answer=text)

    last_error: Exception | None = None
    for attempt in range(_MAX_ATTEMPTS):
        try:
            resp = client.chat.completions.create(
                model=JUDGE_MODEL,
                messages=[{"role": "user", "content": prompt}],
                temperature=0,
            )
            content = (resp.choices[0].message.content or "").strip()
            return _parse_judge_reply(content)
        except Exception as exc:
            # Deliberately broad: transport errors, malformed responses and
            # parse failures are all retried and finally reported as "error".
            last_error = exc
            # No point in sleeping after the final attempt.
            if attempt < _MAX_ATTEMPTS - 1:
                time.sleep(3 * (attempt + 1))

    return {
        "verdict": "error",
        "score": 0.0,
        "reason": f"judge_failed: {last_error}",
    }
|
||||
Loading…
Add table
Add a link
Reference in a new issue