mind2web

2026-04-23 00:04:11 +03:00 · 2026-04-23 00:04:11 +03:00 · 98d5e90894
commit 98d5e90894
parent 2b5d923f63
754 changed files with 1175740 additions and 142424 deletions
--- a/Mind2Web/eval_v2/llm_judge.py
+++ b/Mind2Web/eval_v2/llm_judge.py
@@ -0,0 +1,111 @@
+from __future__ import annotations
+
+import json
+import os
+import time
+from typing import Any
+
+from openai import OpenAI
+
+# Shared OpenAI-compatible client used for all judge requests.
+#
+# The API key is a credential and must not live in source control, so it is
+# read from the FOUNDATION_MODELS_API_KEY environment variable. When that
+# variable is unset, ``api_key=None`` lets the OpenAI client fall back to its
+# standard OPENAI_API_KEY variable (and raise if neither is configured).
+# NOTE(review): the key that used to be hard-coded here is exposed in the
+# repository history and should be revoked and rotated.
+client = OpenAI(
+    base_url="https://foundation-models.api.cloud.ru/v1",
+    api_key=os.environ.get("FOUNDATION_MODELS_API_KEY"),
+)
+
+# Model identifier sent as ``model`` in the judge's chat-completions request.
+JUDGE_MODEL = "t-tech/T-lite-it-2.1"
+
+# Prompt template for the judge, filled in with ``str.format``:
+#   {instruction}  - the task text given to the agent
+#   {final_answer} - the agent's final answer to be evaluated
+# The doubled braces ``{{`` / ``}}`` are ``str.format`` escapes; they render as
+# the literal braces of the JSON object the model is asked to return.
+# NOTE(review): rule 3 demands verdict "fail" for partially completed tasks,
+# while the scoring scale still offers 0.5 for "partially correct", so the
+# returned score and verdict may disagree - confirm this is intended.
+JUDGE_PROMPT = """
+You are an evaluator of web automation agents.
+
+TASK:
+{instruction}
+
+AGENT FINAL ANSWER:
+{final_answer}
+
+Evaluate whether the final answer satisfies the task.
+
+Rules:
+1. Exact task requirements must be preserved.
+2. If the agent changes city/date/entity/item/destination in a way that violates the task, verdict must be fail.
+3. If the final answer explicitly says the task was not completed, blocked, failed, timed out, or only partially completed, verdict must be fail.
+4. If the final answer appears to satisfy the task, verdict may be pass.
+
+Scoring:
+- 1.0 = fully correct
+- 0.5 = partially correct
+- 0.0 = incorrect
+
+Return valid JSON only:
+{{
+  "verdict": "pass" or "fail",
+  "score": a number between 0.0 and 1.0,
+  "reason": "short reason"
+}}
+"""
+
+
+# Phrases that, when found near the start of the agent's answer, mean the agent
+# itself reported a failure; such answers are failed without calling the model.
+_EXPLICIT_FAIL_MARKERS = (
+    "task not completed",
+    "partial result",
+    "could not complete the task",
+    "agent did not complete the task",
+    "timed out",
+    "timeout after",
+    "captcha blocked",
+    "task failed:",
+    "failed to complete",
+    "not completed",
+)
+
+
+def _parse_judge_reply(content: str) -> dict[str, Any]:
+    """Convert the judge model's raw reply into a normalized result dict.
+
+    The reply is expected to be a JSON object, but models often wrap it in
+    Markdown code fences or surround it with prose, so when the whole reply is
+    not valid JSON the outermost ``{...}`` span is parsed instead.
+
+    Returns:
+        ``{"verdict": "pass" | "fail", "score": float in [0, 1], "reason": ...}``.
+
+    Raises:
+        ValueError: if no JSON object can be recovered from ``content``
+            (``json.JSONDecodeError`` is a subclass of ``ValueError``).
+    """
+    try:
+        parsed = json.loads(content)
+    except json.JSONDecodeError:
+        start = content.find("{")
+        end = content.rfind("}")
+        if start == -1 or end <= start:
+            raise
+        parsed = json.loads(content[start : end + 1])
+
+    if not isinstance(parsed, dict):
+        raise ValueError(f"judge reply is not a JSON object: {content[:200]!r}")
+
+    # Tolerate "Pass", " FAIL " etc.; anything unrecognized is treated
+    # conservatively as a failure, matching the default for a missing verdict.
+    verdict = str(parsed.get("verdict", "fail")).strip().lower()
+    if verdict not in ("pass", "fail"):
+        verdict = "fail"
+
+    fallback_score = 1.0 if verdict == "pass" else 0.0
+    raw_score = parsed.get("score")
+    try:
+        score = fallback_score if raw_score is None else float(raw_score)
+    except (TypeError, ValueError):
+        # A non-numeric score (e.g. "high") is a formatting slip, not a reason
+        # to discard an otherwise usable verdict.
+        score = fallback_score
+    if score != score:  # NaN: json.loads accepts the bare literal ``NaN``
+        score = fallback_score
+    score = min(1.0, max(0.0, score))
+
+    return {
+        "verdict": verdict,
+        "score": score,
+        "reason": parsed.get("reason", "no reason"),
+    }
+
+
+def llm_judge(
+    instruction: str,
+    final_answer: str,
+    *,
+    max_attempts: int = 3,
+    retry_delay: float = 3.0,
+) -> dict[str, Any]:
+    """Judge whether an agent's final answer satisfies the task instruction.
+
+    Answers whose first 300 characters contain an explicit failure marker are
+    failed immediately without a model call. Otherwise the judge model is
+    queried (temperature 0) and its JSON reply is normalized.
+
+    Args:
+        instruction: The task the agent was asked to perform.
+        final_answer: The agent's final answer; ``None``/empty is treated as
+            an empty string.
+        max_attempts: How many times to query the judge before giving up.
+        retry_delay: Base delay in seconds; attempt ``k`` (1-based) is followed
+            by a ``retry_delay * k`` pause when another attempt remains.
+
+    Returns:
+        A dict with keys ``verdict`` (``"pass"``, ``"fail"`` or ``"error"``),
+        ``score`` (float in ``[0.0, 1.0]``) and ``reason``. ``"error"`` means
+        every attempt failed; the last exception is embedded in ``reason``.
+    """
+    text = (final_answer or "").strip()
+
+    opening = text.lower()[:300]
+    if any(marker in opening for marker in _EXPLICIT_FAIL_MARKERS):
+        return {
+            "verdict": "fail",
+            "score": 0.0,
+            "reason": "explicit failure in final answer",
+        }
+
+    # Use the normalized text so a missing answer is not rendered as "None".
+    prompt = JUDGE_PROMPT.format(
+        instruction=instruction,
+        final_answer=text,
+    )
+
+    last_error: Exception | None = None
+
+    for attempt in range(max_attempts):
+        try:
+            resp = client.chat.completions.create(
+                model=JUDGE_MODEL,
+                messages=[{"role": "user", "content": prompt}],
+                temperature=0,
+            )
+            content = (resp.choices[0].message.content or "").strip()
+            return _parse_judge_reply(content)
+        except Exception as e:
+            # Deliberately broad: this is the boundary that converts any API or
+            # parsing failure into an "error" verdict instead of crashing the
+            # evaluation run.
+            last_error = e
+            # Back off only when another attempt will actually follow.
+            if attempt + 1 < max_attempts:
+                time.sleep(retry_delay * (attempt + 1))
+
+    return {
+        "verdict": "error",
+        "score": 0.0,
+        "reason": f"judge_failed: {last_error}",
+    }