"""LLM-as-judge evaluator for web automation agent final answers."""
from __future__ import annotations

import json
import os
import time
from typing import Any

from openai import OpenAI
client = OpenAI(
|
|
base_url="https://foundation-models.api.cloud.ru/v1",
|
|
api_key="NDg4MjFiMmEtYmUwMS00NjQ2LWFhMDQtODBkOGJkNGE3OWFl.70cdaa81798e68cb8e115500c2081310",
|
|
)
|
|
|
|
JUDGE_MODEL = "t-tech/T-lite-it-2.1"
|
|
|
|
JUDGE_PROMPT = """
|
|
You are an evaluator of web automation agents.
|
|
|
|
TASK:
|
|
{instruction}
|
|
|
|
AGENT FINAL ANSWER:
|
|
{final_answer}
|
|
|
|
Evaluate whether the final answer satisfies the task.
|
|
|
|
Rules:
|
|
1. Exact task requirements must be preserved.
|
|
2. If the agent changes city/date/entity/item/destination in a way that violates the task, verdict must be fail.
|
|
3. If the final answer explicitly says the task was not completed, blocked, failed, timed out, or only partially completed, verdict must be fail.
|
|
4. If the final answer appears to satisfy the task, verdict may be pass.
|
|
|
|
Scoring:
|
|
- 1.0 = fully correct
|
|
- 0.5 = partially correct
|
|
- 0.0 = incorrect
|
|
|
|
Return valid JSON only:
|
|
{{
|
|
"verdict": "pass" or "fail",
|
|
"score": a number between 0.0 and 1.0,
|
|
"reason": "short reason"
|
|
}}
|
|
"""
def llm_judge(instruction: str, final_answer: str) -> dict[str, Any]:
|
|
text = (final_answer or "").strip()
|
|
text_lower = text.lower()
|
|
|
|
explicit_fail_markers = [
|
|
"task not completed",
|
|
"partial result",
|
|
"could not complete the task",
|
|
"agent did not complete the task",
|
|
"timed out",
|
|
"timeout after",
|
|
"captcha blocked",
|
|
"task failed:",
|
|
"failed to complete",
|
|
"not completed",
|
|
]
|
|
|
|
opening = text_lower[:300]
|
|
if any(marker in opening for marker in explicit_fail_markers):
|
|
return {
|
|
"verdict": "fail",
|
|
"score": 0.0,
|
|
"reason": "explicit failure in final answer",
|
|
}
|
|
|
|
prompt = JUDGE_PROMPT.format(
|
|
instruction=instruction,
|
|
final_answer=final_answer,
|
|
)
|
|
|
|
last_error: Exception | None = None
|
|
|
|
for attempt in range(3):
|
|
try:
|
|
resp = client.chat.completions.create(
|
|
model=JUDGE_MODEL,
|
|
messages=[{"role": "user", "content": prompt}],
|
|
temperature=0,
|
|
)
|
|
|
|
content = (resp.choices[0].message.content or "").strip()
|
|
parsed = json.loads(content)
|
|
|
|
verdict = parsed.get("verdict", "fail")
|
|
score = parsed.get("score", None)
|
|
reason = parsed.get("reason", "no reason")
|
|
|
|
if score is None:
|
|
score = 1.0 if verdict == "pass" else 0.0
|
|
|
|
return {
|
|
"verdict": verdict,
|
|
"score": float(score),
|
|
"reason": reason,
|
|
}
|
|
|
|
except Exception as e:
|
|
last_error = e
|
|
time.sleep(3 * (attempt + 1))
|
|
|
|
return {
|
|
"verdict": "error",
|
|
"score": 0.0,
|
|
"reason": f"judge_failed: {last_error}",
|
|
}
|