Quality_evaluation/Mind2Web/eval_v2/comparator.py
Aleksandr Dubchak 98d5e90894 mind2web
2026-04-23 00:04:11 +03:00

91 lines
2.2 KiB
Python

from __future__ import annotations
# Known action-type spellings mapped to their canonical name. Most entries map
# to themselves; "enter" is folded into "type". Built once at import time
# instead of on every call.
_CANONICAL_ACTION_TYPES = {
    "click": "click",
    "type": "type",
    "select": "select",
    "hover": "hover",
    "enter": "type",
    "navigate": "navigate",
    "scroll": "scroll",
    "search_page": "search_page",
    "extract": "extract",
    "done": "done",
}


def canonical_action_type(action_type: str) -> str:
    """Return the canonical, lower-case name for an action type.

    Surrounding whitespace is stripped and the value is lower-cased before
    the lookup. Names that are not in the table are returned in that
    normalized form, unchanged otherwise.

    Args:
        action_type: Raw action type label.

    Returns:
        The canonical name, or ``"unknown"`` when ``action_type`` is not a
        string, is empty, or consists only of whitespace.
    """
    # Non-string values have no .strip(); report them as "unknown" rather
    # than raising AttributeError.
    if not isinstance(action_type, str):
        return "unknown"

    normalized = action_type.strip().lower()
    # Checked after stripping so that "   " does not yield an empty type.
    if not normalized:
        return "unknown"

    return _CANONICAL_ACTION_TYPES.get(normalized, normalized)
def normalize_for_compare(actions: list[dict]) -> list[dict]:
    """Convert action records into ``{"type", "raw"}`` dicts for comparison.

    Each dict entry contributes its ``"type"`` (passed through
    ``canonical_action_type``; a missing key defaults to ``"unknown"``) and
    its ``"raw"`` value (``None`` when the key is absent). An entry that is
    not a dict gets type ``"unknown"`` and is itself stored under ``"raw"``.

    Args:
        actions: Action records in order.

    Returns:
        A new list with one normalized dict per input entry, in input order.
    """

    def _to_comparable(entry):
        # Non-dict entries cannot be inspected; keep them for reporting.
        if isinstance(entry, dict):
            return {
                "type": canonical_action_type(entry.get("type", "unknown")),
                "raw": entry.get("raw"),
            }
        return {"type": "unknown", "raw": entry}

    return [_to_comparable(entry) for entry in actions]
def compare_action_sequences(gold_actions: list[dict], agent_actions: list[dict]) -> dict:
    """Compare two action sequences position by position on action type.

    Both inputs are passed through ``normalize_for_compare``. Step ``i`` of
    the gold sequence is paired with step ``i`` of the agent sequence, and a
    pair counts as a match when the two normalized types are equal. Steps
    beyond the shorter sequence are reported separately and never matched.

    Args:
        gold_actions: Reference action records.
        agent_actions: Action records produced by the agent.

    Returns:
        A dict with these keys:
            ``gold_len`` / ``agent_len``: lengths of the normalized sequences.
            ``exact_matches``: number of positions whose types are equal.
            ``precision``: matches divided by ``agent_len`` (0.0 if empty).
            ``recall``: matches divided by ``gold_len`` (0.0 if empty).
            ``f1``: harmonic mean of precision and recall (0.0 if both are 0).
            ``aligned``: per-step records with 1-based ``step``,
            ``gold_type``, ``agent_type`` and ``match``.
            ``missing_gold``: normalized gold steps with no agent counterpart.
            ``extra_agent``: normalized agent steps with no gold counterpart.
        The three scores are rounded to three decimal places.
    """
    gold_seq = normalize_for_compare(gold_actions)
    agent_seq = normalize_for_compare(agent_actions)
    gold_total = len(gold_seq)
    agent_total = len(agent_seq)

    # zip() stops at the shorter sequence, so only overlapping steps pair up.
    step_report = [
        {
            "step": position,
            "gold_type": expected["type"],
            "agent_type": actual["type"],
            "match": expected["type"] == actual["type"],
        }
        for position, (expected, actual) in enumerate(
            zip(gold_seq, agent_seq), start=1
        )
    ]
    overlap = len(step_report)
    hits = sum(1 for row in step_report if row["match"])

    precision = hits / agent_total if agent_total else 0.0
    recall = hits / gold_total if gold_total else 0.0
    score_sum = precision + recall
    if score_sum > 0:
        f1 = 2 * precision * recall / score_sum
    else:
        f1 = 0.0

    return {
        "gold_len": gold_total,
        "agent_len": agent_total,
        "exact_matches": hits,
        "precision": round(precision, 3),
        "recall": round(recall, 3),
        "f1": round(f1, 3),
        "aligned": step_report,
        # Whatever lies past the overlap was never compared.
        "missing_gold": gold_seq[overlap:],
        "extra_agent": agent_seq[overlap:],
    }