# Quality_evaluation/Mind2Web/eval_v2/comparator.py

from __future__ import annotations


# Lookup table from lower-cased action names to their canonical form.
# Most entries map to themselves; "enter" is folded into "type".
# Built once at import time instead of on every call.
_CANONICAL_ACTION_TYPES = {
    "click": "click",
    "type": "type",
    "select": "select",
    "hover": "hover",
    "enter": "type",

    "navigate": "navigate",
    "scroll": "scroll",
    "search_page": "search_page",
    "extract": "extract",
    "done": "done",
}


def canonical_action_type(action_type: str) -> str:
    """Return the canonical, lower-cased name for an action type.

    The input is stripped of surrounding whitespace and lower-cased, then
    looked up in the canonical table. Names that are not in the table are
    returned in their stripped, lower-cased form.

    Args:
        action_type: Raw action type label.

    Returns:
        The canonical action name, or ``"unknown"`` when the input is not a
        string, is empty, or contains only whitespace.
    """
    # Values that are not strings (None, numbers, nested structures) cannot
    # be normalized; report them as unknown rather than failing on .strip().
    if not isinstance(action_type, str):
        return "unknown"

    key = action_type.strip().lower()

    # A whitespace-only label carries no information, so treat it like an
    # empty one instead of returning "".
    if not key:
        return "unknown"

    return _CANONICAL_ACTION_TYPES.get(key, key)


def normalize_for_compare(actions: list[dict]) -> list[dict]:
    """Reduce each action to a ``{"type", "raw"}`` record for comparison.

    Dict entries get their ``"type"`` value (defaulting to ``"unknown"`` when
    the key is absent) passed through ``canonical_action_type`` and keep the
    value stored under their ``"raw"`` key (``None`` if absent). Any entry
    that is not a dict becomes an ``"unknown"`` record whose ``"raw"`` field
    holds the entry itself.

    Args:
        actions: Iterable of action entries.

    Returns:
        A new list with one normalized record per input entry, in order.
    """

    def _to_record(entry):
        # Non-dict entries have no type information; keep them verbatim.
        if not isinstance(entry, dict):
            return {"type": "unknown", "raw": entry}
        return {
            "type": canonical_action_type(entry.get("type", "unknown")),
            "raw": entry.get("raw"),
        }

    return [_to_record(entry) for entry in actions]


def compare_action_sequences(gold_actions: list[dict], agent_actions: list[dict]) -> dict:
    """Compare two action sequences position by position on action type.

    Both sequences are normalized with ``normalize_for_compare`` and then
    paired index by index over their common prefix length. A step counts as
    a match when the normalized types are equal.

    Args:
        gold_actions: Reference action sequence.
        agent_actions: Action sequence to evaluate against the reference.

    Returns:
        A dict with:
            ``gold_len`` / ``agent_len``: lengths of the normalized sequences.
            ``exact_matches``: number of paired steps whose types are equal.
            ``precision``: matches divided by ``agent_len`` (0.0 if empty).
            ``recall``: matches divided by ``gold_len`` (0.0 if empty).
            ``f1``: harmonic mean of precision and recall (0.0 if both are 0).
            ``aligned``: one record per paired step (1-based ``step``).
            ``missing_gold``: normalized gold entries beyond the paired prefix.
            ``extra_agent``: normalized agent entries beyond the paired prefix.
        The three scores are rounded to three decimal places.
    """
    gold_seq = normalize_for_compare(gold_actions)
    agent_seq = normalize_for_compare(agent_actions)

    gold_len, agent_len = len(gold_seq), len(agent_seq)
    overlap = min(gold_len, agent_len)

    # zip stops at the shorter sequence, i.e. exactly the common prefix.
    aligned = [
        {
            "step": step,
            "gold_type": gold_step["type"],
            "agent_type": agent_step["type"],
            "match": gold_step["type"] == agent_step["type"],
        }
        for step, (gold_step, agent_step) in enumerate(zip(gold_seq, agent_seq), start=1)
    ]
    exact_matches = sum(1 for row in aligned if row["match"])

    # Guard each ratio against an empty sequence.
    precision = 0.0
    if agent_len:
        precision = exact_matches / agent_len

    recall = 0.0
    if gold_len:
        recall = exact_matches / gold_len

    score_sum = precision + recall
    if score_sum > 0:
        f1 = 2 * precision * recall / score_sum
    else:
        f1 = 0.0

    return {
        "gold_len": gold_len,
        "agent_len": agent_len,
        "exact_matches": exact_matches,
        "precision": round(precision, 3),
        "recall": round(recall, 3),
        "f1": round(f1, 3),
        "aligned": aligned,
        "missing_gold": gold_seq[overlap:],
        "extra_agent": agent_seq[overlap:],
    }