from __future__ import annotations

# Known action-type spellings and the canonical name each one maps to.
# Only "enter" is a true alias (it is treated as "type"); the remaining
# entries are identity mappings kept to list the recognised vocabulary.
# Types that are not listed here pass through unchanged (lower-cased and
# stripped), so an unrecognised type still compares equal to itself.
_CANONICAL_ACTION_TYPES = {
    "click": "click",
    "type": "type",
    "select": "select",
    "hover": "hover",
    "enter": "type",
    "navigate": "navigate",
    "scroll": "scroll",
    "search_page": "search_page",
    "extract": "extract",
    "done": "done",
}


def canonical_action_type(action_type: str) -> str:
    """Return the canonical, lower-case name for an action type.

    The input is stripped and lower-cased, then looked up in the alias
    table. Unlisted types are returned in their stripped, lower-cased form.

    Args:
        action_type: Raw action type label.

    Returns:
        The canonical type name, or ``"unknown"`` when the input is not a
        string, is empty, or contains only whitespace.
    """
    # Action dicts may carry None or a non-string under "type"; calling
    # ``.strip()`` on those would raise, so treat them as unknown.
    if not isinstance(action_type, str):
        return "unknown"

    cleaned = action_type.strip().lower()
    # A whitespace-only label is as uninformative as an empty one.
    if not cleaned:
        return "unknown"

    return _CANONICAL_ACTION_TYPES.get(cleaned, cleaned)


def normalize_for_compare(actions: list[dict]) -> list[dict]:
    """Reduce each action to a ``{"type", "raw"}`` record for comparison.

    Args:
        actions: Sequence of action entries. Entries are expected to be
            dicts, but anything else is tolerated.

    Returns:
        A new list with one record per input entry, in the same order.
        For a dict entry, ``"type"`` is the canonicalised ``"type"`` value
        and ``"raw"`` is the entry's own ``"raw"`` value (``None`` if
        absent). For a non-dict entry, ``"type"`` is ``"unknown"`` and
        ``"raw"`` is the entry itself.
    """
    normalized = []
    for action in actions:
        if not isinstance(action, dict):
            # Keep the malformed entry so it stays visible downstream.
            normalized.append({"type": "unknown", "raw": action})
            continue

        normalized.append(
            {
                "type": canonical_action_type(action.get("type", "unknown")),
                "raw": action.get("raw"),
            }
        )
    return normalized


def compare_action_sequences(
    gold_actions: list[dict], agent_actions: list[dict]
) -> dict:
    """Compare two action sequences position by position.

    Both sequences are normalised with ``normalize_for_compare`` and then
    paired by index; a step matches when the canonical types are equal.
    No realignment is attempted, so one inserted or dropped action shifts
    every later comparison.

    Args:
        gold_actions: Reference action sequence.
        agent_actions: Action sequence to evaluate against the reference.

    Returns:
        A dict with these keys:

        * ``gold_len`` / ``agent_len``: lengths of the two sequences.
        * ``exact_matches``: number of positions whose types are equal.
        * ``precision``: ``exact_matches / agent_len`` (0.0 if empty).
        * ``recall``: ``exact_matches / gold_len`` (0.0 if empty).
        * ``f1``: harmonic mean of precision and recall (0.0 if both
          are zero). All three ratios are rounded to 3 decimals.
        * ``aligned``: per-step records with ``step`` (1-based),
          ``gold_type``, ``agent_type`` and ``match``.
        * ``missing_gold``: normalised gold actions beyond the length of
          the agent sequence.
        * ``extra_agent``: normalised agent actions beyond the length of
          the gold sequence.
    """
    gold = normalize_for_compare(gold_actions)
    agent = normalize_for_compare(agent_actions)

    gold_len = len(gold)
    agent_len = len(agent)
    min_len = min(gold_len, agent_len)

    aligned = []
    exact_matches = 0
    # zip stops at the shorter sequence, i.e. after ``min_len`` pairs.
    for step, (gold_item, agent_item) in enumerate(zip(gold, agent), start=1):
        matched = gold_item["type"] == agent_item["type"]
        if matched:
            exact_matches += 1
        aligned.append(
            {
                "step": step,
                "gold_type": gold_item["type"],
                "agent_type": agent_item["type"],
                "match": matched,
            }
        )

    # Guard every division so empty sequences yield 0.0 instead of raising.
    precision = exact_matches / agent_len if agent_len else 0.0
    recall = exact_matches / gold_len if gold_len else 0.0
    denominator = precision + recall
    f1 = 2 * precision * recall / denominator if denominator > 0 else 0.0

    return {
        "gold_len": gold_len,
        "agent_len": agent_len,
        "exact_matches": exact_matches,
        "precision": round(precision, 3),
        "recall": round(recall, 3),
        "f1": round(f1, 3),
        "aligned": aligned,
        "missing_gold": gold[min_len:],
        "extra_agent": agent[min_len:],
    }