# Quality_evaluation/Mind2Web/eval_v2/gold_parser.py

from __future__ import annotations

import ast
import re
from typing import Any


def _short_text(value: Any, limit: int = 200) -> str:
    if value is None:
        return ""
    text = str(value).strip().replace("\n", " ")
    return text[:limit]


def _extract_text_from_attributes(attr_text: str) -> list[str]:
    """
    Пытаемся вытащить человекочитаемые куски из attributes-строки.
    """
    results: list[str] = []

    if not attr_text:
        return results

    # пробуем распарсить как dict-строку
    try:
        parsed = ast.literal_eval(attr_text)
        if isinstance(parsed, dict):
            for key in ("placeholder", "aria_label", "title", "value", "name", "text"):
                v = parsed.get(key)
                if isinstance(v, str) and v.strip():
                    results.append(v.strip())
    except Exception:
        pass

    # fallback regex
    patterns = [
        r'"placeholder":\s*"([^"]+)"',
        r'"aria-label":\s*"([^"]+)"',
        r'"aria_label":\s*"([^"]+)"',
        r'"title":\s*"([^"]+)"',
        r'"value":\s*"([^"]+)"',
        r'"name":\s*"([^"]+)"',
        r'"text":\s*"([^"]+)"',
    ]

    for pattern in patterns:
        for match in re.findall(pattern, attr_text):
            if match.strip():
                results.append(match.strip())

    return results


def _extract_candidate_texts(candidates: Any, limit: int = 5) -> list[str]:
    results: list[str] = []

    if not isinstance(candidates, list):
        return results

    for item in candidates[:limit]:
        pieces: list[str] = []

        # обычный dict
        if isinstance(item, dict):
            for key in ("text", "value", "inner_text", "title", "aria_label", "placeholder"):
                v = item.get(key)
                if isinstance(v, str) and v.strip():
                    pieces.append(v.strip())

            # raw attributes
            attrs = item.get("attributes")
            if isinstance(attrs, str):
                pieces.extend(_extract_text_from_attributes(attrs))

            # tag бывает полезен как контекст
            tag = item.get("tag")
            if isinstance(tag, str) and tag.strip():
                pieces.append(tag.strip())

        # иногда candidate уже сериализован строкой
        elif isinstance(item, str):
            try:
                parsed = ast.literal_eval(item)
                if isinstance(parsed, dict):
                    for key in ("text", "value", "inner_text", "title", "aria_label", "placeholder"):
                        v = parsed.get(key)
                        if isinstance(v, str) and v.strip():
                            pieces.append(v.strip())

                    attrs = parsed.get("attributes")
                    if isinstance(attrs, str):
                        pieces.extend(_extract_text_from_attributes(attrs))

                    tag = parsed.get("tag")
                    if isinstance(tag, str) and tag.strip():
                        pieces.append(tag.strip())
                else:
                    pieces.append(_short_text(item, 120))
            except Exception:
                pieces.append(_short_text(item, 120))
        else:
            pieces.append(_short_text(item, 120))

        # чистим дубли
        cleaned = []
        seen = set()
        for p in pieces:
            p = p.strip()
            if p and p.lower() not in seen:
                cleaned.append(p)
                seen.add(p.lower())

        if cleaned:
            results.append(" | ".join(cleaned))
        else:
            results.append(_short_text(item, 120))

    return results


def parse_gold_action(action: dict[str, Any]) -> dict[str, Any]:
    """
    Condense one gold action record into a compact summary dict.

    Reads ``operation``, ``cleaned_html``, ``raw_html``, ``pos_candidates``
    and ``neg_candidates`` from *action*. A non-dict ``operation`` is treated
    as an empty dict.

    Returns:
        A dict with these keys:

        - ``type``: first truthy value among the operation's ``op``,
          ``operation`` and ``type`` entries (else ``"unknown"``), stringified,
          stripped and lower-cased.
        - ``value``: the operation's ``value``, shortened to 200 characters.
        - ``html_snippet``: ``cleaned_html`` if truthy, otherwise ``raw_html``,
          shortened to 300 characters.
        - ``pos_candidates`` / ``neg_candidates``: text summaries of up to 5
          and up to 3 candidates respectively.
        - ``raw_operation``: the operation dict itself.
    """
    raw_op = action.get("operation", {})
    operation = raw_op if isinstance(raw_op, dict) else {}

    # First truthy entry wins; fall back to "unknown" when none is set.
    action_type: Any = "unknown"
    for key in ("op", "operation", "type"):
        candidate = operation.get(key)
        if candidate:
            action_type = candidate
            break

    html = action.get("cleaned_html", "") or action.get("raw_html", "")

    summary: dict[str, Any] = {}
    summary["type"] = str(action_type).strip().lower()
    summary["value"] = _short_text(operation.get("value", ""), 200)
    summary["html_snippet"] = _short_text(html, 300)
    summary["pos_candidates"] = _extract_candidate_texts(
        action.get("pos_candidates", []), limit=5
    )
    summary["neg_candidates"] = _extract_candidate_texts(
        action.get("neg_candidates", []), limit=3
    )
    summary["raw_operation"] = operation
    return summary


def parse_gold_actions(gold_actions: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """
    Summarise every entry of *gold_actions*, preserving order.

    Dict entries are passed to ``parse_gold_action``. Any other entry becomes
    a placeholder record with type ``"unknown"``, empty value and candidate
    lists, and the entry's shortened string form (300 characters) as
    ``html_snippet``.
    """

    def _placeholder(entry: Any) -> dict[str, Any]:
        # Same key set as a regular summary, so consumers see a uniform shape.
        return {
            "type": "unknown",
            "value": "",
            "html_snippet": _short_text(entry, 300),
            "pos_candidates": [],
            "neg_candidates": [],
            "raw_operation": {},
        }

    return [
        parse_gold_action(entry) if isinstance(entry, dict) else _placeholder(entry)
        for entry in gold_actions
    ]