# Quality_evaluation/Mind2Web/eval_v2/gold_parser.py
# mind2web gold-action parser
# (commit 98d5e90894, Aleksandr Dubchak, 2026-04-23)

from __future__ import annotations
import ast
import re
from typing import Any
def _short_text(value: Any, limit: int = 200) -> str:
if value is None:
return ""
text = str(value).strip().replace("\n", " ")
return text[:limit]
def _extract_text_from_attributes(attr_text: str) -> list[str]:
"""
Пытаемся вытащить человекочитаемые куски из attributes-строки.
"""
results: list[str] = []
if not attr_text:
return results
# пробуем распарсить как dict-строку
try:
parsed = ast.literal_eval(attr_text)
if isinstance(parsed, dict):
for key in ("placeholder", "aria_label", "title", "value", "name", "text"):
v = parsed.get(key)
if isinstance(v, str) and v.strip():
results.append(v.strip())
except Exception:
pass
# fallback regex
patterns = [
r'"placeholder":\s*"([^"]+)"',
r'"aria-label":\s*"([^"]+)"',
r'"aria_label":\s*"([^"]+)"',
r'"title":\s*"([^"]+)"',
r'"value":\s*"([^"]+)"',
r'"name":\s*"([^"]+)"',
r'"text":\s*"([^"]+)"',
]
for pattern in patterns:
for match in re.findall(pattern, attr_text):
if match.strip():
results.append(match.strip())
return results
def _extract_candidate_texts(candidates: Any, limit: int = 5) -> list[str]:
    """Build one readable summary string per candidate, up to *limit* items.

    Each candidate may be a dict, a string that serializes a dict, or
    arbitrary data.  Readable fragments are collected, deduplicated
    case-insensitively (first occurrence wins), and joined with " | ".
    If nothing readable is found, a truncated repr of the item is used.

    Returns an empty list when *candidates* is not a list.
    """
    results: list[str] = []
    if not isinstance(candidates, list):
        return results
    for item in candidates[:limit]:
        pieces = _candidate_pieces(item)
        # Case-insensitive, order-preserving dedup of the fragments.
        cleaned: list[str] = []
        seen: set[str] = set()
        for piece in pieces:
            piece = piece.strip()
            if piece and piece.lower() not in seen:
                cleaned.append(piece)
                seen.add(piece.lower())
        results.append(" | ".join(cleaned) if cleaned else _short_text(item, 120))
    return results


def _candidate_pieces(item: Any) -> list[str]:
    """Collect raw readable fragments from a single candidate of any shape."""
    if isinstance(item, dict):
        return _pieces_from_mapping(item)
    if isinstance(item, str):
        # Sometimes a candidate arrives already serialized as a dict string.
        try:
            parsed = ast.literal_eval(item)
        except Exception:
            return [_short_text(item, 120)]
        if isinstance(parsed, dict):
            return _pieces_from_mapping(parsed)
        return [_short_text(item, 120)]
    return [_short_text(item, 120)]


def _pieces_from_mapping(data: dict) -> list[str]:
    """Pull readable values out of a candidate dict.

    Order: the common text-bearing keys, then fragments mined from the raw
    ``attributes`` blob, then the element ``tag`` (useful as context).
    """
    pieces: list[str] = []
    for key in ("text", "value", "inner_text", "title", "aria_label", "placeholder"):
        v = data.get(key)
        if isinstance(v, str) and v.strip():
            pieces.append(v.strip())
    attrs = data.get("attributes")
    if isinstance(attrs, str):
        pieces.extend(_extract_text_from_attributes(attrs))
    tag = data.get("tag")
    if isinstance(tag, str) and tag.strip():
        pieces.append(tag.strip())
    return pieces
def parse_gold_action(action: dict[str, Any]) -> dict[str, Any]:
    """Normalize one Mind2Web gold action record into a compact summary dict.

    The result has: lowercased ``type``, truncated ``value`` and
    ``html_snippet``, readable summaries of positive/negative candidates,
    and the original operation dict under ``raw_operation``.
    """
    op = action.get("operation", {})
    if not isinstance(op, dict):
        op = {}
    # First non-empty of the keys that may name the operation.
    kind = op.get("op") or op.get("operation") or op.get("type") or "unknown"
    # Prefer the cleaned HTML; fall back to the raw HTML when it is empty.
    html = action.get("cleaned_html", "") or action.get("raw_html", "")
    return {
        "type": str(kind).strip().lower(),
        "value": _short_text(op.get("value", ""), 200),
        "html_snippet": _short_text(html, 300),
        "pos_candidates": _extract_candidate_texts(action.get("pos_candidates", []), limit=5),
        "neg_candidates": _extract_candidate_texts(action.get("neg_candidates", []), limit=3),
        "raw_operation": op,
    }
def parse_gold_actions(gold_actions: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Parse a sequence of gold actions, tolerating malformed entries.

    Non-dict entries produce a stub record whose ``html_snippet`` carries a
    truncated repr of the raw item, so downstream code always sees the same
    schema.
    """

    def _stub(raw: Any) -> dict[str, Any]:
        # Placeholder record for an entry that is not a dict.
        return {
            "type": "unknown",
            "value": "",
            "html_snippet": _short_text(raw, 300),
            "pos_candidates": [],
            "neg_candidates": [],
            "raw_operation": {},
        }

    return [
        parse_gold_action(action) if isinstance(action, dict) else _stub(action)
        for action in gold_actions
    ]