mind2web
This commit is contained in:
parent
2b5d923f63
commit
98d5e90894
754 changed files with 1175740 additions and 142424 deletions
167
Mind2Web/eval_v2/gold_parser.py
Normal file
167
Mind2Web/eval_v2/gold_parser.py
Normal file
|
|
@ -0,0 +1,167 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import ast
|
||||
import re
|
||||
from typing import Any
|
||||
|
||||
|
||||
def _short_text(value: Any, limit: int = 200) -> str:
|
||||
if value is None:
|
||||
return ""
|
||||
text = str(value).strip().replace("\n", " ")
|
||||
return text[:limit]
|
||||
|
||||
|
||||
def _extract_text_from_attributes(attr_text: str) -> list[str]:
|
||||
"""
|
||||
Пытаемся вытащить человекочитаемые куски из attributes-строки.
|
||||
"""
|
||||
results: list[str] = []
|
||||
|
||||
if not attr_text:
|
||||
return results
|
||||
|
||||
# пробуем распарсить как dict-строку
|
||||
try:
|
||||
parsed = ast.literal_eval(attr_text)
|
||||
if isinstance(parsed, dict):
|
||||
for key in ("placeholder", "aria_label", "title", "value", "name", "text"):
|
||||
v = parsed.get(key)
|
||||
if isinstance(v, str) and v.strip():
|
||||
results.append(v.strip())
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# fallback regex
|
||||
patterns = [
|
||||
r'"placeholder":\s*"([^"]+)"',
|
||||
r'"aria-label":\s*"([^"]+)"',
|
||||
r'"aria_label":\s*"([^"]+)"',
|
||||
r'"title":\s*"([^"]+)"',
|
||||
r'"value":\s*"([^"]+)"',
|
||||
r'"name":\s*"([^"]+)"',
|
||||
r'"text":\s*"([^"]+)"',
|
||||
]
|
||||
|
||||
for pattern in patterns:
|
||||
for match in re.findall(pattern, attr_text):
|
||||
if match.strip():
|
||||
results.append(match.strip())
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def _extract_candidate_texts(candidates: Any, limit: int = 5) -> list[str]:
|
||||
results: list[str] = []
|
||||
|
||||
if not isinstance(candidates, list):
|
||||
return results
|
||||
|
||||
for item in candidates[:limit]:
|
||||
pieces: list[str] = []
|
||||
|
||||
# обычный dict
|
||||
if isinstance(item, dict):
|
||||
for key in ("text", "value", "inner_text", "title", "aria_label", "placeholder"):
|
||||
v = item.get(key)
|
||||
if isinstance(v, str) and v.strip():
|
||||
pieces.append(v.strip())
|
||||
|
||||
# raw attributes
|
||||
attrs = item.get("attributes")
|
||||
if isinstance(attrs, str):
|
||||
pieces.extend(_extract_text_from_attributes(attrs))
|
||||
|
||||
# tag бывает полезен как контекст
|
||||
tag = item.get("tag")
|
||||
if isinstance(tag, str) and tag.strip():
|
||||
pieces.append(tag.strip())
|
||||
|
||||
# иногда candidate уже сериализован строкой
|
||||
elif isinstance(item, str):
|
||||
try:
|
||||
parsed = ast.literal_eval(item)
|
||||
if isinstance(parsed, dict):
|
||||
for key in ("text", "value", "inner_text", "title", "aria_label", "placeholder"):
|
||||
v = parsed.get(key)
|
||||
if isinstance(v, str) and v.strip():
|
||||
pieces.append(v.strip())
|
||||
|
||||
attrs = parsed.get("attributes")
|
||||
if isinstance(attrs, str):
|
||||
pieces.extend(_extract_text_from_attributes(attrs))
|
||||
|
||||
tag = parsed.get("tag")
|
||||
if isinstance(tag, str) and tag.strip():
|
||||
pieces.append(tag.strip())
|
||||
else:
|
||||
pieces.append(_short_text(item, 120))
|
||||
except Exception:
|
||||
pieces.append(_short_text(item, 120))
|
||||
else:
|
||||
pieces.append(_short_text(item, 120))
|
||||
|
||||
# чистим дубли
|
||||
cleaned = []
|
||||
seen = set()
|
||||
for p in pieces:
|
||||
p = p.strip()
|
||||
if p and p.lower() not in seen:
|
||||
cleaned.append(p)
|
||||
seen.add(p.lower())
|
||||
|
||||
if cleaned:
|
||||
results.append(" | ".join(cleaned))
|
||||
else:
|
||||
results.append(_short_text(item, 120))
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def parse_gold_action(action: dict[str, Any]) -> dict[str, Any]:
    """Convert one gold action dict into a compact summary dict.

    The ``"operation"`` entry is used only when it is a dict; otherwise an
    empty dict stands in for it. The action type is the first truthy value
    among the operation's ``"op"``, ``"operation"`` and ``"type"`` entries,
    defaulting to ``"unknown"``.

    Returns:
        A dict with the keys ``type`` (lower-cased, stripped string),
        ``value`` (at most 200 characters), ``html_snippet`` (at most 300
        characters taken from ``cleaned_html``, or ``raw_html`` when that is
        falsy), ``pos_candidates`` (up to 5 entries), ``neg_candidates``
        (up to 3 entries) and ``raw_operation`` (the operation dict itself).
    """
    operation = action.get("operation", {})
    if not isinstance(operation, dict):
        operation = {}

    # First truthy type marker wins; fall back to "unknown".
    action_type = next(
        (operation[key] for key in ("op", "operation", "type") if operation.get(key)),
        "unknown",
    )

    html_source = action.get("cleaned_html", "") or action.get("raw_html", "")

    summary: dict[str, Any] = {}
    summary["type"] = str(action_type).strip().lower()
    summary["value"] = _short_text(operation.get("value", ""), 200)
    summary["html_snippet"] = _short_text(html_source, 300)
    summary["pos_candidates"] = _extract_candidate_texts(
        action.get("pos_candidates", []), limit=5
    )
    summary["neg_candidates"] = _extract_candidate_texts(
        action.get("neg_candidates", []), limit=3
    )
    summary["raw_operation"] = operation
    return summary
|
||||
|
||||
|
||||
def parse_gold_actions(gold_actions: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Summarize every entry of *gold_actions*, preserving order.

    Dict entries are delegated to ``parse_gold_action``. Any other entry is
    turned into a placeholder of type ``"unknown"`` whose ``html_snippet``
    holds the entry's text form, cut to 300 characters.
    """

    def _placeholder(entry: Any) -> dict[str, Any]:
        # Same key set as parse_gold_action's result, with empty defaults.
        return {
            "type": "unknown",
            "value": "",
            "html_snippet": _short_text(entry, 300),
            "pos_candidates": [],
            "neg_candidates": [],
            "raw_operation": {},
        }

    return [
        parse_gold_action(entry) if isinstance(entry, dict) else _placeholder(entry)
        for entry in gold_actions
    ]
|
||||
Loading…
Add table
Add a link
Reference in a new issue