167 lines
No EOL
5.1 KiB
Python
167 lines
No EOL
5.1 KiB
Python
from __future__ import annotations
|
|
|
|
import ast
|
|
import re
|
|
from typing import Any
|
|
|
|
|
|
def _short_text(value: Any, limit: int = 200) -> str:
|
|
if value is None:
|
|
return ""
|
|
text = str(value).strip().replace("\n", " ")
|
|
return text[:limit]
|
|
|
|
|
|
def _extract_text_from_attributes(attr_text: str) -> list[str]:
|
|
"""
|
|
Пытаемся вытащить человекочитаемые куски из attributes-строки.
|
|
"""
|
|
results: list[str] = []
|
|
|
|
if not attr_text:
|
|
return results
|
|
|
|
# пробуем распарсить как dict-строку
|
|
try:
|
|
parsed = ast.literal_eval(attr_text)
|
|
if isinstance(parsed, dict):
|
|
for key in ("placeholder", "aria_label", "title", "value", "name", "text"):
|
|
v = parsed.get(key)
|
|
if isinstance(v, str) and v.strip():
|
|
results.append(v.strip())
|
|
except Exception:
|
|
pass
|
|
|
|
# fallback regex
|
|
patterns = [
|
|
r'"placeholder":\s*"([^"]+)"',
|
|
r'"aria-label":\s*"([^"]+)"',
|
|
r'"aria_label":\s*"([^"]+)"',
|
|
r'"title":\s*"([^"]+)"',
|
|
r'"value":\s*"([^"]+)"',
|
|
r'"name":\s*"([^"]+)"',
|
|
r'"text":\s*"([^"]+)"',
|
|
]
|
|
|
|
for pattern in patterns:
|
|
for match in re.findall(pattern, attr_text):
|
|
if match.strip():
|
|
results.append(match.strip())
|
|
|
|
return results
|
|
|
|
|
|
def _extract_candidate_texts(candidates: Any, limit: int = 5) -> list[str]:
|
|
results: list[str] = []
|
|
|
|
if not isinstance(candidates, list):
|
|
return results
|
|
|
|
for item in candidates[:limit]:
|
|
pieces: list[str] = []
|
|
|
|
# обычный dict
|
|
if isinstance(item, dict):
|
|
for key in ("text", "value", "inner_text", "title", "aria_label", "placeholder"):
|
|
v = item.get(key)
|
|
if isinstance(v, str) and v.strip():
|
|
pieces.append(v.strip())
|
|
|
|
# raw attributes
|
|
attrs = item.get("attributes")
|
|
if isinstance(attrs, str):
|
|
pieces.extend(_extract_text_from_attributes(attrs))
|
|
|
|
# tag бывает полезен как контекст
|
|
tag = item.get("tag")
|
|
if isinstance(tag, str) and tag.strip():
|
|
pieces.append(tag.strip())
|
|
|
|
# иногда candidate уже сериализован строкой
|
|
elif isinstance(item, str):
|
|
try:
|
|
parsed = ast.literal_eval(item)
|
|
if isinstance(parsed, dict):
|
|
for key in ("text", "value", "inner_text", "title", "aria_label", "placeholder"):
|
|
v = parsed.get(key)
|
|
if isinstance(v, str) and v.strip():
|
|
pieces.append(v.strip())
|
|
|
|
attrs = parsed.get("attributes")
|
|
if isinstance(attrs, str):
|
|
pieces.extend(_extract_text_from_attributes(attrs))
|
|
|
|
tag = parsed.get("tag")
|
|
if isinstance(tag, str) and tag.strip():
|
|
pieces.append(tag.strip())
|
|
else:
|
|
pieces.append(_short_text(item, 120))
|
|
except Exception:
|
|
pieces.append(_short_text(item, 120))
|
|
else:
|
|
pieces.append(_short_text(item, 120))
|
|
|
|
# чистим дубли
|
|
cleaned = []
|
|
seen = set()
|
|
for p in pieces:
|
|
p = p.strip()
|
|
if p and p.lower() not in seen:
|
|
cleaned.append(p)
|
|
seen.add(p.lower())
|
|
|
|
if cleaned:
|
|
results.append(" | ".join(cleaned))
|
|
else:
|
|
results.append(_short_text(item, 120))
|
|
|
|
return results
|
|
|
|
|
|
def parse_gold_action(action: dict[str, Any]) -> dict[str, Any]:
    """Normalize one gold action dict into a compact summary dict.

    The action type is the first truthy value among the operation's ``op``,
    ``operation`` and ``type`` entries, or ``"unknown"`` when none is set.
    An ``operation`` entry that is not a dict is treated as an empty dict.

    Returns:
        A dict with the keys ``type`` (stripped, lower-cased string),
        ``value`` (at most 200 chars), ``html_snippet`` (at most 300 chars,
        taken from ``cleaned_html`` or else ``raw_html``), ``pos_candidates``
        (first 5), ``neg_candidates`` (first 3) and ``raw_operation``.
    """
    raw_operation = action.get("operation", {})
    operation = raw_operation if isinstance(raw_operation, dict) else {}

    # First truthy entry wins; otherwise the type stays "unknown".
    action_type = "unknown"
    for type_key in ("op", "operation", "type"):
        found = operation.get(type_key)
        if found:
            action_type = found
            break

    html_source = action.get("cleaned_html", "") or action.get("raw_html", "")

    summary: dict[str, Any] = {}
    summary["type"] = str(action_type).strip().lower()
    summary["value"] = _short_text(operation.get("value", ""), 200)
    summary["html_snippet"] = _short_text(html_source, 300)
    summary["pos_candidates"] = _extract_candidate_texts(
        action.get("pos_candidates", []), limit=5
    )
    summary["neg_candidates"] = _extract_candidate_texts(
        action.get("neg_candidates", []), limit=3
    )
    summary["raw_operation"] = operation
    return summary
|
|
|
|
|
|
def parse_gold_actions(gold_actions: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Normalize a sequence of gold actions into summary dicts.

    Args:
        gold_actions: Iterable of action dicts. ``None`` is tolerated and
            treated as "no actions" instead of raising ``TypeError``.

    Returns:
        One summary per input item, in input order. Dict items are passed
        to ``parse_gold_action``. Any other item becomes a placeholder with
        type ``"unknown"`` whose ``html_snippet`` holds the item's text
        (at most 300 chars).
    """
    if gold_actions is None:
        return []

    parsed: list[dict[str, Any]] = []

    for action in gold_actions:
        if isinstance(action, dict):
            parsed.append(parse_gold_action(action))
            continue

        # Malformed entry: keep a placeholder with the same keys so the
        # output stays aligned with the input.
        parsed.append({
            "type": "unknown",
            "value": "",
            "html_snippet": _short_text(action, 300),
            "pos_candidates": [],
            "neg_candidates": [],
            "raw_operation": {},
        })

    return parsed