from __future__ import annotations import ast import re from typing import Any def _short_text(value: Any, limit: int = 200) -> str: if value is None: return "" text = str(value).strip().replace("\n", " ") return text[:limit] def _extract_text_from_attributes(attr_text: str) -> list[str]: """ Пытаемся вытащить человекочитаемые куски из attributes-строки. """ results: list[str] = [] if not attr_text: return results # пробуем распарсить как dict-строку try: parsed = ast.literal_eval(attr_text) if isinstance(parsed, dict): for key in ("placeholder", "aria_label", "title", "value", "name", "text"): v = parsed.get(key) if isinstance(v, str) and v.strip(): results.append(v.strip()) except Exception: pass # fallback regex patterns = [ r'"placeholder":\s*"([^"]+)"', r'"aria-label":\s*"([^"]+)"', r'"aria_label":\s*"([^"]+)"', r'"title":\s*"([^"]+)"', r'"value":\s*"([^"]+)"', r'"name":\s*"([^"]+)"', r'"text":\s*"([^"]+)"', ] for pattern in patterns: for match in re.findall(pattern, attr_text): if match.strip(): results.append(match.strip()) return results def _extract_candidate_texts(candidates: Any, limit: int = 5) -> list[str]: results: list[str] = [] if not isinstance(candidates, list): return results for item in candidates[:limit]: pieces: list[str] = [] # обычный dict if isinstance(item, dict): for key in ("text", "value", "inner_text", "title", "aria_label", "placeholder"): v = item.get(key) if isinstance(v, str) and v.strip(): pieces.append(v.strip()) # raw attributes attrs = item.get("attributes") if isinstance(attrs, str): pieces.extend(_extract_text_from_attributes(attrs)) # tag бывает полезен как контекст tag = item.get("tag") if isinstance(tag, str) and tag.strip(): pieces.append(tag.strip()) # иногда candidate уже сериализован строкой elif isinstance(item, str): try: parsed = ast.literal_eval(item) if isinstance(parsed, dict): for key in ("text", "value", "inner_text", "title", "aria_label", "placeholder"): v = parsed.get(key) if isinstance(v, str) and v.strip(): pieces.append(v.strip()) attrs = parsed.get("attributes") if isinstance(attrs, str): pieces.extend(_extract_text_from_attributes(attrs)) tag = parsed.get("tag") if isinstance(tag, str) and tag.strip(): pieces.append(tag.strip()) else: pieces.append(_short_text(item, 120)) except Exception: pieces.append(_short_text(item, 120)) else: pieces.append(_short_text(item, 120)) # чистим дубли cleaned = [] seen = set() for p in pieces: p = p.strip() if p and p.lower() not in seen: cleaned.append(p) seen.add(p.lower()) if cleaned: results.append(" | ".join(cleaned)) else: results.append(_short_text(item, 120)) return results def parse_gold_action(action: dict[str, Any]) -> dict[str, Any]: operation = action.get("operation", {}) if not isinstance(operation, dict): operation = {} action_type = ( operation.get("op") or operation.get("operation") or operation.get("type") or "unknown" ) value = operation.get("value", "") cleaned_html = action.get("cleaned_html", "") raw_html = action.get("raw_html", "") pos_candidates = action.get("pos_candidates", []) neg_candidates = action.get("neg_candidates", []) return { "type": str(action_type).strip().lower(), "value": _short_text(value, 200), "html_snippet": _short_text(cleaned_html or raw_html, 300), "pos_candidates": _extract_candidate_texts(pos_candidates, limit=5), "neg_candidates": _extract_candidate_texts(neg_candidates, limit=3), "raw_operation": operation, } def parse_gold_actions(gold_actions: list[dict[str, Any]]) -> list[dict[str, Any]]: parsed: list[dict[str, Any]] = [] for action in gold_actions: if not isinstance(action, dict): parsed.append({ "type": "unknown", "value": "", "html_snippet": _short_text(action, 300), "pos_candidates": [], "neg_candidates": [], "raw_operation": {}, }) continue parsed.append(parse_gold_action(action)) return parsed