"""Benchmark a local browser-agent API on tasks from the Mind2Web-Live dataset.

For every task the script creates a job through the HTTP API, polls it until
it leaves the ``queued``/``running`` states, downloads the result and the
agent's action history, scores the history against the dataset's evaluation
nodes and finally prints aggregate statistics.

Progress is written to a checkpoint file after every task so an interrupted
run can be resumed.
"""
import json
import os
import re  # noqa: F401 - unused here; kept because it was in the original import list
import statistics
import time
from datetime import datetime

# NOTE: ``requests`` and ``datasets`` are imported inside ``main()`` so that the
# pure scoring helpers below can be imported without pulling in the network
# stack or the (slow to import) dataset library.

# API configuration
API_URL = "http://localhost:8088/api/browser/tasks"
HEADERS = {"Content-Type": "application/json"}

TEST_SIZE = 10  # number of dataset tasks used for the benchmark

# NOTE(review): a fresh run starts at index 3, i.e. it skips the first three
# tasks. This is preserved from the original script but looks like a debugging
# leftover - confirm whether it should be 0.
DEFAULT_START_IDX = 3
EXTRA_SKIP = 0  # additionally skip this many tasks after the start index

TASK_TIMEOUT_SEC = 300  # "timeout" value sent to the API with every task
POLL_INTERVAL_SEC = 2
# Client-side limit for the status polling loop. The original loop had no limit
# and would spin forever if the server stopped answering.
# NOTE(review): 2x the task timeout is a guess - adjust to the real queue latency.
POLL_DEADLINE_SEC = 2 * TASK_TIMEOUT_SEC

TIMESTAMP = datetime.now().strftime("%Y%m%d_%H%M%S")
RESULTS_FILE = f"mind2web_benchmark_{TIMESTAMP}.json"
# The checkpoint name must be stable between runs: with a per-run timestamp in
# the name the file written by an interrupted run could never be found again.
CHECKPOINT_FILE = "mind2web_checkpoint.json"


def save_checkpoint(results, idx):
    """Write the results gathered so far and the index of the last processed task."""
    with open(CHECKPOINT_FILE, 'w', encoding='utf-8') as f:
        json.dump({'results': results, 'last_idx': idx}, f)


def save_results(results):
    """Write the full result list to the timestamped results file."""
    with open(RESULTS_FILE, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)


def load_checkpoint(path, default_start_idx=0):
    """Load a checkpoint written by ``save_checkpoint``.

    Args:
        path: checkpoint file to read.
        default_start_idx: start index to use when there is no usable checkpoint.

    Returns:
        ``(results, start_idx, resumed)``. ``resumed`` is True only when a
        checkpoint was found and parsed; otherwise ``results`` is empty and
        ``start_idx`` equals ``default_start_idx``.
    """
    if not os.path.exists(path):
        return [], default_start_idx, False
    try:
        with open(path, 'r', encoding='utf-8') as f:
            checkpoint = json.load(f)
    except (OSError, ValueError) as err:
        # A truncated/corrupt checkpoint must not prevent a fresh run.
        print(f"⚠️ Не удалось прочитать чекпоинт {path}: {err}")
        return [], default_start_idx, False
    if not isinstance(checkpoint, dict):
        print(f"⚠️ Не удалось прочитать чекпоинт {path}: неверный формат")
        return [], default_start_idx, False
    results = checkpoint.get('results', [])
    start_idx = checkpoint.get('last_idx', 0) + 1
    return results, start_idx, True


def _collect_actions(history):
    """Extract visited URLs, clicks, typed text and selected options from the history.

    Only steps with ``kind == "action"`` and a non-empty ``data`` payload are
    considered. Returns a tuple
    ``(visited_urls, clicked_elements, typed_text, selected_options)``.
    """
    visited_urls = []
    clicked_elements = []
    typed_text = []
    selected_options = []

    for step in history:
        if step.get("kind") != "action" or not step.get("data"):
            continue

        data = step["data"]
        print(data)

        # URL navigation
        if "navigate" in data:
            nav_url = (data["navigate"] or {}).get("url")
            # Skip navigation entries without a URL instead of crashing later
            # on ``None.lower()``.
            if nav_url:
                visited_urls.append(nav_url)
                print("посещенный урл")
                print(visited_urls[-1])

        # Clicks and other element interactions
        if data.get("interacted_element"):
            elem = data["interacted_element"]
            elem_info = {
                "xpath": elem.get("x_path"),
                "selector": None,  # could be generated from attributes
                "text": elem.get("ax_name"),
                "node_name": elem.get("node_name"),
                "attributes": elem.get("attributes", {}),
            }
            print(elem_info)

            if "click" in data:
                clicked_elements.append(elem_info)

            if "input" in data:
                typed_text.append({
                    **elem_info,
                    "typed_text": data["input"].get("text"),
                })
                print("Инпут")
                print(typed_text[-1])

            if "select" in data:
                selected_options.append({
                    **elem_info,
                    "selected_value": data["select"].get("value"),
                })
                print("селект")
                print(selected_options[-1])

    return visited_urls, clicked_elements, typed_text, selected_options


def evaluate_quality(history, evaluation):
    """Score an agent run against the task's evaluation nodes.

    Args:
        history: list of step dicts; steps with ``kind == "action"`` carry a
            ``data`` dict describing the action (``navigate``, ``click``,
            ``input``, ``select``, ``interacted_element``).
        evaluation: list of node dicts with the keys
            ``match_function_name`` (type of check), ``content`` (data for the
            check) and ``method`` (``selector``, xpath, ...).

    Returns:
        A dict with ``completion_rate`` (share of passed nodes, rounded to 3
        digits), ``task_success`` (all nodes passed), ``passed_nodes``,
        ``total_nodes`` and per-node ``details``. When ``evaluation`` is empty
        or not a list, ``completion_rate`` and ``task_success`` are ``None``
        and an ``error`` key is present.
    """
    if not evaluation or not isinstance(evaluation, list):
        return {
            "completion_rate": None,
            "task_success": None,
            "passed_nodes": 0,
            "total_nodes": 0,
            "details": [],
            "error": "No evaluation data",
        }

    visited_urls, clicked_elements, typed_text, selected_options = _collect_actions(history)

    passed_nodes = 0
    details = []

    for i, node in enumerate(evaluation):
        match_func = node.get("match_function_name", "")
        # ``or {}`` / ``or ""`` also cover keys that are present but None.
        content = node.get("content") or {}
        method = node.get("method", "")
        print(node)

        passed = False
        reason = ""

        if match_func == "url_included_match":
            # Some visited URL must contain reference_answer
            expected = (content.get("reference_answer") or "").lower()
            for url in visited_urls:
                if expected in url.lower():
                    passed = True
                    reason = f"URL содержит '{expected}': {url}"
                    print(reason)
                    break
            if not passed:
                reason = f"URL не содержит '{expected}'. Посещенные: {visited_urls}"
                print(reason)

        elif match_func == "url_exactly_match":
            # Some visited URL must match exactly (query parameters included)
            expected_url = (content.get("url") or "").lower()
            for url in visited_urls:
                if url.lower() == expected_url:
                    passed = True
                    reason = f"URL точно совпадает: {url}"
                    print(reason)
                    break
            if not passed:
                reason = f"URL не совпадает с '{expected_url}'"
                print(reason)

        elif match_func == "element_path_exactly_match":
            # Check a clicked element by XPath or CSS selector
            expected_path = content.get("reference_answer") or ""
            target_url = content.get("url") or ""

            # The element only counts if the agent reached the target page
            correct_page = any(target_url in url for url in visited_urls)

            if correct_page:
                for click in clicked_elements:
                    if method == "selector":
                        # Simplified CSS check: look for the selector text
                        # inside the stringified attributes.
                        if expected_path in str(click.get("attributes", {})):
                            passed = True
                            reason = f"Элемент найден по селектору: {click.get('text')}"
                            break
                    else:
                        # XPath
                        if click.get("xpath") == expected_path:
                            passed = True
                            reason = f"Элемент найден по XPath: {click.get('text')}"
                            break
                if not passed:
                    reason = f"Элемент '{expected_path}' не найден на странице {target_url}"
            else:
                reason = f"Не перешли на страницу {target_url}"

        elif match_func == "element_value_match":
            # Check an element value - both typed text and selected options
            expected_value = (content.get("reference_answer") or "").lower()
            for typed in typed_text:
                if expected_value in (typed.get("typed_text") or "").lower():
                    passed = True
                    reason = f"Введен текст: '{typed['typed_text']}'"
                    break
            if not passed:
                for chosen in selected_options:
                    if expected_value in str(chosen.get("selected_value") or "").lower():
                        passed = True
                        reason = f"Выбрано значение: '{chosen['selected_value']}'"
                        break
            if not passed:
                reason = f"Значение '{expected_value}' не найдено среди введенных и выбранных"

        else:
            reason = f"Неизвестный тип проверки: {match_func}"

        if passed:
            passed_nodes += 1

        details.append({
            "node_index": i,
            "match_function": match_func,
            "expected": content.get("reference_answer") or content.get("url"),
            "passed": passed,
            "reason": reason,
        })

    total_nodes = len(evaluation)
    completion_rate = passed_nodes / total_nodes if total_nodes > 0 else 0
    task_success = (passed_nodes == total_nodes)

    return {
        "completion_rate": round(completion_rate, 3),
        "task_success": task_success,
        "passed_nodes": passed_nodes,
        "total_nodes": total_nodes,
        "details": details,
    }


def format_quality_summary(quality):
    """Return the one-line quality summary printed after each task.

    ``completion_rate`` may be ``None`` (no evaluation data); it is then shown
    as ``N/A`` instead of raising on the ``%`` format.
    """
    cr = quality.get("completion_rate")
    cr_text = f"{cr:.1%}" if cr is not None else "N/A"
    passed = quality.get('passed_nodes', 0)
    total = quality.get('total_nodes', 0)
    return f" 📊 Качество: CR={cr_text} ({passed}/{total} узлов)"


def print_summary(results):
    """Print status, quality and timing statistics for all collected results."""
    print("\n" + "=" * 60)
    print("📊 ИТОГОВЫЕ МЕТРИКИ")
    print("=" * 60)

    succeeded = [r for r in results if r.get("status") == "succeeded"]
    failed = [r for r in results if r.get("status") == "failed"]
    other = [r for r in results if r.get("status") not in ["succeeded", "failed"]]
    denom = max(len(results), 1)

    print(f"\n📈 СТАТУСЫ:")
    print(f" Всего задач: {len(results)}")
    print(f" ✅ Успешно (succeeded): {len(succeeded)} ({len(succeeded) / denom * 100:.1f}%)")
    print(f" ❌ Провалено (failed): {len(failed)} ({len(failed) / denom * 100:.1f}%)")
    if other:
        # Exceptions, creation failures and poll timeouts would otherwise be invisible.
        print(f" ⚠️ Прочие статусы: {len(other)} ({len(other) / denom * 100:.1f}%)")

    # Quality statistics (only tasks that were actually scored)
    quality_results = [
        r for r in results
        if (r.get("quality") or {}).get("completion_rate") is not None
    ]
    if quality_results:
        cr_values = [r["quality"]["completion_rate"] for r in quality_results]
        success_values = [r["quality"]["task_success"] for r in quality_results]

        print(f"\n🎯 КАЧЕСТВО:")
        print(f" Средний Completion Rate (CR): {sum(cr_values) / len(cr_values):.1%}")
        print(f" Task Success Rate (SR): {sum(success_values) / len(success_values):.1%}")
        print(f" Полностью успешных задач: {sum(success_values)}/{len(quality_results)}")

    if succeeded:
        times = [r["total_time_sec"] for r in succeeded]
        mean_time = sum(times) / len(times)
        print(f"\n⏱️ ВРЕМЯ ВЫПОЛНЕНИЯ:")
        print(f" Среднее: {mean_time:.2f} сек")
        # statistics.median averages the two middle values for even counts.
        print(f" Медиана: {statistics.median(times):.2f} сек")
        if mean_time > 0:
            print(f" Скорость: {3600 / mean_time:.1f} задач/час")

    print(f"\n💾 Результаты сохранены в: {RESULTS_FILE}")


def main():
    """Run the benchmark: load tasks, execute them via the API, score, report."""
    import requests
    from datasets import load_dataset

    # Load the dataset and keep only the first TEST_SIZE tasks
    dataset = load_dataset("iMeanAI/Mind2Web-Live", split="train")
    dataset = dataset.select(range(min(TEST_SIZE, len(dataset))))

    print(f"Загружено задач: {len(dataset)}")
    print(f"Поля: {dataset[0].keys()}\n")

    results, start_idx, resumed = load_checkpoint(CHECKPOINT_FILE, DEFAULT_START_IDX)
    if resumed:
        print(f"🔄 Возобновление с задачи {start_idx + 1}/{len(dataset)}")

    # Main loop
    skip_remaining = EXTRA_SKIP
    for idx, item in enumerate(dataset):
        if idx < start_idx:
            continue
        if skip_remaining > 0:
            skip_remaining -= 1
            continue

        task_desc = item['task']
        ref_length = item.get('reference_task_length', 0)
        evaluation = item.get('evaluation', [])
        print(evaluation)
        print(f"\n[{idx + 1}/{len(dataset)}] Task: {task_desc[:70]}...")
        print(f" Эталонная длина: {ref_length} шагов")
        print(f" Ключевых узлов: {len(evaluation) if isinstance(evaluation, list) else 'N/A'}")

        start_time = time.time()

        # Per-task boundary: any failure is recorded and the run continues.
        try:
            # 1. Create the task
            resp = requests.post(
                API_URL,
                json={
                    "task": task_desc,
                    "timeout": TASK_TIMEOUT_SEC,
                    "metadata": {"source": "mind2web", "reference_length": ref_length},
                },
                headers=HEADERS,
                timeout=10,
            )

            if resp.status_code != 202:
                print(f" ❌ Ошибка создания: {resp.status_code}")
                result = {
                    "index": idx,
                    "task_description": task_desc,
                    "status": "creation_failed",
                    "error": f"HTTP {resp.status_code}",
                    "total_time_sec": 0,
                    "timestamp": datetime.now().isoformat(),
                }
                results.append(result)
                save_results(results)
                save_checkpoint(results, idx)
                continue

            api_task_id = resp.json()["task_id"]
            queue_time = time.time() - start_time
            print(f" 📝 Task ID: {api_task_id} | Очередь: {queue_time:.2f}с")

            # 2. Wait for completion
            status = "queued"
            poll_count = 0
            while status in ["queued", "running"]:
                if time.time() - start_time > POLL_DEADLINE_SEC:
                    print(f" ⏰ Превышено время ожидания статуса ({POLL_DEADLINE_SEC}с)")
                    status = "poll_timeout"
                    break

                time.sleep(POLL_INTERVAL_SEC)
                poll_count += 1

                try:
                    status_resp = requests.get(f"{API_URL}/{api_task_id}", timeout=5)
                    if status_resp.status_code == 200:
                        status = status_resp.json().get("status", "unknown")
                except (requests.RequestException, ValueError) as poll_err:
                    # Transient network/JSON errors: report and keep polling
                    # until the deadline instead of swallowing them silently.
                    print(f" ⚠️ Ошибка опроса статуса: {type(poll_err).__name__}: {poll_err}")
                    continue

                if poll_count % 5 == 0:
                    elapsed = time.time() - start_time
                    print(f" ⏳ Статус: {status} | Прошло: {elapsed:.1f}с")

            total_time = time.time() - start_time

            # 3. Fetch the result
            result_resp = requests.get(f"{API_URL}/{api_task_id}/result", timeout=10)
            result_data = result_resp.json() if result_resp.status_code == 200 else None

            # 4. Fetch the action history
            history_resp = requests.get(f"{API_URL}/{api_task_id}/history", timeout=10)
            history_data = history_resp.json() if history_resp.status_code == 200 else None

            # 5. Score the run
            if history_data and history_data.get("history"):
                history = history_data["history"]
                print(history)
                quality = evaluate_quality(history, evaluation)
                print(format_quality_summary(quality))
            else:
                quality = {"error": "No history available"}

            # 6. Store the result
            result = {
                "index": idx,
                "api_task_id": api_task_id,
                "task_description": task_desc,
                "reference_length": ref_length,
                "status": status,
                "queue_time_sec": round(queue_time, 2),
                "total_time_sec": round(total_time, 2),
                "result": result_data,
                "quality": quality,
                "timestamp": datetime.now().isoformat(),
            }
            results.append(result)
            save_results(results)
            save_checkpoint(results, idx)

            status_emoji = "✅" if status == "succeeded" else "❌"
            print(f" {status_emoji} Статус: {status} | Время: {total_time:.1f}с")

        except Exception as e:
            print(f" ❌ Ошибка: {type(e).__name__}: {e}")
            result = {
                "index": idx,
                "task_description": task_desc,
                "status": "exception",
                "error": str(e),
                "total_time_sec": 0,
                "timestamp": datetime.now().isoformat(),
            }
            results.append(result)
            save_results(results)
            save_checkpoint(results, idx)

    # The run finished normally - the checkpoint is no longer needed
    if os.path.exists(CHECKPOINT_FILE):
        os.remove(CHECKPOINT_FILE)

    # Final statistics
    print_summary(results)


if __name__ == "__main__":
    main()