"""Mind2Web-Live benchmark driver (reconstructed post-patch image of api/test-api.py).

Submits tasks from the iMeanAI/Mind2Web-Live dataset to a local agent API,
polls each task to completion, fetches the agent's action history and scores
it against the dataset's evaluation nodes, checkpointing progress so an
interrupted run can resume.

NOTE(review): this file was recovered from a newline-mangled unified diff;
exact whitespace inside some print strings could not be preserved verbatim.
"""
import requests
import time
import json
import os
import re  # kept from the patch; not referenced in the visible code

from datasets import load_dataset
from datetime import datetime

# NOTE(review): API_URL is defined between the diff hunks and is not visible
# in this chunk — confirm the real endpoint against the original file.
API_URL = "http://localhost:8000/tasks"  # TODO confirm
HEADERS = {"Content-Type": "application/json"}

# Загружаем датасет
dataset = load_dataset("iMeanAI/Mind2Web-Live", split="train")

TEST_SIZE = 10  # Количество задач для теста
dataset = dataset.select(range(TEST_SIZE))

print(f"Загружено задач: {len(dataset)}")
print(f"Поля: {dataset[0].keys()}\n")

TIMESTAMP = datetime.now().strftime("%Y%m%d_%H%M%S")
RESULTS_FILE = f"mind2web_benchmark_{TIMESTAMP}.json"
# FIX: the checkpoint filename previously embedded TIMESTAMP, so a restarted
# process could never find the checkpoint written by the previous run and the
# resume logic below was dead code.  A stable name makes resume actually work.
CHECKPOINT_FILE = "mind2web_checkpoint.json"

MAX_TASK_WAIT_SEC = 600  # FIX: cap polling so a dead API cannot hang the run

results = []
start_idx = 3  # default: skip the first 3 tasks (kept from the patch)

if os.path.exists(CHECKPOINT_FILE):
    with open(CHECKPOINT_FILE, 'r', encoding='utf-8') as f:
        checkpoint = json.load(f)
    results = checkpoint.get('results', [])
    start_idx = checkpoint.get('last_idx', 0) + 1
    print(f"🔄 Возобновление с задачи {start_idx + 1}/{len(dataset)}")


def save_checkpoint(results, idx):
    """Persist accumulated results and the last finished index so a
    restarted run resumes at idx + 1."""
    with open(CHECKPOINT_FILE, 'w', encoding='utf-8') as f:
        json.dump({'results': results, 'last_idx': idx}, f)


def save_results(results):
    """Write all per-task results as pretty-printed, non-ASCII-escaped JSON."""
    with open(RESULTS_FILE, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)


def evaluate_quality(history, evaluation):
    """Score one agent run against the dataset's evaluation nodes.

    Args:
        history: list of agent steps; only steps with kind == "action" and a
            non-empty "data" payload are inspected.
        evaluation: list of check nodes, each with:
            - match_function_name: check type (url_included_match, ...)
            - content: expected data ("reference_answer" and/or "url")
            - method: locator kind for element checks ("selector" or XPath)

    Returns:
        dict with completion_rate (None when there is no evaluation data),
        task_success, passed_nodes, total_nodes and per-node details.
    """
    if not evaluation or not isinstance(evaluation, list):
        return {
            "completion_rate": None,
            "task_success": None,
            "passed_nodes": 0,
            "total_nodes": 0,
            "details": [],
            "error": "No evaluation data"
        }

    # Collect observable facts from the agent history.
    visited_urls = []
    clicked_elements = []
    typed_text = []
    selected_options = []  # collected but not used by any check yet

    for step in history:
        if step.get("kind") != "action" or not step.get("data"):
            continue

        data = step["data"]
        print(data)

        # URL navigation
        if "navigate" in data:
            visited_urls.append(data["navigate"]["url"])
            print("посещенный урл")
            print(visited_urls[-1])

        # Clicks / inputs / selects on a concrete element
        if data.get("interacted_element"):
            elem = data["interacted_element"]
            elem_info = {
                "xpath": elem.get("x_path"),
                "selector": None,  # could be derived from attributes
                "text": elem.get("ax_name"),
                "node_name": elem.get("node_name"),
                "attributes": elem.get("attributes", {})
            }
            print(elem_info)

            if "click" in data:
                clicked_elements.append(elem_info)

            if "input" in data:
                typed_text.append({
                    **elem_info,
                    "typed_text": data["input"].get("text")
                })
                print("Инпут")
                print(typed_text[-1])

            if "select" in data:
                selected_options.append({
                    **elem_info,
                    "selected_value": data["select"].get("value")
                })
                print("селект")
                print(selected_options[-1])

    passed_nodes = 0
    details = []

    for i, node in enumerate(evaluation):
        match_func = node.get("match_function_name", "")
        content = node.get("content", {})
        method = node.get("method", "")
        print(node)
        passed = False
        reason = ""

        if match_func == "url_included_match":
            # Some visited URL must contain reference_answer as a substring.
            expected = content.get("reference_answer", "").lower()

            for url in visited_urls:
                if expected in url.lower():
                    passed = True
                    reason = f"URL содержит '{expected}': {url}"
                    print(reason)
                    break
            if not passed:
                reason = f"URL не содержит '{expected}'. Посещенные: {visited_urls}"
                print(reason)

        elif match_func == "url_exactly_match":
            # Some visited URL must equal the expected URL (case-insensitive).
            expected_url = content.get("url", "").lower()

            for url in visited_urls:
                if url.lower() == expected_url:
                    passed = True
                    reason = f"URL точно совпадает: {url}"
                    print(reason)
                    break
            if not passed:
                reason = f"URL не совпадает с '{expected_url}'"
                print(reason)

        elif match_func == "element_path_exactly_match":
            # The target page must have been visited, and a clicked element
            # must match the expected XPath / CSS selector.
            expected_path = content.get("reference_answer", "")
            target_url = content.get("url", "")

            correct_page = any(target_url in url for url in visited_urls)

            if correct_page:
                for click in clicked_elements:
                    if method == "selector":
                        # Simplified CSS check: look for the selector text
                        # anywhere in the element's recorded attributes.
                        if expected_path in str(click.get("attributes", {})):
                            passed = True
                            reason = f"Элемент найден по селектору: {click.get('text')}"
                            break
                    else:
                        # XPath: exact match against the recorded path.
                        if click.get("xpath") == expected_path:
                            passed = True
                            reason = f"Элемент найден по XPath: {click.get('text')}"
                            break

                if not passed:
                    reason = f"Элемент '{expected_path}' не найден на странице {target_url}"
            else:
                reason = f"Не перешли на страницу {target_url}"

        elif match_func == "element_value_match":
            # Some typed text must contain the expected value (case-insensitive).
            expected_value = content.get("reference_answer", "")

            for typed in typed_text:
                if expected_value.lower() in (typed.get("typed_text") or "").lower():
                    passed = True
                    reason = f"Введен текст: '{typed['typed_text']}'"
                    break

        else:
            reason = f"Неизвестный тип проверки: {match_func}"

        if passed:
            passed_nodes += 1

        details.append({
            "node_index": i,
            "match_function": match_func,
            "expected": content.get("reference_answer") or content.get("url"),
            "passed": passed,
            "reason": reason
        })

    total_nodes = len(evaluation)
    completion_rate = passed_nodes / total_nodes if total_nodes > 0 else 0
    task_success = (passed_nodes == total_nodes)

    return {
        "completion_rate": round(completion_rate, 3),
        "task_success": task_success,
        "passed_nodes": passed_nodes,
        "total_nodes": total_nodes,
        "details": details
    }


# Основной цикл
cnt = 0  # Пропустить первые N задач
for idx, item in enumerate(dataset):
    if idx < start_idx:
        continue
    if cnt > 0:
        cnt -= 1
        continue

    task_desc = item['task']
    ref_length = item.get('reference_task_length', 0)
    evaluation = item.get('evaluation', [])
    print(evaluation)

    print(f"\n[{idx + 1}/{len(dataset)}] Task: {task_desc[:70]}...")
    print(f"   Эталонная длина: {ref_length} шагов")
    print(f"   Ключевых узлов: {len(evaluation) if isinstance(evaluation, list) else 'N/A'}")

    start_time = time.time()

    try:
        # 1. Создаем задачу
        resp = requests.post(
            API_URL,
            json={
                "task": task_desc,
                "timeout": 300,
                "metadata": {"source": "mind2web", "reference_length": ref_length}
            },
            headers=HEADERS,
            timeout=10
        )

        if resp.status_code != 202:
            print(f"   ❌ Ошибка создания: {resp.status_code}")
            result = {
                "index": idx,
                "task_description": task_desc,
                "status": "creation_failed",
                "error": f"HTTP {resp.status_code}",
                "total_time_sec": 0,
                "timestamp": datetime.now().isoformat()
            }
            results.append(result)
            save_results(results)
            save_checkpoint(results, idx)
            continue

        api_task_id = resp.json()["task_id"]
        queue_time = time.time() - start_time
        print(f"   📝 Task ID: {api_task_id} | Очередь: {queue_time:.2f}с")

        # 2. Ожидание завершения
        status = "queued"
        poll_count = 0
        while status in ["queued", "running"]:
            # FIX: without a wall-clock cap this loop spins forever if the
            # API stops responding or never reaches a terminal status.
            if time.time() - start_time > MAX_TASK_WAIT_SEC:
                status = "poll_timeout"
                break
            time.sleep(2)
            poll_count += 1
            try:
                status_resp = requests.get(f"{API_URL}/{api_task_id}", timeout=5)
                if status_resp.status_code == 200:
                    status = status_resp.json().get("status", "unknown")
                    if poll_count % 5 == 0:
                        elapsed = time.time() - start_time
                        print(f"   ⏳ Статус: {status} | Прошло: {elapsed:.1f}с")
            # FIX: was a bare `except:` which also swallows KeyboardInterrupt;
            # ValueError covers a malformed JSON status body.
            except (requests.RequestException, ValueError):
                pass

        total_time = time.time() - start_time

        # 3. Получение результата
        result_resp = requests.get(f"{API_URL}/{api_task_id}/result", timeout=10)
        result_data = result_resp.json() if result_resp.status_code == 200 else None

        # 4. Получение истории
        history_resp = requests.get(f"{API_URL}/{api_task_id}/history", timeout=10)
        history_data = history_resp.json() if history_resp.status_code == 200 else None

        # 5. Оценка качества
        quality = {}
        if history_data and history_data.get("history"):
            history = history_data["history"]
            print(history)
            quality = evaluate_quality(history, evaluation)
            # FIX: completion_rate can be None ("No evaluation data"), and
            # since the key exists, .get('completion_rate', 0) still returns
            # None — formatting None with :.1% raises TypeError.
            cr = quality.get('completion_rate') or 0
            print(
                f"   📊 Качество: CR={cr:.1%} ({quality.get('passed_nodes', 0)}/{quality.get('total_nodes', 0)} узлов)")
        else:
            quality = {"error": "No history available"}

        # 6. Сохранение результатов
        result = {
            "index": idx,
            "api_task_id": api_task_id,
            "task_description": task_desc,
            "reference_length": ref_length,
            "status": status,
            "queue_time_sec": round(queue_time, 2),
            "total_time_sec": round(total_time, 2),
            "result": result_data,
            "quality": quality,
            "timestamp": datetime.now().isoformat()
        }
        results.append(result)
        save_results(results)
        save_checkpoint(results, idx)

        status_emoji = "✅" if status == "succeeded" else "❌"
        print(f"   {status_emoji} Статус: {status} | Время: {total_time:.1f}с")

    except Exception as e:
        # Top-level per-task boundary: record the failure and keep going.
        print(f"   ❌ Ошибка: {type(e).__name__}: {e}")
        result = {
            "index": idx,
            "task_description": task_desc,
            "status": "exception",
            "error": str(e),
            "total_time_sec": 0,
            "timestamp": datetime.now().isoformat()
        }
        results.append(result)
        save_results(results)
        save_checkpoint(results, idx)

# Удаляем чекпоинт — полный проход завершен
if os.path.exists(CHECKPOINT_FILE):
    os.remove(CHECKPOINT_FILE)

# Финальная статистика
print("\n" + "=" * 60)
print("📊 ИТОГОВЫЕ МЕТРИКИ")
print("=" * 60)

succeeded = [r for r in results if r.get("status") == "succeeded"]
failed = [r for r in results if r.get("status") == "failed"]
other = [r for r in results if r.get("status") not in ["succeeded", "failed"]]

print(f"\n📈 СТАТУСЫ:")
print(f"   Всего задач: {len(results)}")
print(f"   ✅ Успешно (succeeded): {len(succeeded)} ({len(succeeded) / max(len(results), 1) * 100:.1f}%)")
print(f"   ❌ Провалено (failed): {len(failed)} ({len(failed) / max(len(results), 1) * 100:.1f}%)")

# Статистика качества
quality_results = [r for r in results if r.get("quality", {}).get("completion_rate") is not None]
if quality_results:
    cr_values = [r["quality"]["completion_rate"] for r in quality_results]
    success_values = [r["quality"]["task_success"] for r in quality_results]
    print(f"\n🎯 КАЧЕСТВО:")
    print(f"   Средний Completion Rate (CR): {sum(cr_values) / len(cr_values):.1%}")
    print(f"   Task Success Rate (SR): {sum(success_values) / len(success_values):.1%}")
    print(f"   Полностью успешных задач: {sum(success_values)}/{len(quality_results)}")

if succeeded:
    times = [r["total_time_sec"] for r in succeeded]
    print(f"\n⏱️ ВРЕМЯ ВЫПОЛНЕНИЯ:")
    print(f"   Среднее: {sum(times) / len(times):.2f} сек")
    print(f"   Медиана: {sorted(times)[len(times) // 2]:.2f} сек")
    print(f"   Скорость: {3600 / (sum(times) / len(times)):.1f} задач/час")

print(f"\n💾 Результаты сохранены в: {RESULTS_FILE}")