Add tests for the API

2026-04-23 03:24:51 +03:00
1 changed file with 296 additions and 111 deletions
--- a/api/test-api.py
+++ b/api/test-api.py
@@ -1,6 +1,8 @@
 import requests
 import time
 import json
+import os
+import re
 from datasets import load_dataset
 from datetime import datetime

@@ -11,187 +13,370 @@ HEADERS = {"Content-Type": "application/json"}
 # Загружаем датасет
 dataset = load_dataset("iMeanAI/Mind2Web-Live", split="train")

-# Для теста берем первые N задач (замените на полный датасет при необходимости)
-TEST_SIZE = 10  # или len(dataset) для полного бенчмарка
+TEST_SIZE = 10  # Количество задач для теста
 dataset = dataset.select(range(TEST_SIZE))

 print(f"Загружено задач: {len(dataset)}")
 print(f"Поля: {dataset[0].keys()}\n")
-cnt = 3
+
+TIMESTAMP = datetime.now().strftime("%Y%m%d_%H%M%S")
+RESULTS_FILE = f"mind2web_benchmark_{TIMESTAMP}.json"
+CHECKPOINT_FILE = f"mind2web_checkpoint_{TIMESTAMP}.json"
+
 results = []
+start_idx = 3
+
+if os.path.exists(CHECKPOINT_FILE):
+    with open(CHECKPOINT_FILE, 'r', encoding='utf-8') as f:
+        checkpoint = json.load(f)
+        results = checkpoint.get('results', [])
+        start_idx = checkpoint.get('last_idx', 0) + 1
+        print(f"🔄 Возобновление с задачи {start_idx + 1}/{len(dataset)}")
+
+
+def save_checkpoint(results, idx):
+    with open(CHECKPOINT_FILE, 'w', encoding='utf-8') as f:
+        json.dump({'results': results, 'last_idx': idx}, f)
+
+
+def save_results(results):
+    with open(RESULTS_FILE, 'w', encoding='utf-8') as f:
+        json.dump(results, f, indent=2, ensure_ascii=False)
+
+
+def evaluate_quality(history, evaluation):
+    """
+    Оценивает качество выполнения задачи на основе истории агента.
+
+    evaluation: список узлов с полями:
+        - match_function_name: тип проверки
+        - content: данные для проверки
+        - method: метод (selector, xpath и т.д.)
+    """
+    if not evaluation or not isinstance(evaluation, list):
+        return {
+            "completion_rate": None,
+            "task_success": None,
+            "passed_nodes": 0,
+            "total_nodes": 0,
+            "details": [],
+            "error": "No evaluation data"
+        }
+
+    # Собираем данные из истории
+    visited_urls = []
+    clicked_elements = []
+    typed_text = []
+    selected_options = []
+
+    for step in history:
+        if step.get("kind") != "action" or not step.get("data"):
+            continue
+
+        data = step["data"]
+        print(data)
+        # URL навигация
+        if "navigate" in data:
+            visited_urls.append(data["navigate"]["url"])
+            print("посещенный урл")
+            print(visited_urls[-1])
+
+        # Клики и взаимодействия
+        if data.get("interacted_element"):
+            elem = data["interacted_element"]
+            elem_info = {
+                "xpath": elem.get("x_path"),
+                "selector": None,  # Можно сгенерировать из attributes
+                "text": elem.get("ax_name"),
+                "node_name": elem.get("node_name"),
+                "attributes": elem.get("attributes", {})
+            }
+            print(elem_info)
+
+            if "click" in data:
+                clicked_elements.append(elem_info)
+
+            if "input" in data:
+                typed_text.append({
+                    **elem_info,
+                    "typed_text": data["input"].get("text")
+                })
+                print("Инпут")
+                print(typed_text[-1])
+
+            if "select" in data:
+                selected_options.append({
+                    **elem_info,
+                    "selected_value": data["select"].get("value")
+                })
+                print("селект")
+                print(selected_options[-1])
+
+    passed_nodes = 0
+    details = []
+
+    for i, node in enumerate(evaluation):
+        match_func = node.get("match_function_name", "")
+        content = node.get("content", {})
+        method = node.get("method", "")
+        print(node)
+        passed = False
+        reason = ""
+
+        if match_func == "url_included_match":
+            # URL должен содержать reference_answer
+            expected = content.get("reference_answer", "").lower()
+            target_url = content.get("url", "").lower()
+
+            for url in visited_urls:
+                if expected in url.lower():
+                    passed = True
+                    reason = f"URL содержит '{expected}': {url}"
+                    print(reason)
+                    break
+            if not passed:
+                reason = f"URL не содержит '{expected}'. Посещенные: {visited_urls}"
+                print(reason)
+
+        elif match_func == "url_exactly_match":
+            # URL должен точно совпадать (с учетом параметров)
+            expected_url = content.get("url", "").lower()
+
+            for url in visited_urls:
+                if url.lower() == expected_url:
+                    passed = True
+                    reason = f"URL точно совпадает: {url}"
+                    print(reason)
+                    break
+            if not passed:
+                reason = f"URL не совпадает с '{expected_url}'"
+                print(reason)
+
+        elif match_func == "element_path_exactly_match":
+            # Проверка элемента по XPath или CSS селектору
+            expected_path = content.get("reference_answer", "")
+            target_url = content.get("url", "")
+
+            # Проверяем, были ли мы на нужной странице
+            correct_page = any(target_url in url for url in visited_urls)
+
+            if correct_page:
+                # Ищем элемент по XPath или селектору
+                for click in clicked_elements:
+                    if method == "selector":
+                        # CSS селектор - нужно проверить
+                        # Пока упрощенно: ищем по части селектора
+                        if expected_path in str(click.get("attributes", {})):
+                            passed = True
+                            reason = f"Элемент найден по селектору: {click.get('text')}"
+                            break
+                    else:
+                        # XPath
+                        if click.get("xpath") == expected_path:
+                            passed = True
+                            reason = f"Элемент найден по XPath: {click.get('text')}"
+                            break
+
+                if not passed:
+                    reason = f"Элемент '{expected_path}' не найден на странице {target_url}"
+            else:
+                reason = f"Не перешли на страницу {target_url}"
+
+        elif match_func == "element_value_match":
+            # Проверка значения элемента (для input/select)
+            expected_value = content.get("reference_answer", "")
+
+            for typed in typed_text:
+                if expected_value.lower() in (typed.get("typed_text") or "").lower():
+                    passed = True
+                    reason = f"Введен текст: '{typed['typed_text']}'"
+                    break
+
+        else:
+            reason = f"Неизвестный тип проверки: {match_func}"
+
+        if passed:
+            passed_nodes += 1
+
+        details.append({
+            "node_index": i,
+            "match_function": match_func,
+            "expected": content.get("reference_answer") or content.get("url"),
+            "passed": passed,
+            "reason": reason
+        })
+
+    total_nodes = len(evaluation)
+    completion_rate = passed_nodes / total_nodes if total_nodes > 0 else 0
+    task_success = (passed_nodes == total_nodes)
+
+    return {
+        "completion_rate": round(completion_rate, 3),
+        "task_success": task_success,
+        "passed_nodes": passed_nodes,
+        "total_nodes": total_nodes,
+        "details": details
+    }
+
+
+# Основной цикл
+cnt = 0  # Пропустить первые N задач

 for idx, item in enumerate(dataset):
-    if cnt > 0:
-        cnt -=1
+    if idx < start_idx:
+        continue
+    if cnt > 0:
+        cnt -= 1
        continue
-    # Поля из датасета
-    task_desc = item['task']  # Описание задачи
-    ref_length = item['reference_task_length']  # Эталонная длина в шагах
-    evaluation = item['evaluation']  # Критерии оценки
-
-    # ID задачи (используем index + timestamp для уникальности)
-    task_id_orig = f"mind2web_{idx}_{int(time.time())}"

+    task_desc = item['task']
+    ref_length = item.get('reference_task_length', 0)
+    evaluation = item.get('evaluation', [])
+    print(evaluation)
    print(f"\n[{idx + 1}/{len(dataset)}] Task: {task_desc[:70]}...")
    print(f"  Эталонная длина: {ref_length} шагов")
+    print(f"  Ключевых узлов: {len(evaluation) if isinstance(evaluation, list) else 'N/A'}")

    start_time = time.time()

-    # 1. Создаем задачу через API
    try:
+        # 1. Создаем задачу
        resp = requests.post(
            API_URL,
            json={
                "task": task_desc,
-                "timeout": 300,  # Увеличим таймаут для сложных задач
-                "metadata": {
-                    "source": "mind2web",
-                    "reference_length": ref_length
-                }
+                "timeout": 300,
+                "metadata": {"source": "mind2web", "reference_length": ref_length}
            },
            headers=HEADERS,
            timeout=10
        )

        if resp.status_code != 202:
-            print(f"  ❌ Ошибка создания задачи: {resp.status_code}")
-            print(f"     Ответ: {resp.text}")
+            print(f"  ❌ Ошибка создания: {resp.status_code}")
+            result = {
+                "index": idx,
+                "task_description": task_desc,
+                "status": "creation_failed",
+                "error": f"HTTP {resp.status_code}",
+                "total_time_sec": 0,
+                "timestamp": datetime.now().isoformat()
+            }
+            results.append(result)
+            save_results(results)
+            save_checkpoint(results, idx)
            continue

        api_task_id = resp.json()["task_id"]
-        created_at = time.time()
-        queue_time = created_at - start_time
-
+        queue_time = time.time() - start_time
        print(f"  📝 Task ID: {api_task_id} | Очередь: {queue_time:.2f}с")

-        # 2. Ожидание завершения с прогрессом
+        # 2. Ожидание завершения
        status = "queued"
        poll_count = 0
        while status in ["queued", "running"]:
-            time.sleep(2)  # Интервал опроса
+            time.sleep(2)
            poll_count += 1
-
            try:
                status_resp = requests.get(f"{API_URL}/{api_task_id}", timeout=5)
                if status_resp.status_code == 200:
-                    status_data = status_resp.json()
-                    status = status_data.get("status", "unknown")
-
-                    # Показываем прогресс каждые 5 опросов
+                    status = status_resp.json().get("status", "unknown")
                    if poll_count % 5 == 0:
                        elapsed = time.time() - start_time
                        print(f"  ⏳ Статус: {status} | Прошло: {elapsed:.1f}с")
-            except Exception as e:
-                print(f"  ⚠️ Ошибка опроса: {e}")
+            except:
                pass

-        end_time = time.time()
-        execution_time = end_time - start_time
+        total_time = time.time() - start_time

        # 3. Получение результата
        result_resp = requests.get(f"{API_URL}/{api_task_id}/result", timeout=10)
+        result_data = result_resp.json() if result_resp.status_code == 200 else None

-        result_data = None
-        if result_resp.status_code == 200:
-            try:
-                result_data = result_resp.json()
-            except:
-                result_data = result_resp.text
+        # 4. Получение истории (НОВОЕ!)
+        history_resp = requests.get(f"{API_URL}/{api_task_id}/history", timeout=10)
+        history_data = history_resp.json() if history_resp.status_code == 200 else None

-        # 4. Запись метрик
+        # 5. Оценка качества
+        quality = {}
+        if history_data and history_data.get("history"):
+            history = history_data["history"]
+            print(history)
+            quality = evaluate_quality(history, evaluation)
+            print(
+                f"  📊 Качество: CR={quality.get('completion_rate', 0):.1%} ({quality.get('passed_nodes', 0)}/{quality.get('total_nodes', 0)} узлов)")
+        else:
+            quality = {"error": "No history available"}
+
+        # 6. Сохранение результатов
        result = {
            "index": idx,
-            "original_task_id": task_id_orig,
            "api_task_id": api_task_id,
            "task_description": task_desc,
            "reference_length": ref_length,
            "status": status,
            "queue_time_sec": round(queue_time, 2),
-            "execution_time_sec": round(execution_time, 2),
-            "total_time_sec": round(end_time - start_time, 2),
+            "total_time_sec": round(total_time, 2),
            "result": result_data,
+            "quality": quality,  # НОВОЕ ПОЛЕ
            "timestamp": datetime.now().isoformat()
        }
        results.append(result)
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        filename = f"mind2web_benchmark.json"
-        with open(filename, "w", encoding="utf-8") as f:
-            json.dump(results, f, indent=2, ensure_ascii=False)
-        # Эмодзи статуса
-        status_emoji = "✅" if status == "succeeded" else "❌"
-        print(f"  {status_emoji} Статус: {status} | Время: {execution_time:.1f}с")

-    except requests.exceptions.Timeout:
-        print(f"  ❌ Таймаут при создании задачи")
+        save_results(results)
+        save_checkpoint(results, idx)
+
+        status_emoji = "✅" if status == "succeeded" else "❌"
+        print(f"  {status_emoji} Статус: {status} | Время: {total_time:.1f}с")
+
    except Exception as e:
        print(f"  ❌ Ошибка: {type(e).__name__}: {e}")
-        continue
+        result = {
+            "index": idx,
+            "task_description": task_desc,
+            "status": "exception",
+            "error": str(e),
+            "total_time_sec": 0,
+            "timestamp": datetime.now().isoformat()
+        }
+        results.append(result)
+        save_results(results)
+        save_checkpoint(results, idx)

-# Сохранение детальных результатов
-timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-filename = f"mind2web_benchmark_{timestamp}.json"
-
-with open(filename, "w", encoding="utf-8") as f:
-    json.dump(results, f, indent=2, ensure_ascii=False)
+# Удаляем чекпоинт
+if os.path.exists(CHECKPOINT_FILE):
+    os.remove(CHECKPOINT_FILE)

+# Финальная статистика
 print("\n" + "=" * 60)
-print("📊 ИТОГОВЫЕ МЕТРИКИ СКОРОСТИ")
+print("📊 ИТОГОВЫЕ МЕТРИКИ")
 print("=" * 60)

-# Статистика по статусам
-completed = [r for r in results if r["status"] == "completed"]
-failed = [r for r in results if r["status"] == "failed"]
-unknown = [r for r in results if r["status"] not in ["completed", "failed"]]
+succeeded = [r for r in results if r.get("status") == "succeeded"]
+failed = [r for r in results if r.get("status") == "failed"]
+other = [r for r in results if r.get("status") not in ["succeeded", "failed"]]

 print(f"\n📈 СТАТУСЫ:")
 print(f"  Всего задач: {len(results)}")
-print(f"  ✅ Успешно: {len(completed)} ({len(completed) / max(len(results), 1) * 100:.1f}%)")
-print(f"  ❌ Провалено: {len(failed)} ({len(failed) / max(len(results), 1) * 100:.1f}%)")
-if unknown:
-    print(f"  ❓ Неизвестный статус: {len(unknown)}")
+print(f"  ✅ Успешно (succeeded): {len(succeeded)} ({len(succeeded) / max(len(results), 1) * 100:.1f}%)")
+print(f"  ❌ Провалено (failed): {len(failed)} ({len(failed) / max(len(results), 1) * 100:.1f}%)")

-if completed:
-    total_times = [r["total_time_sec"] for r in completed]
-    queue_times = [r["queue_time_sec"] for r in completed]
-    exec_times = [r["execution_time_sec"] for r in completed]
+# Статистика качества
+quality_results = [r for r in results if r.get("quality", {}).get("completion_rate") is not None]
+if quality_results:
+    cr_values = [r["quality"]["completion_rate"] for r in quality_results]
+    success_values = [r["quality"]["task_success"] for r in quality_results]

+    print(f"\n🎯 КАЧЕСТВО:")
+    print(f"  Средний Completion Rate (CR): {sum(cr_values) / len(cr_values):.1%}")
+    print(f"  Task Success Rate (SR): {sum(success_values) / len(success_values):.1%}")
+    print(f"  Полностью успешных задач: {sum(success_values)}/{len(quality_results)}")
+
+if succeeded:
+    times = [r["total_time_sec"] for r in succeeded]
    print(f"\n⏱️ ВРЕМЯ ВЫПОЛНЕНИЯ:")
-    print(f"  Среднее: {sum(total_times) / len(total_times):.2f} сек")
-    print(f"  Медиана (p50): {sorted(total_times)[len(total_times) // 2]:.2f} сек")
-    if len(total_times) >= 20:
-        print(f"  p95: {sorted(total_times)[int(len(total_times) * 0.95)]:.2f} сек")
-    print(f"  Мин: {min(total_times):.2f} сек")
-    print(f"  Макс: {max(total_times):.2f} сек")
+    print(f"  Среднее: {sum(times) / len(times):.2f} сек")
+    print(f"  Медиана: {sorted(times)[len(times) // 2]:.2f} сек")
+    print(f"  Скорость: {3600 / (sum(times) / len(times)):.1f} задач/час")

-    print(f"\n📊 ПРОИЗВОДИТЕЛЬНОСТЬ:")
-    print(f"  Среднее время в очереди: {sum(queue_times) / len(queue_times):.2f} сек")
-    tasks_per_hour = 3600 / (sum(total_times) / len(total_times))
-    print(f"  Скорость выполнения: {tasks_per_hour:.1f} задач/час")
-
-    # Эффективность относительно эталонной длины
-    if all("reference_length" in r for r in completed):
-        avg_ref_length = sum(r["reference_length"] for r in completed) / len(completed)
-        time_per_step = (sum(total_times) / len(total_times)) / avg_ref_length
-        print(f"  Среднее время на шаг: {time_per_step:.2f} сек")
-
-print(f"\n💾 Результаты сохранены в: {filename}")
-
-# Создание краткого отчета для сравнения
-summary = {
-    "benchmark": "Online-Mind2Web",
-    "timestamp": timestamp,
-    "api_endpoint": API_URL,
-    "total_tasks": len(results),
-    "completed": len(completed),
-    "failed": len(failed),
-    "success_rate": len(completed) / max(len(results), 1) * 100,
-    "avg_time_sec": sum(total_times) / len(total_times) if completed else None,
-    "median_time_sec": sorted(total_times)[len(total_times) // 2] if completed else None,
-    "tasks_per_hour": 3600 / (sum(total_times) / len(total_times)) if completed else None
-}
-
-summary_file = f"mind2web_summary_{timestamp}.json"
-with open(summary_file, "w", encoding="utf-8") as f:
-    json.dump(summary, f, indent=2, ensure_ascii=False)
-
-print(f"📋 Краткий отчет сохранен в: {summary_file}")
+print(f"\n💾 Результаты сохранены в: {RESULTS_FILE}")