Add tests for the API

This commit is contained in:
Максим Туревич 2026-04-23 03:24:51 +03:00
parent ff1799cd98
commit 9c3989b3b0

View file

@ -1,6 +1,8 @@
import requests
import time
import json
import os
import re
from datasets import load_dataset
from datetime import datetime
@ -11,187 +13,370 @@ HEADERS = {"Content-Type": "application/json"}
# Загружаем датасет
dataset = load_dataset("iMeanAI/Mind2Web-Live", split="train")
# Для теста берем первые N задач (замените на полный датасет при необходимости)
TEST_SIZE = 10 # или len(dataset) для полного бенчмарка
TEST_SIZE = 10 # Количество задач для теста
dataset = dataset.select(range(TEST_SIZE))
print(f"Загружено задач: {len(dataset)}")
print(f"Поля: {dataset[0].keys()}\n")
cnt = 3
TIMESTAMP = datetime.now().strftime("%Y%m%d_%H%M%S")
RESULTS_FILE = f"mind2web_benchmark_{TIMESTAMP}.json"
CHECKPOINT_FILE = f"mind2web_checkpoint_{TIMESTAMP}.json"
results = []
start_idx = 3
if os.path.exists(CHECKPOINT_FILE):
with open(CHECKPOINT_FILE, 'r', encoding='utf-8') as f:
checkpoint = json.load(f)
results = checkpoint.get('results', [])
start_idx = checkpoint.get('last_idx', 0) + 1
print(f"🔄 Возобновление с задачи {start_idx + 1}/{len(dataset)}")
def save_checkpoint(results, idx):
    """Persist intermediate results plus the index of the last finished task."""
    payload = {'results': results, 'last_idx': idx}
    with open(CHECKPOINT_FILE, 'w', encoding='utf-8') as out:
        json.dump(payload, out)
def save_results(results):
    """Dump the full results list to RESULTS_FILE as pretty-printed JSON."""
    serialized = json.dumps(results, indent=2, ensure_ascii=False)
    with open(RESULTS_FILE, 'w', encoding='utf-8') as out:
        out.write(serialized)
def evaluate_quality(history, evaluation):
    """
    Score a finished task against its Mind2Web evaluation nodes.

    history: list of agent steps; only steps with kind == "action" and a
        non-empty "data" dict are considered.
    evaluation: list of nodes, each with:
        - match_function_name: type of check to perform
        - content: reference data for the check
        - method: locator method ("selector", xpath, ...)

    Returns a dict with completion_rate, task_success, passed/total node
    counts and per-node details; when there is nothing to evaluate the
    rates are None and an "error" key is set.
    """
    if not evaluation or not isinstance(evaluation, list):
        return {
            "completion_rate": None,
            "task_success": None,
            "passed_nodes": 0,
            "total_nodes": 0,
            "details": [],
            "error": "No evaluation data"
        }

    # --- Collect evidence from the agent's action history ---
    visited_urls = []
    clicked_elements = []
    typed_text = []
    selected_options = []
    for step in history:
        if step.get("kind") != "action" or not step.get("data"):
            continue
        data = step["data"]
        # URL navigation
        if "navigate" in data:
            visited_urls.append(data["navigate"]["url"])
        # Clicks / inputs / selects on a concrete element
        if data.get("interacted_element"):
            elem = data["interacted_element"]
            elem_info = {
                "xpath": elem.get("x_path"),
                "selector": None,  # could be derived from attributes later
                "text": elem.get("ax_name"),
                "node_name": elem.get("node_name"),
                "attributes": elem.get("attributes", {})
            }
            if "click" in data:
                clicked_elements.append(elem_info)
            if "input" in data:
                typed_text.append({
                    **elem_info,
                    "typed_text": data["input"].get("text")
                })
            if "select" in data:
                selected_options.append({
                    **elem_info,
                    "selected_value": data["select"].get("value")
                })

    # --- Check every evaluation node against the collected evidence ---
    passed_nodes = 0
    details = []
    for i, node in enumerate(evaluation):
        match_func = node.get("match_function_name", "")
        content = node.get("content", {})
        method = node.get("method", "")
        passed = False
        reason = ""

        if match_func == "url_included_match":
            # Some visited URL must contain the reference answer as a substring.
            # `or ""` guards against an explicit None reference_answer.
            expected = (content.get("reference_answer") or "").lower()
            for url in visited_urls:
                if expected in url.lower():
                    passed = True
                    reason = f"URL содержит '{expected}': {url}"
                    break
            if not passed:
                reason = f"URL не содержит '{expected}'. Посещенные: {visited_urls}"

        elif match_func == "url_exactly_match":
            # Some visited URL must match exactly (query parameters included).
            expected_url = (content.get("url") or "").lower()
            for url in visited_urls:
                if url.lower() == expected_url:
                    passed = True
                    reason = f"URL точно совпадает: {url}"
                    break
            if not passed:
                reason = f"URL не совпадает с '{expected_url}'"

        elif match_func == "element_path_exactly_match":
            # A clicked element, located by XPath or CSS selector, on the
            # expected page.
            expected_path = content.get("reference_answer", "")
            target_url = content.get("url", "")
            # Were we ever on the required page?
            correct_page = any(target_url in url for url in visited_urls)
            if correct_page:
                for click in clicked_elements:
                    if method == "selector":
                        # CSS selector — simplified: look for the selector text
                        # inside the element's attribute dump.
                        if expected_path in str(click.get("attributes", {})):
                            passed = True
                            reason = f"Элемент найден по селектору: {click.get('text')}"
                            break
                    else:
                        # XPath comparison
                        if click.get("xpath") == expected_path:
                            passed = True
                            reason = f"Элемент найден по XPath: {click.get('text')}"
                            break
                if not passed:
                    reason = f"Элемент '{expected_path}' не найден на странице {target_url}"
            else:
                reason = f"Не перешли на страницу {target_url}"

        elif match_func == "element_value_match":
            # Value entered into an input — or chosen in a select — must
            # contain the reference answer (case-insensitive).
            expected_value = content.get("reference_answer") or ""
            for typed in typed_text:
                if expected_value.lower() in (typed.get("typed_text") or "").lower():
                    passed = True
                    reason = f"Введен текст: '{typed['typed_text']}'"
                    break
            if not passed:
                # Previously selected options were collected but never checked.
                for sel in selected_options:
                    if expected_value.lower() in (sel.get("selected_value") or "").lower():
                        passed = True
                        reason = f"Введен текст: '{sel['selected_value']}'"
                        break
            if not passed:
                # Previously `reason` was left empty on failure here.
                reason = f"Значение '{expected_value}' не найдено"

        else:
            reason = f"Неизвестный тип проверки: {match_func}"

        if passed:
            passed_nodes += 1
        details.append({
            "node_index": i,
            "match_function": match_func,
            "expected": content.get("reference_answer") or content.get("url"),
            "passed": passed,
            "reason": reason
        })

    total_nodes = len(evaluation)
    completion_rate = passed_nodes / total_nodes if total_nodes > 0 else 0
    task_success = (passed_nodes == total_nodes)
    return {
        "completion_rate": round(completion_rate, 3),
        "task_success": task_success,
        "passed_nodes": passed_nodes,
        "total_nodes": total_nodes,
        "details": details
    }
# --- Main benchmark loop: one API task per dataset item, with checkpointing ---
cnt = 0  # extra tasks to skip beyond the checkpoint (manual override)
for idx, item in enumerate(dataset):
    if idx < start_idx:
        continue  # already processed in a previous (resumed) run
    if cnt > 0:
        cnt -= 1
        continue

    # Dataset fields
    task_desc = item['task']                            # task description
    ref_length = item.get('reference_task_length', 0)   # reference step count
    evaluation = item.get('evaluation', [])             # scoring criteria
    # Unique id for this run: dataset index + wall-clock seconds
    task_id_orig = f"mind2web_{idx}_{int(time.time())}"

    print(f"\n[{idx + 1}/{len(dataset)}] Task: {task_desc[:70]}...")
    print(f" Эталонная длина: {ref_length} шагов")
    print(f" Ключевых узлов: {len(evaluation) if isinstance(evaluation, list) else 'N/A'}")

    start_time = time.time()
    try:
        # 1. Create the task via the API
        resp = requests.post(
            API_URL,
            json={
                "task": task_desc,
                "timeout": 300,
                "metadata": {"source": "mind2web", "reference_length": ref_length}
            },
            headers=HEADERS,
            timeout=10
        )
        if resp.status_code != 202:
            print(f" ❌ Ошибка создания: {resp.status_code}")
            result = {
                "index": idx,
                "task_description": task_desc,
                "status": "creation_failed",
                "error": f"HTTP {resp.status_code}",
                "total_time_sec": 0,
                "timestamp": datetime.now().isoformat()
            }
            results.append(result)
            save_results(results)
            save_checkpoint(results, idx)
            continue

        api_task_id = resp.json()["task_id"]
        queue_time = time.time() - start_time
        print(f" 📝 Task ID: {api_task_id} | Очередь: {queue_time:.2f}с")

        # 2. Poll until the task leaves the queued/running states
        status = "queued"
        poll_count = 0
        while status in ["queued", "running"]:
            time.sleep(2)
            poll_count += 1
            try:
                status_resp = requests.get(f"{API_URL}/{api_task_id}", timeout=5)
                if status_resp.status_code == 200:
                    status = status_resp.json().get("status", "unknown")
                if poll_count % 5 == 0:
                    elapsed = time.time() - start_time
                    print(f" ⏳ Статус: {status} | Прошло: {elapsed:.1f}с")
            except requests.RequestException:
                # Transient polling error — keep waiting for the next poll
                pass

        total_time = time.time() - start_time

        # 3. Fetch the result payload
        result_resp = requests.get(f"{API_URL}/{api_task_id}/result", timeout=10)
        result_data = None
        if result_resp.status_code == 200:
            try:
                result_data = result_resp.json()
            except ValueError:
                # Body is not JSON — keep the raw text for debugging
                result_data = result_resp.text

        # 4. Fetch the action history
        history_resp = requests.get(f"{API_URL}/{api_task_id}/history", timeout=10)
        history_data = history_resp.json() if history_resp.status_code == 200 else None

        # 5. Score quality against the evaluation nodes
        if history_data and history_data.get("history"):
            quality = evaluate_quality(history_data["history"], evaluation)
            # `or 0`: completion_rate may be None (no evaluation data),
            # which would crash the :.1% format.
            cr = quality.get('completion_rate') or 0
            print(f" 📊 Качество: CR={cr:.1%} ({quality.get('passed_nodes', 0)}/{quality.get('total_nodes', 0)} узлов)")
        else:
            quality = {"error": "No history available"}

        # 6. Record and persist this task's metrics
        result = {
            "index": idx,
            "original_task_id": task_id_orig,
            "api_task_id": api_task_id,
            "task_description": task_desc,
            "reference_length": ref_length,
            "status": status,
            "queue_time_sec": round(queue_time, 2),
            "total_time_sec": round(total_time, 2),
            "result": result_data,
            "quality": quality,
            "timestamp": datetime.now().isoformat()
        }
        results.append(result)
        save_results(results)
        save_checkpoint(results, idx)
        # NOTE(review): the original emoji characters were lost in extraction
        # ("" if ... else ""); restored to match the other status prints.
        status_emoji = "✅" if status == "succeeded" else "❌"
        print(f" {status_emoji} Статус: {status} | Время: {total_time:.1f}с")
    except Exception as e:
        # Top-level per-task boundary: record the failure and move on
        print(f" ❌ Ошибка: {type(e).__name__}: {e}")
        result = {
            "index": idx,
            "task_description": task_desc,
            "status": "exception",
            "error": str(e),
            "total_time_sec": 0,
            "timestamp": datetime.now().isoformat()
        }
        results.append(result)
        save_results(results)
        save_checkpoint(results, idx)
# --- Final save and cleanup ---
# Results were persisted after every task via save_results(); one last save
# keeps everything in the single RESULTS_FILE instead of re-dumping under a
# second, differently-timestamped filename as before.
save_results(results)
# The run finished — the checkpoint is no longer needed.
if os.path.exists(CHECKPOINT_FILE):
    os.remove(CHECKPOINT_FILE)

# --- Summary statistics ---
print("\n" + "=" * 60)
print("📊 ИТОГОВЫЕ МЕТРИКИ")
print("=" * 60)

# Per-status buckets (the API reports "succeeded"/"failed")
succeeded = [r for r in results if r.get("status") == "succeeded"]
failed = [r for r in results if r.get("status") == "failed"]
other = [r for r in results if r.get("status") not in ["succeeded", "failed"]]

print(f"\n📈 СТАТУСЫ:")
print(f" Всего задач: {len(results)}")
print(f" ✅ Успешно (succeeded): {len(succeeded)} ({len(succeeded) / max(len(results), 1) * 100:.1f}%)")
print(f" ❌ Провалено (failed): {len(failed)} ({len(failed) / max(len(results), 1) * 100:.1f}%)")
if other:
    print(f" ❓ Неизвестный статус: {len(other)}")

# Quality metrics — only tasks that actually produced a completion rate
quality_results = [r for r in results if r.get("quality", {}).get("completion_rate") is not None]
if quality_results:
    cr_values = [r["quality"]["completion_rate"] for r in quality_results]
    success_values = [r["quality"]["task_success"] for r in quality_results]
    print(f"\n🎯 КАЧЕСТВО:")
    print(f" Средний Completion Rate (CR): {sum(cr_values) / len(cr_values):.1%}")
    print(f" Task Success Rate (SR): {sum(success_values) / len(success_values):.1%}")
    print(f" Полностью успешных задач: {sum(success_values)}/{len(quality_results)}")

# Timing metrics over successful tasks only
if succeeded:
    times = [r["total_time_sec"] for r in succeeded]
    print(f"\n⏱️ ВРЕМЯ ВЫПОЛНЕНИЯ:")
    print(f" Среднее: {sum(times) / len(times):.2f} сек")
    print(f" Медиана: {sorted(times)[len(times) // 2]:.2f} сек")
    print(f" Скорость: {3600 / (sum(times) / len(times)):.1f} задач/час")

# Short machine-readable report for cross-run comparison.
# (Previously referenced undefined names `completed`/`total_times`/`timestamp`
# and would raise NameError; now built from `succeeded`/`times`/`TIMESTAMP`.)
summary = {
    "benchmark": "Online-Mind2Web",
    "timestamp": TIMESTAMP,
    "api_endpoint": API_URL,
    "total_tasks": len(results),
    "completed": len(succeeded),  # tasks whose API status was "succeeded"
    "failed": len(failed),
    "success_rate": len(succeeded) / max(len(results), 1) * 100,
    "avg_time_sec": sum(times) / len(times) if succeeded else None,
    "median_time_sec": sorted(times)[len(times) // 2] if succeeded else None,
    "tasks_per_hour": 3600 / (sum(times) / len(times)) if succeeded else None
}
summary_file = f"mind2web_summary_{TIMESTAMP}.json"
with open(summary_file, "w", encoding="utf-8") as f:
    json.dump(summary, f, indent=2, ensure_ascii=False)
print(f"📋 Краткий отчет сохранен в: {summary_file}")
print(f"\n💾 Результаты сохранены в: {RESULTS_FILE}")