Compare commits
1 commit
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
9c3989b3b0 |
1 changed files with 296 additions and 111 deletions
407
api/test-api.py
407
api/test-api.py
|
|
@ -1,6 +1,8 @@
|
|||
import requests
|
||||
import time
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
from datasets import load_dataset
|
||||
from datetime import datetime
|
||||
|
||||
|
|
@ -11,187 +13,370 @@ HEADERS = {"Content-Type": "application/json"}
|
|||
# Загружаем датасет
|
||||
dataset = load_dataset("iMeanAI/Mind2Web-Live", split="train")
|
||||
|
||||
# Для теста берем первые N задач (замените на полный датасет при необходимости)
|
||||
TEST_SIZE = 10 # или len(dataset) для полного бенчмарка
|
||||
TEST_SIZE = 10 # Количество задач для теста
|
||||
dataset = dataset.select(range(TEST_SIZE))
|
||||
|
||||
print(f"Загружено задач: {len(dataset)}")
|
||||
print(f"Поля: {dataset[0].keys()}\n")
|
||||
cnt = 3
|
||||
|
||||
TIMESTAMP = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
RESULTS_FILE = f"mind2web_benchmark_{TIMESTAMP}.json"
|
||||
CHECKPOINT_FILE = f"mind2web_checkpoint_{TIMESTAMP}.json"
|
||||
|
||||
results = []
|
||||
start_idx = 3
|
||||
|
||||
if os.path.exists(CHECKPOINT_FILE):
|
||||
with open(CHECKPOINT_FILE, 'r', encoding='utf-8') as f:
|
||||
checkpoint = json.load(f)
|
||||
results = checkpoint.get('results', [])
|
||||
start_idx = checkpoint.get('last_idx', 0) + 1
|
||||
print(f"🔄 Возобновление с задачи {start_idx + 1}/{len(dataset)}")
|
||||
|
||||
|
||||
def save_checkpoint(results, idx):
    """Persist progress so an interrupted benchmark run can be resumed.

    Serialises ``{'results': results, 'last_idx': idx}`` as JSON into
    ``CHECKPOINT_FILE``.

    Args:
        results: list of per-task result dicts collected so far.
        idx: index of the last dataset item that has been processed.

    The payload is first written to a sibling ``.tmp`` file and then moved
    over the real checkpoint with ``os.replace``. Writing in place would
    leave a truncated, unparsable checkpoint if the process were killed
    mid-write, which would break the very resume path this file exists for.
    """
    tmp_path = f"{CHECKPOINT_FILE}.tmp"
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump({'results': results, 'last_idx': idx}, f)
    # Swap the fully written temp file into place in a single rename step.
    os.replace(tmp_path, CHECKPOINT_FILE)
|
||||
|
||||
|
||||
def save_results(results):
    """Dump all collected task results to ``RESULTS_FILE``.

    The file is overwritten on every call and contains ``results`` as
    indented (2 spaces) UTF-8 JSON with non-ASCII characters kept as-is.
    """
    with open(RESULTS_FILE, mode='w', encoding='utf-8') as out:
        json.dump(results, out, ensure_ascii=False, indent=2)
|
||||
|
||||
|
||||
def _silent(*_args, **_kwargs):
    """No-op stand-in for ``print`` when verbose output is disabled."""


def _text(value):
    """Return *value* as a string, mapping ``None`` to the empty string."""
    return "" if value is None else str(value)


def _collect_actions(history, log):
    """Extract visited URLs, clicks, typed text and selections from history.

    Only steps whose ``kind`` is ``"action"`` and that carry a non-empty dict
    under ``data`` are considered. Returns a 4-tuple
    ``(visited_urls, clicked_elements, typed_text, selected_options)``.
    """
    visited_urls = []
    clicked_elements = []
    typed_text = []
    selected_options = []

    for step in history or []:
        if not isinstance(step, dict) or step.get("kind") != "action":
            continue
        data = step.get("data")
        if not data or not isinstance(data, dict):
            continue
        log(data)

        # URL navigation
        if "navigate" in data:
            nav = data["navigate"]
            url = nav.get("url") if isinstance(nav, dict) else None
            # Skip malformed navigate entries instead of raising KeyError.
            if isinstance(url, str) and url:
                visited_urls.append(url)
                log("посещенный урл")
                log(url)

        # Clicks and other element interactions are only recorded when the
        # step names the element it touched.
        elem = data.get("interacted_element")
        if not elem or not isinstance(elem, dict):
            continue

        elem_info = {
            "xpath": elem.get("x_path"),
            "selector": None,  # could be derived from attributes
            "text": elem.get("ax_name"),
            "node_name": elem.get("node_name"),
            "attributes": elem.get("attributes", {}),
        }
        log(elem_info)

        if "click" in data:
            clicked_elements.append(elem_info)

        if "input" in data:
            payload = data["input"] if isinstance(data["input"], dict) else {}
            typed_text.append({**elem_info, "typed_text": payload.get("text")})
            log("Инпут")
            log(typed_text[-1])

        if "select" in data:
            payload = data["select"] if isinstance(data["select"], dict) else {}
            selected_options.append(
                {**elem_info, "selected_value": payload.get("value")}
            )
            log("селект")
            log(selected_options[-1])

    return visited_urls, clicked_elements, typed_text, selected_options


def _match_node(match_func, content, method, actions, log):
    """Run one evaluation check; return ``(passed, reason)``."""
    visited_urls, clicked_elements, typed_text, selected_options = actions

    if match_func == "url_included_match":
        # Some visited URL must contain reference_answer (case-insensitive).
        expected = _text(content.get("reference_answer")).lower()
        if not expected:
            # "" is a substring of every URL; without this guard a node with
            # no reference answer would pass for any navigation.
            return False, "Не задан reference_answer для проверки URL"
        for url in visited_urls:
            if expected in url.lower():
                reason = f"URL содержит '{expected}': {url}"
                log(reason)
                return True, reason
        reason = f"URL не содержит '{expected}'. Посещенные: {visited_urls}"
        log(reason)
        return False, reason

    if match_func == "url_exactly_match":
        # Some visited URL must equal content["url"] (case-insensitive,
        # query string included).
        expected_url = _text(content.get("url")).lower()
        for url in visited_urls:
            if url.lower() == expected_url:
                reason = f"URL точно совпадает: {url}"
                log(reason)
                return True, reason
        reason = f"URL не совпадает с '{expected_url}'"
        log(reason)
        return False, reason

    if match_func == "element_path_exactly_match":
        # A clicked element must match by XPath or (roughly) by CSS selector.
        expected_path = _text(content.get("reference_answer"))
        target_url = _text(content.get("url"))

        # The agent must have been on the target page at some point.
        if not any(target_url in url for url in visited_urls):
            return False, f"Не перешли на страницу {target_url}"

        for click in clicked_elements:
            if method == "selector":
                # Simplified selector check: look for the selector text
                # inside the stringified attribute dict.
                if expected_path and expected_path in str(click.get("attributes", {})):
                    return True, f"Элемент найден по селектору: {click.get('text')}"
            elif click.get("xpath") == expected_path:
                return True, f"Элемент найден по XPath: {click.get('text')}"
        return False, f"Элемент '{expected_path}' не найден на странице {target_url}"

    if match_func == "element_value_match":
        # The expected value must appear in typed text or a selected option.
        expected_value = _text(content.get("reference_answer"))
        needle = expected_value.lower()
        if needle:
            for typed in typed_text:
                if needle in _text(typed.get("typed_text")).lower():
                    return True, f"Введен текст: '{typed['typed_text']}'"
            for chosen in selected_options:
                if needle in _text(chosen.get("selected_value")).lower():
                    return True, f"Выбрано значение: '{chosen['selected_value']}'"
        return False, f"Значение '{expected_value}' не введено и не выбрано"

    return False, f"Неизвестный тип проверки: {match_func}"


def evaluate_quality(history, evaluation, *, verbose=True):
    """Score an agent run against the key-node checks of a task.

    Args:
        history: iterable of step dicts. Only steps with ``kind == "action"``
            and a dict under ``data`` are used; from ``data`` the keys
            ``navigate``, ``interacted_element``, ``click``, ``input`` and
            ``select`` are read. ``None`` is treated as an empty history.
        evaluation: list of node dicts with the fields
            ``match_function_name`` (check type), ``content`` (check data:
            ``reference_answer`` and/or ``url``) and ``method``
            (``"selector"``; anything else is handled as XPath).
        verbose: when true (default) the intermediate data is printed, as the
            original implementation always did; pass ``False`` to silence it.

    Returns:
        dict with ``completion_rate`` (passed / total, rounded to 3 places),
        ``task_success`` (all nodes passed), ``passed_nodes``,
        ``total_nodes`` and ``details`` (one entry per node with
        ``node_index``, ``match_function``, ``expected``, ``passed`` and
        ``reason``). When ``evaluation`` is empty or not a list, the rates are
        ``None`` and an ``error`` key is added.
    """
    if not evaluation or not isinstance(evaluation, list):
        return {
            "completion_rate": None,
            "task_success": None,
            "passed_nodes": 0,
            "total_nodes": 0,
            "details": [],
            "error": "No evaluation data",
        }

    log = print if verbose else _silent
    actions = _collect_actions(history, log)

    passed_nodes = 0
    details = []

    for i, node in enumerate(evaluation):
        match_func = node.get("match_function_name", "")
        # `content` may be present but null; treat that like "no data".
        content = node.get("content") or {}
        method = node.get("method", "")
        log(node)

        passed, reason = _match_node(match_func, content, method, actions, log)
        if passed:
            passed_nodes += 1

        details.append({
            "node_index": i,
            "match_function": match_func,
            "expected": content.get("reference_answer") or content.get("url"),
            "passed": passed,
            "reason": reason,
        })

    total_nodes = len(evaluation)
    completion_rate = passed_nodes / total_nodes if total_nodes > 0 else 0

    return {
        "completion_rate": round(completion_rate, 3),
        "task_success": passed_nodes == total_nodes,
        "passed_nodes": passed_nodes,
        "total_nodes": total_nodes,
        "details": details,
    }
|
||||
|
||||
|
||||
# Основной цикл
|
||||
cnt = 0 # Пропустить первые N задач
|
||||
|
||||
for idx, item in enumerate(dataset):
|
||||
if cnt > 0:
|
||||
cnt -=1
|
||||
if idx < start_idx:
|
||||
continue
|
||||
if cnt > 0:
|
||||
cnt -= 1
|
||||
continue
|
||||
# Поля из датасета
|
||||
task_desc = item['task'] # Описание задачи
|
||||
ref_length = item['reference_task_length'] # Эталонная длина в шагах
|
||||
evaluation = item['evaluation'] # Критерии оценки
|
||||
|
||||
# ID задачи (используем index + timestamp для уникальности)
|
||||
task_id_orig = f"mind2web_{idx}_{int(time.time())}"
|
||||
|
||||
task_desc = item['task']
|
||||
ref_length = item.get('reference_task_length', 0)
|
||||
evaluation = item.get('evaluation', [])
|
||||
print(evaluation)
|
||||
print(f"\n[{idx + 1}/{len(dataset)}] Task: {task_desc[:70]}...")
|
||||
print(f" Эталонная длина: {ref_length} шагов")
|
||||
print(f" Ключевых узлов: {len(evaluation) if isinstance(evaluation, list) else 'N/A'}")
|
||||
|
||||
start_time = time.time()
|
||||
|
||||
# 1. Создаем задачу через API
|
||||
try:
|
||||
# 1. Создаем задачу
|
||||
resp = requests.post(
|
||||
API_URL,
|
||||
json={
|
||||
"task": task_desc,
|
||||
"timeout": 300, # Увеличим таймаут для сложных задач
|
||||
"metadata": {
|
||||
"source": "mind2web",
|
||||
"reference_length": ref_length
|
||||
}
|
||||
"timeout": 300,
|
||||
"metadata": {"source": "mind2web", "reference_length": ref_length}
|
||||
},
|
||||
headers=HEADERS,
|
||||
timeout=10
|
||||
)
|
||||
|
||||
if resp.status_code != 202:
|
||||
print(f" ❌ Ошибка создания задачи: {resp.status_code}")
|
||||
print(f" Ответ: {resp.text}")
|
||||
print(f" ❌ Ошибка создания: {resp.status_code}")
|
||||
result = {
|
||||
"index": idx,
|
||||
"task_description": task_desc,
|
||||
"status": "creation_failed",
|
||||
"error": f"HTTP {resp.status_code}",
|
||||
"total_time_sec": 0,
|
||||
"timestamp": datetime.now().isoformat()
|
||||
}
|
||||
results.append(result)
|
||||
save_results(results)
|
||||
save_checkpoint(results, idx)
|
||||
continue
|
||||
|
||||
api_task_id = resp.json()["task_id"]
|
||||
created_at = time.time()
|
||||
queue_time = created_at - start_time
|
||||
|
||||
queue_time = time.time() - start_time
|
||||
print(f" 📝 Task ID: {api_task_id} | Очередь: {queue_time:.2f}с")
|
||||
|
||||
# 2. Ожидание завершения с прогрессом
|
||||
# 2. Ожидание завершения
|
||||
status = "queued"
|
||||
poll_count = 0
|
||||
while status in ["queued", "running"]:
|
||||
time.sleep(2) # Интервал опроса
|
||||
time.sleep(2)
|
||||
poll_count += 1
|
||||
|
||||
try:
|
||||
status_resp = requests.get(f"{API_URL}/{api_task_id}", timeout=5)
|
||||
if status_resp.status_code == 200:
|
||||
status_data = status_resp.json()
|
||||
status = status_data.get("status", "unknown")
|
||||
|
||||
# Показываем прогресс каждые 5 опросов
|
||||
status = status_resp.json().get("status", "unknown")
|
||||
if poll_count % 5 == 0:
|
||||
elapsed = time.time() - start_time
|
||||
print(f" ⏳ Статус: {status} | Прошло: {elapsed:.1f}с")
|
||||
except Exception as e:
|
||||
print(f" ⚠️ Ошибка опроса: {e}")
|
||||
except:
|
||||
pass
|
||||
|
||||
end_time = time.time()
|
||||
execution_time = end_time - start_time
|
||||
total_time = time.time() - start_time
|
||||
|
||||
# 3. Получение результата
|
||||
result_resp = requests.get(f"{API_URL}/{api_task_id}/result", timeout=10)
|
||||
result_data = result_resp.json() if result_resp.status_code == 200 else None
|
||||
|
||||
result_data = None
|
||||
if result_resp.status_code == 200:
|
||||
try:
|
||||
result_data = result_resp.json()
|
||||
except:
|
||||
result_data = result_resp.text
|
||||
# 4. Получение истории (НОВОЕ!)
|
||||
history_resp = requests.get(f"{API_URL}/{api_task_id}/history", timeout=10)
|
||||
history_data = history_resp.json() if history_resp.status_code == 200 else None
|
||||
|
||||
# 4. Запись метрик
|
||||
# 5. Оценка качества
|
||||
quality = {}
|
||||
if history_data and history_data.get("history"):
|
||||
history = history_data["history"]
|
||||
print(history)
|
||||
quality = evaluate_quality(history, evaluation)
|
||||
print(
|
||||
f" 📊 Качество: CR={quality.get('completion_rate', 0):.1%} ({quality.get('passed_nodes', 0)}/{quality.get('total_nodes', 0)} узлов)")
|
||||
else:
|
||||
quality = {"error": "No history available"}
|
||||
|
||||
# 6. Сохранение результатов
|
||||
result = {
|
||||
"index": idx,
|
||||
"original_task_id": task_id_orig,
|
||||
"api_task_id": api_task_id,
|
||||
"task_description": task_desc,
|
||||
"reference_length": ref_length,
|
||||
"status": status,
|
||||
"queue_time_sec": round(queue_time, 2),
|
||||
"execution_time_sec": round(execution_time, 2),
|
||||
"total_time_sec": round(end_time - start_time, 2),
|
||||
"total_time_sec": round(total_time, 2),
|
||||
"result": result_data,
|
||||
"quality": quality, # НОВОЕ ПОЛЕ
|
||||
"timestamp": datetime.now().isoformat()
|
||||
}
|
||||
results.append(result)
|
||||
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
filename = f"mind2web_benchmark.json"
|
||||
with open(filename, "w", encoding="utf-8") as f:
|
||||
json.dump(results, f, indent=2, ensure_ascii=False)
|
||||
# Эмодзи статуса
|
||||
status_emoji = "✅" if status == "succeeded" else "❌"
|
||||
print(f" {status_emoji} Статус: {status} | Время: {execution_time:.1f}с")
|
||||
|
||||
except requests.exceptions.Timeout:
|
||||
print(f" ❌ Таймаут при создании задачи")
|
||||
save_results(results)
|
||||
save_checkpoint(results, idx)
|
||||
|
||||
status_emoji = "✅" if status == "succeeded" else "❌"
|
||||
print(f" {status_emoji} Статус: {status} | Время: {total_time:.1f}с")
|
||||
|
||||
except Exception as e:
|
||||
print(f" ❌ Ошибка: {type(e).__name__}: {e}")
|
||||
continue
|
||||
result = {
|
||||
"index": idx,
|
||||
"task_description": task_desc,
|
||||
"status": "exception",
|
||||
"error": str(e),
|
||||
"total_time_sec": 0,
|
||||
"timestamp": datetime.now().isoformat()
|
||||
}
|
||||
results.append(result)
|
||||
save_results(results)
|
||||
save_checkpoint(results, idx)
|
||||
|
||||
# Сохранение детальных результатов
|
||||
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
filename = f"mind2web_benchmark_{timestamp}.json"
|
||||
|
||||
with open(filename, "w", encoding="utf-8") as f:
|
||||
json.dump(results, f, indent=2, ensure_ascii=False)
|
||||
# Удаляем чекпоинт
|
||||
if os.path.exists(CHECKPOINT_FILE):
|
||||
os.remove(CHECKPOINT_FILE)
|
||||
|
||||
# Финальная статистика
|
||||
print("\n" + "=" * 60)
|
||||
print("📊 ИТОГОВЫЕ МЕТРИКИ СКОРОСТИ")
|
||||
print("📊 ИТОГОВЫЕ МЕТРИКИ")
|
||||
print("=" * 60)
|
||||
|
||||
# Статистика по статусам
|
||||
completed = [r for r in results if r["status"] == "completed"]
|
||||
failed = [r for r in results if r["status"] == "failed"]
|
||||
unknown = [r for r in results if r["status"] not in ["completed", "failed"]]
|
||||
succeeded = [r for r in results if r.get("status") == "succeeded"]
|
||||
failed = [r for r in results if r.get("status") == "failed"]
|
||||
other = [r for r in results if r.get("status") not in ["succeeded", "failed"]]
|
||||
|
||||
print(f"\n📈 СТАТУСЫ:")
|
||||
print(f" Всего задач: {len(results)}")
|
||||
print(f" ✅ Успешно: {len(completed)} ({len(completed) / max(len(results), 1) * 100:.1f}%)")
|
||||
print(f" ❌ Провалено: {len(failed)} ({len(failed) / max(len(results), 1) * 100:.1f}%)")
|
||||
if unknown:
|
||||
print(f" ❓ Неизвестный статус: {len(unknown)}")
|
||||
print(f" ✅ Успешно (succeeded): {len(succeeded)} ({len(succeeded) / max(len(results), 1) * 100:.1f}%)")
|
||||
print(f" ❌ Провалено (failed): {len(failed)} ({len(failed) / max(len(results), 1) * 100:.1f}%)")
|
||||
|
||||
if completed:
|
||||
total_times = [r["total_time_sec"] for r in completed]
|
||||
queue_times = [r["queue_time_sec"] for r in completed]
|
||||
exec_times = [r["execution_time_sec"] for r in completed]
|
||||
# Статистика качества
|
||||
quality_results = [r for r in results if r.get("quality", {}).get("completion_rate") is not None]
|
||||
if quality_results:
|
||||
cr_values = [r["quality"]["completion_rate"] for r in quality_results]
|
||||
success_values = [r["quality"]["task_success"] for r in quality_results]
|
||||
|
||||
print(f"\n🎯 КАЧЕСТВО:")
|
||||
print(f" Средний Completion Rate (CR): {sum(cr_values) / len(cr_values):.1%}")
|
||||
print(f" Task Success Rate (SR): {sum(success_values) / len(success_values):.1%}")
|
||||
print(f" Полностью успешных задач: {sum(success_values)}/{len(quality_results)}")
|
||||
|
||||
if succeeded:
|
||||
times = [r["total_time_sec"] for r in succeeded]
|
||||
print(f"\n⏱️ ВРЕМЯ ВЫПОЛНЕНИЯ:")
|
||||
print(f" Среднее: {sum(total_times) / len(total_times):.2f} сек")
|
||||
print(f" Медиана (p50): {sorted(total_times)[len(total_times) // 2]:.2f} сек")
|
||||
if len(total_times) >= 20:
|
||||
print(f" p95: {sorted(total_times)[int(len(total_times) * 0.95)]:.2f} сек")
|
||||
print(f" Мин: {min(total_times):.2f} сек")
|
||||
print(f" Макс: {max(total_times):.2f} сек")
|
||||
print(f" Среднее: {sum(times) / len(times):.2f} сек")
|
||||
print(f" Медиана: {sorted(times)[len(times) // 2]:.2f} сек")
|
||||
print(f" Скорость: {3600 / (sum(times) / len(times)):.1f} задач/час")
|
||||
|
||||
print(f"\n📊 ПРОИЗВОДИТЕЛЬНОСТЬ:")
|
||||
print(f" Среднее время в очереди: {sum(queue_times) / len(queue_times):.2f} сек")
|
||||
tasks_per_hour = 3600 / (sum(total_times) / len(total_times))
|
||||
print(f" Скорость выполнения: {tasks_per_hour:.1f} задач/час")
|
||||
|
||||
# Эффективность относительно эталонной длины
|
||||
if all("reference_length" in r for r in completed):
|
||||
avg_ref_length = sum(r["reference_length"] for r in completed) / len(completed)
|
||||
time_per_step = (sum(total_times) / len(total_times)) / avg_ref_length
|
||||
print(f" Среднее время на шаг: {time_per_step:.2f} сек")
|
||||
|
||||
print(f"\n💾 Результаты сохранены в: {filename}")
|
||||
|
||||
# Создание краткого отчета для сравнения
|
||||
summary = {
|
||||
"benchmark": "Online-Mind2Web",
|
||||
"timestamp": timestamp,
|
||||
"api_endpoint": API_URL,
|
||||
"total_tasks": len(results),
|
||||
"completed": len(completed),
|
||||
"failed": len(failed),
|
||||
"success_rate": len(completed) / max(len(results), 1) * 100,
|
||||
"avg_time_sec": sum(total_times) / len(total_times) if completed else None,
|
||||
"median_time_sec": sorted(total_times)[len(total_times) // 2] if completed else None,
|
||||
"tasks_per_hour": 3600 / (sum(total_times) / len(total_times)) if completed else None
|
||||
}
|
||||
|
||||
summary_file = f"mind2web_summary_{timestamp}.json"
|
||||
with open(summary_file, "w", encoding="utf-8") as f:
|
||||
json.dump(summary, f, indent=2, ensure_ascii=False)
|
||||
|
||||
print(f"📋 Краткий отчет сохранен в: {summary_file}")
|
||||
print(f"\n💾 Результаты сохранены в: {RESULTS_FILE}")
|
||||
Loading…
Add table
Add a link
Reference in a new issue