# Listing metadata (file-viewer residue, not part of the program): 382 lines, No EOL, 14 KiB, Python
import glob
import json
import os
import re
import time
from datetime import datetime

import requests
from datasets import load_dataset

# API configuration
API_URL = "http://localhost:8088/api/browser/tasks"
HEADERS = {"Content-Type": "application/json"}

# Load the dataset
dataset = load_dataset("iMeanAI/Mind2Web-Live", split="train")

TEST_SIZE = 10  # Number of tasks to run in this test
# Clamp to the split size so select() is never asked for rows that do not exist.
dataset = dataset.select(range(min(TEST_SIZE, len(dataset))))

print(f"Загружено задач: {len(dataset)}")
print(f"Поля: {dataset[0].keys()}\n")

TIMESTAMP = datetime.now().strftime("%Y%m%d_%H%M%S")
RESULTS_FILE = f"mind2web_benchmark_{TIMESTAMP}.json"
CHECKPOINT_FILE = f"mind2web_checkpoint_{TIMESTAMP}.json"

results = []
# NOTE(review): a fresh run starts at index 3, i.e. the first three tasks are
# skipped. This looks like a debugging leftover -- confirm before relying on the
# aggregate metrics.
start_idx = 3

# Resume support. The checkpoint name embeds this run's timestamp, so the file
# for the *current* run can never exist at start-up. Look for a checkpoint left
# behind by an earlier, interrupted run instead. The timestamp format sorts
# chronologically as plain text, so the last name is the most recent one.
previous_checkpoints = sorted(glob.glob("mind2web_checkpoint_*.json"))
if previous_checkpoints:
    # Keep writing to (and finally deleting) the checkpoint being resumed.
    CHECKPOINT_FILE = previous_checkpoints[-1]
    try:
        with open(CHECKPOINT_FILE, 'r', encoding='utf-8') as f:
            checkpoint = json.load(f)
        if not isinstance(checkpoint, dict):
            raise ValueError("checkpoint root is not a JSON object")
        results = checkpoint.get('results', [])
        start_idx = checkpoint.get('last_idx', 0) + 1
        print(f"🔄 Возобновление с задачи {start_idx + 1}/{len(dataset)}")
    except (OSError, ValueError) as err:
        # json.JSONDecodeError is a ValueError. A damaged checkpoint must not
        # block the run: start from scratch and let it be overwritten.
        print(f"⚠️ Не удалось прочитать чекпоинт {CHECKPOINT_FILE}: {err}")
def save_checkpoint(results, idx):
    """Persist progress so an interrupted run can be picked up again.

    Writes ``{'results': results, 'last_idx': idx}`` as JSON to the path held
    in the module-level ``CHECKPOINT_FILE``.

    The payload is written to a sibling ``.tmp`` file first and then moved
    over the real path with ``os.replace``. A crash in the middle of the write
    therefore leaves the previous checkpoint intact instead of a truncated,
    unparseable file.

    Args:
        results: JSON-serialisable list of per-task result records.
        idx: Index of the last dataset item that has been processed.
    """
    tmp_path = f"{CHECKPOINT_FILE}.tmp"
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump({'results': results, 'last_idx': idx}, f)
    os.replace(tmp_path, CHECKPOINT_FILE)
def save_results(results):
    """Write the full list of result records to ``RESULTS_FILE`` as JSON.

    The file is pretty-printed (``indent=2``) and keeps non-ASCII characters
    as-is (``ensure_ascii=False``).

    The content is written to a sibling ``.tmp`` file and then moved into
    place with ``os.replace``. The results file is rewritten after every task,
    so this keeps an interruption during the write from destroying everything
    collected so far.

    Args:
        results: JSON-serialisable list of per-task result records.
    """
    tmp_path = f"{RESULTS_FILE}.tmp"
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    os.replace(tmp_path, RESULTS_FILE)
def _field(mapping, key, default=None):
    """Return ``mapping.get(key, default)`` if ``mapping`` is a dict, else ``default``."""
    if isinstance(mapping, dict):
        return mapping.get(key, default)
    return default


def _as_text(value):
    """Coerce a value to ``str``; ``None`` becomes the empty string."""
    return "" if value is None else str(value)


def _collect_actions(history):
    """Gather visited URLs, clicks, typed text and selections from ``history``."""
    visited_urls = []
    clicked_elements = []
    typed_text = []
    selected_options = []

    for step in history or []:
        if not isinstance(step, dict) or step.get("kind") != "action":
            continue
        data = step.get("data")
        if not data or not isinstance(data, dict):
            continue
        print(data)

        # URL navigation
        if "navigate" in data:
            url = _field(data["navigate"], "url")
            # Only keep real strings: every later check calls .lower() on them.
            if isinstance(url, str):
                visited_urls.append(url)
                print("посещенный урл")
                print(url)

        # Clicks and other element interactions
        elem = data.get("interacted_element")
        if not elem or not isinstance(elem, dict):
            continue
        elem_info = {
            "xpath": elem.get("x_path"),
            "selector": None,  # Could be generated from attributes
            "text": elem.get("ax_name"),
            "node_name": elem.get("node_name"),
            "attributes": elem.get("attributes", {}),
        }
        print(elem_info)

        if "click" in data:
            clicked_elements.append(elem_info)

        if "input" in data:
            typed_text.append({
                **elem_info,
                "typed_text": _field(data["input"], "text"),
            })
            print("Инпут")
            print(typed_text[-1])

        if "select" in data:
            selected_options.append({
                **elem_info,
                "selected_value": _field(data["select"], "value"),
            })
            print("селект")
            print(selected_options[-1])

    return {
        "visited_urls": visited_urls,
        "clicked_elements": clicked_elements,
        "typed_text": typed_text,
        "selected_options": selected_options,
    }


def _check_node(match_func, content, method, actions):
    """Evaluate one node against the collected actions; return ``(passed, reason)``."""
    visited_urls = actions["visited_urls"]

    if match_func == "url_included_match":
        # Some visited URL must contain reference_answer (case-insensitive).
        expected = _as_text(content.get("reference_answer")).lower()
        if not expected:
            # "" is a substring of every URL; do not let that count as a pass.
            reason = "Пустой reference_answer: сравнивать не с чем"
            print(reason)
            return False, reason
        for url in visited_urls:
            if expected in url.lower():
                reason = f"URL содержит '{expected}': {url}"
                print(reason)
                return True, reason
        reason = f"URL не содержит '{expected}'. Посещенные: {visited_urls}"
        print(reason)
        return False, reason

    if match_func == "url_exactly_match":
        # Some visited URL must equal the expected one, query string included
        # (case-insensitive).
        expected_url = _as_text(content.get("url")).lower()
        for url in visited_urls:
            if url.lower() == expected_url:
                reason = f"URL точно совпадает: {url}"
                print(reason)
                return True, reason
        reason = f"URL не совпадает с '{expected_url}'"
        print(reason)
        return False, reason

    if match_func == "element_path_exactly_match":
        # A clicked element must match the reference XPath or CSS selector.
        expected_path = _as_text(content.get("reference_answer"))
        target_url = _as_text(content.get("url"))

        # Were we on the right page at all? Compared case-insensitively, the
        # same way the URL checks above compare URLs.
        target_lower = target_url.lower()
        if not any(target_lower in url.lower() for url in visited_urls):
            return False, f"Не перешли на страницу {target_url}"

        for click in actions["clicked_elements"]:
            if method == "selector":
                # Simplified CSS check: look for the selector text inside the
                # stringified attributes. An empty selector never matches.
                if expected_path and expected_path in str(click.get("attributes", {})):
                    return True, f"Элемент найден по селектору: {click.get('text')}"
            elif click.get("xpath") == expected_path:
                return True, f"Элемент найден по XPath: {click.get('text')}"
        return False, f"Элемент '{expected_path}' не найден на странице {target_url}"

    if match_func == "element_value_match":
        # The expected value must appear in something the agent typed or
        # selected (case-insensitive substring).
        expected_value = _as_text(content.get("reference_answer"))
        needle = expected_value.lower()
        if not needle:
            return False, "Пустой reference_answer: сравнивать не с чем"
        for typed in actions["typed_text"]:
            if needle in _as_text(typed.get("typed_text")).lower():
                return True, f"Введен текст: '{typed['typed_text']}'"
        for chosen in actions["selected_options"]:
            if needle in _as_text(chosen.get("selected_value")).lower():
                return True, f"Выбрано значение: '{chosen['selected_value']}'"
        return False, f"Значение '{expected_value}' не введено и не выбрано"

    return False, f"Неизвестный тип проверки: {match_func}"


def evaluate_quality(history, evaluation):
    """Score a task run by checking the agent history against evaluation nodes.

    Only history steps with ``kind == "action"`` and non-empty dict ``data``
    are used. From them the function collects navigated URLs
    (``data["navigate"]["url"]``) and, for steps carrying an
    ``interacted_element``, the clicked / typed-into / selected elements.

    Supported ``match_function_name`` values:
        - ``url_included_match``: a visited URL contains ``reference_answer``.
        - ``url_exactly_match``: a visited URL equals ``content["url"]``.
        - ``element_path_exactly_match``: ``content["url"]`` was visited and a
          clicked element matches ``reference_answer`` (substring of the
          stringified attributes when ``method == "selector"``, otherwise
          exact XPath equality).
        - ``element_value_match``: ``reference_answer`` occurs in a typed text
          or a selected value.
    Any other value fails the node with an "unknown check type" reason.
    String comparisons are case-insensitive except XPath and selector text.
    An empty ``reference_answer`` never satisfies a substring check.

    Args:
        history: Iterable of step dicts with ``kind`` and ``data`` keys.
            ``None`` and malformed steps are tolerated and ignored.
        evaluation: List of node dicts with the fields
            ``match_function_name`` (check type), ``content`` (check data) and
            ``method`` (selector, xpath, ...).

    Returns:
        A dict with ``completion_rate`` (passed / total, rounded to 3 places),
        ``task_success`` (True only if every node passed), ``passed_nodes``,
        ``total_nodes`` and ``details`` (one record per node with
        ``node_index``, ``match_function``, ``expected``, ``passed``,
        ``reason``). When ``evaluation`` is empty or not a list, the rate and
        success are ``None`` and an ``error`` key is included.
    """
    if not evaluation or not isinstance(evaluation, list):
        return {
            "completion_rate": None,
            "task_success": None,
            "passed_nodes": 0,
            "total_nodes": 0,
            "details": [],
            "error": "No evaluation data"
        }

    # Collect what the agent actually did
    actions = _collect_actions(history)

    passed_nodes = 0
    details = []

    for i, node in enumerate(evaluation):
        print(node)
        # Malformed nodes are scored as failures instead of aborting the run.
        if not isinstance(node, dict):
            node = {}
        match_func = node.get("match_function_name", "")
        content = node.get("content")
        if not isinstance(content, dict):
            content = {}
        method = node.get("method", "")

        passed, reason = _check_node(match_func, content, method, actions)
        if passed:
            passed_nodes += 1

        details.append({
            "node_index": i,
            "match_function": match_func,
            "expected": content.get("reference_answer") or content.get("url"),
            "passed": passed,
            "reason": reason
        })

    total_nodes = len(evaluation)
    completion_rate = passed_nodes / total_nodes if total_nodes > 0 else 0
    task_success = (passed_nodes == total_nodes)

    return {
        "completion_rate": round(completion_rate, 3),
        "task_success": task_success,
        "passed_nodes": passed_nodes,
        "total_nodes": total_nodes,
        "details": details
    }
# Main loop
cnt = 0  # Skip the first N tasks (counted from start_idx onwards)

TASK_TIMEOUT_SEC = 300  # Per-task timeout sent to the API
POLL_INTERVAL_SEC = 2  # Pause between status polls
# Client-side polling deadline: the task timeout plus a grace period. Without
# it a task stuck in "queued"/"running" (or an unreachable server, whose errors
# are tolerated below) would keep this loop spinning forever.
POLL_DEADLINE_SEC = TASK_TIMEOUT_SEC + 60


def _record_result(result, idx):
    """Append ``result`` and persist both the results file and the checkpoint."""
    results.append(result)
    save_results(results)
    save_checkpoint(results, idx)


def _fetch_json(url):
    """GET ``url``; return the decoded JSON body, or None unless the reply is HTTP 200."""
    response = requests.get(url, timeout=10)
    return response.json() if response.status_code == 200 else None


for idx, item in enumerate(dataset):
    if idx < start_idx:
        continue
    if cnt > 0:
        cnt -= 1
        continue

    task_desc = item['task']
    ref_length = item.get('reference_task_length', 0)
    evaluation = item.get('evaluation', [])
    print(evaluation)
    print(f"\n[{idx + 1}/{len(dataset)}] Task: {task_desc[:70]}...")
    print(f" Эталонная длина: {ref_length} шагов")
    print(f" Ключевых узлов: {len(evaluation) if isinstance(evaluation, list) else 'N/A'}")

    start_time = time.time()

    try:
        # 1. Create the task
        resp = requests.post(
            API_URL,
            json={
                "task": task_desc,
                "timeout": TASK_TIMEOUT_SEC,
                "metadata": {"source": "mind2web", "reference_length": ref_length}
            },
            headers=HEADERS,
            timeout=10
        )

        if resp.status_code != 202:
            print(f" ❌ Ошибка создания: {resp.status_code}")
            _record_result({
                "index": idx,
                "task_description": task_desc,
                "status": "creation_failed",
                "error": f"HTTP {resp.status_code}",
                "total_time_sec": 0,
                "timestamp": datetime.now().isoformat()
            }, idx)
            continue

        api_task_id = resp.json()["task_id"]
        queue_time = time.time() - start_time
        print(f" 📝 Task ID: {api_task_id} | Очередь: {queue_time:.2f}с")

        # 2. Wait for completion
        status = "queued"
        poll_count = 0
        poll_deadline = time.time() + POLL_DEADLINE_SEC
        while status in ("queued", "running"):
            if time.time() > poll_deadline:
                # Give up waiting; result and history are still fetched below.
                status = "poll_timeout"
                break
            time.sleep(POLL_INTERVAL_SEC)
            poll_count += 1
            try:
                status_resp = requests.get(f"{API_URL}/{api_task_id}", timeout=5)
                if status_resp.status_code == 200:
                    payload = status_resp.json()
                    if isinstance(payload, dict):
                        status = payload.get("status", "unknown")
            except (requests.RequestException, ValueError) as poll_err:
                # A failed poll is treated as transient: keep polling until the
                # deadline. ValueError covers an undecodable JSON body.
                print(f" ⚠️ Ошибка опроса статуса: {type(poll_err).__name__}")
            if poll_count % 5 == 0:
                elapsed = time.time() - start_time
                print(f" ⏳ Статус: {status} | Прошло: {elapsed:.1f}с")

        total_time = time.time() - start_time

        # 3. Fetch the result
        result_data = _fetch_json(f"{API_URL}/{api_task_id}/result")

        # 4. Fetch the history
        history_data = _fetch_json(f"{API_URL}/{api_task_id}/history")

        # 5. Quality evaluation
        quality = {}
        if isinstance(history_data, dict) and history_data.get("history"):
            history = history_data["history"]
            print(history)
            quality = evaluate_quality(history, evaluation)
            # completion_rate is None when the task has no evaluation nodes;
            # formatting None with ".1%" would raise and turn a finished task
            # into an "exception" result.
            cr = quality.get('completion_rate')
            cr_text = f"{cr:.1%}" if cr is not None else "N/A"
            print(
                f" 📊 Качество: CR={cr_text} ({quality.get('passed_nodes', 0)}/{quality.get('total_nodes', 0)} узлов)")
        else:
            quality = {"error": "No history available"}

        # 6. Save the results
        _record_result({
            "index": idx,
            "api_task_id": api_task_id,
            "task_description": task_desc,
            "reference_length": ref_length,
            "status": status,
            "queue_time_sec": round(queue_time, 2),
            "total_time_sec": round(total_time, 2),
            "result": result_data,
            "quality": quality,
            "timestamp": datetime.now().isoformat()
        }, idx)

        status_emoji = "✅" if status == "succeeded" else "❌"
        print(f" {status_emoji} Статус: {status} | Время: {total_time:.1f}с")

    except Exception as e:
        # Per-task boundary: one broken task must not abort the whole benchmark.
        print(f" ❌ Ошибка: {type(e).__name__}: {e}")
        _record_result({
            "index": idx,
            "task_description": task_desc,
            "status": "exception",
            "error": str(e),
            "total_time_sec": 0,
            "timestamp": datetime.now().isoformat()
        }, idx)

# Remove the checkpoint: the run is complete
if os.path.exists(CHECKPOINT_FILE):
    os.remove(CHECKPOINT_FILE)

# Final statistics
print("\n" + "=" * 60)
print("📊 ИТОГОВЫЕ МЕТРИКИ")
print("=" * 60)

succeeded = [r for r in results if r.get("status") == "succeeded"]
failed = [r for r in results if r.get("status") == "failed"]
other = [r for r in results if r.get("status") not in ["succeeded", "failed"]]
total_count = max(len(results), 1)  # avoids division by zero on an empty run

print(f"\n📈 СТАТУСЫ:")
print(f" Всего задач: {len(results)}")
print(f" ✅ Успешно (succeeded): {len(succeeded)} ({len(succeeded) / total_count * 100:.1f}%)")
print(f" ❌ Провалено (failed): {len(failed)} ({len(failed) / total_count * 100:.1f}%)")
if other:
    # creation_failed / exception / poll_timeout etc.; without this line the
    # percentages above do not add up to 100%.
    print(f" ⚠️ Прочие статусы: {len(other)} ({len(other) / total_count * 100:.1f}%)")

# Quality statistics
quality_results = [r for r in results if r.get("quality", {}).get("completion_rate") is not None]
if quality_results:
    cr_values = [r["quality"]["completion_rate"] for r in quality_results]
    success_values = [r["quality"]["task_success"] for r in quality_results]

    print(f"\n🎯 КАЧЕСТВО:")
    print(f" Средний Completion Rate (CR): {sum(cr_values) / len(cr_values):.1%}")
    print(f" Task Success Rate (SR): {sum(success_values) / len(success_values):.1%}")
    print(f" Полностью успешных задач: {sum(success_values)}/{len(quality_results)}")

if succeeded:
    times = sorted(r["total_time_sec"] for r in succeeded)
    mean_time = sum(times) / len(times)
    mid = len(times) // 2
    # True median: average the two middle values when the count is even.
    median_time = times[mid] if len(times) % 2 else (times[mid - 1] + times[mid]) / 2
    print(f"\n⏱️ ВРЕМЯ ВЫПОЛНЕНИЯ:")
    print(f" Среднее: {mean_time:.2f} сек")
    print(f" Медиана: {median_time:.2f} сек")
    if mean_time > 0:
        print(f" Скорость: {3600 / mean_time:.1f} задач/час")

print(f"\n💾 Результаты сохранены в: {RESULTS_FILE}")