BrowserUse_and_ComputerUse_.../api/test-api.py
Максим Туревич 9c3989b3b0 tests add for api
2026-04-23 03:24:51 +03:00

382 lines
No EOL
14 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import requests
import time
import json
import os
import re
from datasets import load_dataset
from datetime import datetime
# API configuration
API_URL = "http://localhost:8088/api/browser/tasks"
HEADERS = {"Content-Type": "application/json"}

# Load the dataset and keep only the first TEST_SIZE tasks
dataset = load_dataset("iMeanAI/Mind2Web-Live", split="train")
TEST_SIZE = 10  # Number of tasks to benchmark
dataset = dataset.select(range(TEST_SIZE))
print(f"Загружено задач: {len(dataset)}")
print(f"Поля: {dataset[0].keys()}\n")

# Checkpoint file names embed the timestamp of the run that created them, so
# a freshly generated timestamp can never match a checkpoint left behind by
# an interrupted run.  Look for a leftover checkpoint first and, when one
# exists, adopt its timestamp so that this run resumes it and rewrites the
# matching results file.
_CHECKPOINT_PREFIX = "mind2web_checkpoint_"
_CHECKPOINT_SUFFIX = ".json"
_leftover_checkpoints = sorted(
    name
    for name in os.listdir(".")
    if name.startswith(_CHECKPOINT_PREFIX) and name.endswith(_CHECKPOINT_SUFFIX)
)
if _leftover_checkpoints:
    # Timestamps are zero-padded (%Y%m%d_%H%M%S), so the lexicographically
    # last file name belongs to the most recent interrupted run.
    TIMESTAMP = _leftover_checkpoints[-1][
        len(_CHECKPOINT_PREFIX):-len(_CHECKPOINT_SUFFIX)
    ]
else:
    TIMESTAMP = datetime.now().strftime("%Y%m%d_%H%M%S")
RESULTS_FILE = f"mind2web_benchmark_{TIMESTAMP}.json"
CHECKPOINT_FILE = f"{_CHECKPOINT_PREFIX}{TIMESTAMP}{_CHECKPOINT_SUFFIX}"

results = []
# NOTE(review): without a checkpoint the run starts at dataset index 3, i.e.
# the first three tasks are skipped — confirm this is intentional.
start_idx = 3
if os.path.exists(CHECKPOINT_FILE):
    with open(CHECKPOINT_FILE, 'r', encoding='utf-8') as f:
        checkpoint = json.load(f)
    results = checkpoint.get('results', [])
    # Continue with the task after the last one that was recorded.
    start_idx = checkpoint.get('last_idx', 0) + 1
    print(f"🔄 Возобновление с задачи {start_idx + 1}/{len(dataset)}")
def save_checkpoint(results, idx):
    """Persist progress so an interrupted run can be resumed.

    Writes ``{'results': results, 'last_idx': idx}`` as JSON to
    ``CHECKPOINT_FILE``.  The data is first written to a temporary sibling
    file and then moved into place with ``os.replace``, so an interruption
    in the middle of the write cannot leave a truncated checkpoint behind.

    Args:
        results: Accumulated result records (must be JSON-serializable).
        idx: Index of the last processed dataset item.
    """
    tmp_path = f"{CHECKPOINT_FILE}.tmp"
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump({'results': results, 'last_idx': idx}, f)
    os.replace(tmp_path, CHECKPOINT_FILE)
def save_results(results):
    """Overwrite ``RESULTS_FILE`` with the full list of result records.

    The records are stored as UTF-8 JSON, pretty-printed with a 2-space
    indent; non-ASCII characters are written verbatim rather than escaped.

    Args:
        results: Result records to store (must be JSON-serializable).
    """
    with open(RESULTS_FILE, 'w', encoding='utf-8') as out:
        out.write(json.dumps(results, indent=2, ensure_ascii=False))
def _as_text(value):
    """Return *value* as a string, mapping ``None`` to the empty string."""
    return "" if value is None else str(value)


def _collect_actions(history):
    """Gather what the agent did from its step history.

    Only steps whose ``kind`` is ``"action"`` and whose ``data`` is non-empty
    are considered.

    Returns:
        A 4-tuple ``(visited_urls, clicked_elements, typed_text,
        selected_options)``; the last three are lists of element-info dicts.
    """
    visited_urls = []
    clicked_elements = []
    typed_text = []
    selected_options = []
    for step in history or []:
        if step.get("kind") != "action" or not step.get("data"):
            continue
        data = step["data"]
        print(data)
        # URL navigation
        if "navigate" in data:
            navigate = data["navigate"]
            url = navigate.get("url") if isinstance(navigate, dict) else None
            # A navigate action without a string URL cannot be matched later.
            if isinstance(url, str):
                visited_urls.append(url)
                print("посещенный урл")
                print(visited_urls[-1])
        # Clicks and other element interactions
        if data.get("interacted_element"):
            elem = data["interacted_element"]
            elem_info = {
                "xpath": elem.get("x_path"),
                "selector": None,  # Could be generated from attributes
                "text": elem.get("ax_name"),
                "node_name": elem.get("node_name"),
                "attributes": elem.get("attributes", {}),
            }
            print(elem_info)
            if "click" in data:
                clicked_elements.append(elem_info)
            if "input" in data:
                typed_text.append({
                    **elem_info,
                    "typed_text": (data["input"] or {}).get("text"),
                })
                print("Инпут")
                print(typed_text[-1])
            if "select" in data:
                selected_options.append({
                    **elem_info,
                    "selected_value": (data["select"] or {}).get("value"),
                })
                print("селект")
                print(selected_options[-1])
    return visited_urls, clicked_elements, typed_text, selected_options


def _match_node(match_func, content, method, actions):
    """Check one evaluation node against the collected agent actions.

    Returns:
        ``(passed, reason)`` where ``reason`` is a human-readable explanation
        of the verdict.
    """
    visited_urls, clicked_elements, typed_text, selected_options = actions

    if match_func == "url_included_match":
        # Some visited URL must contain reference_answer (case-insensitive).
        expected = _as_text(content.get("reference_answer")).lower()
        for url in visited_urls:
            if expected in url.lower():
                reason = f"URL содержит '{expected}': {url}"
                print(reason)
                return True, reason
        reason = f"URL не содержит '{expected}'. Посещенные: {visited_urls}"
        print(reason)
        return False, reason

    if match_func == "url_exactly_match":
        # Some visited URL must equal the expected one (query included).
        expected_url = _as_text(content.get("url")).lower()
        for url in visited_urls:
            if url.lower() == expected_url:
                reason = f"URL точно совпадает: {url}"
                print(reason)
                return True, reason
        reason = f"URL не совпадает с '{expected_url}'"
        print(reason)
        return False, reason

    if match_func == "element_path_exactly_match":
        # A clicked element must match by XPath or (approximately) selector.
        expected_path = _as_text(content.get("reference_answer"))
        target_url = _as_text(content.get("url"))
        # The element only counts if the agent reached the target page.
        if not any(target_url in url for url in visited_urls):
            return False, f"Не перешли на страницу {target_url}"
        for click in clicked_elements:
            if method == "selector":
                # Simplified CSS check: the selector text must occur in the
                # stringified attributes of the clicked element.
                if expected_path in str(click.get("attributes", {})):
                    return True, f"Элемент найден по селектору: {click.get('text')}"
            elif click.get("xpath") == expected_path:
                return True, f"Элемент найден по XPath: {click.get('text')}"
        return False, f"Элемент '{expected_path}' не найден на странице {target_url}"

    if match_func == "element_value_match":
        # Value check for inputs AND selects: the expected value must occur
        # (case-insensitively) in something the agent typed or selected.
        expected_value = _as_text(content.get("reference_answer")).lower()
        for typed in typed_text:
            if expected_value in _as_text(typed.get("typed_text")).lower():
                return True, f"Введен текст: '{typed['typed_text']}'"
        for option in selected_options:
            if expected_value in _as_text(option.get("selected_value")).lower():
                return True, f"Выбрано значение: '{option['selected_value']}'"
        return False, f"Значение '{expected_value}' не введено и не выбрано"

    # NOTE(review): the dataset may use other match_function_name values
    # (e.g. more specific element_value_* variants); such nodes end up here
    # and count as failed — confirm against the dataset.
    return False, f"Неизвестный тип проверки: {match_func}"


def evaluate_quality(history, evaluation):
    """Score a task run against the dataset's key evaluation nodes.

    Args:
        history: List of agent step dicts. Steps with ``kind == "action"``
            and non-empty ``data`` are inspected for ``navigate``, ``click``,
            ``input`` and ``select`` actions.
        evaluation: List of node dicts with the fields
            ``match_function_name`` (type of check), ``content`` (data for
            the check) and ``method`` (selector, xpath, ...).

    Returns:
        A dict with ``completion_rate`` (passed / total, rounded to 3
        places), ``task_success`` (all nodes passed), ``passed_nodes``,
        ``total_nodes`` and per-node ``details``.  When ``evaluation`` is
        empty or not a list, ``completion_rate`` and ``task_success`` are
        ``None`` and an ``error`` key is present.
    """
    if not evaluation or not isinstance(evaluation, list):
        return {
            "completion_rate": None,
            "task_success": None,
            "passed_nodes": 0,
            "total_nodes": 0,
            "details": [],
            "error": "No evaluation data"
        }

    actions = _collect_actions(history)

    passed_nodes = 0
    details = []
    for i, node in enumerate(evaluation):
        match_func = node.get("match_function_name", "")
        # ``content`` may be missing or null; treat both as "no data".
        content = node.get("content") or {}
        method = node.get("method", "")
        print(node)
        passed, reason = _match_node(match_func, content, method, actions)
        if passed:
            passed_nodes += 1
        details.append({
            "node_index": i,
            "match_function": match_func,
            "expected": content.get("reference_answer") or content.get("url"),
            "passed": passed,
            "reason": reason
        })

    # ``evaluation`` is a non-empty list here, so total_nodes > 0.
    total_nodes = len(evaluation)
    completion_rate = passed_nodes / total_nodes
    return {
        "completion_rate": round(completion_rate, 3),
        "task_success": passed_nodes == total_nodes,
        "passed_nodes": passed_nodes,
        "total_nodes": total_nodes,
        "details": details
    }
# Main loop: submit each task to the API, wait for it to finish, evaluate the
# agent's history and persist the outcome after every task.
MAX_POLL_FAILURES = 30  # Consecutive failed status polls before giving up
cnt = 0  # Skip the first N tasks (counted after start_idx)
for idx, item in enumerate(dataset):
    if idx < start_idx:
        continue
    if cnt > 0:
        cnt -= 1
        continue
    task_desc = item['task']
    ref_length = item.get('reference_task_length', 0)
    evaluation = item.get('evaluation', [])
    print(evaluation)
    print(f"\n[{idx + 1}/{len(dataset)}] Task: {task_desc[:70]}...")
    print(f" Эталонная длина: {ref_length} шагов")
    print(f" Ключевых узлов: {len(evaluation) if isinstance(evaluation, list) else 'N/A'}")
    start_time = time.time()
    try:
        # 1. Create the task
        resp = requests.post(
            API_URL,
            json={
                "task": task_desc,
                "timeout": 300,
                "metadata": {"source": "mind2web", "reference_length": ref_length}
            },
            headers=HEADERS,
            timeout=10
        )
        if resp.status_code != 202:
            print(f" ❌ Ошибка создания: {resp.status_code}")
            result = {
                "index": idx,
                "task_description": task_desc,
                "status": "creation_failed",
                "error": f"HTTP {resp.status_code}",
                "total_time_sec": 0,
                "timestamp": datetime.now().isoformat()
            }
            results.append(result)
            save_results(results)
            save_checkpoint(results, idx)
            continue
        api_task_id = resp.json()["task_id"]
        queue_time = time.time() - start_time
        print(f" 📝 Task ID: {api_task_id} | Очередь: {queue_time:.2f}с")

        # 2. Wait for completion
        status = "queued"
        poll_count = 0
        poll_failures = 0
        while status in ["queued", "running"]:
            time.sleep(2)
            poll_count += 1
            try:
                status_resp = requests.get(f"{API_URL}/{api_task_id}", timeout=5)
                if status_resp.status_code == 200:
                    status = status_resp.json().get("status", "unknown")
                    poll_failures = 0
                else:
                    poll_failures += 1
                if poll_count % 5 == 0:
                    elapsed = time.time() - start_time
                    print(f" ⏳ Статус: {status} | Прошло: {elapsed:.1f}с")
            except (requests.RequestException, ValueError):
                # Transient network / decoding problems are retried, but
                # unlike a bare ``except`` this lets Ctrl-C through.
                poll_failures += 1
            if poll_failures >= MAX_POLL_FAILURES:
                # The API stopped answering: leave the loop instead of
                # polling forever.
                status = "poll_failed"
        total_time = time.time() - start_time

        # 3. Fetch the result
        result_resp = requests.get(f"{API_URL}/{api_task_id}/result", timeout=10)
        result_data = result_resp.json() if result_resp.status_code == 200 else None

        # 4. Fetch the agent history
        history_resp = requests.get(f"{API_URL}/{api_task_id}/history", timeout=10)
        history_data = history_resp.json() if history_resp.status_code == 200 else None

        # 5. Evaluate quality
        quality = {}
        if history_data and history_data.get("history"):
            history = history_data["history"]
            print(history)
            quality = evaluate_quality(history, evaluation)
            # completion_rate is None when the task has no evaluation data;
            # formatting None with ".1%" would raise and discard the result.
            completion_rate = quality.get('completion_rate')
            cr_text = f"{completion_rate:.1%}" if completion_rate is not None else "N/A"
            print(
                f" 📊 Качество: CR={cr_text} ({quality.get('passed_nodes', 0)}/{quality.get('total_nodes', 0)} узлов)")
        else:
            quality = {"error": "No history available"}

        # 6. Store the outcome
        result = {
            "index": idx,
            "api_task_id": api_task_id,
            "task_description": task_desc,
            "reference_length": ref_length,
            "status": status,
            "queue_time_sec": round(queue_time, 2),
            "total_time_sec": round(total_time, 2),
            "result": result_data,
            "quality": quality,
            "timestamp": datetime.now().isoformat()
        }
        results.append(result)
        save_results(results)
        save_checkpoint(results, idx)
        # Same markers as the final summary uses for succeeded / failed.
        status_emoji = "✅" if status == "succeeded" else "❌"
        print(f" {status_emoji} Статус: {status} | Время: {total_time:.1f}с")
    except Exception as e:
        # Top-level boundary for one task: record the failure and move on to
        # the next task instead of aborting the whole benchmark.
        print(f" ❌ Ошибка: {type(e).__name__}: {e}")
        result = {
            "index": idx,
            "task_description": task_desc,
            "status": "exception",
            "error": str(e),
            "total_time_sec": 0,
            "timestamp": datetime.now().isoformat()
        }
        results.append(result)
        save_results(results)
        save_checkpoint(results, idx)
# Remove the checkpoint: every task has been processed, nothing to resume
if os.path.exists(CHECKPOINT_FILE):
    os.remove(CHECKPOINT_FILE)

# Final statistics
print("\n" + "=" * 60)
print("📊 ИТОГОВЫЕ МЕТРИКИ")
print("=" * 60)
succeeded = [r for r in results if r.get("status") == "succeeded"]
failed = [r for r in results if r.get("status") == "failed"]
other = [r for r in results if r.get("status") not in ["succeeded", "failed"]]
total_count = max(len(results), 1)  # avoids division by zero for an empty run
print(f"\n📈 СТАТУСЫ:")
print(f" Всего задач: {len(results)}")
print(f" ✅ Успешно (succeeded): {len(succeeded)} ({len(succeeded) / total_count * 100:.1f}%)")
print(f" ❌ Провалено (failed): {len(failed)} ({len(failed) / total_count * 100:.1f}%)")
# Report the remaining statuses too, so the percentages add up to 100%.
print(f" ⚠️ Прочие статусы: {len(other)} ({len(other) / total_count * 100:.1f}%)")

# Quality statistics (only tasks for which a completion rate was computed)
quality_results = [r for r in results if r.get("quality", {}).get("completion_rate") is not None]
if quality_results:
    cr_values = [r["quality"]["completion_rate"] for r in quality_results]
    success_values = [r["quality"]["task_success"] for r in quality_results]
    print(f"\n🎯 КАЧЕСТВО:")
    print(f" Средний Completion Rate (CR): {sum(cr_values) / len(cr_values):.1%}")
    print(f" Task Success Rate (SR): {sum(success_values) / len(success_values):.1%}")
    print(f" Полностью успешных задач: {sum(success_values)}/{len(quality_results)}")

if succeeded:
    times = sorted(r["total_time_sec"] for r in succeeded)
    avg_time = sum(times) / len(times)
    middle = len(times) // 2
    # True median: mean of the two middle values for an even-sized sample.
    if len(times) % 2:
        median_time = times[middle]
    else:
        median_time = (times[middle - 1] + times[middle]) / 2
    print(f"\n⏱️ ВРЕМЯ ВЫПОЛНЕНИЯ:")
    print(f" Среднее: {avg_time:.2f} сек")
    print(f" Медиана: {median_time:.2f} сек")
    if avg_time > 0:
        print(f" Скорость: {3600 / avg_time:.1f} задач/час")
print(f"\n💾 Результаты сохранены в: {RESULTS_FILE}")