Add tests for API
This commit is contained in:
parent
5aeb26b222
commit
be86dc34d6
2 changed files with 249 additions and 0 deletions
52
api/mind2web_benchmark.json
Normal file
52
api/mind2web_benchmark.json
Normal file
|
|
@ -0,0 +1,52 @@
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"index": 3,
|
||||||
|
"original_task_id": "mind2web_3_1776818853",
|
||||||
|
"api_task_id": "80a5c0420dc244a8afccb3b89742089d",
|
||||||
|
"task_description": "Show me the coming soon AMC Artisan Films in amctheatres",
|
||||||
|
"reference_length": 6,
|
||||||
|
"status": "succeeded",
|
||||||
|
"queue_time_sec": 0.02,
|
||||||
|
"execution_time_sec": 40.26,
|
||||||
|
"total_time_sec": 40.26,
|
||||||
|
"result": {
|
||||||
|
"task_id": "80a5c0420dc244a8afccb3b89742089d",
|
||||||
|
"status": "succeeded",
|
||||||
|
"success": true,
|
||||||
|
"execution_time": 39.63884091377258,
|
||||||
|
"result": "**Coming Soon AMC Artisan Films**\n\nHere are all the upcoming AMC Artisan Films currently listed:\n\n| Film Title | Release Date |\n|------------|--------------|\n| **Hamlet** | April 10, 2026 |\n| **The AI Doc: Or How I Became an Apocaloptimist** | April 17, 2026 |\n| **Lorne** | April 17, 2026 |\n| **Mother Mary** | April 17, 2026 |\n| **MICHAEL** | April 24, 2026 |\n\nAll films are available for ticket purchase through AMC Theatres website.",
|
||||||
|
"error": null,
|
||||||
|
"raw_response": {
|
||||||
|
"success": true,
|
||||||
|
"result": "**Coming Soon AMC Artisan Films**\n\nHere are all the upcoming AMC Artisan Films currently listed:\n\n| Film Title | Release Date |\n|------------|--------------|\n| **Hamlet** | April 10, 2026 |\n| **The AI Doc: Or How I Became an Apocaloptimist** | April 17, 2026 |\n| **Lorne** | April 17, 2026 |\n| **Mother Mary** | April 17, 2026 |\n| **MICHAEL** | April 24, 2026 |\n\nAll films are available for ticket purchase through AMC Theatres website.",
|
||||||
|
"browser_view": "https://face-veteran-investigate-daniel.trycloudflare.com"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"timestamp": "2026-04-22T03:48:13.806637"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"index": 4,
|
||||||
|
"original_task_id": "mind2web_4_1776818893",
|
||||||
|
"api_task_id": "6dffc48b440d4a56b82d13f2a3a60b3e",
|
||||||
|
"task_description": "Go to the page with help in choosing a solar energy product for homes in tesla",
|
||||||
|
"reference_length": 5,
|
||||||
|
"status": "succeeded",
|
||||||
|
"queue_time_sec": 0.01,
|
||||||
|
"execution_time_sec": 251.22,
|
||||||
|
"total_time_sec": 251.22,
|
||||||
|
"result": {
|
||||||
|
"task_id": "6dffc48b440d4a56b82d13f2a3a60b3e",
|
||||||
|
"status": "succeeded",
|
||||||
|
"success": true,
|
||||||
|
"execution_time": 249.99302887916565,
|
||||||
|
"result": "**Tesla Solar Energy Products Guide - Research Summary**\n\nDue to persistent access restrictions (403 errors) on Tesla's official website preventing direct navigation to product pages (/solarpanels, /powerwall, /why-solar), I gathered comprehensive information from third-party sources.\n\n**Tesla Home Solar Products Available:**\n\n**1. Solar Panels (Traditional)**\n- **Cost:** $8,100-$17,000 after federal tax credit (for average home)\n- **Efficiency:** ~2.74-$3.30 per watt; typical systems achieve $36k before incentives\n- **Best For:** Budget-conscious homeowners prioritizing performance over aesthetics\n\n**2. Solar Roof**\n- **Cost:** $54,000-$72,000 for average 9kW system ($6-8/watt)\n- **Efficiency:** Only 8-14% (vs traditional panels at 20-22%) - rated poorly at 3/25\n- **Warranty:** 25 years equipment & efficiency coverage\n- **Degradation:** Poor - only guarantees 95% after year 5, drops to ~85% after year 25\n- **Durability:** Class F wind (110 mph), Class 4 hail (2-inch), Class A fire rating\n- **Best For:** Homeowners prioritizing aesthetics over performance; willing to pay premium for integrated roof appearance\n\n**3. 
Powerwall (Energy Storage)**\n- **Cost:** $13,配 k-$16配 k each\n- **Best For:** Energy independence; backup power; maximizing solar self-consumption\n\n**Key Recommendations:**\n✅ Choose Traditional Solar Panels if:\n - You want better value (~half the cost of Solar Roof)\n - Performance/efficiency is your priority\n - You already have a functional roof\n \n❌ Avoid Solar Roof unless:\n - Aesthetics are your #1 priority (looks like regular shingles)\n - You need a new roof anyway and want integrated solution\n - Budget is not a concern ($6k+ premium over traditional panels)\n \n⚠️ Important Notes:\n - Solar Roof has poor efficiency degradation compared to competitors like Maxeon (which offers better warranties up to 40 years)\n - Labor warranty comes from installer (~10 years industry average) not Tesla directly\n - Snow slides off easily despite no published snow load rating\n \nTo proceed with ordering or getting exact quotes for your home location:\n→ Visit tesla.com directly or contact local installers through third-party platforms like EnergySage or SunPower dealers.",
|
||||||
|
"error": null,
|
||||||
|
"raw_response": {
|
||||||
|
"success": true,
|
||||||
|
"result": "**Tesla Solar Energy Products Guide - Research Summary**\n\nDue to persistent access restrictions (403 errors) on Tesla's official website preventing direct navigation to product pages (/solarpanels, /powerwall, /why-solar), I gathered comprehensive information from third-party sources.\n\n**Tesla Home Solar Products Available:**\n\n**1. Solar Panels (Traditional)**\n- **Cost:** $8,100-$17,000 after federal tax credit (for average home)\n- **Efficiency:** ~2.74-$3.30 per watt; typical systems achieve $36k before incentives\n- **Best For:** Budget-conscious homeowners prioritizing performance over aesthetics\n\n**2. Solar Roof**\n- **Cost:** $54,000-$72,000 for average 9kW system ($6-8/watt)\n- **Efficiency:** Only 8-14% (vs traditional panels at 20-22%) - rated poorly at 3/25\n- **Warranty:** 25 years equipment & efficiency coverage\n- **Degradation:** Poor - only guarantees 95% after year 5, drops to ~85% after year 25\n- **Durability:** Class F wind (110 mph), Class 4 hail (2-inch), Class A fire rating\n- **Best For:** Homeowners prioritizing aesthetics over performance; willing to pay premium for integrated roof appearance\n\n**3. 
Powerwall (Energy Storage)**\n- **Cost:** $13,配 k-$16配 k each\n- **Best For:** Energy independence; backup power; maximizing solar self-consumption\n\n**Key Recommendations:**\n✅ Choose Traditional Solar Panels if:\n - You want better value (~half the cost of Solar Roof)\n - Performance/efficiency is your priority\n - You already have a functional roof\n \n❌ Avoid Solar Roof unless:\n - Aesthetics are your #1 priority (looks like regular shingles)\n - You need a new roof anyway and want integrated solution\n - Budget is not a concern ($6k+ premium over traditional panels)\n \n⚠️ Important Notes:\n - Solar Roof has poor efficiency degradation compared to competitors like Maxeon (which offers better warranties up to 40 years)\n - Labor warranty comes from installer (~10 years industry average) not Tesla directly\n - Snow slides off easily despite no published snow load rating\n \nTo proceed with ordering or getting exact quotes for your home location:\n→ Visit tesla.com directly or contact local installers through third-party platforms like EnergySage or SunPower dealers.",
|
||||||
|
"browser_view": "https://face-veteran-investigate-daniel.trycloudflare.com"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"timestamp": "2026-04-22T03:52:25.037604"
|
||||||
|
}
|
||||||
|
]
|
||||||
197
api/test-api.py
Normal file
197
api/test-api.py
Normal file
|
|
@ -0,0 +1,197 @@
|
||||||
|
import requests
|
||||||
|
import time
|
||||||
|
import json
|
||||||
|
from datasets import load_dataset
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
# API configuration: endpoint used to create browser tasks and poll them.
API_URL = "http://localhost:8088/api/browser/tasks"
HEADERS = {"Content-Type": "application/json"}

# Load the dataset.
dataset = load_dataset("iMeanAI/Mind2Web-Live", split="train")

# For a quick test take only the first N tasks
# (set to len(dataset) to run the full benchmark).
TEST_SIZE = 10
# Clamp to the dataset size so a TEST_SIZE larger than the split does not
# make select() request indices that do not exist.
dataset = dataset.select(range(min(TEST_SIZE, len(dataset))))

print(f"Загружено задач: {len(dataset)}")
print(f"Поля: {dataset[0].keys()}\n")

# Number of leading tasks that the main loop skips before it starts
# submitting tasks to the API.
cnt = 3
# Per-task result records collected by the main loop.
results = []
|
||||||
|
|
||||||
|
# Timeout (seconds) sent to the API for a single task.
TASK_TIMEOUT_SEC = 300
# Client-side cap on polling. It sits a little above the task timeout so that
# a task stuck in "queued"/"running", or a status endpoint that keeps failing,
# cannot hang the whole benchmark.
POLL_DEADLINE_SEC = TASK_TIMEOUT_SEC + 60
# Pause (seconds) between two status requests.
POLL_INTERVAL_SEC = 2

# Submit every remaining task to the API, wait for it to finish and record
# timing metrics plus the returned result in `results`.
for idx, item in enumerate(dataset):
    # Skip the first `cnt` tasks; the counter is decremented on every skip.
    if cnt > 0:
        cnt -= 1
        continue

    # Fields taken from the dataset row.
    task_desc = item['task']  # task description
    ref_length = item['reference_task_length']  # reference length in steps

    # Task label: index plus timestamp to keep it unique between runs.
    task_id_orig = f"mind2web_{idx}_{int(time.time())}"

    print(f"\n[{idx + 1}/{len(dataset)}] Task: {task_desc[:70]}...")
    print(f" Эталонная длина: {ref_length} шагов")

    start_time = time.time()

    try:
        # 1. Create the task through the API.
        resp = requests.post(
            API_URL,
            json={
                "task": task_desc,
                "timeout": TASK_TIMEOUT_SEC,
                "metadata": {
                    "source": "mind2web",
                    "reference_length": ref_length,
                },
            },
            headers=HEADERS,
            timeout=10,
        )

        if resp.status_code != 202:
            print(f" ❌ Ошибка создания задачи: {resp.status_code}")
            print(f" Ответ: {resp.text}")
            continue

        api_task_id = resp.json()["task_id"]
        created_at = time.time()
        queue_time = created_at - start_time

        print(f" 📝 Task ID: {api_task_id} | Очередь: {queue_time:.2f}с")

        # 2. Poll until the task leaves the queued/running states or the
        #    client-side deadline expires.
        status = "queued"
        poll_count = 0
        while status in ("queued", "running"):
            if time.time() - created_at > POLL_DEADLINE_SEC:
                # Give up instead of looping forever; the record is stored
                # with a distinct status so it is not counted as a success.
                status = "timeout"
                print(f" ⚠️ Превышено время ожидания ({POLL_DEADLINE_SEC}с), опрос прекращён")
                break

            time.sleep(POLL_INTERVAL_SEC)
            poll_count += 1

            try:
                status_resp = requests.get(f"{API_URL}/{api_task_id}", timeout=5)
                if status_resp.status_code == 200:
                    status = status_resp.json().get("status", "unknown")

                    # Report progress on every fifth poll.
                    if poll_count % 5 == 0:
                        elapsed = time.time() - start_time
                        print(f" ⏳ Статус: {status} | Прошло: {elapsed:.1f}с")
            except (requests.exceptions.RequestException, ValueError) as e:
                # Best effort: a failed poll is reported and retried on the
                # next iteration (bounded by the deadline above).
                print(f" ⚠️ Ошибка опроса: {e}")

        end_time = time.time()
        # Execution time starts once the task has been accepted, so it does
        # not duplicate total_time (which also includes task creation).
        execution_time = end_time - created_at

        # 3. Fetch the result; fall back to the raw text when the body is
        #    not valid JSON.
        result_resp = requests.get(f"{API_URL}/{api_task_id}/result", timeout=10)

        result_data = None
        if result_resp.status_code == 200:
            try:
                result_data = result_resp.json()
            except ValueError:
                result_data = result_resp.text

        # 4. Record the metrics for this task.
        results.append({
            "index": idx,
            "original_task_id": task_id_orig,
            "api_task_id": api_task_id,
            "task_description": task_desc,
            "reference_length": ref_length,
            "status": status,
            "queue_time_sec": round(queue_time, 2),
            "execution_time_sec": round(execution_time, 2),
            "total_time_sec": round(end_time - start_time, 2),
            "result": result_data,
            "timestamp": datetime.now().isoformat(),
        })

        # Rewrite the checkpoint file after every task so partial results
        # survive an interrupted run.
        with open("mind2web_benchmark.json", "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2, ensure_ascii=False)

        # Status marker for the console line.
        status_emoji = "✅" if status == "succeeded" else "❌"
        print(f" {status_emoji} Статус: {status} | Время: {execution_time:.1f}с")

    except requests.exceptions.Timeout:
        print(" ❌ Таймаут при создании задачи")
    except Exception as e:
        # Top-level boundary for one task: report and move on to the next.
        print(f" ❌ Ошибка: {type(e).__name__}: {e}")
|
||||||
|
|
||||||
|
# Persist the detailed per-task records into a timestamped JSON file.
run_started = datetime.now()
timestamp = run_started.strftime("%Y%m%d_%H%M%S")
filename = f"mind2web_benchmark_{timestamp}.json"

with open(filename, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)
|
||||||
|
|
||||||
|
print("\n" + "=" * 60)
print("📊 ИТОГОВЫЕ МЕТРИКИ СКОРОСТИ")
print("=" * 60)

# Status statistics.
# The API reports a finished task as "succeeded" (the value the main loop
# compares against for its ✅ marker); "completed" is still accepted so that
# records using the older name keep being counted.
SUCCESS_STATUSES = ("succeeded", "completed")
completed = [r for r in results if r["status"] in SUCCESS_STATUSES]
failed = [r for r in results if r["status"] == "failed"]
unknown = [
    r for r in results
    if r["status"] not in SUCCESS_STATUSES and r["status"] != "failed"
]

# max(..., 1) keeps the percentages defined when no task was recorded.
total_count = max(len(results), 1)

print("\n📈 СТАТУСЫ:")
print(f" Всего задач: {len(results)}")
print(f" ✅ Успешно: {len(completed)} ({len(completed) / total_count * 100:.1f}%)")
print(f" ❌ Провалено: {len(failed)} ({len(failed) / total_count * 100:.1f}%)")
if unknown:
    print(f" ❓ Неизвестный статус: {len(unknown)}")

if completed:
    total_times = [r["total_time_sec"] for r in completed]
    queue_times = [r["queue_time_sec"] for r in completed]

    avg_total_time = sum(total_times) / len(total_times)
    sorted_times = sorted(total_times)

    print("\n⏱️ ВРЕМЯ ВЫПОЛНЕНИЯ:")
    print(f" Среднее: {avg_total_time:.2f} сек")
    print(f" Медиана (p50): {sorted_times[len(sorted_times) // 2]:.2f} сек")
    # p95 is only printed once there are at least 20 samples.
    if len(sorted_times) >= 20:
        print(f" p95: {sorted_times[int(len(sorted_times) * 0.95)]:.2f} сек")
    print(f" Мин: {sorted_times[0]:.2f} сек")
    print(f" Макс: {sorted_times[-1]:.2f} сек")

    print("\n📊 ПРОИЗВОДИТЕЛЬНОСТЬ:")
    print(f" Среднее время в очереди: {sum(queue_times) / len(queue_times):.2f} сек")
    tasks_per_hour = 3600 / avg_total_time
    print(f" Скорость выполнения: {tasks_per_hour:.1f} задач/час")

    # Time per reference step; skipped when any record lacks a positive
    # reference length, which would make the average meaningless or zero.
    if all(r.get("reference_length") for r in completed):
        avg_ref_length = sum(r["reference_length"] for r in completed) / len(completed)
        time_per_step = avg_total_time / avg_ref_length
        print(f" Среднее время на шаг: {time_per_step:.2f} сек")

print(f"\n💾 Результаты сохранены в: {filename}")
|
||||||
|
|
||||||
|
# Build a short report that can be compared between benchmark runs.
# Timing aggregates are only defined when at least one task succeeded.
if completed:
    durations = [entry["total_time_sec"] for entry in completed]
    mean_duration = sum(durations) / len(durations)
    median_duration = sorted(durations)[len(durations) // 2]
    hourly_rate = 3600 / mean_duration
else:
    mean_duration = None
    median_duration = None
    hourly_rate = None

summary = {
    "benchmark": "Online-Mind2Web",
    "timestamp": timestamp,
    "api_endpoint": API_URL,
    "total_tasks": len(results),
    "completed": len(completed),
    "failed": len(failed),
    "success_rate": len(completed) / max(len(results), 1) * 100,
    "avg_time_sec": mean_duration,
    "median_time_sec": median_duration,
    "tasks_per_hour": hourly_rate,
}

# Store the report next to the detailed results, using the same timestamp.
summary_file = f"mind2web_summary_{timestamp}.json"
with open(summary_file, "w", encoding="utf-8") as f:
    json.dump(summary, f, ensure_ascii=False, indent=2)

print(f"📋 Краткий отчет сохранен в: {summary_file}")
|
||||||
Loading…
Add table
Add a link
Reference in a new issue