Add tests for the API

This commit is contained in:
Максим Туревич 2026-04-22 03:46:17 +03:00
parent 50589232d6
commit 365ab8dd79
4 changed files with 403 additions and 0 deletions

View file

@ -0,0 +1,77 @@
[
{
"index": 2,
"original_task_id": "mind2web_2_1776817552",
"api_task_id": "2334bb039ee04dc6b23b5b0240314064",
"task_description": "Find hard side Carry-on Luggage in black color in target",
"reference_length": 9,
"status": "succeeded",
"queue_time_sec": 0.02,
"execution_time_sec": 74.38,
"total_time_sec": 74.38,
"result": {
"task_id": "2334bb039ee04dc6b23b5b0240314064",
"status": "succeeded",
"success": true,
"execution_time": 72.45919299125671,
"result": "Found hard side carry-on luggage in black at Target:\n\n1. **Signature Hardside Carry On Spinner Suitcase Matte Black** - Open Story™\n - Price: $128.00\n - Rating: 4.3/5 (71 ratings)\n - Features: TSA Locks, Water-Resistant, Polycarbonate\n\n2. **Hardside Carry On Spinner Suitcase Black** - Open Story™\n - Price: $90.00\n - Rating: 4.3/5 (500 ratings)\n - Features: Telescoping Handle, 8 Wheels\n\n3. **SWISSGEAR Energie Hardside Carry On Spinner Suitcase** - Black\n - Price: $86.39 (on sale)\n - Rating: 4.2/5 (76 ratings)\n - Features: Structured Polycarbonate, 8 Spinner Wheels\n\nAll three products match your requirements for hard side carry-on luggage in black color.",
"error": null,
"raw_response": {
"success": true,
"result": "Found hard side carry-on luggage in black at Target:\n\n1. **Signature Hardside Carry On Spinner Suitcase Matte Black** - Open Story™\n - Price: $128.00\n - Rating: 4.3/5 (71 ratings)\n - Features: TSA Locks, Water-Resistant, Polycarbonate\n\n2. **Hardside Carry On Spinner Suitcase Black** - Open Story™\n - Price: $90.00\n - Rating: 4.3/5 (500 ratings)\n - Features: Telescoping Handle, 8 Wheels\n\n3. **SWISSGEAR Energie Hardside Carry On Spinner Suitcase** - Black\n - Price: $86.39 (on sale)\n - Rating: 4.2/5 (76 ratings)\n - Features: Structured Polycarbonate, 8 Spinner Wheels\n\nAll three products match your requirements for hard side carry-on luggage in black color.",
"browser_view": "https://face-veteran-investigate-daniel.trycloudflare.com"
}
},
"timestamp": "2026-04-22T03:27:07.331142"
},
{
"index": 3,
"original_task_id": "mind2web_3_1776817627",
"api_task_id": "fb9ca5f23cfe427d80c31747f61260d6",
"task_description": "Show me the coming soon AMC Artisan Films in amctheatres",
"reference_length": 6,
"status": "succeeded",
"queue_time_sec": 0.01,
"execution_time_sec": 58.33,
"total_time_sec": 58.33,
"result": {
"task_id": "fb9ca5f23cfe427d80c31747f61260d6",
"status": "succeeded",
"success": true,
"execution_time": 56.48335075378418,
"result": "Here are the AMC Artisan Films coming soon:\n\n1. **MICHAEL** - April 24\n - A front-row seat to the life and legacy of one of the most influential artists\n\n2. **The AI Doc: Or How I Became an Apocaloptimist**\n - Release date not specified\n\n3. **Lorne** - April 17, 2026\n - From Academy Award-winning filmmaker Morgan Neville\n - Behind-the-scenes glimpse at comedy empire builder\n - Rating: R | Runtime: 1 hr 41 min\n\n4. **Hamlet** - April 10, 2026\n - Rating: NR | Runtime: 1 hr 54 min\n\n5. **Mother Mary** - April 17, 2026\n - Starring Anne Hathaway, Michaela Coel, Hunter Schafer, FKA Twigs\n - Directed by David Lowery | Rating: R | Runtime: 1 hr 50 min",
"error": null,
"raw_response": {
"success": true,
"result": "Here are the AMC Artisan Films coming soon:\n\n1. **MICHAEL** - April 24\n - A front-row seat to the life and legacy of one of the most influential artists\n\n2. **The AI Doc: Or How I Became an Apocaloptimist**\n - Release date not specified\n\n3. **Lorne** - April 17, 2026\n - From Academy Award-winning filmmaker Morgan Neville\n - Behind-the-scenes glimpse at comedy empire builder\n - Rating: R | Runtime: 1 hr 41 min\n\n4. **Hamlet** - April 10, 2026\n - Rating: NR | Runtime: 1 hr 54 min\n\n5. **Mother Mary** - April 17, 2026\n - Starring Anne Hathaway, Michaela Coel, Hunter Schafer, FKA Twigs\n - Directed by David Lowery | Rating: R | Runtime: 1 hr 50 min",
"browser_view": "https://face-veteran-investigate-daniel.trycloudflare.com"
}
},
"timestamp": "2026-04-22T03:28:05.665642"
},
{
"index": 4,
"original_task_id": "mind2web_4_1776817685",
"api_task_id": "e3db75a7582e4fde856d8e9b746417a2",
"task_description": "Go to the page with help in choosing a solar energy product for homes in tesla",
"reference_length": 5,
"status": "succeeded",
"queue_time_sec": 0.0,
"execution_time_sec": 295.44,
"total_time_sec": 295.44,
"result": {
"task_id": "e3db75a7582e4fde856d8e9b746417a2",
"status": "succeeded",
"success": true,
"execution_time": 293.71661829948425,
"result": "Successfully found help content for choosing solar energy products via Tesla Assist chat. The response includes: (1) Design & Aesthetics comparison between Solar Panels and Solar Roof, (2) Durability information for both products, (3) Roof Eligibility table showing which roof types work with each product (Asphalt/Composition Shingles, Concrete Tile, Flat Roof, Standing Seam Metal, Other), (4) Key Benefits including savings & independence info, (5) Next steps with Order Now and Schedule Consultation links.",
"error": null,
"raw_response": {
"success": true,
"result": "Successfully found help content for choosing solar energy products via Tesla Assist chat. The response includes: (1) Design & Aesthetics comparison between Solar Panels and Solar Roof, (2) Durability information for both products, (3) Roof Eligibility table showing which roof types work with each product (Asphalt/Composition Shingles, Concrete Tile, Flat Roof, Standing Seam Metal, Other), (4) Key Benefits including savings & independence info, (5) Next steps with Order Now and Schedule Consultation links.",
"browser_view": "https://face-veteran-investigate-daniel.trycloudflare.com"
}
},
"timestamp": "2026-04-22T03:33:01.112179"
}
]

View file

@ -0,0 +1,12 @@
{
"benchmark": "Online-Mind2Web",
"timestamp": "20260422_015724",
"api_endpoint": "http://localhost:8088/api/browser/tasks",
"total_tasks": 0,
"completed": 0,
"failed": 0,
"success_rate": 0.0,
"avg_time_sec": null,
"median_time_sec": null,
"tasks_per_hour": null
}

197
api/test-api.py Normal file
View file

@ -0,0 +1,197 @@
import json
import statistics
import time
from datetime import datetime

import requests
from datasets import load_dataset
# API configuration: endpoint that accepts browser tasks and the headers sent
# when a task is created.
API_URL = "http://localhost:8088/api/browser/tasks"
HEADERS = {"Content-Type": "application/json"}

# Load the dataset.
dataset = load_dataset("iMeanAI/Mind2Web-Live", split="train")

# For a trial run take only the first N tasks (set TEST_SIZE to len(dataset)
# for the full benchmark). The size is clamped so that a TEST_SIZE larger than
# the split does not make `select` fail on out-of-range indices.
TEST_SIZE = 10
dataset = dataset.select(range(min(TEST_SIZE, len(dataset))))
print(f"Загружено задач: {len(dataset)}")
if len(dataset) > 0:
    # Only peek at the first row when there is one.
    print(f"Поля: {dataset[0].keys()}\n")

# Number of leading tasks to skip: the main loop decrements this counter and
# moves on to the next task while it is still positive.
cnt = 3

# Per-task metric records, appended to by the main loop.
results = []
# Task timeout (seconds) sent to the API with every task, the polling interval,
# and the extra slack the client allows on top of the task timeout before it
# gives up polling.
TASK_TIMEOUT_SEC = 300
POLL_INTERVAL_SEC = 2
POLL_GRACE_SEC = 30

# Main benchmark loop: submit each task, poll until it leaves the
# queued/running states, fetch its result and record timing metrics.
for idx, item in enumerate(dataset):
    # Skip the first `cnt` tasks.
    if cnt > 0:
        cnt -= 1
        continue

    # Dataset fields
    task_desc = item['task']  # task description
    ref_length = item['reference_task_length']  # reference length in steps

    # Local task ID (index + timestamp for uniqueness)
    task_id_orig = f"mind2web_{idx}_{int(time.time())}"
    print(f"\n[{idx + 1}/{len(dataset)}] Task: {task_desc[:70]}...")
    print(f" Эталонная длина: {ref_length} шагов")
    start_time = time.time()

    try:
        # 1. Create the task through the API
        resp = requests.post(
            API_URL,
            json={
                "task": task_desc,
                "timeout": TASK_TIMEOUT_SEC,
                "metadata": {
                    "source": "mind2web",
                    "reference_length": ref_length
                }
            },
            headers=HEADERS,
            timeout=10
        )
        if resp.status_code != 202:
            print(f" ❌ Ошибка создания задачи: {resp.status_code}")
            print(f" Ответ: {resp.text}")
            continue

        api_task_id = resp.json()["task_id"]
        created_at = time.time()
        queue_time = created_at - start_time
        print(f" 📝 Task ID: {api_task_id} | Очередь: {queue_time:.2f}с")

        # 2. Wait for completion, reporting progress.
        # The loop is bounded by a deadline: without it a status endpoint that
        # keeps answering non-200 (or never leaves "queued") would spin forever.
        status = "queued"
        poll_count = 0
        poll_deadline = created_at + TASK_TIMEOUT_SEC + POLL_GRACE_SEC
        while status in ("queued", "running"):
            if time.time() >= poll_deadline:
                status = "timeout"
                print(f" ⚠️ Превышено время ожидания ({TASK_TIMEOUT_SEC + POLL_GRACE_SEC}с), опрос остановлен")
                break
            time.sleep(POLL_INTERVAL_SEC)
            poll_count += 1
            try:
                status_resp = requests.get(f"{API_URL}/{api_task_id}", timeout=5)
                if status_resp.status_code == 200:
                    status_data = status_resp.json()
                    status = status_data.get("status", "unknown")
                # Show progress every 5 polls
                if poll_count % 5 == 0:
                    elapsed = time.time() - start_time
                    print(f" ⏳ Статус: {status} | Прошло: {elapsed:.1f}с")
            except Exception as e:
                # Best effort: a failed poll is reported and retried on the
                # next iteration (until the deadline above).
                print(f" ⚠️ Ошибка опроса: {e}")

        end_time = time.time()
        total_time = end_time - start_time
        # Execution time excludes the time spent creating the task; measuring
        # it from start_time made it identical to the total time.
        execution_time = end_time - created_at

        # 3. Fetch the result
        result_resp = requests.get(f"{API_URL}/{api_task_id}/result", timeout=10)
        result_data = None
        if result_resp.status_code == 200:
            try:
                result_data = result_resp.json()
            except ValueError:
                # Body is not valid JSON: keep the raw text instead.
                result_data = result_resp.text

        # 4. Record the metrics
        result = {
            "index": idx,
            "original_task_id": task_id_orig,
            "api_task_id": api_task_id,
            "task_description": task_desc,
            "reference_length": ref_length,
            "status": status,
            "queue_time_sec": round(queue_time, 2),
            "execution_time_sec": round(execution_time, 2),
            "total_time_sec": round(total_time, 2),
            "result": result_data,
            "timestamp": datetime.now().isoformat()
        }
        results.append(result)

        # Rewrite the running results file after every task so partial
        # progress survives an interrupted run.
        filename = "mind2web_benchmark.json"
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2, ensure_ascii=False)

        # Status emoji
        status_emoji = "✅" if status == "succeeded" else "❌"
        print(f" {status_emoji} Статус: {status} | Время: {total_time:.1f}с")
    except requests.exceptions.Timeout:
        print(" ❌ Таймаут при создании задачи")
    except Exception as e:
        # Any other failure abandons this task; the loop moves on to the next.
        print(f" ❌ Ошибка: {type(e).__name__}: {e}")
# Save the detailed per-task results under a timestamped name.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"mind2web_benchmark_{timestamp}.json"
with open(filename, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2, ensure_ascii=False)

print("\n" + "=" * 60)
print("📊 ИТОГОВЫЕ МЕТРИКИ СКОРОСТИ")
print("=" * 60)

# Status statistics.
# The main loop treats "succeeded" as the success status, so matching only
# "completed" classified every finished task as unknown and produced empty
# summaries. "completed" is still accepted for backward compatibility.
SUCCESS_STATUSES = ("succeeded", "completed")
completed = [r for r in results if r["status"] in SUCCESS_STATUSES]
failed = [r for r in results if r["status"] == "failed"]
unknown = [
    r for r in results
    if r["status"] not in SUCCESS_STATUSES and r["status"] != "failed"
]

result_count = max(len(results), 1)  # avoids division by zero on an empty run
success_rate = len(completed) / result_count * 100

print(f"\n📈 СТАТУСЫ:")
print(f" Всего задач: {len(results)}")
print(f" ✅ Успешно: {len(completed)} ({success_rate:.1f}%)")
print(f" ❌ Провалено: {len(failed)} ({len(failed) / result_count * 100:.1f}%)")
if unknown:
    print(f" ❓ Неизвестный статус: {len(unknown)}")

# Aggregates reused by both the console report and the summary file;
# they stay None when no task succeeded.
avg_time = None
median_time = None
tasks_per_hour = None

if completed:
    total_times = [r["total_time_sec"] for r in completed]
    queue_times = [r["queue_time_sec"] for r in completed]
    ordered_times = sorted(total_times)

    avg_time = sum(total_times) / len(total_times)
    # True median: averages the two middle values for an even count instead
    # of taking the upper-middle element.
    median_time = statistics.median(total_times)
    # Guard against a zero average (all durations rounded to 0.0).
    tasks_per_hour = 3600 / avg_time if avg_time > 0 else None

    print(f"\n⏱️ ВРЕМЯ ВЫПОЛНЕНИЯ:")
    print(f" Среднее: {avg_time:.2f} сек")
    print(f" Медиана (p50): {median_time:.2f} сек")
    if len(ordered_times) >= 20:
        print(f" p95: {ordered_times[int(len(ordered_times) * 0.95)]:.2f} сек")
    print(f" Мин: {ordered_times[0]:.2f} сек")
    print(f" Макс: {ordered_times[-1]:.2f} сек")

    print(f"\n📊 ПРОИЗВОДИТЕЛЬНОСТЬ:")
    print(f" Среднее время в очереди: {sum(queue_times) / len(queue_times):.2f} сек")
    if tasks_per_hour is not None:
        print(f" Скорость выполнения: {tasks_per_hour:.1f} задач/час")

    # Efficiency relative to the reference task length
    if all("reference_length" in r for r in completed):
        avg_ref_length = sum(r["reference_length"] for r in completed) / len(completed)
        if avg_ref_length > 0:
            time_per_step = avg_time / avg_ref_length
            print(f" Среднее время на шаг: {time_per_step:.2f} сек")

print(f"\n💾 Результаты сохранены в: {filename}")

# Build a short report for comparing runs.
# NOTE(review): the label says "Online-Mind2Web" while this script loads the
# "iMeanAI/Mind2Web-Live" dataset — confirm which name the summary should carry.
summary = {
    "benchmark": "Online-Mind2Web",
    "timestamp": timestamp,
    "api_endpoint": API_URL,
    "total_tasks": len(results),
    "completed": len(completed),
    "failed": len(failed),
    "success_rate": success_rate,
    "avg_time_sec": avg_time,
    "median_time_sec": median_time,
    "tasks_per_hour": tasks_per_hour
}
summary_file = f"mind2web_summary_{timestamp}.json"
with open(summary_file, "w", encoding="utf-8") as f:
    json.dump(summary, f, indent=2, ensure_ascii=False)
print(f"📋 Краткий отчет сохранен в: {summary_file}")

117
requirements_test.txt Normal file
View file

@ -0,0 +1,117 @@
aiofiles==25.1.0
aiohappyeyeballs==2.6.1
aiosignal==1.4.0
annotated-doc==0.0.4
annotated-types==0.7.0
anthropic==0.76.0
anyio==4.12.1
attrs==26.1.0
backoff==2.2.1
beautifulsoup4==4.14.3
browser-use==0.12.6
browser-use-sdk==2.0.15
bubus==1.5.6
cdp-use==1.4.5
certifi==2026.2.25
cffi==2.0.0
charset-normalizer==3.4.7
click==8.3.1
cloudpickle==3.1.2
cryptography==46.0.7
Cython==3.2.4
datasets==4.8.4
dill==0.4.1
distro==1.9.0
docstring_parser==0.18.0
filelock==3.29.0
frozenlist==1.8.0
fsspec==2026.2.0
google-api-core==2.29.0
google-api-python-client==2.188.0
google-auth==2.48.0
google-auth-httplib2==0.3.1
google-auth-oauthlib==1.2.4
google-genai==1.65.0
googleapis-common-protos==1.74.0
groq==1.0.0
h11==0.16.0
hf-xet==1.4.3
httpcore==1.0.9
httplib2==0.31.2
httptools==0.7.1
httpx==0.28.1
httpx-sse==0.4.3
huggingface_hub==1.11.0
idna==3.12
inquirerpy==0.3.4
jiter==0.14.0
jsonschema==4.26.0
jsonschema-specifications==2025.9.1
lxml==6.1.0
markdown-it-py==4.0.0
markdownify==1.2.2
mcp==1.26.0
mdurl==0.1.2
multidict==6.7.1
multiprocess==0.70.19
numpy==2.4.4
oauthlib==3.3.1
ollama==0.6.1
openai==2.16.0
packaging==26.1
pandas==3.0.2
pfzy==0.3.4
pillow==12.1.1
portalocker==3.2.0
posthog==7.7.0
prompt_toolkit==3.0.52
propcache==0.4.1
proto-plus==1.27.2
protobuf==6.33.6
psutil==7.2.2
pyarrow==24.0.0
pyasn1==0.6.3
pyasn1_modules==0.4.2
pycparser==3.0
pydantic-settings==2.14.0
pydantic_core==2.41.5
Pygments==2.20.0
PyJWT==2.12.1
# macOS-only packages: restricted with an environment marker so that
# `pip install -r` also works on Linux and Windows.
pyobjc-core==12.1; sys_platform == "darwin"
pyobjc-framework-Cocoa==12.1; sys_platform == "darwin"
pyotp==2.9.0
pyparsing==3.3.2
pypdf==6.9.1
python-dateutil==2.9.0.post0
python-docx==1.2.0
python-dotenv==1.2.1
python-multipart==0.0.26
PyYAML==6.0.3
referencing==0.37.0
reportlab==4.4.9
requests==2.32.5
requests-oauthlib==2.0.0
rich==14.3.1
rpds-py==0.30.0
rsa==4.9.1
screeninfo==0.8.1
shellingham==1.5.4
six==1.17.0
sniffio==1.3.1
soupsieve==2.8.3
sse-starlette==3.3.4
starlette==1.0.0
tenacity==9.1.4
tqdm==4.67.3
typer==0.24.1
typing-inspection==0.4.2
typing_extensions==4.15.0
uritemplate==4.2.0
urllib3==2.6.3
uuid7==0.1.0
uvloop==0.22.1
watchfiles==1.1.1
wcwidth==0.6.0
websockets==16.0
xxhash==3.6.0
yarl==1.23.0