Add tests for API
This commit is contained in:
parent
50589232d6
commit
365ab8dd79
4 changed files with 403 additions and 0 deletions
77
api/mind2web_benchmark.json
Normal file
77
api/mind2web_benchmark.json
Normal file
|
|
@ -0,0 +1,77 @@
|
|||
[
|
||||
{
|
||||
"index": 2,
|
||||
"original_task_id": "mind2web_2_1776817552",
|
||||
"api_task_id": "2334bb039ee04dc6b23b5b0240314064",
|
||||
"task_description": "Find hard side Carry-on Luggage in black color in target",
|
||||
"reference_length": 9,
|
||||
"status": "succeeded",
|
||||
"queue_time_sec": 0.02,
|
||||
"execution_time_sec": 74.38,
|
||||
"total_time_sec": 74.38,
|
||||
"result": {
|
||||
"task_id": "2334bb039ee04dc6b23b5b0240314064",
|
||||
"status": "succeeded",
|
||||
"success": true,
|
||||
"execution_time": 72.45919299125671,
|
||||
"result": "Found hard side carry-on luggage in black at Target:\n\n1. **Signature Hardside Carry On Spinner Suitcase Matte Black** - Open Story™\n - Price: $128.00\n - Rating: 4.3/5 (71 ratings)\n - Features: TSA Locks, Water-Resistant, Polycarbonate\n\n2. **Hardside Carry On Spinner Suitcase Black** - Open Story™\n - Price: $90.00\n - Rating: 4.3/5 (500 ratings)\n - Features: Telescoping Handle, 8 Wheels\n\n3. **SWISSGEAR Energie Hardside Carry On Spinner Suitcase** - Black\n - Price: $86.39 (on sale)\n - Rating: 4.2/5 (76 ratings)\n - Features: Structured Polycarbonate, 8 Spinner Wheels\n\nAll three products match your requirements for hard side carry-on luggage in black color.",
|
||||
"error": null,
|
||||
"raw_response": {
|
||||
"success": true,
|
||||
"result": "Found hard side carry-on luggage in black at Target:\n\n1. **Signature Hardside Carry On Spinner Suitcase Matte Black** - Open Story™\n - Price: $128.00\n - Rating: 4.3/5 (71 ratings)\n - Features: TSA Locks, Water-Resistant, Polycarbonate\n\n2. **Hardside Carry On Spinner Suitcase Black** - Open Story™\n - Price: $90.00\n - Rating: 4.3/5 (500 ratings)\n - Features: Telescoping Handle, 8 Wheels\n\n3. **SWISSGEAR Energie Hardside Carry On Spinner Suitcase** - Black\n - Price: $86.39 (on sale)\n - Rating: 4.2/5 (76 ratings)\n - Features: Structured Polycarbonate, 8 Spinner Wheels\n\nAll three products match your requirements for hard side carry-on luggage in black color.",
|
||||
"browser_view": "https://face-veteran-investigate-daniel.trycloudflare.com"
|
||||
}
|
||||
},
|
||||
"timestamp": "2026-04-22T03:27:07.331142"
|
||||
},
|
||||
{
|
||||
"index": 3,
|
||||
"original_task_id": "mind2web_3_1776817627",
|
||||
"api_task_id": "fb9ca5f23cfe427d80c31747f61260d6",
|
||||
"task_description": "Show me the coming soon AMC Artisan Films in amctheatres",
|
||||
"reference_length": 6,
|
||||
"status": "succeeded",
|
||||
"queue_time_sec": 0.01,
|
||||
"execution_time_sec": 58.33,
|
||||
"total_time_sec": 58.33,
|
||||
"result": {
|
||||
"task_id": "fb9ca5f23cfe427d80c31747f61260d6",
|
||||
"status": "succeeded",
|
||||
"success": true,
|
||||
"execution_time": 56.48335075378418,
|
||||
"result": "Here are the AMC Artisan Films coming soon:\n\n1. **MICHAEL** - April 24\n - A front-row seat to the life and legacy of one of the most influential artists\n\n2. **The AI Doc: Or How I Became an Apocaloptimist**\n - Release date not specified\n\n3. **Lorne** - April 17, 2026\n - From Academy Award-winning filmmaker Morgan Neville\n - Behind-the-scenes glimpse at comedy empire builder\n - Rating: R | Runtime: 1 hr 41 min\n\n4. **Hamlet** - April 10, 2026\n - Rating: NR | Runtime: 1 hr 54 min\n\n5. **Mother Mary** - April 17, 2026\n - Starring Anne Hathaway, Michaela Coel, Hunter Schafer, FKA Twigs\n - Directed by David Lowery | Rating: R | Runtime: 1 hr 50 min",
|
||||
"error": null,
|
||||
"raw_response": {
|
||||
"success": true,
|
||||
"result": "Here are the AMC Artisan Films coming soon:\n\n1. **MICHAEL** - April 24\n - A front-row seat to the life and legacy of one of the most influential artists\n\n2. **The AI Doc: Or How I Became an Apocaloptimist**\n - Release date not specified\n\n3. **Lorne** - April 17, 2026\n - From Academy Award-winning filmmaker Morgan Neville\n - Behind-the-scenes glimpse at comedy empire builder\n - Rating: R | Runtime: 1 hr 41 min\n\n4. **Hamlet** - April 10, 2026\n - Rating: NR | Runtime: 1 hr 54 min\n\n5. **Mother Mary** - April 17, 2026\n - Starring Anne Hathaway, Michaela Coel, Hunter Schafer, FKA Twigs\n - Directed by David Lowery | Rating: R | Runtime: 1 hr 50 min",
|
||||
"browser_view": "https://face-veteran-investigate-daniel.trycloudflare.com"
|
||||
}
|
||||
},
|
||||
"timestamp": "2026-04-22T03:28:05.665642"
|
||||
},
|
||||
{
|
||||
"index": 4,
|
||||
"original_task_id": "mind2web_4_1776817685",
|
||||
"api_task_id": "e3db75a7582e4fde856d8e9b746417a2",
|
||||
"task_description": "Go to the page with help in choosing a solar energy product for homes in tesla",
|
||||
"reference_length": 5,
|
||||
"status": "succeeded",
|
||||
"queue_time_sec": 0.0,
|
||||
"execution_time_sec": 295.44,
|
||||
"total_time_sec": 295.44,
|
||||
"result": {
|
||||
"task_id": "e3db75a7582e4fde856d8e9b746417a2",
|
||||
"status": "succeeded",
|
||||
"success": true,
|
||||
"execution_time": 293.71661829948425,
|
||||
"result": "Successfully found help content for choosing solar energy products via Tesla Assist chat. The response includes: (1) Design & Aesthetics comparison between Solar Panels and Solar Roof, (2) Durability information for both products, (3) Roof Eligibility table showing which roof types work with each product (Asphalt/Composition Shingles, Concrete Tile, Flat Roof, Standing Seam Metal, Other), (4) Key Benefits including savings & independence info, (5) Next steps with Order Now and Schedule Consultation links.",
|
||||
"error": null,
|
||||
"raw_response": {
|
||||
"success": true,
|
||||
"result": "Successfully found help content for choosing solar energy products via Tesla Assist chat. The response includes: (1) Design & Aesthetics comparison between Solar Panels and Solar Roof, (2) Durability information for both products, (3) Roof Eligibility table showing which roof types work with each product (Asphalt/Composition Shingles, Concrete Tile, Flat Roof, Standing Seam Metal, Other), (4) Key Benefits including savings & independence info, (5) Next steps with Order Now and Schedule Consultation links.",
|
||||
"browser_view": "https://face-veteran-investigate-daniel.trycloudflare.com"
|
||||
}
|
||||
},
|
||||
"timestamp": "2026-04-22T03:33:01.112179"
|
||||
}
|
||||
]
|
||||
12
api/mind2web_summary_20260422_015724.json
Normal file
12
api/mind2web_summary_20260422_015724.json
Normal file
|
|
@ -0,0 +1,12 @@
|
|||
{
|
||||
"benchmark": "Online-Mind2Web",
|
||||
"timestamp": "20260422_015724",
|
||||
"api_endpoint": "http://localhost:8088/api/browser/tasks",
|
||||
"total_tasks": 0,
|
||||
"completed": 0,
|
||||
"failed": 0,
|
||||
"success_rate": 0.0,
|
||||
"avg_time_sec": null,
|
||||
"median_time_sec": null,
|
||||
"tasks_per_hour": null
|
||||
}
|
||||
197
api/test-api.py
Normal file
197
api/test-api.py
Normal file
|
|
@ -0,0 +1,197 @@
|
|||
import requests
|
||||
import time
|
||||
import json
|
||||
from datasets import load_dataset
|
||||
from datetime import datetime
|
||||
|
||||
# API configuration: endpoint used to create and query browser tasks.
API_URL = "http://localhost:8088/api/browser/tasks"
HEADERS = {"Content-Type": "application/json"}

# Load the benchmark dataset.
dataset = load_dataset("iMeanAI/Mind2Web-Live", split="train")

# For a quick test only the first TEST_SIZE tasks are used; set TEST_SIZE to
# len(dataset) to run the full benchmark. The value is clamped to the dataset
# size so that select() is never asked for indices that do not exist.
TEST_SIZE = 10
dataset = dataset.select(range(min(TEST_SIZE, len(dataset))))

print(f"Загружено задач: {len(dataset)}")
# Only inspect the first record when there is one; an empty dataset would
# otherwise raise here before the benchmark even starts.
if len(dataset) > 0:
    print(f"Поля: {dataset[0].keys()}\n")

# Number of leading dataset items the main loop skips before it starts
# submitting tasks.
cnt = 3
# Per-task result records collected by the main loop.
results = []
|
||||
|
||||
# Timeout value sent with each task (seconds), the polling cadence, and a
# client-side ceiling on how long a single task is polled. The ceiling is the
# task timeout plus a grace period, so a task that never leaves
# "queued"/"running" (or a status endpoint that keeps failing) cannot hang the
# benchmark forever.
# NOTE(review): assumes the API treats "timeout" as a per-task limit in
# seconds - confirm against the server.
TASK_TIMEOUT_SEC = 300
POLL_INTERVAL_SEC = 2
POLL_DEADLINE_SEC = TASK_TIMEOUT_SEC + 60

for idx, item in enumerate(dataset):
    # Skip the first `cnt` dataset items.
    if cnt > 0:
        cnt -= 1
        continue

    # Fields taken from the dataset record.
    task_desc = item['task']                    # task description
    ref_length = item['reference_task_length']  # reference length in steps

    # Local task ID: dataset index + Unix timestamp for uniqueness.
    task_id_orig = f"mind2web_{idx}_{int(time.time())}"

    print(f"\n[{idx + 1}/{len(dataset)}] Task: {task_desc[:70]}...")
    print(f" Эталонная длина: {ref_length} шагов")

    start_time = time.time()

    try:
        # 1. Create the task through the API.
        resp = requests.post(
            API_URL,
            json={
                "task": task_desc,
                "timeout": TASK_TIMEOUT_SEC,  # longer timeout for complex tasks
                "metadata": {
                    "source": "mind2web",
                    "reference_length": ref_length
                }
            },
            headers=HEADERS,
            timeout=10
        )

        if resp.status_code != 202:
            print(f" ❌ Ошибка создания задачи: {resp.status_code}")
            print(f" Ответ: {resp.text}")
            continue

        api_task_id = resp.json()["task_id"]
        created_at = time.time()
        # Round-trip latency of the create request (reported as "queue" time).
        queue_time = created_at - start_time

        print(f" 📝 Task ID: {api_task_id} | Очередь: {queue_time:.2f}с")

        # 2. Poll until the task leaves the queued/running states, or until the
        #    client-side deadline expires.
        status = "queued"
        poll_count = 0
        poll_deadline = created_at + POLL_DEADLINE_SEC
        while status in ("queued", "running"):
            if time.time() > poll_deadline:
                print(f" ⏰ Превышено время ожидания ({POLL_DEADLINE_SEC}с), опрос прекращён")
                status = "timeout"
                break

            time.sleep(POLL_INTERVAL_SEC)
            poll_count += 1

            try:
                status_resp = requests.get(f"{API_URL}/{api_task_id}", timeout=5)
                if status_resp.status_code == 200:
                    status_data = status_resp.json()
                    status = status_data.get("status", "unknown")

                    # Show progress every 5 polls.
                    if poll_count % 5 == 0:
                        elapsed = time.time() - start_time
                        print(f" ⏳ Статус: {status} | Прошло: {elapsed:.1f}с")
            except Exception as e:
                # Best effort: a failed poll is reported and retried on the
                # next iteration; the deadline above bounds the retries.
                print(f" ⚠️ Ошибка опроса: {e}")

        end_time = time.time()
        # Time spent after the task was accepted, so that
        # queue_time + execution_time == total time (previously this was a
        # duplicate of the total time).
        execution_time = end_time - created_at

        # 3. Fetch the result. A failure here must not discard the timing
        #    metrics that were already measured for this task.
        result_data = None
        try:
            result_resp = requests.get(f"{API_URL}/{api_task_id}/result", timeout=10)
            if result_resp.status_code == 200:
                try:
                    result_data = result_resp.json()
                except ValueError:
                    # Body is not valid JSON: keep the raw text instead.
                    result_data = result_resp.text
        except requests.exceptions.RequestException as e:
            print(f" ⚠️ Ошибка получения результата: {e}")

        # 4. Record the metrics for this task.
        result = {
            "index": idx,
            "original_task_id": task_id_orig,
            "api_task_id": api_task_id,
            "task_description": task_desc,
            "reference_length": ref_length,
            "status": status,
            "queue_time_sec": round(queue_time, 2),
            "execution_time_sec": round(execution_time, 2),
            "total_time_sec": round(end_time - start_time, 2),
            "result": result_data,
            "timestamp": datetime.now().isoformat()
        }
        results.append(result)

        # Checkpoint: rewrite the accumulated results after every task so a
        # crash or interrupt does not lose completed work.
        with open("mind2web_benchmark.json", "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2, ensure_ascii=False)

        # Status emoji for the per-task summary line.
        status_emoji = "✅" if status == "succeeded" else "❌"
        print(f" {status_emoji} Статус: {status} | Время: {execution_time:.1f}с")

    except requests.exceptions.Timeout:
        # Only the create request can reach this handler: polling and result
        # fetching handle their own request errors above.
        print(" ❌ Таймаут при создании задачи")
    except Exception as e:
        print(f" ❌ Ошибка: {type(e).__name__}: {e}")
|
||||
|
||||
import statistics  # needed only by this reporting section (true median)

# Save the detailed per-task results to a timestamped file.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"mind2web_benchmark_{timestamp}.json"

with open(filename, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2, ensure_ascii=False)

print("\n" + "=" * 60)
print("📊 ИТОГОВЫЕ МЕТРИКИ СКОРОСТИ")
print("=" * 60)

# Status breakdown. The API reports a finished task as "succeeded" (that is
# also what the per-task loop checks); "completed" is still accepted so result
# records using the older label keep counting as successes.
SUCCESS_STATUSES = ("succeeded", "completed")
completed = [r for r in results if r["status"] in SUCCESS_STATUSES]
failed = [r for r in results if r["status"] == "failed"]
unknown = [
    r for r in results
    if r["status"] not in SUCCESS_STATUSES and r["status"] != "failed"
]

# max(..., 1) avoids division by zero when no task produced a record.
result_count = max(len(results), 1)

print(f"\n📈 СТАТУСЫ:")
print(f" Всего задач: {len(results)}")
print(f" ✅ Успешно: {len(completed)} ({len(completed) / result_count * 100:.1f}%)")
print(f" ❌ Провалено: {len(failed)} ({len(failed) / result_count * 100:.1f}%)")
if unknown:
    print(f" ❓ Неизвестный статус: {len(unknown)}")

# Aggregates are computed once and reused by both the console report and the
# summary file; they stay None when no task succeeded.
avg_time = None
median_time = None
tasks_per_hour = None

if completed:
    total_times = [r["total_time_sec"] for r in completed]
    queue_times = [r["queue_time_sec"] for r in completed]
    ordered_times = sorted(total_times)

    avg_time = sum(total_times) / len(total_times)
    # statistics.median averages the two middle values for even-sized
    # samples instead of picking the upper-middle element.
    median_time = statistics.median(total_times)
    # Guard against a zero average (all durations rounded down to 0.0).
    if avg_time > 0:
        tasks_per_hour = 3600 / avg_time

    print(f"\n⏱️ ВРЕМЯ ВЫПОЛНЕНИЯ:")
    print(f" Среднее: {avg_time:.2f} сек")
    print(f" Медиана (p50): {median_time:.2f} сек")
    # p95 is only reported once the sample has at least 20 tasks.
    if len(total_times) >= 20:
        print(f" p95: {ordered_times[int(len(total_times) * 0.95)]:.2f} сек")
    print(f" Мин: {min(total_times):.2f} сек")
    print(f" Макс: {max(total_times):.2f} сек")

    print(f"\n📊 ПРОИЗВОДИТЕЛЬНОСТЬ:")
    print(f" Среднее время в очереди: {sum(queue_times) / len(queue_times):.2f} сек")
    if tasks_per_hour is not None:
        print(f" Скорость выполнения: {tasks_per_hour:.1f} задач/час")

    # Efficiency relative to the reference task length (steps).
    if all("reference_length" in r for r in completed):
        avg_ref_length = sum(r["reference_length"] for r in completed) / len(completed)
        # Skip the per-step figure when the reference lengths are all zero.
        if avg_ref_length > 0:
            time_per_step = avg_time / avg_ref_length
            print(f" Среднее время на шаг: {time_per_step:.2f} сек")

print(f"\n💾 Результаты сохранены в: {filename}")

# Short summary report, intended for comparing benchmark runs.
# NOTE(review): the dataset loaded by this script is "iMeanAI/Mind2Web-Live",
# while the label below says "Online-Mind2Web" - confirm which name is intended.
summary = {
    "benchmark": "Online-Mind2Web",
    "timestamp": timestamp,
    "api_endpoint": API_URL,
    "total_tasks": len(results),
    "completed": len(completed),
    "failed": len(failed),
    "success_rate": len(completed) / result_count * 100,
    "avg_time_sec": avg_time,
    "median_time_sec": median_time,
    "tasks_per_hour": tasks_per_hour
}

summary_file = f"mind2web_summary_{timestamp}.json"
with open(summary_file, "w", encoding="utf-8") as f:
    json.dump(summary, f, indent=2, ensure_ascii=False)

print(f"📋 Краткий отчет сохранен в: {summary_file}")
|
||||
117
requirements_test.txt
Normal file
117
requirements_test.txt
Normal file
|
|
@ -0,0 +1,117 @@
|
|||
aiofiles==25.1.0
|
||||
aiohappyeyeballs==2.6.1
|
||||
aiosignal==1.4.0
|
||||
annotated-doc==0.0.4
|
||||
annotated-types==0.7.0
|
||||
anthropic==0.76.0
|
||||
anyio==4.12.1
|
||||
attrs==26.1.0
|
||||
backoff==2.2.1
|
||||
beautifulsoup4==4.14.3
|
||||
browser-use==0.12.6
|
||||
browser-use-sdk==2.0.15
|
||||
bubus==1.5.6
|
||||
cdp-use==1.4.5
|
||||
certifi==2026.2.25
|
||||
cffi==2.0.0
|
||||
charset-normalizer==3.4.7
|
||||
click==8.3.1
|
||||
cloudpickle==3.1.2
|
||||
cryptography==46.0.7
|
||||
Cython==3.2.4
|
||||
datasets==4.8.4
|
||||
dill==0.4.1
|
||||
distro==1.9.0
|
||||
docstring_parser==0.18.0
|
||||
filelock==3.29.0
|
||||
frozenlist==1.8.0
|
||||
fsspec==2026.2.0
|
||||
google-api-core==2.29.0
|
||||
google-api-python-client==2.188.0
|
||||
google-auth==2.48.0
|
||||
google-auth-httplib2==0.3.1
|
||||
google-auth-oauthlib==1.2.4
|
||||
google-genai==1.65.0
|
||||
googleapis-common-protos==1.74.0
|
||||
groq==1.0.0
|
||||
h11==0.16.0
|
||||
hf-xet==1.4.3
|
||||
httpcore==1.0.9
|
||||
httplib2==0.31.2
|
||||
httptools==0.7.1
|
||||
httpx==0.28.1
|
||||
httpx-sse==0.4.3
|
||||
huggingface_hub==1.11.0
|
||||
idna==3.12
|
||||
inquirerpy==0.3.4
|
||||
jiter==0.14.0
|
||||
jsonschema==4.26.0
|
||||
jsonschema-specifications==2025.9.1
|
||||
lxml==6.1.0
|
||||
markdown-it-py==4.0.0
|
||||
markdownify==1.2.2
|
||||
mcp==1.26.0
|
||||
mdurl==0.1.2
|
||||
multidict==6.7.1
|
||||
multiprocess==0.70.19
|
||||
numpy==2.4.4
|
||||
oauthlib==3.3.1
|
||||
ollama==0.6.1
|
||||
openai==2.16.0
|
||||
packaging==26.1
|
||||
pandas==3.0.2
|
||||
pfzy==0.3.4
|
||||
pillow==12.1.1
|
||||
portalocker==3.2.0
|
||||
posthog==7.7.0
|
||||
prompt_toolkit==3.0.52
|
||||
propcache==0.4.1
|
||||
proto-plus==1.27.2
|
||||
protobuf==6.33.6
|
||||
psutil==7.2.2
|
||||
pyarrow==24.0.0
|
||||
pyasn1==0.6.3
|
||||
pyasn1_modules==0.4.2
|
||||
pycparser==3.0
|
||||
pydantic-settings==2.14.0
|
||||
pydantic_core==2.41.5
|
||||
Pygments==2.20.0
|
||||
PyJWT==2.12.1
|
||||
# macOS-only packages: restricted with environment markers so that
# `pip install -r` also works on Linux and Windows.
pyobjc-core==12.1; sys_platform == "darwin"
pyobjc-framework-Cocoa==12.1; sys_platform == "darwin"
|
||||
pyotp==2.9.0
|
||||
pyparsing==3.3.2
|
||||
pypdf==6.9.1
|
||||
python-dateutil==2.9.0.post0
|
||||
python-docx==1.2.0
|
||||
python-dotenv==1.2.1
|
||||
python-multipart==0.0.26
|
||||
PyYAML==6.0.3
|
||||
referencing==0.37.0
|
||||
reportlab==4.4.9
|
||||
requests==2.32.5
|
||||
requests-oauthlib==2.0.0
|
||||
rich==14.3.1
|
||||
rpds-py==0.30.0
|
||||
rsa==4.9.1
|
||||
screeninfo==0.8.1
|
||||
shellingham==1.5.4
|
||||
six==1.17.0
|
||||
sniffio==1.3.1
|
||||
soupsieve==2.8.3
|
||||
sse-starlette==3.3.4
|
||||
starlette==1.0.0
|
||||
tenacity==9.1.4
|
||||
tqdm==4.67.3
|
||||
typer==0.24.1
|
||||
typing-inspection==0.4.2
|
||||
typing_extensions==4.15.0
|
||||
uritemplate==4.2.0
|
||||
urllib3==2.6.3
|
||||
uuid7==0.1.0
|
||||
uvloop==0.22.1
|
||||
watchfiles==1.1.1
|
||||
wcwidth==0.6.0
|
||||
websockets==16.0
|
||||
xxhash==3.6.0
|
||||
yarl==1.23.0
|
||||
Loading…
Add table
Add a link
Reference in a new issue