diff --git a/api/mind2web_benchmark.json b/api/mind2web_benchmark.json new file mode 100644 index 00000000..8463e76a --- /dev/null +++ b/api/mind2web_benchmark.json @@ -0,0 +1,77 @@ +[ + { + "index": 2, + "original_task_id": "mind2web_2_1776817552", + "api_task_id": "2334bb039ee04dc6b23b5b0240314064", + "task_description": "Find hard side Carry-on Luggage in black color in target", + "reference_length": 9, + "status": "succeeded", + "queue_time_sec": 0.02, + "execution_time_sec": 74.38, + "total_time_sec": 74.38, + "result": { + "task_id": "2334bb039ee04dc6b23b5b0240314064", + "status": "succeeded", + "success": true, + "execution_time": 72.45919299125671, + "result": "Found hard side carry-on luggage in black at Target:\n\n1. **Signature Hardside Carry On Spinner Suitcase Matte Black** - Open Story™\n - Price: $128.00\n - Rating: 4.3/5 (71 ratings)\n - Features: TSA Locks, Water-Resistant, Polycarbonate\n\n2. **Hardside Carry On Spinner Suitcase Black** - Open Story™\n - Price: $90.00\n - Rating: 4.3/5 (500 ratings)\n - Features: Telescoping Handle, 8 Wheels\n\n3. **SWISSGEAR Energie Hardside Carry On Spinner Suitcase** - Black\n - Price: $86.39 (on sale)\n - Rating: 4.2/5 (76 ratings)\n - Features: Structured Polycarbonate, 8 Spinner Wheels\n\nAll three products match your requirements for hard side carry-on luggage in black color.", + "error": null, + "raw_response": { + "success": true, + "result": "Found hard side carry-on luggage in black at Target:\n\n1. **Signature Hardside Carry On Spinner Suitcase Matte Black** - Open Story™\n - Price: $128.00\n - Rating: 4.3/5 (71 ratings)\n - Features: TSA Locks, Water-Resistant, Polycarbonate\n\n2. **Hardside Carry On Spinner Suitcase Black** - Open Story™\n - Price: $90.00\n - Rating: 4.3/5 (500 ratings)\n - Features: Telescoping Handle, 8 Wheels\n\n3. 
**SWISSGEAR Energie Hardside Carry On Spinner Suitcase** - Black\n - Price: $86.39 (on sale)\n - Rating: 4.2/5 (76 ratings)\n - Features: Structured Polycarbonate, 8 Spinner Wheels\n\nAll three products match your requirements for hard side carry-on luggage in black color.", + "browser_view": "https://face-veteran-investigate-daniel.trycloudflare.com" + } + }, + "timestamp": "2026-04-22T03:27:07.331142" + }, + { + "index": 3, + "original_task_id": "mind2web_3_1776817627", + "api_task_id": "fb9ca5f23cfe427d80c31747f61260d6", + "task_description": "Show me the coming soon AMC Artisan Films in amctheatres", + "reference_length": 6, + "status": "succeeded", + "queue_time_sec": 0.01, + "execution_time_sec": 58.33, + "total_time_sec": 58.33, + "result": { + "task_id": "fb9ca5f23cfe427d80c31747f61260d6", + "status": "succeeded", + "success": true, + "execution_time": 56.48335075378418, + "result": "Here are the AMC Artisan Films coming soon:\n\n1. **MICHAEL** - April 24\n - A front-row seat to the life and legacy of one of the most influential artists\n\n2. **The AI Doc: Or How I Became an Apocaloptimist**\n - Release date not specified\n\n3. **Lorne** - April 17, 2026\n - From Academy Award-winning filmmaker Morgan Neville\n - Behind-the-scenes glimpse at comedy empire builder\n - Rating: R | Runtime: 1 hr 41 min\n\n4. **Hamlet** - April 10, 2026\n - Rating: NR | Runtime: 1 hr 54 min\n\n5. **Mother Mary** - April 17, 2026\n - Starring Anne Hathaway, Michaela Coel, Hunter Schafer, FKA Twigs\n - Directed by David Lowery | Rating: R | Runtime: 1 hr 50 min", + "error": null, + "raw_response": { + "success": true, + "result": "Here are the AMC Artisan Films coming soon:\n\n1. **MICHAEL** - April 24\n - A front-row seat to the life and legacy of one of the most influential artists\n\n2. **The AI Doc: Or How I Became an Apocaloptimist**\n - Release date not specified\n\n3. 
**Lorne** - April 17, 2026\n - From Academy Award-winning filmmaker Morgan Neville\n - Behind-the-scenes glimpse at comedy empire builder\n - Rating: R | Runtime: 1 hr 41 min\n\n4. **Hamlet** - April 10, 2026\n - Rating: NR | Runtime: 1 hr 54 min\n\n5. **Mother Mary** - April 17, 2026\n - Starring Anne Hathaway, Michaela Coel, Hunter Schafer, FKA Twigs\n - Directed by David Lowery | Rating: R | Runtime: 1 hr 50 min", + "browser_view": "https://face-veteran-investigate-daniel.trycloudflare.com" + } + }, + "timestamp": "2026-04-22T03:28:05.665642" + }, + { + "index": 4, + "original_task_id": "mind2web_4_1776817685", + "api_task_id": "e3db75a7582e4fde856d8e9b746417a2", + "task_description": "Go to the page with help in choosing a solar energy product for homes in tesla", + "reference_length": 5, + "status": "succeeded", + "queue_time_sec": 0.0, + "execution_time_sec": 295.44, + "total_time_sec": 295.44, + "result": { + "task_id": "e3db75a7582e4fde856d8e9b746417a2", + "status": "succeeded", + "success": true, + "execution_time": 293.71661829948425, + "result": "Successfully found help content for choosing solar energy products via Tesla Assist chat. The response includes: (1) Design & Aesthetics comparison between Solar Panels and Solar Roof, (2) Durability information for both products, (3) Roof Eligibility table showing which roof types work with each product (Asphalt/Composition Shingles, Concrete Tile, Flat Roof, Standing Seam Metal, Other), (4) Key Benefits including savings & independence info, (5) Next steps with Order Now and Schedule Consultation links.", + "error": null, + "raw_response": { + "success": true, + "result": "Successfully found help content for choosing solar energy products via Tesla Assist chat. 
"""Speed benchmark for the browser-task HTTP API using Mind2Web-Live tasks.

For every selected dataset row the script:

1. submits the task description to ``API_URL`` (expects HTTP 202 and a
   ``task_id`` in the JSON reply),
2. polls ``API_URL/<task_id>`` until the status leaves ``queued``/``running``
   or the client-side deadline expires,
3. fetches ``API_URL/<task_id>/result``,
4. appends a timing record and rewrites ``mind2web_benchmark.json``.

After the loop it writes a timestamped copy of the detailed records, prints a
report, and writes a short summary JSON.  All files go to the current working
directory.
"""
import json
import statistics
import time
from datetime import datetime

import requests
from datasets import load_dataset

# API configuration
API_URL = "http://localhost:8088/api/browser/tasks"
HEADERS = {"Content-Type": "application/json"}

# Number of dataset rows to load; raise it to the dataset length for a full run.
TEST_SIZE = 10
# Number of leading rows to skip (replaces the original ad-hoc countdown counter).
SKIP_FIRST = 3
# Per-task timeout sent to the API, in seconds (raised for complex tasks).
TASK_TIMEOUT_SEC = 300
# Pause between two status polls, in seconds.
POLL_INTERVAL_SEC = 2
# Client-side polling budget per task.  Without it a status endpoint that keeps
# failing would leave the status at "queued" and the loop would never end.
POLL_DEADLINE_SEC = TASK_TIMEOUT_SEC + 60

# Statuses that mean "still in progress".
PENDING_STATUSES = ("queued", "running")
# The per-task status line treats "succeeded" as success; "completed" is kept
# so result files produced with that label are still counted.
SUCCESS_STATUSES = ("succeeded", "completed")
FAILURE_STATUSES = ("failed",)
# Status recorded when the client-side polling deadline expires.
POLL_TIMEOUT_STATUS = "poll_timeout"

# File rewritten after every finished task so partial runs are not lost.
INCREMENTAL_RESULTS_FILE = "mind2web_benchmark.json"


def _save_json(path, payload):
    """Write ``payload`` to ``path`` as indented UTF-8 JSON without ASCII escaping."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2, ensure_ascii=False)


def _submit_task(task_desc, ref_length):
    """Create a task through the API.

    Returns the API task id, or ``None`` when the API answers with anything
    other than HTTP 202.  Network errors propagate to the caller.
    """
    resp = requests.post(
        API_URL,
        json={
            "task": task_desc,
            "timeout": TASK_TIMEOUT_SEC,
            "metadata": {
                "source": "mind2web",
                "reference_length": ref_length,
            },
        },
        headers=HEADERS,
        timeout=10,
    )

    if resp.status_code != 202:
        print(f" ❌ Ошибка создания задачи: {resp.status_code}")
        print(f" Ответ: {resp.text}")
        return None

    return resp.json()["task_id"]


def _poll_status(api_task_id, started):
    """Poll the task status until it is no longer pending.

    ``started`` is the ``time.monotonic()`` value taken before submission; it
    is used for the progress line and for the client-side deadline.  Returns
    the last status reported by the API, or ``POLL_TIMEOUT_STATUS`` when the
    deadline expires first.
    """
    status = "queued"
    poll_count = 0
    while status in PENDING_STATUSES:
        if time.monotonic() - started > POLL_DEADLINE_SEC:
            return POLL_TIMEOUT_STATUS

        time.sleep(POLL_INTERVAL_SEC)
        poll_count += 1

        try:
            status_resp = requests.get(f"{API_URL}/{api_task_id}", timeout=5)
            if status_resp.status_code != 200:
                continue
            status = status_resp.json().get("status", "unknown")
        except Exception as e:
            # Polling is best-effort: report the problem and try again.  The
            # deadline above bounds how long this can go on.
            print(f" ⚠️ Ошибка опроса: {e}")
            continue

        # Show progress every 5 polls.
        if poll_count % 5 == 0:
            elapsed = time.monotonic() - started
            print(f" ⏳ Статус: {status} | Прошло: {elapsed:.1f}с")

    return status


def _fetch_result(api_task_id):
    """Fetch the task result.

    Returns the decoded JSON body, the raw text when the body is not valid
    JSON, or ``None`` when the request fails or the reply is not HTTP 200.
    """
    try:
        result_resp = requests.get(f"{API_URL}/{api_task_id}/result", timeout=10)
    except requests.exceptions.RequestException as e:
        # Keep the timing record even if the result itself cannot be fetched.
        print(f" ❌ Ошибка: {type(e).__name__}: {e}")
        return None

    if result_resp.status_code != 200:
        return None

    try:
        return result_resp.json()
    except ValueError:
        return result_resp.text


def _run_task(idx, item, total):
    """Run one dataset row through the API and return its metrics record.

    Returns ``None`` when the task could not be created or an unexpected error
    occurred; the error is printed and the benchmark continues.
    """
    task_desc = item['task']  # task description
    ref_length = item['reference_task_length']  # reference length in steps

    # Dataset index plus wall-clock timestamp keeps the id unique across runs.
    task_id_orig = f"mind2web_{idx}_{int(time.time())}"

    print(f"\n[{idx + 1}/{total}] Task: {task_desc[:70]}...")
    print(f" Эталонная длина: {ref_length} шагов")

    # Durations use the monotonic clock so system clock changes cannot skew them.
    started = time.monotonic()

    try:
        # 1. Create the task through the API.
        api_task_id = _submit_task(task_desc, ref_length)
        if api_task_id is None:
            return None

        created_at = time.monotonic()
        queue_time = created_at - started
        print(f" 📝 Task ID: {api_task_id} | Очередь: {queue_time:.2f}с")

        # 2. Wait for completion, printing progress.
        status = _poll_status(api_task_id, started)
        finished = time.monotonic()

        # 3. Fetch the result.
        result_data = _fetch_result(api_task_id)
    except requests.exceptions.Timeout:
        print(" ❌ Таймаут при создании задачи")
        return None
    except Exception as e:
        print(f" ❌ Ошибка: {type(e).__name__}: {e}")
        return None

    total_time = finished - started
    # Measured from task creation; the original computed this field exactly
    # like the total, so the two were always equal.
    execution_time = finished - created_at

    # 4. Metrics record.
    record = {
        "index": idx,
        "original_task_id": task_id_orig,
        "api_task_id": api_task_id,
        "task_description": task_desc,
        "reference_length": ref_length,
        "status": status,
        "queue_time_sec": round(queue_time, 2),
        "execution_time_sec": round(execution_time, 2),
        "total_time_sec": round(total_time, 2),
        "result": result_data,
        "timestamp": datetime.now().isoformat(),
    }

    status_emoji = "✅" if status == "succeeded" else "❌"
    print(f" {status_emoji} Статус: {status} | Время: {total_time:.1f}с")
    return record


def _split_by_status(results):
    """Partition records into (completed, failed, unknown) lists by status."""
    completed = [r for r in results if r["status"] in SUCCESS_STATUSES]
    failed = [r for r in results if r["status"] in FAILURE_STATUSES]
    unknown = [
        r for r in results
        if r["status"] not in SUCCESS_STATUSES and r["status"] not in FAILURE_STATUSES
    ]
    return completed, failed, unknown


def _print_report(results):
    """Print status counts and timing statistics for the collected records."""
    completed, failed, unknown = _split_by_status(results)
    denominator = max(len(results), 1)  # avoids division by zero on an empty run

    print("\n" + "=" * 60)
    print("📊 ИТОГОВЫЕ МЕТРИКИ СКОРОСТИ")
    print("=" * 60)

    print("\n📈 СТАТУСЫ:")
    print(f" Всего задач: {len(results)}")
    print(f" ✅ Успешно: {len(completed)} ({len(completed) / denominator * 100:.1f}%)")
    print(f" ❌ Провалено: {len(failed)} ({len(failed) / denominator * 100:.1f}%)")
    if unknown:
        print(f" ❓ Неизвестный статус: {len(unknown)}")

    if not completed:
        return

    total_times = sorted(r["total_time_sec"] for r in completed)
    queue_times = [r["queue_time_sec"] for r in completed]
    avg_total = sum(total_times) / len(total_times)

    print("\n⏱️ ВРЕМЯ ВЫПОЛНЕНИЯ:")
    print(f" Среднее: {avg_total:.2f} сек")
    print(f" Медиана (p50): {statistics.median(total_times):.2f} сек")
    if len(total_times) >= 20:
        # p95 is only reported once there are enough samples.
        print(f" p95: {total_times[int(len(total_times) * 0.95)]:.2f} сек")
    print(f" Мин: {total_times[0]:.2f} сек")
    print(f" Макс: {total_times[-1]:.2f} сек")

    print("\n📊 ПРОИЗВОДИТЕЛЬНОСТЬ:")
    print(f" Среднее время в очереди: {sum(queue_times) / len(queue_times):.2f} сек")
    if avg_total > 0:
        print(f" Скорость выполнения: {3600 / avg_total:.1f} задач/час")

    # Efficiency relative to the reference length.
    if all("reference_length" in r for r in completed):
        avg_ref_length = sum(r["reference_length"] for r in completed) / len(completed)
        if avg_ref_length > 0:
            print(f" Среднее время на шаг: {avg_total / avg_ref_length:.2f} сек")


def _build_summary(results, timestamp):
    """Build the short comparison report as a JSON-serialisable dict.

    The timing fields are ``None`` when no task succeeded; ``tasks_per_hour``
    is also ``None`` when the average time is zero.
    """
    completed, failed, _ = _split_by_status(results)
    total_times = [r["total_time_sec"] for r in completed]
    avg_total = sum(total_times) / len(total_times) if total_times else None

    return {
        # NOTE(review): the dataset loaded below is "iMeanAI/Mind2Web-Live";
        # confirm that "Online-Mind2Web" is the intended benchmark label.
        "benchmark": "Online-Mind2Web",
        "timestamp": timestamp,
        "api_endpoint": API_URL,
        "total_tasks": len(results),
        "completed": len(completed),
        "failed": len(failed),
        "success_rate": len(completed) / max(len(results), 1) * 100,
        "avg_time_sec": avg_total,
        "median_time_sec": statistics.median(total_times) if total_times else None,
        "tasks_per_hour": 3600 / avg_total if avg_total else None,
    }


def main():
    """Run the benchmark loop, then write the detailed and summary reports."""
    # Load the dataset and keep the first TEST_SIZE rows (or fewer if it is shorter).
    dataset = load_dataset("iMeanAI/Mind2Web-Live", split="train")
    dataset = dataset.select(range(min(TEST_SIZE, len(dataset))))

    print(f"Загружено задач: {len(dataset)}")
    if len(dataset):
        print(f"Поля: {dataset[0].keys()}\n")

    results = []
    for idx, item in enumerate(dataset):
        if idx < SKIP_FIRST:
            continue

        record = _run_task(idx, item, len(dataset))
        if record is None:
            continue

        results.append(record)
        # Rewrite the incremental file after every task so an interrupted run
        # still leaves usable results behind.
        _save_json(INCREMENTAL_RESULTS_FILE, results)

    # Save the detailed results under a timestamped name.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"mind2web_benchmark_{timestamp}.json"
    _save_json(filename, results)

    _print_report(results)
    print(f"\n💾 Результаты сохранены в: {filename}")

    # Short report for comparing runs.
    summary_file = f"mind2web_summary_{timestamp}.json"
    _save_json(summary_file, _build_summary(results, timestamp))
    print(f"📋 Краткий отчет сохранен в: {summary_file}")


if __name__ == "__main__":
    main()
+aiohappyeyeballs==2.6.1 +aiosignal==1.4.0 +annotated-doc==0.0.4 +annotated-types==0.7.0 +anthropic==0.76.0 +anyio==4.12.1 +attrs==26.1.0 +backoff==2.2.1 +beautifulsoup4==4.14.3 +browser-use==0.12.6 +browser-use-sdk==2.0.15 +bubus==1.5.6 +cdp-use==1.4.5 +certifi==2026.2.25 +cffi==2.0.0 +charset-normalizer==3.4.7 +click==8.3.1 +cloudpickle==3.1.2 +cryptography==46.0.7 +Cython==3.2.4 +datasets==4.8.4 +dill==0.4.1 +distro==1.9.0 +docstring_parser==0.18.0 +filelock==3.29.0 +frozenlist==1.8.0 +fsspec==2026.2.0 +google-api-core==2.29.0 +google-api-python-client==2.188.0 +google-auth==2.48.0 +google-auth-httplib2==0.3.1 +google-auth-oauthlib==1.2.4 +google-genai==1.65.0 +googleapis-common-protos==1.74.0 +groq==1.0.0 +h11==0.16.0 +hf-xet==1.4.3 +httpcore==1.0.9 +httplib2==0.31.2 +httptools==0.7.1 +httpx==0.28.1 +httpx-sse==0.4.3 +huggingface_hub==1.11.0 +idna==3.12 +inquirerpy==0.3.4 +jiter==0.14.0 +jsonschema==4.26.0 +jsonschema-specifications==2025.9.1 +lxml==6.1.0 +markdown-it-py==4.0.0 +markdownify==1.2.2 +mcp==1.26.0 +mdurl==0.1.2 +multidict==6.7.1 +multiprocess==0.70.19 +numpy==2.4.4 +oauthlib==3.3.1 +ollama==0.6.1 +openai==2.16.0 +packaging==26.1 +pandas==3.0.2 +pfzy==0.3.4 +pillow==12.1.1 +portalocker==3.2.0 +posthog==7.7.0 +prompt_toolkit==3.0.52 +propcache==0.4.1 +proto-plus==1.27.2 +protobuf==6.33.6 +psutil==7.2.2 +pyarrow==24.0.0 +pyasn1==0.6.3 +pyasn1_modules==0.4.2 +pycparser==3.0 +pydantic-settings==2.14.0 +pydantic_core==2.41.5 +Pygments==2.20.0 +PyJWT==2.12.1 +pyobjc-core==12.1 +pyobjc-framework-Cocoa==12.1 +pyotp==2.9.0 +pyparsing==3.3.2 +pypdf==6.9.1 +python-dateutil==2.9.0.post0 +python-docx==1.2.0 +python-dotenv==1.2.1 +python-multipart==0.0.26 +PyYAML==6.0.3 +referencing==0.37.0 +reportlab==4.4.9 +requests==2.32.5 +requests-oauthlib==2.0.0 +rich==14.3.1 +rpds-py==0.30.0 +rsa==4.9.1 +screeninfo==0.8.1 +shellingham==1.5.4 +six==1.17.0 +sniffio==1.3.1 +soupsieve==2.8.3 +sse-starlette==3.3.4 +starlette==1.0.0 +tenacity==9.1.4 +tqdm==4.67.3 +typer==0.24.1 
+typing-inspection==0.4.2 +typing_extensions==4.15.0 +uritemplate==4.2.0 +urllib3==2.6.3 +uuid7==0.1.0 +uvloop==0.22.1 +watchfiles==1.1.1 +wcwidth==0.6.0 +websockets==16.0 +xxhash==3.6.0 +yarl==1.23.0 \ No newline at end of file