Add tests for the API

2026-04-22 03:46:17 +03:00 · 2026-04-22 03:46:17 +03:00 · 365ab8dd79
commit 365ab8dd79
parent 50589232d6
4 changed files with 403 additions and 0 deletions
--- a/api/mind2web_benchmark.json
+++ b/api/mind2web_benchmark.json
@ -0,0 +1,77 @@
+[
+  {
+    "index": 2,
+    "original_task_id": "mind2web_2_1776817552",
+    "api_task_id": "2334bb039ee04dc6b23b5b0240314064",
+    "task_description": "Find hard side Carry-on Luggage in black color in target",
+    "reference_length": 9,
+    "status": "succeeded",
+    "queue_time_sec": 0.02,
+    "execution_time_sec": 74.38,
+    "total_time_sec": 74.38,
+    "result": {
+      "task_id": "2334bb039ee04dc6b23b5b0240314064",
+      "status": "succeeded",
+      "success": true,
+      "execution_time": 72.45919299125671,
+      "result": "Found hard side carry-on luggage in black at Target:\n\n1. **Signature Hardside Carry On Spinner Suitcase Matte Black** - Open Story™\n   - Price: $128.00\n   - Rating: 4.3/5 (71 ratings)\n   - Features: TSA Locks, Water-Resistant, Polycarbonate\n\n2. **Hardside Carry On Spinner Suitcase Black** - Open Story™\n   - Price: $90.00\n   - Rating: 4.3/5 (500 ratings)\n   - Features: Telescoping Handle, 8 Wheels\n\n3. **SWISSGEAR Energie Hardside Carry On Spinner Suitcase** - Black\n   - Price: $86.39 (on sale)\n   - Rating: 4.2/5 (76 ratings)\n   - Features: Structured Polycarbonate, 8 Spinner Wheels\n\nAll three products match your requirements for hard side carry-on luggage in black color.",
+      "error": null,
+      "raw_response": {
+        "success": true,
+        "result": "Found hard side carry-on luggage in black at Target:\n\n1. **Signature Hardside Carry On Spinner Suitcase Matte Black** - Open Story™\n   - Price: $128.00\n   - Rating: 4.3/5 (71 ratings)\n   - Features: TSA Locks, Water-Resistant, Polycarbonate\n\n2. **Hardside Carry On Spinner Suitcase Black** - Open Story™\n   - Price: $90.00\n   - Rating: 4.3/5 (500 ratings)\n   - Features: Telescoping Handle, 8 Wheels\n\n3. **SWISSGEAR Energie Hardside Carry On Spinner Suitcase** - Black\n   - Price: $86.39 (on sale)\n   - Rating: 4.2/5 (76 ratings)\n   - Features: Structured Polycarbonate, 8 Spinner Wheels\n\nAll three products match your requirements for hard side carry-on luggage in black color.",
+        "browser_view": "https://face-veteran-investigate-daniel.trycloudflare.com"
+      }
+    },
+    "timestamp": "2026-04-22T03:27:07.331142"
+  },
+  {
+    "index": 3,
+    "original_task_id": "mind2web_3_1776817627",
+    "api_task_id": "fb9ca5f23cfe427d80c31747f61260d6",
+    "task_description": "Show me the coming soon AMC Artisan Films in amctheatres",
+    "reference_length": 6,
+    "status": "succeeded",
+    "queue_time_sec": 0.01,
+    "execution_time_sec": 58.33,
+    "total_time_sec": 58.33,
+    "result": {
+      "task_id": "fb9ca5f23cfe427d80c31747f61260d6",
+      "status": "succeeded",
+      "success": true,
+      "execution_time": 56.48335075378418,
+      "result": "Here are the AMC Artisan Films coming soon:\n\n1. **MICHAEL** - April 24\n   - A front-row seat to the life and legacy of one of the most influential artists\n\n2. **The AI Doc: Or How I Became an Apocaloptimist**\n   - Release date not specified\n\n3. **Lorne** - April 17, 2026\n   - From Academy Award-winning filmmaker Morgan Neville\n   - Behind-the-scenes glimpse at comedy empire builder\n   - Rating: R | Runtime: 1 hr 41 min\n\n4. **Hamlet** - April 10, 2026\n   - Rating: NR | Runtime: 1 hr 54 min\n\n5. **Mother Mary** - April 17, 2026\n   - Starring Anne Hathaway, Michaela Coel, Hunter Schafer, FKA Twigs\n   - Directed by David Lowery | Rating: R | Runtime: 1 hr 50 min",
+      "error": null,
+      "raw_response": {
+        "success": true,
+        "result": "Here are the AMC Artisan Films coming soon:\n\n1. **MICHAEL** - April 24\n   - A front-row seat to the life and legacy of one of the most influential artists\n\n2. **The AI Doc: Or How I Became an Apocaloptimist**\n   - Release date not specified\n\n3. **Lorne** - April 17, 2026\n   - From Academy Award-winning filmmaker Morgan Neville\n   - Behind-the-scenes glimpse at comedy empire builder\n   - Rating: R | Runtime: 1 hr 41 min\n\n4. **Hamlet** - April 10, 2026\n   - Rating: NR | Runtime: 1 hr 54 min\n\n5. **Mother Mary** - April 17, 2026\n   - Starring Anne Hathaway, Michaela Coel, Hunter Schafer, FKA Twigs\n   - Directed by David Lowery | Rating: R | Runtime: 1 hr 50 min",
+        "browser_view": "https://face-veteran-investigate-daniel.trycloudflare.com"
+      }
+    },
+    "timestamp": "2026-04-22T03:28:05.665642"
+  },
+  {
+    "index": 4,
+    "original_task_id": "mind2web_4_1776817685",
+    "api_task_id": "e3db75a7582e4fde856d8e9b746417a2",
+    "task_description": "Go to the page with help in choosing a solar energy product for homes in tesla",
+    "reference_length": 5,
+    "status": "succeeded",
+    "queue_time_sec": 0.0,
+    "execution_time_sec": 295.44,
+    "total_time_sec": 295.44,
+    "result": {
+      "task_id": "e3db75a7582e4fde856d8e9b746417a2",
+      "status": "succeeded",
+      "success": true,
+      "execution_time": 293.71661829948425,
+      "result": "Successfully found help content for choosing solar energy products via Tesla Assist chat. The response includes: (1) Design & Aesthetics comparison between Solar Panels and Solar Roof, (2) Durability information for both products, (3) Roof Eligibility table showing which roof types work with each product (Asphalt/Composition Shingles, Concrete Tile, Flat Roof, Standing Seam Metal, Other), (4) Key Benefits including savings & independence info, (5) Next steps with Order Now and Schedule Consultation links.",
+      "error": null,
+      "raw_response": {
+        "success": true,
+        "result": "Successfully found help content for choosing solar energy products via Tesla Assist chat. The response includes: (1) Design & Aesthetics comparison between Solar Panels and Solar Roof, (2) Durability information for both products, (3) Roof Eligibility table showing which roof types work with each product (Asphalt/Composition Shingles, Concrete Tile, Flat Roof, Standing Seam Metal, Other), (4) Key Benefits including savings & independence info, (5) Next steps with Order Now and Schedule Consultation links.",
+        "browser_view": "https://face-veteran-investigate-daniel.trycloudflare.com"
+      }
+    },
+    "timestamp": "2026-04-22T03:33:01.112179"
+  }
+]
--- a/api/mind2web_summary_20260422_015724.json
+++ b/api/mind2web_summary_20260422_015724.json
@ -0,0 +1,12 @@
+{
+  "benchmark": "Online-Mind2Web",
+  "timestamp": "20260422_015724",
+  "api_endpoint": "http://localhost:8088/api/browser/tasks",
+  "total_tasks": 0,
+  "completed": 0,
+  "failed": 0,
+  "success_rate": 0.0,
+  "avg_time_sec": null,
+  "median_time_sec": null,
+  "tasks_per_hour": null
+}
--- a/api/test-api.py
+++ b/api/test-api.py
@ -0,0 +1,197 @@
+import requests
+import time
+import json
+from datasets import load_dataset
+from datetime import datetime
+
+# API configuration: task endpoint and the headers sent when creating a task
+API_URL = "http://localhost:8088/api/browser/tasks"
+HEADERS = {"Content-Type": "application/json"}
+
+# Load the dataset
+dataset = load_dataset("iMeanAI/Mind2Web-Live", split="train")
+
+# For a test run take the first N tasks (switch to the full dataset if needed)
+TEST_SIZE = 10  # or len(dataset) for the full benchmark
+dataset = dataset.select(range(TEST_SIZE))
+
+print(f"Загружено задач: {len(dataset)}")
+print(f"Поля: {dataset[0].keys()}\n")
+cnt = 3  # number of leading dataset entries the main loop skips before submitting tasks
+results = []  # the main loop appends one metrics dict per task it finishes processing
+
+for idx, item in enumerate(dataset):
+    if cnt > 0:
+        cnt -=1
+        continue
+    # Поля из датасета
+    task_desc = item['task']  # Описание задачи
+    ref_length = item['reference_task_length']  # Эталонная длина в шагах
+    evaluation = item['evaluation']  # Критерии оценки
+
+    # ID задачи (используем index + timestamp для уникальности)
+    task_id_orig = f"mind2web_{idx}_{int(time.time())}"
+
+    print(f"\n[{idx + 1}/{len(dataset)}] Task: {task_desc[:70]}...")
+    print(f"  Эталонная длина: {ref_length} шагов")
+
+    start_time = time.time()
+
+    # 1. Создаем задачу через API
+    try:
+        resp = requests.post(
+            API_URL,
+            json={
+                "task": task_desc,
+                "timeout": 300,  # Увеличим таймаут для сложных задач
+                "metadata": {
+                    "source": "mind2web",
+                    "reference_length": ref_length
+                }
+            },
+            headers=HEADERS,
+            timeout=10
+        )
+
+        if resp.status_code != 202:
+            print(f"  ❌ Ошибка создания задачи: {resp.status_code}")
+            print(f"     Ответ: {resp.text}")
+            continue
+
+        api_task_id = resp.json()["task_id"]
+        created_at = time.time()
+        queue_time = created_at - start_time
+
+        print(f"  📝 Task ID: {api_task_id} | Очередь: {queue_time:.2f}с")
+
+        # 2. Ожидание завершения с прогрессом
+        status = "queued"
+        poll_count = 0
+        while status in ["queued", "running"]:
+            time.sleep(2)  # Интервал опроса
+            poll_count += 1
+
+            try:
+                status_resp = requests.get(f"{API_URL}/{api_task_id}", timeout=5)
+                if status_resp.status_code == 200:
+                    status_data = status_resp.json()
+                    status = status_data.get("status", "unknown")
+
+                    # Показываем прогресс каждые 5 опросов
+                    if poll_count % 5 == 0:
+                        elapsed = time.time() - start_time
+                        print(f"  ⏳ Статус: {status} | Прошло: {elapsed:.1f}с")
+            except Exception as e:
+                print(f"  ⚠️ Ошибка опроса: {e}")
+                pass
+
+        end_time = time.time()
+        execution_time = end_time - start_time
+
+        # 3. Получение результата
+        result_resp = requests.get(f"{API_URL}/{api_task_id}/result", timeout=10)
+
+        result_data = None
+        if result_resp.status_code == 200:
+            try:
+                result_data = result_resp.json()
+            except:
+                result_data = result_resp.text
+
+        # 4. Запись метрик
+        result = {
+            "index": idx,
+            "original_task_id": task_id_orig,
+            "api_task_id": api_task_id,
+            "task_description": task_desc,
+            "reference_length": ref_length,
+            "status": status,
+            "queue_time_sec": round(queue_time, 2),
+            "execution_time_sec": round(execution_time, 2),
+            "total_time_sec": round(end_time - start_time, 2),
+            "result": result_data,
+            "timestamp": datetime.now().isoformat()
+        }
+        results.append(result)
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        filename = f"mind2web_benchmark.json"
+        with open(filename, "w", encoding="utf-8") as f:
+            json.dump(results, f, indent=2, ensure_ascii=False)
+        # Эмодзи статуса
+        status_emoji = "✅" if status == "succeeded" else "❌"
+        print(f"  {status_emoji} Статус: {status} | Время: {execution_time:.1f}с")
+
+    except requests.exceptions.Timeout:
+        print(f"  ❌ Таймаут при создании задачи")
+    except Exception as e:
+        print(f"  ❌ Ошибка: {type(e).__name__}: {e}")
+        continue
+
+# Сохранение детальных результатов
+timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+filename = f"mind2web_benchmark_{timestamp}.json"
+
+with open(filename, "w", encoding="utf-8") as f:
+    json.dump(results, f, indent=2, ensure_ascii=False)
+
+print("\n" + "=" * 60)
+print("📊 ИТОГОВЫЕ МЕТРИКИ СКОРОСТИ")
+print("=" * 60)
+
+# Статистика по статусам
+completed = [r for r in results if r["status"] == "completed"]
+failed = [r for r in results if r["status"] == "failed"]
+unknown = [r for r in results if r["status"] not in ["completed", "failed"]]
+
+print(f"\n📈 СТАТУСЫ:")
+print(f"  Всего задач: {len(results)}")
+print(f"  ✅ Успешно: {len(completed)} ({len(completed) / max(len(results), 1) * 100:.1f}%)")
+print(f"  ❌ Провалено: {len(failed)} ({len(failed) / max(len(results), 1) * 100:.1f}%)")
+if unknown:
+    print(f"  ❓ Неизвестный статус: {len(unknown)}")
+
+if completed:
+    total_times = [r["total_time_sec"] for r in completed]
+    queue_times = [r["queue_time_sec"] for r in completed]
+    exec_times = [r["execution_time_sec"] for r in completed]
+
+    print(f"\n⏱️ ВРЕМЯ ВЫПОЛНЕНИЯ:")
+    print(f"  Среднее: {sum(total_times) / len(total_times):.2f} сек")
+    print(f"  Медиана (p50): {sorted(total_times)[len(total_times) // 2]:.2f} сек")
+    if len(total_times) >= 20:
+        print(f"  p95: {sorted(total_times)[int(len(total_times) * 0.95)]:.2f} сек")
+    print(f"  Мин: {min(total_times):.2f} сек")
+    print(f"  Макс: {max(total_times):.2f} сек")
+
+    print(f"\n📊 ПРОИЗВОДИТЕЛЬНОСТЬ:")
+    print(f"  Среднее время в очереди: {sum(queue_times) / len(queue_times):.2f} сек")
+    tasks_per_hour = 3600 / (sum(total_times) / len(total_times))
+    print(f"  Скорость выполнения: {tasks_per_hour:.1f} задач/час")
+
+    # Эффективность относительно эталонной длины
+    if all("reference_length" in r for r in completed):
+        avg_ref_length = sum(r["reference_length"] for r in completed) / len(completed)
+        time_per_step = (sum(total_times) / len(total_times)) / avg_ref_length
+        print(f"  Среднее время на шаг: {time_per_step:.2f} сек")
+
+print(f"\n💾 Результаты сохранены в: {filename}")
+
+# Создание краткого отчета для сравнения
+summary = {
+    "benchmark": "Online-Mind2Web",
+    "timestamp": timestamp,
+    "api_endpoint": API_URL,
+    "total_tasks": len(results),
+    "completed": len(completed),
+    "failed": len(failed),
+    "success_rate": len(completed) / max(len(results), 1) * 100,
+    "avg_time_sec": sum(total_times) / len(total_times) if completed else None,
+    "median_time_sec": sorted(total_times)[len(total_times) // 2] if completed else None,
+    "tasks_per_hour": 3600 / (sum(total_times) / len(total_times)) if completed else None
+}
+
+summary_file = f"mind2web_summary_{timestamp}.json"
+with open(summary_file, "w", encoding="utf-8") as f:
+    json.dump(summary, f, indent=2, ensure_ascii=False)
+
+print(f"📋 Краткий отчет сохранен в: {summary_file}")
--- a/requirements_test.txt
+++ b/requirements_test.txt
@ -0,0 +1,117 @@
+aiofiles==25.1.0
+aiohappyeyeballs==2.6.1
+aiosignal==1.4.0
+annotated-doc==0.0.4
+annotated-types==0.7.0
+anthropic==0.76.0
+anyio==4.12.1
+attrs==26.1.0
+backoff==2.2.1
+beautifulsoup4==4.14.3
+browser-use==0.12.6
+browser-use-sdk==2.0.15
+bubus==1.5.6
+cdp-use==1.4.5
+certifi==2026.2.25
+cffi==2.0.0
+charset-normalizer==3.4.7
+click==8.3.1
+cloudpickle==3.1.2
+cryptography==46.0.7
+Cython==3.2.4
+datasets==4.8.4
+dill==0.4.1
+distro==1.9.0
+docstring_parser==0.18.0
+filelock==3.29.0
+frozenlist==1.8.0
+fsspec==2026.2.0
+google-api-core==2.29.0
+google-api-python-client==2.188.0
+google-auth==2.48.0
+google-auth-httplib2==0.3.1
+google-auth-oauthlib==1.2.4
+google-genai==1.65.0
+googleapis-common-protos==1.74.0
+groq==1.0.0
+h11==0.16.0
+hf-xet==1.4.3
+httpcore==1.0.9
+httplib2==0.31.2
+httptools==0.7.1
+httpx==0.28.1
+httpx-sse==0.4.3
+huggingface_hub==1.11.0
+idna==3.12
+inquirerpy==0.3.4
+jiter==0.14.0
+jsonschema==4.26.0
+jsonschema-specifications==2025.9.1
+lxml==6.1.0
+markdown-it-py==4.0.0
+markdownify==1.2.2
+mcp==1.26.0
+mdurl==0.1.2
+multidict==6.7.1
+multiprocess==0.70.19
+numpy==2.4.4
+oauthlib==3.3.1
+ollama==0.6.1
+openai==2.16.0
+packaging==26.1
+pandas==3.0.2
+pfzy==0.3.4
+pillow==12.1.1
+portalocker==3.2.0
+posthog==7.7.0
+prompt_toolkit==3.0.52
+propcache==0.4.1
+proto-plus==1.27.2
+protobuf==6.33.6
+psutil==7.2.2
+pyarrow==24.0.0
+pyasn1==0.6.3
+pyasn1_modules==0.4.2
+pycparser==3.0
+pydantic-settings==2.14.0
+pydantic_core==2.41.5
+Pygments==2.20.0
+PyJWT==2.12.1
+pyobjc-core==12.1
+pyobjc-framework-Cocoa==12.1
+pyotp==2.9.0
+pyparsing==3.3.2
+pypdf==6.9.1
+python-dateutil==2.9.0.post0
+python-docx==1.2.0
+python-dotenv==1.2.1
+python-multipart==0.0.26
+PyYAML==6.0.3
+referencing==0.37.0
+reportlab==4.4.9
+requests==2.32.5
+requests-oauthlib==2.0.0
+rich==14.3.1
+rpds-py==0.30.0
+rsa==4.9.1
+screeninfo==0.8.1
+shellingham==1.5.4
+six==1.17.0
+sniffio==1.3.1
+soupsieve==2.8.3
+sse-starlette==3.3.4
+starlette==1.0.0
+tenacity==9.1.4
+tqdm==4.67.3
+typer==0.24.1
+typing-inspection==0.4.2
+typing_extensions==4.15.0
+uritemplate==4.2.0
+urllib3==2.6.3
+uuid7==0.1.0
+uvloop==0.22.1
+watchfiles==1.1.1
+wcwidth==0.6.0
+websockets==16.0
+xxhash==3.6.0
+yarl==1.23.0