Compare commits

..

5 commits

Author SHA1 Message Date
Кобылкевич Фёдор
72e9cc6ff4 update skill 2026-03-27 13:07:34 +03:00
Кобылкевич Фёдор
74cb5455ca update skill 2026-03-26 23:28:59 +03:00
Кобылкевич Фёдор
aa7927a316 update skill 2026-03-26 23:28:35 +03:00
Кобылкевич Фёдор
7832c30cc0 edit docker files, add skill of browser-use in the workdir 2026-03-26 22:08:01 +03:00
Кобылкевич Фёдор
2f74daa8a6 add dockerfile and docker-compose file 2026-03-26 10:50:19 +03:00
59 changed files with 1309 additions and 3012 deletions

View file

@ -1,17 +0,0 @@
venv/
.venv/
node_modules/
__pycache__/
*.pyc
*.pyo
*.pyd
.git/
.github/
.env
config.yaml
sessions/
logs/
state.db

View file

@ -1,29 +1,5 @@
OPENAI_BASE_URL=
OPENAI_API_KEY=
MODEL_DEFAULT=
TERMINAL_DOCKER_IMAGE=python:3.12-slim
TERMINAL_ENV=docker
HERMES_MAX_ITERATIONS=90
HERMES_HOME=/app/hermes_data
# NOTE: relative path — likely intended /app/workspace (other HERMES_* paths are absolute under /app); confirm
HERMES_WORKSPACE_PATH=app/workspace
# (removed duplicate empty HERMES_MAX_ITERATIONS — a later empty value would override the 90 set above)
TELEGRAM_BOT_TOKEN=
TELEGRAM_ALLOWED_USERS=
TELEGRAM_HOME_CHANNEL=
BROWSER_URL=http://browser:9222
BROWSER_VIEW_URL=http://localhost:6080
BROWSER_VIEW_BASE_URL=http://localhost:6081
BROWSER_API_HOST=0.0.0.0
BROWSER_API_PORT=8088
BROWSER_USE_RPC_URL=http://browser:8787/run
BROWSER_USE_RPC_TIMEOUT=900
BROWSER_API_MAX_CONCURRENCY=2
BROWSER_USE_ISOLATION_MODE=docker-per-principal
BROWSER_RUNTIME_IMAGE=browser-use-browser-runtime:latest
BROWSER_RUNTIME_NETWORK=browser-net
BROWSER_RUNTIME_TTL_SECONDS=900
BROWSER_RUNTIME_START_TIMEOUT=45
BROWSER_RUNTIME_ENABLE_UI=true
# (removed duplicate empty TERMINAL_ENV — a later empty value would override TERMINAL_ENV=docker set above)

View file

@ -1,23 +0,0 @@
# Deploys the Docker Compose stack to the VPS via a self-hosted runner.
name: Deploy to BrowserUse VPS

on:
  push:
    branches:
      - main
      - develop
  # Allow manual deploys from the Actions UI.
  workflow_dispatch:

jobs:
  deploy:
    # Self-hosted runner labelled "deploy-vps" that lives on the target machine.
    runs-on: deploy-vps
    env:
      DEPLOY_DIR: /home/BrowserUse-vps/apps/BrowserUse_and_ComputerUse_skills
      # NOTE(review): DEPLOY_BRANCH and HEALTH_URL are not referenced in the step
      # below — presumably consumed inside scripts/deploy_vps.sh; confirm.
      DEPLOY_BRANCH: feature/api-for-subagent
      HEALTH_URL: http://127.0.0.1:8088/health
    steps:
      - name: Deploy Docker Compose stack
        shell: bash
        run: |
          set -Eeuo pipefail
          cd "$DEPLOY_DIR"
          bash scripts/deploy_vps.sh

121
.gitignore vendored
View file

@ -1,64 +1,67 @@
/venv/
/__pycache__/
*.pyc*
__pycache__/
.venv/
.vscode/
# ---> macOS
# General
.DS_Store
.AppleDouble
.LSOverride
# Icon must end with two \r
Icon
# Thumbnails
._*
# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent
# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk
# ---> Windows
# Windows thumbnail cache files
Thumbs.db
Thumbs.db:encryptable
ehthumbs.db
ehthumbs_vista.db
# Dump file
*.stackdump
# Folder config file
[Dd]esktop.ini
# Recycle Bin used on file shares
$RECYCLE.BIN/
# Windows Installer files
*.cab
*.msi
*.msix
*.msm
*.msp
# Windows shortcuts
*.lnk
.env
.env.local
.env.development.local
.env.test.local
.env.production.local
.env.development
.env.test
docker-compose.override.yml
hermes_code/test_browser.py
.git
.github
.idea
hermes_data
workspace
hermes_data/*
workspace/*
export*
__pycache__/model_tools.cpython-310.pyc
__pycache__/web_tools.cpython-310.pyc
logs/
data/
.pytest_cache/
tmp/
temp_vision_images/
hermes-*/*
examples/
tests/quick_test_dataset.jsonl
tests/sample_dataset.jsonl
run_datagen_kimik2-thinking.sh
run_datagen_megascience_glm4-6.sh
run_datagen_sonnet.sh
source-data/*
data/*
node_modules/
browser-use/
agent-browser/
# Private keys
*.ppk
*.pem
privvy*
images/
__pycache__/
hermes_agent.egg-info/
wandb/
testlogs
SOLUTION_SUMMARY.md
BROWSER_USE_QUICKSTART.md
BROWSER_USE_SETUP.md
START_HERE.md
GUI_BROWSER_SETUP.md
# CLI config (may contain sensitive SSH paths)
cli-config.yaml
*/config.yaml
# Skills Hub state (lives in ~/.hermes/skills/.hub/ at runtime, but just in case)
skills/.hub/
ignored/
.worktrees/
environments/benchmarks/evals/
# Release script temp files
.release_notes.md
*.idea

29
Dockerfile Normal file
View file

@ -0,0 +1,29 @@
# Hermes agent runtime with an isolated browser-use interpreter.
FROM python:3.11

# Build/runtime OS dependencies for hermes-agent and its native extensions.
# --no-install-recommends keeps the layer small; the apt lists are removed in
# the same layer so they never persist in the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        curl \
        git \
        libffi-dev \
        python3-dev \
    && rm -rf /var/lib/apt/lists/*

# NOTE: unpinned clone — builds are not reproducible; pin a tag or commit SHA
# when the upstream project stabilizes.
RUN git clone https://github.com/NousResearch/hermes-agent.git /opt/hermes-agent
WORKDIR /opt/hermes-agent

# Install hermes-agent (editable) and the Telegram bot dependency in one layer.
# This places the `hermes` entrypoint on PATH (/usr/local/bin/hermes), so no
# extra symlink is needed — the previous `ln -s /opt/hermes-agent/venv/bin/hermes`
# pointed at a venv that was never created and silently did nothing.
RUN pip install --no-cache-dir -e . \
    && pip install --no-cache-dir python-telegram-bot

# Isolated runtime for browser-use to avoid dependency conflicts with hermes-agent.
RUN python -m venv /opt/browser-use-venv \
    && /opt/browser-use-venv/bin/pip install --no-cache-dir --upgrade pip \
    && /opt/browser-use-venv/bin/pip install --no-cache-dir browser-use

# Stable alias for the isolated interpreter used by the browser-use skill.
RUN ln -s /opt/browser-use-venv/bin/python /usr/local/bin/python-browser-use 2>/dev/null || true

# Pre-create hermes state directories so volume mounts land on existing paths.
RUN mkdir -p /root/.hermes/skills /root/.hermes/memories /root/.hermes/sessions

WORKDIR /workspace
CMD ["hermes", "gateway"]

106
GUI_BROWSER_FIX.md Normal file
View file

@ -0,0 +1,106 @@
# 🎯 Решение: browser-use с GUI браузером
## ✅ Проблема решена!
Теперь все действия через hermes-agent **ТРАНСЛИРУЮТСЯ** на GUI браузер в реальном времени.
## 🚀 Как использовать
### 1⃣ Запустите стек
```bash
docker compose --profile gui up -d
```
### 2⃣ Откройте VNC в браузере
```bash
open http://localhost:6080/vnc.html
```
### 3⃣ Дайте задачу агенту
Напишите что-нибудь типа:
```
"Откройте example.com и найдите заголовок страницы"
```
**Или** запустите напрямую:
```bash
docker compose exec -T hermes-agent python \
/root/.hermes/skills/autonomous-ai-agents/browser-use/scripts/run_browser_use.py \
--task "Open google.com and search for 'hello world'" \
--max-steps 5
```
### 4⃣ Смотрите действия в VNC окне 🎬
Вы видите как агент:
- 🔍 Навигирует по сайтам
- 🖱️ Кликает по кнопкам
- ⌨️ Вводит текст
- 📜 Скроллит страницу
## 🔧 Что было исправлено
### Проблема: "Host header is specified and is not an IP address"
Chromium CDP API **проверяет Host заголовок в HTTP запросах** и отвергает имена хостов.
**Решение:** Используем IP адрес контейнера вместо имени:
- ❌ ~~`BROWSER_USE_CDP_URL=http://chromium-gui:9223`~~
- ✅ `BROWSER_USE_CDP_URL=http://172.25.0.3:9223`
### Файлы, которые были обновлены:
1. **`docker-compose.yml`**
- Изменена `BROWSER_USE_CDP_URL` на `http://172.25.0.3:9223`
- Добавлена зависимость от `chromium-gui` в hermes-agent
2. **`docker/chromium-gui/start.sh`**
- Добавлена socat для проксирования TCP через IPv6
- Chromium слушает на `::1:9223` (IPv6 localhost)
- socat пробрасывает `9223` на все интерфейсы
## 📊 Архитектура
```
hermes-agent (контейнер)
├─ BROWSER_USE_CDP_URL=http://172.25.0.3:9223
└─→ chromium-gui (контейнер)
├─ Chromium слушает на ::1:9223 (IPv6)
├─ socat (TCP-LISTEN:9223 → TCP6:[::1]:9223)
├─ x11vnc (захватывает Xvfb)
└─ websockify (VNC → WebSocket)
└─→ http://localhost:6080/vnc.html (ваш браузер)
```
## 🎮 Протестировано
```bash
# ✅ CDP доступен
docker compose exec -T hermes-agent bash -c 'curl -s http://172.25.0.3:9223/json/version'
# ✅ VNC доступен
open http://localhost:6080/vnc.html
# ✅ socat проксирует
docker compose exec chromium-gui netstat -tlnp | grep 9223
```
## 📚 Дополнительно
- Полная документация: [`GUI_BROWSER_SETUP.md`](./GUI_BROWSER_SETUP.md)
- Диагностика проблем: смотрите раздел "Диагностика" в [`GUI_BROWSER_SETUP.md`](./GUI_BROWSER_SETUP.md)
---
**Теперь browser-use полностью интегрирован с GUI браузером! 🎉**

View file

@ -1,19 +1,2 @@
# BrowserUse_and_ComputerUse_skills
Чтобы запустить tool browser-use вместе с hermes agent тебе нужно выполнить следующие действия
```commandline
git clone https://git.lambda.coredump.ru/APEX/BrowserUse_and_ComputerUse_skills.git
git switch feature/telegram-browser-integration
touch .env
```
В создавшемся .env файле заполните переменные в соответствии с шаблоном, расположенном в .env.example
```commandline
docker compose up -d --build
```
```commandline
docker compose down
docker compose up -d
```
## Удачного пользования

50
SKILL.md Normal file
View file

@ -0,0 +1,50 @@
---
name: browser-use
version: "1.0.0"
description: |
Автоматизация браузера с помощью Playwright и библиотеки browser_use.
Выполняет навигацию, клики, заполнение форм, скриншоты, извлечение данных.
Подходит для тестирования веб-приложений, парсинга, автоматизации рутинных задач.
triggers:
- "открой сайт"
- "нажми на кнопку"
- "заполни форму"
- "сделай скриншот"
- "спарси данные"
- "автоматизируй браузер"
- "browser use"
- "playwright"
license: MIT
compatibility:
- hermes
- claude
allowed-tools:
- bash
- python
- read_file
- write_file
---
# BrowserUse Skill
Автоматизация браузера с использованием Playwright и browser_use.
## 🎯 Описание
Этот скилл позволяет Hermes-агенту управлять браузером:
- Открывать URL и навигировать
- Кликать по элементам
- Заполнять формы
- Извлекать данные (текст, атрибуты, HTML)
- Делать скриншоты
- Ждать загрузки элементов
- Выполнять кастомный JavaScript
- Работать с выпадающими списками
## 📦 Установка зависимостей
Перед первым использованием выполни:
```bash
cd ~/.hermes/skills/browser-use/scripts
chmod +x setup.sh
./setup.sh

View file

@ -1,18 +0,0 @@
# Runtime image for the Browser REST API (uvicorn + FastAPI).
FROM python:3.12-slim

# No .pyc files in the image; unbuffered stdout/stderr so logs stream promptly.
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1

WORKDIR /app

# docker.io CLI — presumably needed to manage per-principal browser containers
# (BROWSER_USE_ISOLATION_MODE=docker-per-principal); confirm against deploy setup.
RUN apt-get update \
    && apt-get install -y --no-install-recommends docker.io \
    && rm -rf /var/lib/apt/lists/*

# Copy the manifest first so the dependency layer is cached until it changes.
COPY requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir uv \
    && uv pip install --system --no-cache-dir -r /app/requirements.txt

# App code lives under /app/api so the module path `api.main` resolves from /app.
COPY . /app/api
CMD ["uvicorn", "api.main:app", "--host", "0.0.0.0", "--port", "8088"]

View file

@ -1,172 +0,0 @@
# Browser REST API
REST API-обертка над `browser-use` RPC (`POST /run` в контейнере браузера).
Сервис принимает задачу, ставит ее в in-memory очередь, выполняет через `browser-use` и отдает статус/результат по `task_id`.
## Актуальный статус
Проверено smoke-тестом:
- `GET /health` отвечает `200` с `{"ok": true}`
- `POST /api/browser/tasks` возвращает `202` и `task_id`
- `GET /api/browser/tasks/{task_id}` возвращает `queued/running/...`
- `GET /api/browser/tasks/{task_id}/result` возвращает `202`, пока задача не завершена
- `GET /api/browser/tasks/{task_id}/history` возвращает историю шагов агента
## Архитектура
Слои сейчас разделены и выглядят нормально для MVP:
- `api/main.py` — точка входа ASGI (`uvicorn api.main:app`), сборка `FastAPI` и lifespan
- `api/routes/tasks.py` — HTTP-слой (валидация входа/выхода, status codes)
- `api/services/task_service.py` — orchestration (фоновые задачи, timeout, обработка ошибок)
- `api/repositories/task_store.py` — in-memory хранилище задач
- `api/clients/browser_rpc_client.py` — aiohttp-клиент к browser RPC
- `api/clients/browser_rpc_contracts.py` — protocol + исключения RPC-слоя
- `api/contracts/task_schemas.py` — Pydantic request/response DTO
- `api/domain/task_status.py` — доменный enum статусов
- `api/core/settings.py` — конфигурация из env
## Ограничения текущей реализации
- хранилище in-memory: после рестарта контейнера задачи теряются
- нет ретраев RPC при транспортных ошибках
- один инстанс процесса хранит задачи только локально (без shared state)
## Переменные окружения
- `BROWSER_API_HOST` (default: `0.0.0.0`)
- `BROWSER_API_PORT` (default: `8080`)
- `BROWSER_USE_RPC_URL` (default: `http://browser:8787/run`)
- `BROWSER_USE_RPC_TIMEOUT` (default: `900`)
- `BROWSER_API_MAX_CONCURRENCY` (default: `2`)
## Локальный запуск
```zsh
cd /path/to/BrowserUse_and_ComputerUse_skills
source .venv/bin/activate
uvicorn api.main:app --host 0.0.0.0 --port 8088
```
## Запуск через Docker Compose
```zsh
cd /path/to/BrowserUse_and_ComputerUse_skills
docker compose build browser-api
docker compose up -d browser browser-api
docker compose logs -f browser-api
```
## REST API
### `GET /health`
Проверка доступности API.
Пример ответа:
```json
{"ok": true}
```
### `POST /api/browser/tasks`
Создать задачу.
Request:
```json
{
"task": "Открой example.com и верни title",
"timeout": 300,
"metadata": {"source": "manual"}
}
```
Response `202`:
```json
{
"task_id": "53f54fa4c1f24219b3949d56b0457875",
"status": "queued"
}
```
### `GET /api/browser/tasks/{task_id}`
Текущий статус и таймстемпы.
### `GET /api/browser/tasks/{task_id}/result`
- `202` если задача еще `queued/running`
- `200` с финальным payload после завершения
### `GET /api/browser/tasks/{task_id}/history`
- `202` если задача еще `queued/running`
- `200` с финальной историей шагов после завершения
Пример ответа `200`:
```json
{
"task_id": "53f54fa4c1f24219b3949d56b0457875",
"status": "succeeded",
"history": [
{
"step": 1,
"kind": "thought",
"content": "Open target page",
"data": {"value": "Open target page"}
},
{
"step": 2,
"kind": "action",
"content": "Click login",
"data": {"value": "Click login"}
}
]
}
```
## Runs API (background runs)
Новый набор endpoint-ов для фоновых запусков:
- `POST /runs` — создать run в фоне
- `GET /runs/{run_id}` — получить run и его статус
- `POST /runs/{run_id}/cancel` — отменить pending/running run
- `DELETE /runs/{run_id}` — удалить завершенный run
- `GET /runs/{run_id}/wait` — дождаться завершения и вернуть финальный output
- `GET /runs/{run_id}/stream` — подключиться к live-потоку новых событий run (SSE)
- `GET /threads/{thread_id}/runs` — список run-ов в треде
Пример создания run:
```json
{
"thread_id": "thread-demo",
"input": "Открой example.com и верни title",
"timeout": 60,
"metadata": {"source": "manual"}
}
```
## Быстрый end-to-end пример
```zsh
curl -sS http://localhost:8088/health
RESP=$(curl -sS -X POST http://localhost:8088/api/browser/tasks \
-H "Content-Type: application/json" \
-d '{"task":"Открой example.com и верни title","timeout":30}')
echo "$RESP"
TASK_ID=$(python -c "import json,sys;print(json.loads(sys.argv[1])['task_id'])" "$RESP")
curl -sS "http://localhost:8088/api/browser/tasks/$TASK_ID"
curl -sS "http://localhost:8088/api/browser/tasks/$TASK_ID/result"
curl -sS "http://localhost:8088/api/browser/tasks/$TASK_ID/history"
```

View file

@ -1,39 +0,0 @@
import asyncio
from typing import Any

import aiohttp

from api.clients.browser_rpc_contracts import BrowserRpcError
class BrowserRpcClient:
    """Thin async client for the browser-use RPC endpoint (POST /run).

    All transport-level failures are normalized into BrowserRpcError so callers
    only have to handle one exception type.
    """

    def __init__(self, rpc_url: str, session: aiohttp.ClientSession) -> None:
        # Session is injected and owned by the caller (the app lifespan closes it).
        self._rpc_url = rpc_url
        self._session = session

    async def run(self, task: str, timeout_sec: float, rpc_url: str | None = None) -> dict[str, Any]:
        """Submit one task to the RPC endpoint and return its JSON payload.

        Args:
            task: free-text task for the browser-use agent.
            timeout_sec: total request timeout in seconds.
            rpc_url: optional per-call override of the default endpoint.

        Raises:
            BrowserRpcError: on HTTP >= 400, non-JSON/non-dict payloads,
                transport errors, or timeout.
        """
        payload = {"task": task}
        timeout = aiohttp.ClientTimeout(total=timeout_sec)
        target_url = rpc_url or self._rpc_url
        try:
            async with self._session.post(target_url, json=payload, timeout=timeout) as response:
                if response.status >= 400:
                    body = await response.text()
                    raise BrowserRpcError(f"RPC HTTP: {response.status}: {body}")
                try:
                    # content_type=None: accept JSON even with a wrong Content-Type header.
                    data = await response.json(content_type=None)
                except aiohttp.ContentTypeError as exc:
                    raise BrowserRpcError("RPC returned non-JSON response") from exc
        except asyncio.TimeoutError as exc:
            # BUG FIX: aiohttp's total timeout surfaces as asyncio.TimeoutError,
            # which is not an aiohttp.ClientError in all aiohttp versions — without
            # this clause a timeout escaped the BrowserRpcError contract.
            raise BrowserRpcError(f"RPC request timed out after {timeout_sec}s") from exc
        except aiohttp.ClientError as exc:
            raise BrowserRpcError(f"Transport error: {exc}") from exc
        if not isinstance(data, dict):
            raise BrowserRpcError("RPC returned invalid payload type")
        return data
async def run_browser_task(rpc_url: str, task: str, timeout_sec: float) -> dict[str, Any]:
    """One-shot convenience wrapper: open a session, run a single task, close.

    Prefer a shared BrowserRpcClient for repeated calls; this creates a fresh
    aiohttp session per invocation.
    """
    async with aiohttp.ClientSession() as session:
        client = BrowserRpcClient(rpc_url, session=session)
        return await client.run(task=task, timeout_sec=timeout_sec)

View file

@ -1,8 +0,0 @@
from typing import Any, Protocol
# Raised for any failure while talking to the browser-use RPC endpoint
# (transport errors, HTTP >= 400, non-JSON or malformed payloads).
class BrowserRpcError(RuntimeError): ...


# Structural protocol matched by BrowserRpcClient; lets services depend on the
# interface rather than the concrete aiohttp-backed client.
class BrowserRpcRunner(Protocol):
    async def run(self, task: str, timeout_sec: float, rpc_url: str | None = None) -> dict[str, Any]: ...

View file

@ -1,115 +0,0 @@
from typing import Any
from pydantic import BaseModel, Field
from api.domain.task_status import TaskStatus
class BrowserTaskRequest(BaseModel):
    """Request to launch a task in the browser-use agent."""

    task: str = Field(..., description="Текстовая задача для browser-use агента")
    timeout: int = Field(300, description="Максимальное время выполнения задачи в секундах")
    metadata: dict[str, Any] | None = Field(default=None, description="Дополнительные метаданные клиента")


class BrowserTaskAcceptedResponse(BaseModel):
    """Acknowledgement that a task was accepted for processing (HTTP 202)."""

    task_id: str
    status: TaskStatus


class BrowserTaskStatusResponse(BaseModel):
    """Current task status and execution timestamps."""

    task_id: str
    status: TaskStatus
    # NOTE(review): field is spelled "create_at" (no "d") while RunSummaryResponse
    # uses "created_at" — inconsistent, but it is part of the public schema, so
    # renaming would break existing clients.
    create_at: float = Field(..., description="Время создания задачи в Unix timestamp")
    started_at: float | None = Field(default=None, description="Время начала выполнения в Unix timestamp")
    finished_at: float | None = Field(default=None, description="Время завершения выполнения в Unix timestamp")
    error: str | None = Field(default=None, description="Текст ошибки, если задача завершилась с ошибкой")


class BrowserTaskResultResponse(BaseModel):
    """Final result of a browser-use task."""

    task_id: str
    status: TaskStatus
    success: bool = Field(..., description="Успешно ли выполнена задача")
    execution_time: float = Field(..., description="Фактическое время выполнения в секундах")
    result: str | None = Field(default=None, description="Итоговый текстовый результат")
    error: str | None = Field(default=None, description="Текст ошибки, если выполнение не удалось")
    raw_response: dict[str, Any] | None = Field(default=None, description="Сырой ответ от browser-use RPC")


class TaskHistoryEvent(BaseModel):
    """A single action/step in the browser-use agent's execution history."""

    step: int = Field(..., description="Порядковый номер события в истории")
    kind: str = Field(..., description="Тип события (thought/action/error/system)")
    content: str | None = Field(default=None, description="Краткое текстовое описание события")
    data: dict[str, Any] = Field(default_factory=dict, description="Дополнительные структурированные данные")


class BrowserTaskHistoryResponse(BaseModel):
    """Agent action history for a specific task."""

    task_id: str
    status: TaskStatus
    history: list[TaskHistoryEvent] = Field(default_factory=list)


class RunCreateRequest(BaseModel):
    """Request to create a background run."""

    thread_id: str = Field(..., description="Идентификатор треда/контекста")
    input: str = Field(..., description="Пользовательский prompt для browser-use")
    timeout: int = Field(300, description="Максимальное время выполнения run в секундах")
    metadata: dict[str, Any] | None = Field(default=None, description="Дополнительные метаданные")


class RunSummaryResponse(BaseModel):
    """Short view of a run (no input/output/history)."""

    run_id: str
    thread_id: str
    status: TaskStatus
    created_at: float
    started_at: float | None = None
    finished_at: float | None = None
    error: str | None = None


class RunResponse(RunSummaryResponse):
    """Full view of a run, including input, output, raw RPC response and history."""

    input: str
    metadata: dict[str, Any] | None = None
    output: str | None = None
    raw_response: dict[str, Any] | None = None
    history: list[TaskHistoryEvent] = Field(default_factory=list)


class RunListResponse(BaseModel):
    """List of runs belonging to one thread."""

    thread_id: str
    runs: list[RunSummaryResponse] = Field(default_factory=list)


class RunWaitResponse(BaseModel):
    """Response of waiting for run completion."""

    run: RunResponse


class RunStreamEvent(BaseModel):
    """One event in a run's live execution stream (SSE payload)."""

    run_id: str
    event: str
    ts: float
    data: dict[str, Any] = Field(default_factory=dict)

View file

View file

@ -1,16 +0,0 @@
import os
from dataclasses import dataclass
def _env(name: str, default: str) -> str:
    """Read an environment variable, falling back to the given default."""
    return os.getenv(name, default)


@dataclass(frozen=True)
class Settings:
    """Immutable application configuration sourced from environment variables.

    Note: defaults are captured once, when the class is defined (import time).
    """

    app_host: str = _env("BROWSER_API_HOST", "0.0.0.0")
    app_port: int = int(_env("BROWSER_API_PORT", "8080"))
    browser_rpc_url: str = _env("BROWSER_USE_RPC_URL", "http://browser:8787/run")
    browser_rpc_timeout: float = float(_env("BROWSER_USE_RPC_TIMEOUT", "900"))
    max_concurrency: int = int(_env("BROWSER_API_MAX_CONCURRENCY", "2"))


# Shared singleton used across the API.
settings = Settings()

View file

View file

@ -1,10 +0,0 @@
from enum import Enum
class TaskStatus(str, Enum):
    """Lifecycle state of a browser-agent task.

    Inherits from str so values serialize directly in JSON/Pydantic responses.
    """

    queued = "queued"        # accepted, waiting for a worker slot
    running = "running"      # currently executing
    succeeded = "succeeded"  # terminal: finished successfully
    failed = "failed"        # terminal: finished with an error
    cancelled = "cancelled"  # terminal: cancelled before or during execution

View file

@ -1,48 +0,0 @@
from contextlib import asynccontextmanager
import aiohttp
from fastapi import FastAPI
from api.clients.browser_rpc_client import BrowserRpcClient
from api.core.settings import settings
from api.repositories.task_store import TaskStore
from api.routes.runs import router as runs_router
from api.routes.tasks import router as tasks_router
from api.services.task_service import TaskService
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan: build shared resources on startup, tear down on shutdown."""
    session = aiohttp.ClientSession()
    task_service = TaskService(
        store=TaskStore(),
        rpc_client=BrowserRpcClient(settings.browser_rpc_url, session=session),
        max_concurrency=settings.max_concurrency,
        rpc_timeout_cap=settings.browser_rpc_timeout,
    )
    # Expose the service to route handlers via request.app.state.task_service.
    app.state.task_service = task_service
    try:
        yield
    finally:
        # Shut the service down before closing the HTTP session it uses.
        await task_service.close()
        await session.close()


def create_app() -> FastAPI:
    """Build and configure the FastAPI application."""
    app = FastAPI(
        title="Browser API",
        version="1.0.0",
        description="REST API for submitting tasks to browser-use and retrieving their status/results.",
        lifespan=lifespan,
    )
    app.include_router(tasks_router)
    app.include_router(runs_router)

    @app.get("/health")
    async def health() -> dict:
        # Cheap liveness probe (deploy tooling points HEALTH_URL here).
        return {"ok": True}

    return app


# ASGI entrypoint: `uvicorn api.main:app`.
app = create_app()

View file

@ -1,127 +0,0 @@
from __future__ import annotations
from typing import Any
from api.contracts.task_schemas import (
BrowserTaskAcceptedResponse,
BrowserTaskHistoryResponse,
BrowserTaskResultResponse,
BrowserTaskStatusResponse,
RunListResponse,
RunResponse,
RunSummaryResponse,
RunWaitResponse,
TaskHistoryEvent,
)
from api.domain.task_status import TaskStatus
from api.repositories.task_store import TaskRecord
class TaskRecordMapper:
    """Translates internal TaskRecord objects into API response DTOs."""

    # Statuses in which a task/run has not yet reached a terminal state.
    ACTIVE_STATUSES = (TaskStatus.queued, TaskStatus.running)

    @classmethod
    def is_active_status(cls, status: TaskStatus) -> bool:
        """Return True while the task is still queued or running."""
        return status in cls.ACTIVE_STATUSES

    @staticmethod
    def to_task_accepted(rec: TaskRecord) -> BrowserTaskAcceptedResponse:
        """Acknowledgement payload for a newly created task."""
        return BrowserTaskAcceptedResponse(task_id=rec.task_id, status=rec.status)

    @staticmethod
    def to_task_status(rec: TaskRecord) -> BrowserTaskStatusResponse:
        """Status-plus-timestamps view of a task."""
        return BrowserTaskStatusResponse(
            task_id=rec.task_id,
            status=rec.status,
            create_at=rec.create_at,
            started_at=rec.started_at,
            finished_at=rec.finished_at,
            error=rec.error,
        )

    @staticmethod
    def to_task_result(rec: TaskRecord) -> BrowserTaskResultResponse:
        """Final result view; success is derived from the terminal status."""
        return BrowserTaskResultResponse(
            task_id=rec.task_id,
            status=rec.status,
            success=(rec.status == TaskStatus.succeeded),
            execution_time=rec.execution_time,
            result=rec.result,
            error=rec.error,
            raw_response=rec.raw_response,
        )

    @staticmethod
    def to_pending_task_result(rec: TaskRecord) -> BrowserTaskResultResponse:
        """Placeholder result (served with HTTP 202) while the task is still active."""
        return BrowserTaskResultResponse(
            task_id=rec.task_id,
            status=rec.status,
            success=False,
            execution_time=rec.execution_time,
            result=None,
            error=None,
            raw_response=None,
        )

    @staticmethod
    def to_history_events(raw_history: list[dict[str, Any]]) -> list[TaskHistoryEvent]:
        """Normalize raw RPC history dicts into TaskHistoryEvent DTOs.

        Missing or invalid fields are defaulted: step falls back to the 1-based
        position, kind to "kind"/"type"/"system" in that order, data to {}.
        """
        events: list[TaskHistoryEvent] = []
        for index, item in enumerate(raw_history, start=1):
            raw_step = item.get("step")
            step = raw_step if isinstance(raw_step, int) else index
            kind = str(item.get("kind") or item.get("type") or "system")
            content = item.get("content")
            if content is not None:
                content = str(content)
            data = item.get("data") if isinstance(item.get("data"), dict) else {}
            events.append(TaskHistoryEvent(step=step, kind=kind, content=content, data=data))
        return events

    @classmethod
    def to_task_history(cls, rec: TaskRecord) -> BrowserTaskHistoryResponse:
        """History view of a task."""
        return BrowserTaskHistoryResponse(task_id=rec.task_id, status=rec.status,
                                          history=cls.to_history_events(rec.history))

    @classmethod
    def to_pending_task_history(cls, rec: TaskRecord) -> BrowserTaskHistoryResponse:
        # NOTE(review): currently byte-identical to to_task_history; kept as a
        # separate entry point so pending responses can diverge without an API change.
        return BrowserTaskHistoryResponse(task_id=rec.task_id, status=rec.status,
                                          history=cls.to_history_events(rec.history))

    @staticmethod
    def to_run_summary(rec: TaskRecord) -> RunSummaryResponse:
        """Short run view (no input/output/history)."""
        return RunSummaryResponse(
            run_id=rec.task_id,
            thread_id=rec.thread_id,
            status=rec.status,
            created_at=rec.create_at,
            started_at=rec.started_at,
            finished_at=rec.finished_at,
            error=rec.error,
        )

    @classmethod
    def to_run_response(cls, rec: TaskRecord) -> RunResponse:
        """Full run view including input, output, raw RPC response and history."""
        return RunResponse.model_validate(
            {
                "run_id": rec.task_id,
                "thread_id": rec.thread_id,
                "status": rec.status,
                "created_at": rec.create_at,
                "started_at": rec.started_at,
                "finished_at": rec.finished_at,
                "error": rec.error,
                "input": rec.task,
                "metadata": rec.metadata,
                "output": rec.result,
                "raw_response": rec.raw_response,
                "history": cls.to_history_events(rec.history),
            }
        )

    @classmethod
    def to_run_wait(cls, rec: TaskRecord) -> RunWaitResponse:
        """Wrapper DTO for the /runs/{id}/wait endpoint."""
        return RunWaitResponse(run=cls.to_run_response(rec))

    @classmethod
    def to_thread_run_list(cls, thread_id: str, runs: list[TaskRecord]) -> RunListResponse:
        """List view of all runs belonging to a thread."""
        return RunListResponse(thread_id=thread_id, runs=[cls.to_run_summary(item) for item in runs])

View file

@ -1,164 +0,0 @@
import time
import uuid
from asyncio import Event, Lock, Queue
from dataclasses import dataclass, field
from typing import Any
from api.domain.task_status import TaskStatus
@dataclass
class TaskRecord:
    """Mutable in-memory record of a single browser task / run."""

    task_id: str
    thread_id: str
    task: str
    timeout: int
    metadata: dict[str, Any] | None
    status: TaskStatus = TaskStatus.queued
    # NOTE(review): "create_at" (no "d") — spelling is baked into the API schema.
    create_at: float = field(default_factory=time.time)
    started_at: float | None = None
    finished_at: float | None = None
    result: str | None = None
    error: str | None = None
    raw_response: dict[str, Any] | None = None
    history: list[dict[str, Any]] = field(default_factory=list)
    # Cooperative cancellation flag — presumably polled by the task service
    # while a task is running; confirm against TaskService.
    cancel_requested: bool = False
    # Set once the task reaches a terminal state; waiters block on this.
    done_event: Event = field(default_factory=Event)

    @property
    def execution_time(self) -> float:
        """Seconds spent executing: 0 if never started, live value while running."""
        if self.started_at is None:
            return 0
        end = self.finished_at if self.finished_at is not None else time.time()
        return max(0, end - self.started_at)
class TaskStore:
    """Async-safe in-memory repository for TaskRecord objects.

    All state (tasks, per-thread index, stream subscribers) lives in process
    memory and is lost on restart. Every accessor serializes on one asyncio.Lock.
    """

    def __init__(self) -> None:
        self._lock = Lock()
        self._tasks: dict[str, TaskRecord] = {}
        # thread_id -> ordered list of task_ids created in that thread
        self._thread_index: dict[str, list[str]] = {}
        # task_id -> set of queues feeding live stream subscribers
        self._subscribers: dict[str, set[Queue[dict[str, Any]]]] = {}

    async def create(
        self,
        task: str,
        timeout: int,
        metadata: dict[str, Any] | None,
        thread_id: str = "default",
    ) -> TaskRecord:
        """Create and register a new queued TaskRecord with a random hex id."""
        task_id = uuid.uuid4().hex
        rec = TaskRecord(task_id=task_id, thread_id=thread_id, task=task, timeout=timeout, metadata=metadata)
        async with self._lock:
            self._tasks[task_id] = rec
            self._thread_index.setdefault(thread_id, []).append(task_id)
            self._subscribers.setdefault(task_id, set())
        return rec

    async def list_by_thread(self, thread_id: str) -> list[TaskRecord]:
        """Return a thread's records in creation order, skipping deleted ids."""
        async with self._lock:
            ids = list(self._thread_index.get(thread_id, []))
            return [self._tasks[item] for item in ids if item in self._tasks]

    async def get(self, task_id: str) -> TaskRecord | None:
        """Fetch a record by id, or None if unknown."""
        async with self._lock:
            return self._tasks.get(task_id)

    async def set_running(self, task_id: str) -> TaskRecord | None:
        """Mark a task running and stamp started_at; no-op if already cancelled."""
        async with self._lock:
            rec = self._tasks.get(task_id)
            if rec is None:
                return None
            if rec.status == TaskStatus.cancelled:
                return rec
            rec.status = TaskStatus.running
            rec.started_at = time.time()
            return rec

    async def set_done(
        self,
        task_id: str,
        success: bool,
        raw_response: dict[str, Any] | None,
        error: str | None,
        result: str | None = None,
        history: list[dict[str, Any]] | None = None,
    ) -> TaskRecord | None:
        """Finalize a task as succeeded/failed and wake any waiters.

        Explicit error/result arguments win; otherwise they are pulled from
        raw_response ("error"/"result" keys) when it is a dict.
        """
        async with self._lock:
            rec = self._tasks.get(task_id)
            if rec is None:
                return None
            rec.finished_at = time.time()
            rec.raw_response = raw_response
            rec.error = error if error is not None else (
                raw_response.get("error") if isinstance(raw_response, dict) else None)
            rec.result = result if result is not None else (
                raw_response.get("result") if isinstance(raw_response, dict) else None)
            rec.history = list(history or [])
            rec.status = TaskStatus.succeeded if success else TaskStatus.failed
            rec.done_event.set()
            return rec

    async def set_cancel_requested(self, task_id: str) -> TaskRecord | None:
        """Flag cancellation: queued tasks are cancelled immediately; running
        ones only get cancel_requested set (presumably honored by the service)."""
        async with self._lock:
            rec = self._tasks.get(task_id)
            if rec is None:
                return None
            rec.cancel_requested = True
            if rec.status == TaskStatus.queued:
                rec.status = TaskStatus.cancelled
                rec.finished_at = time.time()
                rec.error = "Cancelled by user"
                rec.done_event.set()
            return rec

    async def set_cancelled(self, task_id: str, error: str = "Cancelled by user") -> TaskRecord | None:
        """Force a non-terminal task into the cancelled state; terminal tasks are untouched."""
        async with self._lock:
            rec = self._tasks.get(task_id)
            if rec is None:
                return None
            if rec.status in (TaskStatus.succeeded, TaskStatus.failed, TaskStatus.cancelled):
                return rec
            rec.status = TaskStatus.cancelled
            rec.finished_at = time.time()
            rec.error = error
            rec.done_event.set()
            return rec

    async def delete_if_finished(self, task_id: str) -> tuple[bool, bool]:
        """Delete a terminal task.

        Returns (exists, deleted): (False, False) if unknown, (True, False) if
        still queued/running, (True, True) once removed from all indexes.
        """
        async with self._lock:
            rec = self._tasks.get(task_id)
            if rec is None:
                return False, False
            if rec.status in (TaskStatus.queued, TaskStatus.running):
                return True, False
            del self._tasks[task_id]
            thread_list = self._thread_index.get(rec.thread_id, [])
            if task_id in thread_list:
                thread_list.remove(task_id)
            self._subscribers.pop(task_id, None)
            return True, True

    async def subscribe(self, task_id: str) -> Queue[dict[str, Any]] | None:
        """Attach a fresh queue to a task's event stream; None if the task is unknown."""
        queue: Queue[dict[str, Any]] = Queue()
        async with self._lock:
            if task_id not in self._tasks:
                return None
            self._subscribers.setdefault(task_id, set()).add(queue)
        return queue

    async def unsubscribe(self, task_id: str, queue: Queue[dict[str, Any]]) -> None:
        """Detach a subscriber queue; safe to call if already removed."""
        async with self._lock:
            subscribers = self._subscribers.get(task_id)
            if subscribers is not None:
                subscribers.discard(queue)

    async def publish(self, task_id: str, event: dict[str, Any]) -> None:
        """Best-effort fan-out of an event to all subscribers of a task.

        A failing subscriber queue is skipped so it cannot break the others.
        """
        async with self._lock:
            subscribers = list(self._subscribers.get(task_id, set()))
        for queue in subscribers:
            try:
                queue.put_nowait(event)
            except Exception:
                continue

View file

@ -1,4 +0,0 @@
fastapi==0.136.1
uvicorn[standard]==0.46.0
aiohttp==3.13.5
pydantic==2.13.3

View file

View file

@ -1,8 +0,0 @@
from fastapi import Request
from api.services.protocols import TaskServiceProtocol
def get_task_service(request: Request) -> TaskServiceProtocol:
    """FastAPI dependency: return the TaskService stored on app.state at startup."""
    return request.app.state.task_service

View file

@ -1,130 +0,0 @@
import asyncio
import json
from typing import AsyncIterator
from fastapi import APIRouter, Depends, HTTPException, Query, Response
from fastapi.responses import JSONResponse, StreamingResponse
from api.contracts.task_schemas import (
RunCreateRequest,
RunListResponse,
RunResponse,
RunStreamEvent,
RunSummaryResponse,
RunWaitResponse,
)
from api.mappers.task_record_mapper import TaskRecordMapper
from api.routes.dependencies import get_task_service
from api.services.protocols import TaskServiceProtocol
router = APIRouter(tags=["runs"])


@router.get("/threads/{thread_id}/runs", response_model=RunListResponse)
async def list_thread_runs(
    thread_id: str,
    service: TaskServiceProtocol = Depends(get_task_service),
) -> RunListResponse:
    """List every run recorded for the given thread."""
    runs = await service.list_thread_runs(thread_id)
    return TaskRecordMapper.to_thread_run_list(thread_id, runs)


@router.post("/runs", response_model=RunSummaryResponse, status_code=202)
async def create_run(
    payload: RunCreateRequest,
    service: TaskServiceProtocol = Depends(get_task_service),
) -> RunSummaryResponse:
    """Create a background run; returns 202 with its id and queued status."""
    rec = await service.create_run(
        thread_id=payload.thread_id.strip(),
        user_input=payload.input.strip(),
        timeout=payload.timeout,
        metadata=payload.metadata,
    )
    return TaskRecordMapper.to_run_summary(rec)


@router.get("/runs/{run_id}", response_model=RunResponse)
async def get_run(
    run_id: str,
    service: TaskServiceProtocol = Depends(get_task_service),
) -> RunResponse:
    """Fetch the full state of a run; 404 if unknown."""
    rec = await service.get_run(run_id)
    if rec is None:
        raise HTTPException(status_code=404, detail="Run not found")
    return TaskRecordMapper.to_run_response(rec)


@router.post("/runs/{run_id}/cancel", response_model=RunSummaryResponse)
async def cancel_run(
    run_id: str,
    service: TaskServiceProtocol = Depends(get_task_service),
) -> RunSummaryResponse:
    """Request cancellation of a pending/running run."""
    rec = await service.cancel_run(run_id)
    if rec is None:
        raise HTTPException(status_code=404, detail="Run not found")
    return TaskRecordMapper.to_run_summary(rec)


@router.delete("/runs/{run_id}", status_code=204)
async def delete_run(
    run_id: str,
    service: TaskServiceProtocol = Depends(get_task_service),
) -> Response:
    """Delete a finished run; 404 if unknown, 409 while still active."""
    exists, deleted = await service.delete_run(run_id)
    if not exists:
        raise HTTPException(status_code=404, detail="Run not found")
    if not deleted:
        raise HTTPException(status_code=409, detail="Run is still active. Cancel it first.")
    return Response(status_code=204)


@router.get("/runs/{run_id}/wait", response_model=RunWaitResponse)
async def wait_run(
    run_id: str,
    timeout: float | None = Query(default=None, ge=0),
    service: TaskServiceProtocol = Depends(get_task_service),
) -> JSONResponse | RunWaitResponse:
    """Wait (up to `timeout` seconds) for a run to finish.

    Responds 202 with the current snapshot if the run is still active when the
    wait expires, otherwise 200 with the final state.
    """
    rec = await service.wait_run(run_id, timeout=timeout)
    if rec is None:
        raise HTTPException(status_code=404, detail="Run not found")
    if TaskRecordMapper.is_active_status(rec.status):
        pending = TaskRecordMapper.to_run_wait(rec)
        return JSONResponse(status_code=202, content=pending.model_dump(mode="json"))
    return TaskRecordMapper.to_run_wait(rec)
@router.get("/runs/{run_id}/stream")
async def stream_run(
    run_id: str,
    service: TaskServiceProtocol = Depends(get_task_service),
) -> StreamingResponse:
    """Stream run lifecycle events to the client as Server-Sent Events.

    Emits one `data:` frame per published event, a `: keep-alive` comment
    every 15s of inactivity, and closes once a terminal event is seen or the
    run disappears / leaves an active status.
    """
    queue = await service.subscribe_run_stream(run_id)
    if queue is None:
        raise HTTPException(status_code=404, detail="Run not found")
    stream_queue = queue

    async def event_stream() -> AsyncIterator[str]:
        try:
            while True:
                try:
                    item = await asyncio.wait_for(stream_queue.get(), timeout=15)
                except asyncio.TimeoutError:
                    # No event for 15s: stop if the run vanished or finished,
                    # otherwise send an SSE comment to keep the connection open.
                    rec = await service.get_run(run_id)
                    if rec is None:
                        break
                    if not TaskRecordMapper.is_active_status(rec.status):
                        break
                    yield ": keep-alive\n\n"
                    continue
                payload = RunStreamEvent.model_validate(item).model_dump(mode="json")
                # BUG FIX: the original yielded "\\n\\n" (literal backslash-n text),
                # which never terminates an SSE frame; use real newlines.
                yield f"data: {json.dumps(payload, ensure_ascii=False)}\n\n"
                if payload["event"] in ("completed", "failed", "cancelled"):
                    break
        finally:
            # Always release the subscription, even on client disconnect.
            await service.unsubscribe_run_stream(run_id, stream_queue)

    return StreamingResponse(event_stream(), media_type="text/event-stream")

View file

@ -1,65 +0,0 @@
from fastapi import APIRouter, Depends, HTTPException
from fastapi.responses import JSONResponse
from api.contracts.task_schemas import (
BrowserTaskAcceptedResponse,
BrowserTaskHistoryResponse,
BrowserTaskRequest,
BrowserTaskResultResponse,
BrowserTaskStatusResponse,
)
from api.mappers.task_record_mapper import TaskRecordMapper
from api.routes.dependencies import get_task_service
from api.services.protocols import TaskServiceProtocol
# Router for the fire-and-forget browser-task API (status/result/history polling).
router = APIRouter(prefix="/api/browser", tags=["browser-tasks"])
@router.post("/tasks", response_model=BrowserTaskAcceptedResponse, status_code=202)
async def create_task(
    payload: BrowserTaskRequest,
    service: TaskServiceProtocol = Depends(get_task_service),
) -> BrowserTaskAcceptedResponse:
    """Accept a browser task for background execution and answer 202."""
    record = await service.submit_task(
        task=payload.task.strip(),
        timeout=payload.timeout,
        metadata=payload.metadata,
    )
    return TaskRecordMapper.to_task_accepted(record)
@router.get("/tasks/{task_id}", response_model=BrowserTaskStatusResponse)
async def get_task_status(task_id: str, service: TaskServiceProtocol = Depends(get_task_service)) -> BrowserTaskStatusResponse:
    """Return the current status of a task, or 404 when unknown."""
    record = await service.get_task(task_id)
    if record is None:
        raise HTTPException(status_code=404, detail="Task not found")
    return TaskRecordMapper.to_task_status(record)
@router.get("/tasks/{task_id}/result", response_model=BrowserTaskResultResponse)
async def get_task_result(
    task_id: str,
    service: TaskServiceProtocol = Depends(get_task_service),
) -> JSONResponse | BrowserTaskResultResponse:
    """Return the task result: 404 unknown, 202 while still running, 200 when done."""
    record = await service.get_task(task_id)
    if record is None:
        raise HTTPException(status_code=404, detail="Task not found")
    if not TaskRecordMapper.is_active_status(record.status):
        return TaskRecordMapper.to_task_result(record)
    pending = TaskRecordMapper.to_pending_task_result(record)
    return JSONResponse(status_code=202, content=pending.model_dump(mode="json"))
@router.get("/tasks/{task_id}/history", response_model=BrowserTaskHistoryResponse)
async def get_task_history(
    task_id: str,
    service: TaskServiceProtocol = Depends(get_task_service),
) -> JSONResponse | BrowserTaskHistoryResponse:
    """Return the task's step history: 404 unknown, 202 while active, 200 when done."""
    record = await service.get_task(task_id)
    if record is None:
        raise HTTPException(status_code=404, detail="Task not found")
    if not TaskRecordMapper.is_active_status(record.status):
        return TaskRecordMapper.to_task_history(record)
    pending = TaskRecordMapper.to_pending_task_history(record)
    return JSONResponse(status_code=202, content=pending.model_dump(mode="json"))

View file

@ -1,464 +0,0 @@
"""Provision isolated browser-use Docker runtimes for API runs."""
from __future__ import annotations
import hashlib
import json
import logging
import os
import re
import subprocess
import tempfile
import threading
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Any
from urllib import request
logger = logging.getLogger(__name__)
# Defaults used when the corresponding environment variables are unset/blank.
_DEFAULT_SHARED_CDP_URL = "http://browser:9222"
_DEFAULT_SHARED_RPC_URL = "http://browser:8787/run"
_DEFAULT_RUNTIME_IMAGE = "browser-use-browser-runtime:latest"
_DEFAULT_RUNTIME_NETWORK = "browser-net"
_DEFAULT_TTL_SECONDS = 900
_DEFAULT_START_TIMEOUT = 45
_DEFAULT_ENABLE_UI = True
# Serializes load/modify/save cycles on the on-disk runtime registry.
_REGISTRY_LOCK = threading.Lock()
# Cache of the tunnel-discovered viewer base URL: "value" plus an
# "expires_at" epoch timestamp; guarded by its own lock.
_VIEW_URL_CACHE_LOCK = threading.Lock()
_VIEW_URL_CACHE: dict[str, Any] = {"value": "", "expires_at": 0.0}
@dataclass(frozen=True)
class BrowserRuntimeConfig:
    """Immutable snapshot of browser-runtime isolation settings read from env."""

    # Isolation strategy: "shared", "docker-per-principal" or "docker-per-task".
    mode: str
    # Image used when spawning dedicated runtime containers.
    runtime_image: str
    # Docker network joined by spawned runtime containers.
    runtime_network: str
    # Idle seconds after which a provisioned runtime is reclaimed.
    runtime_ttl_seconds: int
    # Seconds to wait for a fresh runtime's /health endpoint.
    runtime_start_timeout: int
    # CDP endpoint of the shared browser (used in "shared" mode).
    shared_cdp_url: str
    # Whether spawned runtimes expose the remote-view UI.
    enable_ui: bool
def _state_dir() -> Path:
return Path(os.getenv("BROWSER_RUNTIME_STATE_DIR", "/tmp/browser-use-api"))
def _registry_path() -> Path:
    """Location of the JSON registry tracking provisioned runtimes."""
    return _state_dir().joinpath("docker_runtimes.json")
def _as_int(value: Any, default: int) -> int:
try:
return max(1, int(value))
except (TypeError, ValueError):
return default
def _as_bool(value: Any, default: bool) -> bool:
if value is None:
return default
if isinstance(value, bool):
return value
return str(value).strip().lower() in {"1", "true", "yes", "on"}
def get_browser_runtime_config() -> BrowserRuntimeConfig:
    """Build the runtime-isolation configuration from environment variables.

    Unknown isolation modes are logged and downgraded to "shared"; blank
    string overrides fall back to the module defaults.
    """
    requested_mode = str(os.getenv("BROWSER_USE_ISOLATION_MODE", "shared")).strip().lower()
    if requested_mode not in {"shared", "docker-per-principal", "docker-per-task"}:
        logger.warning("Unknown browser-use isolation mode %r; falling back to shared", requested_mode)
        requested_mode = "shared"
    image = os.getenv("BROWSER_RUNTIME_IMAGE", _DEFAULT_RUNTIME_IMAGE).strip() or _DEFAULT_RUNTIME_IMAGE
    network = os.getenv("BROWSER_RUNTIME_NETWORK", _DEFAULT_RUNTIME_NETWORK).strip() or _DEFAULT_RUNTIME_NETWORK
    cdp_url = os.getenv("BROWSER_URL", _DEFAULT_SHARED_CDP_URL).strip() or _DEFAULT_SHARED_CDP_URL
    return BrowserRuntimeConfig(
        mode=requested_mode,
        runtime_image=image,
        runtime_network=network,
        runtime_ttl_seconds=_as_int(os.getenv("BROWSER_RUNTIME_TTL_SECONDS"), _DEFAULT_TTL_SECONDS),
        runtime_start_timeout=_as_int(os.getenv("BROWSER_RUNTIME_START_TIMEOUT"), _DEFAULT_START_TIMEOUT),
        shared_cdp_url=cdp_url,
        enable_ui=_as_bool(os.getenv("BROWSER_RUNTIME_ENABLE_UI"), _DEFAULT_ENABLE_UI),
    )
def resolve_isolation_owner(
    mode: str,
    task_id: str | None,
    metadata: dict[str, Any] | None = None,
    thread_id: str | None = None,
) -> str:
    """Pick the identity that owns a browser runtime.

    docker-per-task mode keys on the task id; other modes prefer
    metadata["user_id"], then metadata["session_id"], then the thread id,
    then the task id. Blank results collapse to "default".
    """
    def _clean(raw: str) -> str:
        stripped = raw.strip()
        return stripped if stripped else "default"

    if mode == "docker-per-task":
        return _clean(task_id or "default")
    meta = metadata if metadata is not None else {}
    for candidate_key in ("user_id", "session_id"):
        candidate = meta.get(candidate_key)
        if candidate is not None and candidate != "":
            return _clean(str(candidate))
    return _clean(thread_id or task_id or "default")
def hash_runtime_owner(owner: str) -> str:
    """Derive a short stable id for *owner*: first 16 hex chars of SHA-256."""
    digest = hashlib.sha256(owner.encode("utf-8"))
    return digest.hexdigest()[:16]
def _normalize_browser_view_base_url(raw_url: str) -> str:
url = (raw_url or "").strip()
if not url:
return ""
for marker in ("/vnc.html", "/index.html"):
idx = url.find(marker)
if idx != -1:
url = url[:idx]
break
return url.rstrip("/")
def _discover_browser_view_base_url_from_tunnel() -> str:
    """Best-effort discovery of the public viewer base URL from tunnel logs.

    Scrapes the last https:// URL out of the `browser-use-tunnel` container's
    recent log output. Results are cached under _VIEW_URL_CACHE: 60s on
    success, 10s on failure. Returns "" when nothing could be discovered.
    """
    now = time.time()
    # Fast path: serve a still-fresh cached value.
    with _VIEW_URL_CACHE_LOCK:
        cached_value = str(_VIEW_URL_CACHE.get("value", "") or "")
        expires_at = float(_VIEW_URL_CACHE.get("expires_at", 0.0) or 0.0)
        if cached_value and now < expires_at:
            return cached_value
    try:
        # check=False: missing tunnel container should not raise here.
        result = _run_docker(["logs", "--tail", "200", "browser-use-tunnel"], check=False)
        combined = "\n".join(part for part in [result.stdout or "", result.stderr or ""] if part)
        matches = re.findall(r"https://[^\s\"'<>]+", combined)
        # Take the most recent URL printed by the tunnel, normalized to a base.
        base_url = _normalize_browser_view_base_url(matches[-1]) if matches else ""
    except Exception as exc:
        logger.debug("Failed to discover browser view URL from tunnel logs: %s", exc)
        base_url = ""
    # Cache even failures (short TTL) to avoid hammering `docker logs`.
    with _VIEW_URL_CACHE_LOCK:
        _VIEW_URL_CACHE["value"] = base_url
        _VIEW_URL_CACHE["expires_at"] = now + (60 if base_url else 10)
    return base_url
def get_browser_view_url(
    task_id: str | None = None,
    metadata: dict[str, Any] | None = None,
    thread_id: str | None = None,
) -> str:
    """Build the public noVNC viewer URL for a task/principal.

    The base URL comes from BROWSER_VIEW_BASE_URL (or BROWSER_VIEW_URL),
    falling back to tunnel-log discovery. Shared mode yields the common
    viewer page; isolated modes yield a per-owner path derived from the
    owner hash. Returns "" when no base URL is available.
    """
    base_url = _normalize_browser_view_base_url(
        os.getenv("BROWSER_VIEW_BASE_URL", "") or os.getenv("BROWSER_VIEW_URL", "")
    )
    if not base_url:
        base_url = _discover_browser_view_base_url_from_tunnel()
    if not base_url:
        return ""
    config = get_browser_runtime_config()
    if config.mode == "shared":
        return f"{base_url}/vnc.html?path=websockify"
    owner = resolve_isolation_owner(config.mode, task_id, metadata, thread_id)
    owner_hash = hash_runtime_owner(owner)
    return f"{base_url}/view/{owner_hash}/vnc.html?path=view/{owner_hash}/websockify"
def _shared_rpc_url() -> str:
    """RPC endpoint of the shared browser runtime (env-overridable)."""
    configured = os.getenv("BROWSER_USE_RPC_URL", _DEFAULT_SHARED_RPC_URL).strip()
    return configured or _DEFAULT_SHARED_RPC_URL
def _runtime_rpc_url(container_name: str) -> str:
return f"http://{container_name}:8787/run"
def _container_name(owner_hash: str) -> str:
return f"browser-use-browser-{owner_hash}"
def _volume_name(owner_hash: str) -> str:
return f"browser-use-profile-{owner_hash}"
def _load_registry() -> dict[str, Any]:
    """Read the runtime registry JSON, falling back to an empty registry.

    Any read/parse failure is logged and treated as an empty registry, as is
    a file whose top-level shape is not {"runtimes": {...}}.
    """
    fallback: dict[str, Any] = {"runtimes": {}}
    path = _registry_path()
    if not path.exists():
        return fallback
    try:
        with open(path, "r", encoding="utf-8") as fh:
            loaded = json.load(fh) or {}
    except Exception as exc:
        logger.warning("Failed to read browser-use runtime registry %s: %s", path, exc)
        return fallback
    if isinstance(loaded, dict) and isinstance(loaded.get("runtimes"), dict):
        return loaded
    return fallback
def _save_registry(payload: dict[str, Any]) -> None:
    """Atomically persist the runtime registry to disk.

    Writes to a temp file in the target directory, fsyncs, then os.replace()s
    over the real path so readers never observe a partially written file.
    On failure the temp file is removed (best effort) and the error re-raised.
    """
    path = _registry_path()
    path.parent.mkdir(parents=True, exist_ok=True)
    fd, tmp_path = tempfile.mkstemp(dir=str(path.parent), prefix=".browser_use_", suffix=".tmp")
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as fh:
            json.dump(payload, fh, indent=2, sort_keys=True)
            fh.flush()
            os.fsync(fh.fileno())
        os.replace(tmp_path, path)
    except Exception:
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
        raise
def _run_docker(args: list[str], check: bool = True) -> subprocess.CompletedProcess[str]:
    """Run a docker CLI command, capturing stdout/stderr as text.

    Args:
        args: Arguments appended after the ``docker`` executable.
        check: When True, raise RuntimeError on a non-zero exit code.

    Returns:
        The CompletedProcess for inspection by the caller.

    Raises:
        RuntimeError: Non-zero exit code while ``check`` is True.

    NOTE(review): subprocess.run uses timeout=120, so TimeoutExpired can
    propagate uncaught to callers — confirm that is intended.
    """
    cmd = ["docker", *args]
    logger.debug("browser-use docker cmd: %s", " ".join(cmd))
    result = subprocess.run(
        cmd,
        capture_output=True,
        text=True,
        timeout=120,
    )
    if check and result.returncode != 0:
        # Prefer stderr, but some docker failures only write to stdout.
        stderr = (result.stderr or result.stdout or "").strip()
        raise RuntimeError(f"Docker command failed ({' '.join(cmd)}): {stderr}")
    return result
def _ensure_docker_access() -> None:
    """Fail fast when the Docker CLI/daemon is unreachable.

    Raises RuntimeError (via _run_docker with check=True) if `docker version` fails.
    """
    _run_docker(["version"], check=True)
def _container_exists(container_name: str) -> bool:
    """True when a container with this name exists, running or not."""
    inspect_result = _run_docker(["inspect", container_name], check=False)
    return inspect_result.returncode == 0
def _container_running(container_name: str) -> bool:
    """True only when the container exists and Docker reports it running."""
    probe = _run_docker(["inspect", "-f", "{{.State.Running}}", container_name], check=False)
    if probe.returncode != 0:
        return False
    return probe.stdout.strip().lower() == "true"
def _remove_container(container_name: str) -> None:
    """Force-remove the named container; a blank name is a no-op."""
    if not container_name:
        return
    _run_docker(["rm", "-f", container_name], check=False)
def _volume_exists(volume_name: str) -> bool:
    """True when a Docker volume with this name exists."""
    inspect_result = _run_docker(["volume", "inspect", volume_name], check=False)
    return inspect_result.returncode == 0
def _ensure_volume(volume_name: str, owner_hash: str) -> None:
    """Create the labelled profile volume unless it already exists."""
    if _volume_exists(volume_name):
        return
    create_args = [
        "volume",
        "create",
        "--label",
        "browser_use.runtime=true",
        "--label",
        f"browser_use.owner_hash={owner_hash}",
        volume_name,
    ]
    _run_docker(create_args, check=True)
def _remove_volume(volume_name: str) -> None:
    """Force-remove the named volume; a blank name is a no-op."""
    if not volume_name:
        return
    _run_docker(["volume", "rm", "-f", volume_name], check=False)
def _runtime_env_args(browser_view_url: str, config: BrowserRuntimeConfig) -> list[str]:
env: dict[str, str] = {
"BROWSER_ENABLE_UI": "true" if config.enable_ui else "false",
"BROWSER_DATA_DIR": "/data",
"BROWSER_USE_RPC_HOST": "0.0.0.0",
"BROWSER_USE_RPC_PORT": "8787",
}
if browser_view_url:
env["BROWSER_VIEW_URL"] = browser_view_url
for key in ("MODEL_DEFAULT", "OPENAI_API_KEY", "OPENAI_BASE_URL"):
value = os.getenv(key)
if value is not None:
env[key] = value
args: list[str] = []
for key, value in env.items():
args.extend(["-e", f"{key}={value}"])
return args
def _start_runtime_container(
    container_name: str,
    volume_name: str,
    owner_hash: str,
    browser_view_url: str,
    config: BrowserRuntimeConfig,
) -> None:
    """Start a detached, labelled runtime container backed by the profile volume.

    Ensures the profile volume exists first, then launches the configured
    image on the runtime network with a 2g shm (browsers need a large
    /dev/shm) and the environment from _runtime_env_args.
    """
    _ensure_volume(volume_name, owner_hash)
    run_args = [
        "run",
        "-d",
        "--name",
        container_name,
        "--network",
        config.runtime_network,
        "--shm-size",
        "2g",
        # Labels let cleanup/inspection identify managed runtimes.
        "--label",
        "browser_use.runtime=true",
        "--label",
        f"browser_use.owner_hash={owner_hash}",
        "--label",
        "browser_use.managed_by=browser_runtime_manager",
        *_runtime_env_args(browser_view_url, config),
        "-v",
        f"{volume_name}:/data",
        config.runtime_image,
    ]
    _run_docker(run_args, check=True)
def _wait_for_runtime(container_name: str, timeout_seconds: int) -> None:
    """Poll the runtime's /health endpoint until ready or the deadline passes.

    Probes http://<container>:8787/health roughly once per second; raises
    RuntimeError carrying the last observed error if the deadline expires.
    """
    deadline = time.time() + timeout_seconds
    health_url = f"http://{container_name}:8787/health"
    last_error = ""
    while time.time() < deadline:
        try:
            with request.urlopen(health_url, timeout=2) as response:
                if 200 <= response.status < 300:
                    return
                last_error = f"HTTP {response.status}"
        except Exception as exc:
            # Connection refused/reset is expected while the runtime boots.
            last_error = str(exc)
        time.sleep(1)
    raise RuntimeError(f"Browser runtime {container_name} did not become ready: {last_error}")
def _cleanup_expired_runtimes_locked(registry: dict[str, Any], config: BrowserRuntimeConfig) -> None:
    """Remove runtimes idle past the TTL; caller must hold _REGISTRY_LOCK.

    Mutates *registry* in place. Containers are always removed; the profile
    volume is removed only for docker-per-task entries (per-principal
    profiles persist across tasks). Entries without a last_used are kept.
    """
    now = time.time()
    runtimes = registry.setdefault("runtimes", {})
    expired_keys: list[str] = []
    for runtime_key, entry in list(runtimes.items()):
        last_used = float(entry.get("last_used", 0) or 0)
        if not last_used or now - last_used < config.runtime_ttl_seconds:
            continue
        container_name = str(entry.get("container_name", "") or "")
        volume_name = str(entry.get("volume_name", "") or "")
        mode = str(entry.get("mode", "") or "")
        logger.info("Cleaning expired browser-use runtime %s (%s)", runtime_key, container_name)
        _remove_container(container_name)
        if mode == "docker-per-task":
            _remove_volume(volume_name)
        expired_keys.append(runtime_key)
    for runtime_key in expired_keys:
        runtimes.pop(runtime_key, None)
def ensure_browser_runtime(
    task_id: str | None = None,
    metadata: dict[str, Any] | None = None,
    thread_id: str | None = None,
) -> dict[str, str]:
    """Return connection info for a browser runtime, provisioning one if needed.

    In "shared" mode this only returns the shared endpoints. In isolated
    modes it (under _REGISTRY_LOCK) evicts expired runtimes, reuses a
    running container for the resolved owner, or replaces/starts one and
    waits for its health endpoint, recording it in the on-disk registry.

    Returns a dict with keys: cdp_url, rpc_url, browser_view,
    isolation_mode, owner_hash (owner_hash is "" in shared mode).
    """
    config = get_browser_runtime_config()
    if config.mode == "shared":
        return {
            "cdp_url": config.shared_cdp_url,
            "rpc_url": _shared_rpc_url(),
            "browser_view": get_browser_view_url(task_id=task_id, metadata=metadata, thread_id=thread_id),
            "isolation_mode": "shared",
            "owner_hash": "",
        }
    # Isolated modes require a working Docker daemon; fail fast otherwise.
    _ensure_docker_access()
    owner = resolve_isolation_owner(config.mode, task_id, metadata, thread_id)
    owner_hash = hash_runtime_owner(owner)
    runtime_key = f"{config.mode}:{owner_hash}"
    container_name = _container_name(owner_hash)
    volume_name = _volume_name(owner_hash)
    browser_view_url = get_browser_view_url(task_id=task_id, metadata=metadata, thread_id=thread_id)
    with _REGISTRY_LOCK:
        registry = _load_registry()
        _cleanup_expired_runtimes_locked(registry, config)
        if _container_running(container_name):
            # Reuse the live runtime; refresh its last_used timestamp.
            registry.setdefault("runtimes", {})[runtime_key] = {
                "container_name": container_name,
                "volume_name": volume_name,
                "last_used": time.time(),
                "mode": config.mode,
                "owner_hash": owner_hash,
            }
            _save_registry(registry)
            return {
                "cdp_url": f"http://{container_name}:9222",
                "rpc_url": _runtime_rpc_url(container_name),
                "browser_view": browser_view_url,
                "isolation_mode": config.mode,
                "owner_hash": owner_hash,
            }
        # A stopped/stale container with the same name must go before re-create.
        if _container_exists(container_name):
            _remove_container(container_name)
        _start_runtime_container(container_name, volume_name, owner_hash, browser_view_url, config)
        _wait_for_runtime(container_name, config.runtime_start_timeout)
        registry.setdefault("runtimes", {})[runtime_key] = {
            "container_name": container_name,
            "volume_name": volume_name,
            "last_used": time.time(),
            "mode": config.mode,
            "owner_hash": owner_hash,
        }
        _save_registry(registry)
        return {
            "cdp_url": f"http://{container_name}:9222",
            "rpc_url": _runtime_rpc_url(container_name),
            "browser_view": browser_view_url,
            "isolation_mode": config.mode,
            "owner_hash": owner_hash,
        }
def cleanup_browser_runtime(
    task_id: str | None = None,
    metadata: dict[str, Any] | None = None,
    thread_id: str | None = None,
) -> None:
    """Tear down the runtime of a finished task in docker-per-task mode.

    A no-op for every other isolation mode: shared/per-principal runtimes
    outlive individual tasks. Removes the container, its profile volume and
    the registry entry under the registry lock.
    """
    config = get_browser_runtime_config()
    if config.mode != "docker-per-task":
        return
    owner = resolve_isolation_owner(config.mode, task_id, metadata, thread_id)
    owner_digest = hash_runtime_owner(owner)
    registry_key = f"{config.mode}:{owner_digest}"
    with _REGISTRY_LOCK:
        registry = _load_registry()
        _remove_container(_container_name(owner_digest))
        _remove_volume(_volume_name(owner_digest))
        registry.setdefault("runtimes", {}).pop(registry_key, None)
        _save_registry(registry)

View file

@ -1,28 +0,0 @@
from __future__ import annotations
from asyncio import Queue
from typing import Any, Protocol
from api.repositories.task_store import TaskRecord
class TaskServiceProtocol(Protocol):
    """Structural interface the API routes expect from a task/run service.

    Fire-and-forget task endpoints use submit_task/get_task; the thread-run
    endpoints use the run-oriented methods, including SSE subscription
    management via subscribe/unsubscribe_run_stream.
    """

    async def submit_task(self, task: str, timeout: int, metadata: dict | None) -> TaskRecord: ...
    async def get_task(self, task_id: str) -> TaskRecord | None: ...
    async def create_run(self, thread_id: str, user_input: str, timeout: int, metadata: dict | None) -> TaskRecord: ...
    async def get_run(self, run_id: str) -> TaskRecord | None: ...
    async def list_thread_runs(self, thread_id: str) -> list[TaskRecord]: ...
    async def cancel_run(self, run_id: str) -> TaskRecord | None: ...
    # Returns (existed, deleted): deleted is False while the run is active.
    async def delete_run(self, run_id: str) -> tuple[bool, bool]: ...
    async def wait_run(self, run_id: str, timeout: float | None = None) -> TaskRecord | None: ...
    async def subscribe_run_stream(self, run_id: str) -> Queue[dict[str, Any]] | None: ...
    async def unsubscribe_run_stream(self, run_id: str, queue: Queue[dict[str, Any]]) -> None: ...

View file

@ -1,260 +0,0 @@
import asyncio
import time
from typing import Any
from api.clients.browser_rpc_contracts import BrowserRpcError, BrowserRpcRunner
from api.domain.task_status import TaskStatus
from api.repositories.task_store import TaskRecord, TaskStore
from api.services.browser_runtime_manager import cleanup_browser_runtime, ensure_browser_runtime
class TaskService:
    """Coordinates browser-task execution: persistence, concurrency, streaming.

    Each submitted run is executed by a background asyncio task (`_worker`)
    that provisions a browser runtime, invokes the browser RPC, records the
    outcome in the store, and publishes lifecycle events ("started",
    "output", "completed"/"failed"/"cancelled") to stream subscribers.
    """

    def __init__(
        self,
        store: TaskStore,
        rpc_client: BrowserRpcRunner,
        max_concurrency: int,
        rpc_timeout_cap: float | None = None,
    ) -> None:
        self._store = store
        self._rpc_client = rpc_client
        # Bounds how many workers may hold a runtime/RPC call at once.
        self._semaphore = asyncio.Semaphore(max_concurrency)
        # Optional upper bound applied to each run's RPC timeout.
        self._rpc_timeout_cap = rpc_timeout_cap
        # Strong references so background tasks aren't garbage-collected.
        self._background_tasks: set[asyncio.Task[None]] = set()
        # run_id -> worker task, used by cancel_run to target one run.
        self._task_by_run_id: dict[str, asyncio.Task[None]] = {}

    async def submit_task(self, task: str, timeout: int, metadata: dict | None) -> TaskRecord:
        """Legacy task entry point: a run on the implicit "default" thread."""
        record = await self.create_run(thread_id="default", user_input=task, timeout=timeout, metadata=metadata)
        return record

    async def create_run(self, thread_id: str, user_input: str, timeout: int, metadata: dict | None) -> TaskRecord:
        """Persist a new run and launch its background worker."""
        record = await self._store.create(task=user_input, timeout=timeout, metadata=metadata, thread_id=thread_id)
        background_task = asyncio.create_task(self._worker(record.task_id))
        self._background_tasks.add(background_task)
        background_task.add_done_callback(self._background_tasks.discard)
        self._task_by_run_id[record.task_id] = background_task
        def _cleanup(_: asyncio.Task[None]) -> None:
            # Drop the cancel-lookup entry once the worker finishes.
            self._task_by_run_id.pop(record.task_id, None)
        background_task.add_done_callback(_cleanup)
        return record

    async def get_task(self, task_id: str) -> TaskRecord | None:
        """Fetch a record by id, or None when unknown."""
        return await self._store.get(task_id)

    async def get_run(self, run_id: str) -> TaskRecord | None:
        """Alias of get_task — runs and tasks share the same store."""
        return await self.get_task(run_id)

    async def list_thread_runs(self, thread_id: str) -> list[TaskRecord]:
        """All runs recorded for the given thread."""
        return await self._store.list_by_thread(thread_id)

    async def cancel_run(self, run_id: str) -> TaskRecord | None:
        """Request cancellation: mark the record and cancel its worker task.

        Returns None for unknown runs. If the store already reports the run
        cancelled, publishes the "cancelled" event directly.
        """
        rec = await self._store.set_cancel_requested(run_id)
        if rec is None:
            return None
        if rec.status == TaskStatus.cancelled:
            await self._store.publish(run_id, self._event(run_id, "cancelled", {"status": rec.status.value}))
            return rec
        task = self._task_by_run_id.get(run_id)
        if task is not None and not task.done():
            task.cancel()
        return rec

    async def delete_run(self, run_id: str) -> tuple[bool, bool]:
        """Delete a finished run; returns (existed, deleted) from the store."""
        return await self._store.delete_if_finished(run_id)

    async def wait_run(self, run_id: str, timeout: float | None = None) -> TaskRecord | None:
        """Wait (optionally bounded) for the run's done event, then re-fetch it.

        Returns the latest record even when the timeout elapses; callers
        inspect the status to tell finished from still-active.
        """
        rec = await self._store.get(run_id)
        if rec is None:
            return None
        if rec.status not in (TaskStatus.queued, TaskStatus.running):
            return rec
        try:
            if timeout is None:
                await rec.done_event.wait()
            else:
                await asyncio.wait_for(rec.done_event.wait(), timeout=timeout)
        except asyncio.TimeoutError:
            return await self._store.get(run_id)
        return await self._store.get(run_id)

    async def subscribe_run_stream(self, run_id: str):
        """Obtain an event queue for the run's stream (None when unknown)."""
        return await self._store.subscribe(run_id)

    async def unsubscribe_run_stream(self, run_id: str, queue) -> None:
        """Release a previously obtained stream queue."""
        await self._store.unsubscribe(run_id, queue)

    async def close(self) -> None:
        """Cancel and await all background workers (service shutdown)."""
        if not self._background_tasks:
            return
        for task in list(self._background_tasks):
            task.cancel()
        await asyncio.gather(*self._background_tasks, return_exceptions=True)
        self._background_tasks.clear()
        self._task_by_run_id.clear()

    async def _worker(self, task_id: str) -> None:
        """Background execution of one run.

        Marks the run running, provisions a browser runtime, calls the RPC
        with the run's timeout (capped by _rpc_timeout_cap), records the
        outcome, publishes history and lifecycle events, and always attempts
        runtime cleanup. Cancellation, timeouts, RPC errors and unexpected
        exceptions each map to a terminal status plus a published event.
        """
        rec = await self._store.set_running(task_id)
        if rec is None:
            return
        if rec.status == TaskStatus.cancelled:
            # Cancelled before it ever started; nothing to do.
            return
        await self._store.publish(task_id, self._event(task_id, "started", {"status": TaskStatus.running.value}))
        async with self._semaphore:
            runtime: dict[str, str] | None = None
            try:
                if rec.cancel_requested:
                    # Cancellation arrived while queued behind the semaphore.
                    await self._store.set_cancelled(task_id)
                    await self._store.publish(task_id, self._event(task_id, "cancelled", {"status": TaskStatus.cancelled.value}))
                    return
                # Runtime provisioning shells out to docker: run in a thread.
                runtime = await asyncio.to_thread(
                    ensure_browser_runtime,
                    task_id=task_id,
                    metadata=rec.metadata,
                    thread_id=rec.thread_id,
                )
                rpc_timeout = float(rec.timeout)
                if self._rpc_timeout_cap is not None:
                    rpc_timeout = min(rpc_timeout, self._rpc_timeout_cap)
                # Outer wait_for grants the RPC 5s of grace beyond the run timeout.
                raw = await asyncio.wait_for(
                    self._rpc_client.run(task=rec.task, timeout_sec=rpc_timeout, rpc_url=runtime.get("rpc_url")),
                    timeout=float(rec.timeout) + 5,
                )
                raw = self._with_runtime_metadata(raw, runtime)
                success = bool(raw.get("success"))
                await self._store.set_done(
                    task_id=task_id,
                    success=success,
                    raw_response=raw,
                    error=None,
                    result=raw.get("result") if isinstance(raw, dict) else None,
                    history=self._extract_history(raw),
                )
                done = await self._store.get(task_id)
                if done is not None:
                    # Replay per-step history, then the terminal event.
                    await self._publish_history_events(done)
                    await self._store.publish(
                        task_id,
                        self._event(task_id, "completed" if success else "failed", {
                            "status": done.status.value,
                            "output": done.result,
                            "error": done.error,
                        }),
                    )
            except asyncio.CancelledError:
                await self._store.set_cancelled(task_id)
                await self._store.publish(task_id, self._event(task_id, "cancelled", {"status": TaskStatus.cancelled.value}))
                # Re-raise so asyncio sees the task as properly cancelled.
                raise
            except asyncio.TimeoutError:
                await self._store.set_done(
                    task_id=task_id,
                    success=False,
                    raw_response=None,
                    error="Timeout exceeded",
                    history=None,
                )
                failed = await self._store.get(task_id)
                if failed is not None:
                    await self._store.publish(task_id, self._event(task_id, "failed", {
                        "status": failed.status.value,
                        "error": failed.error,
                    }))
            except BrowserRpcError as exc:
                await self._store.set_done(
                    task_id=task_id,
                    success=False,
                    raw_response=None,
                    error=str(exc),
                    history=None,
                )
                failed = await self._store.get(task_id)
                if failed is not None:
                    await self._store.publish(task_id, self._event(task_id, "failed", {
                        "status": failed.status.value,
                        "error": failed.error,
                    }))
            except Exception as exc:
                await self._store.set_done(
                    task_id=task_id,
                    success=False,
                    raw_response=None,
                    error=f"Internal error: {exc}",
                    history=None,
                )
                failed = await self._store.get(task_id)
                if failed is not None:
                    await self._store.publish(task_id, self._event(task_id, "failed", {
                        "status": failed.status.value,
                        "error": failed.error,
                    }))
            finally:
                # Best-effort teardown of per-task runtimes; never mask the outcome.
                try:
                    await asyncio.to_thread(
                        cleanup_browser_runtime,
                        task_id=task_id,
                        metadata=rec.metadata,
                        thread_id=rec.thread_id,
                    )
                except Exception:
                    pass

    async def _publish_history_events(self, rec: TaskRecord) -> None:
        """Publish one "output" stream event per recorded history step."""
        for index, item in enumerate(rec.history, start=1):
            await self._store.publish(
                rec.task_id,
                self._event(rec.task_id, "output", {
                    "step": item.get("step", index),
                    "kind": item.get("kind") or item.get("type") or "system",
                    "content": item.get("content"),
                    "data": item.get("data") if isinstance(item.get("data"), dict) else {},
                }),
            )

    @staticmethod
    def _event(run_id: str, event: str, data: dict[str, Any]) -> dict[str, Any]:
        """Build the common stream-event envelope with a wall-clock timestamp."""
        return {
            "run_id": run_id,
            "event": event,
            "ts": time.time(),
            "data": data,
        }

    @staticmethod
    def _extract_history(raw: dict | None) -> list[dict]:
        """Pull the list of dict history entries out of a raw RPC response.

        Non-dict responses, a missing/non-list "history" field, and non-dict
        entries all degrade to an empty/filtered list.
        """
        if not isinstance(raw, dict):
            return []
        events = raw.get("history")
        if not isinstance(events, list):
            return []
        normalized: list[dict] = []
        for event in events:
            if isinstance(event, dict):
                normalized.append(event)
        return normalized

    @staticmethod
    def _with_runtime_metadata(raw: dict[str, Any], runtime: dict[str, str] | None) -> dict[str, Any]:
        """Return a copy of *raw* annotated with runtime view/isolation info.

        Leaves *raw* untouched when it isn't a dict or no runtime info exists;
        never overwrites a browser_view the RPC already provided.
        """
        if not isinstance(raw, dict) or not runtime:
            return raw
        enriched = dict(raw)
        browser_view = runtime.get("browser_view")
        if browser_view and not enriched.get("browser_view"):
            enriched["browser_view"] = browser_view
        enriched["isolation_mode"] = runtime.get("isolation_mode", "shared")
        owner_hash = runtime.get("owner_hash")
        if owner_hash:
            enriched["owner_hash"] = owner_hash
        return enriched

View file

@ -1,197 +0,0 @@
import requests
import time
import json
from datasets import load_dataset
from datetime import datetime
# API configuration
API_URL = "http://localhost:8088/api/browser/tasks"
HEADERS = {"Content-Type": "application/json"}
# Load the benchmark dataset
dataset = load_dataset("iMeanAI/Mind2Web-Live", split="train")
# For a smoke test take the first N tasks (use len(dataset) for the full benchmark)
TEST_SIZE = 10 # or len(dataset) for the full benchmark
dataset = dataset.select(range(TEST_SIZE))
print(f"Загружено задач: {len(dataset)}")
print(f"Поля: {dataset[0].keys()}\n")
# NOTE(review): the main loop skips the first `cnt` tasks — presumably to
# resume a previous partial run; confirm this offset is still wanted.
cnt = 3
results = []  # accumulated per-task metric records
# Main benchmark loop: submit each dataset task to the API, poll until it
# leaves the queued/running states, then record timing metrics.
for idx, item in enumerate(dataset):
    # Skip the first `cnt` items (resume offset configured above).
    if cnt > 0:
        cnt -= 1
        continue
    # Dataset fields
    task_desc = item['task']  # task description
    ref_length = item['reference_task_length']  # reference length in steps
    evaluation = item['evaluation']  # evaluation criteria (unused here; kept for reference)
    # Locally unique task id (index + timestamp)
    task_id_orig = f"mind2web_{idx}_{int(time.time())}"
    print(f"\n[{idx + 1}/{len(dataset)}] Task: {task_desc[:70]}...")
    print(f" Эталонная длина: {ref_length} шагов")
    start_time = time.time()
    # 1. Create the task through the API
    try:
        resp = requests.post(
            API_URL,
            json={
                "task": task_desc,
                "timeout": 300,  # generous timeout for complex tasks
                "metadata": {
                    "source": "mind2web",
                    "reference_length": ref_length
                }
            },
            headers=HEADERS,
            timeout=10
        )
        if resp.status_code != 202:
            print(f" ❌ Ошибка создания задачи: {resp.status_code}")
            print(f" Ответ: {resp.text}")
            continue
        api_task_id = resp.json()["task_id"]
        created_at = time.time()
        queue_time = created_at - start_time
        print(f" 📝 Task ID: {api_task_id} | Очередь: {queue_time:.2f}с")
        # 2. Poll for completion, reporting progress periodically
        status = "queued"
        poll_count = 0
        while status in ["queued", "running"]:
            time.sleep(2)  # polling interval
            poll_count += 1
            try:
                status_resp = requests.get(f"{API_URL}/{api_task_id}", timeout=5)
                if status_resp.status_code == 200:
                    status_data = status_resp.json()
                    status = status_data.get("status", "unknown")
                    # Show progress every 5 polls
                    if poll_count % 5 == 0:
                        elapsed = time.time() - start_time
                        print(f" ⏳ Статус: {status} | Прошло: {elapsed:.1f}с")
            except Exception as e:
                print(f" ⚠️ Ошибка опроса: {e}")
        end_time = time.time()
        execution_time = end_time - start_time
        # 3. Fetch the result payload
        result_resp = requests.get(f"{API_URL}/{api_task_id}/result", timeout=10)
        result_data = None
        if result_resp.status_code == 200:
            try:
                result_data = result_resp.json()
            except Exception:  # non-JSON body: keep the raw text
                result_data = result_resp.text
        # 4. Record metrics
        result = {
            "index": idx,
            "original_task_id": task_id_orig,
            "api_task_id": api_task_id,
            "task_description": task_desc,
            "reference_length": ref_length,
            "status": status,
            "queue_time_sec": round(queue_time, 2),
            "execution_time_sec": round(execution_time, 2),
            "total_time_sec": round(end_time - start_time, 2),
            "result": result_data,
            "timestamp": datetime.now().isoformat()
        }
        results.append(result)
        # Checkpoint all results after every task so a crash loses nothing.
        filename = "mind2web_benchmark.json"
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2, ensure_ascii=False)
        # Status emoji.
        # BUG FIX: the original compared against "succeeded" (a status this API
        # never reports — terminal statuses are "completed"/"failed", see the
        # summary section) and both branches were empty strings.
        status_emoji = "✅" if status == "completed" else "❌"
        print(f" {status_emoji} Статус: {status} | Время: {execution_time:.1f}с")
    except requests.exceptions.Timeout:
        print(f" ❌ Таймаут при создании задачи")
    except Exception as e:
        print(f" ❌ Ошибка: {type(e).__name__}: {e}")
        continue
# Save detailed results under a timestamped filename
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"mind2web_benchmark_{timestamp}.json"
with open(filename, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2, ensure_ascii=False)
print("\n" + "=" * 60)
print("📊 ИТОГОВЫЕ МЕТРИКИ СКОРОСТИ")
print("=" * 60)
# Status breakdown
completed = [r for r in results if r["status"] == "completed"]
failed = [r for r in results if r["status"] == "failed"]
unknown = [r for r in results if r["status"] not in ["completed", "failed"]]
print(f"\n📈 СТАТУСЫ:")
print(f" Всего задач: {len(results)}")
print(f" ✅ Успешно: {len(completed)} ({len(completed) / max(len(results), 1) * 100:.1f}%)")
print(f" ❌ Провалено: {len(failed)} ({len(failed) / max(len(results), 1) * 100:.1f}%)")
if unknown:
    print(f" ❓ Неизвестный статус: {len(unknown)}")
if completed:
    total_times = [r["total_time_sec"] for r in completed]
    queue_times = [r["queue_time_sec"] for r in completed]
    exec_times = [r["execution_time_sec"] for r in completed]
    print(f"\n⏱️ ВРЕМЯ ВЫПОЛНЕНИЯ:")
    print(f" Среднее: {sum(total_times) / len(total_times):.2f} сек")
    print(f" Медиана (p50): {sorted(total_times)[len(total_times) // 2]:.2f} сек")
    if len(total_times) >= 20:
        print(f" p95: {sorted(total_times)[int(len(total_times) * 0.95)]:.2f} сек")
    print(f" Мин: {min(total_times):.2f} сек")
    print(f" Макс: {max(total_times):.2f} сек")
    print(f"\n📊 ПРОИЗВОДИТЕЛЬНОСТЬ:")
    print(f" Среднее время в очереди: {sum(queue_times) / len(queue_times):.2f} сек")
    tasks_per_hour = 3600 / (sum(total_times) / len(total_times))
    print(f" Скорость выполнения: {tasks_per_hour:.1f} задач/час")
    # Efficiency relative to the reference task length
    if all("reference_length" in r for r in completed):
        avg_ref_length = sum(r["reference_length"] for r in completed) / len(completed)
        time_per_step = (sum(total_times) / len(total_times)) / avg_ref_length
        print(f" Среднее время на шаг: {time_per_step:.2f} сек")
# BUG FIX: this line printed the literal text "(unknown)" instead of the
# results filename; interpolate the actual path written above.
print(f"\n💾 Результаты сохранены в: {filename}")
# Build a short comparison summary
summary = {
    "benchmark": "Online-Mind2Web",
    "timestamp": timestamp,
    "api_endpoint": API_URL,
    "total_tasks": len(results),
    "completed": len(completed),
    "failed": len(failed),
    "success_rate": len(completed) / max(len(results), 1) * 100,
    "avg_time_sec": sum(total_times) / len(total_times) if completed else None,
    "median_time_sec": sorted(total_times)[len(total_times) // 2] if completed else None,
    "tasks_per_hour": 3600 / (sum(total_times) / len(total_times)) if completed else None
}
summary_file = f"mind2web_summary_{timestamp}.json"
with open(summary_file, "w", encoding="utf-8") as f:
    json.dump(summary, f, indent=2, ensure_ascii=False)
print(f"📋 Краткий отчет сохранен в: {summary_file}")

View file

@ -1,97 +0,0 @@
from unittest.mock import MagicMock, patch
def test_resolve_isolation_owner_prefers_user_id():
    """metadata["user_id"] outranks session_id, thread_id and task_id."""
    from api.services.browser_runtime_manager import resolve_isolation_owner

    resolved = resolve_isolation_owner(
        "docker-per-principal",
        task_id="task-1",
        metadata={"user_id": "user-7", "session_id": "session-9"},
        thread_id="thread-1",
    )
    assert resolved == "user-7"
def test_resolve_isolation_owner_uses_task_for_per_task_mode():
    """docker-per-task keys on the task id even when user metadata exists."""
    from api.services.browser_runtime_manager import resolve_isolation_owner

    resolved = resolve_isolation_owner(
        "docker-per-task",
        task_id="task-42",
        metadata={"user_id": "user-7"},
        thread_id="thread-1",
    )
    assert resolved == "task-42"
def test_hash_runtime_owner_is_stable():
    """Hashing is deterministic and distinct for different owners."""
    from api.services.browser_runtime_manager import hash_runtime_owner

    first = hash_runtime_owner("owner-1")
    second = hash_runtime_owner("owner-2")
    assert first == hash_runtime_owner("owner-1")
    assert first != second
def test_shared_mode_returns_shared_runtime(monkeypatch):
    """In shared mode the manager must hand back the globally configured
    endpoints instead of provisioning a per-owner container."""
    from api.services import browser_runtime_manager
    # Point the manager at fixed shared endpoints via environment variables.
    monkeypatch.setenv("BROWSER_USE_ISOLATION_MODE", "shared")
    monkeypatch.setenv("BROWSER_URL", "http://shared-browser:9333")
    monkeypatch.setenv("BROWSER_USE_RPC_URL", "http://shared-browser:8787/run")
    monkeypatch.setenv("BROWSER_VIEW_BASE_URL", "https://viewer.example.com")
    runtime = browser_runtime_manager.ensure_browser_runtime(
        task_id="task-1",
        metadata={"user_id": "user-7"},
        thread_id="thread-1",
    )
    # Shared endpoints come back verbatim; the viewer URL gains the noVNC path.
    assert runtime["cdp_url"] == "http://shared-browser:9333"
    assert runtime["rpc_url"] == "http://shared-browser:8787/run"
    assert runtime["browser_view"] == "https://viewer.example.com/vnc.html?path=websockify"
    assert runtime["isolation_mode"] == "shared"
def test_isolated_mode_starts_container(monkeypatch):
    """Per-principal mode must provision a dedicated docker runtime:
    ``docker run`` is invoked with the configured image, owner-specific URLs
    are returned, the registry is persisted, and readiness is awaited."""
    from api.services import browser_runtime_manager
    monkeypatch.setenv("BROWSER_USE_ISOLATION_MODE", "docker-per-principal")
    monkeypatch.setenv("BROWSER_RUNTIME_IMAGE", "browser-use-browser-runtime:test")
    monkeypatch.setenv("BROWSER_RUNTIME_NETWORK", "browser-net")
    monkeypatch.setenv("BROWSER_VIEW_BASE_URL", "https://viewer.example.com")
    saved_registry = {}
    docker_calls = []

    def fake_run_docker(args, check=True):
        # Record every docker CLI invocation; report "not found" for inspect
        # so the manager takes the container-create path.
        docker_calls.append(args)
        if args[:2] == ["inspect", "-f"]:
            return MagicMock(returncode=1, stdout="", stderr="")
        if args[:1] == ["inspect"]:
            return MagicMock(returncode=1, stdout="", stderr="")
        return MagicMock(returncode=0, stdout="ok", stderr="")

    with (
        patch.object(browser_runtime_manager, "_load_registry", return_value={"runtimes": {}}),
        patch.object(browser_runtime_manager, "_save_registry", side_effect=lambda payload: saved_registry.update(payload)),
        patch.object(browser_runtime_manager, "_run_docker", side_effect=fake_run_docker),
        patch.object(browser_runtime_manager, "_wait_for_runtime") as mock_wait,
    ):
        runtime = browser_runtime_manager.ensure_browser_runtime(
            task_id="task-1",
            metadata={"user_id": "user-7"},
            thread_id="thread-1",
        )
    assert runtime["isolation_mode"] == "docker-per-principal"
    # URLs must target the per-owner container, not the shared browser.
    assert runtime["cdp_url"].startswith("http://browser-use-browser-")
    assert runtime["rpc_url"].startswith("http://browser-use-browser-")
    assert runtime["rpc_url"].endswith(":8787/run")
    assert "/view/" in runtime["browser_view"]
    assert saved_registry["runtimes"]
    run_commands = [call for call in docker_calls if call[:2] == ["run", "-d"]]
    assert run_commands
    assert "browser-use-browser-runtime:test" in run_commands[0]
    mock_wait.assert_called_once()

View file

@ -1,62 +0,0 @@
import asyncio
from typing import Any
class FakeRpcClient:
    """Minimal stand-in for the browser RPC client.

    Records every call in ``self.calls`` and always reports success so the
    service-layer routing can be asserted without a real browser runtime.
    """

    def __init__(self) -> None:
        # Each entry: {"task", "timeout_sec", "rpc_url"} as received by run().
        self.calls: list[dict[str, Any]] = []

    async def run(self, task: str, timeout_sec: float, rpc_url: str | None = None) -> dict[str, Any]:
        """Record the invocation and return a canned successful response."""
        self.calls.append({"task": task, "timeout_sec": timeout_sec, "rpc_url": rpc_url})
        return {"success": True, "result": "done"}
def test_task_service_routes_run_to_browser_runtime(monkeypatch):
    """A run must be executed against the runtime's rpc_url with the service
    timeout cap applied; runtime info must be merged into the raw response,
    and the runtime must be cleaned up afterwards."""
    from api.repositories.task_store import TaskStore
    from api.services import task_service as task_service_module
    from api.services.task_service import TaskService
    # Fake runtime handed out by the patched ensure_browser_runtime().
    runtime = {
        "rpc_url": "http://browser-use-browser-abc:8787/run",
        "browser_view": "https://viewer.example.com/view/abc/vnc.html?path=view/abc/websockify",
        "isolation_mode": "docker-per-principal",
        "owner_hash": "abc",
    }
    cleanup_calls = []
    monkeypatch.setattr(task_service_module, "ensure_browser_runtime", lambda **_: runtime)
    monkeypatch.setattr(task_service_module, "cleanup_browser_runtime", lambda **kwargs: cleanup_calls.append(kwargs))

    async def scenario():
        # Drive one create/wait cycle through the service with a fake RPC client.
        rpc_client = FakeRpcClient()
        service = TaskService(
            store=TaskStore(),
            rpc_client=rpc_client,
            max_concurrency=1,
            rpc_timeout_cap=30,
        )
        rec = await service.create_run(
            thread_id="thread-1",
            user_input="open example.com",
            timeout=60,
            metadata={"user_id": "user-7"},
        )
        done = await service.wait_run(rec.task_id, timeout=2)
        await service.close()
        return rpc_client, done

    rpc_client, done = asyncio.run(scenario())
    # Requested timeout 60 must be capped at rpc_timeout_cap (30).
    assert rpc_client.calls == [
        {
            "task": "open example.com",
            "timeout_sec": 30,
            "rpc_url": "http://browser-use-browser-abc:8787/run",
        }
    ]
    assert done is not None
    assert done.raw_response is not None
    assert done.raw_response["browser_view"] == runtime["browser_view"]
    assert done.raw_response["isolation_mode"] == "docker-per-principal"
    assert done.raw_response["owner_hash"] == "abc"
    assert cleanup_calls

View file

@ -0,0 +1,30 @@
---
## Файл: assets/config.example.json
```json
{
"browser": {
"headless": true,
"timeout": 30000,
"viewport": {
"width": 1280,
"height": 720
},
"user_agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
},
"screenshots": {
"path": "/tmp/browser-use-screenshots",
"format": "png",
"full_page": true
},
"retry": {
"max_attempts": 3,
"delay_seconds": 2
},
"logging": {
"level": "info",
"save_screenshots_on_error": true
}
}

191
browser-use/SKILL.md Normal file
View file

@ -0,0 +1,191 @@
---
name: browser-use
version: "1.1.0"
description: Run web automation tasks through browser-use and Chromium CDP (headless or GUI).
triggers:
- "browser-use"
- "open website and extract"
- "automate browser task"
- "run browser task"
- "открой сайт"
- "заполни форму"
- "найди на странице"
- "сделай в браузере"
allowed-tools:
- terminal
- file
- memory
---
# Browser Use (Chromium/CDP)
Use this skill when a task requires real browser actions: open pages, click, type, submit forms, extract text/data, verify visible results.
## Decision: when to use this skill
Use `browser-use` if user asks to:
- navigate websites step-by-step;
- interact with UI elements (buttons, inputs, dropdowns);
- extract structured content from rendered pages;
- complete multi-step flows (login/search/filter/checkout draft).
Do **not** use `browser-use` if task is:
- pure static fetch/API call (use lighter tools);
- local file manipulation only;
- impossible due to CAPTCHA/2FA/region lock without user intervention.
## What the agent can and cannot see
Short answer to a common question: **the agent sees the rendered page state, not all JavaScript source by default**.
The agent typically sees/uses:
- rendered DOM and interactive elements;
- visible text/content after JS execution;
- current URL, titles, form states;
- action results/errors returned by browser-use.
The agent does **not automatically** get:
- full source code of all loaded JS bundles;
- complete DevTools Network timeline;
- hidden backend logic not exposed in page content.
If user asks about JS specifically, do explicit steps:
1. locate script URLs from page source/DOM;
2. open script URL(s) directly;
3. extract needed fragments (function names, endpoints, constants).
## Runtime modes (CDP endpoints)
This project supports two modes.
1) Headless browserless Chromium:
- CDP: `ws://chromium:3000/chromium?token=hermes-local`
2) GUI Chromium (visible in noVNC):
- CDP: `http://172.25.0.3:9223`
- Visual stream: `http://localhost:6080/vnc.html`
Notes:
- `run_browser_use.py` accepts both `ws://` and `http://` CDP URLs.
- For `http://`, script resolves `/json/version` and converts to websocket URL automatically.
## Required environment
Minimum required env vars:
- `OPENAI_API_KEY`
- optional: `OPENAI_BASE_URL`
- optional: `OPENAI_MODEL` or `BROWSER_USE_MODEL`
- optional override: `BROWSER_USE_CDP_URL`
Defaults in this repo:
- `BROWSER_USE_PYTHON=/opt/browser-use-venv/bin/python`
- `BROWSER_USE_CDP_URL=http://172.25.0.3:9223` (from `docker-compose.yml`)
## Quick runbook (inside Docker)
1. Ensure services are up:
```bash
docker compose --profile gui up -d
docker compose ps
```
2. Check env in `hermes-agent`:
```bash
docker compose exec -T hermes-agent python - <<'PY'
import os
print('OPENAI_API_KEY', '<set>' if os.getenv('OPENAI_API_KEY') else '<missing>')
print('BROWSER_USE_CDP_URL', os.getenv('BROWSER_USE_CDP_URL', '<missing>'))
print('OPENAI_MODEL', os.getenv('OPENAI_MODEL', '<missing>'))
PY
```
3. Run a task:
```bash
python-browser-use /root/.hermes/skills/autonomous-ai-agents/browser-use/scripts/run_browser_use.py \
--task "Open example.com and return page title" \
--max-steps 8
```
4. For GUI visibility, open stream:
```bash
open "http://localhost:6080/vnc.html"
```
## Runbook (outside Docker)
Use one combined command so env vars are available in the same process:
```bash
export OPENAI_API_KEY="$OPENAI_API_KEY" && \
export BROWSER_USE_CDP_URL="$BROWSER_USE_CDP_URL" && \
/opt/browser-use-venv/bin/python /root/.hermes/skills/autonomous-ai-agents/browser-use/scripts/run_browser_use.py \
--task "<task>" \
--max-steps 20
```
## How Hermes should call this skill
Standard pattern:
```bash
python-browser-use /root/.hermes/skills/autonomous-ai-agents/browser-use/scripts/run_browser_use.py \
--task "<user task in plain language>" \
--max-steps 20
```
If user gave a starting page, add `--start-url`.
```bash
python-browser-use /root/.hermes/skills/autonomous-ai-agents/browser-use/scripts/run_browser_use.py \
--task "Find contact email" \
--start-url "https://example.com" \
--max-steps 20
```
## Troubleshooting (symptom -> action)
`{"success": false, "error": "OPENAI_API_KEY is not set"}`
- check `workspace/.env` and `hermes_data/.env`;
- recreate container:
```bash
docker compose up -d --force-recreate hermes-agent
```
`401 key_model_access_denied`
- model is not allowed for API key;
- set `BROWSER_USE_MODEL` or `OPENAI_MODEL` to an allowed model.
`Connection refused` or CDP errors
- verify browser container is running:
```bash
docker compose ps
docker compose exec -T hermes-agent bash -lc 'curl -s http://172.25.0.3:9223/json/version | head'
```
Timeout / exit code `124`
- not necessarily script failure;
- increase `--max-steps` and/or task timeout envelope.
## Site-specific limitations
- Yandex Music: may be blocked by region.
- Wildberries: anti-bot/CAPTCHA may block automation.
When blocked by anti-bot/2FA/CAPTCHA:
- ask user for manual intervention;
- continue automation after challenge is passed;
- or switch to non-browser strategy if acceptable.
## Operational notes
- Script file: `/root/.hermes/skills/autonomous-ai-agents/browser-use/scripts/run_browser_use.py`
- Script output: JSON (`success`, `cdp_url`, `result.final_result`, `result.errors`)
- In current implementation `use_vision=False`, so decisions are based on browser-use structured state rather than visual screenshot reasoning.

View file

@ -0,0 +1,16 @@
{
"browser": {
"cdp_url": "ws://chromium:3000/chromium?token=hermes-local",
"headless": true,
"timeout": 120000
},
"agent": {
"model_env": "BROWSER_USE_MODEL",
"max_steps": 20,
"use_vision": false
},
"logging": {
"level": "info"
}
}

View file

@ -0,0 +1,16 @@
{
"browser": {
"cdp_url": "ws://chromium:3000/playwright?token=hermes-local",
"headless": true,
"timeout": 120000
},
"agent": {
"model_env": "BROWSER_USE_MODEL",
"max_steps": 20,
"use_vision": false
},
"logging": {
"level": "info"
}
}

View file

@ -0,0 +1,2 @@
browser-use==0.12.5

View file

@ -0,0 +1,181 @@
#!/usr/bin/env python3
"""Run browser-use task against a Chromium CDP endpoint."""
import argparse
import asyncio
import json
import os
import socket
from pathlib import Path
from typing import Any
from urllib.parse import urlparse, urlunparse
from urllib.request import urlopen
from browser_use import Agent, BrowserSession
from browser_use.llm import ChatOpenAI
# Candidate .env files probed (in order) when a variable is absent from the
# process environment; the first file that defines a key wins.
ENV_FALLBACK_PATHS = (
    Path("/workspace/.env"),
    Path("/workspace/workspace/.env"),
    Path("/root/.hermes/.env"),
)
def _read_env_from_files(name: str) -> str | None:
    """Look up *name* in the fallback .env files (ENV_FALLBACK_PATHS order).

    Returns the first matching value with surrounding quotes stripped, or
    None when the key is absent everywhere.  A matching key whose value is
    empty also yields None (and stops the search, like the original lookup).
    Unreadable files are skipped silently.
    """
    for candidate in ENV_FALLBACK_PATHS:
        if not candidate.exists():
            continue
        try:
            text = candidate.read_text(encoding="utf-8")
        except OSError:
            continue
        for raw in text.splitlines():
            entry = raw.strip()
            if not entry or entry.startswith("#") or "=" not in entry:
                continue
            key, _, value = entry.partition("=")
            if key.strip() != name:
                continue
            cleaned = value.strip().strip('"').strip("'")
            return cleaned if cleaned else None
    return None
def _get_env(name: str, default: str | None = None) -> str | None:
value = os.getenv(name)
if value:
return value
from_file = _read_env_from_files(name)
if from_file:
return from_file
return default if default else None
def _build_task(task: str, start_url: str | None) -> str:
if not start_url:
return task
return f"Start from {start_url}. Task: {task}"
def _serialize_history(history: Any) -> dict[str, Any]:
result = ""
errors: list[str] = []
if hasattr(history, "final_result"):
try:
result = history.final_result() or ""
except Exception:
result = ""
if hasattr(history, "errors"):
try:
raw_errors = list(history.errors())
errors = [str(e) for e in raw_errors if e]
except Exception:
errors = []
return {
"final_result": result,
"errors": errors,
"has_errors": bool(errors),
}
def _resolve_cdp_url(cdp_url: str) -> str:
    """Normalize a CDP endpoint to a websocket URL.

    ``ws://``/``wss://`` URLs are returned unchanged.  For ``http(s)://``
    URLs the DevTools ``/json/version`` endpoint is queried and its
    ``webSocketDebuggerUrl`` is returned.  Any other scheme raises
    RuntimeError.
    """
    if cdp_url.startswith("ws://") or cdp_url.startswith("wss://"):
        return cdp_url
    if cdp_url.startswith("http://") or cdp_url.startswith("https://"):
        parsed = urlparse(cdp_url)
        host = parsed.hostname or ""
        port = parsed.port
        # Chrome DevTools rejects non-IP/non-localhost Host headers in some setups.
        # For docker service names, resolve to IP and query via numeric host.
        if host not in {"localhost", "127.0.0.1", "0.0.0.0"}:
            try:
                resolved_host = socket.gethostbyname(host)
                netloc = resolved_host if not port else f"{resolved_host}:{port}"
                parsed = parsed._replace(netloc=netloc)
            except OSError:
                # DNS failure: fall back to querying the original host name.
                pass
        version_url = urlunparse(parsed).rstrip("/")
        if not version_url.endswith("/json/version"):
            version_url = f"{version_url}/json/version"
        with urlopen(version_url, timeout=10) as response:  # nosec B310
            payload = json.loads(response.read().decode("utf-8"))
        ws_url = payload.get("webSocketDebuggerUrl")
        if not ws_url:
            raise RuntimeError(f"CDP endpoint did not return webSocketDebuggerUrl: {version_url}")
        # Keep a reachable host for ws:// URL when input used docker DNS alias.
        if host and host not in {"localhost", "127.0.0.1", "0.0.0.0"}:
            ws_parsed = urlparse(str(ws_url))
            ws_netloc = ws_parsed.netloc
            ws_port = ws_parsed.port
            if ws_port is None:
                # Infer the default port from the websocket scheme.
                ws_port = 443 if ws_parsed.scheme == "wss" else 80
            try:
                resolved_host = socket.gethostbyname(host)
                ws_netloc = f"{resolved_host}:{ws_port}"
                ws_url = urlunparse(ws_parsed._replace(netloc=ws_netloc))
            except OSError:
                pass
        return str(ws_url)
    raise RuntimeError(f"Unsupported CDP URL scheme: {cdp_url}")
async def _run(args: argparse.Namespace) -> int:
    """Execute one browser-use task and print a JSON report to stdout.

    Exit codes: 0 = success, 1 = task finished with errors, 2 = missing API key.
    """
    api_key = _get_env("OPENAI_API_KEY")
    if not api_key:
        print(json.dumps({"success": False, "error": "OPENAI_API_KEY is not set"}))
        return 2
    # Model / base URL / CDP endpoint come from env (with .env file fallbacks).
    model = _get_env("BROWSER_USE_MODEL", _get_env("OPENAI_MODEL", "gpt-4o-mini"))
    base_url = _get_env("OPENAI_BASE_URL")
    raw_cdp_url = args.cdp_url or _get_env("BROWSER_USE_CDP_URL", "ws://chromium:3000/chromium?token=hermes-local")
    cdp_url = _resolve_cdp_url(raw_cdp_url)
    # temperature=0.0 keeps the agent's planning deterministic.
    llm = ChatOpenAI(
        model=model,
        api_key=api_key,
        base_url=base_url,
        temperature=0.0,
    )
    browser_session = BrowserSession(cdp_url=cdp_url)
    # use_vision=False: the agent reasons over structured DOM state, not screenshots.
    agent = Agent(
        task=_build_task(args.task, args.start_url),
        llm=llm,
        browser_session=browser_session,
        use_vision=False,
    )
    history = await agent.run(max_steps=args.max_steps)
    payload = _serialize_history(history)
    print(
        json.dumps(
            {
                "success": not payload["has_errors"],
                "model": model,
                "cdp_url": cdp_url,
                "task": args.task,
                "result": payload,
            },
            ensure_ascii=True,
        )
    )
    return 0 if not payload["has_errors"] else 1
def main() -> int:
    """CLI entry point: parse arguments and execute the browser task."""
    cli = argparse.ArgumentParser(description="Run browser-use task")
    cli.add_argument("--task", required=True, help="Natural language task for browser-use")
    cli.add_argument("--start-url", default=None, help="Optional URL to open first")
    cli.add_argument("--max-steps", type=int, default=20, help="Max agent steps")
    cli.add_argument("--cdp-url", default=None, help="CDP URL (ws://... or http://.../json/version host)")
    return asyncio.run(_run(cli.parse_args()))


if __name__ == "__main__":
    raise SystemExit(main())

View file

@ -0,0 +1,12 @@
#!/usr/bin/env bash
set -euo pipefail
# Create (or refresh) a virtualenv next to this script and install the
# skill's Python dependencies into it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
VENV_DIR="${SCRIPT_DIR}/.venv"
python3 -m venv "${VENV_DIR}"
"${VENV_DIR}/bin/pip" install --upgrade pip
"${VENV_DIR}/bin/pip" install -r "${SCRIPT_DIR}/requirements.txt"
echo "browser-use skill environment is ready: ${VENV_DIR}"

View file

@ -1,36 +0,0 @@
FROM debian:bookworm-slim

# Non-interactive apt during build.
ENV DEBIAN_FRONTEND=noninteractive

# Chromium + X stack (Xvfb/fluxbox), VNC/noVNC for remote viewing,
# socat for DevTools port proxying, Python for the browser-use RPC runner.
RUN apt-get update && apt-get install -y --no-install-recommends \
    chromium \
    python3 \
    python3-pip \
    xvfb \
    fluxbox \
    x11vnc \
    novnc \
    websockify \
    dbus-x11 \
    socat \
    procps \
    curl \
    ca-certificates \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /src

# Persistent Chromium profile directory (mounted as a volume by compose).
RUN mkdir -p /src/browser_data

# Debian marks system Python as externally managed (PEP 668); remove the
# marker so uv/pip can install browser-use into the system environment.
RUN rm -f /usr/lib/python3.*/EXTERNALLY-MANAGED \
    && python3 -m pip install --no-cache-dir --break-system-packages uv \
    && uv pip install --system --no-cache-dir \
    "browser-use>=0.12.5" \
    "langchain-openai>=0.3.0"

COPY entrypoint.sh /entrypoint.sh
COPY browser_use_runner.py /src/browser_use_runner.py
RUN chmod +x /entrypoint.sh

# noVNC web UI, public DevTools proxy, browser-use RPC (see entrypoint.sh).
EXPOSE 6080 9222 8787

ENTRYPOINT ["/entrypoint.sh"]

View file

@ -1,247 +0,0 @@
import asyncio
import inspect
import json
import os
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
from typing import Any, Literal
from urllib import error, request
from browser_use import Agent, Browser, ChatOpenAI
from pydantic import BaseModel, Field, ValidationError, field_validator
# Extra system-prompt text appended to the agent (via extend_system_message in
# run_browser_task) to bias it toward short, multi-action plans.
SPEED_OPTIMIZATION_PROMPT = """
Speed optimization instructions:
- Be extremely concise and direct in your responses
- Get to the goal as quickly as possible
- Use multi-action sequences whenever possible to reduce steps
"""
class RunTaskRequest(BaseModel):
    """RPC payload for launching a browser-use task."""

    # Non-empty natural-language task description.
    task: str = Field(..., min_length=1)

    @field_validator("task")
    @classmethod
    def validate_task(cls, value: str) -> str:
        """Trim surrounding whitespace and reject blank tasks."""
        normalized = value.strip()
        if not normalized:
            raise ValueError("Field 'task' is required")
        return normalized
class HistoryEvent(BaseModel):
    """Normalized event extracted from the agent's history."""

    step: int                    # 1-based position in the emitted event list
    kind: str                    # "thought" | "action" | "error" | "system" (see _extract_history_events)
    content: str | None = None   # human-readable rendering of the payload
    data: dict[str, Any] = Field(default_factory=dict)  # JSON-safe raw payload
class RunTaskSuccessResponse(BaseModel):
    """Successful response from the RPC runner."""

    success: Literal[True] = True
    result: str | None = None  # agent's final result, if any
    history: list[HistoryEvent] = Field(default_factory=list)  # normalized run trace
    browser_view: str = ""  # URL for watching the browser (from BROWSER_VIEW_URL)
class RunTaskErrorResponse(BaseModel):
    """Task-execution error reported by the RPC runner."""

    success: Literal[False] = False
    error: str  # human-readable failure description
def _json_response(handler: BaseHTTPRequestHandler, status_code: int, payload: dict[str, Any] | BaseModel) -> None:
    """Serialize *payload* (plain dict or pydantic model) and write it as an
    HTTP JSON response on *handler*."""
    if isinstance(payload, BaseModel):
        body = payload.model_dump(mode="json")
    else:
        body = payload
    data = json.dumps(body, ensure_ascii=False).encode("utf-8")
    handler.send_response(status_code)
    handler.send_header("Content-Type", "application/json; charset=utf-8")
    handler.send_header("Content-Length", str(len(data)))
    handler.end_headers()
    handler.wfile.write(data)
async def run_browser_task(task: str) -> RunTaskSuccessResponse | RunTaskErrorResponse:
    """Run one browser-use agent task against the local Chromium CDP endpoint.

    Returns a success model with the final result and a normalized history,
    or an error model when the agent raises.  The browser handle is closed
    in all cases (best effort).
    """
    cdp_url = os.getenv("BROWSER_CDP_URL", "http://127.0.0.1:9222")
    browser_view_url = os.getenv("BROWSER_VIEW_URL", "")
    browser = Browser(cdp_url=cdp_url)
    # temperature=0.0 keeps the agent's planning deterministic.
    llm = ChatOpenAI(
        model=os.getenv("MODEL_DEFAULT", "qwen3.5-122b"),
        api_key=os.getenv("OPENAI_API_KEY"),
        base_url=os.getenv("OPENAI_BASE_URL"),
        temperature=0.0,
    )
    # flash_mode + use_vision=False + speed prompt: favor fast, text-only runs.
    agent = Agent(task=task,
        llm=llm,
        browser=browser,
        flash_mode=True,
        use_vision=False,
        extend_system_message=SPEED_OPTIMIZATION_PROMPT,
    )
    try:
        history = await agent.run()
        return RunTaskSuccessResponse(
            result=history.final_result(),
            history=[HistoryEvent.model_validate(item) for item in _extract_history_events(history)],
            browser_view=browser_view_url,
        )
    except Exception as err:
        return RunTaskErrorResponse(error=f"Browser automation failed: {err}")
    finally:
        # close() may be sync, async, or absent depending on browser-use version.
        try:
            close_method = getattr(browser, "close", None)
            if callable(close_method):
                close_result = close_method()
                if inspect.isawaitable(close_result):
                    await close_result
        except Exception:
            pass
def _to_jsonable(value: Any) -> Any:
if value is None or isinstance(value, (str, int, float, bool)):
return value
if isinstance(value, dict):
return {str(key): _to_jsonable(val) for key, val in value.items()}
if isinstance(value, (list, tuple, set)):
return [_to_jsonable(item) for item in value]
for method_name in ("model_dump", "dict", "to_dict"):
method = getattr(value, method_name, None)
if callable(method):
try:
dumped = method()
return _to_jsonable(dumped)
except Exception:
pass
return str(value)
def _call_history_items(history: Any, attr_name: str) -> list[Any]:
method = getattr(history, attr_name, None)
if not callable(method):
return []
try:
raw: Any = method()
except Exception:
return []
if raw is None:
return []
if isinstance(raw, list):
return raw
if isinstance(raw, (str, bytes, dict)):
return [raw]
try:
return list(raw)
except TypeError:
return [raw]
except Exception:
return [raw]
def _extract_history_events(history: Any) -> list[dict[str, Any]]:
    """Flatten agent history into ordered event dicts.

    Emits thoughts, then actions, then errors (via the corresponding history
    accessors).  When no events can be extracted, a single "system" event
    wrapping the whole history object is returned instead.
    """
    events: list[dict[str, Any]] = []

    def append_many(kind: str, items: list[Any]) -> None:
        # Convert each raw item to JSON-safe data and append it as one event.
        if not items:
            return
        for item in items:
            normalized = _to_jsonable(item)
            payload = normalized if isinstance(normalized, dict) else {"value": normalized}
            content = normalized if isinstance(normalized, str) else json.dumps(normalized, ensure_ascii=False)
            events.append(
                {
                    "step": len(events) + 1,
                    "kind": kind,
                    "content": content,
                    "data": payload,
                }
            )

    append_many("thought", _call_history_items(history, "model_thoughts"))
    append_many("action", _call_history_items(history, "model_actions"))
    append_many("error", _call_history_items(history, "errors"))
    if events:
        return events
    # Fallback: no structured events available — wrap the history object itself.
    fallback = _to_jsonable(history)
    return [
        {
            "step": 1,
            "kind": "system",
            "content": fallback if isinstance(fallback, str) else json.dumps(fallback, ensure_ascii=False),
            "data": fallback if isinstance(fallback, dict) else {"value": fallback},
        }
    ]
class BrowserUseRPCHandler(BaseHTTPRequestHandler):
    """Tiny HTTP RPC surface: ``GET /health`` and ``POST /run``."""

    def do_GET(self):
        """Health check: 200 when the local Chromium DevTools endpoint answers."""
        if self.path != "/health":
            _json_response(self, 404, {"success": False, "error": "Not found"})
            return
        try:
            debug_url = os.getenv("BROWSER_HEALTH_URL", "http://127.0.0.1:9222/json/version")
            with request.urlopen(debug_url, timeout=2):
                pass
            _json_response(self, 200, {"success": True})
        except Exception as err:
            _json_response(self, 503, {"success": False, "error": f"Browser is not ready: {err}"})

    def do_POST(self):
        """Run a browser-use task synchronously; request body: {"task": "..."}."""
        if self.path != "/run":
            _json_response(self, 404, {"success": False, "error": "Not found"})
            return
        try:
            content_length = int(self.headers.get("Content-Length", "0"))
            raw = self.rfile.read(content_length)
            payload = json.loads(raw.decode("utf-8") if raw else "{}")
            request_model = RunTaskRequest.model_validate(payload)
            # Each request runs its own event loop (handler threads have none).
            result_model = asyncio.run(run_browser_task(request_model.task))
            code = 200 if result_model.success else 500
            _json_response(self, code, result_model)
        except ValidationError as err:
            _json_response(self, 400, RunTaskErrorResponse(error=f"Invalid request payload: {err.errors()}"))
        except json.JSONDecodeError:
            _json_response(self, 400, RunTaskErrorResponse(error="Invalid JSON payload"))
        except error.URLError as err:
            _json_response(self, 503, RunTaskErrorResponse(error=f"Transport error: {err}"))
        except Exception as err:
            _json_response(self, 500, RunTaskErrorResponse(error=f"Internal error: {err}"))

    def log_message(self, format_str, *args):
        # Silence the default per-request stderr logging.
        return
def main():
    """Start the blocking browser-use RPC HTTP server."""
    host = os.getenv("BROWSER_USE_RPC_HOST", "0.0.0.0")
    port = int(os.getenv("BROWSER_USE_RPC_PORT", "8787"))
    # ThreadingHTTPServer: one thread per request; /run blocks for the whole task.
    server = ThreadingHTTPServer((host, port), BrowserUseRPCHandler)  # type: ignore[arg-type]
    print(f"browser-use RPC listening on {host}:{port}")
    server.serve_forever()


if __name__ == "__main__":
    main()

View file

@ -1,201 +0,0 @@
#!/usr/bin/env bash
set -Eeuo pipefail
# X display used by Xvfb/Chromium; DISPLAY_NUM is the bare number (e.g. 99).
export DISPLAY="${DISPLAY:-:99}"
DISPLAY_NUM="${DISPLAY#:}"
XVFB_LOG="/tmp/xvfb.log"
# Port layout: VNC feeds the noVNC web UI; Chromium binds DevTools on the
# local port and socat republishes it on the public port; RPC is the runner.
VNC_PORT="${VNC_PORT:-5900}"
NOVNC_PORT="${NOVNC_PORT:-6080}"
CHROME_LOCAL_DEBUG_PORT="${CHROME_LOCAL_DEBUG_PORT:-${BROWSER_CHROME_DEBUG_PORT:-9223}}"
CHROME_PUBLIC_DEBUG_PORT="${CHROME_PUBLIC_DEBUG_PORT:-${BROWSER_CDP_PROXY_PORT:-9222}}"
BROWSER_USE_RPC_PORT="${BROWSER_USE_RPC_PORT:-8787}"
CHROME_PROFILE_DIR="${CHROME_PROFILE_DIR:-${BROWSER_DATA_DIR:-/src/browser_data}}"
BROWSER_ENABLE_UI="${BROWSER_ENABLE_UI:-true}"
# Chromium restart budget: at most MAX_RESTARTS within RESTART_WINDOW_SEC.
MAX_RESTARTS="${MAX_RESTARTS:-10}"
RESTART_WINDOW_SEC="${RESTART_WINDOW_SEC:-60}"
RESTART_BACKOFF_SEC="${RESTART_BACKOFF_SEC:-2}"
# Supervisor state: tracked background PIDs, shutdown flag, restart counters.
PIDS=()
STOPPING=0
WINDOW_START="$(date +%s)"
RESTART_COUNT=0
# Print a timestamped log line to stdout.
log() {
    printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
}
# Launch "$@" in the background, record its PID in PIDS, and log it.
start_bg() {
    "$@" &
    local pid=$!
    PIDS+=("$pid")
    log "started: $* (pid=$pid)"
}
# Poll until TCP <host>:<port> accepts connections or <timeout_sec> elapses.
# Uses bash's /dev/tcp so no extra tooling is needed.  Returns 1 on timeout.
wait_for_port() {
    local host=$1
    local port=$2
    local timeout_sec=$3
    local end_ts=$(( $(date +%s) + timeout_sec ))
    while [ "$(date +%s)" -lt "$end_ts" ]; do
        if bash -c "</dev/tcp/${host}/${port}" >/dev/null 2>&1; then
            return 0
        fi
        sleep 0.2
    done
    return 1
}
# Wait until the Xvfb display socket exists (or <timeout_sec> elapses).
wait_for_x_display() {
    local timeout_sec=$1
    local end_ts=$(( $(date +%s) + timeout_sec ))
    while [ "$(date +%s)" -lt "$end_ts" ]; do
        if [ -S "/tmp/.X11-unix/X${DISPLAY_NUM}" ] && DISPLAY="$DISPLAY" bash -c 'echo >/dev/null' >/dev/null 2>&1; then
            return 0
        fi
        sleep 0.2
    done
    return 1
}
# Signal/exit handler: stop Chromium and all tracked background processes,
# first politely (TERM), then forcibly (KILL) after a one-second grace period.
cleanup() {
    if [ "$STOPPING" -eq 1 ]; then
        return
    fi
    STOPPING=1
    log "shutdown signal received, stopping processes..."
    if [ -n "${CHROME_PID:-}" ] && kill -0 "$CHROME_PID" >/dev/null 2>&1; then
        kill "$CHROME_PID" >/dev/null 2>&1 || true
    fi
    for pid in "${PIDS[@]:-}"; do
        kill "$pid" >/dev/null 2>&1 || true
    done
    sleep 1
    # Escalate to SIGKILL for anything that survived the grace period.
    if [ -n "${CHROME_PID:-}" ] && kill -0 "$CHROME_PID" >/dev/null 2>&1; then
        kill -9 "$CHROME_PID" >/dev/null 2>&1 || true
    fi
    for pid in "${PIDS[@]:-}"; do
        if kill -0 "$pid" >/dev/null 2>&1; then
            kill -9 "$pid" >/dev/null 2>&1 || true
        fi
    done
    log "shutdown complete"
}
trap cleanup SIGTERM SIGINT EXIT
# Prepare dbus and the Chromium profile directory.
mkdir -p /var/run/dbus /var/lib/dbus "$CHROME_PROFILE_DIR"
if [ ! -f /var/lib/dbus/machine-id ]; then
    dbus-uuidgen > /var/lib/dbus/machine-id 2>/dev/null || true
fi
# Remove stale lock/socket left by previous Xvfb crashes on the same DISPLAY.
rm -f "/tmp/.X${DISPLAY_NUM}-lock" "/tmp/.X11-unix/X${DISPLAY_NUM}" || true
log "starting X stack on DISPLAY=${DISPLAY}"
Xvfb "$DISPLAY" -screen 0 1280x720x24 -ac +extension GLX +render -noreset >"$XVFB_LOG" 2>&1 &
XVFB_PID=$!
PIDS+=("$XVFB_PID")
log "started: Xvfb $DISPLAY (pid=$XVFB_PID)"
if ! wait_for_x_display 15; then
    log "fatal: Xvfb did not initialize DISPLAY=${DISPLAY}"
    if [ -f "$XVFB_LOG" ]; then
        log "xvfb log tail:"
        tail -n 40 "$XVFB_LOG" || true
    fi
    exit 1
fi
# Optional GUI stack: window manager + VNC server + noVNC websocket bridge.
if [ "$BROWSER_ENABLE_UI" != "false" ]; then
    start_bg fluxbox
    start_bg x11vnc -display "$DISPLAY" -rfbport "$VNC_PORT" -nopw -listen 0.0.0.0 -xkb -forever -shared
    start_bg websockify --web=/usr/share/novnc/ "$NOVNC_PORT" "localhost:${VNC_PORT}"
fi
# Republish Chromium's localhost-only DevTools port on all interfaces.
start_bg socat "TCP-LISTEN:${CHROME_PUBLIC_DEBUG_PORT},fork,reuseaddr" "TCP:127.0.0.1:${CHROME_LOCAL_DEBUG_PORT}"
start_bg python3 -u /src/browser_use_runner.py
if [ "$BROWSER_ENABLE_UI" != "false" ]; then
    if ! wait_for_port 127.0.0.1 "$VNC_PORT" 20; then
        log "fatal: x11vnc did not open port ${VNC_PORT}"
        exit 1
    fi
    if ! wait_for_port 127.0.0.1 "$NOVNC_PORT" 20; then
        log "fatal: websockify did not open port ${NOVNC_PORT}"
        exit 1
    fi
fi
if ! wait_for_port 127.0.0.1 "$BROWSER_USE_RPC_PORT" 20; then
    log "fatal: browser-use RPC did not open port ${BROWSER_USE_RPC_PORT}"
    exit 1
fi
log "browser infrastructure is ready (noVNC:${NOVNC_PORT}, DevTools proxy:${CHROME_PUBLIC_DEBUG_PORT}, browser-use RPC:${BROWSER_USE_RPC_PORT})"
# Supervision loop: keep Chromium running, restarting it on exit with a
# rate limit of MAX_RESTARTS per RESTART_WINDOW_SEC window.
while true; do
    # Abort if any required background helper died.
    for pid in "${PIDS[@]}"; do
        if ! kill -0 "$pid" >/dev/null 2>&1; then
            log "fatal: required background process died (pid=${pid})"
            exit 1
        fi
    done
    # Clear stale profile locks so the new Chromium instance can start.
    rm -f "${CHROME_PROFILE_DIR}/SingletonLock" "${CHROME_PROFILE_DIR}/SingletonCookie" "${CHROME_PROFILE_DIR}/SingletonSocket" 2>/dev/null || true
    log "starting Chromium (local DevTools:${CHROME_LOCAL_DEBUG_PORT})"
    chromium \
        --no-sandbox \
        --disable-dev-shm-usage \
        --ozone-platform=x11 \
        --remote-debugging-port="${CHROME_LOCAL_DEBUG_PORT}" \
        --remote-debugging-address=127.0.0.1 \
        --remote-allow-origins='*' \
        --window-size=1280,720 \
        --user-data-dir="${CHROME_PROFILE_DIR}" \
        --disable-blink-features=AutomationControlled \
        --no-first-run \
        --disable-gpu \
        --mute-audio \
        --no-default-browser-check \
        --disable-software-rasterizer \
        --disable-features=site-per-process \
        --disable-crash-reporter \
        --disable-extensions \
        --disable-sync &
    CHROME_PID=$!
    wait "$CHROME_PID" || CHROME_EXIT=$?
    CHROME_EXIT=${CHROME_EXIT:-0}
    if [ "$STOPPING" -eq 1 ]; then
        break
    fi
    # Reset the restart counter once the window has elapsed.
    now="$(date +%s)"
    if [ $(( now - WINDOW_START )) -gt "$RESTART_WINDOW_SEC" ]; then
        WINDOW_START="$now"
        RESTART_COUNT=0
    fi
    RESTART_COUNT=$((RESTART_COUNT + 1))
    log "Chromium exited with code=${CHROME_EXIT}; restart ${RESTART_COUNT}/${MAX_RESTARTS} in current window"
    if [ "$RESTART_COUNT" -ge "$MAX_RESTARTS" ]; then
        log "fatal: too many Chromium restarts in ${RESTART_WINDOW_SEC}s"
        exit 1
    fi
    sleep "$RESTART_BACKOFF_SEC"
    unset CHROME_EXIT
    unset CHROME_PID
done

View file

@ -1,46 +0,0 @@
events {}
http {
    # Docker's embedded DNS; needed to resolve per-owner container names in
    # the dynamic proxy_pass below.
    resolver 127.0.0.11 ipv6=off;
    # Forward websocket upgrade headers correctly (noVNC uses websockets).
    map $http_upgrade $connection_upgrade {
        default upgrade;
        '' close;
    }
    server {
        listen 8080;
        server_name _;
        # Liveness probe / landing page.
        location = / {
            add_header Content-Type text/plain;
            return 200 "Browser view proxy is running.\n";
        }
        # Default: proxy everything to the shared browser's noVNC.
        location / {
            proxy_http_version 1.1;
            proxy_set_header Host $host;
            proxy_set_header Upgrade $http_upgrade;
            proxy_set_header Connection $connection_upgrade;
            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
            proxy_set_header X-Forwarded-Proto $scheme;
            proxy_buffering off;
            proxy_pass http://browser:6080;
        }
        # /view/<owner-hash> -> redirect into that owner's noVNC client page.
        location ~ "^/view/(?<owner>[a-f0-9]{16})$" {
            return 302 /view/$owner/vnc.html?path=view/$owner/websockify;
        }
        # Owner-scoped noVNC assets/websocket go to the per-owner container.
        location ~ "^/view/(?<owner>[a-f0-9]{16})/(?<rest>.*)$" {
            proxy_http_version 1.1;
            proxy_set_header Host $host;
            proxy_set_header Upgrade $http_upgrade;
            proxy_set_header Connection $connection_upgrade;
            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
            proxy_set_header X-Forwarded-Proto $scheme;
            proxy_buffering off;
            proxy_pass http://browser-use-browser-$owner:6080/$rest$is_args$args;
        }
    }
}

View file

@ -1,14 +0,0 @@
# Overlay that additionally attaches the public-facing services to the
# external "lambdalab_frontend" network (e.g. for a shared reverse proxy).
services:
  browser-api:
    networks:
      - browser-net
      - lambdalab_frontend
  browser-view-proxy:
    networks:
      - browser-net
      - lambdalab_frontend
networks:
  # Must already exist; created outside this compose project.
  lambdalab_frontend:
    external: true

View file

@ -1,104 +1,49 @@
services:
browser:
build:
context: ./browser_env
dockerfile: Dockerfile.browser
image: browser-use-browser-runtime:latest
container_name: browser-use-browser
chromium:
image: ghcr.io/browserless/chromium:latest
container_name: hermes-chromium
restart: unless-stopped
environment:
- MODEL_DEFAULT=${MODEL_DEFAULT:-qwen3.5-122b}
- OPENAI_API_KEY=${OPENAI_API_KEY}
- OPENAI_BASE_URL=${OPENAI_BASE_URL}
- BROWSER_USE_RPC_HOST=0.0.0.0
- BROWSER_USE_RPC_PORT=8787
- TOKEN=hermes-local
- TIMEOUT=120000
- CONCURRENT=5
shm_size: 1gb
ports:
- "${BROWSER_NOVNC_PUBLISH:-6080:6080}"
- "${BROWSER_CDP_PUBLISH:-9222:9222}"
networks:
browser-net:
aliases:
- browser
shm_size: '2gb'
volumes:
- browser_profiles:/src/browser_data
- ./workspace:/app/workspace:rw
restart: always
- "3000:3000"
chromium-gui:
build:
context: ./docker/chromium-gui
container_name: hermes-chromium-gui
restart: unless-stopped
shm_size: 1gb
ports:
- "127.0.0.1:6080:6080"
- "127.0.0.1:5900:5900"
- "127.0.0.1:9223:9223"
healthcheck:
test: [ "CMD-SHELL", "curl -fsS http://127.0.0.1:9222/json/version >/dev/null && curl -fsS http://127.0.0.1:8787/health >/dev/null || exit 1" ]
test: [ "CMD", "curl", "-fsS", "http://localhost:9223/json/version" ]
interval: 10s
timeout: 3s
retries: 12
start_period: 20s
timeout: 5s
retries: 6
browser-api:
build:
context: ./api
dockerfile: Dockerfile
container_name: browser-use-api
hermes-agent:
build: .
container_name: hermes-agent
restart: unless-stopped
stdin_open: true
tty: true
depends_on:
- chromium
- chromium-gui
env_file:
- ./workspace/.env
- ./hermes_data/.env
environment:
- BROWSER_USE_RPC_URL=http://browser:8787/run
- BROWSER_API_HOST=0.0.0.0
- BROWSER_API_PORT=8088
- BROWSER_API_MAX_CONCURRENCY=2
- BROWSER_VIEW_BASE_URL=${BROWSER_VIEW_BASE_URL:-http://localhost:6081}
- BROWSER_USE_ISOLATION_MODE=${BROWSER_USE_ISOLATION_MODE:-docker-per-principal}
- BROWSER_RUNTIME_IMAGE=${BROWSER_RUNTIME_IMAGE:-browser-use-browser-runtime:latest}
- BROWSER_RUNTIME_NETWORK=${BROWSER_RUNTIME_NETWORK:-browser-net}
- BROWSER_RUNTIME_TTL_SECONDS=${BROWSER_RUNTIME_TTL_SECONDS:-900}
- BROWSER_RUNTIME_START_TIMEOUT=${BROWSER_RUNTIME_START_TIMEOUT:-45}
- BROWSER_RUNTIME_ENABLE_UI=${BROWSER_RUNTIME_ENABLE_UI:-true}
- MODEL_DEFAULT=${MODEL_DEFAULT:-qwen3.5-122b}
- OPENAI_API_KEY=${OPENAI_API_KEY}
- OPENAI_BASE_URL=${OPENAI_BASE_URL}
depends_on:
browser:
condition: service_healthy
ports:
- "${BROWSER_API_PUBLISH:-8088:8088}"
- BROWSER_USE_CDP_URL=${BROWSER_USE_CDP_URL:-http://172.25.0.3:9223}
- BROWSER_USE_PYTHON=/opt/browser-use-venv/bin/python
volumes:
- /var/run/docker.sock:/var/run/docker.sock
healthcheck:
test:
[
"CMD-SHELL",
"python -c \"import urllib.request; urllib.request.urlopen('http://127.0.0.1:8088/health', timeout=2).read()\" >/dev/null 2>&1 || exit 1",
]
interval: 120s
timeout: 3s
retries: 12
start_period: 10s
restart: always
networks:
- browser-net
browser-view-proxy:
image: nginx:alpine
container_name: browser-use-view-proxy
volumes:
- ./browser_env/nginx.browser-view.conf:/etc/nginx/nginx.conf:ro
depends_on:
browser:
condition: service_healthy
ports:
- "${BROWSER_VIEW_PROXY_PUBLISH:-6081:8080}"
restart: always
networks:
- browser-net
tunnel:
image: cloudflare/cloudflared:latest
profiles:
- remote
container_name: browser-use-tunnel
restart: always
command: tunnel --protocol http2 --url http://browser-view-proxy:8080 --no-tls-verify
networks:
- browser-net
volumes:
browser_profiles:
networks:
browser-net:
name: browser-net
driver: bridge
- ./workspace:/workspace
- ./hermes_data:/root/.hermes
working_dir: /workspace
command: [ "hermes", "gateway" ]

View file

@ -0,0 +1,21 @@
# Chromium GUI runtime: Xvfb virtual display + fluxbox WM, exposed through
# x11vnc (raw VNC) and noVNC/websockify (web UI). The DevTools (CDP) port is
# re-published by socat inside start-gui-chromium.sh.
FROM debian:bookworm-slim

# Install display stack and Chromium in one layer; --no-install-recommends and
# the apt list cleanup keep the image small. Packages sorted alphabetically.
RUN apt-get update && apt-get install -y --no-install-recommends \
    ca-certificates \
    chromium \
    curl \
    fluxbox \
    novnc \
    socat \
    websockify \
    x11vnc \
    xvfb \
    && rm -rf /var/lib/apt/lists/*

COPY start.sh /usr/local/bin/start-gui-chromium.sh
RUN chmod +x /usr/local/bin/start-gui-chromium.sh

# Documentation only (EXPOSE does not publish ports):
#   6080 = noVNC web UI, 5900 = raw VNC, 9223 = Chromium CDP endpoint.
# NOTE: the start script and README serve CDP on 9223 (socat TCP-LISTEN:9223),
# not 9222, so 9223 is the correct port to document here.
EXPOSE 6080 5900 9223

CMD ["/usr/local/bin/start-gui-chromium.sh"]

View file

@ -0,0 +1,26 @@
# Chromium GUI Service
This container provides a full Chromium GUI with three interfaces:
- noVNC web UI: `http://localhost:6080/vnc.html`
- VNC: `localhost:5900`
- CDP endpoint: `http://localhost:9223/json/version`
## Run
```bash
docker compose --profile gui up -d chromium-gui
```
## Use with browser-use
Pass the GUI CDP endpoint as HTTP URL (the runner resolves it to a websocket automatically):
```bash
docker compose exec -T hermes-agent python-browser-use \
/root/.hermes/skills/autonomous-ai-agents/browser-use/scripts/run_browser_use.py \
--cdp-url http://chromium-gui:9223 \
--task "Open example.com and return page title" \
--max-steps 5
```

View file

@ -0,0 +1,33 @@
#!/usr/bin/env bash
# Container entrypoint: bring up a virtual X display, a window manager, VNC +
# noVNC access, a socat proxy for the Chromium DevTools (CDP) port, and then
# run Chromium itself as the foreground (PID 1) process via exec.
set -euo pipefail
export DISPLAY=:99
# Virtual screen geometry for Xvfb; overridable from the environment.
SCREEN_RESOLUTION="${SCREEN_RESOLUTION:-1920x1080x24}"
# Clean stale X lock/socket from previous crashed runs in the same container.
rm -f /tmp/.X99-lock
rm -f /tmp/.X11-unix/X99
# Background services: virtual framebuffer, window manager, VNC server on
# 5900, and websockify bridging VNC to the noVNC web UI on 6080.
Xvfb :99 -screen 0 "$SCREEN_RESOLUTION" -ac +extension RANDR &
fluxbox >/tmp/fluxbox.log 2>&1 &
x11vnc -display :99 -forever -shared -rfbport 5900 -nopw >/tmp/x11vnc.log 2>&1 &
websockify --web=/usr/share/novnc/ 6080 localhost:5900 >/tmp/novnc.log 2>&1 &
# Proxy the CDP port to all interfaces using socat.
# Chromium is observed to listen on ::1:9223 (IPv6 localhost).
# NOTE(review): chromium below is launched with
# --remote-debugging-address=127.0.0.1, which suggests an IPv4 bind; confirm
# it really binds [::1] — otherwise the target should be TCP:127.0.0.1:9223.
socat TCP-LISTEN:9223,reuseaddr,fork TCP6:[::1]:9223 >/tmp/socat.log 2>&1 &
# exec so Chromium replaces the shell and receives signals from docker stop.
exec chromium \
--no-sandbox \
--disable-dev-shm-usage \
--disable-gpu \
--disable-setuid-sandbox \
--remote-debugging-address=127.0.0.1 \
--remote-debugging-port=9223 \
--user-data-dir=/tmp/chromium-profile \
--window-size=1920,1080 \
--no-first-run \
--no-default-browser-check \
about:blank >/tmp/chromium.log 2>&1

View file

@ -1,133 +0,0 @@
# BrowserUse VPS Deployment
This project deploys to `BrowserUse-vps@lambda.coredump.ru` with a Gitea/Forgejo Actions runner installed on the VPS.
The server already has a root-owned `/opt/lambdalab` stack with Caddy on ports `80/443`. Keep this browser service as a separate app under the deploy user home directory, then attach the public-facing containers to the existing `lambdalab_frontend` Docker network through `docker-compose.vps.yml`.
## SSH Access
Add the public SSH key to the VPS user:
```sh
ssh BrowserUse-vps@lambda.coredump.ru
mkdir -p ~/.ssh
chmod 700 ~/.ssh
printf '%s\n' '<ssh-ed25519 public key>' >> ~/.ssh/authorized_keys
chmod 600 ~/.ssh/authorized_keys
```
The fingerprint `SHA256:/XC5ifPX8j+uRyp0Yw2zAl5nteWc3YcHeVHfCG+rhP4` is not enough by itself. `authorized_keys` needs the full public key line that starts with `ssh-ed25519`.
## Initial Server Checkout
Run once on the VPS:
```sh
mkdir -p ~/apps
cd ~/apps
git clone -b feature/api-for-subagent https://git.lambda.coredump.ru/APEX/BrowserUse_and_ComputerUse_skills.git
cd BrowserUse_and_ComputerUse_skills
```
Create a server-local `.env` file in the checkout. It is intentionally not committed:
```sh
OPENAI_API_KEY=...
OPENAI_BASE_URL=...
MODEL_DEFAULT=qwen3.5-122b
BROWSER_VIEW_BASE_URL=https://browser-view.lambda.coredump.ru
BROWSER_API_PUBLISH=127.0.0.1:8088:8088
BROWSER_VIEW_PROXY_PUBLISH=127.0.0.1:6081:8080
BROWSER_NOVNC_PUBLISH=127.0.0.1:6080:6080
BROWSER_CDP_PUBLISH=127.0.0.1:9222:9222
```
Then run the first deploy manually:
```sh
bash scripts/deploy_vps.sh
curl -fsS http://127.0.0.1:8088/health
```
The deploy script uses both Compose files by default:
```sh
docker-compose.yml:docker-compose.vps.yml
```
`docker-compose.vps.yml` connects `browser-api` and `browser-view-proxy` to the existing external `lambdalab_frontend` network so Caddy can reach them by Docker DNS.
## Domain Binding
The active Caddy config is root-owned at:
```sh
/opt/lambdalab/caddy/Caddyfile
```
Add these vhosts to that file from an admin/root account:
```caddyfile
browser-api.lambda.coredump.ru {
reverse_proxy browser-use-api:8088
}
browser-view.lambda.coredump.ru {
reverse_proxy browser-use-view-proxy:8080
}
```
Then reload the existing Caddy container from `/opt/lambdalab`:
```sh
cd /opt/lambdalab
docker compose exec caddy caddy reload --config /etc/caddy/Caddyfile
```
DNS must point both subdomains to the VPS public IP `155.212.185.120`. At inspection time, `lambda.coredump.ru` resolved to that IP, while `browser-api.lambda.coredump.ru` and `browser-view.lambda.coredump.ru` did not resolve yet.
## Gitea/Forgejo Runner
Install `act_runner` as the `BrowserUse-vps` user and register it with the repository, organization, or instance runner token:
```sh
mkdir -p ~/act_runner
cd ~/act_runner
./act_runner generate-config > config.yaml
./act_runner --config config.yaml register \
--no-interactive \
--instance https://git.lambda.coredump.ru \
--token '<runner-registration-token>' \
--name BrowserUse-vps \
--labels deploy-vps:host
```
Start it under the same user:
```sh
cd ~/act_runner
nohup ./act_runner daemon --config config.yaml > act_runner.log 2>&1 &
```
Because this account has `sudo: no`, a system-wide service cannot be installed from this user. If an admin enables a user-level systemd service for this account, run the same daemon command from that service instead of `nohup`.
## CI/CD Behavior
The workflow lives at `.gitea/workflows/deploy.yml`.
It runs on:
- push to `feature/api-for-subagent`
- manual `workflow_dispatch`
The job expects a runner label named `deploy-vps`, registered as `deploy-vps:host`. It enters:
```sh
/home/BrowserUse-vps/apps/BrowserUse_and_ComputerUse_skills
```
Then it fetches `origin/feature/api-for-subagent`, resets the tracked checkout to that commit, runs Docker Compose, and verifies:
```sh
curl -fsS http://127.0.0.1:8088/health
```

View file

@ -0,0 +1,27 @@
---
## 📚 Файл: references/common_patterns.md
```markdown
# Common Browser Automation Patterns
## Паттерн 1: Авторизация
### Сценарий
Пользователь хочет автоматизировать вход в систему.
### Реализация
```python
{
"action": "sequence",
"steps": [
{"action": "goto", "url": "https://example.com/login"},
{"action": "wait", "selector": "form", "timeout": 5000},
{"action": "fill", "selector": "input[name='email']", "value": "user@example.com"},
{"action": "fill", "selector": "input[name='password']", "value": "password123"},
{"action": "click", "selector": "button[type='submit']"},
{"action": "wait", "selector": ".dashboard", "timeout": 10000},
{"action": "screenshot", "path": "/tmp/after_login.png"}
]
}

52
references/selectors.md Normal file
View file

@ -0,0 +1,52 @@
# CSS Селекторы — Полная шпаргалка
## Быстрый справочник
### Базовые селекторы
| Селектор | Пример | Описание |
|----------|--------|----------|
| `*` | `*` | Все элементы |
| `element` | `div` | Элемент по тегу |
| `#id` | `#main` | Элемент по ID |
| `.class` | `.button` | Элемент по классу |
| `[attr]` | `[disabled]` | Элемент с атрибутом |
| `[attr=value]` | `[type="submit"]` | Точное совпадение атрибута |
| `[attr^=value]` | `[href^="https"]` | Атрибут начинается с |
| `[attr$=value]` | `[href$=".pdf"]` | Атрибут заканчивается на |
| `[attr*=value]` | `[name*="user"]` | Атрибут содержит |
### Комбинаторы
| Селектор | Пример | Описание |
|----------|--------|----------|
| `A B` | `div p` | Потомок (любой уровень) |
| `A > B` | `div > p` | Прямой потомок |
| `A + B` | `h1 + p` | Соседний элемент |
| `A ~ B` | `h1 ~ p` | Все следующие соседние |
### Псевдоклассы
| Псевдокласс | Пример | Описание |
|-------------|--------|----------|
| `:first-child` | `li:first-child` | Первый дочерний |
| `:last-child` | `li:last-child` | Последний дочерний |
| `:nth-child(n)` | `tr:nth-child(2)` | n-й дочерний |
| `:nth-of-type(n)` | `p:nth-of-type(2)` | n-й элемент типа |
| `:not(selector)` | `div:not(.hidden)` | Исключение |
| `:has(selector)` | `div:has(p)` | Содержит дочерний элемент |
| `:contains(text)` | `a:contains("Click")` | Содержит текст (нестандартный селектор: работает в jQuery и текстовых движках типа Playwright, но не в нативном CSS/`querySelector`) |
## XPath — Альтернатива
### Базовые XPath
```xpath
//element # Все элементы
//div[@id='main'] # По атрибуту
//div[contains(@class, 'btn')] # Частичное совпадение класса
//button[text()='Submit'] # По тексту
//a[contains(text(), 'Learn')] # Частичное совпадение текста
//div[@id='main']//p # Вложенность
//div[1] # Первый div
//div[last()] # Последний div

View file

@ -0,0 +1,338 @@
## 🐍 Файл: scripts/browser_automation.py
#!/usr/bin/env python3
"""
Browser automation core module for Hermes Agent Skill
Автоматизация браузера с использованием Playwright
"""
import asyncio
import json
import sys
import os
from typing import Dict, Any, Optional, List
from playwright.async_api import async_playwright, Page, Browser, Playwright
class BrowserAutomation:
    """Async wrapper around Playwright Chromium for scripted browser tasks.

    Every action method returns a JSON-serializable dict that always contains
    a boolean ``success`` key; on failure the dict carries an ``error`` string
    instead of raising, so results can be fed straight to ``json.dumps``.

    Intended usage is as an async context manager::

        async with BrowserAutomation(headless=True) as bot:
            result = await bot.execute_task({"action": "goto", "url": "..."})
    """

    def __init__(self, headless: bool = True, timeout: int = 30000):
        """
        Args:
            headless: run Chromium without a visible window.
            timeout: default per-action timeout in milliseconds.
        """
        self.headless = headless
        self.timeout = timeout
        # Populated by start(); None until the browser is actually running.
        self.playwright: Optional[Playwright] = None
        self.browser: Optional[Browser] = None
        self.page: Optional[Page] = None

    async def __aenter__(self):
        await self.start()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        await self.close()

    async def start(self):
        """Launch Chromium and open a fresh page with the default timeout."""
        self.playwright = await async_playwright().start()
        # Flags tuned for container environments (no sandbox, tiny /dev/shm).
        self.browser = await self.playwright.chromium.launch(
            headless=self.headless,
            args=[
                '--no-sandbox',
                '--disable-setuid-sandbox',
                '--disable-dev-shm-usage',
                '--disable-accelerated-2d-canvas',
                '--disable-gpu',
            ],
        )
        self.page = await self.browser.new_page()
        self.page.set_default_timeout(self.timeout)

    async def close(self):
        """Shut down the browser and the Playwright driver (safe to re-call)."""
        if self.browser:
            await self.browser.close()
        if self.playwright:
            await self.playwright.stop()

    async def goto(self, url: str) -> Dict[str, Any]:
        """Navigate to ``url``, waiting until the network is idle."""
        try:
            response = await self.page.goto(url, wait_until='networkidle')
            # response can be None (e.g. same-document navigation); keep None.
            status = response.status if response else None
            return {
                "success": True,
                "url": self.page.url,
                "status": status
            }
        except Exception as e:
            return {
                "success": False,
                "error": f"Failed to navigate to {url}: {str(e)}"
            }

    async def click(self, selector: str) -> Dict[str, Any]:
        """Wait for ``selector`` to appear, then click it."""
        try:
            await self.page.wait_for_selector(selector, timeout=self.timeout)
            await self.page.click(selector)
            return {
                "success": True,
                "selector": selector,
                "message": f"Clicked on {selector}"
            }
        except Exception as e:
            return {
                "success": False,
                "error": f"Failed to click on {selector}: {str(e)}"
            }

    async def fill(self, selector: str, value: str) -> Dict[str, Any]:
        """Wait for an input matching ``selector`` and fill it with ``value``."""
        try:
            await self.page.wait_for_selector(selector, timeout=self.timeout)
            await self.page.fill(selector, value)
            return {
                "success": True,
                "selector": selector,
                "value": value,
                "message": f"Filled {selector} with '{value}'"
            }
        except Exception as e:
            return {
                "success": False,
                "error": f"Failed to fill {selector}: {str(e)}"
            }

    async def screenshot(self, path: str = "/tmp/screenshot.png") -> Dict[str, Any]:
        """Save a full-page screenshot to ``path``."""
        try:
            # Ensure the target directory exists. A bare filename has an empty
            # dirname, and os.makedirs("") raises FileNotFoundError, so guard.
            directory = os.path.dirname(path)
            if directory:
                os.makedirs(directory, exist_ok=True)
            await self.page.screenshot(path=path, full_page=True)
            return {
                "success": True,
                "path": path,
                "message": f"Screenshot saved to {path}"
            }
        except Exception as e:
            return {
                "success": False,
                "error": f"Failed to take screenshot: {str(e)}"
            }

    async def get_text(self, selector: str) -> Dict[str, Any]:
        """Return the (stripped) text content of the first matching element."""
        try:
            await self.page.wait_for_selector(selector, timeout=self.timeout)
            text = await self.page.text_content(selector)
            return {
                "success": True,
                "text": text.strip() if text else "",
                "selector": selector
            }
        except Exception as e:
            return {
                "success": False,
                "error": f"Failed to get text from {selector}: {str(e)}"
            }

    async def get_text_all(self, selector: str) -> Dict[str, Any]:
        """Return the stripped text of every element matching ``selector``."""
        try:
            await self.page.wait_for_selector(selector, timeout=self.timeout)
            elements = await self.page.query_selector_all(selector)
            texts = []
            for el in elements:
                text = await el.text_content()
                if text:
                    # Elements with empty/None text are skipped entirely.
                    texts.append(text.strip())
            return {
                "success": True,
                "texts": texts,
                "count": len(texts),
                "selector": selector
            }
        except Exception as e:
            return {
                "success": False,
                "error": f"Failed to get texts from {selector}: {str(e)}"
            }

    async def evaluate(self, js_code: str) -> Dict[str, Any]:
        """Run ``js_code`` in the page context and return its JSON result."""
        try:
            result = await self.page.evaluate(js_code)
            return {
                "success": True,
                "result": result,
                "code": js_code[:100]  # truncated echo of the source for logs
            }
        except Exception as e:
            return {
                "success": False,
                "error": f"Failed to evaluate JavaScript: {str(e)}"
            }

    async def select(self, selector: str, value: str) -> Dict[str, Any]:
        """Choose option ``value`` in the <select> matching ``selector``."""
        try:
            await self.page.wait_for_selector(selector, timeout=self.timeout)
            await self.page.select_option(selector, value)
            return {
                "success": True,
                "selector": selector,
                "value": value,
                "message": f"Selected '{value}' from {selector}"
            }
        except Exception as e:
            return {
                "success": False,
                "error": f"Failed to select from {selector}: {str(e)}"
            }

    async def wait_for_selector(self, selector: str, timeout: Optional[int] = None) -> Dict[str, Any]:
        """Wait for ``selector`` to appear, using ``timeout`` ms (or default)."""
        timeout_ms = timeout or self.timeout
        try:
            await self.page.wait_for_selector(selector, timeout=timeout_ms)
            return {
                "success": True,
                "selector": selector,
                "timeout": timeout_ms,
                "message": f"Element {selector} appeared"
            }
        except Exception as e:
            return {
                "success": False,
                "error": f"Timeout waiting for {selector}: {str(e)}"
            }

    async def get_html(self) -> Dict[str, Any]:
        """Return the full HTML source of the current page and its length."""
        try:
            html = await self.page.content()
            return {
                "success": True,
                "html": html,
                "size": len(html)
            }
        except Exception as e:
            return {
                "success": False,
                "error": f"Failed to get HTML: {str(e)}"
            }

    async def get_title(self) -> Dict[str, Any]:
        """Return the current page title."""
        try:
            title = await self.page.title()
            return {
                "success": True,
                "title": title
            }
        except Exception as e:
            return {
                "success": False,
                "error": f"Failed to get title: {str(e)}"
            }

    async def get_url(self) -> Dict[str, Any]:
        """Return the current page URL."""
        try:
            url = self.page.url
            return {
                "success": True,
                "url": url
            }
        except Exception as e:
            return {
                "success": False,
                "error": f"Failed to get URL: {str(e)}"
            }

    async def execute_sequence(self, steps: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Run ``steps`` in order, aborting on the first failed step.

        Returns the per-step results collected so far in either case.
        """
        results = []
        for i, step in enumerate(steps):
            result = await self.execute_task(step)
            results.append({
                "step": i + 1,
                "action": step.get("action"),
                "result": result
            })
            # Stop the whole sequence as soon as one step reports failure.
            if not result.get("success"):
                return {
                    "success": False,
                    "error": f"Sequence failed at step {i + 1}",
                    "results": results
                }
        return {
            "success": True,
            "results": results,
            "total_steps": len(steps)
        }

    async def execute_task(self, task: Dict[str, Any]) -> Dict[str, Any]:
        """Dispatch one task dict ({"action": ..., ...}) to its handler.

        Unknown (or missing) actions return an error dict listing the
        supported action names instead of raising.
        """
        action = task.get("action")
        # Lazy lambdas: only the selected action's handler coroutine is built.
        actions_map = {
            "goto": lambda: self.goto(task.get("url")),
            "click": lambda: self.click(task.get("selector")),
            "fill": lambda: self.fill(task.get("selector"), task.get("value")),
            "screenshot": lambda: self.screenshot(task.get("path", "/tmp/screenshot.png")),
            "get_text": lambda: self.get_text(task.get("selector")),
            "get_text_all": lambda: self.get_text_all(task.get("selector")),
            "evaluate": lambda: self.evaluate(task.get("code")),
            "select": lambda: self.select(task.get("selector"), task.get("value")),
            "wait": lambda: self.wait_for_selector(task.get("selector"), task.get("timeout")),
            "get_html": lambda: self.get_html(),
            "get_title": lambda: self.get_title(),
            "get_url": lambda: self.get_url(),
            "sequence": lambda: self.execute_sequence(task.get("steps", []))
        }
        if action not in actions_map:
            return {
                "success": False,
                "error": f"Unknown action: {action}. Available: {', '.join(actions_map.keys())}"
            }
        return await actions_map[action]()
async def run_from_args():
    """CLI entry point: run the task given as the first argv value.

    The argument is parsed as a JSON task dict; a non-JSON argument is
    treated as a URL for a ``goto`` action. The result dict is printed as
    JSON. The process exits with status 1 when no task is given or the task
    fails, so shell callers can detect errors from the exit code.
    """
    if len(sys.argv) < 2:
        print(json.dumps({
            "success": False,
            "error": "No task provided. Usage: python3 browser_automation.py '<JSON_TASK>'"
        }))
        # Non-zero exit so automation wrappers notice the usage error.
        sys.exit(1)
    try:
        task = json.loads(sys.argv[1])
    except json.JSONDecodeError:
        # Not valid JSON: fall back to treating the argument as a plain URL.
        task = {"action": "goto", "url": sys.argv[1]}
    # Headless by default; set BROWSER_HEADLESS=false to watch the browser.
    headless = os.environ.get("BROWSER_HEADLESS", "true").lower() == "true"
    async with BrowserAutomation(headless=headless) as browser:
        result = await browser.execute_task(task)
        print(json.dumps(result, ensure_ascii=False, indent=2))
    if not result.get("success"):
        sys.exit(1)


if __name__ == "__main__":
    asyncio.run(run_from_args())

View file

@ -1,65 +0,0 @@
#!/usr/bin/env bash
# Deploy script for the BrowserUse VPS: fast-forwards the checkout to the
# deploy branch, rebuilds and restarts the Docker Compose stack, then polls
# the API health endpoint. Exits non-zero on any failure (set -E for traps).
set -Eeuo pipefail
# Overridable via environment; defaults match the VPS deployment docs.
DEPLOY_BRANCH="${DEPLOY_BRANCH:-feature/api-for-subagent}"
HEALTH_URL="${HEALTH_URL:-http://127.0.0.1:8088/health}"
# Colon-separated list of compose files, applied in order with -f.
COMPOSE_FILES="${COMPOSE_FILES:-docker-compose.yml:docker-compose.vps.yml}"
# Log a progress message with the [deploy] prefix.
log() {
printf '[deploy] %s\n' "$*"
}
# Log a fatal message to stderr and abort the deploy.
fail() {
printf '[deploy] fatal: %s\n' "$*" >&2
exit 1
}
# Preflight: required tooling and a valid checkout with runtime config.
command -v git >/dev/null 2>&1 || fail "git is not installed"
command -v docker >/dev/null 2>&1 || fail "docker is not installed"
command -v curl >/dev/null 2>&1 || fail "curl is not installed"
docker compose version >/dev/null 2>&1 || fail "docker compose plugin is not available"
[ -d .git ] || fail "current directory is not a git checkout"
[ -f docker-compose.yml ] || fail "docker-compose.yml not found in current directory"
[ -f .env ] || fail ".env is missing; create it on the VPS with OPENAI_API_KEY and related runtime settings"
# Build the -f argument list, verifying each compose file exists.
compose_args=()
IFS=':' read -r -a compose_files <<< "$COMPOSE_FILES"
for compose_file in "${compose_files[@]}"; do
if [ -f "$compose_file" ]; then
compose_args+=("-f" "$compose_file")
else
fail "compose file not found: ${compose_file}"
fi
done
# Hard-reset the checkout to the remote deploy branch (discards local edits).
log "fetching origin/${DEPLOY_BRANCH}"
git fetch --prune origin "+refs/heads/${DEPLOY_BRANCH}:refs/remotes/origin/${DEPLOY_BRANCH}"
log "checking out ${DEPLOY_BRANCH}"
git checkout -B "$DEPLOY_BRANCH" "origin/$DEPLOY_BRANCH"
git reset --hard "origin/$DEPLOY_BRANCH"
# Rebuild and restart the stack; --remove-orphans prunes renamed services.
log "building Docker Compose services"
docker compose "${compose_args[@]}" build
log "starting Docker Compose stack"
docker compose "${compose_args[@]}" up -d --remove-orphans
log "current service state"
docker compose "${compose_args[@]}" ps
# Poll the health endpoint for up to ~60s (30 attempts x 2s).
log "waiting for API health at ${HEALTH_URL}"
for attempt in {1..30}; do
if curl -fsS "$HEALTH_URL" >/dev/null; then
log "API is healthy"
exit 0
fi
log "health check failed, retry ${attempt}/30"
sleep 2
done
# Health never came up: dump recent API logs for diagnosis, then fail.
log "API did not become healthy; browser-api logs follow"
docker compose "${compose_args[@]}" logs --tail=120 browser-api || true
fail "health check failed: ${HEALTH_URL}"

2
scripts/requirements.txt Normal file
View file

@ -0,0 +1,2 @@
playwright>=1.40.0,<2.0.0
browser-use>=0.1.0,<1.0.0

72
scripts/setup.sh Normal file
View file

@ -0,0 +1,72 @@
#!/bin/bash
# Setup script for BrowserUse skill
# Installs Python dependencies and Playwright browsers, then verifies the
# installation. Aborts on the first failed command (set -e).
set -e
echo "🔧 Installing BrowserUse skill dependencies..."
echo "================================================"
# ANSI color codes for status output.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color
# Check that Python 3 is available.
echo -n "Checking Python... "
if command -v python3 &> /dev/null; then
PYTHON_VERSION=$(python3 --version)
echo -e "${GREEN}OK${NC} ($PYTHON_VERSION)"
else
echo -e "${RED}FAILED${NC}"
echo "Python 3 is required but not installed."
exit 1
fi
# Check that pip3 is available.
echo -n "Checking pip... "
if command -v pip3 &> /dev/null; then
echo -e "${GREEN}OK${NC}"
else
echo -e "${RED}FAILED${NC}"
echo "pip3 is required but not installed."
exit 1
fi
# Install the Python packages listed next to this script.
echo ""
echo "📦 Installing Python packages..."
pip3 install --upgrade pip
pip3 install -r "$(dirname "$0")/requirements.txt"
# Install the Playwright Chromium browser binaries.
echo ""
echo "🌐 Installing Playwright browsers..."
python3 -m playwright install chromium
python3 -m playwright install-deps # System-level dependencies for Linux
# Verify that the playwright module imports cleanly.
echo ""
echo -n "✅ Verifying installation... "
if python3 -c "import playwright" 2>/dev/null; then
echo -e "${GREEN}OK${NC}"
else
echo -e "${RED}FAILED${NC}"
echo "Playwright installation verification failed."
exit 1
fi
# Create the shared screenshot output directory.
mkdir -p /tmp/browser-use-screenshots
echo "📁 Created screenshot directory: /tmp/browser-use-screenshots"
echo ""
echo "================================================"
echo -e "${GREEN}✅ BrowserUse skill successfully installed!${NC}"
echo ""
echo "📖 Quick test:"
echo " python3 $(dirname "$0")/browser_automation.py '{\"action\":\"goto\",\"url\":\"https://example.com\"}'"
echo ""
echo "📚 For more examples, see SKILL.md"
echo "================================================"