From 2f74daa8a6c8f38b1a5325488d558f06305d4f53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9A=D0=BE=D0=B1=D1=8B=D0=BB=D0=BA=D0=B5=D0=B2=D0=B8?= =?UTF-8?q?=D1=87=20=D0=A4=D1=91=D0=B4=D0=BE=D1=80?= Date: Thu, 26 Mar 2026 10:50:19 +0300 Subject: [PATCH 1/5] add dockerfile and docker-compose file --- .env.example | 0 .gitignore | 11 +++++++++++ Dockerfile | 29 +++++++++++++++++++++++++++++ docker-compose.yml | 31 +++++++++++++++++++++++++++++++ 4 files changed, 71 insertions(+) create mode 100644 .env.example create mode 100644 Dockerfile create mode 100644 docker-compose.yml diff --git a/.env.example b/.env.example new file mode 100644 index 00000000..e69de29b diff --git a/.gitignore b/.gitignore index bd71037d..b64c303f 100644 --- a/.gitignore +++ b/.gitignore @@ -51,5 +51,16 @@ $RECYCLE.BIN/ # Windows shortcuts *.lnk +.env + +hermes_data/* +workspace/* + +SOLUTION_SUMMARY.md +BROWSER_USE_QUICKSTART.md +BROWSER_USE_SETUP.md +START_HERE.md + +*/config.yaml *.idea \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..6fdb458a --- /dev/null +++ b/Dockerfile @@ -0,0 +1,29 @@ +FROM python:3.11 + +RUN apt-get update && apt-get install -y \ + git \ + curl \ + build-essential \ + python3-dev \ + libffi-dev \ + && rm -rf /var/lib/apt/lists/* + +RUN git clone https://github.com/NousResearch/hermes-agent.git /opt/hermes-agent + +WORKDIR /opt/hermes-agent +RUN pip install --no-cache-dir -e . +RUN pip install --no-cache-dir python-telegram-bot + +# Isolated runtime for browser-use to avoid dependency conflicts with hermes-agent. 
+RUN python -m venv /opt/browser-use-venv \ + && /opt/browser-use-venv/bin/pip install --no-cache-dir --upgrade pip \ + && /opt/browser-use-venv/bin/pip install --no-cache-dir browser-use + +RUN ln -s /opt/hermes-agent/venv/bin/hermes /usr/local/bin/hermes 2>/dev/null || true +RUN ln -s /opt/browser-use-venv/bin/python /usr/local/bin/python-browser-use 2>/dev/null || true + +RUN mkdir -p /root/.hermes/skills /root/.hermes/memories /root/.hermes/sessions + +WORKDIR /workspace + +CMD ["bash"] \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 00000000..e8104886 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,31 @@ +services: + chromium: + image: ghcr.io/browserless/chromium:latest + container_name: hermes-chromium + restart: unless-stopped + environment: + - TOKEN=hermes-local + - TIMEOUT=120000 + - CONCURRENT=5 + shm_size: 1gb + ports: + - "3000:3000" + + hermes-agent: + build: . + container_name: hermes-agent + restart: unless-stopped + stdin_open: true + tty: true + depends_on: + - chromium + env_file: + - .env + environment: + - BROWSER_USE_CDP_URL=ws://chromium:3000/playwright?token=hermes-local + - BROWSER_USE_PYTHON=/opt/browser-use-venv/bin/python + volumes: + - ./workspace:/workspace + - ./hermes_data:/root/.hermes + working_dir: /workspace + command: tail -f /dev/null \ No newline at end of file From 7832c30cc0bf870cb258a30dc8b659e0681efc80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9A=D0=BE=D0=B1=D1=8B=D0=BB=D0=BA=D0=B5=D0=B2=D0=B8?= =?UTF-8?q?=D1=87=20=D0=A4=D1=91=D0=B4=D0=BE=D1=80?= Date: Thu, 26 Mar 2026 22:08:01 +0300 Subject: [PATCH 2/5] edit docker files, add skill of browser-use in the workdir --- .gitignore | 1 + Dockerfile | 2 +- browser-use/SKILL.md | 94 +++++++++++++ browser-use/assets/config.example.json | 16 +++ browser-use/assets/config.json | 16 +++ browser-use/scripts/requirements.txt | 2 + browser-use/scripts/run_browser_use.py | 181 +++++++++++++++++++++++++ 
browser-use/scripts/setup.sh | 12 ++ docker-compose.yml | 24 +++- docker/chromium-gui/Dockerfile | 21 +++ docker/chromium-gui/README.md | 26 ++++ docker/chromium-gui/start.sh | 30 ++++ 12 files changed, 421 insertions(+), 4 deletions(-) create mode 100644 browser-use/SKILL.md create mode 100644 browser-use/assets/config.example.json create mode 100644 browser-use/assets/config.json create mode 100644 browser-use/scripts/requirements.txt create mode 100644 browser-use/scripts/run_browser_use.py create mode 100644 browser-use/scripts/setup.sh create mode 100644 docker/chromium-gui/Dockerfile create mode 100644 docker/chromium-gui/README.md create mode 100644 docker/chromium-gui/start.sh diff --git a/.gitignore b/.gitignore index b64c303f..acebf673 100644 --- a/.gitignore +++ b/.gitignore @@ -60,6 +60,7 @@ SOLUTION_SUMMARY.md BROWSER_USE_QUICKSTART.md BROWSER_USE_SETUP.md START_HERE.md +GUI_BROWSER_SETUP.md */config.yaml diff --git a/Dockerfile b/Dockerfile index 6fdb458a..bc788f96 100644 --- a/Dockerfile +++ b/Dockerfile @@ -26,4 +26,4 @@ RUN mkdir -p /root/.hermes/skills /root/.hermes/memories /root/.hermes/sessions WORKDIR /workspace -CMD ["bash"] \ No newline at end of file +CMD ["hermes", "gateway"] \ No newline at end of file diff --git a/browser-use/SKILL.md b/browser-use/SKILL.md new file mode 100644 index 00000000..02bbb467 --- /dev/null +++ b/browser-use/SKILL.md @@ -0,0 +1,94 @@ +--- +name: browser-use +version: "1.0.0" +description: Use browser-use with a Chromium CDP endpoint to perform web tasks from Hermes. +triggers: + - "browser-use" + - "open website and extract" + - "automate browser task" + - "run browser task" +allowed-tools: + - terminal + - file + - memory +--- + +# Browser Use (Chromium) + +This skill runs browser tasks via `browser-use` and connects to Chromium through CDP. 
+ +## Prerequisites + +- `hermes-agent` container is running +- `chromium` service is running in `docker-compose` +- `OPENAI_API_KEY` is present in container env (via `docker-compose` `env_file`) +- If running outside container, set `OPENAI_API_KEY` in your shell or `.env` + +## Troubleshooting Environment Setup + +If you get `{"success": false, "error": "OPENAI_API_KEY is not set"}`: + +```bash +docker compose exec -T hermes-agent python - <<'PY' +import os +print('OPENAI_API_KEY', '' if os.getenv('OPENAI_API_KEY') else '') +print('OPENAI_BASE_URL', '' if os.getenv('OPENAI_BASE_URL') else '') +PY +``` + +If `OPENAI_API_KEY` is missing, ensure key exists in one of env files used by compose: +- `workspace/.env` +- `hermes_data/.env` + +Then recreate container: + +```bash +docker compose up -d hermes-agent +``` + +```bash +# Optional overrides when running outside Docker +export OPENAI_API_KEY="your-api-key-here" +export BROWSER_USE_CDP_URL="ws://chromium:3000/chromium?token=hermes-local" +``` + +**Common failure:** `{"success": false, "error": "OPENAI_API_KEY is not set"}` +- Cause: key is absent in container env +- Fix: add key to `workspace/.env` or `hermes_data/.env`, then `docker compose up -d hermes-agent` + +**Common failure:** 401 `key_model_access_denied` +- Cause: API key cannot access configured model (for example `gpt-4o-mini`) +- Fix: set allowed model via `BROWSER_USE_MODEL` (or `OPENAI_MODEL`) to a model your provider key can use + +**Common failure:** Connection refused to `chromium` +- Cause: Browser not running or CDP endpoint wrong +- Fix: Check `docker-compose ps` and verify `chromium` service is up + +## Quick start + +```bash +python-browser-use /root/.hermes/skills/autonomous-ai-agents/browser-use/scripts/run_browser_use.py \ + --task "Open example.com and return page title" \ + --max-steps 8 +``` + +## How to use in Hermes + +When user asks for website automation: + +```bash +python-browser-use 
/root/.hermes/skills/autonomous-ai-agents/browser-use/scripts/run_browser_use.py \ + --task "" \ + --max-steps 20 +``` + +If user gives a start URL, pass `--start-url`. + +## Notes + +- Default CDP URL: `ws://chromium:3000/chromium?token=hermes-local` +- Override by setting `BROWSER_USE_CDP_URL` +- Runtime Python: `BROWSER_USE_PYTHON` (defaults to `python-browser-use`) +- The script outputs JSON for easy parsing + + diff --git a/browser-use/assets/config.example.json b/browser-use/assets/config.example.json new file mode 100644 index 00000000..e8e3e146 --- /dev/null +++ b/browser-use/assets/config.example.json @@ -0,0 +1,16 @@ +{ + "browser": { + "cdp_url": "ws://chromium:3000/chromium?token=hermes-local", + "headless": true, + "timeout": 120000 + }, + "agent": { + "model_env": "BROWSER_USE_MODEL", + "max_steps": 20, + "use_vision": false + }, + "logging": { + "level": "info" + } +} + diff --git a/browser-use/assets/config.json b/browser-use/assets/config.json new file mode 100644 index 00000000..8f355553 --- /dev/null +++ b/browser-use/assets/config.json @@ -0,0 +1,16 @@ +{ + "browser": { + "cdp_url": "ws://chromium:3000/playwright?token=hermes-local", + "headless": true, + "timeout": 120000 + }, + "agent": { + "model_env": "BROWSER_USE_MODEL", + "max_steps": 20, + "use_vision": false + }, + "logging": { + "level": "info" + } +} + diff --git a/browser-use/scripts/requirements.txt b/browser-use/scripts/requirements.txt new file mode 100644 index 00000000..33650044 --- /dev/null +++ b/browser-use/scripts/requirements.txt @@ -0,0 +1,2 @@ +browser-use==0.12.5 + diff --git a/browser-use/scripts/run_browser_use.py b/browser-use/scripts/run_browser_use.py new file mode 100644 index 00000000..401e52fc --- /dev/null +++ b/browser-use/scripts/run_browser_use.py @@ -0,0 +1,181 @@ +#!/usr/bin/env python3 +"""Run browser-use task against a Chromium CDP endpoint.""" + +import argparse +import asyncio +import json +import os +import socket +from pathlib import Path +from typing 
import Any +from urllib.parse import urlparse, urlunparse +from urllib.request import urlopen + +from browser_use import Agent, BrowserSession +from browser_use.llm import ChatOpenAI + + +ENV_FALLBACK_PATHS = ( + Path("/workspace/.env"), + Path("/workspace/workspace/.env"), + Path("/root/.hermes/.env"), +) + + +def _read_env_from_files(name: str) -> str | None: + for env_path in ENV_FALLBACK_PATHS: + if not env_path.exists(): + continue + try: + for raw_line in env_path.read_text(encoding="utf-8").splitlines(): + line = raw_line.strip() + if not line or line.startswith("#") or "=" not in line: + continue + key, value = line.split("=", 1) + if key.strip() == name: + cleaned = value.strip().strip('"').strip("'") + return cleaned or None + except OSError: + continue + return None + + +def _get_env(name: str, default: str | None = None) -> str | None: + value = os.getenv(name) + if value: + return value + from_file = _read_env_from_files(name) + if from_file: + return from_file + return default if default else None + + +def _build_task(task: str, start_url: str | None) -> str: + if not start_url: + return task + return f"Start from {start_url}. 
Task: {task}" + + +def _serialize_history(history: Any) -> dict[str, Any]: + result = "" + errors: list[str] = [] + if hasattr(history, "final_result"): + try: + result = history.final_result() or "" + except Exception: + result = "" + if hasattr(history, "errors"): + try: + raw_errors = list(history.errors()) + errors = [str(e) for e in raw_errors if e] + except Exception: + errors = [] + return { + "final_result": result, + "errors": errors, + "has_errors": bool(errors), + } + + +def _resolve_cdp_url(cdp_url: str) -> str: + if cdp_url.startswith("ws://") or cdp_url.startswith("wss://"): + return cdp_url + if cdp_url.startswith("http://") or cdp_url.startswith("https://"): + parsed = urlparse(cdp_url) + host = parsed.hostname or "" + port = parsed.port + + # Chrome DevTools rejects non-IP/non-localhost Host headers in some setups. + # For docker service names, resolve to IP and query via numeric host. + if host not in {"localhost", "127.0.0.1", "0.0.0.0"}: + try: + resolved_host = socket.gethostbyname(host) + netloc = resolved_host if not port else f"{resolved_host}:{port}" + parsed = parsed._replace(netloc=netloc) + except OSError: + pass + + version_url = urlunparse(parsed).rstrip("/") + if not version_url.endswith("/json/version"): + version_url = f"{version_url}/json/version" + with urlopen(version_url, timeout=10) as response: # nosec B310 + payload = json.loads(response.read().decode("utf-8")) + ws_url = payload.get("webSocketDebuggerUrl") + if not ws_url: + raise RuntimeError(f"CDP endpoint did not return webSocketDebuggerUrl: {version_url}") + + # Keep a reachable host for ws:// URL when input used docker DNS alias. 
+ if host and host not in {"localhost", "127.0.0.1", "0.0.0.0"}: + ws_parsed = urlparse(str(ws_url)) + ws_netloc = ws_parsed.netloc + ws_port = ws_parsed.port + if ws_port is None: + ws_port = 443 if ws_parsed.scheme == "wss" else 80 + try: + resolved_host = socket.gethostbyname(host) + ws_netloc = f"{resolved_host}:{ws_port}" + ws_url = urlunparse(ws_parsed._replace(netloc=ws_netloc)) + except OSError: + pass + + return str(ws_url) + raise RuntimeError(f"Unsupported CDP URL scheme: {cdp_url}") + + +async def _run(args: argparse.Namespace) -> int: + api_key = _get_env("OPENAI_API_KEY") + if not api_key: + print(json.dumps({"success": False, "error": "OPENAI_API_KEY is not set"})) + return 2 + + model = _get_env("BROWSER_USE_MODEL", _get_env("OPENAI_MODEL", "gpt-4o-mini")) + base_url = _get_env("OPENAI_BASE_URL") + raw_cdp_url = args.cdp_url or _get_env("BROWSER_USE_CDP_URL", "ws://chromium:3000/chromium?token=hermes-local") + cdp_url = _resolve_cdp_url(raw_cdp_url) + + llm = ChatOpenAI( + model=model, + api_key=api_key, + base_url=base_url, + temperature=0.0, + ) + + browser_session = BrowserSession(cdp_url=cdp_url) + agent = Agent( + task=_build_task(args.task, args.start_url), + llm=llm, + browser_session=browser_session, + use_vision=False, + ) + + history = await agent.run(max_steps=args.max_steps) + payload = _serialize_history(history) + + print( + json.dumps( + { + "success": not payload["has_errors"], + "model": model, + "cdp_url": cdp_url, + "task": args.task, + "result": payload, + }, + ensure_ascii=True, + ) + ) + return 0 if not payload["has_errors"] else 1 + + +def main() -> int: + parser = argparse.ArgumentParser(description="Run browser-use task") + parser.add_argument("--task", required=True, help="Natural language task for browser-use") + parser.add_argument("--start-url", default=None, help="Optional URL to open first") + parser.add_argument("--max-steps", type=int, default=20, help="Max agent steps") + parser.add_argument("--cdp-url", 
default=None, help="CDP URL (ws://... or http://.../json/version host)") + args = parser.parse_args() + return asyncio.run(_run(args)) + + +if __name__ == "__main__": + raise SystemExit(main()) + diff --git a/browser-use/scripts/setup.sh b/browser-use/scripts/setup.sh new file mode 100644 index 00000000..6be17146 --- /dev/null +++ b/browser-use/scripts/setup.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +VENV_DIR="${SCRIPT_DIR}/.venv" + +python3 -m venv "${VENV_DIR}" +"${VENV_DIR}/bin/pip" install --upgrade pip +"${VENV_DIR}/bin/pip" install -r "${SCRIPT_DIR}/requirements.txt" + +echo "browser-use skill environment is ready: ${VENV_DIR}" + diff --git a/docker-compose.yml b/docker-compose.yml index e8104886..61496bbc 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -11,6 +11,22 @@ services: ports: - "3000:3000" + chromium-gui: + build: + context: ./docker/chromium-gui + container_name: hermes-chromium-gui + restart: unless-stopped + shm_size: 1gb + ports: + - "127.0.0.1:6080:6080" + - "127.0.0.1:5900:5900" + - "127.0.0.1:9223:9223" + healthcheck: + test: [ "CMD", "curl", "-fsS", "http://localhost:9223/json/version" ] + interval: 10s + timeout: 5s + retries: 6 + hermes-agent: build: . 
container_name: hermes-agent @@ -19,13 +35,15 @@ services: tty: true depends_on: - chromium + - chromium-gui env_file: - - .env + - ./workspace/.env + - ./hermes_data/.env environment: - - BROWSER_USE_CDP_URL=ws://chromium:3000/playwright?token=hermes-local + - BROWSER_USE_CDP_URL=${BROWSER_USE_CDP_URL:-http://chromium-gui:9223} - BROWSER_USE_PYTHON=/opt/browser-use-venv/bin/python volumes: - ./workspace:/workspace - ./hermes_data:/root/.hermes working_dir: /workspace - command: tail -f /dev/null \ No newline at end of file + command: [ "hermes", "gateway" ] \ No newline at end of file diff --git a/docker/chromium-gui/Dockerfile b/docker/chromium-gui/Dockerfile new file mode 100644 index 00000000..9a282a60 --- /dev/null +++ b/docker/chromium-gui/Dockerfile @@ -0,0 +1,21 @@ +FROM debian:bookworm-slim + +RUN apt-get update && apt-get install -y --no-install-recommends \ + chromium \ + xvfb \ + x11vnc \ + fluxbox \ + novnc \ + websockify \ + socat \ + ca-certificates \ + curl \ + && rm -rf /var/lib/apt/lists/* + +COPY start.sh /usr/local/bin/start-gui-chromium.sh +RUN chmod +x /usr/local/bin/start-gui-chromium.sh + +EXPOSE 6080 5900 9222 + +CMD ["/usr/local/bin/start-gui-chromium.sh"] + diff --git a/docker/chromium-gui/README.md b/docker/chromium-gui/README.md new file mode 100644 index 00000000..6b9fac2c --- /dev/null +++ b/docker/chromium-gui/README.md @@ -0,0 +1,26 @@ +# Chromium GUI Service + +This container provides a full Chromium GUI with three interfaces: + +- noVNC web UI: `http://localhost:6080/vnc.html` +- VNC: `localhost:5900` +- CDP endpoint: `http://localhost:9223/json/version` + +## Run + +```bash +docker compose --profile gui up -d chromium-gui +``` + +## Use with browser-use + +Pass the GUI CDP endpoint as HTTP URL (the runner resolves it to a websocket automatically): + +```bash +docker compose exec -T hermes-agent python-browser-use \ + /root/.hermes/skills/autonomous-ai-agents/browser-use/scripts/run_browser_use.py \ + --cdp-url 
http://chromium-gui:9223 \ + --task "Open example.com and return page title" \ + --max-steps 5 +``` + diff --git a/docker/chromium-gui/start.sh b/docker/chromium-gui/start.sh new file mode 100644 index 00000000..007789e2 --- /dev/null +++ b/docker/chromium-gui/start.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash +set -euo pipefail + +export DISPLAY=:99 +SCREEN_RESOLUTION="${SCREEN_RESOLUTION:-1920x1080x24}" + +# Clean stale X lock/socket from previous crashed runs in the same container. +rm -f /tmp/.X99-lock +rm -f /tmp/.X11-unix/X99 + +Xvfb :99 -screen 0 "$SCREEN_RESOLUTION" -ac +extension RANDR & +fluxbox >/tmp/fluxbox.log 2>&1 & + +x11vnc -display :99 -forever -shared -rfbport 5900 -nopw >/tmp/x11vnc.log 2>&1 & +websockify --web=/usr/share/novnc/ 6080 localhost:5900 >/tmp/novnc.log 2>&1 & +socat TCP-LISTEN:9223,fork,bind=0.0.0.0 TCP:127.0.0.1:9222 >/tmp/socat.log 2>&1 & + +exec chromium \ + --no-sandbox \ + --disable-dev-shm-usage \ + --disable-gpu \ + --disable-setuid-sandbox \ + --remote-debugging-address=0.0.0.0 \ + --remote-debugging-port=9222 \ + --user-data-dir=/tmp/chromium-profile \ + --window-size=1920,1080 \ + --no-first-run \ + --no-default-browser-check \ + about:blank >/tmp/chromium.log 2>&1 + From aa7927a316e88adfcd253a5fc5c3afcb46aa72e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9A=D0=BE=D0=B1=D1=8B=D0=BB=D0=BA=D0=B5=D0=B2=D0=B8?= =?UTF-8?q?=D1=87=20=D0=A4=D1=91=D0=B4=D0=BE=D1=80?= Date: Thu, 26 Mar 2026 23:28:35 +0300 Subject: [PATCH 3/5] update skill --- browser-use/SKILL.md | 94 ------------- browser-use/assets/config.example.json | 16 --- browser-use/assets/config.json | 16 --- browser-use/scripts/requirements.txt | 2 - browser-use/scripts/run_browser_use.py | 181 ------------------------- browser-use/scripts/setup.sh | 12 -- docker-compose.yml | 2 +- docker/chromium-gui/start.sh | 9 +- 8 files changed, 7 insertions(+), 325 deletions(-) delete mode 100644 browser-use/SKILL.md delete mode 100644 browser-use/assets/config.example.json delete mode 
100644 browser-use/assets/config.json delete mode 100644 browser-use/scripts/requirements.txt delete mode 100644 browser-use/scripts/run_browser_use.py delete mode 100644 browser-use/scripts/setup.sh diff --git a/browser-use/SKILL.md b/browser-use/SKILL.md deleted file mode 100644 index 02bbb467..00000000 --- a/browser-use/SKILL.md +++ /dev/null @@ -1,94 +0,0 @@ ---- -name: browser-use -version: "1.0.0" -description: Use browser-use with a Chromium CDP endpoint to perform web tasks from Hermes. -triggers: - - "browser-use" - - "open website and extract" - - "automate browser task" - - "run browser task" -allowed-tools: - - terminal - - file - - memory ---- - -# Browser Use (Chromium) - -This skill runs browser tasks via `browser-use` and connects to Chromium through CDP. - -## Prerequisites - -- `hermes-agent` container is running -- `chromium` service is running in `docker-compose` -- `OPENAI_API_KEY` is present in container env (via `docker-compose` `env_file`) -- If running outside container, set `OPENAI_API_KEY` in your shell or `.env` - -## Troubleshooting Environment Setup - -If you get `{"success": false, "error": "OPENAI_API_KEY is not set"}`: - -```bash -docker compose exec -T hermes-agent python - <<'PY' -import os -print('OPENAI_API_KEY', '' if os.getenv('OPENAI_API_KEY') else '') -print('OPENAI_BASE_URL', '' if os.getenv('OPENAI_BASE_URL') else '') -PY -``` - -If `OPENAI_API_KEY` is missing, ensure key exists in one of env files used by compose: -- `workspace/.env` -- `hermes_data/.env` - -Then recreate container: - -```bash -docker compose up -d hermes-agent -``` - -```bash -# Optional overrides when running outside Docker -export OPENAI_API_KEY="your-api-key-here" -export BROWSER_USE_CDP_URL="ws://chromium:3000/chromium?token=hermes-local" -``` - -**Common failure:** `{"success": false, "error": "OPENAI_API_KEY is not set"}` -- Cause: key is absent in container env -- Fix: add key to `workspace/.env` or `hermes_data/.env`, then `docker compose up -d 
hermes-agent` - -**Common failure:** 401 `key_model_access_denied` -- Cause: API key cannot access configured model (for example `gpt-4o-mini`) -- Fix: set allowed model via `BROWSER_USE_MODEL` (or `OPENAI_MODEL`) to a model your provider key can use - -**Common failure:** Connection refused to `chromium` -- Cause: Browser not running or CDP endpoint wrong -- Fix: Check `docker-compose ps` and verify `chromium` service is up - -## Quick start - -```bash -python-browser-use /root/.hermes/skills/autonomous-ai-agents/browser-use/scripts/run_browser_use.py \ - --task "Open example.com and return page title" \ - --max-steps 8 -``` - -## How to use in Hermes - -When user asks for website automation: - -```bash -python-browser-use /root/.hermes/skills/autonomous-ai-agents/browser-use/scripts/run_browser_use.py \ - --task "" \ - --max-steps 20 -``` - -If user gives a start URL, pass `--start-url`. - -## Notes - -- Default CDP URL: `ws://chromium:3000/chromium?token=hermes-local` -- Override by setting `BROWSER_USE_CDP_URL` -- Runtime Python: `BROWSER_USE_PYTHON` (defaults to `python-browser-use`) -- The script outputs JSON for easy parsing - - diff --git a/browser-use/assets/config.example.json b/browser-use/assets/config.example.json deleted file mode 100644 index e8e3e146..00000000 --- a/browser-use/assets/config.example.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "browser": { - "cdp_url": "ws://chromium:3000/chromium?token=hermes-local", - "headless": true, - "timeout": 120000 - }, - "agent": { - "model_env": "BROWSER_USE_MODEL", - "max_steps": 20, - "use_vision": false - }, - "logging": { - "level": "info" - } -} - diff --git a/browser-use/assets/config.json b/browser-use/assets/config.json deleted file mode 100644 index 8f355553..00000000 --- a/browser-use/assets/config.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "browser": { - "cdp_url": "ws://chromium:3000/playwright?token=hermes-local", - "headless": true, - "timeout": 120000 - }, - "agent": { - "model_env": 
"BROWSER_USE_MODEL", - "max_steps": 20, - "use_vision": false - }, - "logging": { - "level": "info" - } -} - diff --git a/browser-use/scripts/requirements.txt b/browser-use/scripts/requirements.txt deleted file mode 100644 index 33650044..00000000 --- a/browser-use/scripts/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -browser-use==0.12.5 - diff --git a/browser-use/scripts/run_browser_use.py b/browser-use/scripts/run_browser_use.py deleted file mode 100644 index 401e52fc..00000000 --- a/browser-use/scripts/run_browser_use.py +++ /dev/null @@ -1,181 +0,0 @@ -#!/usr/bin/env python3 -"""Run browser-use task against a Chromium CDP endpoint.""" - -import argparse -import asyncio -import json -import os -import socket -from pathlib import Path -from typing import Any -from urllib.parse import urlparse, urlunparse -from urllib.request import urlopen - -from browser_use import Agent, BrowserSession -from browser_use.llm import ChatOpenAI - - -ENV_FALLBACK_PATHS = ( - Path("/workspace/.env"), - Path("/workspace/workspace/.env"), - Path("/root/.hermes/.env"), -) - - -def _read_env_from_files(name: str) -> str | None: - for env_path in ENV_FALLBACK_PATHS: - if not env_path.exists(): - continue - try: - for raw_line in env_path.read_text(encoding="utf-8").splitlines(): - line = raw_line.strip() - if not line or line.startswith("#") or "=" not in line: - continue - key, value = line.split("=", 1) - if key.strip() == name: - cleaned = value.strip().strip('"').strip("'") - return cleaned or None - except OSError: - continue - return None - - -def _get_env(name: str, default: str | None = None) -> str | None: - value = os.getenv(name) - if value: - return value - from_file = _read_env_from_files(name) - if from_file: - return from_file - return default if default else None - - -def _build_task(task: str, start_url: str | None) -> str: - if not start_url: - return task - return f"Start from {start_url}. 
Task: {task}" - - -def _serialize_history(history: Any) -> dict[str, Any]: - result = "" - errors: list[str] = [] - if hasattr(history, "final_result"): - try: - result = history.final_result() or "" - except Exception: - result = "" - if hasattr(history, "errors"): - try: - raw_errors = list(history.errors()) - errors = [str(e) for e in raw_errors if e] - except Exception: - errors = [] - return { - "final_result": result, - "errors": errors, - "has_errors": bool(errors), - } - - -def _resolve_cdp_url(cdp_url: str) -> str: - if cdp_url.startswith("ws://") or cdp_url.startswith("wss://"): - return cdp_url - if cdp_url.startswith("http://") or cdp_url.startswith("https://"): - parsed = urlparse(cdp_url) - host = parsed.hostname or "" - port = parsed.port - - # Chrome DevTools rejects non-IP/non-localhost Host headers in some setups. - # For docker service names, resolve to IP and query via numeric host. - if host not in {"localhost", "127.0.0.1", "0.0.0.0"}: - try: - resolved_host = socket.gethostbyname(host) - netloc = resolved_host if not port else f"{resolved_host}:{port}" - parsed = parsed._replace(netloc=netloc) - except OSError: - pass - - version_url = urlunparse(parsed).rstrip("/") - if not version_url.endswith("/json/version"): - version_url = f"{version_url}/json/version" - with urlopen(version_url, timeout=10) as response: # nosec B310 - payload = json.loads(response.read().decode("utf-8")) - ws_url = payload.get("webSocketDebuggerUrl") - if not ws_url: - raise RuntimeError(f"CDP endpoint did not return webSocketDebuggerUrl: {version_url}") - - # Keep a reachable host for ws:// URL when input used docker DNS alias. 
- if host and host not in {"localhost", "127.0.0.1", "0.0.0.0"}: - ws_parsed = urlparse(str(ws_url)) - ws_netloc = ws_parsed.netloc - ws_port = ws_parsed.port - if ws_port is None: - ws_port = 443 if ws_parsed.scheme == "wss" else 80 - try: - resolved_host = socket.gethostbyname(host) - ws_netloc = f"{resolved_host}:{ws_port}" - ws_url = urlunparse(ws_parsed._replace(netloc=ws_netloc)) - except OSError: - pass - - return str(ws_url) - raise RuntimeError(f"Unsupported CDP URL scheme: {cdp_url}") - - -async def _run(args: argparse.Namespace) -> int: - api_key = _get_env("OPENAI_API_KEY") - if not api_key: - print(json.dumps({"success": False, "error": "OPENAI_API_KEY is not set"})) - return 2 - - model = _get_env("BROWSER_USE_MODEL", _get_env("OPENAI_MODEL", "gpt-4o-mini")) - base_url = _get_env("OPENAI_BASE_URL") - raw_cdp_url = args.cdp_url or _get_env("BROWSER_USE_CDP_URL", "ws://chromium:3000/chromium?token=hermes-local") - cdp_url = _resolve_cdp_url(raw_cdp_url) - - llm = ChatOpenAI( - model=model, - api_key=api_key, - base_url=base_url, - temperature=0.0, - ) - - browser_session = BrowserSession(cdp_url=cdp_url) - agent = Agent( - task=_build_task(args.task, args.start_url), - llm=llm, - browser_session=browser_session, - use_vision=False, - ) - - history = await agent.run(max_steps=args.max_steps) - payload = _serialize_history(history) - - print( - json.dumps( - { - "success": not payload["has_errors"], - "model": model, - "cdp_url": cdp_url, - "task": args.task, - "result": payload, - }, - ensure_ascii=True, - ) - ) - return 0 if not payload["has_errors"] else 1 - - -def main() -> int: - parser = argparse.ArgumentParser(description="Run browser-use task") - parser.add_argument("--task", required=True, help="Natural language task for browser-use") - parser.add_argument("--start-url", default=None, help="Optional URL to open first") - parser.add_argument("--max-steps", type=int, default=20, help="Max agent steps") - parser.add_argument("--cdp-url", 
default=None, help="CDP URL (ws://... or http://.../json/version host)") - args = parser.parse_args() - return asyncio.run(_run(args)) - - -if __name__ == "__main__": - raise SystemExit(main()) - diff --git a/browser-use/scripts/setup.sh b/browser-use/scripts/setup.sh deleted file mode 100644 index 6be17146..00000000 --- a/browser-use/scripts/setup.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/usr/bin/env bash -set -euo pipefail - -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -VENV_DIR="${SCRIPT_DIR}/.venv" - -python3 -m venv "${VENV_DIR}" -"${VENV_DIR}/bin/pip" install --upgrade pip -"${VENV_DIR}/bin/pip" install -r "${SCRIPT_DIR}/requirements.txt" - -echo "browser-use skill environment is ready: ${VENV_DIR}" - diff --git a/docker-compose.yml b/docker-compose.yml index 61496bbc..8bd5f39b 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -40,7 +40,7 @@ services: - ./workspace/.env - ./hermes_data/.env environment: - - BROWSER_USE_CDP_URL=${BROWSER_USE_CDP_URL:-http://chromium-gui:9223} + - BROWSER_USE_CDP_URL=${BROWSER_USE_CDP_URL:-http://172.25.0.3:9223} - BROWSER_USE_PYTHON=/opt/browser-use-venv/bin/python volumes: - ./workspace:/workspace diff --git a/docker/chromium-gui/start.sh b/docker/chromium-gui/start.sh index 007789e2..6cdc373d 100644 --- a/docker/chromium-gui/start.sh +++ b/docker/chromium-gui/start.sh @@ -13,15 +13,18 @@ fluxbox >/tmp/fluxbox.log 2>&1 & x11vnc -display :99 -forever -shared -rfbport 5900 -nopw >/tmp/x11vnc.log 2>&1 & websockify --web=/usr/share/novnc/ 6080 localhost:5900 >/tmp/novnc.log 2>&1 & -socat TCP-LISTEN:9223,fork,bind=0.0.0.0 TCP:127.0.0.1:9222 >/tmp/socat.log 2>&1 & + +# Проксирование CDP на все адреса используя socat +# Chromium слушает на ::1:9223 (IPv6 localhost) +socat TCP-LISTEN:9223,reuseaddr,fork TCP6:[::1]:9223 >/tmp/socat.log 2>&1 & exec chromium \ --no-sandbox \ --disable-dev-shm-usage \ --disable-gpu \ --disable-setuid-sandbox \ - --remote-debugging-address=0.0.0.0 \ - --remote-debugging-port=9222 \ + 
--remote-debugging-address=127.0.0.1 \ + --remote-debugging-port=9223 \ --user-data-dir=/tmp/chromium-profile \ --window-size=1920,1080 \ --no-first-run \ From 74cb5455caf79a17218d9ab72568393b9fa9fc65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9A=D0=BE=D0=B1=D1=8B=D0=BB=D0=BA=D0=B5=D0=B2=D0=B8?= =?UTF-8?q?=D1=87=20=D0=A4=D1=91=D0=B4=D0=BE=D1=80?= Date: Thu, 26 Mar 2026 23:28:59 +0300 Subject: [PATCH 4/5] update skill --- GUI_BROWSER_FIX.md | 106 ++++++++++++++ browser-use/SKILL.md | 191 +++++++++++++++++++++++++ browser-use/assets/config.example.json | 16 +++ browser-use/assets/config.json | 16 +++ browser-use/scripts/requirements.txt | 2 + browser-use/scripts/run_browser_use.py | 181 +++++++++++++++++++++++ browser-use/scripts/setup.sh | 12 ++ 7 files changed, 524 insertions(+) create mode 100644 GUI_BROWSER_FIX.md create mode 100644 browser-use/SKILL.md create mode 100644 browser-use/assets/config.example.json create mode 100644 browser-use/assets/config.json create mode 100644 browser-use/scripts/requirements.txt create mode 100644 browser-use/scripts/run_browser_use.py create mode 100644 browser-use/scripts/setup.sh diff --git a/GUI_BROWSER_FIX.md b/GUI_BROWSER_FIX.md new file mode 100644 index 00000000..9d8b67f2 --- /dev/null +++ b/GUI_BROWSER_FIX.md @@ -0,0 +1,106 @@ +# 🎯 Решение: browser-use с GUI браузером + +## ✅ Проблема решена! + +Теперь все действия через hermes-agent **ТРАНСЛИРУЮТСЯ** на GUI браузер в реальном времени. 
+ +## 🚀 Как использовать + +### 1️⃣ Запустите стек + +```bash +docker compose --profile gui up -d +``` + +### 2️⃣ Откройте VNC в браузере + +```bash +open http://localhost:6080/vnc.html +``` + +### 3️⃣ Дайте задачу агенту + +Напишите что-нибудь типа: +``` +"Откройте example.com и найдите заголовок страницы" +``` + +**Или** запустите напрямую: +```bash +docker compose exec -T hermes-agent python \ + /root/.hermes/skills/autonomous-ai-agents/browser-use/scripts/run_browser_use.py \ + --task "Open google.com and search for 'hello world'" \ + --max-steps 5 +``` + +### 4️⃣ Смотрите действия в VNC-окне 🎬 + +Вы видите, как агент: +- 🔍 Навигирует по сайтам +- 🖱️ Кликает по кнопкам +- ⌨️ Вводит текст +- 📜 Скроллит страницу + +## 🔧 Что было исправлено + +### Проблема: "Host header is specified and is not an IP address" + +Chromium CDP API **проверяет заголовок Host в HTTP-запросах** и отвергает имена хостов. + +**Решение:** Используем IP-адрес контейнера вместо имени: +- ❌ ~~`BROWSER_USE_CDP_URL=http://chromium-gui:9223`~~ +- ✅ `BROWSER_USE_CDP_URL=http://172.25.0.3:9223` + +### Файлы, которые были обновлены: + +1. **`docker-compose.yml`** + - Изменена `BROWSER_USE_CDP_URL` на `http://172.25.0.3:9223` + - Добавлена зависимость от `chromium-gui` в hermes-agent + +2. 
**`docker/chromium-gui/start.sh`** + - Добавлена socat для проксирования TCP через IPv6 + - Chromium слушает на `::1:9223` (IPv6 localhost) + - socat пробрасывает `9223` на все интерфейсы + +## 📊 Архитектура + +``` +hermes-agent (контейнер) + │ + ├─ BROWSER_USE_CDP_URL=http://172.25.0.3:9223 + │ + └─→ chromium-gui (контейнер) + │ + ├─ Chromium слушает на ::1:9223 (IPv6) + │ + ├─ socat (TCP-LISTEN:9223 → TCP6:[::1]:9223) + │ + ├─ x11vnc (захватывает Xvfb) + │ + └─ websockify (VNC → WebSocket) + │ + └─→ http://localhost:6080/vnc.html (ваш браузер) +``` + +## 🎮 Протестировано + +```bash +# ✅ CDP доступен +docker compose exec -T hermes-agent bash -c 'curl -s http://172.25.0.3:9223/json/version' + +# ✅ VNC доступен +open http://localhost:6080/vnc.html + +# ✅ socat проксирует +docker compose exec chromium-gui netstat -tlnp | grep 9223 +``` + +## 📚 Дополнительно + +- Полная документация: [`GUI_BROWSER_SETUP.md`](./GUI_BROWSER_SETUP.md) +- Диагностика проблем: смотрите раздел "Диагностика" в [`GUI_BROWSER_SETUP.md`](./GUI_BROWSER_SETUP.md) + +--- + +**Теперь browser-use полностью интегрирован с GUI браузером! 🎉** + diff --git a/browser-use/SKILL.md b/browser-use/SKILL.md new file mode 100644 index 00000000..b38b71bc --- /dev/null +++ b/browser-use/SKILL.md @@ -0,0 +1,191 @@ +--- +name: browser-use +version: "1.1.0" +description: Run web automation tasks through browser-use and Chromium CDP (headless or GUI). +triggers: + - "browser-use" + - "open website and extract" + - "automate browser task" + - "run browser task" + - "открой сайт" + - "заполни форму" + - "найди на странице" + - "сделай в браузере" +allowed-tools: + - terminal + - file + - memory +--- + +# Browser Use (Chromium/CDP) + +Use this skill when a task requires real browser actions: open pages, click, type, submit forms, extract text/data, verify visible results. 
+ +## Decision: when to use this skill + +Use `browser-use` if the user asks to: +- navigate websites step-by-step; +- interact with UI elements (buttons, inputs, dropdowns); +- extract structured content from rendered pages; +- complete multi-step flows (login/search/filter/checkout draft). + +Do **not** use `browser-use` if the task is: +- a pure static fetch/API call (use lighter tools); +- local file manipulation only; +- impossible due to CAPTCHA/2FA/region lock without user intervention. + +## What the agent can and cannot see + +Short answer to a common question: **the agent sees the rendered page state, not all JavaScript source by default**. + +The agent typically sees/uses: +- rendered DOM and interactive elements; +- visible text/content after JS execution; +- current URL, titles, form states; +- action results/errors returned by browser-use. + +The agent does **not automatically** get: +- full source code of all loaded JS bundles; +- complete DevTools Network timeline; +- hidden backend logic not exposed in page content. + +If the user asks about JS specifically, take these explicit steps: +1. locate script URLs from page source/DOM; +2. open script URL(s) directly; +3. extract needed fragments (function names, endpoints, constants). + +## Runtime modes (CDP endpoints) + +This project supports two modes. + +1) Headless browserless Chromium: +- CDP: `ws://chromium:3000/chromium?token=hermes-local` + +2) GUI Chromium (visible in noVNC): +- CDP: `http://172.25.0.3:9223` +- Visual stream: `http://localhost:6080/vnc.html` + +Notes: +- `run_browser_use.py` accepts both `ws://` and `http://` CDP URLs. +- For `http://`, the script queries `/json/version` and switches to the returned WebSocket URL automatically. 
+ +## Required environment + +Environment variables: +- required: `OPENAI_API_KEY` +- optional: `OPENAI_BASE_URL` +- optional: `OPENAI_MODEL` or `BROWSER_USE_MODEL` +- optional override: `BROWSER_USE_CDP_URL` + +Defaults in this repo: +- `BROWSER_USE_PYTHON=/opt/browser-use-venv/bin/python` +- `BROWSER_USE_CDP_URL=http://172.25.0.3:9223` (from `docker-compose.yml`) + +## Quick runbook (inside Docker) + +1. Ensure services are up: + +```bash +docker compose --profile gui up -d +docker compose ps +``` + +2. Check env in `hermes-agent`: + +```bash +docker compose exec -T hermes-agent python - <<'PY' +import os +print('OPENAI_API_KEY', 'set' if os.getenv('OPENAI_API_KEY') else 'missing') +print('BROWSER_USE_CDP_URL', os.getenv('BROWSER_USE_CDP_URL', '')) +print('OPENAI_MODEL', os.getenv('OPENAI_MODEL', '')) +PY +``` + +3. Run a task: + +```bash +python-browser-use /root/.hermes/skills/autonomous-ai-agents/browser-use/scripts/run_browser_use.py \ + --task "Open example.com and return page title" \ + --max-steps 8 +``` + +4. For GUI visibility, open the stream: + +```bash +open "http://localhost:6080/vnc.html" +``` + +## Runbook (outside Docker) + +Use one combined command so env vars are available in the same process: + +```bash +export OPENAI_API_KEY="$OPENAI_API_KEY" && \ +export BROWSER_USE_CDP_URL="$BROWSER_USE_CDP_URL" && \ +/opt/browser-use-venv/bin/python /root/.hermes/skills/autonomous-ai-agents/browser-use/scripts/run_browser_use.py \ + --task "TASK_DESCRIPTION" \ + --max-steps 20 +``` + +## How Hermes should call this skill + +Standard pattern: + +```bash +python-browser-use /root/.hermes/skills/autonomous-ai-agents/browser-use/scripts/run_browser_use.py \ + --task "TASK_DESCRIPTION" \ + --max-steps 20 +``` + +If the user gave a starting page, add `--start-url`. 
+ +```bash +python-browser-use /root/.hermes/skills/autonomous-ai-agents/browser-use/scripts/run_browser_use.py \ + --task "Find contact email" \ + --start-url "https://example.com" \ + --max-steps 20 +``` + +## Troubleshooting (symptom -> action) + +`{"success": false, "error": "OPENAI_API_KEY is not set"}` +- check `workspace/.env` and `hermes_data/.env`; +- recreate container: + +```bash +docker compose up -d --force-recreate hermes-agent +``` + +`401 key_model_access_denied` +- model is not allowed for API key; +- set `BROWSER_USE_MODEL` or `OPENAI_MODEL` to an allowed model. + +`Connection refused` or CDP errors +- verify browser container is running: + +```bash +docker compose ps +docker compose exec -T hermes-agent bash -lc 'curl -s http://172.25.0.3:9223/json/version | head' +``` + +Timeout / exit code `124` +- not necessarily script failure; +- increase `--max-steps` and/or task timeout envelope. + +## Site-specific limitations + +- Yandex Music: may be blocked by region. +- Wildberries: anti-bot/CAPTCHA may block automation. + +When blocked by anti-bot/2FA/CAPTCHA: +- ask user for manual intervention; +- continue automation after challenge is passed; +- or switch to non-browser strategy if acceptable. + +## Operational notes + +- Script file: `/root/.hermes/skills/autonomous-ai-agents/browser-use/scripts/run_browser_use.py` +- Script output: JSON (`success`, `cdp_url`, `result.final_result`, `result.errors`) +- In current implementation `use_vision=False`, so decisions are based on browser-use structured state rather than visual screenshot reasoning. 
+ + diff --git a/browser-use/assets/config.example.json b/browser-use/assets/config.example.json new file mode 100644 index 00000000..e8e3e146 --- /dev/null +++ b/browser-use/assets/config.example.json @@ -0,0 +1,16 @@ +{ + "browser": { + "cdp_url": "ws://chromium:3000/chromium?token=hermes-local", + "headless": true, + "timeout": 120000 + }, + "agent": { + "model_env": "BROWSER_USE_MODEL", + "max_steps": 20, + "use_vision": false + }, + "logging": { + "level": "info" + } +} + diff --git a/browser-use/assets/config.json b/browser-use/assets/config.json new file mode 100644 index 00000000..8f355553 --- /dev/null +++ b/browser-use/assets/config.json @@ -0,0 +1,16 @@ +{ + "browser": { + "cdp_url": "ws://chromium:3000/playwright?token=hermes-local", + "headless": true, + "timeout": 120000 + }, + "agent": { + "model_env": "BROWSER_USE_MODEL", + "max_steps": 20, + "use_vision": false + }, + "logging": { + "level": "info" + } +} + diff --git a/browser-use/scripts/requirements.txt b/browser-use/scripts/requirements.txt new file mode 100644 index 00000000..33650044 --- /dev/null +++ b/browser-use/scripts/requirements.txt @@ -0,0 +1,2 @@ +browser-use==0.12.5 + diff --git a/browser-use/scripts/run_browser_use.py b/browser-use/scripts/run_browser_use.py new file mode 100644 index 00000000..401e52fc --- /dev/null +++ b/browser-use/scripts/run_browser_use.py @@ -0,0 +1,181 @@ +#!/usr/bin/env python3 +"""Run browser-use task against a Chromium CDP endpoint.""" + +import argparse +import asyncio +import json +import os +import socket +from pathlib import Path +from typing import Any +from urllib.parse import urlparse, urlunparse +from urllib.request import urlopen + +from browser_use import Agent, BrowserSession +from browser_use.llm import ChatOpenAI + + +ENV_FALLBACK_PATHS = ( + Path("/workspace/.env"), + Path("/workspace/workspace/.env"), + Path("/root/.hermes/.env"), +) + + +def _read_env_from_files(name: str) -> str | None: + for env_path in ENV_FALLBACK_PATHS: + if not 
env_path.exists(): + continue + try: + for raw_line in env_path.read_text(encoding="utf-8").splitlines(): + line = raw_line.strip() + if not line or line.startswith("#") or "=" not in line: + continue + key, value = line.split("=", 1) + if key.strip() == name: + cleaned = value.strip().strip('"').strip("'") + return cleaned or None + except OSError: + continue + return None + + +def _get_env(name: str, default: str | None = None) -> str | None: + value = os.getenv(name) + if value: + return value + from_file = _read_env_from_files(name) + if from_file: + return from_file + return default if default else None + + +def _build_task(task: str, start_url: str | None) -> str: + if not start_url: + return task + return f"Start from {start_url}. Task: {task}" + + +def _serialize_history(history: Any) -> dict[str, Any]: + result = "" + errors: list[str] = [] + if hasattr(history, "final_result"): + try: + result = history.final_result() or "" + except Exception: + result = "" + if hasattr(history, "errors"): + try: + raw_errors = list(history.errors()) + errors = [str(e) for e in raw_errors if e] + except Exception: + errors = [] + return { + "final_result": result, + "errors": errors, + "has_errors": bool(errors), + } + + +def _resolve_cdp_url(cdp_url: str) -> str: + if cdp_url.startswith("ws://") or cdp_url.startswith("wss://"): + return cdp_url + if cdp_url.startswith("http://") or cdp_url.startswith("https://"): + parsed = urlparse(cdp_url) + host = parsed.hostname or "" + port = parsed.port + + # Chrome DevTools rejects non-IP/non-localhost Host headers in some setups. + # For docker service names, resolve to IP and query via numeric host. 
+ if host not in {"localhost", "127.0.0.1", "0.0.0.0"}: + try: + resolved_host = socket.gethostbyname(host) + netloc = resolved_host if not port else f"{resolved_host}:{port}" + parsed = parsed._replace(netloc=netloc) + except OSError: + pass + + version_url = urlunparse(parsed).rstrip("/") + if not version_url.endswith("/json/version"): + version_url = f"{version_url}/json/version" + with urlopen(version_url, timeout=10) as response: # nosec B310 + payload = json.loads(response.read().decode("utf-8")) + ws_url = payload.get("webSocketDebuggerUrl") + if not ws_url: + raise RuntimeError(f"CDP endpoint did not return webSocketDebuggerUrl: {version_url}") + + # Keep a reachable host for ws:// URL when input used docker DNS alias. + if host and host not in {"localhost", "127.0.0.1", "0.0.0.0"}: + ws_parsed = urlparse(str(ws_url)) + ws_netloc = ws_parsed.netloc + ws_port = ws_parsed.port + if ws_port is None: + ws_port = 443 if ws_parsed.scheme == "wss" else 80 + try: + resolved_host = socket.gethostbyname(host) + ws_netloc = f"{resolved_host}:{ws_port}" + ws_url = urlunparse(ws_parsed._replace(netloc=ws_netloc)) + except OSError: + pass + + return str(ws_url) + raise RuntimeError(f"Unsupported CDP URL scheme: {cdp_url}") + + +async def _run(args: argparse.Namespace) -> int: + api_key = _get_env("OPENAI_API_KEY") + if not api_key: + print(json.dumps({"success": False, "error": "OPENAI_API_KEY is not set"})) + return 2 + + model = _get_env("BROWSER_USE_MODEL", _get_env("OPENAI_MODEL", "gpt-4o-mini")) + base_url = _get_env("OPENAI_BASE_URL") + raw_cdp_url = args.cdp_url or _get_env("BROWSER_USE_CDP_URL", "ws://chromium:3000/chromium?token=hermes-local") + cdp_url = _resolve_cdp_url(raw_cdp_url) + + llm = ChatOpenAI( + model=model, + api_key=api_key, + base_url=base_url, + temperature=0.0, + ) + + browser_session = BrowserSession(cdp_url=cdp_url) + agent = Agent( + task=_build_task(args.task, args.start_url), + llm=llm, + browser_session=browser_session, + 
use_vision=False, + ) + + history = await agent.run(max_steps=args.max_steps) + payload = _serialize_history(history) + + print( + json.dumps( + { + "success": not payload["has_errors"], + "model": model, + "cdp_url": cdp_url, + "task": args.task, + "result": payload, + }, + ensure_ascii=True, + ) + ) + return 0 if not payload["has_errors"] else 1 + + +def main() -> int: + parser = argparse.ArgumentParser(description="Run browser-use task") + parser.add_argument("--task", required=True, help="Natural language task for browser-use") + parser.add_argument("--start-url", default=None, help="Optional URL to open first") + parser.add_argument("--max-steps", type=int, default=20, help="Max agent steps") + parser.add_argument("--cdp-url", default=None, help="CDP URL (ws://... or http://.../json/version host)") + args = parser.parse_args() + return asyncio.run(_run(args)) + + +if __name__ == "__main__": + raise SystemExit(main()) + diff --git a/browser-use/scripts/setup.sh b/browser-use/scripts/setup.sh new file mode 100644 index 00000000..6be17146 --- /dev/null +++ b/browser-use/scripts/setup.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +VENV_DIR="${SCRIPT_DIR}/.venv" + +python3 -m venv "${VENV_DIR}" +"${VENV_DIR}/bin/pip" install --upgrade pip +"${VENV_DIR}/bin/pip" install -r "${SCRIPT_DIR}/requirements.txt" + +echo "browser-use skill environment is ready: ${VENV_DIR}" + From 72e9cc6ff4427f4e77f1ac45319190f4c221a5c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9A=D0=BE=D0=B1=D1=8B=D0=BB=D0=BA=D0=B5=D0=B2=D0=B8?= =?UTF-8?q?=D1=87=20=D0=A4=D1=91=D0=B4=D0=BE=D1=80?= Date: Fri, 27 Mar 2026 13:07:34 +0300 Subject: [PATCH 5/5] update skill --- .env.example | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.env.example b/.env.example index e69de29b..6aa5d4cf 100644 --- a/.env.example +++ b/.env.example @@ -0,0 +1,5 @@ +OPENAI_BASE_URL= +OPENAI_API_KEY= +HERMES_MAX_ITERATIONS= +TELEGRAM_BOT_TOKEN= 
+TERMINAL_ENV=