diff --git a/.gitignore b/.gitignore index b64c303f..acebf673 100644 --- a/.gitignore +++ b/.gitignore @@ -60,6 +60,7 @@ SOLUTION_SUMMARY.md BROWSER_USE_QUICKSTART.md BROWSER_USE_SETUP.md START_HERE.md +GUI_BROWSER_SETUP.md */config.yaml diff --git a/Dockerfile b/Dockerfile index 6fdb458a..bc788f96 100644 --- a/Dockerfile +++ b/Dockerfile @@ -26,4 +26,4 @@ RUN mkdir -p /root/.hermes/skills /root/.hermes/memories /root/.hermes/sessions WORKDIR /workspace -CMD ["bash"] \ No newline at end of file +CMD ["hermes", "gateway"] \ No newline at end of file diff --git a/browser-use/SKILL.md b/browser-use/SKILL.md new file mode 100644 index 00000000..02bbb467 --- /dev/null +++ b/browser-use/SKILL.md @@ -0,0 +1,94 @@ +--- +name: browser-use +version: "1.0.0" +description: Use browser-use with a Chromium CDP endpoint to perform web tasks from Hermes. +triggers: + - "browser-use" + - "open website and extract" + - "automate browser task" + - "run browser task" +allowed-tools: + - terminal + - file + - memory +--- + +# Browser Use (Chromium) + +This skill runs browser tasks via `browser-use` and connects to Chromium through CDP. 
+
+## Prerequisites
+
+- `hermes-agent` container is running
+- `chromium` service is running in `docker-compose`
+- `OPENAI_API_KEY` is present in container env (via `docker-compose` `env_file`)
+- If running outside container, set `OPENAI_API_KEY` in your shell or `.env`
+
+## Troubleshooting Environment Setup
+
+If you get `{"success": false, "error": "OPENAI_API_KEY is not set"}`:
+
+```bash
+docker compose exec -T hermes-agent python - <<'PY'
+import os
+print('OPENAI_API_KEY', 'set' if os.getenv('OPENAI_API_KEY') else 'missing')
+print('OPENAI_BASE_URL', 'set' if os.getenv('OPENAI_BASE_URL') else 'missing')
+PY
+```
+
+If `OPENAI_API_KEY` is missing, ensure the key exists in one of the env files used by compose:
+- `workspace/.env`
+- `hermes_data/.env`
+
+Then recreate the container:
+
+```bash
+docker compose up -d hermes-agent
+```
+
+```bash
+# Optional overrides when running outside Docker
+export OPENAI_API_KEY="your-api-key-here"
+export BROWSER_USE_CDP_URL="ws://chromium:3000/chromium?token=hermes-local"
+```
+
+**Common failure:** `{"success": false, "error": "OPENAI_API_KEY is not set"}`
+- Cause: key is absent in container env
+- Fix: add key to `workspace/.env` or `hermes_data/.env`, then `docker compose up -d hermes-agent`
+
+**Common failure:** 401 `key_model_access_denied`
+- Cause: API key cannot access configured model (for example `gpt-4o-mini`)
+- Fix: set allowed model via `BROWSER_USE_MODEL` (or `OPENAI_MODEL`) to a model your provider key can use
+
+**Common failure:** Connection refused to `chromium`
+- Cause: Browser not running or CDP endpoint wrong
+- Fix: Check `docker-compose ps` and verify `chromium` service is up
+
+## Quick start
+
+```bash
+python-browser-use /root/.hermes/skills/autonomous-ai-agents/browser-use/scripts/run_browser_use.py \
+  --task "Open example.com and return page title" \
+  --max-steps 8
+```
+
+## How to use in Hermes
+
+When the user asks for website automation:
+
+```bash
+python-browser-use /root/.hermes/skills/autonomous-ai-agents/browser-use/scripts/run_browser_use.py \
+  --task "<task>" \
+  --max-steps 20
+```
+
+If the user gives a start URL, pass `--start-url`.
+
+## Notes
+
+- Default CDP URL when `BROWSER_USE_CDP_URL` is unset: `ws://chromium:3000/chromium?token=hermes-local`
+- Override by setting `BROWSER_USE_CDP_URL`
+- Runtime Python: `BROWSER_USE_PYTHON` (defaults to `python-browser-use`)
+- The script outputs JSON for easy parsing
+
+
diff --git a/browser-use/assets/config.example.json b/browser-use/assets/config.example.json
new file mode 100644
index 00000000..e8e3e146
--- /dev/null
+++ b/browser-use/assets/config.example.json
@@ -0,0 +1,16 @@
+{
+  "browser": {
+    "cdp_url": "ws://chromium:3000/chromium?token=hermes-local",
+    "headless": true,
+    "timeout": 120000
+  },
+  "agent": {
+    "model_env": "BROWSER_USE_MODEL",
+    "max_steps": 20,
+    "use_vision": false
+  },
+  "logging": {
+    "level": "info"
+  }
+}
+
diff --git a/browser-use/assets/config.json b/browser-use/assets/config.json
new file mode 100644
index 00000000..8f355553
--- /dev/null
+++ b/browser-use/assets/config.json
@@ -0,0 +1,16 @@
+{
+  "browser": {
+    "cdp_url": "ws://chromium:3000/chromium?token=hermes-local",
+    "headless": true,
+    "timeout": 120000
+  },
+  "agent": {
+    "model_env": "BROWSER_USE_MODEL",
+    "max_steps": 20,
+    "use_vision": false
+  },
+  "logging": {
+    "level": "info"
+  }
+}
+
diff --git a/browser-use/scripts/requirements.txt b/browser-use/scripts/requirements.txt
new file mode 100644
index 00000000..33650044
--- /dev/null
+++ b/browser-use/scripts/requirements.txt
@@ -0,0 +1,2 @@
+browser-use==0.12.5
+
diff --git a/browser-use/scripts/run_browser_use.py b/browser-use/scripts/run_browser_use.py
new file mode 100644
index 00000000..401e52fc
--- /dev/null
+++ b/browser-use/scripts/run_browser_use.py
@@ -0,0 +1,222 @@
+#!/usr/bin/env python3
+"""Run browser-use task against a Chromium CDP endpoint."""
+
+import argparse
+import asyncio
+import json
+import os
+import socket
+from pathlib import Path
+from typing import Any
+from urllib.parse import urlparse, urlunparse
+from urllib.request import urlopen
+
+from browser_use import Agent, BrowserSession
+from browser_use.llm import ChatOpenAI
+
+
+# .env files consulted, in order, when a variable is missing from the process env.
+ENV_FALLBACK_PATHS = (
+    Path("/workspace/.env"),
+    Path("/workspace/workspace/.env"),
+    Path("/root/.hermes/.env"),
+)
+
+# Hosts that are queried as-is instead of being resolved to a numeric address.
+_LOCAL_HOSTS = frozenset({"localhost", "127.0.0.1", "0.0.0.0"})
+
+
+def _read_env_from_files(name: str) -> str | None:
+    """Return the first non-empty value of ``name`` found in ENV_FALLBACK_PATHS.
+
+    Only plain ``KEY=value`` lines are understood; blank lines, ``#`` comments
+    and lines without ``=`` are skipped. Missing or unreadable files are ignored.
+    """
+    for env_path in ENV_FALLBACK_PATHS:
+        try:
+            text = env_path.read_text(encoding="utf-8")
+        except (OSError, UnicodeDecodeError):
+            # Best effort: a missing or undecodable file must not abort the lookup.
+            continue
+        for raw_line in text.splitlines():
+            line = raw_line.strip()
+            if not line or line.startswith("#") or "=" not in line:
+                continue
+            key, value = line.split("=", 1)
+            if key.strip() != name:
+                continue
+            cleaned = value.strip().strip('"').strip("'")
+            # An empty assignment must not hide a real value defined later.
+            if cleaned:
+                return cleaned
+    return None
+
+
+def _get_env(name: str, default: str | None = None) -> str | None:
+    """Look up ``name`` in the process env, then in the fallback .env files.
+
+    Empty values count as unset. Returns ``default`` (or None) if nothing is found.
+    """
+    return os.getenv(name) or _read_env_from_files(name) or default or None
+
+
+def _build_task(task: str, start_url: str | None) -> str:
+    """Prefix ``task`` with a start-URL instruction when ``start_url`` is given."""
+    if not start_url:
+        return task
+    return f"Start from {start_url}. Task: {task}"
+
+
+def _serialize_history(history: Any) -> dict[str, Any]:
+    """Reduce an agent history object to JSON-serializable result and error fields.
+
+    ``final_result()`` and ``errors()`` are optional on ``history``; a missing or
+    failing accessor yields an empty value rather than an exception.
+    """
+    result = ""
+    errors: list[str] = []
+    if hasattr(history, "final_result"):
+        try:
+            result = history.final_result() or ""
+        except Exception:
+            result = ""
+    if hasattr(history, "errors"):
+        try:
+            raw_errors = list(history.errors())
+            errors = [str(e) for e in raw_errors if e]
+        except Exception:
+            errors = []
+    return {
+        "final_result": result,
+        "errors": errors,
+        "has_errors": bool(errors),
+    }
+
+
+def _numeric_host(host: str) -> str | None:
+    """Return the IPv4 address for ``host``, or None when it cannot be resolved."""
+    try:
+        return socket.gethostbyname(host)
+    except OSError:
+        return None
+
+
+def _resolve_cdp_url(cdp_url: str) -> str:
+    """Turn a CDP endpoint into the websocket URL handed to BrowserSession.
+
+    ``ws://``/``wss://`` URLs are returned unchanged. ``http(s)://`` URLs are
+    treated as a DevTools HTTP endpoint: ``/json/version`` is fetched and its
+    ``webSocketDebuggerUrl`` is returned.
+
+    Raises:
+        RuntimeError: unsupported scheme, or no ``webSocketDebuggerUrl`` in the reply.
+        OSError: the endpoint could not be reached (``URLError`` is an ``OSError``).
+        ValueError: the endpoint did not return valid JSON.
+    """
+    if cdp_url.startswith(("ws://", "wss://")):
+        return cdp_url
+    if not cdp_url.startswith(("http://", "https://")):
+        raise RuntimeError(f"Unsupported CDP URL scheme: {cdp_url}")
+
+    parsed = urlparse(cdp_url)
+    host = parsed.hostname or ""
+    port = parsed.port
+
+    # Chrome DevTools rejects non-IP/non-localhost Host headers in some setups.
+    # For docker service names, resolve to IP and query via numeric host.
+    resolved_host = _numeric_host(host) if host and host not in _LOCAL_HOSTS else None
+    if resolved_host:
+        netloc = resolved_host if not port else f"{resolved_host}:{port}"
+        parsed = parsed._replace(netloc=netloc)
+
+    # Extend the path component so a query string (e.g. ?token=...) stays intact.
+    path = parsed.path.rstrip("/")
+    if not path.endswith("/json/version"):
+        path = f"{path}/json/version"
+    version_url = urlunparse(parsed._replace(path=path))
+
+    with urlopen(version_url, timeout=10) as response:  # nosec B310
+        payload = json.loads(response.read().decode("utf-8"))
+    ws_url = payload.get("webSocketDebuggerUrl")
+    if not ws_url:
+        raise RuntimeError(f"CDP endpoint did not return webSocketDebuggerUrl: {version_url}")
+
+    # Keep a reachable host for ws:// URL when input used docker DNS alias.
+    if resolved_host:
+        ws_parsed = urlparse(str(ws_url))
+        ws_port = ws_parsed.port
+        if ws_port is None:
+            ws_port = 443 if ws_parsed.scheme == "wss" else 80
+        ws_url = urlunparse(ws_parsed._replace(netloc=f"{resolved_host}:{ws_port}"))
+
+    return str(ws_url)
+
+
+async def _run(args: argparse.Namespace) -> int:
+    """Run the agent and print a single JSON document describing the outcome.
+
+    Returns the exit code: 0 on success, 1 when the agent reported errors,
+    2 when the API key is missing or the CDP endpoint cannot be resolved.
+    """
+    api_key = _get_env("OPENAI_API_KEY")
+    if not api_key:
+        print(json.dumps({"success": False, "error": "OPENAI_API_KEY is not set"}))
+        return 2
+
+    model = _get_env("BROWSER_USE_MODEL", _get_env("OPENAI_MODEL", "gpt-4o-mini"))
+    base_url = _get_env("OPENAI_BASE_URL")
+    raw_cdp_url = args.cdp_url or _get_env("BROWSER_USE_CDP_URL", "ws://chromium:3000/chromium?token=hermes-local")
+    try:
+        cdp_url = _resolve_cdp_url(raw_cdp_url)
+    except (OSError, RuntimeError, ValueError) as exc:
+        # Callers parse stdout as JSON, so report endpoint failures the same way.
+        print(json.dumps({"success": False, "error": f"Cannot resolve CDP URL: {exc}"}))
+        return 2
+
+    llm = ChatOpenAI(
+        model=model,
+        api_key=api_key,
+        base_url=base_url,
+        temperature=0.0,
+    )
+
+    browser_session = BrowserSession(cdp_url=cdp_url)
+    agent = Agent(
+        task=_build_task(args.task, args.start_url),
+        llm=llm,
+        browser_session=browser_session,
+        use_vision=False,
+    )
+
+    history = await agent.run(max_steps=args.max_steps)
+    payload = _serialize_history(history)
+
+    print(
+        json.dumps(
+            {
+                "success": not payload["has_errors"],
+                "model": model,
+                "cdp_url": cdp_url,
+                "task": args.task,
+                "result": payload,
+            },
+            ensure_ascii=True,
+        )
+    )
+    return 0 if not payload["has_errors"] else 1
+
+
+def main() -> int:
+    """Parse command-line arguments and run the task; returns the exit code."""
+    parser = argparse.ArgumentParser(description="Run browser-use task")
+    parser.add_argument("--task", required=True, help="Natural language task for browser-use")
+    parser.add_argument("--start-url", default=None, help="Optional URL to open first")
+    parser.add_argument("--max-steps", type=int, default=20, help="Max agent steps")
+    parser.add_argument("--cdp-url", default=None, help="CDP URL (ws://... or http://.../json/version host)")
+    args = parser.parse_args()
+    return asyncio.run(_run(args))
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
+
diff --git a/browser-use/scripts/setup.sh b/browser-use/scripts/setup.sh
new file mode 100644
index 00000000..6be17146
--- /dev/null
+++ b/browser-use/scripts/setup.sh
@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+VENV_DIR="${SCRIPT_DIR}/.venv"
+
+python3 -m venv "${VENV_DIR}"
+"${VENV_DIR}/bin/pip" install --upgrade pip
+"${VENV_DIR}/bin/pip" install -r "${SCRIPT_DIR}/requirements.txt"
+
+echo "browser-use skill environment is ready: ${VENV_DIR}"
+
diff --git a/docker-compose.yml b/docker-compose.yml
index e8104886..61496bbc 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -11,6 +11,22 @@ services:
     ports:
       - "3000:3000"
 
+  chromium-gui:
+    build:
+      context: ./docker/chromium-gui
+    container_name: hermes-chromium-gui
+    restart: unless-stopped
+    shm_size: 1gb
+    ports:
+      - "127.0.0.1:6080:6080"
+      - "127.0.0.1:5900:5900"
+      - "127.0.0.1:9223:9223"
+    healthcheck:
+      test: [ "CMD", "curl", "-fsS", "http://localhost:9223/json/version" ]
+      interval: 10s
+      timeout: 5s
+      retries: 6
+
   hermes-agent:
     build: .
container_name: hermes-agent @@ -19,13 +35,15 @@ services: tty: true depends_on: - chromium + - chromium-gui env_file: - - .env + - ./workspace/.env + - ./hermes_data/.env environment: - - BROWSER_USE_CDP_URL=ws://chromium:3000/playwright?token=hermes-local + - BROWSER_USE_CDP_URL=${BROWSER_USE_CDP_URL:-http://chromium-gui:9223} - BROWSER_USE_PYTHON=/opt/browser-use-venv/bin/python volumes: - ./workspace:/workspace - ./hermes_data:/root/.hermes working_dir: /workspace - command: tail -f /dev/null \ No newline at end of file + command: [ "hermes", "gateway" ] \ No newline at end of file diff --git a/docker/chromium-gui/Dockerfile b/docker/chromium-gui/Dockerfile new file mode 100644 index 00000000..9a282a60 --- /dev/null +++ b/docker/chromium-gui/Dockerfile @@ -0,0 +1,21 @@ +FROM debian:bookworm-slim + +RUN apt-get update && apt-get install -y --no-install-recommends \ + chromium \ + xvfb \ + x11vnc \ + fluxbox \ + novnc \ + websockify \ + socat \ + ca-certificates \ + curl \ + && rm -rf /var/lib/apt/lists/* + +COPY start.sh /usr/local/bin/start-gui-chromium.sh +RUN chmod +x /usr/local/bin/start-gui-chromium.sh + +EXPOSE 6080 5900 9222 + +CMD ["/usr/local/bin/start-gui-chromium.sh"] + diff --git a/docker/chromium-gui/README.md b/docker/chromium-gui/README.md new file mode 100644 index 00000000..6b9fac2c --- /dev/null +++ b/docker/chromium-gui/README.md @@ -0,0 +1,26 @@ +# Chromium GUI Service + +This container provides a full Chromium GUI with three interfaces: + +- noVNC web UI: `http://localhost:6080/vnc.html` +- VNC: `localhost:5900` +- CDP endpoint: `http://localhost:9223/json/version` + +## Run + +```bash +docker compose --profile gui up -d chromium-gui +``` + +## Use with browser-use + +Pass the GUI CDP endpoint as HTTP URL (the runner resolves it to a websocket automatically): + +```bash +docker compose exec -T hermes-agent python-browser-use \ + /root/.hermes/skills/autonomous-ai-agents/browser-use/scripts/run_browser_use.py \ + --cdp-url 
http://chromium-gui:9223 \ + --task "Open example.com and return page title" \ + --max-steps 5 +``` + diff --git a/docker/chromium-gui/start.sh b/docker/chromium-gui/start.sh new file mode 100644 index 00000000..007789e2 --- /dev/null +++ b/docker/chromium-gui/start.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash +set -euo pipefail + +export DISPLAY=:99 +SCREEN_RESOLUTION="${SCREEN_RESOLUTION:-1920x1080x24}" + +# Clean stale X lock/socket from previous crashed runs in the same container. +rm -f /tmp/.X99-lock +rm -f /tmp/.X11-unix/X99 + +Xvfb :99 -screen 0 "$SCREEN_RESOLUTION" -ac +extension RANDR & +fluxbox >/tmp/fluxbox.log 2>&1 & + +x11vnc -display :99 -forever -shared -rfbport 5900 -nopw >/tmp/x11vnc.log 2>&1 & +websockify --web=/usr/share/novnc/ 6080 localhost:5900 >/tmp/novnc.log 2>&1 & +socat TCP-LISTEN:9223,fork,bind=0.0.0.0 TCP:127.0.0.1:9222 >/tmp/socat.log 2>&1 & + +exec chromium \ + --no-sandbox \ + --disable-dev-shm-usage \ + --disable-gpu \ + --disable-setuid-sandbox \ + --remote-debugging-address=0.0.0.0 \ + --remote-debugging-port=9222 \ + --user-data-dir=/tmp/chromium-profile \ + --window-size=1920,1080 \ + --no-first-run \ + --no-default-browser-check \ + about:blank >/tmp/chromium.log 2>&1 +