feat: add website blocklist enforcement for web/browser tools (#1064)
Adds security.website_blocklist config for user-managed domain blocking
across URL-capable tools. Enforced at the tool level (not monkey-patching)
so it's safe and predictable.
- tools/website_policy.py: shared policy loader with domain normalization,
wildcard support (*.tracking.example), shared file imports, and
structured block metadata
- web_extract: pre-fetch URL check + post-redirect recheck
- web_crawl: pre-crawl URL check + per-page URL recheck
- browser_navigate: pre-navigation URL check
- Blocked responses include blocked_by_policy metadata so the agent
can explain exactly what was denied
Config:
security:
website_blocklist:
enabled: true
domains: ["evil.com", "*.tracking.example"]
shared_files: ["team-blocklist.txt"]
Salvaged from PR #1086 by @kshitijk4poor. Browser post-redirect checks
deferred (browser_tool was fully rewritten since the PR branched).
Co-authored-by: kshitijk4poor <kshitijk4poor@users.noreply.github.com>
This commit is contained in:
parent
6020db0243
commit
30c417fe70
5 changed files with 758 additions and 2 deletions
|
|
@ -65,6 +65,7 @@ import requests
|
|||
from typing import Dict, Any, Optional, List
|
||||
from pathlib import Path
|
||||
from agent.auxiliary_client import call_llm
|
||||
from tools.website_policy import check_website_access
|
||||
from tools.browser_providers.base import CloudBrowserProvider
|
||||
from tools.browser_providers.browserbase import BrowserbaseProvider
|
||||
from tools.browser_providers.browser_use import BrowserUseProvider
|
||||
|
|
@ -901,6 +902,19 @@ def browser_navigate(url: str, task_id: Optional[str] = None) -> str:
|
|||
Returns:
|
||||
JSON string with navigation result (includes stealth features info on first nav)
|
||||
"""
|
||||
# Website policy check — block before navigating
|
||||
try:
|
||||
blocked = check_website_access(url)
|
||||
except Exception as _policy_err:
|
||||
return json.dumps({"success": False, "error": f"Website policy error: {_policy_err}"})
|
||||
if blocked:
|
||||
logger.info("Blocked browser_navigate to %s by rule %s", blocked["host"], blocked["rule"])
|
||||
return json.dumps({
|
||||
"success": False,
|
||||
"error": blocked["message"],
|
||||
"blocked_by_policy": {"host": blocked["host"], "rule": blocked["rule"], "source": blocked["source"]},
|
||||
})
|
||||
|
||||
effective_task_id = task_id or "default"
|
||||
|
||||
# Get session info to check if this is a new session
|
||||
|
|
|
|||
|
|
@ -49,6 +49,7 @@ from typing import List, Dict, Any, Optional
|
|||
from firecrawl import Firecrawl
|
||||
from agent.auxiliary_client import async_call_llm
|
||||
from tools.debug_helpers import DebugSession
|
||||
from tools.website_policy import WebsitePolicyError, check_website_access
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
|
@ -616,6 +617,21 @@ async def web_extract_tool(
|
|||
results.append({"url": url, "error": "Interrupted", "title": ""})
|
||||
continue
|
||||
|
||||
# Website policy check — block before fetching
|
||||
try:
|
||||
blocked = check_website_access(url)
|
||||
except WebsitePolicyError as policy_err:
|
||||
results.append({"url": url, "title": "", "content": "", "error": f"Website policy error: {policy_err}"})
|
||||
continue
|
||||
if blocked:
|
||||
logger.info("Blocked web_extract for %s by rule %s", blocked["host"], blocked["rule"])
|
||||
results.append({
|
||||
"url": url, "title": "", "content": "",
|
||||
"error": blocked["message"],
|
||||
"blocked_by_policy": {"host": blocked["host"], "rule": blocked["rule"], "source": blocked["source"]},
|
||||
})
|
||||
continue
|
||||
|
||||
try:
|
||||
logger.info("Scraping: %s", url)
|
||||
scrape_result = _get_firecrawl_client().scrape(
|
||||
|
|
@ -669,11 +685,26 @@ async def web_extract_tool(
|
|||
# Get title from metadata
|
||||
title = metadata.get("title", "")
|
||||
|
||||
# Re-check final URL after redirect
|
||||
final_url = metadata.get("sourceURL", url)
|
||||
try:
|
||||
final_blocked = check_website_access(final_url)
|
||||
except WebsitePolicyError:
|
||||
final_blocked = None
|
||||
if final_blocked:
|
||||
logger.info("Blocked redirected web_extract for %s by rule %s", final_blocked["host"], final_blocked["rule"])
|
||||
results.append({
|
||||
"url": final_url, "title": title, "content": "", "raw_content": "",
|
||||
"error": final_blocked["message"],
|
||||
"blocked_by_policy": {"host": final_blocked["host"], "rule": final_blocked["rule"], "source": final_blocked["source"]},
|
||||
})
|
||||
continue
|
||||
|
||||
# Choose content based on requested format
|
||||
chosen_content = content_markdown if (format == "markdown" or (format is None and content_markdown)) else content_html or content_markdown or ""
|
||||
|
||||
results.append({
|
||||
"url": metadata.get("sourceURL", url),
|
||||
"url": final_url,
|
||||
"title": title,
|
||||
"content": chosen_content,
|
||||
"raw_content": chosen_content,
|
||||
|
|
@ -778,6 +809,7 @@ async def web_extract_tool(
|
|||
"title": r.get("title", ""),
|
||||
"content": r.get("content", ""),
|
||||
"error": r.get("error"),
|
||||
**({ "blocked_by_policy": r["blocked_by_policy"]} if "blocked_by_policy" in r else {}),
|
||||
}
|
||||
for r in response.get("results", [])
|
||||
]
|
||||
|
|
@ -870,6 +902,16 @@ async def web_crawl_tool(
|
|||
instructions_text = f" with instructions: '{instructions}'" if instructions else ""
|
||||
logger.info("Crawling %s%s", url, instructions_text)
|
||||
|
||||
# Website policy check — block before crawling
|
||||
try:
|
||||
blocked = check_website_access(url)
|
||||
except WebsitePolicyError as policy_err:
|
||||
return json.dumps({"results": [{"url": url, "title": "", "content": "", "error": f"Website policy error: {policy_err}"}]}, ensure_ascii=False)
|
||||
if blocked:
|
||||
logger.info("Blocked web_crawl for %s by rule %s", blocked["host"], blocked["rule"])
|
||||
return json.dumps({"results": [{"url": url, "title": "", "content": "", "error": blocked["message"],
|
||||
"blocked_by_policy": {"host": blocked["host"], "rule": blocked["rule"], "source": blocked["source"]}}]}, ensure_ascii=False)
|
||||
|
||||
# Use Firecrawl's v2 crawl functionality
|
||||
# Docs: https://docs.firecrawl.dev/features/crawl
|
||||
# The crawl() method automatically waits for completion and returns all data
|
||||
|
|
@ -975,6 +1017,20 @@ async def web_crawl_tool(
|
|||
page_url = metadata.get("sourceURL", metadata.get("url", "Unknown URL"))
|
||||
title = metadata.get("title", "")
|
||||
|
||||
# Re-check crawled page URL against policy
|
||||
try:
|
||||
page_blocked = check_website_access(page_url)
|
||||
except WebsitePolicyError:
|
||||
page_blocked = None
|
||||
if page_blocked:
|
||||
logger.info("Blocked crawled page %s by rule %s", page_blocked["host"], page_blocked["rule"])
|
||||
pages.append({
|
||||
"url": page_url, "title": title, "content": "", "raw_content": "",
|
||||
"error": page_blocked["message"],
|
||||
"blocked_by_policy": {"host": page_blocked["host"], "rule": page_blocked["rule"], "source": page_blocked["source"]},
|
||||
})
|
||||
continue
|
||||
|
||||
# Choose content (prefer markdown)
|
||||
content = content_markdown or content_html or ""
|
||||
|
||||
|
|
@ -1070,9 +1126,11 @@ async def web_crawl_tool(
|
|||
# Trim output to minimal fields per entry: title, content, error
|
||||
trimmed_results = [
|
||||
{
|
||||
"url": r.get("url", ""),
|
||||
"title": r.get("title", ""),
|
||||
"content": r.get("content", ""),
|
||||
"error": r.get("error")
|
||||
"error": r.get("error"),
|
||||
**({ "blocked_by_policy": r["blocked_by_policy"]} if "blocked_by_policy" in r else {}),
|
||||
}
|
||||
for r in response.get("results", [])
|
||||
]
|
||||
|
|
|
|||
193
tools/website_policy.py
Normal file
193
tools/website_policy.py
Normal file
|
|
@ -0,0 +1,193 @@
|
|||
"""Website access policy helpers for URL-capable tools.
|
||||
|
||||
This module loads a user-managed website blocklist from ~/.hermes/config.yaml
|
||||
and optional shared list files. It is intentionally lightweight so web/browser
|
||||
tools can enforce URL policy without pulling in the heavier CLI config stack.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import fnmatch
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import yaml
|
||||
|
||||
|
||||
_DEFAULT_WEBSITE_BLOCKLIST = {
|
||||
"enabled": True,
|
||||
"domains": [],
|
||||
"shared_files": [],
|
||||
}
|
||||
|
||||
|
||||
def _get_hermes_home() -> Path:
|
||||
return Path(os.getenv("HERMES_HOME", Path.home() / ".hermes"))
|
||||
|
||||
|
||||
def _get_default_config_path() -> Path:
    """Return the default config file path: ``<hermes home>/config.yaml``."""
    home = _get_hermes_home()
    return home / "config.yaml"
|
||||
|
||||
|
||||
class WebsitePolicyError(Exception):
    """Signals a malformed blocklist config or an unreadable shared list file."""
|
||||
|
||||
|
||||
def _normalize_host(host: str) -> str:
|
||||
return (host or "").strip().lower().rstrip(".")
|
||||
|
||||
|
||||
def _normalize_rule(rule: Any) -> Optional[str]:
|
||||
if not isinstance(rule, str):
|
||||
return None
|
||||
value = rule.strip().lower()
|
||||
if not value or value.startswith("#"):
|
||||
return None
|
||||
if "://" in value:
|
||||
parsed = urlparse(value)
|
||||
value = parsed.netloc or parsed.path
|
||||
value = value.split("/", 1)[0].strip().rstrip(".")
|
||||
if value.startswith("www."):
|
||||
value = value[4:]
|
||||
return value or None
|
||||
|
||||
|
||||
def _iter_blocklist_file_rules(path: Path) -> List[str]:
    """Read *path* and return its normalized, non-comment blocklist rules.

    Blank lines and ``#`` comment lines are ignored; remaining lines are run
    through :func:`_normalize_rule` and kept only when they normalize to a
    non-empty pattern.

    Raises:
        WebsitePolicyError: if the file is missing, unreadable, or not UTF-8.
    """
    try:
        text = path.read_text(encoding="utf-8")
    except FileNotFoundError as exc:
        raise WebsitePolicyError(f"Shared blocklist file not found: {path}") from exc
    except (OSError, UnicodeDecodeError) as exc:
        raise WebsitePolicyError(f"Failed to read shared blocklist file {path}: {exc}") from exc

    collected: List[str] = []
    for raw_line in text.splitlines():
        candidate = raw_line.strip()
        if not candidate or candidate.startswith("#"):
            continue  # blank lines and comments carry no rule
        rule = _normalize_rule(candidate)
        if rule:
            collected.append(rule)
    return collected
|
||||
|
||||
|
||||
def _load_policy_config(config_path: Optional[Path] = None) -> Dict[str, Any]:
    """Load the raw ``security.website_blocklist`` mapping from the config file.

    Returns the built-in defaults (merged with whatever keys the config
    provides) when the file exists, or an unmodified copy of the defaults
    when it does not.

    Raises:
        WebsitePolicyError: on unreadable files, invalid YAML, or a config
            whose root / ``security`` / ``website_blocklist`` nodes are not
            mappings.
    """
    if config_path is None:
        config_path = _get_default_config_path()
    if not config_path.exists():
        return dict(_DEFAULT_WEBSITE_BLOCKLIST)

    try:
        with open(config_path, encoding="utf-8") as handle:
            loaded = yaml.safe_load(handle) or {}
    except yaml.YAMLError as exc:
        raise WebsitePolicyError(f"Invalid config YAML at {config_path}: {exc}") from exc
    except OSError as exc:
        raise WebsitePolicyError(f"Failed to read config file {config_path}: {exc}") from exc
    if not isinstance(loaded, dict):
        raise WebsitePolicyError("config root must be a mapping")

    # An explicit `security:` key with a null value is treated as empty;
    # any other non-mapping value is a config error.
    security = loaded.get("security", {})
    if security is None:
        security = {}
    if not isinstance(security, dict):
        raise WebsitePolicyError("security must be a mapping")

    blocklist_section = security.get("website_blocklist", {})
    if blocklist_section is None:
        blocklist_section = {}
    if not isinstance(blocklist_section, dict):
        raise WebsitePolicyError("security.website_blocklist must be a mapping")

    merged = dict(_DEFAULT_WEBSITE_BLOCKLIST)
    merged.update(blocklist_section)
    return merged
|
||||
|
||||
|
||||
def load_website_blocklist(config_path: Optional[Path] = None) -> Dict[str, Any]:
    """Build the effective blocklist: ``{"enabled": bool, "rules": [...]}``.

    Each rule is ``{"pattern": <normalized domain or *.wildcard>, "source":
    "config" or the path of the shared file it came from}``. Duplicate
    (source, pattern) pairs are dropped while preserving first-seen order.
    Relative ``shared_files`` entries are resolved against the Hermes home
    directory.

    Raises:
        WebsitePolicyError: on malformed config values or unreadable shared
            blocklist files.
    """
    config_path = config_path or _get_default_config_path()
    policy = _load_policy_config(config_path)

    domains = policy.get("domains", []) or []
    if not isinstance(domains, list):
        raise WebsitePolicyError("security.website_blocklist.domains must be a list")

    shared_files = policy.get("shared_files", []) or []
    if not isinstance(shared_files, list):
        raise WebsitePolicyError("security.website_blocklist.shared_files must be a list")

    enabled = policy.get("enabled", True)
    if not isinstance(enabled, bool):
        raise WebsitePolicyError("security.website_blocklist.enabled must be a boolean")

    rules: List[Dict[str, str]] = []
    seen: set[Tuple[str, str]] = set()

    def _add(source: str, pattern: Optional[str]) -> None:
        # Append unless empty or already recorded for this source.
        if not pattern:
            return
        key = (source, pattern)
        if key not in seen:
            seen.add(key)
            rules.append({"pattern": pattern, "source": source})

    for entry in domains:
        _add("config", _normalize_rule(entry))

    for entry in shared_files:
        if not isinstance(entry, str) or not entry.strip():
            continue  # silently skip non-string / blank filenames
        list_path = Path(entry).expanduser()
        if not list_path.is_absolute():
            list_path = (_get_hermes_home() / list_path).resolve()
        for pattern in _iter_blocklist_file_rules(list_path):
            _add(str(list_path), pattern)

    return {"enabled": enabled, "rules": rules}
|
||||
|
||||
|
||||
def _match_host_against_rule(host: str, pattern: str) -> bool:
|
||||
if not host or not pattern:
|
||||
return False
|
||||
if pattern.startswith("*."):
|
||||
return fnmatch.fnmatch(host, pattern)
|
||||
return host == pattern or host.endswith(f".{pattern}")
|
||||
|
||||
|
||||
def _extract_host_from_urlish(url: str) -> str:
    """Best-effort extraction of a normalized hostname from a URL-ish string.

    Tries a regular parse first; a scheme-less input like ``example.com/path``
    parses as a bare path, so it is retried with a ``//`` authority prefix.
    Returns ``""`` when no host can be found.
    """
    first_pass = urlparse(url)
    hostname = _normalize_host(first_pass.hostname or first_pass.netloc)
    if hostname:
        return hostname

    if "://" not in url:
        second_pass = urlparse(f"//{url}")
        hostname = _normalize_host(second_pass.hostname or second_pass.netloc)
        if hostname:
            return hostname

    return ""
|
||||
|
||||
|
||||
def check_website_access(url: str, config_path: Optional[Path] = None) -> Optional[Dict[str, str]]:
    """Check *url* against the user's website blocklist.

    Returns ``None`` when access is allowed (no extractable host, blocking
    disabled, or no rule matched). Otherwise returns a dict with ``url``,
    ``host``, ``rule``, ``source`` and a human-readable ``message``
    describing exactly what was denied.

    Raises:
        WebsitePolicyError: if the policy configuration is malformed
            (propagated from :func:`load_website_blocklist`).
    """
    host = _extract_host_from_urlish(url)
    if not host:
        return None

    # NOTE(review): the policy is re-loaded from disk on every call; callers
    # that check many URLs (e.g. per crawled page) pay a file read each time.
    policy = load_website_blocklist(config_path)
    if not policy.get("enabled"):
        return None

    for entry in policy.get("rules", []):
        pattern = entry.get("pattern", "")
        if not _match_host_against_rule(host, pattern):
            continue
        source = entry.get("source", "config")
        return {
            "url": url,
            "host": host,
            "rule": pattern,
            "source": source,
            "message": (
                f"Blocked by website policy: '{host}' matched rule '{pattern}'"
                f" from {source}"
            ),
        }
    return None
|
||||
Loading…
Add table
Add a link
Reference in a new issue