feat: add website blocklist enforcement for web/browser tools (#1064)

Adds security.website_blocklist config for user-managed domain blocking
across URL-capable tools. Enforced at the tool level (not monkey-patching)
so it's safe and predictable.

- tools/website_policy.py: shared policy loader with domain normalization,
  wildcard support (*.tracking.example), shared file imports, and
  structured block metadata
- web_extract: pre-fetch URL check + post-redirect recheck
- web_crawl: pre-crawl URL check + per-page URL recheck
- browser_navigate: pre-navigation URL check
- Blocked responses include blocked_by_policy metadata so the agent
  can explain exactly what was denied

Config:
  security:
    website_blocklist:
      enabled: true
      domains: ["evil.com", "*.tracking.example"]
      shared_files: ["team-blocklist.txt"]

Salvaged from PR #1086 by @kshitijk4poor. Browser post-redirect checks
deferred (browser_tool was fully rewritten since the PR branched).

Co-authored-by: kshitijk4poor <kshitijk4poor@users.noreply.github.com>
This commit is contained in:
teknium1 2026-03-17 02:59:28 -07:00
parent 6020db0243
commit 30c417fe70
5 changed files with 758 additions and 2 deletions

View file

@ -65,6 +65,7 @@ import requests
from typing import Dict, Any, Optional, List
from pathlib import Path
from agent.auxiliary_client import call_llm
from tools.website_policy import check_website_access
from tools.browser_providers.base import CloudBrowserProvider
from tools.browser_providers.browserbase import BrowserbaseProvider
from tools.browser_providers.browser_use import BrowserUseProvider
@ -901,6 +902,19 @@ def browser_navigate(url: str, task_id: Optional[str] = None) -> str:
Returns:
JSON string with navigation result (includes stealth features info on first nav)
"""
# Website policy check — block before navigating
try:
blocked = check_website_access(url)
except Exception as _policy_err:
return json.dumps({"success": False, "error": f"Website policy error: {_policy_err}"})
if blocked:
logger.info("Blocked browser_navigate to %s by rule %s", blocked["host"], blocked["rule"])
return json.dumps({
"success": False,
"error": blocked["message"],
"blocked_by_policy": {"host": blocked["host"], "rule": blocked["rule"], "source": blocked["source"]},
})
effective_task_id = task_id or "default"
# Get session info to check if this is a new session

View file

@ -49,6 +49,7 @@ from typing import List, Dict, Any, Optional
from firecrawl import Firecrawl
from agent.auxiliary_client import async_call_llm
from tools.debug_helpers import DebugSession
from tools.website_policy import WebsitePolicyError, check_website_access
logger = logging.getLogger(__name__)
@ -616,6 +617,21 @@ async def web_extract_tool(
results.append({"url": url, "error": "Interrupted", "title": ""})
continue
# Website policy check — block before fetching
try:
blocked = check_website_access(url)
except WebsitePolicyError as policy_err:
results.append({"url": url, "title": "", "content": "", "error": f"Website policy error: {policy_err}"})
continue
if blocked:
logger.info("Blocked web_extract for %s by rule %s", blocked["host"], blocked["rule"])
results.append({
"url": url, "title": "", "content": "",
"error": blocked["message"],
"blocked_by_policy": {"host": blocked["host"], "rule": blocked["rule"], "source": blocked["source"]},
})
continue
try:
logger.info("Scraping: %s", url)
scrape_result = _get_firecrawl_client().scrape(
@ -669,11 +685,26 @@ async def web_extract_tool(
# Get title from metadata
title = metadata.get("title", "")
# Re-check final URL after redirect
final_url = metadata.get("sourceURL", url)
try:
final_blocked = check_website_access(final_url)
except WebsitePolicyError:
final_blocked = None
if final_blocked:
logger.info("Blocked redirected web_extract for %s by rule %s", final_blocked["host"], final_blocked["rule"])
results.append({
"url": final_url, "title": title, "content": "", "raw_content": "",
"error": final_blocked["message"],
"blocked_by_policy": {"host": final_blocked["host"], "rule": final_blocked["rule"], "source": final_blocked["source"]},
})
continue
# Choose content based on requested format
chosen_content = content_markdown if (format == "markdown" or (format is None and content_markdown)) else content_html or content_markdown or ""
results.append({
"url": metadata.get("sourceURL", url),
"url": final_url,
"title": title,
"content": chosen_content,
"raw_content": chosen_content,
@ -778,6 +809,7 @@ async def web_extract_tool(
"title": r.get("title", ""),
"content": r.get("content", ""),
"error": r.get("error"),
**({ "blocked_by_policy": r["blocked_by_policy"]} if "blocked_by_policy" in r else {}),
}
for r in response.get("results", [])
]
@ -870,6 +902,16 @@ async def web_crawl_tool(
instructions_text = f" with instructions: '{instructions}'" if instructions else ""
logger.info("Crawling %s%s", url, instructions_text)
# Website policy check — block before crawling
try:
blocked = check_website_access(url)
except WebsitePolicyError as policy_err:
return json.dumps({"results": [{"url": url, "title": "", "content": "", "error": f"Website policy error: {policy_err}"}]}, ensure_ascii=False)
if blocked:
logger.info("Blocked web_crawl for %s by rule %s", blocked["host"], blocked["rule"])
return json.dumps({"results": [{"url": url, "title": "", "content": "", "error": blocked["message"],
"blocked_by_policy": {"host": blocked["host"], "rule": blocked["rule"], "source": blocked["source"]}}]}, ensure_ascii=False)
# Use Firecrawl's v2 crawl functionality
# Docs: https://docs.firecrawl.dev/features/crawl
# The crawl() method automatically waits for completion and returns all data
@ -975,6 +1017,20 @@ async def web_crawl_tool(
page_url = metadata.get("sourceURL", metadata.get("url", "Unknown URL"))
title = metadata.get("title", "")
# Re-check crawled page URL against policy
try:
page_blocked = check_website_access(page_url)
except WebsitePolicyError:
page_blocked = None
if page_blocked:
logger.info("Blocked crawled page %s by rule %s", page_blocked["host"], page_blocked["rule"])
pages.append({
"url": page_url, "title": title, "content": "", "raw_content": "",
"error": page_blocked["message"],
"blocked_by_policy": {"host": page_blocked["host"], "rule": page_blocked["rule"], "source": page_blocked["source"]},
})
continue
# Choose content (prefer markdown)
content = content_markdown or content_html or ""
@ -1070,9 +1126,11 @@ async def web_crawl_tool(
# Trim output to minimal fields per entry: title, content, error
trimmed_results = [
{
"url": r.get("url", ""),
"title": r.get("title", ""),
"content": r.get("content", ""),
"error": r.get("error")
"error": r.get("error"),
**({ "blocked_by_policy": r["blocked_by_policy"]} if "blocked_by_policy" in r else {}),
}
for r in response.get("results", [])
]

193
tools/website_policy.py Normal file
View file

@ -0,0 +1,193 @@
"""Website access policy helpers for URL-capable tools.
This module loads a user-managed website blocklist from ~/.hermes/config.yaml
and optional shared list files. It is intentionally lightweight so web/browser
tools can enforce URL policy without pulling in the heavier CLI config stack.
"""
from __future__ import annotations
import fnmatch
import os
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
from urllib.parse import urlparse
import yaml
# Baseline website-blocklist policy. A shallow copy of this mapping is returned
# when the config file does not exist, and keys found under
# security.website_blocklist in the config are layered on top of such a copy.
_DEFAULT_WEBSITE_BLOCKLIST = {
"enabled": True,
"domains": [],
"shared_files": [],
}
def _get_hermes_home() -> Path:
    """Return the Hermes home directory.

    The ``HERMES_HOME`` environment variable takes precedence when it is set;
    otherwise the directory is ``.hermes`` under the user's home directory.
    """
    fallback = Path.home() / ".hermes"
    return Path(os.environ.get("HERMES_HOME", fallback))
def _get_default_config_path() -> Path:
    """Return the path of ``config.yaml`` inside the Hermes home directory."""
    hermes_home = _get_hermes_home()
    return hermes_home.joinpath("config.yaml")
class WebsitePolicyError(Exception):
    """Signal that the website policy could not be loaded.

    Raised by the policy loaders in this module when the config file or a
    shared blocklist file is missing, unreadable, or has an invalid structure.
    """
def _normalize_host(host: str) -> str:
    """Return ``host`` trimmed, lower-cased, and without trailing dots.

    A falsy ``host`` (``None`` or the empty string) yields ``""``.
    """
    if not host:
        return ""
    trimmed = host.strip()
    return trimmed.lower().rstrip(".")
def _normalize_rule(rule: Any) -> Optional[str]:
    """Reduce a raw blocklist entry to a bare, lower-case host pattern.

    Accepts plain hosts (``evil.com``), wildcard hosts (``*.tracking.example``)
    and full URLs. Everything that is not part of the host is dropped: scheme,
    userinfo, port, path, query, fragment / trailing ``# comment``, IPv6
    brackets, trailing dots and a leading ``www.``. Hosts are matched without
    their port, so a rule that kept one could never match anything.

    Args:
        rule: Raw entry from the config or a shared blocklist file.

    Returns:
        The normalized pattern, or ``None`` when ``rule`` is not a string, is
        blank, is a ``#`` comment, or normalizes to nothing.
    """
    if not isinstance(rule, str):
        return None
    value = rule.strip().lower()
    if not value or value.startswith("#"):
        return None

    if "://" in value:
        parsed = urlparse(value)
        value = parsed.netloc or parsed.path

    # Cut off path, query and fragment (the latter doubles as inline comment).
    for delimiter in "/?#":
        value = value.split(delimiter, 1)[0]
    value = value.strip()

    # Drop "user:password@" credentials.
    value = value.rsplit("@", 1)[-1]

    if value.startswith("["):
        # Bracketed IPv6 literal, optionally followed by ":port".
        value = value[1:].split("]", 1)[0]
    else:
        head, sep, tail = value.rpartition(":")
        # Only treat a single trailing ":digits" (or bare ":") as a port so
        # unbracketed IPv6 literals such as "::1" are left intact.
        if sep and ":" not in head and (not tail or tail.isdigit()):
            value = head

    value = value.rstrip(".")
    if value.startswith("www."):
        value = value[4:]
    return value or None
def _iter_blocklist_file_rules(path: Path) -> List[str]:
    """Read a shared blocklist file and return its normalized rules.

    The file is read as UTF-8, one entry per line. Blank lines and lines that
    start with ``#`` are ignored, and entries that normalize to nothing are
    dropped.

    Args:
        path: Location of the shared blocklist file.

    Returns:
        Normalized rules in file order (duplicates are kept).

    Raises:
        WebsitePolicyError: If the file is missing or cannot be read/decoded.
    """
    try:
        text = path.read_text(encoding="utf-8")
    except FileNotFoundError as exc:
        raise WebsitePolicyError(f"Shared blocklist file not found: {path}") from exc
    except (OSError, UnicodeDecodeError) as exc:
        raise WebsitePolicyError(f"Failed to read shared blocklist file {path}: {exc}") from exc

    entries = (line.strip() for line in text.splitlines())
    candidates = (
        _normalize_rule(entry)
        for entry in entries
        if entry and not entry.startswith("#")
    )
    return [rule for rule in candidates if rule]
def _load_policy_config(config_path: Optional[Path] = None) -> Dict[str, Any]:
    """Load the raw ``security.website_blocklist`` mapping from the config.

    Args:
        config_path: Config file to read; defaults to the Hermes config path.

    Returns:
        A copy of the default policy overlaid with the keys found under
        ``security.website_blocklist``. When the config file does not exist
        the defaults are returned unchanged.

    Raises:
        WebsitePolicyError: If the file cannot be read or decoded, is not
            valid YAML, or any of the root / ``security`` /
            ``security.website_blocklist`` nodes is not a mapping.
    """
    config_path = config_path or _get_default_config_path()
    if not config_path.exists():
        return dict(_DEFAULT_WEBSITE_BLOCKLIST)

    try:
        with open(config_path, encoding="utf-8") as f:
            config = yaml.safe_load(f) or {}
    except yaml.YAMLError as exc:
        raise WebsitePolicyError(f"Invalid config YAML at {config_path}: {exc}") from exc
    except (OSError, UnicodeDecodeError) as exc:
        # UnicodeDecodeError is a ValueError, not an OSError; without catching
        # it a non-UTF-8 config would escape as an error type that callers of
        # this module do not handle.
        raise WebsitePolicyError(f"Failed to read config file {config_path}: {exc}") from exc

    if not isinstance(config, dict):
        raise WebsitePolicyError("config root must be a mapping")

    # An explicit ``security:`` / ``website_blocklist:`` key with no value
    # parses as None; treat that the same as an absent section.
    security = config.get("security", {})
    if security is None:
        security = {}
    if not isinstance(security, dict):
        raise WebsitePolicyError("security must be a mapping")

    website_blocklist = security.get("website_blocklist", {})
    if website_blocklist is None:
        website_blocklist = {}
    if not isinstance(website_blocklist, dict):
        raise WebsitePolicyError("security.website_blocklist must be a mapping")

    policy = dict(_DEFAULT_WEBSITE_BLOCKLIST)
    policy.update(website_blocklist)
    return policy
def load_website_blocklist(config_path: Optional[Path] = None) -> Dict[str, Any]:
    """Load and validate the website blocklist policy.

    Rules come from ``security.website_blocklist.domains`` in the config and
    from every file listed in ``security.website_blocklist.shared_files``.
    Relative shared-file paths are resolved against the Hermes home directory.

    Args:
        config_path: Config file to read; defaults to the Hermes config path.

    Returns:
        ``{"enabled": bool, "rules": [{"pattern": str, "source": str}, ...]}``.
        ``source`` is ``"config"`` or the path of the shared file the rule came
        from. When the policy is disabled ``rules`` is empty and shared files
        are not read.

    Raises:
        WebsitePolicyError: If the config is malformed, a field has the wrong
            type, or (while enabled) a shared file cannot be read.
    """
    config_path = config_path or _get_default_config_path()
    policy = _load_policy_config(config_path)

    raw_domains = policy.get("domains", []) or []
    if not isinstance(raw_domains, list):
        raise WebsitePolicyError("security.website_blocklist.domains must be a list")

    raw_shared_files = policy.get("shared_files", []) or []
    if not isinstance(raw_shared_files, list):
        raise WebsitePolicyError("security.website_blocklist.shared_files must be a list")

    enabled = policy.get("enabled", True)
    if not isinstance(enabled, bool):
        raise WebsitePolicyError("security.website_blocklist.enabled must be a boolean")

    if not enabled:
        # A disabled policy must not touch the filesystem: otherwise a stale
        # shared_files entry would raise and break every URL tool even though
        # the user switched enforcement off.
        return {"enabled": False, "rules": []}

    rules: List[Dict[str, str]] = []
    seen: set[Tuple[str, str]] = set()  # (source, pattern) pairs already added

    for raw_rule in raw_domains:
        normalized = _normalize_rule(raw_rule)
        if normalized and ("config", normalized) not in seen:
            rules.append({"pattern": normalized, "source": "config"})
            seen.add(("config", normalized))

    for shared_file in raw_shared_files:
        if not isinstance(shared_file, str) or not shared_file.strip():
            continue
        path = Path(shared_file).expanduser()
        if not path.is_absolute():
            path = (_get_hermes_home() / path).resolve()
        for normalized in _iter_blocklist_file_rules(path):
            key = (str(path), normalized)
            if key in seen:
                continue
            rules.append({"pattern": normalized, "source": str(path)})
            seen.add(key)

    return {"enabled": enabled, "rules": rules}
def _match_host_against_rule(host: str, pattern: str) -> bool:
    """Tell whether ``host`` is covered by the blocklist ``pattern``.

    Patterns starting with ``*.`` are glob-matched with ``fnmatch``. Any other
    pattern matches the host itself and every subdomain of it. An empty host
    or pattern never matches.
    """
    if not (host and pattern):
        return False
    if pattern.startswith("*."):
        return fnmatch.fnmatch(host, pattern)
    if host == pattern:
        return True
    return host.endswith("." + pattern)
def _extract_host_from_urlish(url: str) -> str:
    """Extract the normalized host from a URL or a scheme-less host string.

    The input is first parsed as given; if that yields no host and the input
    has no ``://``, it is parsed again as a network-path reference
    (``//<input>``) so bare values such as ``example.com/path`` work.

    Args:
        url: Full URL or bare ``host[:port][/path]`` string.

    Returns:
        The lower-cased host without port or trailing dot, or ``""`` when no
        host can be determined. Non-string and unparseable input also yields
        ``""`` rather than raising, so a malformed URL cannot crash the policy
        check; such input has no host to match against any rule.
    """
    if not isinstance(url, str) or not url:
        return ""

    candidates = [url]
    if "://" not in url:
        candidates.append(f"//{url}")

    for candidate in candidates:
        try:
            parsed = urlparse(candidate)
        except ValueError:
            # urlparse rejects some malformed netlocs (e.g. unbalanced IPv6
            # brackets); treat that form as having no host.
            continue
        host = _normalize_host(parsed.hostname or parsed.netloc)
        if host:
            return host
    return ""
def check_website_access(url: str, config_path: Optional[Path] = None) -> Optional[Dict[str, str]]:
    """Check ``url`` against the website blocklist.

    Args:
        url: URL (or bare host) about to be accessed.
        config_path: Config file to read; defaults to the Hermes config path.

    Returns:
        ``None`` when access is allowed: no host could be extracted, the
        policy is disabled, or no rule matches. Otherwise a dict describing
        the first matching rule, with keys ``url``, ``host``, ``rule``,
        ``source`` and a human-readable ``message``.

    Raises:
        WebsitePolicyError: Propagated from ``load_website_blocklist`` when
            the policy cannot be loaded.
    """
    host = _extract_host_from_urlish(url)
    if not host:
        return None

    policy = load_website_blocklist(config_path)
    if not policy.get("enabled"):
        return None

    for rule in policy.get("rules", []):
        pattern = rule.get("pattern", "")
        if not _match_host_against_rule(host, pattern):
            continue
        source = rule.get("source", "config")
        message = (
            f"Blocked by website policy: '{host}' matched rule '{pattern}'"
            f" from {source}"
        )
        return {
            "url": url,
            "host": host,
            "rule": pattern,
            "source": source,
            "message": message,
        }
    return None