From 9f6bccd76a0a64d9251620e5c713e34f9df4649f Mon Sep 17 00:00:00 2001
From: teknium1 <teknium1@gmail.com>
Date: Sat, 14 Mar 2026 20:48:29 -0700
Subject: [PATCH 1/4] feat: add direct endpoint overrides for auxiliary and
 delegation

Add base_url/api_key overrides for auxiliary tasks and delegation so users can
route those flows straight to a custom OpenAI-compatible endpoint without
having to rely on provider=main or named custom providers.

Also clear gateway session env vars in test isolation so the full suite stays
deterministic when run from a messaging-backed agent session.
---
 agent/auxiliary_client.py                     | 244 +++++++++++++-----
 cli.py                                        |  48 +++-
 gateway/run.py                                |  30 ++-
 hermes_cli/config.py                          |  16 ++
 tests/agent/test_auxiliary_client.py          |  64 ++++-
 tests/conftest.py                             |   6 +
 tests/test_auxiliary_config_bridge.py         |  47 +++-
 tests/tools/test_delegate.py                  |  72 ++++++
 tools/delegate_tool.py                        |  52 +++-
 .../docs/reference/environment-variables.md   |  17 ++
 website/docs/user-guide/configuration.md      |  21 +-
 .../docs/user-guide/features/delegation.md    |   8 +
 12 files changed, 526 insertions(+), 99 deletions(-)

diff --git a/agent/auxiliary_client.py b/agent/auxiliary_client.py
index dd8f22bb..957452fc 100644
--- a/agent/auxiliary_client.py
+++ b/agent/auxiliary_client.py
@@ -30,6 +30,10 @@ Default "auto" follows the chains above.
 Per-task model overrides (e.g. AUXILIARY_VISION_MODEL,
 AUXILIARY_WEB_EXTRACT_MODEL) let callers use a different model slug
 than the provider's default.
+
+Per-task direct endpoint overrides (e.g. AUXILIARY_VISION_BASE_URL,
+AUXILIARY_VISION_API_KEY) let callers route a specific auxiliary task to a
+custom OpenAI-compatible endpoint without touching the main model settings.
 """
 
 import json
@@ -418,6 +422,17 @@ def _get_auxiliary_provider(task: str = "") -> str:
     return "auto"
 
 
+def _get_auxiliary_env_override(task: str, suffix: str) -> Optional[str]:
+    """Read an auxiliary env override from AUXILIARY_* or CONTEXT_* prefixes."""
+    if not task:
+        return None
+    for prefix in ("AUXILIARY_", "CONTEXT_"):
+        val = os.getenv(f"{prefix}{task.upper()}_{suffix}", "").strip()
+        if val:
+            return val
+    return None
+
+
 def _try_openrouter() -> Tuple[Optional[OpenAI], Optional[str]]:
     or_key = os.getenv("OPENROUTER_API_KEY")
     if not or_key:
@@ -564,6 +579,8 @@ def resolve_provider_client(
     model: str = None,
     async_mode: bool = False,
     raw_codex: bool = False,
+    explicit_base_url: str = None,
+    explicit_api_key: str = None,
 ) -> Tuple[Optional[Any], Optional[str]]:
     """Central router: given a provider name and optional model, return a
     configured client with the correct auth, base URL, and API format.
@@ -585,6 +602,8 @@ def resolve_provider_client(
             instead of wrapping in CodexAuxiliaryClient.  Use this when
             the caller needs direct access to responses.stream() (e.g.,
             the main agent loop).
+        explicit_base_url: Optional direct OpenAI-compatible endpoint.
+        explicit_api_key: Optional API key paired with explicit_base_url.
 
     Returns:
         (client, resolved_model) or (None, None) if auth is unavailable.
@@ -661,6 +680,22 @@ def resolve_provider_client(
 
     # ── Custom endpoint (OPENAI_BASE_URL + OPENAI_API_KEY) ───────────
     if provider == "custom":
+        if explicit_base_url:
+            custom_base = explicit_base_url.strip()
+            custom_key = (
+                (explicit_api_key or "").strip()
+                or os.getenv("OPENAI_API_KEY", "").strip()
+            )
+            if not custom_base or not custom_key:
+                logger.warning(
+                    "resolve_provider_client: explicit custom endpoint requested "
+                    "but no API key was found (set explicit_api_key or OPENAI_API_KEY)"
+                )
+                return None, None
+            final_model = model or _read_main_model() or "gpt-4o-mini"
+            client = OpenAI(api_key=custom_key, base_url=custom_base)
+            return (_to_async_client(client, final_model) if async_mode
+                    else (client, final_model))
         # Try custom first, then codex, then API-key providers
         for try_fn in (_try_custom_endpoint, _try_codex,
                        _resolve_api_key_provider):
@@ -749,10 +784,13 @@ def get_text_auxiliary_client(task: str = "") -> Tuple[Optional[OpenAI], Optiona
     Callers may override the returned model with a per-task env var
     (e.g. CONTEXT_COMPRESSION_MODEL, AUXILIARY_WEB_EXTRACT_MODEL).
     """
-    forced = _get_auxiliary_provider(task)
-    if forced != "auto":
-        return resolve_provider_client(forced)
-    return resolve_provider_client("auto")
+    provider, model, base_url, api_key = _resolve_task_provider_model(task or None)
+    return resolve_provider_client(
+        provider,
+        model=model,
+        explicit_base_url=base_url,
+        explicit_api_key=api_key,
+    )
 
 
 def get_async_text_auxiliary_client(task: str = ""):
@@ -762,10 +800,14 @@ def get_async_text_auxiliary_client(task: str = ""):
     (AsyncCodexAuxiliaryClient, model) which wraps the Responses API.
     Returns (None, None) when no provider is available.
     """
-    forced = _get_auxiliary_provider(task)
-    if forced != "auto":
-        return resolve_provider_client(forced, async_mode=True)
-    return resolve_provider_client("auto", async_mode=True)
+    provider, model, base_url, api_key = _resolve_task_provider_model(task or None)
+    return resolve_provider_client(
+        provider,
+        model=model,
+        async_mode=True,
+        explicit_base_url=base_url,
+        explicit_api_key=api_key,
+    )
 
 
 _VISION_AUTO_PROVIDER_ORDER = (
@@ -821,26 +863,43 @@ def resolve_vision_provider_client(
     provider: Optional[str] = None,
     model: Optional[str] = None,
     *,
+    base_url: Optional[str] = None,
+    api_key: Optional[str] = None,
     async_mode: bool = False,
 ) -> Tuple[Optional[str], Optional[Any], Optional[str]]:
     """Resolve the client actually used for vision tasks.
 
-    Explicit provider overrides still use the generic provider router for
-    non-standard backends, so users can intentionally force experimental
-    providers. Auto mode stays conservative and only tries vision backends
-    known to work today.
+    Direct endpoint overrides take precedence over provider selection. Explicit
+    provider overrides still use the generic provider router for non-standard
+    backends, so users can intentionally force experimental providers. Auto mode
+    stays conservative and only tries vision backends known to work today.
     """
-    requested = _normalize_vision_provider(provider or _get_auxiliary_provider("vision"))
+    requested, resolved_model, resolved_base_url, resolved_api_key = _resolve_task_provider_model(
+        "vision", provider, model, base_url, api_key
+    )
+    requested = _normalize_vision_provider(requested)
 
     def _finalize(resolved_provider: str, sync_client: Any, default_model: Optional[str]):
         if sync_client is None:
             return resolved_provider, None, None
-        final_model = model or default_model
+        final_model = resolved_model or default_model
         if async_mode:
             async_client, async_model = _to_async_client(sync_client, final_model)
             return resolved_provider, async_client, async_model
         return resolved_provider, sync_client, final_model
 
+    if resolved_base_url:
+        client, final_model = resolve_provider_client(
+            "custom",
+            model=resolved_model,
+            async_mode=async_mode,
+            explicit_base_url=resolved_base_url,
+            explicit_api_key=resolved_api_key,
+        )
+        if client is None:
+            return "custom", None, None
+        return "custom", client, final_model
+
     if requested == "auto":
         for candidate in get_available_vision_backends():
             sync_client, default_model = _resolve_strict_vision_backend(candidate)
@@ -853,7 +912,7 @@ def resolve_vision_provider_client(
         sync_client, default_model = _resolve_strict_vision_backend(requested)
         return _finalize(requested, sync_client, default_model)
 
-    client, final_model = _get_cached_client(requested, model, async_mode)
+    client, final_model = _get_cached_client(requested, resolved_model, async_mode)
     if client is None:
         return requested, None, None
     return requested, client, final_model
@@ -910,19 +969,29 @@ def auxiliary_max_tokens_param(value: int) -> dict:
 # Every auxiliary LLM consumer should use these instead of manually
 # constructing clients and calling .chat.completions.create().
 
-# Client cache: (provider, async_mode) -> (client, default_model)
+# Client cache: (provider, async_mode, base_url, api_key) -> (client, default_model)
 _client_cache: Dict[tuple, tuple] = {}
 
 
 def _get_cached_client(
-    provider: str, model: str = None, async_mode: bool = False,
+    provider: str,
+    model: str = None,
+    async_mode: bool = False,
+    base_url: str = None,
+    api_key: str = None,
 ) -> Tuple[Optional[Any], Optional[str]]:
     """Get or create a cached client for the given provider."""
-    cache_key = (provider, async_mode)
+    cache_key = (provider, async_mode, base_url or "", api_key or "")
     if cache_key in _client_cache:
         cached_client, cached_default = _client_cache[cache_key]
         return cached_client, model or cached_default
-    client, default_model = resolve_provider_client(provider, model, async_mode)
+    client, default_model = resolve_provider_client(
+        provider,
+        model,
+        async_mode,
+        explicit_base_url=base_url,
+        explicit_api_key=api_key,
+    )
     if client is not None:
         _client_cache[cache_key] = (client, default_model)
     return client, model or default_model
@@ -932,57 +1001,75 @@ def _resolve_task_provider_model(
     task: str = None,
     provider: str = None,
     model: str = None,
-) -> Tuple[str, Optional[str]]:
+    base_url: str = None,
+    api_key: str = None,
+) -> Tuple[str, Optional[str], Optional[str], Optional[str]]:
     """Determine provider + model for a call.
 
     Priority:
-      1. Explicit provider/model args (always win)
-      2. Env var overrides (AUXILIARY_{TASK}_PROVIDER, etc.)
-      3. Config file (auxiliary.{task}.provider/model or compression.*)
+      1. Explicit provider/model/base_url/api_key args (always win)
+      2. Env var overrides (AUXILIARY_{TASK}_*, CONTEXT_{TASK}_*)
+      3. Config file (auxiliary.{task}.* or compression.*)
       4. "auto" (full auto-detection chain)
 
-    Returns (provider, model) where model may be None (use provider default).
+    Returns (provider, model, base_url, api_key) where model may be None
+    (use provider default). When base_url is set, provider is forced to
+    "custom" and the task uses that direct endpoint.
     """
-    if provider:
-        return provider, model
+    config = {}
+    cfg_provider = None
+    cfg_model = None
+    cfg_base_url = None
+    cfg_api_key = None
 
     if task:
-        # Check env var overrides first
-        env_provider = _get_auxiliary_provider(task)
-        if env_provider != "auto":
-            # Check for env var model override too
-            env_model = None
-            for prefix in ("AUXILIARY_", "CONTEXT_"):
-                val = os.getenv(f"{prefix}{task.upper()}_MODEL", "").strip()
-                if val:
-                    env_model = val
-                    break
-            return env_provider, model or env_model
-
-        # Read from config file
         try:
             from hermes_cli.config import load_config
             config = load_config()
         except ImportError:
-            return "auto", model
+            config = {}
 
-        # Check auxiliary.{task} section
-        aux = config.get("auxiliary", {})
-        task_config = aux.get(task, {})
-        cfg_provider = task_config.get("provider", "").strip() or None
-        cfg_model = task_config.get("model", "").strip() or None
+        aux = config.get("auxiliary", {}) if isinstance(config, dict) else {}
+        task_config = aux.get(task, {}) if isinstance(aux, dict) else {}
+        if not isinstance(task_config, dict):
+            task_config = {}
+        cfg_provider = str(task_config.get("provider", "")).strip() or None
+        cfg_model = str(task_config.get("model", "")).strip() or None
+        cfg_base_url = str(task_config.get("base_url", "")).strip() or None
+        cfg_api_key = str(task_config.get("api_key", "")).strip() or None
 
         # Backwards compat: compression section has its own keys
         if task == "compression" and not cfg_provider:
-            comp = config.get("compression", {})
-            cfg_provider = comp.get("summary_provider", "").strip() or None
-            cfg_model = cfg_model or comp.get("summary_model", "").strip() or None
+            comp = config.get("compression", {}) if isinstance(config, dict) else {}
+            if isinstance(comp, dict):
+                cfg_provider = comp.get("summary_provider", "").strip() or None
+                cfg_model = cfg_model or comp.get("summary_model", "").strip() or None
 
+    env_model = _get_auxiliary_env_override(task, "MODEL") if task else None
+    resolved_model = model or env_model or cfg_model
+
+    if base_url:
+        return "custom", resolved_model, base_url, api_key
+    if provider:
+        return provider, resolved_model, base_url, api_key
+
+    if task:
+        env_base_url = _get_auxiliary_env_override(task, "BASE_URL")
+        env_api_key = _get_auxiliary_env_override(task, "API_KEY")
+        if env_base_url:
+            return "custom", resolved_model, env_base_url, env_api_key or cfg_api_key
+
+        env_provider = _get_auxiliary_provider(task)
+        if env_provider != "auto":
+            return env_provider, resolved_model, None, None
+
+        if cfg_base_url:
+            return "custom", resolved_model, cfg_base_url, cfg_api_key
         if cfg_provider and cfg_provider != "auto":
-            return cfg_provider, model or cfg_model
-        return "auto", model or cfg_model
+            return cfg_provider, resolved_model, None, None
+        return "auto", resolved_model, None, None
 
-    return "auto", model
+    return "auto", resolved_model, None, None
 
 
 def _build_call_kwargs(
@@ -994,6 +1081,7 @@ def _build_call_kwargs(
     tools: Optional[list] = None,
     timeout: float = 30.0,
     extra_body: Optional[dict] = None,
+    base_url: Optional[str] = None,
 ) -> dict:
     """Build kwargs for .chat.completions.create() with model/provider adjustments."""
     kwargs: Dict[str, Any] = {
@@ -1009,7 +1097,7 @@ def _build_call_kwargs(
         # Codex adapter handles max_tokens internally; OpenRouter/Nous use max_tokens.
         # Direct OpenAI api.openai.com with newer models needs max_completion_tokens.
         if provider == "custom":
-            custom_base = os.getenv("OPENAI_BASE_URL", "")
+            custom_base = base_url or os.getenv("OPENAI_BASE_URL", "")
             if "api.openai.com" in custom_base.lower():
                 kwargs["max_completion_tokens"] = max_tokens
             else:
@@ -1035,6 +1123,8 @@ def call_llm(
     *,
     provider: str = None,
     model: str = None,
+    base_url: str = None,
+    api_key: str = None,
     messages: list,
     temperature: float = None,
     max_tokens: int = None,
@@ -1066,16 +1156,18 @@ def call_llm(
     Raises:
         RuntimeError: If no provider is configured.
     """
-    resolved_provider, resolved_model = _resolve_task_provider_model(
-        task, provider, model)
+    resolved_provider, resolved_model, resolved_base_url, resolved_api_key = _resolve_task_provider_model(
+        task, provider, model, base_url, api_key)
 
     if task == "vision":
         effective_provider, client, final_model = resolve_vision_provider_client(
-            provider=resolved_provider,
-            model=resolved_model,
+            provider=provider,
+            model=model,
+            base_url=base_url,
+            api_key=api_key,
             async_mode=False,
         )
-        if client is None and resolved_provider != "auto":
+        if client is None and resolved_provider != "auto" and not resolved_base_url:
             logger.warning(
                 "Vision provider %s unavailable, falling back to auto vision backends",
                 resolved_provider,
@@ -1092,10 +1184,15 @@ def call_llm(
             )
         resolved_provider = effective_provider or resolved_provider
     else:
-        client, final_model = _get_cached_client(resolved_provider, resolved_model)
+        client, final_model = _get_cached_client(
+            resolved_provider,
+            resolved_model,
+            base_url=resolved_base_url,
+            api_key=resolved_api_key,
+        )
         if client is None:
             # Fallback: try openrouter
-            if resolved_provider != "openrouter":
+            if resolved_provider != "openrouter" and not resolved_base_url:
                 logger.warning("Provider %s unavailable, falling back to openrouter",
                                resolved_provider)
                 client, final_model = _get_cached_client(
@@ -1108,7 +1205,8 @@ def call_llm(
     kwargs = _build_call_kwargs(
         resolved_provider, final_model, messages,
         temperature=temperature, max_tokens=max_tokens,
-        tools=tools, timeout=timeout, extra_body=extra_body)
+        tools=tools, timeout=timeout, extra_body=extra_body,
+        base_url=resolved_base_url)
 
     # Handle max_tokens vs max_completion_tokens retry
     try:
@@ -1127,6 +1225,8 @@ async def async_call_llm(
     *,
     provider: str = None,
     model: str = None,
+    base_url: str = None,
+    api_key: str = None,
     messages: list,
     temperature: float = None,
     max_tokens: int = None,
@@ -1138,16 +1238,18 @@ async def async_call_llm(
 
     Same as call_llm() but async. See call_llm() for full documentation.
     """
-    resolved_provider, resolved_model = _resolve_task_provider_model(
-        task, provider, model)
+    resolved_provider, resolved_model, resolved_base_url, resolved_api_key = _resolve_task_provider_model(
+        task, provider, model, base_url, api_key)
 
     if task == "vision":
         effective_provider, client, final_model = resolve_vision_provider_client(
-            provider=resolved_provider,
-            model=resolved_model,
+            provider=provider,
+            model=model,
+            base_url=base_url,
+            api_key=api_key,
             async_mode=True,
         )
-        if client is None and resolved_provider != "auto":
+        if client is None and resolved_provider != "auto" and not resolved_base_url:
             logger.warning(
                 "Vision provider %s unavailable, falling back to auto vision backends",
                 resolved_provider,
@@ -1165,9 +1267,14 @@ async def async_call_llm(
         resolved_provider = effective_provider or resolved_provider
     else:
         client, final_model = _get_cached_client(
-            resolved_provider, resolved_model, async_mode=True)
+            resolved_provider,
+            resolved_model,
+            async_mode=True,
+            base_url=resolved_base_url,
+            api_key=resolved_api_key,
+        )
         if client is None:
-            if resolved_provider != "openrouter":
+            if resolved_provider != "openrouter" and not resolved_base_url:
                 logger.warning("Provider %s unavailable, falling back to openrouter",
                                resolved_provider)
                 client, final_model = _get_cached_client(
@@ -1181,7 +1288,8 @@ async def async_call_llm(
     kwargs = _build_call_kwargs(
         resolved_provider, final_model, messages,
         temperature=temperature, max_tokens=max_tokens,
-        tools=tools, timeout=timeout, extra_body=extra_body)
+        tools=tools, timeout=timeout, extra_body=extra_body,
+        base_url=resolved_base_url)
 
     try:
         return await client.chat.completions.create(**kwargs)
diff --git a/cli.py b/cli.py
index 44c7889c..1bebbf4f 100755
--- a/cli.py
+++ b/cli.py
@@ -218,11 +218,27 @@ def load_cli_config() -> Dict[str, Any]:
             "timeout": 300,    # Max seconds a sandbox script can run before being killed (5 min)
             "max_tool_calls": 50,  # Max RPC tool calls per execution
         },
+        "auxiliary": {
+            "vision": {
+                "provider": "auto",
+                "model": "",
+                "base_url": "",
+                "api_key": "",
+            },
+            "web_extract": {
+                "provider": "auto",
+                "model": "",
+                "base_url": "",
+                "api_key": "",
+            },
+        },
         "delegation": {
             "max_iterations": 45,  # Max tool-calling turns per child agent
             "default_toolsets": ["terminal", "file", "web"],  # Default toolsets for subagents
             "model": "",       # Subagent model override (empty = inherit parent model)
             "provider": "",    # Subagent provider override (empty = inherit parent provider)
+            "base_url": "",    # Direct OpenAI-compatible endpoint for subagents
+            "api_key": "",     # API key for delegation.base_url (falls back to OPENAI_API_KEY)
         },
     }
     
@@ -363,28 +379,44 @@ def load_cli_config() -> Dict[str, Any]:
         if config_key in compression_config:
             os.environ[env_var] = str(compression_config[config_key])
     
-    # Apply auxiliary model overrides to environment variables.
-    # Vision and web_extract each have their own provider + model pair.
+    # Apply auxiliary model/direct-endpoint overrides to environment variables.
+    # Vision and web_extract each have their own provider/model/base_url/api_key tuple.
     # (Compression is handled in the compression section above.)
     # Only set env vars for non-empty / non-default values so auto-detection
     # still works.
     auxiliary_config = defaults.get("auxiliary", {})
     auxiliary_task_env = {
-        # config key → (provider env var, model env var)
-        "vision":      ("AUXILIARY_VISION_PROVIDER",      "AUXILIARY_VISION_MODEL"),
-        "web_extract": ("AUXILIARY_WEB_EXTRACT_PROVIDER",  "AUXILIARY_WEB_EXTRACT_MODEL"),
+        # config key → env var mapping
+        "vision": {
+            "provider": "AUXILIARY_VISION_PROVIDER",
+            "model": "AUXILIARY_VISION_MODEL",
+            "base_url": "AUXILIARY_VISION_BASE_URL",
+            "api_key": "AUXILIARY_VISION_API_KEY",
+        },
+        "web_extract": {
+            "provider": "AUXILIARY_WEB_EXTRACT_PROVIDER",
+            "model": "AUXILIARY_WEB_EXTRACT_MODEL",
+            "base_url": "AUXILIARY_WEB_EXTRACT_BASE_URL",
+            "api_key": "AUXILIARY_WEB_EXTRACT_API_KEY",
+        },
     }
     
-    for task_key, (prov_env, model_env) in auxiliary_task_env.items():
+    for task_key, env_map in auxiliary_task_env.items():
         task_cfg = auxiliary_config.get(task_key, {})
         if not isinstance(task_cfg, dict):
             continue
         prov = str(task_cfg.get("provider", "")).strip()
         model = str(task_cfg.get("model", "")).strip()
+        base_url = str(task_cfg.get("base_url", "")).strip()
+        api_key = str(task_cfg.get("api_key", "")).strip()
         if prov and prov != "auto":
-            os.environ[prov_env] = prov
+            os.environ[env_map["provider"]] = prov
         if model:
-            os.environ[model_env] = model
+            os.environ[env_map["model"]] = model
+        if base_url:
+            os.environ[env_map["base_url"]] = base_url
+        if api_key:
+            os.environ[env_map["api_key"]] = api_key
     
     # Security settings
     security_config = defaults.get("security", {})
diff --git a/gateway/run.py b/gateway/run.py
index e973852b..8941fcec 100644
--- a/gateway/run.py
+++ b/gateway/run.py
@@ -100,24 +100,40 @@ if _config_path.exists():
             for _cfg_key, _env_var in _compression_env_map.items():
                 if _cfg_key in _compression_cfg:
                     os.environ[_env_var] = str(_compression_cfg[_cfg_key])
-        # Auxiliary model overrides (vision, web_extract).
-        # Each task has provider + model; bridge non-default values to env vars.
+        # Auxiliary model/direct-endpoint overrides (vision, web_extract).
+        # Each task has provider/model/base_url/api_key; bridge non-default values to env vars.
         _auxiliary_cfg = _cfg.get("auxiliary", {})
         if _auxiliary_cfg and isinstance(_auxiliary_cfg, dict):
             _aux_task_env = {
-                "vision":      ("AUXILIARY_VISION_PROVIDER",      "AUXILIARY_VISION_MODEL"),
-                "web_extract": ("AUXILIARY_WEB_EXTRACT_PROVIDER",  "AUXILIARY_WEB_EXTRACT_MODEL"),
+                "vision": {
+                    "provider": "AUXILIARY_VISION_PROVIDER",
+                    "model": "AUXILIARY_VISION_MODEL",
+                    "base_url": "AUXILIARY_VISION_BASE_URL",
+                    "api_key": "AUXILIARY_VISION_API_KEY",
+                },
+                "web_extract": {
+                    "provider": "AUXILIARY_WEB_EXTRACT_PROVIDER",
+                    "model": "AUXILIARY_WEB_EXTRACT_MODEL",
+                    "base_url": "AUXILIARY_WEB_EXTRACT_BASE_URL",
+                    "api_key": "AUXILIARY_WEB_EXTRACT_API_KEY",
+                },
             }
-            for _task_key, (_prov_env, _model_env) in _aux_task_env.items():
+            for _task_key, _env_map in _aux_task_env.items():
                 _task_cfg = _auxiliary_cfg.get(_task_key, {})
                 if not isinstance(_task_cfg, dict):
                     continue
                 _prov = str(_task_cfg.get("provider", "")).strip()
                 _model = str(_task_cfg.get("model", "")).strip()
+                _base_url = str(_task_cfg.get("base_url", "")).strip()
+                _api_key = str(_task_cfg.get("api_key", "")).strip()
                 if _prov and _prov != "auto":
-                    os.environ[_prov_env] = _prov
+                    os.environ[_env_map["provider"]] = _prov
                 if _model:
-                    os.environ[_model_env] = _model
+                    os.environ[_env_map["model"]] = _model
+                if _base_url:
+                    os.environ[_env_map["base_url"]] = _base_url
+                if _api_key:
+                    os.environ[_env_map["api_key"]] = _api_key
         _agent_cfg = _cfg.get("agent", {})
         if _agent_cfg and isinstance(_agent_cfg, dict):
             if "max_turns" in _agent_cfg:
diff --git a/hermes_cli/config.py b/hermes_cli/config.py
index bdde858d..b67405a0 100644
--- a/hermes_cli/config.py
+++ b/hermes_cli/config.py
@@ -150,30 +150,44 @@ DEFAULT_CONFIG = {
         "vision": {
             "provider": "auto",    # auto | openrouter | nous | codex | custom
             "model": "",           # e.g. "google/gemini-2.5-flash", "gpt-4o"
+            "base_url": "",        # direct OpenAI-compatible endpoint (takes precedence over provider)
+            "api_key": "",         # API key for base_url (falls back to OPENAI_API_KEY)
         },
         "web_extract": {
             "provider": "auto",
             "model": "",
+            "base_url": "",
+            "api_key": "",
         },
         "compression": {
             "provider": "auto",
             "model": "",
+            "base_url": "",
+            "api_key": "",
         },
         "session_search": {
             "provider": "auto",
             "model": "",
+            "base_url": "",
+            "api_key": "",
         },
         "skills_hub": {
             "provider": "auto",
             "model": "",
+            "base_url": "",
+            "api_key": "",
         },
         "mcp": {
             "provider": "auto",
             "model": "",
+            "base_url": "",
+            "api_key": "",
         },
         "flush_memories": {
             "provider": "auto",
             "model": "",
+            "base_url": "",
+            "api_key": "",
         },
     },
     
@@ -243,6 +257,8 @@ DEFAULT_CONFIG = {
     "delegation": {
         "model": "",       # e.g. "google/gemini-3-flash-preview" (empty = inherit parent model)
         "provider": "",    # e.g. "openrouter" (empty = inherit parent provider + credentials)
+        "base_url": "",    # direct OpenAI-compatible endpoint for subagents
+        "api_key": "",     # API key for delegation.base_url (falls back to OPENAI_API_KEY)
     },
 
     # Ephemeral prefill messages file — JSON list of {role, content} dicts
diff --git a/tests/agent/test_auxiliary_client.py b/tests/agent/test_auxiliary_client.py
index 57c73eb8..d60e3c81 100644
--- a/tests/agent/test_auxiliary_client.py
+++ b/tests/agent/test_auxiliary_client.py
@@ -24,9 +24,11 @@ def _clean_env(monkeypatch):
     for key in (
         "OPENROUTER_API_KEY", "OPENAI_BASE_URL", "OPENAI_API_KEY",
         "OPENAI_MODEL", "LLM_MODEL", "NOUS_INFERENCE_BASE_URL",
-        # Per-task provider/model overrides
+        # Per-task provider/model/direct-endpoint overrides
         "AUXILIARY_VISION_PROVIDER", "AUXILIARY_VISION_MODEL",
+        "AUXILIARY_VISION_BASE_URL", "AUXILIARY_VISION_API_KEY",
         "AUXILIARY_WEB_EXTRACT_PROVIDER", "AUXILIARY_WEB_EXTRACT_MODEL",
+        "AUXILIARY_WEB_EXTRACT_BASE_URL", "AUXILIARY_WEB_EXTRACT_API_KEY",
         "CONTEXT_COMPRESSION_PROVIDER", "CONTEXT_COMPRESSION_MODEL",
     ):
         monkeypatch.delenv(key, raising=False)
@@ -142,6 +144,27 @@ class TestGetTextAuxiliaryClient:
         call_kwargs = mock_openai.call_args
         assert call_kwargs.kwargs["base_url"] == "http://localhost:1234/v1"
 
+    def test_task_direct_endpoint_override(self, monkeypatch):
+        monkeypatch.setenv("OPENROUTER_API_KEY", "or-key")
+        monkeypatch.setenv("AUXILIARY_WEB_EXTRACT_BASE_URL", "http://localhost:2345/v1")
+        monkeypatch.setenv("AUXILIARY_WEB_EXTRACT_API_KEY", "task-key")
+        monkeypatch.setenv("AUXILIARY_WEB_EXTRACT_MODEL", "task-model")
+        with patch("agent.auxiliary_client.OpenAI") as mock_openai:
+            client, model = get_text_auxiliary_client("web_extract")
+        assert model == "task-model"
+        assert mock_openai.call_args.kwargs["base_url"] == "http://localhost:2345/v1"
+        assert mock_openai.call_args.kwargs["api_key"] == "task-key"
+
+    def test_task_direct_endpoint_without_openai_key_does_not_fall_back(self, monkeypatch):
+        monkeypatch.setenv("OPENROUTER_API_KEY", "or-key")
+        monkeypatch.setenv("AUXILIARY_WEB_EXTRACT_BASE_URL", "http://localhost:2345/v1")
+        monkeypatch.setenv("AUXILIARY_WEB_EXTRACT_MODEL", "task-model")
+        with patch("agent.auxiliary_client.OpenAI") as mock_openai:
+            client, model = get_text_auxiliary_client("web_extract")
+        assert client is None
+        assert model is None
+        mock_openai.assert_not_called()
+
     def test_codex_fallback_when_nothing_else(self, codex_auth_dir):
         with patch("agent.auxiliary_client._read_nous_auth", return_value=None), \
              patch("agent.auxiliary_client.OpenAI") as mock_openai:
@@ -194,6 +217,27 @@ class TestVisionClientFallback:
             client, model = get_vision_auxiliary_client()
         assert client is not None  # Custom endpoint picked up as fallback
 
+    def test_vision_direct_endpoint_override(self, monkeypatch):
+        monkeypatch.setenv("OPENROUTER_API_KEY", "or-key")
+        monkeypatch.setenv("AUXILIARY_VISION_BASE_URL", "http://localhost:4567/v1")
+        monkeypatch.setenv("AUXILIARY_VISION_API_KEY", "vision-key")
+        monkeypatch.setenv("AUXILIARY_VISION_MODEL", "vision-model")
+        with patch("agent.auxiliary_client.OpenAI") as mock_openai:
+            client, model = get_vision_auxiliary_client()
+        assert model == "vision-model"
+        assert mock_openai.call_args.kwargs["base_url"] == "http://localhost:4567/v1"
+        assert mock_openai.call_args.kwargs["api_key"] == "vision-key"
+
+    def test_vision_direct_endpoint_requires_openai_api_key(self, monkeypatch):
+        monkeypatch.setenv("OPENROUTER_API_KEY", "or-key")
+        monkeypatch.setenv("AUXILIARY_VISION_BASE_URL", "http://localhost:4567/v1")
+        monkeypatch.setenv("AUXILIARY_VISION_MODEL", "vision-model")
+        with patch("agent.auxiliary_client.OpenAI") as mock_openai:
+            client, model = get_vision_auxiliary_client()
+        assert client is None
+        assert model is None
+        mock_openai.assert_not_called()
+
     def test_vision_uses_openrouter_when_available(self, monkeypatch):
         monkeypatch.setenv("OPENROUTER_API_KEY", "or-key")
         with patch("agent.auxiliary_client.OpenAI") as mock_openai:
@@ -390,6 +434,24 @@ class TestTaskSpecificOverrides:
             client, model = get_text_auxiliary_client("web_extract")
         assert model == "google/gemini-3-flash-preview"
 
+    def test_task_direct_endpoint_from_config(self, monkeypatch, tmp_path):
+        hermes_home = tmp_path / "hermes"
+        hermes_home.mkdir(parents=True, exist_ok=True)
+        (hermes_home / "config.yaml").write_text(
+            """auxiliary:
+  web_extract:
+    base_url: http://localhost:3456/v1
+    api_key: config-key
+    model: config-model
+"""
+        )
+        monkeypatch.setenv("HERMES_HOME", str(hermes_home))
+        with patch("agent.auxiliary_client.OpenAI") as mock_openai:
+            client, model = get_text_auxiliary_client("web_extract")
+        assert model == "config-model"
+        assert mock_openai.call_args.kwargs["base_url"] == "http://localhost:3456/v1"
+        assert mock_openai.call_args.kwargs["api_key"] == "config-key"
+
     def test_task_without_override_uses_auto(self, monkeypatch):
         """A task with no provider env var falls through to auto chain."""
         monkeypatch.setenv("OPENROUTER_API_KEY", "or-key")
diff --git a/tests/conftest.py b/tests/conftest.py
index 9c9f9a44..67fad819 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -26,6 +26,12 @@ def _isolate_hermes_home(tmp_path, monkeypatch):
     (fake_home / "memories").mkdir()
     (fake_home / "skills").mkdir()
     monkeypatch.setenv("HERMES_HOME", str(fake_home))
+    # Tests should not inherit the agent's current gateway/messaging surface.
+    # Individual tests that need gateway behavior set these explicitly.
+    monkeypatch.delenv("HERMES_SESSION_PLATFORM", raising=False)
+    monkeypatch.delenv("HERMES_SESSION_CHAT_ID", raising=False)
+    monkeypatch.delenv("HERMES_SESSION_CHAT_NAME", raising=False)
+    monkeypatch.delenv("HERMES_GATEWAY_SESSION", raising=False)
 
 
 @pytest.fixture()
diff --git a/tests/test_auxiliary_config_bridge.py b/tests/test_auxiliary_config_bridge.py
index a4d65c2a..22e88bdf 100644
--- a/tests/test_auxiliary_config_bridge.py
+++ b/tests/test_auxiliary_config_bridge.py
@@ -25,7 +25,9 @@ def _run_auxiliary_bridge(config_dict, monkeypatch):
     # Clear env vars
     for key in (
         "AUXILIARY_VISION_PROVIDER", "AUXILIARY_VISION_MODEL",
+        "AUXILIARY_VISION_BASE_URL", "AUXILIARY_VISION_API_KEY",
         "AUXILIARY_WEB_EXTRACT_PROVIDER", "AUXILIARY_WEB_EXTRACT_MODEL",
+        "AUXILIARY_WEB_EXTRACT_BASE_URL", "AUXILIARY_WEB_EXTRACT_API_KEY",
         "CONTEXT_COMPRESSION_PROVIDER", "CONTEXT_COMPRESSION_MODEL",
     ):
         monkeypatch.delenv(key, raising=False)
@@ -47,19 +49,35 @@ def _run_auxiliary_bridge(config_dict, monkeypatch):
     auxiliary_cfg = config_dict.get("auxiliary", {})
     if auxiliary_cfg and isinstance(auxiliary_cfg, dict):
         aux_task_env = {
-            "vision":      ("AUXILIARY_VISION_PROVIDER",      "AUXILIARY_VISION_MODEL"),
-            "web_extract": ("AUXILIARY_WEB_EXTRACT_PROVIDER",  "AUXILIARY_WEB_EXTRACT_MODEL"),
+            "vision": {
+                "provider": "AUXILIARY_VISION_PROVIDER",
+                "model": "AUXILIARY_VISION_MODEL",
+                "base_url": "AUXILIARY_VISION_BASE_URL",
+                "api_key": "AUXILIARY_VISION_API_KEY",
+            },
+            "web_extract": {
+                "provider": "AUXILIARY_WEB_EXTRACT_PROVIDER",
+                "model": "AUXILIARY_WEB_EXTRACT_MODEL",
+                "base_url": "AUXILIARY_WEB_EXTRACT_BASE_URL",
+                "api_key": "AUXILIARY_WEB_EXTRACT_API_KEY",
+            },
         }
-        for task_key, (prov_env, model_env) in aux_task_env.items():
+        for task_key, env_map in aux_task_env.items():
             task_cfg = auxiliary_cfg.get(task_key, {})
             if not isinstance(task_cfg, dict):
                 continue
             prov = str(task_cfg.get("provider", "")).strip()
             model = str(task_cfg.get("model", "")).strip()
+            base_url = str(task_cfg.get("base_url", "")).strip()
+            api_key = str(task_cfg.get("api_key", "")).strip()
             if prov and prov != "auto":
-                os.environ[prov_env] = prov
+                os.environ[env_map["provider"]] = prov
             if model:
-                os.environ[model_env] = model
+                os.environ[env_map["model"]] = model
+            if base_url:
+                os.environ[env_map["base_url"]] = base_url
+            if api_key:
+                os.environ[env_map["api_key"]] = api_key
 
 
 # ── Config bridging tests ────────────────────────────────────────────────────
@@ -101,6 +119,21 @@ class TestAuxiliaryConfigBridge:
         assert os.environ.get("AUXILIARY_WEB_EXTRACT_PROVIDER") == "nous"
         assert os.environ.get("AUXILIARY_WEB_EXTRACT_MODEL") == "gemini-2.5-flash"
 
+    def test_direct_endpoint_bridged(self, monkeypatch):
+        config = {
+            "auxiliary": {
+                "vision": {
+                    "base_url": "http://localhost:1234/v1",
+                    "api_key": "local-key",
+                    "model": "qwen2.5-vl",
+                }
+            }
+        }
+        _run_auxiliary_bridge(config, monkeypatch)
+        assert os.environ.get("AUXILIARY_VISION_BASE_URL") == "http://localhost:1234/v1"
+        assert os.environ.get("AUXILIARY_VISION_API_KEY") == "local-key"
+        assert os.environ.get("AUXILIARY_VISION_MODEL") == "qwen2.5-vl"
+
     def test_compression_provider_bridged(self, monkeypatch):
         config = {
             "compression": {
@@ -200,8 +233,12 @@ class TestGatewayBridgeCodeParity:
         # Check for key patterns that indicate the bridge is present
         assert "AUXILIARY_VISION_PROVIDER" in content
         assert "AUXILIARY_VISION_MODEL" in content
+        assert "AUXILIARY_VISION_BASE_URL" in content
+        assert "AUXILIARY_VISION_API_KEY" in content
         assert "AUXILIARY_WEB_EXTRACT_PROVIDER" in content
         assert "AUXILIARY_WEB_EXTRACT_MODEL" in content
+        assert "AUXILIARY_WEB_EXTRACT_BASE_URL" in content
+        assert "AUXILIARY_WEB_EXTRACT_API_KEY" in content
 
     def test_gateway_has_compression_provider(self):
         """Gateway must bridge compression.summary_provider."""
diff --git a/tests/tools/test_delegate.py b/tests/tools/test_delegate.py
index 680233b0..a29560b2 100644
--- a/tests/tools/test_delegate.py
+++ b/tests/tools/test_delegate.py
@@ -10,6 +10,7 @@ Run with:  python -m pytest tests/test_delegate.py -v
 """
 
 import json
+import os
 import sys
 import unittest
 from unittest.mock import MagicMock, patch
@@ -462,6 +463,43 @@ class TestDelegationCredentialResolution(unittest.TestCase):
         self.assertEqual(creds["api_mode"], "chat_completions")
         mock_resolve.assert_called_once_with(requested="openrouter")
 
+    def test_direct_endpoint_uses_configured_base_url_and_api_key(self):
+        parent = _make_mock_parent(depth=0)
+        cfg = {
+            "model": "qwen2.5-coder",
+            "provider": "openrouter",
+            "base_url": "http://localhost:1234/v1",
+            "api_key": "local-key",
+        }
+        creds = _resolve_delegation_credentials(cfg, parent)
+        self.assertEqual(creds["model"], "qwen2.5-coder")
+        self.assertEqual(creds["provider"], "custom")
+        self.assertEqual(creds["base_url"], "http://localhost:1234/v1")
+        self.assertEqual(creds["api_key"], "local-key")
+        self.assertEqual(creds["api_mode"], "chat_completions")
+
+    def test_direct_endpoint_falls_back_to_openai_api_key_env(self):
+        parent = _make_mock_parent(depth=0)
+        cfg = {
+            "model": "qwen2.5-coder",
+            "base_url": "http://localhost:1234/v1",
+        }
+        with patch.dict(os.environ, {"OPENAI_API_KEY": "env-openai-key"}, clear=False):
+            creds = _resolve_delegation_credentials(cfg, parent)
+        self.assertEqual(creds["api_key"], "env-openai-key")
+        self.assertEqual(creds["provider"], "custom")
+
+    def test_direct_endpoint_does_not_fall_back_to_openrouter_api_key_env(self):
+        parent = _make_mock_parent(depth=0)
+        cfg = {
+            "model": "qwen2.5-coder",
+            "base_url": "http://localhost:1234/v1",
+        }
+        with patch.dict(os.environ, {"OPENROUTER_API_KEY": "env-openrouter-key"}, clear=False):
+            with self.assertRaises(ValueError) as ctx:
+                _resolve_delegation_credentials(cfg, parent)
+        self.assertIn("OPENAI_API_KEY", str(ctx.exception))
+
     @patch("hermes_cli.runtime_provider.resolve_runtime_provider")
     def test_nous_provider_resolves_nous_credentials(self, mock_resolve):
         """Nous provider resolves Nous Portal base_url and api_key."""
@@ -589,6 +627,40 @@ class TestDelegationProviderIntegration(unittest.TestCase):
             self.assertNotEqual(kwargs["base_url"], parent.base_url)
             self.assertNotEqual(kwargs["api_key"], parent.api_key)
 
+    @patch("tools.delegate_tool._load_config")
+    @patch("tools.delegate_tool._resolve_delegation_credentials")
+    def test_direct_endpoint_credentials_reach_child_agent(self, mock_creds, mock_cfg):
+        mock_cfg.return_value = {
+            "max_iterations": 45,
+            "model": "qwen2.5-coder",
+            "base_url": "http://localhost:1234/v1",
+            "api_key": "local-key",
+        }
+        mock_creds.return_value = {
+            "model": "qwen2.5-coder",
+            "provider": "custom",
+            "base_url": "http://localhost:1234/v1",
+            "api_key": "local-key",
+            "api_mode": "chat_completions",
+        }
+        parent = _make_mock_parent(depth=0)
+
+        with patch("run_agent.AIAgent") as MockAgent:
+            mock_child = MagicMock()
+            mock_child.run_conversation.return_value = {
+                "final_response": "done", "completed": True, "api_calls": 1
+            }
+            MockAgent.return_value = mock_child
+
+            delegate_task(goal="Direct endpoint test", parent_agent=parent)
+
+            _, kwargs = MockAgent.call_args
+            self.assertEqual(kwargs["model"], "qwen2.5-coder")
+            self.assertEqual(kwargs["provider"], "custom")
+            self.assertEqual(kwargs["base_url"], "http://localhost:1234/v1")
+            self.assertEqual(kwargs["api_key"], "local-key")
+            self.assertEqual(kwargs["api_mode"], "chat_completions")
+
     @patch("tools.delegate_tool._load_config")
     @patch("tools.delegate_tool._resolve_delegation_credentials")
     def test_empty_config_inherits_parent(self, mock_creds, mock_cfg):
diff --git a/tools/delegate_tool.py b/tools/delegate_tool.py
index 76026be5..0d5908ab 100644
--- a/tools/delegate_tool.py
+++ b/tools/delegate_tool.py
@@ -540,18 +540,51 @@ def delegate_task(
 def _resolve_delegation_credentials(cfg: dict, parent_agent) -> dict:
     """Resolve credentials for subagent delegation.
 
-    If ``delegation.provider`` is configured, resolves the full credential
-    bundle (base_url, api_key, api_mode, provider) via the runtime provider
-    system — the same path used by CLI/gateway startup.  This lets subagents
-    run on a completely different provider:model pair.
+    If ``delegation.base_url`` is configured, subagents use that direct
+    OpenAI-compatible endpoint. Otherwise, if ``delegation.provider`` is
+    configured, the full credential bundle (base_url, api_key, api_mode,
+    provider) is resolved via the runtime provider system — the same path used
+    by CLI/gateway startup. This lets subagents run on a completely different
+    provider:model pair.
 
-    If no provider is configured, returns None values so the child inherits
-    everything from the parent agent.
+    If neither base_url nor provider is configured, returns None values so the
+    child inherits everything from the parent agent.
 
     Raises ValueError with a user-friendly message on credential failure.
     """
-    configured_model = cfg.get("model") or None
-    configured_provider = cfg.get("provider") or None
+    configured_model = str(cfg.get("model") or "").strip() or None
+    configured_provider = str(cfg.get("provider") or "").strip() or None
+    configured_base_url = str(cfg.get("base_url") or "").strip() or None
+    configured_api_key = str(cfg.get("api_key") or "").strip() or None
+
+    if configured_base_url:
+        api_key = (
+            configured_api_key
+            or os.getenv("OPENAI_API_KEY", "").strip()
+        )
+        if not api_key:
+            raise ValueError(
+                "Delegation base_url is configured but no API key was found. "
+                "Set delegation.api_key or OPENAI_API_KEY."
+            )
+
+        base_lower = configured_base_url.lower()
+        provider = "custom"
+        api_mode = "chat_completions"
+        if "chatgpt.com/backend-api/codex" in base_lower:
+            provider = "openai-codex"
+            api_mode = "codex_responses"
+        elif "api.anthropic.com" in base_lower:
+            provider = "anthropic"
+            api_mode = "anthropic_messages"
+
+        return {
+            "model": configured_model,
+            "provider": provider,
+            "base_url": configured_base_url,
+            "api_key": api_key,
+            "api_mode": api_mode,
+        }
 
     if not configured_provider:
         # No provider override — child inherits everything from parent
@@ -570,7 +603,8 @@ def _resolve_delegation_credentials(cfg: dict, parent_agent) -> dict:
     except Exception as exc:
         raise ValueError(
             f"Cannot resolve delegation provider '{configured_provider}': {exc}. "
-            f"Check that the provider is configured (API key set, valid provider name). "
+            f"Check that the provider is configured (API key set, valid provider name), "
+            f"or set delegation.base_url/delegation.api_key for a direct endpoint. "
             f"Available providers: openrouter, nous, zai, kimi-coding, minimax."
         ) from exc
 
diff --git a/website/docs/reference/environment-variables.md b/website/docs/reference/environment-variables.md
index d4f633ee..36a54d26 100644
--- a/website/docs/reference/environment-variables.md
+++ b/website/docs/reference/environment-variables.md
@@ -180,6 +180,23 @@ For native Anthropic auth, Hermes prefers Claude Code's own credential files whe
 | `CONTEXT_COMPRESSION_THRESHOLD` | Trigger at this % of limit (default: 0.50) |
 | `CONTEXT_COMPRESSION_MODEL` | Model for summaries |
 
+## Auxiliary Task Overrides
+
+| Variable | Description |
+|----------|-------------|
+| `AUXILIARY_VISION_PROVIDER` | Override provider for vision tasks |
+| `AUXILIARY_VISION_MODEL` | Override model for vision tasks |
+| `AUXILIARY_VISION_BASE_URL` | Direct OpenAI-compatible endpoint for vision tasks |
+| `AUXILIARY_VISION_API_KEY` | API key paired with `AUXILIARY_VISION_BASE_URL` |
+| `AUXILIARY_WEB_EXTRACT_PROVIDER` | Override provider for web extraction/summarization |
+| `AUXILIARY_WEB_EXTRACT_MODEL` | Override model for web extraction/summarization |
+| `AUXILIARY_WEB_EXTRACT_BASE_URL` | Direct OpenAI-compatible endpoint for web extraction/summarization |
+| `AUXILIARY_WEB_EXTRACT_API_KEY` | API key paired with `AUXILIARY_WEB_EXTRACT_BASE_URL` |
+| `CONTEXT_COMPRESSION_PROVIDER` | Override provider for context compression summaries |
+| `CONTEXT_COMPRESSION_MODEL` | Override model for context compression summaries |
+
+For task-specific direct endpoints, Hermes uses the task's configured API key or `OPENAI_API_KEY`. It does not reuse `OPENROUTER_API_KEY` for those custom endpoints.
+
 ## Provider Routing (config.yaml only)
 
 These go in `~/.hermes/config.yaml` under the `provider_routing` section:
diff --git a/website/docs/user-guide/configuration.md b/website/docs/user-guide/configuration.md
index 71525764..0a1c50cb 100644
--- a/website/docs/user-guide/configuration.md
+++ b/website/docs/user-guide/configuration.md
@@ -569,11 +569,15 @@ auxiliary:
   vision:
     provider: "auto"           # "auto", "openrouter", "nous", "main"
     model: ""                  # e.g. "openai/gpt-4o", "google/gemini-2.5-flash"
+    base_url: ""               # direct OpenAI-compatible endpoint (takes precedence over provider)
+    api_key: ""                # API key for base_url (falls back to OPENAI_API_KEY)
 
   # Web page summarization + browser page text extraction
   web_extract:
     provider: "auto"
     model: ""                  # e.g. "google/gemini-2.5-flash"
+    base_url: ""
+    api_key: ""
 ```
 
 ### Changing the Vision Model
@@ -604,6 +608,17 @@ AUXILIARY_VISION_MODEL=openai/gpt-4o
 
 ### Common Setups
 
+**Using a direct custom endpoint** (clearer than `provider: "main"` for local/self-hosted APIs):
+```yaml
+auxiliary:
+  vision:
+    base_url: "http://localhost:1234/v1"
+    api_key: "local-key"
+    model: "qwen2.5-vl"
+```
+
+`base_url` takes precedence over `provider`, so this is the most explicit way to route an auxiliary task to a specific endpoint. For direct endpoint overrides, Hermes uses the configured `api_key` or falls back to `OPENAI_API_KEY`; it does not reuse `OPENROUTER_API_KEY` for that custom endpoint.
+
 **Using OpenAI API key for vision:**
 ```yaml
 # In ~/.hermes/.env:
@@ -848,13 +863,17 @@ delegation:
     - web
   # model: "google/gemini-3-flash-preview"  # Override model (empty = inherit parent)
   # provider: "openrouter"                  # Override provider (empty = inherit parent)
+  # base_url: "http://localhost:1234/v1"    # Direct OpenAI-compatible endpoint (takes precedence over provider)
+  # api_key: "local-key"                    # API key for base_url (falls back to OPENAI_API_KEY)
 ```
 
 **Subagent provider:model override:** By default, subagents inherit the parent agent's provider and model. Set `delegation.provider` and `delegation.model` to route subagents to a different provider:model pair — e.g., use a cheap/fast model for narrowly-scoped subtasks while your primary agent runs an expensive reasoning model.
 
+**Direct endpoint override:** To route subagents straight to a custom OpenAI-compatible endpoint, set `delegation.base_url`, `delegation.api_key`, and `delegation.model`. `delegation.base_url` takes precedence over `delegation.provider`. If `delegation.api_key` is omitted, Hermes falls back to `OPENAI_API_KEY` only; it does not reuse `OPENROUTER_API_KEY`.
+
 The delegation provider uses the same credential resolution as CLI/gateway startup. All configured providers are supported: `openrouter`, `nous`, `zai`, `kimi-coding`, `minimax`, `minimax-cn`. When a provider is set, the system automatically resolves the correct base URL, API key, and API mode — no manual credential wiring needed.
 
-**Precedence:** `delegation.provider` in config → parent provider (inherited). `delegation.model` in config → parent model (inherited). Setting just `model` without `provider` changes only the model name while keeping the parent's credentials (useful for switching models within the same provider like OpenRouter).
+**Precedence:** `delegation.base_url` in config → `delegation.provider` in config → parent provider (inherited). `delegation.model` in config → parent model (inherited). Setting just `model` without `provider` or `base_url` changes only the model name while keeping the parent's credentials (useful for switching models within the same provider like OpenRouter).
 
 ## Clarify
 
diff --git a/website/docs/user-guide/features/delegation.md b/website/docs/user-guide/features/delegation.md
index 78237167..f3193d9a 100644
--- a/website/docs/user-guide/features/delegation.md
+++ b/website/docs/user-guide/features/delegation.md
@@ -209,6 +209,14 @@ Delegation has a **depth limit of 2** — a parent (depth 0) can spawn children
 delegation:
   max_iterations: 50                        # Max turns per child (default: 50)
   default_toolsets: ["terminal", "file", "web"]  # Default toolsets
+  model: "google/gemini-3-flash-preview"             # Optional provider/model override
+  provider: "openrouter"                             # Optional built-in provider
+
+# Alternative: use a direct custom endpoint instead of provider (keep only one `delegation:` block in your config):
+delegation:
+  model: "qwen2.5-coder"
+  base_url: "http://localhost:1234/v1"
+  api_key: "local-key"
 ```
 
 :::tip

From 95939a1b5130c4a04bf67eaacbbb7ea7af5bd3f3 Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Sat, 14 Mar 2026 21:17:41 -0700
Subject: [PATCH 2/4] docs: clarify gateway service scopes (#1378)

---
 cron/__init__.py                              |  3 ++-
 hermes_cli/cron.py                            |  4 +++-
 hermes_cli/main.py                            |  2 +-
 website/docs/guides/daily-briefing-bot.md     |  7 +++++--
 .../docs/guides/team-telegram-assistant.md    | 10 ++++++++--
 website/docs/user-guide/features/cron.md      |  3 ++-
 website/docs/user-guide/messaging/email.md    |  3 ++-
 website/docs/user-guide/messaging/index.md    | 20 +++++++++++++++----
 website/docs/user-guide/messaging/signal.md   |  3 ++-
 website/docs/user-guide/messaging/slack.md    |  3 ++-
 website/docs/user-guide/messaging/whatsapp.md |  3 ++-
 11 files changed, 45 insertions(+), 16 deletions(-)

diff --git a/cron/__init__.py b/cron/__init__.py
index 31d7bf8e..2c44cabf 100644
--- a/cron/__init__.py
+++ b/cron/__init__.py
@@ -7,7 +7,8 @@ This module provides scheduled task execution, allowing the agent to:
 - Execute tasks in isolated sessions (no prior context)
 
 Cron jobs are executed automatically by the gateway daemon:
-    hermes gateway install    # Install as system service (recommended)
+    hermes gateway install    # Install as a user service
+    sudo hermes gateway install --system  # Linux servers: boot-time system service
     hermes gateway            # Or run in foreground
 
 The gateway ticks the scheduler every 60 seconds. A file lock prevents
diff --git a/hermes_cli/cron.py b/hermes_cli/cron.py
index a068d637..97a22579 100644
--- a/hermes_cli/cron.py
+++ b/hermes_cli/cron.py
@@ -96,6 +96,7 @@ def cron_list(show_all: bool = False):
     if not find_gateway_pids():
         print(color("  ⚠  Gateway is not running — jobs won't fire automatically.", Colors.YELLOW))
         print(color("     Start it with: hermes gateway install", Colors.DIM))
+        print(color("                    sudo hermes gateway install --system  # Linux servers", Colors.DIM))
         print()
 
 
@@ -120,7 +121,8 @@ def cron_status():
         print(color("✗ Gateway is not running — cron jobs will NOT fire", Colors.RED))
         print()
         print("  To enable automatic execution:")
-        print("    hermes gateway install    # Install as system service (recommended)")
+        print("    hermes gateway install    # Install as a user service")
+        print("    sudo hermes gateway install --system  # Linux servers: boot-time system service")
         print("    hermes gateway            # Or run in foreground")
 
     print()
diff --git a/hermes_cli/main.py b/hermes_cli/main.py
index 1238d9b6..5f9356b8 100644
--- a/hermes_cli/main.py
+++ b/hermes_cli/main.py
@@ -2313,7 +2313,7 @@ Examples:
     hermes gateway                Run messaging gateway
     hermes -s hermes-agent-dev,github-auth
     hermes -w                     Start in isolated git worktree
-    hermes gateway install        Install as system service
+    hermes gateway install        Install gateway background service
     hermes sessions list          List past sessions
     hermes sessions browse        Interactive session picker
     hermes sessions rename ID T   Rename/title a session
diff --git a/website/docs/guides/daily-briefing-bot.md b/website/docs/guides/daily-briefing-bot.md
index 85f11c40..78bfd690 100644
--- a/website/docs/guides/daily-briefing-bot.md
+++ b/website/docs/guides/daily-briefing-bot.md
@@ -29,7 +29,8 @@ Before starting, make sure you have:
 - **Hermes Agent installed** — see the [Installation guide](/docs/getting-started/installation)
 - **Gateway running** — the gateway daemon handles cron execution:
   ```bash
-  hermes gateway install   # Install as system service (recommended)
+  hermes gateway install   # Install as a user service
+  sudo hermes gateway install --system   # Linux servers: boot-time system service
   # or
   hermes gateway           # Run in foreground
   ```
@@ -242,10 +243,12 @@ Make sure the scheduler is actually running:
 hermes cron status
 ```
 
-If the gateway isn't running, your jobs won't execute. Install it as a system service for reliability:
+If the gateway isn't running, your jobs won't execute. Install it as a background service for reliability:
 
 ```bash
 hermes gateway install
+# or on Linux servers
+sudo hermes gateway install --system
 ```
 
 ## Going Further
diff --git a/website/docs/guides/team-telegram-assistant.md b/website/docs/guides/team-telegram-assistant.md
index 124f4bf3..88de9c70 100644
--- a/website/docs/guides/team-telegram-assistant.md
+++ b/website/docs/guides/team-telegram-assistant.md
@@ -143,12 +143,13 @@ For a persistent deployment that survives reboots:
 
 ```bash
 hermes gateway install
+sudo hermes gateway install --system   # Or (Linux only): install a boot-time system service instead
 ```
 
-This creates a **systemd** service (Linux) or **launchd** service (macOS) that runs automatically.
+This creates a background service: a user-level **systemd** service on Linux by default, a **launchd** service on macOS, or a boot-time Linux system service if you pass `--system`.
 
 ```bash
-# Linux — manage the service
+# Linux — manage the default user service
 hermes gateway start
 hermes gateway stop
 hermes gateway status
@@ -158,6 +159,11 @@ journalctl --user -u hermes-gateway -f
 
 # Keep running after SSH logout
 sudo loginctl enable-linger $USER
+
+# Linux servers — explicit system-service commands
+sudo hermes gateway start --system
+sudo hermes gateway status --system
+journalctl -u hermes-gateway -f
 ```
 
 ```bash
diff --git a/website/docs/user-guide/features/cron.md b/website/docs/user-guide/features/cron.md
index dd7d5606..2d0a4c83 100644
--- a/website/docs/user-guide/features/cron.md
+++ b/website/docs/user-guide/features/cron.md
@@ -156,7 +156,8 @@ What they do:
 **Cron execution is handled by the gateway daemon.** The gateway ticks the scheduler every 60 seconds, running any due jobs in isolated agent sessions.
 
 ```bash
-hermes gateway install     # Install as system service (recommended)
+hermes gateway install     # Install as a user service
+sudo hermes gateway install --system   # Linux: boot-time system service for servers
 hermes gateway             # Or run in foreground
 
 hermes cron list
diff --git a/website/docs/user-guide/messaging/email.md b/website/docs/user-guide/messaging/email.md
index f6746290..8f515e85 100644
--- a/website/docs/user-guide/messaging/email.md
+++ b/website/docs/user-guide/messaging/email.md
@@ -80,7 +80,8 @@ EMAIL_HOME_ADDRESS=your@email.com      # Default delivery target for cron jobs
 
 ```bash
 hermes gateway              # Run in foreground
-hermes gateway install      # Install as a system service
+hermes gateway install      # Install as a user service
+sudo hermes gateway install --system   # Linux only: boot-time system service
 ```
 
 On startup, the adapter:
diff --git a/website/docs/user-guide/messaging/index.md b/website/docs/user-guide/messaging/index.md
index 39c03b74..d71edafe 100644
--- a/website/docs/user-guide/messaging/index.md
+++ b/website/docs/user-guide/messaging/index.md
@@ -54,10 +54,12 @@ This walks you through configuring each platform with arrow-key selection, shows
 ```bash
 hermes gateway              # Run in foreground
 hermes gateway setup        # Configure messaging platforms interactively
-hermes gateway install      # Install as systemd service (Linux) / launchd (macOS)
-hermes gateway start        # Start the service
-hermes gateway stop         # Stop the service
-hermes gateway status       # Check service status
+hermes gateway install      # Install as a user service (Linux) / launchd service (macOS)
+sudo hermes gateway install --system   # Linux only: install a boot-time system service
+hermes gateway start        # Start the default service
+hermes gateway stop         # Stop the default service
+hermes gateway status       # Check default service status
+hermes gateway status --system         # Linux only: inspect the system service explicitly
 ```
 
 ## Chat Commands (Inside Messaging)
@@ -188,8 +190,18 @@ journalctl --user -u hermes-gateway -f
 
 # Enable lingering (keeps running after logout)
 sudo loginctl enable-linger $USER
+
+# Or install a boot-time system service that still runs as your user
+sudo hermes gateway install --system
+sudo hermes gateway start --system
+sudo hermes gateway status --system
+journalctl -u hermes-gateway -f
 ```
 
+Use the user service on laptops and dev boxes. Use the system service on VPS or headless hosts that should come back at boot without relying on systemd linger.
+
+Avoid keeping both the user and system gateway units installed at once unless you really mean to. Hermes will warn if it detects both because start/stop/status behavior gets ambiguous.
+
 ### macOS (launchd)
 
 ```bash
diff --git a/website/docs/user-guide/messaging/signal.md b/website/docs/user-guide/messaging/signal.md
index 53bb862a..e1fd5463 100644
--- a/website/docs/user-guide/messaging/signal.md
+++ b/website/docs/user-guide/messaging/signal.md
@@ -127,7 +127,8 @@ Then start the gateway:
 
 ```bash
 hermes gateway              # Foreground
-hermes gateway install      # Install as a system service
+hermes gateway install      # Install as a user service
+sudo hermes gateway install --system   # Linux only: boot-time system service
 ```
 
 ---
diff --git a/website/docs/user-guide/messaging/slack.md b/website/docs/user-guide/messaging/slack.md
index 2ff79f35..29a14353 100644
--- a/website/docs/user-guide/messaging/slack.md
+++ b/website/docs/user-guide/messaging/slack.md
@@ -168,7 +168,8 @@ Then start the gateway:
 
 ```bash
 hermes gateway              # Foreground
-hermes gateway install      # Install as a system service
+hermes gateway install      # Install as a user service
+sudo hermes gateway install --system   # Linux only: boot-time system service
 ```
 
 ---
diff --git a/website/docs/user-guide/messaging/whatsapp.md b/website/docs/user-guide/messaging/whatsapp.md
index af432fb8..eb741467 100644
--- a/website/docs/user-guide/messaging/whatsapp.md
+++ b/website/docs/user-guide/messaging/whatsapp.md
@@ -101,7 +101,8 @@ Then start the gateway:
 
 ```bash
 hermes gateway              # Foreground
-hermes gateway install      # Install as a system service
+hermes gateway install      # Install as a user service
+sudo hermes gateway install --system   # Linux only: boot-time system service
 ```
 
 The gateway starts the WhatsApp bridge automatically using the saved session.

From ff3473a37c704b86a4809c349f1627bd83f1c4da Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Sat, 14 Mar 2026 21:18:17 -0700
Subject: [PATCH 3/4] feat: add /plan command (#1372)

* feat: add /plan command

* refactor: back /plan with bundled skill

* docs: document /plan skill
---
 agent/skill_commands.py                    |  32 +++++-
 cli.py                                     |  28 +++++
 gateway/run.py                             |  23 +++-
 skills/software-development/plan/SKILL.md  |  55 +++++++++
 tests/agent/test_skill_commands.py         |  38 +++++-
 tests/gateway/test_plan_command.py         | 128 +++++++++++++++++++++
 tests/test_cli_plan_command.py             |  66 +++++++++++
 website/docs/reference/skills-catalog.md   |   1 +
 website/docs/reference/slash-commands.md   |   4 +-
 website/docs/user-guide/features/skills.md |   3 +
 10 files changed, 372 insertions(+), 6 deletions(-)
 create mode 100644 skills/software-development/plan/SKILL.md
 create mode 100644 tests/gateway/test_plan_command.py
 create mode 100644 tests/test_cli_plan_command.py

diff --git a/agent/skill_commands.py b/agent/skill_commands.py
index b9d5135f..8afdfa93 100644
--- a/agent/skill_commands.py
+++ b/agent/skill_commands.py
@@ -1,17 +1,38 @@
-"""Skill slash commands — scan installed skills and build invocation messages.
+"""Shared slash command helpers for skills and built-in prompt-style modes.
 
 Shared between CLI (cli.py) and gateway (gateway/run.py) so both surfaces
-can invoke skills via /skill-name commands.
+can invoke skills via /skill-name commands and prompt-only built-ins like
+/plan.
 """
 
 import json
 import logging
+import os
+import re
+from datetime import datetime
 from pathlib import Path
 from typing import Any, Dict, Optional
 
 logger = logging.getLogger(__name__)
 
 _skill_commands: Dict[str, Dict[str, Any]] = {}
+_PLAN_SLUG_RE = re.compile(r"[^a-z0-9]+")
+
+
+def build_plan_path(
+    user_instruction: str = "",
+    *,
+    now: datetime | None = None,
+) -> Path:
+    """Return the default markdown path for a /plan invocation."""
+    hermes_home = Path(os.getenv("HERMES_HOME", Path.home() / ".hermes"))
+    slug_source = (user_instruction or "").strip().splitlines()[0] if user_instruction else ""
+    slug = _PLAN_SLUG_RE.sub("-", slug_source.lower()).strip("-")
+    if slug:
+        slug = "-".join(part for part in slug.split("-")[:8] if part)[:48].strip("-")
+    slug = slug or "conversation-plan"
+    timestamp = (now or datetime.now()).strftime("%Y-%m-%d_%H%M%S")
+    return hermes_home / "plans" / f"{timestamp}-{slug}.md"
 
 
 def _load_skill_payload(skill_identifier: str, task_id: str | None = None) -> tuple[dict[str, Any], Path | None, str] | None:
@@ -56,6 +77,7 @@ def _build_skill_message(
     skill_dir: Path | None,
     activation_note: str,
     user_instruction: str = "",
+    runtime_note: str = "",
 ) -> str:
     """Format a loaded skill into a user/system message payload."""
     from tools.skills_tool import SKILLS_DIR
@@ -115,6 +137,10 @@ def _build_skill_message(
         parts.append("")
         parts.append(f"The user has provided the following instruction alongside the skill invocation: {user_instruction}")
 
+    if runtime_note:
+        parts.append("")
+        parts.append(f"[Runtime note: {runtime_note}]")
+
     return "\n".join(parts)
 
 
@@ -172,6 +198,7 @@ def build_skill_invocation_message(
     cmd_key: str,
     user_instruction: str = "",
     task_id: str | None = None,
+    runtime_note: str = "",
 ) -> Optional[str]:
     """Build the user message content for a skill slash command invocation.
 
@@ -201,6 +228,7 @@ def build_skill_invocation_message(
         skill_dir,
         activation_note,
         user_instruction=user_instruction,
+        runtime_note=runtime_note,
     )
 
 
diff --git a/cli.py b/cli.py
index 1bebbf4f..654dfb25 100755
--- a/cli.py
+++ b/cli.py
@@ -1080,6 +1080,7 @@ from agent.skill_commands import (
     scan_skill_commands,
     get_skill_commands,
     build_skill_invocation_message,
+    build_plan_path,
     build_preloaded_skills_prompt,
 )
 
@@ -3193,6 +3194,8 @@ class HermesCLI:
         elif cmd_lower.startswith("/personality"):
             # Use original case (handler lowercases the personality name itself)
             self._handle_personality_command(cmd_original)
+        elif cmd_lower == "/plan" or cmd_lower.startswith("/plan "):
+            self._handle_plan_command(cmd_original)
         elif cmd_lower == "/retry":
             retry_msg = self.retry_last()
             if retry_msg and hasattr(self, '_pending_input'):
@@ -3304,6 +3307,31 @@ class HermesCLI:
         
         return True
     
+    def _handle_plan_command(self, cmd: str):
+        """Handle /plan [request]: queue the bundled plan skill prompt; announce it only once queued."""
+        parts = cmd.strip().split(maxsplit=1)
+        user_instruction = parts[1].strip() if len(parts) > 1 else ""
+
+        plan_path = build_plan_path(user_instruction)
+        msg = build_skill_invocation_message(
+            "/plan",
+            user_instruction,
+            task_id=self.session_id,
+            runtime_note=(
+                f"Save the markdown plan with write_file to this exact path: {plan_path}"
+            ),
+        )
+
+        if not msg:
+            self.console.print("[bold red]Failed to load the bundled /plan skill[/]")
+            return
+
+        if hasattr(self, '_pending_input'):
+            self._pending_input.put(msg)
+            _cprint(f"  📝 Plan mode queued via skill. Markdown plan target: {plan_path}")
+        else:
+            self.console.print("[bold red]Plan mode unavailable: input queue not initialized[/]")
+    
     def _handle_background_command(self, cmd: str):
         """Handle /background <prompt> — run a prompt in a separate background session.
 
diff --git a/gateway/run.py b/gateway/run.py
index 8941fcec..c8c5831e 100644
--- a/gateway/run.py
+++ b/gateway/run.py
@@ -1114,7 +1114,7 @@ class GatewayRunner:
         
         # Emit command:* hook for any recognized slash command
         _known_commands = {"new", "reset", "help", "status", "stop", "model", "reasoning",
-                          "personality", "retry", "undo", "sethome", "set-home",
+                          "personality", "plan", "retry", "undo", "sethome", "set-home",
                           "compress", "usage", "insights", "reload-mcp", "reload_mcp",
                           "update", "title", "resume", "provider", "rollback",
                           "background", "reasoning", "voice"}
@@ -1149,6 +1149,27 @@ class GatewayRunner:
         
         if command == "personality":
             return await self._handle_personality_command(event)
+
+        if command == "plan":
+            try:
+                from agent.skill_commands import build_plan_path, build_skill_invocation_message
+
+                user_instruction = event.get_command_args().strip()
+                plan_path = build_plan_path(user_instruction)
+                event.text = build_skill_invocation_message(
+                    "/plan",
+                    user_instruction,
+                    task_id=_quick_key,
+                    runtime_note=(
+                        f"Save the markdown plan with write_file to this exact path: {plan_path}"
+                    ),
+                )
+                if not event.text:
+                    return "Failed to load the bundled /plan skill."
+                command = None
+            except Exception as e:
+                logger.exception("Failed to prepare /plan command")
+                return f"Failed to enter plan mode: {e}"
         
         if command == "retry":
             return await self._handle_retry_command(event)
diff --git a/skills/software-development/plan/SKILL.md b/skills/software-development/plan/SKILL.md
new file mode 100644
index 00000000..92f39e8c
--- /dev/null
+++ b/skills/software-development/plan/SKILL.md
@@ -0,0 +1,55 @@
+---
+name: plan
+description: Plan mode for Hermes — inspect context, write a markdown plan, save it under $HERMES_HOME/plans, and do not execute the work.
+version: 1.0.0
+author: Hermes Agent
+license: MIT
+metadata:
+  hermes:
+    tags: [planning, plan-mode, implementation, workflow]
+    related_skills: [writing-plans, subagent-driven-development]
+---
+
+# Plan Mode
+
+Use this skill when the user wants a plan instead of execution.
+
+## Core behavior
+
+For this turn, you are planning only.
+
+- Do not implement code.
+- Do not edit project files except the plan markdown file.
+- Do not run mutating terminal commands, commit, push, or perform external actions.
+- You may inspect the repo or other context with read-only commands/tools when needed.
+- Your deliverable is a markdown plan saved to `$HERMES_HOME/plans`.
+
+## Output requirements
+
+Write a markdown plan that is concrete and actionable.
+
+Include, when relevant:
+- Goal
+- Current context / assumptions
+- Proposed approach
+- Step-by-step plan
+- Files likely to change
+- Tests / validation
+- Risks, tradeoffs, and open questions
+
+If the task is code-related, include exact file paths, likely test targets, and verification steps.
+
+## Save location
+
+Save the plan with `write_file` under:
+- `$HERMES_HOME/plans/YYYY-MM-DD_HHMMSS-<slug>.md`
+
+If the runtime provides a specific target path, use that exact path.
+If not, create a sensible timestamped filename yourself.
+
+## Interaction style
+
+- If the request is clear enough, write the plan directly.
+- If no explicit instruction accompanies `/plan`, infer the task from the current conversation context.
+- If the task is genuinely underspecified even with that context, ask a brief clarifying question instead of guessing.
+- After saving the plan, reply briefly with what you planned and the saved path.
diff --git a/tests/agent/test_skill_commands.py b/tests/agent/test_skill_commands.py
index 42a6fb4d..8daa7b36 100644
--- a/tests/agent/test_skill_commands.py
+++ b/tests/agent/test_skill_commands.py
@@ -1,13 +1,15 @@
 """Tests for agent/skill_commands.py — skill slash command scanning and platform filtering."""
 
 import os
+from datetime import datetime
 from unittest.mock import patch
 
 import tools.skills_tool as skills_tool_module
 from agent.skill_commands import (
-    scan_skill_commands,
-    build_skill_invocation_message,
+    build_plan_path,
     build_preloaded_skills_prompt,
+    build_skill_invocation_message,
+    scan_skill_commands,
 )
 
 
@@ -272,3 +274,35 @@ Generate some audio.
 
         assert msg is not None
         assert 'file_path="<path>"' in msg
+
+
+class TestPlanSkillHelpers:
+    def test_build_plan_path_uses_hermes_home_and_slugifies_request(self, tmp_path, monkeypatch):
+        monkeypatch.setenv("HERMES_HOME", str(tmp_path))
+
+        path = build_plan_path(
+            "Implement OAuth login + refresh tokens!",
+            now=datetime(2026, 3, 15, 9, 30, 45),
+        )
+
+        assert path == tmp_path / "plans" / "2026-03-15_093045-implement-oauth-login-refresh-tokens.md"
+
+    def test_plan_skill_message_can_include_runtime_save_path_note(self, tmp_path):
+        with patch("tools.skills_tool.SKILLS_DIR", tmp_path):
+            _make_skill(
+                tmp_path,
+                "plan",
+                body="Save plans under $HERMES_HOME/plans and do not execute the work.",
+            )
+            scan_skill_commands()
+            msg = build_skill_invocation_message(
+                "/plan",
+                "Add a /plan command",
+                runtime_note="Save the markdown plan with write_file to /tmp/plans/plan.md",
+            )
+
+        assert msg is not None
+        assert "Save plans under $HERMES_HOME/plans" in msg
+        assert "Add a /plan command" in msg
+        assert "/tmp/plans/plan.md" in msg
+        assert "Runtime note:" in msg
diff --git a/tests/gateway/test_plan_command.py b/tests/gateway/test_plan_command.py
new file mode 100644
index 00000000..2cfea42e
--- /dev/null
+++ b/tests/gateway/test_plan_command.py
@@ -0,0 +1,128 @@
+"""Tests for the /plan gateway slash command."""
+
+from datetime import datetime
+from types import SimpleNamespace
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+
+from agent.skill_commands import scan_skill_commands
+from gateway.config import GatewayConfig, Platform, PlatformConfig
+from gateway.platforms.base import MessageEvent
+from gateway.session import SessionEntry, SessionSource
+
+
+def _make_runner():
+    from gateway.run import GatewayRunner
+
+    runner = object.__new__(GatewayRunner)
+    runner.config = GatewayConfig(
+        platforms={Platform.TELEGRAM: PlatformConfig(enabled=True, token="***")}
+    )
+    runner.adapters = {}
+    runner._voice_mode = {}
+    runner.hooks = SimpleNamespace(emit=AsyncMock(), loaded_hooks=False)
+    runner.session_store = MagicMock()
+    runner.session_store.get_or_create_session.return_value = SessionEntry(
+        session_key="agent:main:telegram:dm:c1:u1",
+        session_id="sess-1",
+        created_at=datetime.now(),
+        updated_at=datetime.now(),
+        platform=Platform.TELEGRAM,
+        chat_type="dm",
+    )
+    runner.session_store.load_transcript.return_value = []
+    runner.session_store.has_any_sessions.return_value = True
+    runner.session_store.append_to_transcript = MagicMock()
+    runner.session_store.rewrite_transcript = MagicMock()
+    runner._running_agents = {}
+    runner._pending_messages = {}
+    runner._pending_approvals = {}
+    runner._session_db = None
+    runner._reasoning_config = None
+    runner._provider_routing = {}
+    runner._fallback_model = None
+    runner._show_reasoning = False
+    runner._is_user_authorized = lambda _source: True
+    runner._set_session_env = lambda _context: None
+    runner._run_agent = AsyncMock(
+        return_value={
+            "final_response": "planned",
+            "messages": [],
+            "tools": [],
+            "history_offset": 0,
+            "last_prompt_tokens": 0,
+        }
+    )
+    return runner
+
+
+def _make_event(text="/plan"):
+    return MessageEvent(
+        text=text,
+        source=SessionSource(
+            platform=Platform.TELEGRAM,
+            user_id="u1",
+            chat_id="c1",
+            user_name="tester",
+            chat_type="dm",
+        ),
+        message_id="m1",
+    )
+
+
+def _make_plan_skill(skills_dir):
+    skill_dir = skills_dir / "plan"
+    skill_dir.mkdir(parents=True, exist_ok=True)
+    (skill_dir / "SKILL.md").write_text(
+        """---
+name: plan
+description: Plan mode skill.
+---
+
+# Plan
+
+Use the current conversation context when no explicit instruction is provided.
+Save plans under $HERMES_HOME/plans.
+"""
+    )
+
+
+class TestGatewayPlanCommand:
+    @pytest.mark.asyncio
+    async def test_plan_command_loads_skill_and_runs_agent(self, monkeypatch, tmp_path):
+        import gateway.run as gateway_run
+
+        runner = _make_runner()
+        event = _make_event("/plan Add OAuth login")
+
+        monkeypatch.setenv("HERMES_HOME", str(tmp_path))
+        monkeypatch.setattr(gateway_run, "_resolve_runtime_agent_kwargs", lambda: {"api_key": "***"})
+        monkeypatch.setattr(
+            "agent.model_metadata.get_model_context_length",
+            lambda *_args, **_kwargs: 100_000,
+        )
+
+        with patch("tools.skills_tool.SKILLS_DIR", tmp_path):
+            _make_plan_skill(tmp_path)
+            scan_skill_commands()
+            result = await runner._handle_message(event)
+
+        assert result == "planned"
+        forwarded = runner._run_agent.call_args.kwargs["message"]
+        assert "Plan mode skill" in forwarded
+        assert "Add OAuth login" in forwarded
+        assert str(tmp_path / "plans") in forwarded
+        assert "Runtime note:" in forwarded
+
+    @pytest.mark.asyncio
+    async def test_plan_command_appears_in_help_output_via_skill_listing(self, tmp_path):
+        runner = _make_runner()
+        event = _make_event("/help")
+
+        with patch("tools.skills_tool.SKILLS_DIR", tmp_path):
+            _make_plan_skill(tmp_path)
+            scan_skill_commands()
+            result = await runner._handle_help_command(event)
+
+        assert "/plan" in result
diff --git a/tests/test_cli_plan_command.py b/tests/test_cli_plan_command.py
new file mode 100644
index 00000000..50fa1c5e
--- /dev/null
+++ b/tests/test_cli_plan_command.py
@@ -0,0 +1,66 @@
+"""Tests for the /plan CLI slash command."""
+
+from unittest.mock import MagicMock, patch
+
+from agent.skill_commands import scan_skill_commands
+from cli import HermesCLI
+
+
+def _make_cli():
+    cli_obj = HermesCLI.__new__(HermesCLI)
+    cli_obj.config = {}
+    cli_obj.console = MagicMock()
+    cli_obj.agent = None
+    cli_obj.conversation_history = []
+    cli_obj.session_id = "sess-123"
+    cli_obj._pending_input = MagicMock()
+    return cli_obj
+
+
+def _make_plan_skill(skills_dir):
+    skill_dir = skills_dir / "plan"
+    skill_dir.mkdir(parents=True, exist_ok=True)
+    (skill_dir / "SKILL.md").write_text(
+        """---
+name: plan
+description: Plan mode skill.
+---
+
+# Plan
+
+Use the current conversation context when no explicit instruction is provided.
+Save plans under $HERMES_HOME/plans.
+"""
+    )
+
+
+class TestCLIPlanCommand:
+    def test_plan_command_queues_plan_skill_message(self, tmp_path, monkeypatch):
+        monkeypatch.setenv("HERMES_HOME", str(tmp_path))
+        cli_obj = _make_cli()
+
+        with patch("tools.skills_tool.SKILLS_DIR", tmp_path):
+            _make_plan_skill(tmp_path)
+            scan_skill_commands()
+            result = cli_obj.process_command("/plan Add OAuth login")
+
+        assert result is True
+        cli_obj._pending_input.put.assert_called_once()
+        queued = cli_obj._pending_input.put.call_args[0][0]
+        assert "Plan mode skill" in queued
+        assert "Add OAuth login" in queued
+        assert str(tmp_path / "plans") in queued
+        assert "Runtime note:" in queued
+
+    def test_plan_without_args_uses_skill_context_guidance(self, tmp_path, monkeypatch):
+        monkeypatch.setenv("HERMES_HOME", str(tmp_path))
+        cli_obj = _make_cli()
+
+        with patch("tools.skills_tool.SKILLS_DIR", tmp_path):
+            _make_plan_skill(tmp_path)
+            scan_skill_commands()
+            cli_obj.process_command("/plan")
+
+        queued = cli_obj._pending_input.put.call_args[0][0]
+        assert "current conversation context" in queued
+        assert "conversation-plan" in queued
diff --git a/website/docs/reference/skills-catalog.md b/website/docs/reference/skills-catalog.md
index 1be8a5f3..a6eb510e 100644
--- a/website/docs/reference/skills-catalog.md
+++ b/website/docs/reference/skills-catalog.md
@@ -236,6 +236,7 @@ Skills for controlling smart home devices — lights, switches, sensors, and hom
 | Skill | Description | Path |
 |-------|-------------|------|
 | `code-review` | Guidelines for performing thorough code reviews with security and quality focus | `software-development/code-review` |
+| `plan` | Plan mode for Hermes — inspect context, write a markdown plan, save it under `$HERMES_HOME/plans`, and do not execute the work. | `software-development/plan` |
 | `requesting-code-review` | Use when completing tasks, implementing major features, or before merging. Validates work meets requirements through systematic review process. | `software-development/requesting-code-review` |
 | `subagent-driven-development` | Use when executing implementation plans with independent tasks. Dispatches fresh delegate_task per task with two-stage review (spec compliance then code quality). | `software-development/subagent-driven-development` |
 | `systematic-debugging` | Use when encountering any bug, test failure, or unexpected behavior. 4-phase root cause investigation — NO fixes without understanding the problem first. | `software-development/systematic-debugging` |
diff --git a/website/docs/reference/slash-commands.md b/website/docs/reference/slash-commands.md
index 9ef45460..a9e9f420 100644
--- a/website/docs/reference/slash-commands.md
+++ b/website/docs/reference/slash-commands.md
@@ -11,7 +11,7 @@ Hermes has two slash-command surfaces:
 - **Interactive CLI slash commands** — handled by `cli.py` / `hermes_cli/commands.py`
 - **Messaging slash commands** — handled by `gateway/run.py`
 
-Installed skills are also exposed as dynamic slash commands on both surfaces.
+Installed skills are also exposed as dynamic slash commands on both surfaces. That includes bundled skills like `/plan`, which opens plan mode and saves markdown plans under `~/.hermes/plans/`.
 
 ## Interactive CLI slash commands
 
@@ -32,6 +32,7 @@ Type `/` in the CLI to open the autocomplete menu. Built-in commands are case-in
 | `/compress` | Manually compress conversation context (flush memories + summarize) |
 | `/rollback` | List or restore filesystem checkpoints (usage: /rollback [number]) |
 | `/background` | Run a prompt in the background (usage: /background &lt;prompt&gt;) |
+| `/plan [request]` | Load the bundled `plan` skill to write a markdown plan instead of executing the work. Plans are saved under `~/.hermes/plans/`. |
 
 ### Configuration
 
@@ -109,6 +110,7 @@ The messaging gateway supports the following built-in commands inside Telegram,
 | `/voice [on\|off\|tts\|join\|channel\|leave\|status]` | Control spoken replies in chat. `join`/`channel`/`leave` manage Discord voice-channel mode. |
 | `/rollback [number]` | List or restore filesystem checkpoints. |
 | `/background &lt;prompt&gt;` | Run a prompt in a separate background session. |
+| `/plan [request]` | Load the bundled `plan` skill to write a markdown plan instead of executing the work. Plans are saved under `~/.hermes/plans/`. |
 | `/reload-mcp` | Reload MCP servers from config. |
 | `/update` | Update Hermes Agent to the latest version. |
 | `/help` | Show messaging help. |
diff --git a/website/docs/user-guide/features/skills.md b/website/docs/user-guide/features/skills.md
index ae2d6f05..bf40f5e0 100644
--- a/website/docs/user-guide/features/skills.md
+++ b/website/docs/user-guide/features/skills.md
@@ -24,11 +24,14 @@ Every installed skill is automatically available as a slash command:
 /gif-search funny cats
 /axolotl help me fine-tune Llama 3 on my dataset
 /github-pr-workflow create a PR for the auth refactor
+/plan design a rollout for migrating our auth provider
 
 # Just the skill name loads it and lets the agent ask what you need:
 /excalidraw
 ```
 
+The bundled `plan` skill is a good example of a skill-backed slash command with custom behavior. Running `/plan [request]` tells Hermes to inspect context if needed, write a markdown implementation plan instead of executing the task, and save the result under `~/.hermes/plans/`.
+
 You can also interact with skills through natural conversation:
 
 ```bash

From b14a07315b5f9420f4396085501d743a01352c8e Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Sat, 14 Mar 2026 21:28:51 -0700
Subject: [PATCH 4/4] fix: save /plan output in workspace (#1381)

---
 agent/skill_commands.py                    | 12 ++++++++----
 cli.py                                     |  3 ++-
 gateway/run.py                             |  3 ++-
 skills/software-development/plan/SKILL.md  | 10 ++++++----
 tests/agent/test_skill_commands.py         | 19 +++++++++++--------
 tests/gateway/test_plan_command.py         |  7 ++++---
 tests/test_cli_plan_command.py             | 11 ++++++-----
 website/docs/reference/skills-catalog.md   |  2 +-
 website/docs/reference/slash-commands.md   |  6 +++---
 website/docs/user-guide/features/skills.md |  2 +-
 10 files changed, 44 insertions(+), 31 deletions(-)

diff --git a/agent/skill_commands.py b/agent/skill_commands.py
index 8afdfa93..67315ee8 100644
--- a/agent/skill_commands.py
+++ b/agent/skill_commands.py
@@ -7,7 +7,6 @@ can invoke skills via /skill-name commands and prompt-only built-ins like
 
 import json
 import logging
-import os
 import re
 from datetime import datetime
 from pathlib import Path
@@ -24,15 +23,21 @@ def build_plan_path(
     *,
     now: datetime | None = None,
 ) -> Path:
-    """Return the default markdown path for a /plan invocation."""
-    hermes_home = Path(os.getenv("HERMES_HOME", Path.home() / ".hermes"))
+    """Return the default workspace-relative markdown path for a /plan invocation.
+
+    Relative paths are intentional: file tools are task/backend-aware and resolve
+    them against the active working directory for local, docker, ssh, modal,
+    daytona, and similar terminal backends. That keeps the plan with the active
+    workspace instead of the Hermes host's global home directory.
+    """
-    slug_source = (user_instruction or "").strip().splitlines()[0] if user_instruction else ""
+    # Whitespace-only input strips to "", whose splitlines() is empty; fall back to "".
+    slug_source = next(iter((user_instruction or "").strip().splitlines()), "")
     slug = _PLAN_SLUG_RE.sub("-", slug_source.lower()).strip("-")
     if slug:
         slug = "-".join(part for part in slug.split("-")[:8] if part)[:48].strip("-")
     slug = slug or "conversation-plan"
     timestamp = (now or datetime.now()).strftime("%Y-%m-%d_%H%M%S")
-    return hermes_home / "plans" / f"{timestamp}-{slug}.md"
+    return Path(".hermes") / "plans" / f"{timestamp}-{slug}.md"
 
 
 def _load_skill_payload(skill_identifier: str, task_id: str | None = None) -> tuple[dict[str, Any], Path | None, str] | None:
diff --git a/cli.py b/cli.py
index 654dfb25..70a202d3 100755
--- a/cli.py
+++ b/cli.py
@@ -3318,7 +3318,8 @@ class HermesCLI:
             user_instruction,
             task_id=self.session_id,
             runtime_note=(
-                f"Save the markdown plan with write_file to this exact path: {plan_path}"
+                "Save the markdown plan with write_file to this exact relative path "
+                f"inside the active workspace/backend cwd: {plan_path}"
             ),
         )
 
diff --git a/gateway/run.py b/gateway/run.py
index c8c5831e..67e93d2c 100644
--- a/gateway/run.py
+++ b/gateway/run.py
@@ -1161,7 +1161,8 @@ class GatewayRunner:
                     user_instruction,
                     task_id=_quick_key,
                     runtime_note=(
-                        f"Save the markdown plan with write_file to this exact path: {plan_path}"
+                        "Save the markdown plan with write_file to this exact relative path "
+                        f"inside the active workspace/backend cwd: {plan_path}"
                     ),
                 )
                 if not event.text:
diff --git a/skills/software-development/plan/SKILL.md b/skills/software-development/plan/SKILL.md
index 92f39e8c..daf6bf79 100644
--- a/skills/software-development/plan/SKILL.md
+++ b/skills/software-development/plan/SKILL.md
@@ -1,6 +1,6 @@
 ---
 name: plan
-description: Plan mode for Hermes — inspect context, write a markdown plan, save it under $HERMES_HOME/plans, and do not execute the work.
+description: Plan mode for Hermes — inspect context, write a markdown plan into the active workspace's `.hermes/plans/` directory, and do not execute the work.
 version: 1.0.0
 author: Hermes Agent
 license: MIT
@@ -22,7 +22,7 @@ For this turn, you are planning only.
 - Do not edit project files except the plan markdown file.
 - Do not run mutating terminal commands, commit, push, or perform external actions.
 - You may inspect the repo or other context with read-only commands/tools when needed.
-- Your deliverable is a markdown plan saved to `$HERMES_HOME/plans`.
+- Your deliverable is a markdown plan saved inside the active workspace under `.hermes/plans/`.
 
 ## Output requirements
 
@@ -42,10 +42,12 @@ If the task is code-related, include exact file paths, likely test targets, and
 ## Save location
 
 Save the plan with `write_file` under:
-- `$HERMES_HOME/plans/YYYY-MM-DD_HHMMSS-<slug>.md`
+- `.hermes/plans/YYYY-MM-DD_HHMMSS-<slug>.md`
+
+Treat that as relative to the active working directory / backend workspace. Hermes file tools are backend-aware, so using this relative path keeps the plan with the workspace on local, docker, ssh, modal, and daytona backends.
 
 If the runtime provides a specific target path, use that exact path.
-If not, create a sensible timestamped filename yourself.
+If not, create a sensible timestamped filename yourself under `.hermes/plans/`.
 
 ## Interaction style
 
diff --git a/tests/agent/test_skill_commands.py b/tests/agent/test_skill_commands.py
index 8daa7b36..c0244613 100644
--- a/tests/agent/test_skill_commands.py
+++ b/tests/agent/test_skill_commands.py
@@ -2,6 +2,7 @@
 
 import os
 from datetime import datetime
+from pathlib import Path
 from unittest.mock import patch
 
 import tools.skills_tool as skills_tool_module
@@ -277,32 +278,34 @@ Generate some audio.
 
 
 class TestPlanSkillHelpers:
-    def test_build_plan_path_uses_hermes_home_and_slugifies_request(self, tmp_path, monkeypatch):
-        monkeypatch.setenv("HERMES_HOME", str(tmp_path))
-
+    def test_build_plan_path_uses_workspace_relative_dir_and_slugifies_request(self):
         path = build_plan_path(
             "Implement OAuth login + refresh tokens!",
             now=datetime(2026, 3, 15, 9, 30, 45),
         )
 
-        assert path == tmp_path / "plans" / "2026-03-15_093045-implement-oauth-login-refresh-tokens.md"
+        assert path == Path(".hermes") / "plans" / "2026-03-15_093045-implement-oauth-login-refresh-tokens.md"
 
     def test_plan_skill_message_can_include_runtime_save_path_note(self, tmp_path):
         with patch("tools.skills_tool.SKILLS_DIR", tmp_path):
             _make_skill(
                 tmp_path,
                 "plan",
-                body="Save plans under $HERMES_HOME/plans and do not execute the work.",
+                body="Save plans under .hermes/plans in the active workspace and do not execute the work.",
             )
             scan_skill_commands()
             msg = build_skill_invocation_message(
                 "/plan",
                 "Add a /plan command",
-                runtime_note="Save the markdown plan with write_file to /tmp/plans/plan.md",
+                runtime_note=(
+                    "Save the markdown plan with write_file to this exact relative path inside "
+                    "the active workspace/backend cwd: .hermes/plans/plan.md"
+                ),
             )
 
         assert msg is not None
-        assert "Save plans under $HERMES_HOME/plans" in msg
+        assert "Save plans under $HERMES_HOME/plans" not in msg
+        assert ".hermes/plans" in msg
         assert "Add a /plan command" in msg
-        assert "/tmp/plans/plan.md" in msg
+        assert ".hermes/plans/plan.md" in msg
         assert "Runtime note:" in msg
diff --git a/tests/gateway/test_plan_command.py b/tests/gateway/test_plan_command.py
index 2cfea42e..d43f46cd 100644
--- a/tests/gateway/test_plan_command.py
+++ b/tests/gateway/test_plan_command.py
@@ -83,7 +83,7 @@ description: Plan mode skill.
 # Plan
 
 Use the current conversation context when no explicit instruction is provided.
-Save plans under $HERMES_HOME/plans.
+Save plans under the active workspace's .hermes/plans directory.
 """
     )
 
@@ -96,7 +96,6 @@ class TestGatewayPlanCommand:
         runner = _make_runner()
         event = _make_event("/plan Add OAuth login")
 
-        monkeypatch.setenv("HERMES_HOME", str(tmp_path))
         monkeypatch.setattr(gateway_run, "_resolve_runtime_agent_kwargs", lambda: {"api_key": "***"})
         monkeypatch.setattr(
             "agent.model_metadata.get_model_context_length",
@@ -112,7 +111,9 @@ class TestGatewayPlanCommand:
         forwarded = runner._run_agent.call_args.kwargs["message"]
         assert "Plan mode skill" in forwarded
         assert "Add OAuth login" in forwarded
-        assert str(tmp_path / "plans") in forwarded
+        assert ".hermes/plans" in forwarded
+        assert str(tmp_path / "plans") not in forwarded
+        assert "active workspace/backend cwd" in forwarded
         assert "Runtime note:" in forwarded
 
     @pytest.mark.asyncio
diff --git a/tests/test_cli_plan_command.py b/tests/test_cli_plan_command.py
index 50fa1c5e..8f8205d7 100644
--- a/tests/test_cli_plan_command.py
+++ b/tests/test_cli_plan_command.py
@@ -29,14 +29,13 @@ description: Plan mode skill.
 # Plan
 
 Use the current conversation context when no explicit instruction is provided.
-Save plans under $HERMES_HOME/plans.
+Save plans under the active workspace's .hermes/plans directory.
 """
     )
 
 
 class TestCLIPlanCommand:
     def test_plan_command_queues_plan_skill_message(self, tmp_path, monkeypatch):
-        monkeypatch.setenv("HERMES_HOME", str(tmp_path))
         cli_obj = _make_cli()
 
         with patch("tools.skills_tool.SKILLS_DIR", tmp_path):
@@ -49,11 +48,12 @@ class TestCLIPlanCommand:
         queued = cli_obj._pending_input.put.call_args[0][0]
         assert "Plan mode skill" in queued
         assert "Add OAuth login" in queued
-        assert str(tmp_path / "plans") in queued
+        assert ".hermes/plans" in queued
+        assert str(tmp_path / "plans") not in queued
+        assert "active workspace/backend cwd" in queued
         assert "Runtime note:" in queued
 
     def test_plan_without_args_uses_skill_context_guidance(self, tmp_path, monkeypatch):
-        monkeypatch.setenv("HERMES_HOME", str(tmp_path))
         cli_obj = _make_cli()
 
         with patch("tools.skills_tool.SKILLS_DIR", tmp_path):
@@ -63,4 +63,5 @@ class TestCLIPlanCommand:
 
         queued = cli_obj._pending_input.put.call_args[0][0]
         assert "current conversation context" in queued
-        assert "conversation-plan" in queued
+        assert ".hermes/plans/" in queued
+        assert "conversation-plan.md" in queued
diff --git a/website/docs/reference/skills-catalog.md b/website/docs/reference/skills-catalog.md
index a6eb510e..7e128f11 100644
--- a/website/docs/reference/skills-catalog.md
+++ b/website/docs/reference/skills-catalog.md
@@ -236,7 +236,7 @@ Skills for controlling smart home devices — lights, switches, sensors, and hom
 | Skill | Description | Path |
 |-------|-------------|------|
 | `code-review` | Guidelines for performing thorough code reviews with security and quality focus | `software-development/code-review` |
-| `plan` | Plan mode for Hermes — inspect context, write a markdown plan, save it under `$HERMES_HOME/plans`, and do not execute the work. | `software-development/plan` |
+| `plan` | Plan mode for Hermes — inspect context, write a markdown plan into `.hermes/plans/` in the active workspace/backend working directory, and do not execute the work. | `software-development/plan` |
 | `requesting-code-review` | Use when completing tasks, implementing major features, or before merging. Validates work meets requirements through systematic review process. | `software-development/requesting-code-review` |
 | `subagent-driven-development` | Use when executing implementation plans with independent tasks. Dispatches fresh delegate_task per task with two-stage review (spec compliance then code quality). | `software-development/subagent-driven-development` |
 | `systematic-debugging` | Use when encountering any bug, test failure, or unexpected behavior. 4-phase root cause investigation — NO fixes without understanding the problem first. | `software-development/systematic-debugging` |
diff --git a/website/docs/reference/slash-commands.md b/website/docs/reference/slash-commands.md
index a9e9f420..d69d1c75 100644
--- a/website/docs/reference/slash-commands.md
+++ b/website/docs/reference/slash-commands.md
@@ -11,7 +11,7 @@ Hermes has two slash-command surfaces:
 - **Interactive CLI slash commands** — handled by `cli.py` / `hermes_cli/commands.py`
 - **Messaging slash commands** — handled by `gateway/run.py`
 
-Installed skills are also exposed as dynamic slash commands on both surfaces. That includes bundled skills like `/plan`, which opens plan mode and saves markdown plans under `~/.hermes/plans/`.
+Installed skills are also exposed as dynamic slash commands on both surfaces. That includes bundled skills like `/plan`, which opens plan mode and saves markdown plans under `.hermes/plans/` relative to the active workspace/backend working directory.
 
 ## Interactive CLI slash commands
 
@@ -32,7 +32,7 @@ Type `/` in the CLI to open the autocomplete menu. Built-in commands are case-in
 | `/compress` | Manually compress conversation context (flush memories + summarize) |
 | `/rollback` | List or restore filesystem checkpoints (usage: /rollback [number]) |
 | `/background` | Run a prompt in the background (usage: /background &lt;prompt&gt;) |
-| `/plan [request]` | Load the bundled `plan` skill to write a markdown plan instead of executing the work. Plans are saved under `~/.hermes/plans/`. |
+| `/plan [request]` | Load the bundled `plan` skill to write a markdown plan instead of executing the work. Plans are saved under `.hermes/plans/` relative to the active workspace/backend working directory. |
 
 ### Configuration
 
@@ -110,7 +110,7 @@ The messaging gateway supports the following built-in commands inside Telegram,
 | `/voice [on\|off\|tts\|join\|channel\|leave\|status]` | Control spoken replies in chat. `join`/`channel`/`leave` manage Discord voice-channel mode. |
 | `/rollback [number]` | List or restore filesystem checkpoints. |
 | `/background &lt;prompt&gt;` | Run a prompt in a separate background session. |
-| `/plan [request]` | Load the bundled `plan` skill to write a markdown plan instead of executing the work. Plans are saved under `~/.hermes/plans/`. |
+| `/plan [request]` | Load the bundled `plan` skill to write a markdown plan instead of executing the work. Plans are saved under `.hermes/plans/` relative to the active workspace/backend working directory. |
 | `/reload-mcp` | Reload MCP servers from config. |
 | `/update` | Update Hermes Agent to the latest version. |
 | `/help` | Show messaging help. |
diff --git a/website/docs/user-guide/features/skills.md b/website/docs/user-guide/features/skills.md
index bf40f5e0..f9073ce7 100644
--- a/website/docs/user-guide/features/skills.md
+++ b/website/docs/user-guide/features/skills.md
@@ -30,7 +30,7 @@ Every installed skill is automatically available as a slash command:
 /excalidraw
 ```
 
-The bundled `plan` skill is a good example of a skill-backed slash command with custom behavior. Running `/plan [request]` tells Hermes to inspect context if needed, write a markdown implementation plan instead of executing the task, and save the result under `~/.hermes/plans/`.
+The bundled `plan` skill is a good example of a skill-backed slash command with custom behavior. Running `/plan [request]` tells Hermes to inspect context if needed, write a markdown implementation plan instead of executing the task, and save the result under `.hermes/plans/` relative to the active workspace/backend working directory.
 
 You can also interact with skills through natural conversation: