merge: resolve conflicts with main (show_cost, turn routing, docker docs)

2026-03-16 14:22:38 -07:00 · 2026-03-16 14:22:38 -07:00 · f4d61c168b
commit f4d61c168b
parent 8feb9e4656 5e5c92663d
59 changed files with 4528 additions and 607 deletions
--- a/gateway/platforms/email.py
+++ b/gateway/platforms/email.py
@ -135,14 +135,23 @@ def _extract_email_address(raw: str) -> str:
    return raw.strip().lower()


-def _extract_attachments(msg: email_lib.message.Message) -> List[Dict[str, Any]]:
-    """Extract attachment metadata and cache files locally."""
+def _extract_attachments(
+    msg: email_lib.message.Message,
+    skip_attachments: bool = False,
+) -> List[Dict[str, Any]]:
+    """Extract attachment metadata and cache files locally.
+
+    When *skip_attachments* is True, all attachment/inline parts are ignored
+    (useful for malware protection or bandwidth savings).
+    """
    attachments = []
    if not msg.is_multipart():
        return attachments

    for part in msg.walk():
        disposition = str(part.get("Content-Disposition", ""))
+        if skip_attachments and ("attachment" in disposition or "inline" in disposition):
+            continue
        if "attachment" not in disposition and "inline" not in disposition:
            continue
        # Skip text/plain and text/html body parts
@ -196,6 +205,13 @@ class EmailAdapter(BasePlatformAdapter):
        self._smtp_port = int(os.getenv("EMAIL_SMTP_PORT", "587"))
        self._poll_interval = int(os.getenv("EMAIL_POLL_INTERVAL", "15"))

+        # Skip attachments — configured via config.yaml:
+        #   platforms:
+        #     email:
+        #       skip_attachments: true
+        extra = config.extra or {}
+        self._skip_attachments = extra.get("skip_attachments", False)
+
        # Track message IDs we've already processed to avoid duplicates
        self._seen_uids: set = set()
        self._poll_task: Optional[asyncio.Task] = None
@ -306,7 +322,7 @@ class EmailAdapter(BasePlatformAdapter):
                message_id = msg.get("Message-ID", "")
                in_reply_to = msg.get("In-Reply-To", "")
                body = _extract_text_body(msg)
-                attachments = _extract_attachments(msg)
+                attachments = _extract_attachments(msg, skip_attachments=self._skip_attachments)

                results.append({
                    "uid": uid,
--- a/gateway/platforms/telegram.py
+++ b/gateway/platforms/telegram.py
@ -202,8 +202,26 @@ class TelegramAdapter(BasePlatformAdapter):
                self._handle_media_message
            ))
            
-            # Start polling in background
-            await self._app.initialize()
+            # Start polling — retry initialize() for transient TLS resets
+            try:
+                from telegram.error import NetworkError, TimedOut
+            except ImportError:
+                NetworkError = TimedOut = OSError  # type: ignore[misc,assignment]
+            _max_connect = 3
+            for _attempt in range(_max_connect):
+                try:
+                    await self._app.initialize()
+                    break
+                except (NetworkError, TimedOut, OSError) as init_err:
+                    if _attempt < _max_connect - 1:
+                        wait = 2 ** _attempt
+                        logger.warning(
+                            "[%s] Connect attempt %d/%d failed: %s — retrying in %ds",
+                            self.name, _attempt + 1, _max_connect, init_err, wait,
+                        )
+                        await asyncio.sleep(wait)
+                    else:
+                        raise
            await self._app.start()
            loop = asyncio.get_running_loop()

@ -265,6 +283,8 @@ class TelegramAdapter(BasePlatformAdapter):
                    release_scoped_lock("telegram-bot-token", self._token_lock_identity)
                except Exception:
                    pass
+            message = f"Telegram startup failed: {e}"
+            self._set_fatal_error("telegram_connect_error", message, retryable=True)
            logger.error("[%s] Failed to connect to Telegram: %s", self.name, e, exc_info=True)
            return False
    
@ -334,32 +354,47 @@ class TelegramAdapter(BasePlatformAdapter):
            message_ids = []
            thread_id = metadata.get("thread_id") if metadata else None
            
+            try:
+                from telegram.error import NetworkError as _NetErr
+            except ImportError:
+                _NetErr = OSError  # type: ignore[misc,assignment]
+
            for i, chunk in enumerate(chunks):
-                # Try Markdown first, fall back to plain text if it fails
-                try:
-                    msg = await self._bot.send_message(
-                        chat_id=int(chat_id),
-                        text=chunk,
-                        parse_mode=ParseMode.MARKDOWN_V2,
-                        reply_to_message_id=int(reply_to) if reply_to and i == 0 else None,
-                        message_thread_id=int(thread_id) if thread_id else None,
-                    )
-                except Exception as md_error:
-                    # Markdown parsing failed, try plain text
-                    if "parse" in str(md_error).lower() or "markdown" in str(md_error).lower():
-                        logger.warning("[%s] MarkdownV2 parse failed, falling back to plain text: %s", self.name, md_error)
-                        # Strip MDV2 escape backslashes so the user doesn't
-                        # see raw backslashes littered through the message.
-                        plain_chunk = _strip_mdv2(chunk)
-                        msg = await self._bot.send_message(
-                            chat_id=int(chat_id),
-                            text=plain_chunk,
-                            parse_mode=None,  # Plain text
-                            reply_to_message_id=int(reply_to) if reply_to and i == 0 else None,
-                            message_thread_id=int(thread_id) if thread_id else None,
-                        )
-                    else:
-                        raise  # Re-raise if not a parse error
+                msg = None
+                for _send_attempt in range(3):
+                    try:
+                        # Try Markdown first, fall back to plain text if it fails
+                        try:
+                            msg = await self._bot.send_message(
+                                chat_id=int(chat_id),
+                                text=chunk,
+                                parse_mode=ParseMode.MARKDOWN_V2,
+                                reply_to_message_id=int(reply_to) if reply_to and i == 0 else None,
+                                message_thread_id=int(thread_id) if thread_id else None,
+                            )
+                        except Exception as md_error:
+                            # Markdown parsing failed, try plain text
+                            if "parse" in str(md_error).lower() or "markdown" in str(md_error).lower():
+                                logger.warning("[%s] MarkdownV2 parse failed, falling back to plain text: %s", self.name, md_error)
+                                plain_chunk = _strip_mdv2(chunk)
+                                msg = await self._bot.send_message(
+                                    chat_id=int(chat_id),
+                                    text=plain_chunk,
+                                    parse_mode=None,
+                                    reply_to_message_id=int(reply_to) if reply_to and i == 0 else None,
+                                    message_thread_id=int(thread_id) if thread_id else None,
+                                )
+                            else:
+                                raise
+                        break  # success
+                    except _NetErr as send_err:
+                        if _send_attempt < 2:
+                            wait = 2 ** _send_attempt
+                            logger.warning("[%s] Network error on send (attempt %d/3), retrying in %ds: %s",
+                                           self.name, _send_attempt + 1, wait, send_err)
+                            await asyncio.sleep(wait)
+                        else:
+                            raise
                message_ids.append(str(msg.message_id))
            
            return SendResult(
--- a/gateway/run.py
+++ b/gateway/run.py
@ -157,6 +157,12 @@ if _config_path.exists():
                    "base_url": "AUXILIARY_WEB_EXTRACT_BASE_URL",
                    "api_key": "AUXILIARY_WEB_EXTRACT_API_KEY",
                },
+                "approval": {
+                    "provider": "AUXILIARY_APPROVAL_PROVIDER",
+                    "model": "AUXILIARY_APPROVAL_MODEL",
+                    "base_url": "AUXILIARY_APPROVAL_BASE_URL",
+                    "api_key": "AUXILIARY_APPROVAL_API_KEY",
+                },
            }
            for _task_key, _env_map in _aux_task_env.items():
                _task_cfg = _auxiliary_cfg.get(_task_key, {})
@ -318,6 +324,7 @@ class GatewayRunner:
        self._show_reasoning = self._load_show_reasoning()
        self._provider_routing = self._load_provider_routing()
        self._fallback_model = self._load_fallback_model()
+        self._smart_model_routing = self._load_smart_model_routing()

        # Wire process registry into session store for reset protection
        from tools.process_registry import process_registry
@ -587,6 +594,18 @@ class GatewayRunner:
            group_sessions_per_user=getattr(config, "group_sessions_per_user", True),
        )

+    def _resolve_turn_agent_config(self, user_message: str, model: str, runtime_kwargs: dict) -> dict:
+        from agent.smart_model_routing import resolve_turn_route
+
+        primary = {
+            "model": model,
+            "api_key": runtime_kwargs.get("api_key"),
+            "base_url": runtime_kwargs.get("base_url"),
+            "provider": runtime_kwargs.get("provider"),
+            "api_mode": runtime_kwargs.get("api_mode"),
+        }
+        return resolve_turn_route(user_message, getattr(self, "_smart_model_routing", {}), primary)
+
    async def _handle_adapter_fatal_error(self, adapter: BasePlatformAdapter) -> None:
        """React to a non-retryable adapter failure after startup."""
        logger.error(
@ -789,6 +808,20 @@ class GatewayRunner:
            pass
        return None

+    @staticmethod
+    def _load_smart_model_routing() -> dict:
+        """Load optional smart cheap-vs-strong model routing config."""
+        try:
+            import yaml as _y
+            cfg_path = _hermes_home / "config.yaml"
+            if cfg_path.exists():
+                with open(cfg_path, encoding="utf-8") as _f:
+                    cfg = _y.safe_load(_f) or {}
+                return cfg.get("smart_model_routing", {}) or {}
+        except Exception:
+            pass
+        return {}
+
    async def start(self) -> bool:
        """
        Start the gateway and all configured platform adapters.
@ -831,12 +864,15 @@ class GatewayRunner:
            logger.warning("Process checkpoint recovery: %s", e)
        
        connected_count = 0
+        enabled_platform_count = 0
        startup_nonretryable_errors: list[str] = []
+        startup_retryable_errors: list[str] = []
        
        # Initialize and connect each configured platform
        for platform, platform_config in self.config.platforms.items():
            if not platform_config.enabled:
                continue
+            enabled_platform_count += 1
            
            adapter = self._create_adapter(platform, platform_config)
            if not adapter:
@ -858,12 +894,22 @@ class GatewayRunner:
                    logger.info("✓ %s connected", platform.value)
                else:
                    logger.warning("✗ %s failed to connect", platform.value)
-                    if adapter.has_fatal_error and not adapter.fatal_error_retryable:
-                        startup_nonretryable_errors.append(
+                    if adapter.has_fatal_error:
+                        target = (
+                            startup_retryable_errors
+                            if adapter.fatal_error_retryable
+                            else startup_nonretryable_errors
+                        )
+                        target.append(
                            f"{platform.value}: {adapter.fatal_error_message}"
                        )
+                    else:
+                        startup_retryable_errors.append(
+                            f"{platform.value}: failed to connect"
+                        )
            except Exception as e:
                logger.error("✗ %s error: %s", platform.value, e)
+                startup_retryable_errors.append(f"{platform.value}: {e}")
        
        if connected_count == 0:
            if startup_nonretryable_errors:
@ -876,7 +922,16 @@ class GatewayRunner:
                    pass
                self._request_clean_exit(reason)
                return True
-            logger.warning("No messaging platforms connected.")
+            if enabled_platform_count > 0:
+                reason = "; ".join(startup_retryable_errors) or "all configured messaging platforms failed to connect"
+                logger.error("Gateway failed to connect any configured messaging platform: %s", reason)
+                try:
+                    from gateway.status import write_runtime_status
+                    write_runtime_status(gateway_state="startup_failed", exit_reason=reason)
+                except Exception:
+                    pass
+                return False
+            logger.warning("No messaging platforms enabled.")
            logger.info("Gateway will continue running for cron job execution.")
        
        # Update delivery router with adapters
@ -1430,8 +1485,17 @@ class GatewayRunner:
        # Set environment variables for tools
        self._set_session_env(context)
        
+        # Read privacy.redact_pii from config (re-read per message)
+        _redact_pii = False
+        try:
+            with open(_config_path, encoding="utf-8") as _pf:
+                _pcfg = yaml.safe_load(_pf) or {}
+            _redact_pii = bool((_pcfg.get("privacy") or {}).get("redact_pii", False))
+        except Exception:
+            pass
+
        # Build the context prompt to inject
-        context_prompt = build_session_context_prompt(context)
+        context_prompt = build_session_context_prompt(context, redact_pii=_redact_pii)
        
        # If the previous session expired and was auto-reset, prepend a notice
        # so the agent knows this is a fresh conversation (not an intentional /reset).
@ -2899,11 +2963,12 @@ class GatewayRunner:
            max_iterations = int(os.getenv("HERMES_MAX_ITERATIONS", "90"))
            reasoning_config = self._load_reasoning_config()
            self._reasoning_config = reasoning_config
+            turn_route = self._resolve_turn_agent_config(prompt, model, runtime_kwargs)

            def run_sync():
                agent = AIAgent(
-                    model=model,
-                    **runtime_kwargs,
+                    model=turn_route["model"],
+                    **turn_route["runtime"],
                    max_iterations=max_iterations,
                    quiet_mode=True,
                    verbose_logging=False,
@ -3625,13 +3690,9 @@ class GatewayRunner:
          1. Immediately understand what the user sent (no extra tool call).
          2. Re-examine the image with vision_analyze if it needs more detail.

-        Athabasca persistence should happen through Athabasca's own POST
-        /api/uploads flow, using the returned asset.publicUrl rather than local
-        cache paths.
-
        Args:
-            user_text:      The user's original caption / message text.
-            image_paths:    List of local file paths to cached images.
+            user_text:   The user's original caption / message text.
+            image_paths: List of local file paths to cached images.

        Returns:
            The enriched message string with vision descriptions prepended.
@ -3656,16 +3717,10 @@ class GatewayRunner:
                result = _json.loads(result_json)
                if result.get("success"):
                    description = result.get("analysis", "")
-                    athabasca_note = (
-                        "\n[If this image needs to persist in Athabasca state, upload the cached file "
-                        "through Athabasca POST /api/uploads and use the returned asset.publicUrl. "
-                        "Do not store the local cache path as the canonical imageUrl.]"
-                    )
                    enriched_parts.append(
                        f"[The user sent an image~ Here's what I can see:\n{description}]\n"
                        f"[If you need a closer look, use vision_analyze with "
                        f"image_url: {path} ~]"
-                        f"{athabasca_note}"
                    )
                else:
                    enriched_parts.append(
@ -4177,9 +4232,10 @@ class GatewayRunner:
                except Exception as _sc_err:
                    logger.debug("Could not set up stream consumer: %s", _sc_err)

+            turn_route = self._resolve_turn_agent_config(message, model, runtime_kwargs)
            agent = AIAgent(
-                model=model,
-                **runtime_kwargs,
+                model=turn_route["model"],
+                **turn_route["runtime"],
                max_iterations=max_iterations,
                quiet_mode=True,
                verbose_logging=False,
--- a/gateway/session.py
+++ b/gateway/session.py
@ -8,9 +8,11 @@ Handles:
 - Dynamic system prompt injection (agent knows its context)
 """

+import hashlib
 import logging
 import os
 import json
+import re
 import uuid
 from pathlib import Path
 from datetime import datetime, timedelta
@ -19,6 +21,41 @@ from typing import Dict, List, Optional, Any

 logger = logging.getLogger(__name__)

+
+# ---------------------------------------------------------------------------
+# PII redaction helpers
+# ---------------------------------------------------------------------------
+
+_PHONE_RE = re.compile(r"^\+?\d[\d\-\s]{6,}$")
+
+
+def _hash_id(value: str) -> str:
+    """Deterministic 12-char hex hash of an identifier."""
+    return hashlib.sha256(value.encode("utf-8")).hexdigest()[:12]
+
+
+def _hash_sender_id(value: str) -> str:
+    """Hash a sender ID to ``user_<12hex>``."""
+    return f"user_{_hash_id(value)}"
+
+
+def _hash_chat_id(value: str) -> str:
+    """Hash the numeric portion of a chat ID, preserving platform prefix.
+
+    ``telegram:12345`` → ``telegram:<hash>``
+    ``12345``          → ``<hash>``
+    """
+    colon = value.find(":")
+    if colon > 0:
+        prefix = value[:colon]
+        return f"{prefix}:{_hash_id(value[colon + 1:])}"
+    return _hash_id(value)
+
+
+def _looks_like_phone(value: str) -> bool:
+    """Return True if *value* looks like a phone number (E.164 or similar)."""
+    return bool(_PHONE_RE.match(value.strip()))
+
 from .config import (
    Platform,
    GatewayConfig,
@ -146,7 +183,21 @@ class SessionContext:
        }


-def build_session_context_prompt(context: SessionContext) -> str:
+_PII_SAFE_PLATFORMS = frozenset({
+    Platform.WHATSAPP,
+    Platform.SIGNAL,
+    Platform.TELEGRAM,
+})
+"""Platforms where user IDs can be safely redacted (no in-message mention system
+that requires raw IDs).  Discord is excluded because mentions use ``<@user_id>``
+and the LLM needs the real ID to tag users."""
+
+
+def build_session_context_prompt(
+    context: SessionContext,
+    *,
+    redact_pii: bool = False,
+) -> str:
    """
    Build the dynamic system prompt section that tells the agent about its context.
    
@ -154,7 +205,15 @@ def build_session_context_prompt(context: SessionContext) -> str:
    - Where messages are coming from
    - What platforms are connected
    - Where it can deliver scheduled task outputs
+
+    When *redact_pii* is True **and** the source platform is in
+    ``_PII_SAFE_PLATFORMS``, phone numbers are stripped and user/chat IDs
+    are replaced with deterministic hashes before being sent to the LLM.
+    Platforms like Discord are excluded because mentions need real IDs.
+    Routing still uses the original values (they stay in SessionSource).
    """
+    # Only apply redaction on platforms where IDs aren't needed for mentions
+    redact_pii = redact_pii and context.source.platform in _PII_SAFE_PLATFORMS
    lines = [
        "## Current Session Context",
        "",
@ -165,7 +224,25 @@ def build_session_context_prompt(context: SessionContext) -> str:
    if context.source.platform == Platform.LOCAL:
        lines.append(f"**Source:** {platform_name} (the machine running this agent)")
    else:
-        lines.append(f"**Source:** {platform_name} ({context.source.description})")
+        # Build a description that respects PII redaction
+        src = context.source
+        if redact_pii:
+            # Build a safe description without raw IDs
+            _uname = src.user_name or (
+                _hash_sender_id(src.user_id) if src.user_id else "user"
+            )
+            _cname = src.chat_name or _hash_chat_id(src.chat_id)
+            if src.chat_type == "dm":
+                desc = f"DM with {_uname}"
+            elif src.chat_type == "group":
+                desc = f"group: {_cname}"
+            elif src.chat_type == "channel":
+                desc = f"channel: {_cname}"
+            else:
+                desc = _cname
+        else:
+            desc = src.description
+        lines.append(f"**Source:** {platform_name} ({desc})")
    
    # Channel topic (if available - provides context about the channel's purpose)
    if context.source.chat_topic:
@ -175,7 +252,10 @@ def build_session_context_prompt(context: SessionContext) -> str:
    if context.source.user_name:
        lines.append(f"**User:** {context.source.user_name}")
    elif context.source.user_id:
-        lines.append(f"**User ID:** {context.source.user_id}")
+        uid = context.source.user_id
+        if redact_pii:
+            uid = _hash_sender_id(uid)
+        lines.append(f"**User ID:** {uid}")
    
    # Platform-specific behavioral notes
    if context.source.platform == Platform.SLACK:
@ -210,7 +290,8 @@ def build_session_context_prompt(context: SessionContext) -> str:
        lines.append("")
        lines.append("**Home Channels (default destinations):**")
        for platform, home in context.home_channels.items():
-            lines.append(f"  - {platform.value}: {home.name} (ID: {home.chat_id})")
+            hc_id = _hash_chat_id(home.chat_id) if redact_pii else home.chat_id
+            lines.append(f"  - {platform.value}: {home.name} (ID: {hc_id})")
    
    # Delivery options for scheduled tasks
    lines.append("")
@ -220,7 +301,10 @@ def build_session_context_prompt(context: SessionContext) -> str:
    if context.source.platform == Platform.LOCAL:
        lines.append("- `\"origin\"` → Local output (saved to files)")
    else:
-        lines.append(f"- `\"origin\"` → Back to this chat ({context.source.chat_name or context.source.chat_id})")
+        _origin_label = context.source.chat_name or (
+            _hash_chat_id(context.source.chat_id) if redact_pii else context.source.chat_id
+        )
+        lines.append(f"- `\"origin\"` → Back to this chat ({_origin_label})")
    
    # Local always available
    lines.append("- `\"local\"` → Save to local files only (~/.hermes/cron/output/)")