Add background process management with process tool, wait, PTY, and stdin support

New process registry and tool for managing long-running background processes
across all terminal backends (local, Docker, Singularity, Modal, SSH).

Process Registry (tools/process_registry.py):
- ProcessSession tracking with rolling 200KB output buffer
- spawn_local() with optional PTY via ptyprocess for interactive CLIs
- spawn_via_env() for non-local backends (runs inside sandbox, never on host)
- Background reader threads per process (Popen stdout or PTY)
- wait() with timeout clamping, interrupt support, and transparent limit reporting
- JSON checkpoint to ~/.hermes/processes.json for gateway crash recovery
- Module-level singleton shared across agent loop, gateway, and RL

Process Tool (model_tools.py):
- 7 actions: list, poll, log, wait, kill, write, submit
- Paired with terminal in all toolsets (CLI, messaging, RL)
- Timeout clamping with transparent notes in response

Terminal Tool Updates (tools/terminal_tool.py):
- Replaced nohup background mode with registry spawn (returns session_id)
- Added workdir parameter for per-command working directory
- Added check_interval parameter for gateway auto-check watchers
- Added pty parameter for interactive CLI tools (Codex, Claude Code)
- Updated TERMINAL_TOOL_DESCRIPTION with full background workflow docs
- Cleanup thread now respects active background processes (won't reap sandbox)

Gateway Integration (gateway/run.py, session.py, config.py):
- Session reset protection: sessions with active processes exempt from reset
- Default idle timeout increased from 2 hours to 24 hours
- from_dict fallback aligned to match (was 120, now 1440)
- session_key env var propagated to process registry for session mapping
- Crash recovery on gateway startup via checkpoint probe
- check_interval watcher: asyncio task polls process, delivers updates to platform

RL Safety (environments/):
- tool_context.py cleanup() kills background processes on episode end
- hermes_base_env.py warns when enabled_toolsets is None (loads all tools)
- Process tool safe in RL via wait() blocking the agent loop

Also:
- Added ptyprocess as optional dependency (in pyproject.toml [pty] extra + [all])
- Fixed pre-existing bug: rl_test_inference missing from TOOL_TO_TOOLSET_MAP
- Updated AGENTS.md with process management docs and project structure
- Updated README.md terminal section with process management overview
This commit is contained in:
teknium1 2026-02-17 02:51:31 -08:00
parent 48b5cfd085
commit 061fa70907
12 changed files with 1142 additions and 40 deletions

View file

@ -65,7 +65,7 @@ class SessionResetPolicy:
"""
mode: str = "both" # "daily", "idle", or "both"
at_hour: int = 4 # Hour for daily reset (0-23, local time)
idle_minutes: int = 120 # Minutes of inactivity before reset
idle_minutes: int = 1440 # Minutes of inactivity before reset (24 hours)
def to_dict(self) -> Dict[str, Any]:
return {
@ -79,7 +79,7 @@ class SessionResetPolicy:
return cls(
mode=data.get("mode", "both"),
at_hour=data.get("at_hour", 4),
idle_minutes=data.get("idle_minutes", 120),
idle_minutes=data.get("idle_minutes", 1440),
)

View file

@ -72,7 +72,13 @@ class GatewayRunner:
def __init__(self, config: Optional[GatewayConfig] = None):
self.config = config or load_gateway_config()
self.adapters: Dict[Platform, BasePlatformAdapter] = {}
self.session_store = SessionStore(self.config.sessions_dir, self.config)
# Wire process registry into session store for reset protection
from tools.process_registry import process_registry
self.session_store = SessionStore(
self.config.sessions_dir, self.config,
has_active_processes_fn=lambda key: process_registry.has_active_for_session(key),
)
self.delivery_router = DeliveryRouter(self.config)
self._running = False
self._shutdown_event = asyncio.Event()
@ -106,6 +112,15 @@ class GatewayRunner:
# Discover and load event hooks
self.hooks.discover_and_load()
# Recover background processes from checkpoint (crash recovery)
try:
from tools.process_registry import process_registry
recovered = process_registry.recover_from_checkpoint()
if recovered:
print(f"[gateway] Recovered {recovered} background process(es) from previous run")
except Exception as e:
print(f"[gateway] Process checkpoint recovery: {e}")
connected_count = 0
# Initialize and connect each configured platform
@ -429,6 +444,15 @@ class GatewayRunner:
"response": (response or "")[:500],
})
# Check for pending process watchers (check_interval on background processes)
try:
from tools.process_registry import process_registry
while process_registry.pending_watchers:
watcher = process_registry.pending_watchers.pop(0)
asyncio.create_task(self._run_process_watcher(watcher))
except Exception as e:
print(f"[gateway] Process watcher setup error: {e}", flush=True)
# Check if the agent encountered a dangerous command needing approval
# The terminal tool stores the last pending approval globally
try:
@ -701,6 +725,75 @@ class GatewayRunner:
return prefix
return user_text
async def _run_process_watcher(self, watcher: dict) -> None:
    """
    Background asyncio task: poll one registered process every
    ``check_interval`` seconds and push status updates to the chat
    that launched it.

    Stays silent while the output buffer has not grown. Sends a final
    message (exit code + tail of output) when the process exits, then
    ends. Also ends quietly if the process disappears from the registry
    (e.g. it was killed and reaped).
    """
    from tools.process_registry import process_registry

    session_id = watcher["session_id"]
    interval = watcher["check_interval"]
    # NOTE(review): session_key is read but never used below — presumably
    # reserved for session mapping; confirm before removing.
    session_key = watcher.get("session_key", "")
    platform_name = watcher.get("platform", "")
    chat_id = watcher.get("chat_id", "")

    def _find_adapter():
        # Map the originating platform name back to a live adapter, if any.
        for plat, adp in self.adapters.items():
            if plat.value == platform_name:
                return adp
        return None

    async def _deliver(text: str) -> None:
        # Best-effort delivery: a failed send is logged but never kills
        # the watcher loop itself.
        adapter = _find_adapter()
        if adapter and chat_id:
            try:
                await adapter.send(chat_id, text)
            except Exception as e:
                print(f"[gateway] Watcher delivery error: {e}", flush=True)

    print(f"[gateway] Process watcher started: {session_id} (every {interval}s)", flush=True)

    seen_len = 0  # how much of the output buffer we have already reported
    while True:
        await asyncio.sleep(interval)

        session = process_registry.get(session_id)
        if session is None:
            # Process vanished from the registry (killed/reaped) — stop.
            break

        buf_len = len(session.output_buffer)
        grew = buf_len > seen_len
        seen_len = buf_len

        if session.exited:
            # Process finished -- deliver final update with the output tail.
            tail = session.output_buffer[-1000:] if session.output_buffer else ""
            await _deliver(
                f"[Background process {session_id} finished with exit code {session.exit_code}~ "
                f"Here's the final output:\n{tail}]"
            )
            break
        elif grew:
            # New output available -- deliver an incremental status update.
            tail = session.output_buffer[-500:] if session.output_buffer else ""
            await _deliver(
                f"[Background process {session_id} is still running~ "
                f"New output:\n{tail}]"
            )

    print(f"[gateway] Process watcher ended: {session_id}", flush=True)
async def _run_agent(
self,
message: str,
@ -824,6 +917,10 @@ class GatewayRunner:
tools_holder = [None] # Mutable container for the tool definitions
def run_sync():
# Pass session_key to process registry via env var so background
# processes can be mapped back to this gateway session
os.environ["HERMES_SESSION_KEY"] = session_key or ""
# Read from env var or use default (same as CLI)
max_iterations = int(os.getenv("HERMES_MAX_ITERATIONS", "60"))

View file

@ -270,11 +270,15 @@ class SessionStore:
- {session_id}.jsonl: Conversation transcripts
"""
def __init__(self, sessions_dir: Path, config: GatewayConfig):
def __init__(self, sessions_dir: Path, config: GatewayConfig,
has_active_processes_fn=None):
self.sessions_dir = sessions_dir
self.config = config
self._entries: Dict[str, SessionEntry] = {}
self._loaded = False
# Optional callback to check if a session has active background processes.
# When set, sessions with running processes are exempt from reset.
self._has_active_processes_fn = has_active_processes_fn
def _ensure_loaded(self) -> None:
"""Load sessions from disk if not already loaded."""
@ -320,7 +324,14 @@ class SessionStore:
Check if a session should be reset based on policy.
Returns True if the session is stale and should start fresh.
Sessions with active background processes are never reset.
"""
# Don't reset sessions that have active background processes
if self._has_active_processes_fn:
session_key = self._generate_session_key(source)
if self._has_active_processes_fn(session_key):
return False
policy = self.config.get_reset_policy(
platform=source.platform,
session_type=source.chat_type