Add background process management with process tool, wait, PTY, and stdin support

New process registry and tool for managing long-running background processes
across all terminal backends (local, Docker, Singularity, Modal, SSH).

Process Registry (tools/process_registry.py):
- ProcessSession tracking with rolling 200KB output buffer
- spawn_local() with optional PTY via ptyprocess for interactive CLIs
- spawn_via_env() for non-local backends (runs inside sandbox, never on host)
- Background reader threads per process (Popen stdout or PTY)
- wait() with timeout clamping, interrupt support, and transparent limit reporting
- JSON checkpoint to ~/.hermes/processes.json for gateway crash recovery
- Module-level singleton shared across agent loop, gateway, and RL

Process Tool (model_tools.py):
- 7 actions: list, poll, log, wait, kill, write, submit
- Paired with terminal in all toolsets (CLI, messaging, RL)
- Timeout clamping with transparent notes in response

Terminal Tool Updates (tools/terminal_tool.py):
- Replaced nohup background mode with registry spawn (returns session_id)
- Added workdir parameter for per-command working directory
- Added check_interval parameter for gateway auto-check watchers
- Added pty parameter for interactive CLI tools (Codex, Claude Code)
- Updated TERMINAL_TOOL_DESCRIPTION with full background workflow docs
- Cleanup thread now respects active background processes (won't reap sandbox)

Gateway Integration (gateway/run.py, session.py, config.py):
- Session reset protection: sessions with active processes exempt from reset
- Default idle timeout increased from 2 hours to 24 hours
- from_dict fallback aligned to match (was 120, now 1440)
- session_key env var propagated to process registry for session mapping
- Crash recovery on gateway startup via checkpoint probe
- check_interval watcher: asyncio task polls process, delivers updates to platform

RL Safety (environments/):
- tool_context.py cleanup() kills background processes on episode end
- hermes_base_env.py warns when enabled_toolsets is None (loads all tools)
- Process tool safe in RL via wait() blocking the agent loop

Also:
- Added ptyprocess as optional dependency (in pyproject.toml [pty] extra + [all])
- Fixed pre-existing bug: rl_test_inference missing from TOOL_TO_TOOLSET_MAP
- Updated AGENTS.md with process management docs and project structure
- Updated README.md terminal section with process management overview
This commit is contained in:
teknium1 2026-02-17 02:51:31 -08:00
parent 48b5cfd085
commit 061fa70907
12 changed files with 1142 additions and 40 deletions

View file

@ -65,7 +65,7 @@ class SessionResetPolicy:
"""
mode: str = "both" # "daily", "idle", or "both"
at_hour: int = 4 # Hour for daily reset (0-23, local time)
idle_minutes: int = 120 # Minutes of inactivity before reset
idle_minutes: int = 1440 # Minutes of inactivity before reset (24 hours)
def to_dict(self) -> Dict[str, Any]:
return {
@ -79,7 +79,7 @@ class SessionResetPolicy:
return cls(
mode=data.get("mode", "both"),
at_hour=data.get("at_hour", 4),
idle_minutes=data.get("idle_minutes", 120),
idle_minutes=data.get("idle_minutes", 1440),
)

View file

@ -72,7 +72,13 @@ class GatewayRunner:
def __init__(self, config: Optional[GatewayConfig] = None):
self.config = config or load_gateway_config()
self.adapters: Dict[Platform, BasePlatformAdapter] = {}
self.session_store = SessionStore(self.config.sessions_dir, self.config)
# Wire process registry into session store for reset protection
from tools.process_registry import process_registry
self.session_store = SessionStore(
self.config.sessions_dir, self.config,
has_active_processes_fn=lambda key: process_registry.has_active_for_session(key),
)
self.delivery_router = DeliveryRouter(self.config)
self._running = False
self._shutdown_event = asyncio.Event()
@ -106,6 +112,15 @@ class GatewayRunner:
# Discover and load event hooks
self.hooks.discover_and_load()
# Recover background processes from checkpoint (crash recovery)
try:
from tools.process_registry import process_registry
recovered = process_registry.recover_from_checkpoint()
if recovered:
print(f"[gateway] Recovered {recovered} background process(es) from previous run")
except Exception as e:
print(f"[gateway] Process checkpoint recovery: {e}")
connected_count = 0
# Initialize and connect each configured platform
@ -429,6 +444,15 @@ class GatewayRunner:
"response": (response or "")[:500],
})
# Check for pending process watchers (check_interval on background processes)
try:
from tools.process_registry import process_registry
while process_registry.pending_watchers:
watcher = process_registry.pending_watchers.pop(0)
asyncio.create_task(self._run_process_watcher(watcher))
except Exception as e:
print(f"[gateway] Process watcher setup error: {e}", flush=True)
# Check if the agent encountered a dangerous command needing approval
# The terminal tool stores the last pending approval globally
try:
@ -701,6 +725,75 @@ class GatewayRunner:
return prefix
return user_text
async def _run_process_watcher(self, watcher: dict) -> None:
    """
    Background asyncio task: poll one registered process every
    ``check_interval`` seconds and push status updates to the chat
    that launched it.

    Stays silent while the output buffer has not grown. Sends a final
    message (exit code + tail of output) when the process exits, then
    ends. Also ends quietly if the process disappears from the registry
    (e.g. it was killed and reaped).
    """
    from tools.process_registry import process_registry

    session_id = watcher["session_id"]
    interval = watcher["check_interval"]
    # NOTE(review): session_key is read but never used below — presumably
    # reserved for session mapping; confirm before removing.
    session_key = watcher.get("session_key", "")
    platform_name = watcher.get("platform", "")
    chat_id = watcher.get("chat_id", "")

    def _find_adapter():
        # Map the originating platform name back to a live adapter, if any.
        for plat, adp in self.adapters.items():
            if plat.value == platform_name:
                return adp
        return None

    async def _deliver(text: str) -> None:
        # Best-effort delivery: a failed send is logged but never kills
        # the watcher loop itself.
        adapter = _find_adapter()
        if adapter and chat_id:
            try:
                await adapter.send(chat_id, text)
            except Exception as e:
                print(f"[gateway] Watcher delivery error: {e}", flush=True)

    print(f"[gateway] Process watcher started: {session_id} (every {interval}s)", flush=True)

    seen_len = 0  # how much of the output buffer we have already reported
    while True:
        await asyncio.sleep(interval)

        session = process_registry.get(session_id)
        if session is None:
            # Process vanished from the registry (killed/reaped) — stop.
            break

        buf_len = len(session.output_buffer)
        grew = buf_len > seen_len
        seen_len = buf_len

        if session.exited:
            # Process finished -- deliver final update with the output tail.
            tail = session.output_buffer[-1000:] if session.output_buffer else ""
            await _deliver(
                f"[Background process {session_id} finished with exit code {session.exit_code}~ "
                f"Here's the final output:\n{tail}]"
            )
            break
        elif grew:
            # New output available -- deliver an incremental status update.
            tail = session.output_buffer[-500:] if session.output_buffer else ""
            await _deliver(
                f"[Background process {session_id} is still running~ "
                f"New output:\n{tail}]"
            )

    print(f"[gateway] Process watcher ended: {session_id}", flush=True)
async def _run_agent(
self,
message: str,
@ -824,6 +917,10 @@ class GatewayRunner:
tools_holder = [None] # Mutable container for the tool definitions
def run_sync():
# Pass session_key to process registry via env var so background
# processes can be mapped back to this gateway session
os.environ["HERMES_SESSION_KEY"] = session_key or ""
# Read from env var or use default (same as CLI)
max_iterations = int(os.getenv("HERMES_MAX_ITERATIONS", "60"))

View file

@ -270,11 +270,15 @@ class SessionStore:
- {session_id}.jsonl: Conversation transcripts
"""
def __init__(self, sessions_dir: Path, config: GatewayConfig):
def __init__(self, sessions_dir: Path, config: GatewayConfig,
has_active_processes_fn=None):
self.sessions_dir = sessions_dir
self.config = config
self._entries: Dict[str, SessionEntry] = {}
self._loaded = False
# Optional callback to check if a session has active background processes.
# When set, sessions with running processes are exempt from reset.
self._has_active_processes_fn = has_active_processes_fn
def _ensure_loaded(self) -> None:
"""Load sessions from disk if not already loaded."""
@ -320,7 +324,14 @@ class SessionStore:
Check if a session should be reset based on policy.
Returns True if the session is stale and should start fresh.
Sessions with active background processes are never reset.
"""
# Don't reset sessions that have active background processes
if self._has_active_processes_fn:
session_key = self._generate_session_key(source)
if self._has_active_processes_fn(session_key):
return False
policy = self.config.get_reset_policy(
platform=source.platform,
session_type=source.chat_type