Add background process management with process tool, wait, PTY, and stdin support
New process registry and tool for managing long-running background processes across all terminal backends (local, Docker, Singularity, Modal, SSH).

Process Registry (tools/process_registry.py):
- ProcessSession tracking with rolling 200KB output buffer
- spawn_local() with optional PTY via ptyprocess for interactive CLIs
- spawn_via_env() for non-local backends (runs inside sandbox, never on host)
- Background reader threads per process (Popen stdout or PTY)
- wait() with timeout clamping, interrupt support, and transparent limit reporting
- JSON checkpoint to ~/.hermes/processes.json for gateway crash recovery
- Module-level singleton shared across agent loop, gateway, and RL

Process Tool (model_tools.py):
- 7 actions: list, poll, log, wait, kill, write, submit
- Paired with terminal in all toolsets (CLI, messaging, RL)
- Timeout clamping with transparent notes in response

Terminal Tool Updates (tools/terminal_tool.py):
- Replaced nohup background mode with registry spawn (returns session_id)
- Added workdir parameter for per-command working directory
- Added check_interval parameter for gateway auto-check watchers
- Added pty parameter for interactive CLI tools (Codex, Claude Code)
- Updated TERMINAL_TOOL_DESCRIPTION with full background workflow docs
- Cleanup thread now respects active background processes (won't reap sandbox)

Gateway Integration (gateway/run.py, session.py, config.py):
- Session reset protection: sessions with active processes exempt from reset
- Default idle timeout increased from 2 hours to 24 hours
- from_dict fallback aligned to match (was 120, now 1440)
- session_key env var propagated to process registry for session mapping
- Crash recovery on gateway startup via checkpoint probe
- check_interval watcher: asyncio task polls process, delivers updates to platform

RL Safety (environments/):
- tool_context.py cleanup() kills background processes on episode end
- hermes_base_env.py warns when enabled_toolsets is None (loads all tools)
- Process tool safe in RL via wait() blocking the agent loop

Also:
- Added ptyprocess as optional dependency (in pyproject.toml [pty] extra + [all])
- Fixed pre-existing bug: rl_test_inference missing from TOOL_TO_TOOLSET_MAP
- Updated AGENTS.md with process management docs and project structure
- Updated README.md terminal section with process management overview
This commit is contained in:
parent
48b5cfd085
commit
061fa70907
12 changed files with 1142 additions and 40 deletions
|
|
@ -65,7 +65,7 @@ class SessionResetPolicy:
|
|||
"""
|
||||
mode: str = "both" # "daily", "idle", or "both"
|
||||
at_hour: int = 4 # Hour for daily reset (0-23, local time)
|
||||
idle_minutes: int = 120 # Minutes of inactivity before reset
|
||||
idle_minutes: int = 1440 # Minutes of inactivity before reset (24 hours)
|
||||
|
||||
def to_dict(self) -> Dict[str, Any]:
|
||||
return {
|
||||
|
|
@ -79,7 +79,7 @@ class SessionResetPolicy:
|
|||
return cls(
|
||||
mode=data.get("mode", "both"),
|
||||
at_hour=data.get("at_hour", 4),
|
||||
idle_minutes=data.get("idle_minutes", 120),
|
||||
idle_minutes=data.get("idle_minutes", 1440),
|
||||
)
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -72,7 +72,13 @@ class GatewayRunner:
|
|||
def __init__(self, config: Optional[GatewayConfig] = None):
|
||||
self.config = config or load_gateway_config()
|
||||
self.adapters: Dict[Platform, BasePlatformAdapter] = {}
|
||||
self.session_store = SessionStore(self.config.sessions_dir, self.config)
|
||||
|
||||
# Wire process registry into session store for reset protection
|
||||
from tools.process_registry import process_registry
|
||||
self.session_store = SessionStore(
|
||||
self.config.sessions_dir, self.config,
|
||||
has_active_processes_fn=lambda key: process_registry.has_active_for_session(key),
|
||||
)
|
||||
self.delivery_router = DeliveryRouter(self.config)
|
||||
self._running = False
|
||||
self._shutdown_event = asyncio.Event()
|
||||
|
|
@ -106,6 +112,15 @@ class GatewayRunner:
|
|||
# Discover and load event hooks
|
||||
self.hooks.discover_and_load()
|
||||
|
||||
# Recover background processes from checkpoint (crash recovery)
|
||||
try:
|
||||
from tools.process_registry import process_registry
|
||||
recovered = process_registry.recover_from_checkpoint()
|
||||
if recovered:
|
||||
print(f"[gateway] Recovered {recovered} background process(es) from previous run")
|
||||
except Exception as e:
|
||||
print(f"[gateway] Process checkpoint recovery: {e}")
|
||||
|
||||
connected_count = 0
|
||||
|
||||
# Initialize and connect each configured platform
|
||||
|
|
@ -429,6 +444,15 @@ class GatewayRunner:
|
|||
"response": (response or "")[:500],
|
||||
})
|
||||
|
||||
# Check for pending process watchers (check_interval on background processes)
|
||||
try:
|
||||
from tools.process_registry import process_registry
|
||||
while process_registry.pending_watchers:
|
||||
watcher = process_registry.pending_watchers.pop(0)
|
||||
asyncio.create_task(self._run_process_watcher(watcher))
|
||||
except Exception as e:
|
||||
print(f"[gateway] Process watcher setup error: {e}", flush=True)
|
||||
|
||||
# Check if the agent encountered a dangerous command needing approval
|
||||
# The terminal tool stores the last pending approval globally
|
||||
try:
|
||||
|
|
@ -701,6 +725,75 @@ class GatewayRunner:
|
|||
return prefix
|
||||
return user_text
|
||||
|
||||
async def _run_process_watcher(self, watcher: dict) -> None:
    """
    Periodically check a background process and push updates to the user.

    Runs as an asyncio task. Stays silent when nothing changed.
    Auto-removes when the process exits or is killed.

    Args:
        watcher: Watcher descriptor. ``"session_id"`` and
            ``"check_interval"`` (seconds slept between polls) are
            required; ``"platform"`` and ``"chat_id"`` are optional and
            select where updates are delivered. When no matching adapter
            or no chat id is available, updates are silently skipped.
    """
    from tools.process_registry import process_registry

    session_id = watcher["session_id"]
    interval = watcher["check_interval"]
    platform_name = watcher.get("platform", "")
    chat_id = watcher.get("chat_id", "")

    async def _deliver(message_text: str) -> None:
        """Send *message_text* to the originating platform, logging failures."""
        # Resolve the adapter on every delivery so adapters connected or
        # dropped while the watcher is running are honoured.
        adapter = next(
            (a for p, a in self.adapters.items() if p.value == platform_name),
            None,
        )
        if not (adapter and chat_id):
            return
        try:
            await adapter.send(chat_id, message_text)
        except Exception as e:
            # Best effort: a failed delivery must not kill the watcher.
            print(f"[gateway] Watcher delivery error: {e}", flush=True)

    print(f"[gateway] Process watcher started: {session_id} (every {interval}s)", flush=True)

    last_output_len = 0
    last_tail = None  # Tail seen on the previous poll; None until the first poll.
    while True:
        await asyncio.sleep(interval)

        session = process_registry.get(session_id)
        if session is None:
            # Session was removed from the registry (e.g. killed and pruned).
            break

        # Read the buffer once so the change check and the delivered text
        # are taken from the same snapshot.
        buffer = session.output_buffer or ""
        current_output_len = len(buffer)
        tail = buffer[-500:]

        # Length growth alone misses output once a size-capped (rolling)
        # buffer is full, because its length stops increasing. Comparing the
        # tail as well catches new output that replaced old content.
        has_new_output = current_output_len > last_output_len or (
            last_tail is not None and tail != last_tail
        )
        last_output_len = current_output_len
        last_tail = tail

        if session.exited:
            # Process finished -- deliver final update
            new_output = buffer[-1000:] if buffer else ""
            await _deliver(
                f"[Background process {session_id} finished with exit code {session.exit_code}~ "
                f"Here's the final output:\n{new_output}]"
            )
            break

        if has_new_output:
            # New output available -- deliver status update
            new_output = tail if buffer else ""
            await _deliver(
                f"[Background process {session_id} is still running~ "
                f"New output:\n{new_output}]"
            )

    print(f"[gateway] Process watcher ended: {session_id}", flush=True)
|
||||
|
||||
async def _run_agent(
|
||||
self,
|
||||
message: str,
|
||||
|
|
@ -824,6 +917,10 @@ class GatewayRunner:
|
|||
tools_holder = [None] # Mutable container for the tool definitions
|
||||
|
||||
def run_sync():
|
||||
# Pass session_key to process registry via env var so background
|
||||
# processes can be mapped back to this gateway session
|
||||
os.environ["HERMES_SESSION_KEY"] = session_key or ""
|
||||
|
||||
# Read from env var or use default (same as CLI)
|
||||
max_iterations = int(os.getenv("HERMES_MAX_ITERATIONS", "60"))
|
||||
|
||||
|
|
|
|||
|
|
@ -270,11 +270,15 @@ class SessionStore:
|
|||
- {session_id}.jsonl: Conversation transcripts
|
||||
"""
|
||||
|
||||
def __init__(self, sessions_dir: Path, config: GatewayConfig):
|
||||
def __init__(self, sessions_dir: Path, config: GatewayConfig,
|
||||
has_active_processes_fn=None):
|
||||
self.sessions_dir = sessions_dir
|
||||
self.config = config
|
||||
self._entries: Dict[str, SessionEntry] = {}
|
||||
self._loaded = False
|
||||
# Optional callback to check if a session has active background processes.
|
||||
# When set, sessions with running processes are exempt from reset.
|
||||
self._has_active_processes_fn = has_active_processes_fn
|
||||
|
||||
def _ensure_loaded(self) -> None:
|
||||
"""Load sessions from disk if not already loaded."""
|
||||
|
|
@ -320,7 +324,14 @@ class SessionStore:
|
|||
Check if a session should be reset based on policy.
|
||||
|
||||
Returns True if the session is stale and should start fresh.
|
||||
Sessions with active background processes are never reset.
|
||||
"""
|
||||
# Don't reset sessions that have active background processes
|
||||
if self._has_active_processes_fn:
|
||||
session_key = self._generate_session_key(source)
|
||||
if self._has_active_processes_fn(session_key):
|
||||
return False
|
||||
|
||||
policy = self.config.get_reset_policy(
|
||||
platform=source.platform,
|
||||
session_type=source.chat_type
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue