The architecture has been updated

This commit is contained in:
Skyber_2 2026-03-31 23:31:36 +03:00
parent 805f7a017e
commit a01257ead9
1119 changed files with 226 additions and 352 deletions

View file

@ -0,0 +1,13 @@
"""Hermes execution environment backends.
Each backend provides the same interface (BaseEnvironment ABC) for running
shell commands in a specific execution context: local, Docker, Singularity,
SSH, Modal, or Daytona.
The terminal_tool.py factory (_create_environment) selects the backend
based on the TERMINAL_ENV configuration.
"""
from tools.environments.base import BaseEnvironment
__all__ = ["BaseEnvironment"]

View file

@ -0,0 +1,99 @@
"""Base class for all Hermes execution environment backends."""
from abc import ABC, abstractmethod
import os
import subprocess
from pathlib import Path
from hermes_cli.config import get_hermes_home
def get_sandbox_dir() -> Path:
"""Return the host-side root for all sandbox storage (Docker workspaces,
Singularity overlays/SIF cache, etc.).
Configurable via TERMINAL_SANDBOX_DIR. Defaults to {HERMES_HOME}/sandboxes/.
"""
custom = os.getenv("TERMINAL_SANDBOX_DIR")
if custom:
p = Path(custom)
else:
p = get_hermes_home() / "sandboxes"
p.mkdir(parents=True, exist_ok=True)
return p
class BaseEnvironment(ABC):
"""Common interface for all Hermes execution backends.
Subclasses implement execute() and cleanup(). Shared helpers eliminate
duplicated subprocess boilerplate across backends.
"""
def __init__(self, cwd: str, timeout: int, env: dict = None):
self.cwd = cwd
self.timeout = timeout
self.env = env or {}
@abstractmethod
def execute(self, command: str, cwd: str = "", *,
timeout: int | None = None,
stdin_data: str | None = None) -> dict:
"""Execute a command, return {"output": str, "returncode": int}."""
...
@abstractmethod
def cleanup(self):
"""Release backend resources (container, instance, connection)."""
...
def stop(self):
"""Alias for cleanup (compat with older callers)."""
self.cleanup()
def __del__(self):
try:
self.cleanup()
except Exception:
pass
# ------------------------------------------------------------------
# Shared helpers (eliminate duplication across backends)
# ------------------------------------------------------------------
def _prepare_command(self, command: str) -> tuple[str, str | None]:
"""Transform sudo commands if SUDO_PASSWORD is available.
Returns:
(transformed_command, sudo_stdin) see _transform_sudo_command
for the full contract. Callers that drive a subprocess directly
should prepend sudo_stdin (when not None) to any stdin_data they
pass to Popen. Callers that embed stdin via heredoc (modal,
daytona) handle sudo_stdin in their own execute() method.
"""
from tools.terminal_tool import _transform_sudo_command
return _transform_sudo_command(command)
def _build_run_kwargs(self, timeout: int | None,
stdin_data: str | None = None) -> dict:
"""Build common subprocess.run kwargs for non-interactive execution."""
kw = {
"text": True,
"timeout": timeout or self.timeout,
"encoding": "utf-8",
"errors": "replace",
"stdout": subprocess.PIPE,
"stderr": subprocess.STDOUT,
}
if stdin_data is not None:
kw["input"] = stdin_data
else:
kw["stdin"] = subprocess.DEVNULL
return kw
def _timeout_result(self, timeout: int | None) -> dict:
"""Standard return dict when a command times out."""
return {
"output": f"Command timed out after {timeout or self.timeout}s",
"returncode": 124,
}

View file

@ -0,0 +1,250 @@
"""Daytona cloud execution environment.
Uses the Daytona Python SDK to run commands in cloud sandboxes.
Supports persistent sandboxes: when enabled, sandboxes are stopped on cleanup
and resumed on next creation, preserving the filesystem across sessions.
"""
import logging
import time
import math
import shlex
import threading
import uuid
import warnings
from typing import Optional
from tools.environments.base import BaseEnvironment
from tools.interrupt import is_interrupted
logger = logging.getLogger(__name__)
class DaytonaEnvironment(BaseEnvironment):
"""Daytona cloud sandbox execution backend.
Uses stopped/started sandbox lifecycle for filesystem persistence
instead of snapshots, making it faster and stateless on the host.
"""
def __init__(
self,
image: str,
cwd: str = "/home/daytona",
timeout: int = 60,
cpu: int = 1,
memory: int = 5120, # MB (hermes convention)
disk: int = 10240, # MB (Daytona platform max is 10GB)
persistent_filesystem: bool = True,
task_id: str = "default",
):
self._requested_cwd = cwd
super().__init__(cwd=cwd, timeout=timeout)
from daytona import (
Daytona,
CreateSandboxFromImageParams,
DaytonaError,
Resources,
SandboxState,
)
self._persistent = persistent_filesystem
self._task_id = task_id
self._SandboxState = SandboxState
self._daytona = Daytona()
self._sandbox = None
self._lock = threading.Lock()
memory_gib = max(1, math.ceil(memory / 1024))
disk_gib = max(1, math.ceil(disk / 1024))
if disk_gib > 10:
warnings.warn(
f"Daytona: requested disk ({disk_gib}GB) exceeds platform limit (10GB). "
f"Capping to 10GB. Set container_disk: 10240 in config to silence this.",
stacklevel=2,
)
disk_gib = 10
resources = Resources(cpu=cpu, memory=memory_gib, disk=disk_gib)
labels = {"hermes_task_id": task_id}
sandbox_name = f"hermes-{task_id}"
# Try to resume an existing sandbox for this task
if self._persistent:
# 1. Try name-based lookup (new path)
try:
self._sandbox = self._daytona.get(sandbox_name)
self._sandbox.start()
logger.info("Daytona: resumed sandbox %s for task %s",
self._sandbox.id, task_id)
except DaytonaError:
self._sandbox = None
except Exception as e:
logger.warning("Daytona: failed to resume sandbox for task %s: %s",
task_id, e)
self._sandbox = None
# 2. Legacy fallback: find sandbox created before the naming migration
if self._sandbox is None:
try:
page = self._daytona.list(labels=labels, page=1, limit=1)
if page.items:
self._sandbox = page.items[0]
self._sandbox.start()
logger.info("Daytona: resumed legacy sandbox %s for task %s",
self._sandbox.id, task_id)
except Exception as e:
logger.debug("Daytona: no legacy sandbox found for task %s: %s",
task_id, e)
self._sandbox = None
# Create a fresh sandbox if we don't have one
if self._sandbox is None:
self._sandbox = self._daytona.create(
CreateSandboxFromImageParams(
image=image,
name=sandbox_name,
labels=labels,
auto_stop_interval=0,
resources=resources,
)
)
logger.info("Daytona: created sandbox %s for task %s",
self._sandbox.id, task_id)
# Resolve cwd: detect actual home dir inside the sandbox
if self._requested_cwd in ("~", "/home/daytona"):
try:
home = self._sandbox.process.exec("echo $HOME").result.strip()
if home:
self.cwd = home
except Exception:
pass # leave cwd as-is; sandbox will use its own default
logger.info("Daytona: resolved cwd to %s", self.cwd)
def _ensure_sandbox_ready(self):
"""Restart sandbox if it was stopped (e.g., by a previous interrupt)."""
self._sandbox.refresh_data()
if self._sandbox.state in (self._SandboxState.STOPPED, self._SandboxState.ARCHIVED):
self._sandbox.start()
logger.info("Daytona: restarted sandbox %s", self._sandbox.id)
def _exec_in_thread(self, exec_command: str, cwd: Optional[str], timeout: int) -> dict:
"""Run exec in a background thread with interrupt polling.
The Daytona SDK's exec(timeout=...) parameter is unreliable (the
server-side timeout is not enforced and the SDK has no client-side
fallback), so we wrap the command with the shell ``timeout`` utility
which reliably kills the process and returns exit code 124.
"""
# Wrap with shell `timeout` to enforce the deadline reliably.
# Add a small buffer so the shell timeout fires before any SDK-level
# timeout would, giving us a clean exit code 124.
timed_command = f"timeout {timeout} sh -c {shlex.quote(exec_command)}"
result_holder: dict = {"value": None, "error": None}
def _run():
try:
response = self._sandbox.process.exec(
timed_command, cwd=cwd,
)
result_holder["value"] = {
"output": response.result or "",
"returncode": response.exit_code,
}
except Exception as e:
result_holder["error"] = e
t = threading.Thread(target=_run, daemon=True)
t.start()
# Wait for timeout + generous buffer for network/SDK overhead
deadline = time.monotonic() + timeout + 10
while t.is_alive():
t.join(timeout=0.2)
if is_interrupted():
with self._lock:
try:
self._sandbox.stop()
except Exception:
pass
return {
"output": "[Command interrupted - Daytona sandbox stopped]",
"returncode": 130,
}
if time.monotonic() > deadline:
# Shell timeout didn't fire and SDK is hung — force stop
with self._lock:
try:
self._sandbox.stop()
except Exception:
pass
return self._timeout_result(timeout)
if result_holder["error"]:
return {"error": result_holder["error"]}
return result_holder["value"]
def execute(self, command: str, cwd: str = "", *,
timeout: Optional[int] = None,
stdin_data: Optional[str] = None) -> dict:
with self._lock:
self._ensure_sandbox_ready()
if stdin_data is not None:
marker = f"HERMES_EOF_{uuid.uuid4().hex[:8]}"
while marker in stdin_data:
marker = f"HERMES_EOF_{uuid.uuid4().hex[:8]}"
command = f"{command} << '{marker}'\n{stdin_data}\n{marker}"
exec_command, sudo_stdin = self._prepare_command(command)
# Daytona sandboxes execute commands via the Daytona SDK and cannot
# pipe subprocess stdin directly the way a local Popen can. When a
# sudo password is present, use a shell-level pipe from printf so that
# the password feeds sudo -S without appearing as an echo argument
# embedded in the shell string. The password is still visible in the
# remote sandbox's command line, but it is not exposed on the user's
# local machine — which is the primary threat being mitigated.
if sudo_stdin is not None:
import shlex
exec_command = (
f"printf '%s\\n' {shlex.quote(sudo_stdin.rstrip())} | {exec_command}"
)
effective_cwd = cwd or self.cwd or None
effective_timeout = timeout or self.timeout
result = self._exec_in_thread(exec_command, effective_cwd, effective_timeout)
if "error" in result:
from daytona import DaytonaError
err = result["error"]
if isinstance(err, DaytonaError):
with self._lock:
try:
self._ensure_sandbox_ready()
except Exception:
return {"output": f"Daytona execution error: {err}", "returncode": 1}
result = self._exec_in_thread(exec_command, effective_cwd, effective_timeout)
if "error" not in result:
return result
return {"output": f"Daytona execution error: {err}", "returncode": 1}
return result
def cleanup(self):
with self._lock:
if self._sandbox is None:
return
try:
if self._persistent:
self._sandbox.stop()
logger.info("Daytona: stopped sandbox %s (filesystem preserved)",
self._sandbox.id)
else:
self._daytona.delete(self._sandbox)
logger.info("Daytona: deleted sandbox %s", self._sandbox.id)
except Exception as e:
logger.warning("Daytona: cleanup failed: %s", e)
self._sandbox = None

View file

@ -0,0 +1,494 @@
"""Docker execution environment for sandboxed command execution.
Security hardened (cap-drop ALL, no-new-privileges, PID limits),
configurable resource limits (CPU, memory, disk), and optional filesystem
persistence via bind mounts.
"""
import logging
import os
import re
import shutil
import subprocess
import sys
import threading
import time
import uuid
from typing import Optional
from tools.environments.base import BaseEnvironment
from tools.interrupt import is_interrupted
logger = logging.getLogger(__name__)
# Common Docker Desktop install paths checked when 'docker' is not in PATH.
# macOS Intel: /usr/local/bin, macOS Apple Silicon (Homebrew): /opt/homebrew/bin,
# Docker Desktop app bundle: /Applications/Docker.app/Contents/Resources/bin
_DOCKER_SEARCH_PATHS = [
"/usr/local/bin/docker",
"/opt/homebrew/bin/docker",
"/Applications/Docker.app/Contents/Resources/bin/docker",
]
_docker_executable: Optional[str] = None # resolved once, cached
_ENV_VAR_NAME_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$")
def _normalize_forward_env_names(forward_env: list[str] | None) -> list[str]:
"""Return a deduplicated list of valid environment variable names."""
normalized: list[str] = []
seen: set[str] = set()
for item in forward_env or []:
if not isinstance(item, str):
logger.warning("Ignoring non-string docker_forward_env entry: %r", item)
continue
key = item.strip()
if not key:
continue
if not _ENV_VAR_NAME_RE.match(key):
logger.warning("Ignoring invalid docker_forward_env entry: %r", item)
continue
if key in seen:
continue
seen.add(key)
normalized.append(key)
return normalized
def _load_hermes_env_vars() -> dict[str, str]:
"""Load ~/.hermes/.env values without failing Docker command execution."""
try:
from hermes_cli.config import load_env
return load_env() or {}
except Exception:
return {}
def find_docker() -> Optional[str]:
"""Locate the docker CLI binary.
Checks ``shutil.which`` first (respects PATH), then probes well-known
install locations on macOS where Docker Desktop may not be in PATH
(e.g. when running as a gateway service via launchd).
Returns the absolute path, or ``None`` if docker cannot be found.
"""
global _docker_executable
if _docker_executable is not None:
return _docker_executable
found = shutil.which("docker")
if found:
_docker_executable = found
return found
for path in _DOCKER_SEARCH_PATHS:
if os.path.isfile(path) and os.access(path, os.X_OK):
_docker_executable = path
logger.info("Found docker at non-PATH location: %s", path)
return path
return None
# Security flags applied to every container.
# The container itself is the security boundary (isolated from host).
# We drop all capabilities then add back the minimum needed:
# DAC_OVERRIDE - root can write to bind-mounted dirs owned by host user
# CHOWN/FOWNER - package managers (pip, npm, apt) need to set file ownership
# Block privilege escalation and limit PIDs.
# /tmp is size-limited and nosuid but allows exec (needed by pip/npm builds).
_SECURITY_ARGS = [
"--cap-drop", "ALL",
"--cap-add", "DAC_OVERRIDE",
"--cap-add", "CHOWN",
"--cap-add", "FOWNER",
"--security-opt", "no-new-privileges",
"--pids-limit", "256",
"--tmpfs", "/tmp:rw,nosuid,size=512m",
"--tmpfs", "/var/tmp:rw,noexec,nosuid,size=256m",
"--tmpfs", "/run:rw,noexec,nosuid,size=64m",
]
_storage_opt_ok: Optional[bool] = None # cached result across instances
def _ensure_docker_available() -> None:
"""Best-effort check that the docker CLI is available before use.
Reuses ``find_docker()`` so this preflight stays consistent with the rest of
the Docker backend, including known non-PATH Docker Desktop locations.
"""
docker_exe = find_docker()
if not docker_exe:
logger.error(
"Docker backend selected but no docker executable was found in PATH "
"or known install locations. Install Docker Desktop and ensure the "
"CLI is available."
)
raise RuntimeError(
"Docker executable not found in PATH or known install locations. "
"Install Docker and ensure the 'docker' command is available."
)
try:
result = subprocess.run(
[docker_exe, "version"],
capture_output=True,
text=True,
timeout=5,
)
except FileNotFoundError:
logger.error(
"Docker backend selected but the resolved docker executable '%s' could "
"not be executed.",
docker_exe,
exc_info=True,
)
raise RuntimeError(
"Docker executable could not be executed. Check your Docker installation."
)
except subprocess.TimeoutExpired:
logger.error(
"Docker backend selected but '%s version' timed out. "
"The Docker daemon may not be running.",
docker_exe,
exc_info=True,
)
raise RuntimeError(
"Docker daemon is not responding. Ensure Docker is running and try again."
)
except Exception:
logger.error(
"Unexpected error while checking Docker availability.",
exc_info=True,
)
raise
else:
if result.returncode != 0:
logger.error(
"Docker backend selected but '%s version' failed "
"(exit code %d, stderr=%s)",
docker_exe,
result.returncode,
result.stderr.strip(),
)
raise RuntimeError(
"Docker command is available but 'docker version' failed. "
"Check your Docker installation."
)
class DockerEnvironment(BaseEnvironment):
"""Hardened Docker container execution with resource limits and persistence.
Security: all capabilities dropped, no privilege escalation, PID limits,
size-limited tmpfs for scratch dirs. The container itself is the security
boundary the filesystem inside is writable so agents can install packages
(pip, npm, apt) as needed. Writable workspace via tmpfs or bind mounts.
Persistence: when enabled, bind mounts preserve /workspace and /root
across container restarts.
"""
def __init__(
self,
image: str,
cwd: str = "/root",
timeout: int = 60,
cpu: float = 0,
memory: int = 0,
disk: int = 0,
persistent_filesystem: bool = False,
task_id: str = "default",
volumes: list = None,
forward_env: list[str] | None = None,
network: bool = True,
host_cwd: str = None,
auto_mount_cwd: bool = False,
):
if cwd == "~":
cwd = "/root"
super().__init__(cwd=cwd, timeout=timeout)
self._base_image = image
self._persistent = persistent_filesystem
self._task_id = task_id
self._forward_env = _normalize_forward_env_names(forward_env)
self._container_id: Optional[str] = None
logger.info(f"DockerEnvironment volumes: {volumes}")
# Ensure volumes is a list (config.yaml could be malformed)
if volumes is not None and not isinstance(volumes, list):
logger.warning(f"docker_volumes config is not a list: {volumes!r}")
volumes = []
# Fail fast if Docker is not available.
_ensure_docker_available()
# Build resource limit args
resource_args = []
if cpu > 0:
resource_args.extend(["--cpus", str(cpu)])
if memory > 0:
resource_args.extend(["--memory", f"{memory}m"])
if disk > 0 and sys.platform != "darwin":
if self._storage_opt_supported():
resource_args.extend(["--storage-opt", f"size={disk}m"])
else:
logger.warning(
"Docker storage driver does not support per-container disk limits "
"(requires overlay2 on XFS with pquota). Container will run without disk quota."
)
if not network:
resource_args.append("--network=none")
# Persistent workspace via bind mounts from a configurable host directory
# (TERMINAL_SANDBOX_DIR, default ~/.hermes/sandboxes/). Non-persistent
# mode uses tmpfs (ephemeral, fast, gone on cleanup).
from tools.environments.base import get_sandbox_dir
# User-configured volume mounts (from config.yaml docker_volumes)
volume_args = []
workspace_explicitly_mounted = False
for vol in (volumes or []):
if not isinstance(vol, str):
logger.warning(f"Docker volume entry is not a string: {vol!r}")
continue
vol = vol.strip()
if not vol:
continue
if ":" in vol:
volume_args.extend(["-v", vol])
if ":/workspace" in vol:
workspace_explicitly_mounted = True
else:
logger.warning(f"Docker volume '{vol}' missing colon, skipping")
host_cwd_abs = os.path.abspath(os.path.expanduser(host_cwd)) if host_cwd else ""
bind_host_cwd = (
auto_mount_cwd
and bool(host_cwd_abs)
and os.path.isdir(host_cwd_abs)
and not workspace_explicitly_mounted
)
if auto_mount_cwd and host_cwd and not os.path.isdir(host_cwd_abs):
logger.debug(f"Skipping docker cwd mount: host_cwd is not a valid directory: {host_cwd}")
self._workspace_dir: Optional[str] = None
self._home_dir: Optional[str] = None
writable_args = []
if self._persistent:
sandbox = get_sandbox_dir() / "docker" / task_id
self._home_dir = str(sandbox / "home")
os.makedirs(self._home_dir, exist_ok=True)
writable_args.extend([
"-v", f"{self._home_dir}:/root",
])
if not bind_host_cwd and not workspace_explicitly_mounted:
self._workspace_dir = str(sandbox / "workspace")
os.makedirs(self._workspace_dir, exist_ok=True)
writable_args.extend([
"-v", f"{self._workspace_dir}:/workspace",
])
else:
if not bind_host_cwd and not workspace_explicitly_mounted:
writable_args.extend([
"--tmpfs", "/workspace:rw,exec,size=10g",
])
writable_args.extend([
"--tmpfs", "/home:rw,exec,size=1g",
"--tmpfs", "/root:rw,exec,size=1g",
])
if bind_host_cwd:
logger.info(f"Mounting configured host cwd to /workspace: {host_cwd_abs}")
volume_args = ["-v", f"{host_cwd_abs}:/workspace", *volume_args]
elif workspace_explicitly_mounted:
logger.debug("Skipping docker cwd mount: /workspace already mounted by user config")
logger.info(f"Docker volume_args: {volume_args}")
all_run_args = list(_SECURITY_ARGS) + writable_args + resource_args + volume_args
logger.info(f"Docker run_args: {all_run_args}")
# Resolve the docker executable once so it works even when
# /usr/local/bin is not in PATH (common on macOS gateway/service).
self._docker_exe = find_docker() or "docker"
# Start the container directly via `docker run -d`.
container_name = f"hermes-{uuid.uuid4().hex[:8]}"
run_cmd = [
self._docker_exe, "run", "-d",
"--name", container_name,
"-w", cwd,
*all_run_args,
image,
"sleep", "2h",
]
logger.debug(f"Starting container: {' '.join(run_cmd)}")
result = subprocess.run(
run_cmd,
capture_output=True,
text=True,
timeout=120, # image pull may take a while
check=True,
)
self._container_id = result.stdout.strip()
logger.info(f"Started container {container_name} ({self._container_id[:12]})")
@staticmethod
def _storage_opt_supported() -> bool:
"""Check if Docker's storage driver supports --storage-opt size=.
Only overlay2 on XFS with pquota supports per-container disk quotas.
Ubuntu (and most distros) default to ext4, where this flag errors out.
"""
global _storage_opt_ok
if _storage_opt_ok is not None:
return _storage_opt_ok
try:
docker = find_docker() or "docker"
result = subprocess.run(
[docker, "info", "--format", "{{.Driver}}"],
capture_output=True, text=True, timeout=10,
)
driver = result.stdout.strip().lower()
if driver != "overlay2":
_storage_opt_ok = False
return False
# overlay2 only supports storage-opt on XFS with pquota.
# Probe by attempting a dry-ish run — the fastest reliable check.
probe = subprocess.run(
[docker, "create", "--storage-opt", "size=1m", "hello-world"],
capture_output=True, text=True, timeout=15,
)
if probe.returncode == 0:
# Clean up the created container
container_id = probe.stdout.strip()
if container_id:
subprocess.run([docker, "rm", container_id],
capture_output=True, timeout=5)
_storage_opt_ok = True
else:
_storage_opt_ok = False
except Exception:
_storage_opt_ok = False
logger.debug("Docker --storage-opt support: %s", _storage_opt_ok)
return _storage_opt_ok
def execute(self, command: str, cwd: str = "", *,
timeout: int | None = None,
stdin_data: str | None = None) -> dict:
exec_command, sudo_stdin = self._prepare_command(command)
work_dir = cwd or self.cwd
effective_timeout = timeout or self.timeout
# Merge sudo password (if any) with caller-supplied stdin_data.
if sudo_stdin is not None and stdin_data is not None:
effective_stdin = sudo_stdin + stdin_data
elif sudo_stdin is not None:
effective_stdin = sudo_stdin
else:
effective_stdin = stdin_data
# docker exec -w doesn't expand ~, so prepend a cd into the command
if work_dir == "~" or work_dir.startswith("~/"):
exec_command = f"cd {work_dir} && {exec_command}"
work_dir = "/"
assert self._container_id, "Container not started"
cmd = [self._docker_exe, "exec"]
if effective_stdin is not None:
cmd.append("-i")
cmd.extend(["-w", work_dir])
hermes_env = _load_hermes_env_vars() if self._forward_env else {}
for key in self._forward_env:
value = os.getenv(key)
if value is None:
value = hermes_env.get(key)
if value is not None:
cmd.extend(["-e", f"{key}={value}"])
cmd.extend([self._container_id, "bash", "-lc", exec_command])
try:
_output_chunks = []
proc = subprocess.Popen(
cmd,
stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
stdin=subprocess.PIPE if effective_stdin else subprocess.DEVNULL,
text=True,
)
if effective_stdin:
try:
proc.stdin.write(effective_stdin)
proc.stdin.close()
except Exception:
pass
def _drain():
try:
for line in proc.stdout:
_output_chunks.append(line)
except Exception:
pass
reader = threading.Thread(target=_drain, daemon=True)
reader.start()
deadline = time.monotonic() + effective_timeout
while proc.poll() is None:
if is_interrupted():
proc.terminate()
try:
proc.wait(timeout=1)
except subprocess.TimeoutExpired:
proc.kill()
reader.join(timeout=2)
return {
"output": "".join(_output_chunks) + "\n[Command interrupted]",
"returncode": 130,
}
if time.monotonic() > deadline:
proc.kill()
reader.join(timeout=2)
return self._timeout_result(effective_timeout)
time.sleep(0.2)
reader.join(timeout=5)
return {"output": "".join(_output_chunks), "returncode": proc.returncode}
except Exception as e:
return {"output": f"Docker execution error: {e}", "returncode": 1}
def cleanup(self):
"""Stop and remove the container. Bind-mount dirs persist if persistent=True."""
if self._container_id:
try:
# Stop in background so cleanup doesn't block
stop_cmd = (
f"(timeout 60 {self._docker_exe} stop {self._container_id} || "
f"{self._docker_exe} rm -f {self._container_id}) >/dev/null 2>&1 &"
)
subprocess.Popen(stop_cmd, shell=True)
except Exception as e:
logger.warning("Failed to stop container %s: %s", self._container_id, e)
if not self._persistent:
# Also schedule removal (stop only leaves it as stopped)
try:
subprocess.Popen(
f"sleep 3 && {self._docker_exe} rm -f {self._container_id} >/dev/null 2>&1 &",
shell=True,
)
except Exception:
pass
self._container_id = None
if not self._persistent:
for d in (self._workspace_dir, self._home_dir):
if d:
shutil.rmtree(d, ignore_errors=True)

View file

@ -0,0 +1,476 @@
"""Local execution environment with interrupt support and non-blocking I/O."""
import glob
import os
import platform
import shutil
import signal
import subprocess
import threading
import time
_IS_WINDOWS = platform.system() == "Windows"
from tools.environments.base import BaseEnvironment
from tools.environments.persistent_shell import PersistentShellMixin
from tools.interrupt import is_interrupted
# Unique marker to isolate real command output from shell init/exit noise.
# printf (no trailing newline) keeps the boundaries clean for splitting.
_OUTPUT_FENCE = "__HERMES_FENCE_a9f7b3__"
# Hermes-internal env vars that should NOT leak into terminal subprocesses.
# These are loaded from ~/.hermes/.env for Hermes' own LLM/provider calls
# but can break external CLIs (e.g. codex) that also honor them.
# See: https://github.com/NousResearch/hermes-agent/issues/1002
#
# Built dynamically from the provider registry so new providers are
# automatically covered without manual blocklist maintenance.
_HERMES_PROVIDER_ENV_FORCE_PREFIX = "_HERMES_FORCE_"
def _build_provider_env_blocklist() -> frozenset:
"""Derive the blocklist from provider, tool, and gateway config.
Automatically picks up api_key_env_vars and base_url_env_var from
every registered provider, plus tool/messaging env vars from the
optional config registry, so new Hermes-managed secrets are blocked
in subprocesses without having to maintain multiple static lists.
"""
blocked: set[str] = set()
try:
from hermes_cli.auth import PROVIDER_REGISTRY
for pconfig in PROVIDER_REGISTRY.values():
blocked.update(pconfig.api_key_env_vars)
if pconfig.base_url_env_var:
blocked.add(pconfig.base_url_env_var)
except ImportError:
pass
try:
from hermes_cli.config import OPTIONAL_ENV_VARS
for name, metadata in OPTIONAL_ENV_VARS.items():
category = metadata.get("category")
if category in {"tool", "messaging"}:
blocked.add(name)
elif category == "setting" and metadata.get("password"):
blocked.add(name)
except ImportError:
pass
# Vars not covered above but still Hermes-internal / conflict-prone.
blocked.update({
"OPENAI_BASE_URL",
"OPENAI_API_KEY",
"OPENAI_API_BASE", # legacy alias
"OPENAI_ORG_ID",
"OPENAI_ORGANIZATION",
"OPENROUTER_API_KEY",
"ANTHROPIC_BASE_URL",
"ANTHROPIC_TOKEN", # OAuth token (not in registry as env var)
"CLAUDE_CODE_OAUTH_TOKEN",
"LLM_MODEL",
# Expanded isolation for other major providers (Issue #1002)
"GOOGLE_API_KEY", # Gemini / Google AI Studio
"DEEPSEEK_API_KEY", # DeepSeek
"MISTRAL_API_KEY", # Mistral AI
"GROQ_API_KEY", # Groq
"TOGETHER_API_KEY", # Together AI
"PERPLEXITY_API_KEY", # Perplexity
"COHERE_API_KEY", # Cohere
"FIREWORKS_API_KEY", # Fireworks AI
"XAI_API_KEY", # xAI (Grok)
"HELICONE_API_KEY", # LLM Observability proxy
"PARALLEL_API_KEY",
"FIRECRAWL_API_KEY",
"FIRECRAWL_API_URL",
# Gateway/runtime config not represented in OPTIONAL_ENV_VARS.
"TELEGRAM_HOME_CHANNEL",
"TELEGRAM_HOME_CHANNEL_NAME",
"DISCORD_HOME_CHANNEL",
"DISCORD_HOME_CHANNEL_NAME",
"DISCORD_REQUIRE_MENTION",
"DISCORD_FREE_RESPONSE_CHANNELS",
"DISCORD_AUTO_THREAD",
"SLACK_HOME_CHANNEL",
"SLACK_HOME_CHANNEL_NAME",
"SLACK_ALLOWED_USERS",
"WHATSAPP_ENABLED",
"WHATSAPP_MODE",
"WHATSAPP_ALLOWED_USERS",
"SIGNAL_HTTP_URL",
"SIGNAL_ACCOUNT",
"SIGNAL_ALLOWED_USERS",
"SIGNAL_GROUP_ALLOWED_USERS",
"SIGNAL_HOME_CHANNEL",
"SIGNAL_HOME_CHANNEL_NAME",
"SIGNAL_IGNORE_STORIES",
"HASS_TOKEN",
"HASS_URL",
"EMAIL_ADDRESS",
"EMAIL_PASSWORD",
"EMAIL_IMAP_HOST",
"EMAIL_SMTP_HOST",
"EMAIL_HOME_ADDRESS",
"EMAIL_HOME_ADDRESS_NAME",
"GATEWAY_ALLOWED_USERS",
# Skills Hub / GitHub app auth paths and aliases.
"GH_TOKEN",
"GITHUB_APP_ID",
"GITHUB_APP_PRIVATE_KEY_PATH",
"GITHUB_APP_INSTALLATION_ID",
# Remote sandbox backend credentials.
"MODAL_TOKEN_ID",
"MODAL_TOKEN_SECRET",
"DAYTONA_API_KEY",
})
return frozenset(blocked)
_HERMES_PROVIDER_ENV_BLOCKLIST = _build_provider_env_blocklist()
def _sanitize_subprocess_env(base_env: dict | None, extra_env: dict | None = None) -> dict:
"""Filter Hermes-managed secrets from a subprocess environment.
`_HERMES_FORCE_<VAR>` entries in ``extra_env`` opt a blocked variable back in
intentionally for callers that truly need it. Vars registered via
:mod:`tools.env_passthrough` (skill-declared or user-configured) also
bypass the blocklist.
"""
try:
from tools.env_passthrough import is_env_passthrough as _is_passthrough
except Exception:
_is_passthrough = lambda _: False # noqa: E731
sanitized: dict[str, str] = {}
for key, value in (base_env or {}).items():
if key.startswith(_HERMES_PROVIDER_ENV_FORCE_PREFIX):
continue
if key not in _HERMES_PROVIDER_ENV_BLOCKLIST or _is_passthrough(key):
sanitized[key] = value
for key, value in (extra_env or {}).items():
if key.startswith(_HERMES_PROVIDER_ENV_FORCE_PREFIX):
real_key = key[len(_HERMES_PROVIDER_ENV_FORCE_PREFIX):]
sanitized[real_key] = value
elif key not in _HERMES_PROVIDER_ENV_BLOCKLIST or _is_passthrough(key):
sanitized[key] = value
return sanitized
def _find_bash() -> str:
"""Find bash for command execution.
The fence wrapper uses bash syntax (semicolons, $?, printf), so we
must use bash not the user's $SHELL which could be fish/zsh/etc.
On Windows: uses Git Bash (bundled with Git for Windows).
"""
if not _IS_WINDOWS:
return (
shutil.which("bash")
or ("/usr/bin/bash" if os.path.isfile("/usr/bin/bash") else None)
or ("/bin/bash" if os.path.isfile("/bin/bash") else None)
or os.environ.get("SHELL") # last resort: whatever they have
or "/bin/sh"
)
# Windows: look for Git Bash (installed with Git for Windows).
# Allow override via env var (same pattern as Claude Code).
custom = os.environ.get("HERMES_GIT_BASH_PATH")
if custom and os.path.isfile(custom):
return custom
# shutil.which finds bash.exe if Git\bin is on PATH
found = shutil.which("bash")
if found:
return found
# Check common Git for Windows install locations
for candidate in (
os.path.join(os.environ.get("ProgramFiles", r"C:\Program Files"), "Git", "bin", "bash.exe"),
os.path.join(os.environ.get("ProgramFiles(x86)", r"C:\Program Files (x86)"), "Git", "bin", "bash.exe"),
os.path.join(os.environ.get("LOCALAPPDATA", ""), "Programs", "Git", "bin", "bash.exe"),
):
if candidate and os.path.isfile(candidate):
return candidate
raise RuntimeError(
"Git Bash not found. Hermes Agent requires Git for Windows on Windows.\n"
"Install it from: https://git-scm.com/download/win\n"
"Or set HERMES_GIT_BASH_PATH to your bash.exe location."
)
# Backward compat — process_registry.py imports this name
_find_shell = _find_bash
# Noise lines emitted by interactive shells when stdin is not a terminal.
# Used as a fallback when output fence markers are missing.
_SHELL_NOISE_SUBSTRINGS = (
# bash
"bash: cannot set terminal process group",
"bash: no job control in this shell",
"no job control in this shell",
"cannot set terminal process group",
"tcsetattr: Inappropriate ioctl for device",
# zsh / oh-my-zsh / macOS terminal session
"Restored session:",
"Saving session...",
"Last login:",
"command not found:",
"Oh My Zsh",
"compinit:",
)
def _clean_shell_noise(output: str) -> str:
"""Strip shell startup/exit warnings that leak when using -i without a TTY.
Removes lines matching known noise patterns from both the beginning
and end of the output. Lines in the middle are left untouched.
"""
def _is_noise(line: str) -> bool:
return any(noise in line for noise in _SHELL_NOISE_SUBSTRINGS)
lines = output.split("\n")
# Strip leading noise
while lines and _is_noise(lines[0]):
lines.pop(0)
# Strip trailing noise (walk backwards, skip empty lines from split)
end = len(lines) - 1
while end >= 0 and (not lines[end] or _is_noise(lines[end])):
end -= 1
if end < 0:
return ""
cleaned = lines[: end + 1]
result = "\n".join(cleaned)
# Preserve trailing newline if original had one
if output.endswith("\n") and result and not result.endswith("\n"):
result += "\n"
return result
# Standard PATH entries for environments with minimal PATH (e.g. systemd services).
# Includes macOS Homebrew paths (/opt/homebrew/* for Apple Silicon).
_SANE_PATH = (
"/opt/homebrew/bin:/opt/homebrew/sbin:"
"/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
)
def _make_run_env(env: dict) -> dict:
"""Build a run environment with a sane PATH and provider-var stripping."""
try:
from tools.env_passthrough import is_env_passthrough as _is_passthrough
except Exception:
_is_passthrough = lambda _: False # noqa: E731
merged = dict(os.environ | env)
run_env = {}
for k, v in merged.items():
if k.startswith(_HERMES_PROVIDER_ENV_FORCE_PREFIX):
real_key = k[len(_HERMES_PROVIDER_ENV_FORCE_PREFIX):]
run_env[real_key] = v
elif k not in _HERMES_PROVIDER_ENV_BLOCKLIST or _is_passthrough(k):
run_env[k] = v
existing_path = run_env.get("PATH", "")
if "/usr/bin" not in existing_path.split(":"):
run_env["PATH"] = f"{existing_path}:{_SANE_PATH}" if existing_path else _SANE_PATH
return run_env
def _extract_fenced_output(raw: str) -> str:
"""Extract real command output from between fence markers.
The execute() method wraps each command with printf(FENCE) markers.
This function finds the first and last fence and returns only the
content between them, which is the actual command output free of
any shell init/exit noise.
Falls back to pattern-based _clean_shell_noise if fences are missing.
"""
first = raw.find(_OUTPUT_FENCE)
if first == -1:
return _clean_shell_noise(raw)
start = first + len(_OUTPUT_FENCE)
last = raw.rfind(_OUTPUT_FENCE)
if last <= first:
# Only start fence found (e.g. user command called `exit`)
return _clean_shell_noise(raw[start:])
return raw[start:last]
class LocalEnvironment(PersistentShellMixin, BaseEnvironment):
"""Run commands directly on the host machine.
Features:
- Popen + polling for interrupt support (user can cancel mid-command)
- Background stdout drain thread to prevent pipe buffer deadlocks
- stdin_data support for piping content (bypasses ARG_MAX limits)
- sudo -S transform via SUDO_PASSWORD env var
- Uses interactive login shell so full user env is available
- Optional persistent shell mode (cwd/env vars survive across calls)
"""
def __init__(self, cwd: str = "", timeout: int = 60, env: dict = None,
persistent: bool = False):
super().__init__(cwd=cwd or os.getcwd(), timeout=timeout, env=env)
self.persistent = persistent
if self.persistent:
self._init_persistent_shell()
@property
def _temp_prefix(self) -> str:
return f"/tmp/hermes-local-{self._session_id}"
def _spawn_shell_process(self) -> subprocess.Popen:
user_shell = _find_bash()
run_env = _make_run_env(self.env)
return subprocess.Popen(
[user_shell, "-l"],
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.DEVNULL,
text=True,
env=run_env,
preexec_fn=None if _IS_WINDOWS else os.setsid,
)
def _read_temp_files(self, *paths: str) -> list[str]:
results = []
for path in paths:
if os.path.exists(path):
with open(path) as f:
results.append(f.read())
else:
results.append("")
return results
def _kill_shell_children(self):
if self._shell_pid is None:
return
try:
subprocess.run(
["pkill", "-P", str(self._shell_pid)],
capture_output=True, timeout=5,
)
except (subprocess.TimeoutExpired, FileNotFoundError):
pass
def _cleanup_temp_files(self):
for f in glob.glob(f"{self._temp_prefix}-*"):
if os.path.exists(f):
os.remove(f)
def _execute_oneshot(self, command: str, cwd: str = "", *,
timeout: int | None = None,
stdin_data: str | None = None) -> dict:
work_dir = cwd or self.cwd or os.getcwd()
effective_timeout = timeout or self.timeout
exec_command, sudo_stdin = self._prepare_command(command)
if sudo_stdin is not None and stdin_data is not None:
effective_stdin = sudo_stdin + stdin_data
elif sudo_stdin is not None:
effective_stdin = sudo_stdin
else:
effective_stdin = stdin_data
user_shell = _find_bash()
fenced_cmd = (
f"printf '{_OUTPUT_FENCE}';"
f" {exec_command};"
f" __hermes_rc=$?;"
f" printf '{_OUTPUT_FENCE}';"
f" exit $__hermes_rc"
)
run_env = _make_run_env(self.env)
proc = subprocess.Popen(
[user_shell, "-lic", fenced_cmd],
text=True,
cwd=work_dir,
env=run_env,
encoding="utf-8",
errors="replace",
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
stdin=subprocess.PIPE if effective_stdin is not None else subprocess.DEVNULL,
preexec_fn=None if _IS_WINDOWS else os.setsid,
)
if effective_stdin is not None:
def _write_stdin():
try:
proc.stdin.write(effective_stdin)
proc.stdin.close()
except (BrokenPipeError, OSError):
pass
threading.Thread(target=_write_stdin, daemon=True).start()
_output_chunks: list[str] = []
def _drain_stdout():
try:
for line in proc.stdout:
_output_chunks.append(line)
except ValueError:
pass
finally:
try:
proc.stdout.close()
except Exception:
pass
reader = threading.Thread(target=_drain_stdout, daemon=True)
reader.start()
deadline = time.monotonic() + effective_timeout
while proc.poll() is None:
if is_interrupted():
try:
if _IS_WINDOWS:
proc.terminate()
else:
pgid = os.getpgid(proc.pid)
os.killpg(pgid, signal.SIGTERM)
try:
proc.wait(timeout=1.0)
except subprocess.TimeoutExpired:
os.killpg(pgid, signal.SIGKILL)
except (ProcessLookupError, PermissionError):
proc.kill()
reader.join(timeout=2)
return {
"output": "".join(_output_chunks) + "\n[Command interrupted — user sent a new message]",
"returncode": 130,
}
if time.monotonic() > deadline:
try:
if _IS_WINDOWS:
proc.terminate()
else:
os.killpg(os.getpgid(proc.pid), signal.SIGTERM)
except (ProcessLookupError, PermissionError):
proc.kill()
reader.join(timeout=2)
return self._timeout_result(effective_timeout)
time.sleep(0.2)
reader.join(timeout=5)
output = _extract_fenced_output("".join(_output_chunks))
return {"output": output, "returncode": proc.returncode}

View file

@ -0,0 +1,259 @@
"""Modal cloud execution environment using SWE-ReX directly.
Supports persistent filesystem snapshots: when enabled, the sandbox's filesystem
is snapshotted on cleanup and restored on next creation, so installed packages,
project files, and config changes survive across sessions.
"""
import asyncio
import json
import logging
import threading
import uuid
from pathlib import Path
from typing import Any, Dict, Optional
from hermes_cli.config import get_hermes_home
from tools.environments.base import BaseEnvironment
from tools.interrupt import is_interrupted
logger = logging.getLogger(__name__)
_SNAPSHOT_STORE = get_hermes_home() / "modal_snapshots.json"
def _load_snapshots() -> Dict[str, str]:
"""Load snapshot ID mapping from disk."""
if _SNAPSHOT_STORE.exists():
try:
return json.loads(_SNAPSHOT_STORE.read_text())
except Exception:
pass
return {}
def _save_snapshots(data: Dict[str, str]) -> None:
"""Persist snapshot ID mapping to disk."""
_SNAPSHOT_STORE.parent.mkdir(parents=True, exist_ok=True)
_SNAPSHOT_STORE.write_text(json.dumps(data, indent=2))
class _AsyncWorker:
"""Background thread with its own event loop for async-safe swe-rex calls.
Allows sync code to submit async coroutines and block for results,
even when called from inside another running event loop (e.g. Atropos).
"""
def __init__(self):
self._loop: Optional[asyncio.AbstractEventLoop] = None
self._thread: Optional[threading.Thread] = None
self._started = threading.Event()
def start(self):
self._thread = threading.Thread(target=self._run_loop, daemon=True)
self._thread.start()
self._started.wait(timeout=30)
def _run_loop(self):
self._loop = asyncio.new_event_loop()
asyncio.set_event_loop(self._loop)
self._started.set()
self._loop.run_forever()
def run_coroutine(self, coro, timeout=600):
if self._loop is None or self._loop.is_closed():
raise RuntimeError("AsyncWorker loop is not running")
future = asyncio.run_coroutine_threadsafe(coro, self._loop)
return future.result(timeout=timeout)
def stop(self):
if self._loop and self._loop.is_running():
self._loop.call_soon_threadsafe(self._loop.stop)
if self._thread:
self._thread.join(timeout=10)
class ModalEnvironment(BaseEnvironment):
"""Modal cloud execution via SWE-ReX.
Uses swe-rex's ModalDeployment directly for sandbox management.
Adds sudo -S support, configurable resources (CPU, memory, disk),
and optional filesystem persistence via Modal's snapshot API.
"""
def __init__(
self,
image: str,
cwd: str = "/root",
timeout: int = 60,
modal_sandbox_kwargs: Optional[Dict[str, Any]] = None,
persistent_filesystem: bool = True,
task_id: str = "default",
):
super().__init__(cwd=cwd, timeout=timeout)
self._persistent = persistent_filesystem
self._task_id = task_id
self._base_image = image
self._deployment = None
self._worker = _AsyncWorker()
sandbox_kwargs = dict(modal_sandbox_kwargs or {})
# If persistent, try to restore from a previous snapshot
restored_image = None
if self._persistent:
snapshot_id = _load_snapshots().get(self._task_id)
if snapshot_id:
try:
import modal
restored_image = modal.Image.from_id(snapshot_id)
logger.info("Modal: restoring from snapshot %s", snapshot_id[:20])
except Exception as e:
logger.warning("Modal: failed to restore snapshot, using base image: %s", e)
restored_image = None
effective_image = restored_image if restored_image else image
# Pre-build a modal.Image with pip fix for Modal's legacy image builder.
# Some task images have broken pip; fix via ensurepip before Modal uses it.
import modal as _modal
if isinstance(effective_image, str):
effective_image = _modal.Image.from_registry(
effective_image,
setup_dockerfile_commands=[
"RUN rm -rf /usr/local/lib/python*/site-packages/pip* 2>/dev/null; "
"python -m ensurepip --upgrade --default-pip 2>/dev/null || true",
],
)
# Start the async worker thread and create the deployment on it
# so all gRPC channels are bound to the worker's event loop.
self._worker.start()
from swerex.deployment.modal import ModalDeployment
async def _create_and_start():
deployment = ModalDeployment(
image=effective_image,
startup_timeout=180.0,
runtime_timeout=3600.0,
deployment_timeout=3600.0,
install_pipx=True,
modal_sandbox_kwargs=sandbox_kwargs,
)
await deployment.start()
return deployment
self._deployment = self._worker.run_coroutine(_create_and_start())
def execute(self, command: str, cwd: str = "", *,
timeout: int | None = None,
stdin_data: str | None = None) -> dict:
if stdin_data is not None:
marker = f"HERMES_EOF_{uuid.uuid4().hex[:8]}"
while marker in stdin_data:
marker = f"HERMES_EOF_{uuid.uuid4().hex[:8]}"
command = f"{command} << '{marker}'\n{stdin_data}\n{marker}"
exec_command, sudo_stdin = self._prepare_command(command)
# Modal sandboxes execute commands via the Modal SDK and cannot pipe
# subprocess stdin directly the way a local Popen can. When a sudo
# password is present, use a shell-level pipe from printf so that the
# password feeds sudo -S without appearing as an echo argument embedded
# in the shell string.
if sudo_stdin is not None:
import shlex
exec_command = (
f"printf '%s\\n' {shlex.quote(sudo_stdin.rstrip())} | {exec_command}"
)
from swerex.runtime.abstract import Command as RexCommand
effective_cwd = cwd or self.cwd
effective_timeout = timeout or self.timeout
# Run in a background thread so we can poll for interrupts
result_holder = {"value": None, "error": None}
def _run():
try:
async def _do_execute():
return await self._deployment.runtime.execute(
RexCommand(
command=exec_command,
shell=True,
check=False,
cwd=effective_cwd,
timeout=effective_timeout,
merge_output_streams=True,
)
)
output = self._worker.run_coroutine(_do_execute())
result_holder["value"] = {
"output": output.stdout,
"returncode": output.exit_code,
}
except Exception as e:
result_holder["error"] = e
t = threading.Thread(target=_run, daemon=True)
t.start()
while t.is_alive():
t.join(timeout=0.2)
if is_interrupted():
try:
self._worker.run_coroutine(
asyncio.wait_for(self._deployment.stop(), timeout=10),
timeout=15,
)
except Exception:
pass
return {
"output": "[Command interrupted - Modal sandbox terminated]",
"returncode": 130,
}
if result_holder["error"]:
return {"output": f"Modal execution error: {result_holder['error']}", "returncode": 1}
return result_holder["value"]
def cleanup(self):
"""Snapshot the filesystem (if persistent) then stop the sandbox."""
if self._deployment is None:
return
if self._persistent:
try:
sandbox = getattr(self._deployment, '_sandbox', None)
if sandbox:
async def _snapshot():
img = await sandbox.snapshot_filesystem.aio()
return img.object_id
try:
snapshot_id = self._worker.run_coroutine(_snapshot(), timeout=60)
except Exception:
snapshot_id = None
if snapshot_id:
snapshots = _load_snapshots()
snapshots[self._task_id] = snapshot_id
_save_snapshots(snapshots)
logger.info("Modal: saved filesystem snapshot %s for task %s",
snapshot_id[:20], self._task_id)
except Exception as e:
logger.warning("Modal: filesystem snapshot failed: %s", e)
try:
self._worker.run_coroutine(
asyncio.wait_for(self._deployment.stop(), timeout=10),
timeout=15,
)
except Exception:
pass
finally:
self._worker.stop()
self._deployment = None

View file

@ -0,0 +1,272 @@
"""Persistent shell mixin: file-based IPC protocol for long-lived bash shells."""
import logging
import shlex
import subprocess
import threading
import time
import uuid
from abc import abstractmethod
from tools.interrupt import is_interrupted
logger = logging.getLogger(__name__)
class PersistentShellMixin:
"""Mixin that adds persistent shell capability to any BaseEnvironment.
Subclasses must implement ``_spawn_shell_process()``, ``_read_temp_files()``,
``_kill_shell_children()``, ``_execute_oneshot()``, and ``_cleanup_temp_files()``.
"""
persistent: bool
@abstractmethod
def _spawn_shell_process(self) -> subprocess.Popen: ...
@abstractmethod
def _read_temp_files(self, *paths: str) -> list[str]: ...
@abstractmethod
def _kill_shell_children(self): ...
@abstractmethod
def _execute_oneshot(self, command: str, cwd: str, *,
timeout: int | None = None,
stdin_data: str | None = None) -> dict: ...
@abstractmethod
def _cleanup_temp_files(self): ...
_session_id: str = ""
_poll_interval: float = 0.01
@property
def _temp_prefix(self) -> str:
return f"/tmp/hermes-persistent-{self._session_id}"
# ------------------------------------------------------------------
# Lifecycle
# ------------------------------------------------------------------
def _init_persistent_shell(self):
self._shell_lock = threading.Lock()
self._shell_proc: subprocess.Popen | None = None
self._shell_alive: bool = False
self._shell_pid: int | None = None
self._session_id = uuid.uuid4().hex[:12]
p = self._temp_prefix
self._pshell_stdout = f"{p}-stdout"
self._pshell_stderr = f"{p}-stderr"
self._pshell_status = f"{p}-status"
self._pshell_cwd = f"{p}-cwd"
self._pshell_pid_file = f"{p}-pid"
self._shell_proc = self._spawn_shell_process()
self._shell_alive = True
self._drain_thread = threading.Thread(
target=self._drain_shell_output, daemon=True,
)
self._drain_thread.start()
init_script = (
f"export TERM=${{TERM:-dumb}}\n"
f"touch {self._pshell_stdout} {self._pshell_stderr} "
f"{self._pshell_status} {self._pshell_cwd} {self._pshell_pid_file}\n"
f"echo $$ > {self._pshell_pid_file}\n"
f"pwd > {self._pshell_cwd}\n"
)
self._send_to_shell(init_script)
deadline = time.monotonic() + 3.0
while time.monotonic() < deadline:
pid_str = self._read_temp_files(self._pshell_pid_file)[0].strip()
if pid_str.isdigit():
self._shell_pid = int(pid_str)
break
time.sleep(0.05)
else:
logger.warning("Could not read persistent shell PID")
self._shell_pid = None
if self._shell_pid:
logger.info(
"Persistent shell started (session=%s, pid=%d)",
self._session_id, self._shell_pid,
)
reported_cwd = self._read_temp_files(self._pshell_cwd)[0].strip()
if reported_cwd:
self.cwd = reported_cwd
def _cleanup_persistent_shell(self):
if self._shell_proc is None:
return
if self._session_id:
self._cleanup_temp_files()
try:
self._shell_proc.stdin.close()
except Exception:
pass
try:
self._shell_proc.terminate()
self._shell_proc.wait(timeout=3)
except subprocess.TimeoutExpired:
self._shell_proc.kill()
self._shell_alive = False
self._shell_proc = None
if hasattr(self, "_drain_thread") and self._drain_thread.is_alive():
self._drain_thread.join(timeout=1.0)
# ------------------------------------------------------------------
# execute() / cleanup() — shared dispatcher, subclasses inherit
# ------------------------------------------------------------------
def execute(self, command: str, cwd: str = "", *,
timeout: int | None = None,
stdin_data: str | None = None) -> dict:
if self.persistent:
return self._execute_persistent(
command, cwd, timeout=timeout, stdin_data=stdin_data,
)
return self._execute_oneshot(
command, cwd, timeout=timeout, stdin_data=stdin_data,
)
def cleanup(self):
if self.persistent:
self._cleanup_persistent_shell()
# ------------------------------------------------------------------
# Shell I/O
# ------------------------------------------------------------------
def _drain_shell_output(self):
try:
for _ in self._shell_proc.stdout:
pass
except Exception:
pass
self._shell_alive = False
def _send_to_shell(self, text: str):
if not self._shell_alive or self._shell_proc is None:
return
try:
self._shell_proc.stdin.write(text)
self._shell_proc.stdin.flush()
except (BrokenPipeError, OSError):
self._shell_alive = False
def _read_persistent_output(self) -> tuple[str, int, str]:
stdout, stderr, status_raw, cwd = self._read_temp_files(
self._pshell_stdout, self._pshell_stderr,
self._pshell_status, self._pshell_cwd,
)
output = self._merge_output(stdout, stderr)
status = status_raw.strip()
if ":" in status:
status = status.split(":", 1)[1]
try:
exit_code = int(status.strip())
except ValueError:
exit_code = 1
return output, exit_code, cwd.strip()
# ------------------------------------------------------------------
# Execution
# ------------------------------------------------------------------
def _execute_persistent(self, command: str, cwd: str, *,
timeout: int | None = None,
stdin_data: str | None = None) -> dict:
if not self._shell_alive:
logger.info("Persistent shell died, restarting...")
self._init_persistent_shell()
exec_command, sudo_stdin = self._prepare_command(command)
effective_timeout = timeout or self.timeout
if stdin_data or sudo_stdin:
return self._execute_oneshot(
command, cwd, timeout=timeout, stdin_data=stdin_data,
)
with self._shell_lock:
return self._execute_persistent_locked(
exec_command, cwd, effective_timeout,
)
def _execute_persistent_locked(self, command: str, cwd: str,
timeout: int) -> dict:
work_dir = cwd or self.cwd
cmd_id = uuid.uuid4().hex[:8]
truncate = (
f": > {self._pshell_stdout}\n"
f": > {self._pshell_stderr}\n"
f": > {self._pshell_status}\n"
)
self._send_to_shell(truncate)
escaped = command.replace("'", "'\\''")
ipc_script = (
f"cd {shlex.quote(work_dir)}\n"
f"eval '{escaped}' < /dev/null > {self._pshell_stdout} 2> {self._pshell_stderr}\n"
f"__EC=$?\n"
f"pwd > {self._pshell_cwd}\n"
f"echo {cmd_id}:$__EC > {self._pshell_status}\n"
)
self._send_to_shell(ipc_script)
deadline = time.monotonic() + timeout
poll_interval = self._poll_interval
while True:
if is_interrupted():
self._kill_shell_children()
output, _, _ = self._read_persistent_output()
return {
"output": output + "\n[Command interrupted]",
"returncode": 130,
}
if time.monotonic() > deadline:
self._kill_shell_children()
output, _, _ = self._read_persistent_output()
if output:
return {
"output": output + f"\n[Command timed out after {timeout}s]",
"returncode": 124,
}
return self._timeout_result(timeout)
if not self._shell_alive:
return {
"output": "Persistent shell died during execution",
"returncode": 1,
}
status_content = self._read_temp_files(self._pshell_status)[0].strip()
if status_content.startswith(cmd_id + ":"):
break
time.sleep(poll_interval)
output, exit_code, new_cwd = self._read_persistent_output()
if new_cwd:
self.cwd = new_cwd
return {"output": output, "returncode": exit_code}
@staticmethod
def _merge_output(stdout: str, stderr: str) -> str:
parts = []
if stdout.strip():
parts.append(stdout.rstrip("\n"))
if stderr.strip():
parts.append(stderr.rstrip("\n"))
return "\n".join(parts)

View file

@ -0,0 +1,369 @@
"""Singularity/Apptainer persistent container environment.
Security-hardened with --containall, --no-home, capability dropping.
Supports configurable resource limits and optional filesystem persistence
via writable overlay directories that survive across sessions.
"""
import json
import logging
import os
import shutil
import subprocess
import tempfile
import threading
import uuid
from pathlib import Path
from typing import Any, Dict, Optional
from hermes_cli.config import get_hermes_home
from tools.environments.base import BaseEnvironment
from tools.interrupt import is_interrupted
logger = logging.getLogger(__name__)
_SNAPSHOT_STORE = get_hermes_home() / "singularity_snapshots.json"
def _find_singularity_executable() -> str:
"""Locate the apptainer or singularity CLI binary.
Returns the executable name (``"apptainer"`` or ``"singularity"``).
Raises ``RuntimeError`` with install instructions if neither is found.
"""
if shutil.which("apptainer"):
return "apptainer"
if shutil.which("singularity"):
return "singularity"
raise RuntimeError(
"Neither 'apptainer' nor 'singularity' was found in PATH. "
"Install Apptainer (https://apptainer.org/docs/admin/main/installation.html) "
"or Singularity and ensure the CLI is available."
)
def _ensure_singularity_available() -> str:
"""Preflight check: resolve the executable and verify it responds.
Returns the executable name on success.
Raises ``RuntimeError`` with an actionable message on failure.
"""
exe = _find_singularity_executable()
try:
result = subprocess.run(
[exe, "version"],
capture_output=True,
text=True,
timeout=10,
)
except FileNotFoundError:
raise RuntimeError(
f"Singularity backend selected but the resolved executable '{exe}' "
"could not be executed. Check your installation."
)
except subprocess.TimeoutExpired:
raise RuntimeError(
f"'{exe} version' timed out. The runtime may be misconfigured."
)
if result.returncode != 0:
stderr = result.stderr.strip()[:200]
raise RuntimeError(
f"'{exe} version' failed (exit code {result.returncode}): {stderr}"
)
return exe
def _load_snapshots() -> Dict[str, str]:
if _SNAPSHOT_STORE.exists():
try:
return json.loads(_SNAPSHOT_STORE.read_text())
except Exception:
pass
return {}
def _save_snapshots(data: Dict[str, str]) -> None:
_SNAPSHOT_STORE.parent.mkdir(parents=True, exist_ok=True)
_SNAPSHOT_STORE.write_text(json.dumps(data, indent=2))
# -------------------------------------------------------------------------
# Singularity helpers (scratch dir, SIF cache, SIF building)
# -------------------------------------------------------------------------
def _get_scratch_dir() -> Path:
"""Get the best directory for Singularity sandboxes.
Resolution order:
1. TERMINAL_SCRATCH_DIR (explicit override)
2. TERMINAL_SANDBOX_DIR / singularity (shared sandbox root)
3. /scratch (common on HPC clusters)
4. ~/.hermes/sandboxes/singularity (fallback)
"""
custom_scratch = os.getenv("TERMINAL_SCRATCH_DIR")
if custom_scratch:
scratch_path = Path(custom_scratch)
scratch_path.mkdir(parents=True, exist_ok=True)
return scratch_path
from tools.environments.base import get_sandbox_dir
sandbox = get_sandbox_dir() / "singularity"
scratch = Path("/scratch")
if scratch.exists() and os.access(scratch, os.W_OK):
user_scratch = scratch / os.getenv("USER", "hermes") / "hermes-agent"
user_scratch.mkdir(parents=True, exist_ok=True)
logger.info("Using /scratch for sandboxes: %s", user_scratch)
return user_scratch
sandbox.mkdir(parents=True, exist_ok=True)
return sandbox
def _get_apptainer_cache_dir() -> Path:
"""Get the Apptainer cache directory for SIF images."""
cache_dir = os.getenv("APPTAINER_CACHEDIR")
if cache_dir:
cache_path = Path(cache_dir)
cache_path.mkdir(parents=True, exist_ok=True)
return cache_path
scratch = _get_scratch_dir()
cache_path = scratch / ".apptainer"
cache_path.mkdir(parents=True, exist_ok=True)
return cache_path
_sif_build_lock = threading.Lock()
def _get_or_build_sif(image: str, executable: str = "apptainer") -> str:
"""Get or build a SIF image from a docker:// URL.
Returns the path unchanged if it's already a .sif file.
For docker:// URLs, checks the cache and builds if needed.
"""
if image.endswith('.sif') and Path(image).exists():
return image
if not image.startswith('docker://'):
return image
image_name = image.replace('docker://', '').replace('/', '-').replace(':', '-')
cache_dir = _get_apptainer_cache_dir()
sif_path = cache_dir / f"{image_name}.sif"
if sif_path.exists():
return str(sif_path)
with _sif_build_lock:
if sif_path.exists():
return str(sif_path)
logger.info("Building SIF image (one-time setup)...")
logger.info(" Source: %s", image)
logger.info(" Target: %s", sif_path)
tmp_dir = cache_dir / "tmp"
tmp_dir.mkdir(parents=True, exist_ok=True)
env = os.environ.copy()
env["APPTAINER_TMPDIR"] = str(tmp_dir)
env["APPTAINER_CACHEDIR"] = str(cache_dir)
try:
result = subprocess.run(
[executable, "build", str(sif_path), image],
capture_output=True, text=True, timeout=600, env=env,
)
if result.returncode != 0:
logger.warning("SIF build failed, falling back to docker:// URL")
logger.warning(" Error: %s", result.stderr[:500])
return image
logger.info("SIF image built successfully")
return str(sif_path)
except subprocess.TimeoutExpired:
logger.warning("SIF build timed out, falling back to docker:// URL")
if sif_path.exists():
sif_path.unlink()
return image
except Exception as e:
logger.warning("SIF build error: %s, falling back to docker:// URL", e)
return image
# -------------------------------------------------------------------------
# SingularityEnvironment
# -------------------------------------------------------------------------
class SingularityEnvironment(BaseEnvironment):
"""Hardened Singularity/Apptainer container with resource limits and persistence.
Security: --containall (isolated PID/IPC/mount namespaces, no host home mount),
--no-home, writable-tmpfs for scratch space. The container cannot see or modify
the host filesystem outside of explicitly bound paths.
Persistence: when enabled, the writable overlay directory is preserved across
sessions so installed packages and files survive cleanup/restore.
"""
def __init__(
self,
image: str,
cwd: str = "~",
timeout: int = 60,
cpu: float = 0,
memory: int = 0,
disk: int = 0,
persistent_filesystem: bool = False,
task_id: str = "default",
):
super().__init__(cwd=cwd, timeout=timeout)
self.executable = _ensure_singularity_available()
self.image = _get_or_build_sif(image, self.executable)
self.instance_id = f"hermes_{uuid.uuid4().hex[:12]}"
self._instance_started = False
self._persistent = persistent_filesystem
self._task_id = task_id
self._overlay_dir: Optional[Path] = None
# Resource limits
self._cpu = cpu
self._memory = memory
# Persistent overlay directory
if self._persistent:
overlay_base = _get_scratch_dir() / "hermes-overlays"
overlay_base.mkdir(parents=True, exist_ok=True)
self._overlay_dir = overlay_base / f"overlay-{task_id}"
self._overlay_dir.mkdir(parents=True, exist_ok=True)
self._start_instance()
def _start_instance(self):
cmd = [self.executable, "instance", "start"]
# Security: full isolation from host
cmd.extend(["--containall", "--no-home"])
# Writable layer
if self._persistent and self._overlay_dir:
# Persistent writable overlay -- survives across restarts
cmd.extend(["--overlay", str(self._overlay_dir)])
else:
cmd.append("--writable-tmpfs")
# Resource limits (cgroup-based, may require root or appropriate config)
if self._memory > 0:
cmd.extend(["--memory", f"{self._memory}M"])
if self._cpu > 0:
cmd.extend(["--cpus", str(self._cpu)])
cmd.extend([str(self.image), self.instance_id])
try:
result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
if result.returncode != 0:
raise RuntimeError(f"Failed to start instance: {result.stderr}")
self._instance_started = True
logger.info("Singularity instance %s started (persistent=%s)",
self.instance_id, self._persistent)
except subprocess.TimeoutExpired:
raise RuntimeError("Instance start timed out")
def execute(self, command: str, cwd: str = "", *,
timeout: int | None = None,
stdin_data: str | None = None) -> dict:
if not self._instance_started:
return {"output": "Instance not started", "returncode": -1}
effective_timeout = timeout or self.timeout
work_dir = cwd or self.cwd
exec_command, sudo_stdin = self._prepare_command(command)
# Merge sudo password (if any) with caller-supplied stdin_data.
if sudo_stdin is not None and stdin_data is not None:
effective_stdin = sudo_stdin + stdin_data
elif sudo_stdin is not None:
effective_stdin = sudo_stdin
else:
effective_stdin = stdin_data
# apptainer exec --pwd doesn't expand ~, so prepend a cd into the command
if work_dir == "~" or work_dir.startswith("~/"):
exec_command = f"cd {work_dir} && {exec_command}"
work_dir = "/tmp"
cmd = [self.executable, "exec", "--pwd", work_dir,
f"instance://{self.instance_id}",
"bash", "-c", exec_command]
try:
import time as _time
_output_chunks = []
proc = subprocess.Popen(
cmd,
stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
stdin=subprocess.PIPE if effective_stdin else subprocess.DEVNULL,
text=True,
)
if effective_stdin:
try:
proc.stdin.write(effective_stdin)
proc.stdin.close()
except Exception:
pass
def _drain():
try:
for line in proc.stdout:
_output_chunks.append(line)
except Exception:
pass
reader = threading.Thread(target=_drain, daemon=True)
reader.start()
deadline = _time.monotonic() + effective_timeout
while proc.poll() is None:
if is_interrupted():
proc.terminate()
try:
proc.wait(timeout=1)
except subprocess.TimeoutExpired:
proc.kill()
reader.join(timeout=2)
return {
"output": "".join(_output_chunks) + "\n[Command interrupted]",
"returncode": 130,
}
if _time.monotonic() > deadline:
proc.kill()
reader.join(timeout=2)
return self._timeout_result(effective_timeout)
_time.sleep(0.2)
reader.join(timeout=5)
return {"output": "".join(_output_chunks), "returncode": proc.returncode}
except Exception as e:
return {"output": f"Singularity execution error: {e}", "returncode": 1}
def cleanup(self):
"""Stop the instance. If persistent, the overlay dir survives for next creation."""
if self._instance_started:
try:
subprocess.run(
[self.executable, "instance", "stop", self.instance_id],
capture_output=True, text=True, timeout=30,
)
logger.info("Singularity instance %s stopped", self.instance_id)
except Exception as e:
logger.warning("Failed to stop Singularity instance %s: %s", self.instance_id, e)
self._instance_started = False
# Record overlay path for persistence restoration
if self._persistent and self._overlay_dir:
snapshots = _load_snapshots()
snapshots[self._task_id] = str(self._overlay_dir)
_save_snapshots(snapshots)

View file

@ -0,0 +1,232 @@
"""SSH remote execution environment with ControlMaster connection persistence."""
import logging
import shutil
import subprocess
import tempfile
import threading
import time
from pathlib import Path
from tools.environments.base import BaseEnvironment
from tools.environments.persistent_shell import PersistentShellMixin
from tools.interrupt import is_interrupted
logger = logging.getLogger(__name__)
def _ensure_ssh_available() -> None:
"""Fail fast with a clear error when the SSH client is unavailable."""
if not shutil.which("ssh"):
raise RuntimeError(
"SSH is not installed or not in PATH. Install OpenSSH client: apt install openssh-client"
)
class SSHEnvironment(PersistentShellMixin, BaseEnvironment):
"""Run commands on a remote machine over SSH.
Uses SSH ControlMaster for connection persistence so subsequent
commands are fast. Security benefit: the agent cannot modify its
own code since execution happens on a separate machine.
Foreground commands are interruptible: the local ssh process is killed
and a remote kill is attempted over the ControlMaster socket.
When ``persistent=True``, a single long-lived bash shell is kept alive
over SSH and state (cwd, env vars, shell variables) persists across
``execute()`` calls. Output capture uses file-based IPC on the remote
host (stdout/stderr/exit-code written to temp files, polled via fast
ControlMaster one-shot reads).
"""
def __init__(self, host: str, user: str, cwd: str = "~",
timeout: int = 60, port: int = 22, key_path: str = "",
persistent: bool = False):
super().__init__(cwd=cwd, timeout=timeout)
self.host = host
self.user = user
self.port = port
self.key_path = key_path
self.persistent = persistent
self.control_dir = Path(tempfile.gettempdir()) / "hermes-ssh"
self.control_dir.mkdir(parents=True, exist_ok=True)
self.control_socket = self.control_dir / f"{user}@{host}:{port}.sock"
_ensure_ssh_available()
self._establish_connection()
if self.persistent:
self._init_persistent_shell()
def _build_ssh_command(self, extra_args: list | None = None) -> list:
cmd = ["ssh"]
cmd.extend(["-o", f"ControlPath={self.control_socket}"])
cmd.extend(["-o", "ControlMaster=auto"])
cmd.extend(["-o", "ControlPersist=300"])
cmd.extend(["-o", "BatchMode=yes"])
cmd.extend(["-o", "StrictHostKeyChecking=accept-new"])
cmd.extend(["-o", "ConnectTimeout=10"])
if self.port != 22:
cmd.extend(["-p", str(self.port)])
if self.key_path:
cmd.extend(["-i", self.key_path])
if extra_args:
cmd.extend(extra_args)
cmd.append(f"{self.user}@{self.host}")
return cmd
def _establish_connection(self):
cmd = self._build_ssh_command()
cmd.append("echo 'SSH connection established'")
try:
result = subprocess.run(cmd, capture_output=True, text=True, timeout=15)
if result.returncode != 0:
error_msg = result.stderr.strip() or result.stdout.strip()
raise RuntimeError(f"SSH connection failed: {error_msg}")
except subprocess.TimeoutExpired:
raise RuntimeError(f"SSH connection to {self.user}@{self.host} timed out")
_poll_interval: float = 0.15
@property
def _temp_prefix(self) -> str:
return f"/tmp/hermes-ssh-{self._session_id}"
def _spawn_shell_process(self) -> subprocess.Popen:
cmd = self._build_ssh_command()
cmd.append("bash -l")
return subprocess.Popen(
cmd,
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.DEVNULL,
text=True,
)
def _read_temp_files(self, *paths: str) -> list[str]:
if len(paths) == 1:
cmd = self._build_ssh_command()
cmd.append(f"cat {paths[0]} 2>/dev/null")
try:
result = subprocess.run(
cmd, capture_output=True, text=True, timeout=10,
)
return [result.stdout]
except (subprocess.TimeoutExpired, OSError):
return [""]
delim = f"__HERMES_SEP_{self._session_id}__"
script = "; ".join(
f"cat {p} 2>/dev/null; echo '{delim}'" for p in paths
)
cmd = self._build_ssh_command()
cmd.append(script)
try:
result = subprocess.run(
cmd, capture_output=True, text=True, timeout=10,
)
parts = result.stdout.split(delim + "\n")
return [parts[i] if i < len(parts) else "" for i in range(len(paths))]
except (subprocess.TimeoutExpired, OSError):
return [""] * len(paths)
def _kill_shell_children(self):
if self._shell_pid is None:
return
cmd = self._build_ssh_command()
cmd.append(f"pkill -P {self._shell_pid} 2>/dev/null; true")
try:
subprocess.run(cmd, capture_output=True, timeout=5)
except (subprocess.TimeoutExpired, OSError):
pass
def _cleanup_temp_files(self):
cmd = self._build_ssh_command()
cmd.append(f"rm -f {self._temp_prefix}-*")
try:
subprocess.run(cmd, capture_output=True, timeout=5)
except (subprocess.TimeoutExpired, OSError):
pass
def _execute_oneshot(self, command: str, cwd: str = "", *,
timeout: int | None = None,
stdin_data: str | None = None) -> dict:
work_dir = cwd or self.cwd
exec_command, sudo_stdin = self._prepare_command(command)
wrapped = f'cd {work_dir} && {exec_command}'
effective_timeout = timeout or self.timeout
if sudo_stdin is not None and stdin_data is not None:
effective_stdin = sudo_stdin + stdin_data
elif sudo_stdin is not None:
effective_stdin = sudo_stdin
else:
effective_stdin = stdin_data
cmd = self._build_ssh_command()
cmd.append(wrapped)
kwargs = self._build_run_kwargs(timeout, effective_stdin)
kwargs.pop("timeout", None)
_output_chunks = []
proc = subprocess.Popen(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
stdin=subprocess.PIPE if effective_stdin else subprocess.DEVNULL,
text=True,
)
if effective_stdin:
try:
proc.stdin.write(effective_stdin)
proc.stdin.close()
except (BrokenPipeError, OSError):
pass
def _drain():
try:
for line in proc.stdout:
_output_chunks.append(line)
except Exception:
pass
reader = threading.Thread(target=_drain, daemon=True)
reader.start()
deadline = time.monotonic() + effective_timeout
while proc.poll() is None:
if is_interrupted():
proc.terminate()
try:
proc.wait(timeout=1)
except subprocess.TimeoutExpired:
proc.kill()
reader.join(timeout=2)
return {
"output": "".join(_output_chunks) + "\n[Command interrupted]",
"returncode": 130,
}
if time.monotonic() > deadline:
proc.kill()
reader.join(timeout=2)
return self._timeout_result(effective_timeout)
time.sleep(0.2)
reader.join(timeout=5)
return {"output": "".join(_output_chunks), "returncode": proc.returncode}
def cleanup(self):
super().cleanup()
if self.control_socket.exists():
try:
cmd = ["ssh", "-o", f"ControlPath={self.control_socket}",
"-O", "exit", f"{self.user}@{self.host}"]
subprocess.run(cmd, capture_output=True, timeout=5)
except (OSError, subprocess.SubprocessError):
pass
try:
self.control_socket.unlink()
except OSError:
pass