The architecture has been updated
This commit is contained in:
parent
805f7a017e
commit
a01257ead9
1119 changed files with 226 additions and 352 deletions
13
hermes_code/tools/environments/__init__.py
Normal file
13
hermes_code/tools/environments/__init__.py
Normal file
|
|
@ -0,0 +1,13 @@
|
|||
"""Hermes execution environment backends.
|
||||
|
||||
Each backend provides the same interface (BaseEnvironment ABC) for running
|
||||
shell commands in a specific execution context: local, Docker, Singularity,
|
||||
SSH, Modal, or Daytona.
|
||||
|
||||
The terminal_tool.py factory (_create_environment) selects the backend
|
||||
based on the TERMINAL_ENV configuration.
|
||||
"""
|
||||
|
||||
from tools.environments.base import BaseEnvironment
|
||||
|
||||
__all__ = ["BaseEnvironment"]
|
||||
99
hermes_code/tools/environments/base.py
Normal file
99
hermes_code/tools/environments/base.py
Normal file
|
|
@ -0,0 +1,99 @@
|
|||
"""Base class for all Hermes execution environment backends."""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
import os
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
from hermes_cli.config import get_hermes_home
|
||||
|
||||
|
||||
def get_sandbox_dir() -> Path:
    """Return the host-side root directory for all sandbox storage
    (Docker workspaces, Singularity overlays/SIF cache, etc.).

    The location can be overridden via the TERMINAL_SANDBOX_DIR environment
    variable; otherwise it defaults to {HERMES_HOME}/sandboxes/. The
    directory is created on first use if it does not already exist.
    """
    override = os.getenv("TERMINAL_SANDBOX_DIR")
    root = Path(override) if override else get_hermes_home() / "sandboxes"
    root.mkdir(parents=True, exist_ok=True)
    return root
|
||||
|
||||
|
||||
class BaseEnvironment(ABC):
|
||||
"""Common interface for all Hermes execution backends.
|
||||
|
||||
Subclasses implement execute() and cleanup(). Shared helpers eliminate
|
||||
duplicated subprocess boilerplate across backends.
|
||||
"""
|
||||
|
||||
def __init__(self, cwd: str, timeout: int, env: dict = None):
|
||||
self.cwd = cwd
|
||||
self.timeout = timeout
|
||||
self.env = env or {}
|
||||
|
||||
@abstractmethod
|
||||
def execute(self, command: str, cwd: str = "", *,
|
||||
timeout: int | None = None,
|
||||
stdin_data: str | None = None) -> dict:
|
||||
"""Execute a command, return {"output": str, "returncode": int}."""
|
||||
...
|
||||
|
||||
@abstractmethod
|
||||
def cleanup(self):
|
||||
"""Release backend resources (container, instance, connection)."""
|
||||
...
|
||||
|
||||
def stop(self):
|
||||
"""Alias for cleanup (compat with older callers)."""
|
||||
self.cleanup()
|
||||
|
||||
def __del__(self):
|
||||
try:
|
||||
self.cleanup()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Shared helpers (eliminate duplication across backends)
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _prepare_command(self, command: str) -> tuple[str, str | None]:
|
||||
"""Transform sudo commands if SUDO_PASSWORD is available.
|
||||
|
||||
Returns:
|
||||
(transformed_command, sudo_stdin) — see _transform_sudo_command
|
||||
for the full contract. Callers that drive a subprocess directly
|
||||
should prepend sudo_stdin (when not None) to any stdin_data they
|
||||
pass to Popen. Callers that embed stdin via heredoc (modal,
|
||||
daytona) handle sudo_stdin in their own execute() method.
|
||||
"""
|
||||
from tools.terminal_tool import _transform_sudo_command
|
||||
return _transform_sudo_command(command)
|
||||
|
||||
def _build_run_kwargs(self, timeout: int | None,
|
||||
stdin_data: str | None = None) -> dict:
|
||||
"""Build common subprocess.run kwargs for non-interactive execution."""
|
||||
kw = {
|
||||
"text": True,
|
||||
"timeout": timeout or self.timeout,
|
||||
"encoding": "utf-8",
|
||||
"errors": "replace",
|
||||
"stdout": subprocess.PIPE,
|
||||
"stderr": subprocess.STDOUT,
|
||||
}
|
||||
if stdin_data is not None:
|
||||
kw["input"] = stdin_data
|
||||
else:
|
||||
kw["stdin"] = subprocess.DEVNULL
|
||||
return kw
|
||||
|
||||
def _timeout_result(self, timeout: int | None) -> dict:
|
||||
"""Standard return dict when a command times out."""
|
||||
return {
|
||||
"output": f"Command timed out after {timeout or self.timeout}s",
|
||||
"returncode": 124,
|
||||
}
|
||||
250
hermes_code/tools/environments/daytona.py
Normal file
250
hermes_code/tools/environments/daytona.py
Normal file
|
|
@ -0,0 +1,250 @@
|
|||
"""Daytona cloud execution environment.
|
||||
|
||||
Uses the Daytona Python SDK to run commands in cloud sandboxes.
|
||||
Supports persistent sandboxes: when enabled, sandboxes are stopped on cleanup
|
||||
and resumed on next creation, preserving the filesystem across sessions.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import time
|
||||
import math
|
||||
import shlex
|
||||
import threading
|
||||
import uuid
|
||||
import warnings
|
||||
from typing import Optional
|
||||
|
||||
from tools.environments.base import BaseEnvironment
|
||||
from tools.interrupt import is_interrupted
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DaytonaEnvironment(BaseEnvironment):
    """Daytona cloud sandbox execution backend.

    Uses stopped/started sandbox lifecycle for filesystem persistence
    instead of snapshots, making it faster and stateless on the host.
    """

    def __init__(
        self,
        image: str,
        cwd: str = "/home/daytona",
        timeout: int = 60,
        cpu: int = 1,
        memory: int = 5120,  # MB (hermes convention)
        disk: int = 10240,  # MB (Daytona platform max is 10GB)
        persistent_filesystem: bool = True,
        task_id: str = "default",
    ):
        """Create or resume the sandbox for ``task_id``.

        Args:
            image: Container image to create the sandbox from.
            cwd: Requested working directory; "~" or the default
                "/home/daytona" are re-resolved to the sandbox's real $HOME.
            timeout: Default per-command timeout in seconds.
            cpu: CPU count for the Daytona Resources spec.
            memory: Memory in MB; converted to whole GiB (rounded up).
            disk: Disk in MB; converted to GiB and capped at 10.
            persistent_filesystem: When True, cleanup() stops (rather than
                deletes) the sandbox so its filesystem survives sessions.
            task_id: Stable identifier used for sandbox name and labels.
        """
        self._requested_cwd = cwd
        super().__init__(cwd=cwd, timeout=timeout)

        # SDK imported lazily so the daytona package is only required when
        # this backend is actually selected.
        from daytona import (
            Daytona,
            CreateSandboxFromImageParams,
            DaytonaError,
            Resources,
            SandboxState,
        )

        self._persistent = persistent_filesystem
        self._task_id = task_id
        # Kept as an attribute so _ensure_sandbox_ready can compare states
        # without re-importing the SDK.
        self._SandboxState = SandboxState
        self._daytona = Daytona()
        self._sandbox = None
        # Serializes sandbox lifecycle operations (start/stop/refresh)
        # against the exec path.
        self._lock = threading.Lock()

        # Convert hermes MB convention to the GiB units the SDK expects.
        memory_gib = max(1, math.ceil(memory / 1024))
        disk_gib = max(1, math.ceil(disk / 1024))
        if disk_gib > 10:
            warnings.warn(
                f"Daytona: requested disk ({disk_gib}GB) exceeds platform limit (10GB). "
                f"Capping to 10GB. Set container_disk: 10240 in config to silence this.",
                stacklevel=2,
            )
            disk_gib = 10
        resources = Resources(cpu=cpu, memory=memory_gib, disk=disk_gib)

        labels = {"hermes_task_id": task_id}
        sandbox_name = f"hermes-{task_id}"

        # Try to resume an existing sandbox for this task
        if self._persistent:
            # 1. Try name-based lookup (new path)
            # NOTE(review): assumes the SDK raises DaytonaError when no
            # sandbox with this name exists — confirm against SDK docs.
            try:
                self._sandbox = self._daytona.get(sandbox_name)
                self._sandbox.start()
                logger.info("Daytona: resumed sandbox %s for task %s",
                            self._sandbox.id, task_id)
            except DaytonaError:
                self._sandbox = None
            except Exception as e:
                logger.warning("Daytona: failed to resume sandbox for task %s: %s",
                               task_id, e)
                self._sandbox = None

            # 2. Legacy fallback: find sandbox created before the naming migration
            if self._sandbox is None:
                try:
                    page = self._daytona.list(labels=labels, page=1, limit=1)
                    if page.items:
                        self._sandbox = page.items[0]
                        self._sandbox.start()
                        logger.info("Daytona: resumed legacy sandbox %s for task %s",
                                    self._sandbox.id, task_id)
                except Exception as e:
                    logger.debug("Daytona: no legacy sandbox found for task %s: %s",
                                 task_id, e)
                    self._sandbox = None

        # Create a fresh sandbox if we don't have one
        if self._sandbox is None:
            # auto_stop_interval=0 disables the platform's idle auto-stop;
            # lifecycle is managed explicitly by cleanup().
            self._sandbox = self._daytona.create(
                CreateSandboxFromImageParams(
                    image=image,
                    name=sandbox_name,
                    labels=labels,
                    auto_stop_interval=0,
                    resources=resources,
                )
            )
            logger.info("Daytona: created sandbox %s for task %s",
                        self._sandbox.id, task_id)

        # Resolve cwd: detect actual home dir inside the sandbox
        if self._requested_cwd in ("~", "/home/daytona"):
            try:
                home = self._sandbox.process.exec("echo $HOME").result.strip()
                if home:
                    self.cwd = home
            except Exception:
                pass  # leave cwd as-is; sandbox will use its own default
        logger.info("Daytona: resolved cwd to %s", self.cwd)

    def _ensure_sandbox_ready(self):
        """Restart sandbox if it was stopped (e.g., by a previous interrupt)."""
        self._sandbox.refresh_data()
        if self._sandbox.state in (self._SandboxState.STOPPED, self._SandboxState.ARCHIVED):
            self._sandbox.start()
            logger.info("Daytona: restarted sandbox %s", self._sandbox.id)

    def _exec_in_thread(self, exec_command: str, cwd: Optional[str], timeout: int) -> dict:
        """Run exec in a background thread with interrupt polling.

        The Daytona SDK's exec(timeout=...) parameter is unreliable (the
        server-side timeout is not enforced and the SDK has no client-side
        fallback), so we wrap the command with the shell ``timeout`` utility
        which reliably kills the process and returns exit code 124.

        Returns either {"output", "returncode"} on completion, or
        {"error": Exception} when the SDK call raised (the caller in
        execute() decides whether to retry).
        """
        # Wrap with shell `timeout` to enforce the deadline reliably.
        # Add a small buffer so the shell timeout fires before any SDK-level
        # timeout would, giving us a clean exit code 124.
        timed_command = f"timeout {timeout} sh -c {shlex.quote(exec_command)}"

        # Shared mutable holder: the worker thread writes exactly one of
        # "value" or "error" before exiting.
        result_holder: dict = {"value": None, "error": None}

        def _run():
            try:
                response = self._sandbox.process.exec(
                    timed_command, cwd=cwd,
                )
                result_holder["value"] = {
                    "output": response.result or "",
                    "returncode": response.exit_code,
                }
            except Exception as e:
                result_holder["error"] = e

        t = threading.Thread(target=_run, daemon=True)
        t.start()
        # Wait for timeout + generous buffer for network/SDK overhead
        deadline = time.monotonic() + timeout + 10
        while t.is_alive():
            # Poll in short slices so interrupts are noticed promptly.
            t.join(timeout=0.2)
            if is_interrupted():
                # Stopping the sandbox is the only way to kill the remote
                # process; _ensure_sandbox_ready restarts it on next call.
                with self._lock:
                    try:
                        self._sandbox.stop()
                    except Exception:
                        pass
                return {
                    "output": "[Command interrupted - Daytona sandbox stopped]",
                    "returncode": 130,
                }
            if time.monotonic() > deadline:
                # Shell timeout didn't fire and SDK is hung — force stop
                with self._lock:
                    try:
                        self._sandbox.stop()
                    except Exception:
                        pass
                return self._timeout_result(timeout)

        if result_holder["error"]:
            return {"error": result_holder["error"]}
        return result_holder["value"]

    def execute(self, command: str, cwd: str = "", *,
                timeout: Optional[int] = None,
                stdin_data: Optional[str] = None) -> dict:
        """Run a shell command in the sandbox; retry once on SDK errors.

        Returns {"output": str, "returncode": int}.
        """
        with self._lock:
            self._ensure_sandbox_ready()

        # The SDK cannot pipe stdin, so stdin_data is embedded as a heredoc
        # with a collision-proof end marker.
        if stdin_data is not None:
            marker = f"HERMES_EOF_{uuid.uuid4().hex[:8]}"
            while marker in stdin_data:
                marker = f"HERMES_EOF_{uuid.uuid4().hex[:8]}"
            command = f"{command} << '{marker}'\n{stdin_data}\n{marker}"

        exec_command, sudo_stdin = self._prepare_command(command)

        # Daytona sandboxes execute commands via the Daytona SDK and cannot
        # pipe subprocess stdin directly the way a local Popen can. When a
        # sudo password is present, use a shell-level pipe from printf so that
        # the password feeds sudo -S without appearing as an echo argument
        # embedded in the shell string. The password is still visible in the
        # remote sandbox's command line, but it is not exposed on the user's
        # local machine — which is the primary threat being mitigated.
        if sudo_stdin is not None:
            import shlex
            exec_command = (
                f"printf '%s\\n' {shlex.quote(sudo_stdin.rstrip())} | {exec_command}"
            )
        effective_cwd = cwd or self.cwd or None
        effective_timeout = timeout or self.timeout

        result = self._exec_in_thread(exec_command, effective_cwd, effective_timeout)

        if "error" in result:
            from daytona import DaytonaError
            err = result["error"]
            if isinstance(err, DaytonaError):
                # SDK-level errors often mean the sandbox went away; try to
                # revive it and run the command exactly once more.
                with self._lock:
                    try:
                        self._ensure_sandbox_ready()
                    except Exception:
                        return {"output": f"Daytona execution error: {err}", "returncode": 1}
                result = self._exec_in_thread(exec_command, effective_cwd, effective_timeout)
                if "error" not in result:
                    return result
            return {"output": f"Daytona execution error: {err}", "returncode": 1}

        return result

    def cleanup(self):
        """Stop (persistent) or delete (ephemeral) the sandbox, then drop the handle."""
        with self._lock:
            if self._sandbox is None:
                return
            try:
                if self._persistent:
                    self._sandbox.stop()
                    logger.info("Daytona: stopped sandbox %s (filesystem preserved)",
                                self._sandbox.id)
                else:
                    self._daytona.delete(self._sandbox)
                    logger.info("Daytona: deleted sandbox %s", self._sandbox.id)
            except Exception as e:
                logger.warning("Daytona: cleanup failed: %s", e)
            self._sandbox = None
|
||||
494
hermes_code/tools/environments/docker.py
Normal file
494
hermes_code/tools/environments/docker.py
Normal file
|
|
@ -0,0 +1,494 @@
|
|||
"""Docker execution environment for sandboxed command execution.
|
||||
|
||||
Security hardened (cap-drop ALL, no-new-privileges, PID limits),
|
||||
configurable resource limits (CPU, memory, disk), and optional filesystem
|
||||
persistence via bind mounts.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
import uuid
|
||||
from typing import Optional
|
||||
|
||||
from tools.environments.base import BaseEnvironment
|
||||
from tools.interrupt import is_interrupted
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Common Docker Desktop install paths checked when 'docker' is not in PATH.
|
||||
# macOS Intel: /usr/local/bin, macOS Apple Silicon (Homebrew): /opt/homebrew/bin,
|
||||
# Docker Desktop app bundle: /Applications/Docker.app/Contents/Resources/bin
|
||||
_DOCKER_SEARCH_PATHS = [
|
||||
"/usr/local/bin/docker",
|
||||
"/opt/homebrew/bin/docker",
|
||||
"/Applications/Docker.app/Contents/Resources/bin/docker",
|
||||
]
|
||||
|
||||
_docker_executable: Optional[str] = None # resolved once, cached
|
||||
_ENV_VAR_NAME_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$")
|
||||
|
||||
|
||||
def _normalize_forward_env_names(forward_env: list[str] | None) -> list[str]:
    """Return a deduplicated list of valid environment variable names.

    Non-string entries and names that fail the shell-identifier pattern are
    dropped with a warning; blanks and duplicates are dropped silently.
    First-occurrence order is preserved.
    """
    accepted: list[str] = []
    known: set[str] = set()

    for entry in forward_env or []:
        if not isinstance(entry, str):
            logger.warning("Ignoring non-string docker_forward_env entry: %r", entry)
            continue

        name = entry.strip()
        # Invalid names never enter `known`, so checking duplicates first
        # preserves the original warn-per-occurrence behavior for them.
        if not name or name in known:
            continue
        if not _ENV_VAR_NAME_RE.match(name):
            logger.warning("Ignoring invalid docker_forward_env entry: %r", entry)
            continue

        known.add(name)
        accepted.append(name)

    return accepted
|
||||
|
||||
|
||||
def _load_hermes_env_vars() -> dict[str, str]:
|
||||
"""Load ~/.hermes/.env values without failing Docker command execution."""
|
||||
try:
|
||||
from hermes_cli.config import load_env
|
||||
|
||||
return load_env() or {}
|
||||
except Exception:
|
||||
return {}
|
||||
|
||||
|
||||
def find_docker() -> Optional[str]:
    """Locate the docker CLI binary.

    Checks ``shutil.which`` first (respects PATH), then probes well-known
    install locations on macOS where Docker Desktop may not be in PATH
    (e.g. when running as a gateway service via launchd).

    Returns the absolute path, or ``None`` if docker cannot be found.
    """
    global _docker_executable

    # Serve the cached result from a previous successful lookup.
    if _docker_executable is not None:
        return _docker_executable

    candidate = shutil.which("docker")
    if candidate:
        _docker_executable = candidate
        return candidate

    # PATH lookup failed — probe the well-known install locations directly.
    for candidate in _DOCKER_SEARCH_PATHS:
        if os.path.isfile(candidate) and os.access(candidate, os.X_OK):
            _docker_executable = candidate
            logger.info("Found docker at non-PATH location: %s", candidate)
            return candidate

    return None
|
||||
|
||||
|
||||
# Security flags applied to every container.
|
||||
# The container itself is the security boundary (isolated from host).
|
||||
# We drop all capabilities then add back the minimum needed:
|
||||
# DAC_OVERRIDE - root can write to bind-mounted dirs owned by host user
|
||||
# CHOWN/FOWNER - package managers (pip, npm, apt) need to set file ownership
|
||||
# Block privilege escalation and limit PIDs.
|
||||
# /tmp is size-limited and nosuid but allows exec (needed by pip/npm builds).
|
||||
_SECURITY_ARGS = [
|
||||
"--cap-drop", "ALL",
|
||||
"--cap-add", "DAC_OVERRIDE",
|
||||
"--cap-add", "CHOWN",
|
||||
"--cap-add", "FOWNER",
|
||||
"--security-opt", "no-new-privileges",
|
||||
"--pids-limit", "256",
|
||||
"--tmpfs", "/tmp:rw,nosuid,size=512m",
|
||||
"--tmpfs", "/var/tmp:rw,noexec,nosuid,size=256m",
|
||||
"--tmpfs", "/run:rw,noexec,nosuid,size=64m",
|
||||
]
|
||||
|
||||
|
||||
_storage_opt_ok: Optional[bool] = None # cached result across instances
|
||||
|
||||
|
||||
def _ensure_docker_available() -> None:
    """Best-effort check that the docker CLI is available before use.

    Reuses ``find_docker()`` so this preflight stays consistent with the rest of
    the Docker backend, including known non-PATH Docker Desktop locations.

    Raises:
        RuntimeError: when the CLI is missing, cannot be executed, the daemon
            does not respond, or ``docker version`` exits non-zero.
    """
    docker_exe = find_docker()
    if not docker_exe:
        logger.error(
            "Docker backend selected but no docker executable was found in PATH "
            "or known install locations. Install Docker Desktop and ensure the "
            "CLI is available."
        )
        raise RuntimeError(
            "Docker executable not found in PATH or known install locations. "
            "Install Docker and ensure the 'docker' command is available."
        )

    try:
        proc = subprocess.run(
            [docker_exe, "version"],
            capture_output=True,
            text=True,
            timeout=5,
        )
    except FileNotFoundError:
        logger.error(
            "Docker backend selected but the resolved docker executable '%s' could "
            "not be executed.",
            docker_exe,
            exc_info=True,
        )
        raise RuntimeError(
            "Docker executable could not be executed. Check your Docker installation."
        )
    except subprocess.TimeoutExpired:
        logger.error(
            "Docker backend selected but '%s version' timed out. "
            "The Docker daemon may not be running.",
            docker_exe,
            exc_info=True,
        )
        raise RuntimeError(
            "Docker daemon is not responding. Ensure Docker is running and try again."
        )
    except Exception:
        logger.error(
            "Unexpected error while checking Docker availability.",
            exc_info=True,
        )
        raise

    # Every exception path above raises, so reaching this point means the CLI
    # invocation completed; now verify it actually succeeded.
    if proc.returncode != 0:
        logger.error(
            "Docker backend selected but '%s version' failed "
            "(exit code %d, stderr=%s)",
            docker_exe,
            proc.returncode,
            proc.stderr.strip(),
        )
        raise RuntimeError(
            "Docker command is available but 'docker version' failed. "
            "Check your Docker installation."
        )
|
||||
|
||||
|
||||
class DockerEnvironment(BaseEnvironment):
    """Hardened Docker container execution with resource limits and persistence.

    Security: all capabilities dropped, no privilege escalation, PID limits,
    size-limited tmpfs for scratch dirs. The container itself is the security
    boundary — the filesystem inside is writable so agents can install packages
    (pip, npm, apt) as needed. Writable workspace via tmpfs or bind mounts.

    Persistence: when enabled, bind mounts preserve /workspace and /root
    across container restarts.
    """

    def __init__(
        self,
        image: str,
        cwd: str = "/root",
        timeout: int = 60,
        cpu: float = 0,
        memory: int = 0,
        disk: int = 0,
        persistent_filesystem: bool = False,
        task_id: str = "default",
        volumes: list = None,
        forward_env: list[str] | None = None,
        network: bool = True,
        host_cwd: str = None,
        auto_mount_cwd: bool = False,
    ):
        """Start a hardened container via ``docker run -d ... sleep 2h``.

        Args:
            image: Image to run.
            cwd: In-container default working directory ("~" maps to /root).
            timeout: Default per-command timeout in seconds.
            cpu/memory/disk: Resource limits; 0 means unlimited. Disk quotas
                need overlay2-on-XFS and are skipped on macOS.
            persistent_filesystem: Bind-mount /root (and /workspace) from the
                host sandbox dir instead of ephemeral tmpfs.
            task_id: Names the per-task persistent sandbox directory.
            volumes: Extra "host:container" mount strings from config.
            forward_env: Env var names forwarded into each `docker exec`.
            network: When False, run with --network=none.
            host_cwd / auto_mount_cwd: Optionally bind the host cwd to
                /workspace when no explicit /workspace mount exists.

        Raises:
            RuntimeError: if the docker CLI/daemon is unavailable.
            subprocess.CalledProcessError: if `docker run` fails (check=True).
        """
        if cwd == "~":
            cwd = "/root"
        super().__init__(cwd=cwd, timeout=timeout)
        self._base_image = image
        self._persistent = persistent_filesystem
        self._task_id = task_id
        self._forward_env = _normalize_forward_env_names(forward_env)
        self._container_id: Optional[str] = None
        logger.info(f"DockerEnvironment volumes: {volumes}")
        # Ensure volumes is a list (config.yaml could be malformed)
        if volumes is not None and not isinstance(volumes, list):
            logger.warning(f"docker_volumes config is not a list: {volumes!r}")
            volumes = []

        # Fail fast if Docker is not available.
        _ensure_docker_available()

        # Build resource limit args
        resource_args = []
        if cpu > 0:
            resource_args.extend(["--cpus", str(cpu)])
        if memory > 0:
            resource_args.extend(["--memory", f"{memory}m"])
        if disk > 0 and sys.platform != "darwin":
            if self._storage_opt_supported():
                resource_args.extend(["--storage-opt", f"size={disk}m"])
            else:
                logger.warning(
                    "Docker storage driver does not support per-container disk limits "
                    "(requires overlay2 on XFS with pquota). Container will run without disk quota."
                )
        if not network:
            resource_args.append("--network=none")

        # Persistent workspace via bind mounts from a configurable host directory
        # (TERMINAL_SANDBOX_DIR, default ~/.hermes/sandboxes/). Non-persistent
        # mode uses tmpfs (ephemeral, fast, gone on cleanup).
        from tools.environments.base import get_sandbox_dir

        # User-configured volume mounts (from config.yaml docker_volumes)
        volume_args = []
        workspace_explicitly_mounted = False
        for vol in (volumes or []):
            if not isinstance(vol, str):
                logger.warning(f"Docker volume entry is not a string: {vol!r}")
                continue
            vol = vol.strip()
            if not vol:
                continue
            if ":" in vol:
                volume_args.extend(["-v", vol])
                # An explicit /workspace mount suppresses both the auto cwd
                # mount and the default workspace tmpfs/bind below.
                if ":/workspace" in vol:
                    workspace_explicitly_mounted = True
            else:
                logger.warning(f"Docker volume '{vol}' missing colon, skipping")

        host_cwd_abs = os.path.abspath(os.path.expanduser(host_cwd)) if host_cwd else ""
        bind_host_cwd = (
            auto_mount_cwd
            and bool(host_cwd_abs)
            and os.path.isdir(host_cwd_abs)
            and not workspace_explicitly_mounted
        )
        if auto_mount_cwd and host_cwd and not os.path.isdir(host_cwd_abs):
            logger.debug(f"Skipping docker cwd mount: host_cwd is not a valid directory: {host_cwd}")

        # Host-side dirs backing the bind mounts; stay None in tmpfs mode so
        # cleanup() knows there is nothing to delete.
        self._workspace_dir: Optional[str] = None
        self._home_dir: Optional[str] = None
        writable_args = []
        if self._persistent:
            sandbox = get_sandbox_dir() / "docker" / task_id
            self._home_dir = str(sandbox / "home")
            os.makedirs(self._home_dir, exist_ok=True)
            writable_args.extend([
                "-v", f"{self._home_dir}:/root",
            ])
            if not bind_host_cwd and not workspace_explicitly_mounted:
                self._workspace_dir = str(sandbox / "workspace")
                os.makedirs(self._workspace_dir, exist_ok=True)
                writable_args.extend([
                    "-v", f"{self._workspace_dir}:/workspace",
                ])
        else:
            if not bind_host_cwd and not workspace_explicitly_mounted:
                writable_args.extend([
                    "--tmpfs", "/workspace:rw,exec,size=10g",
                ])
            writable_args.extend([
                "--tmpfs", "/home:rw,exec,size=1g",
                "--tmpfs", "/root:rw,exec,size=1g",
            ])

        if bind_host_cwd:
            logger.info(f"Mounting configured host cwd to /workspace: {host_cwd_abs}")
            # Prepend so user-configured mounts can still shadow sub-paths.
            volume_args = ["-v", f"{host_cwd_abs}:/workspace", *volume_args]
        elif workspace_explicitly_mounted:
            logger.debug("Skipping docker cwd mount: /workspace already mounted by user config")

        logger.info(f"Docker volume_args: {volume_args}")
        all_run_args = list(_SECURITY_ARGS) + writable_args + resource_args + volume_args
        logger.info(f"Docker run_args: {all_run_args}")

        # Resolve the docker executable once so it works even when
        # /usr/local/bin is not in PATH (common on macOS gateway/service).
        self._docker_exe = find_docker() or "docker"

        # Start the container directly via `docker run -d`.
        container_name = f"hermes-{uuid.uuid4().hex[:8]}"
        run_cmd = [
            self._docker_exe, "run", "-d",
            "--name", container_name,
            "-w", cwd,
            *all_run_args,
            image,
            "sleep", "2h",
        ]
        logger.debug(f"Starting container: {' '.join(run_cmd)}")
        result = subprocess.run(
            run_cmd,
            capture_output=True,
            text=True,
            timeout=120,  # image pull may take a while
            check=True,
        )
        self._container_id = result.stdout.strip()
        logger.info(f"Started container {container_name} ({self._container_id[:12]})")

    @staticmethod
    def _storage_opt_supported() -> bool:
        """Check if Docker's storage driver supports --storage-opt size=.

        Only overlay2 on XFS with pquota supports per-container disk quotas.
        Ubuntu (and most distros) default to ext4, where this flag errors out.

        The (module-level) cached result is shared across all instances.
        """
        global _storage_opt_ok
        if _storage_opt_ok is not None:
            return _storage_opt_ok
        try:
            docker = find_docker() or "docker"
            result = subprocess.run(
                [docker, "info", "--format", "{{.Driver}}"],
                capture_output=True, text=True, timeout=10,
            )
            driver = result.stdout.strip().lower()
            if driver != "overlay2":
                _storage_opt_ok = False
                return False
            # overlay2 only supports storage-opt on XFS with pquota.
            # Probe by attempting a dry-ish run — the fastest reliable check.
            probe = subprocess.run(
                [docker, "create", "--storage-opt", "size=1m", "hello-world"],
                capture_output=True, text=True, timeout=15,
            )
            if probe.returncode == 0:
                # Clean up the created container
                container_id = probe.stdout.strip()
                if container_id:
                    subprocess.run([docker, "rm", container_id],
                                   capture_output=True, timeout=5)
                _storage_opt_ok = True
            else:
                _storage_opt_ok = False
        except Exception:
            _storage_opt_ok = False
        logger.debug("Docker --storage-opt support: %s", _storage_opt_ok)
        return _storage_opt_ok

    def execute(self, command: str, cwd: str = "", *,
                timeout: int | None = None,
                stdin_data: str | None = None) -> dict:
        """Run a command via `docker exec`, returning {"output", "returncode"}.

        Supports interrupt (exit 130) and timeout (exit 124) while streaming
        combined stdout/stderr through a background drain thread.
        """
        exec_command, sudo_stdin = self._prepare_command(command)
        work_dir = cwd or self.cwd
        effective_timeout = timeout or self.timeout

        # Merge sudo password (if any) with caller-supplied stdin_data.
        if sudo_stdin is not None and stdin_data is not None:
            effective_stdin = sudo_stdin + stdin_data
        elif sudo_stdin is not None:
            effective_stdin = sudo_stdin
        else:
            effective_stdin = stdin_data

        # docker exec -w doesn't expand ~, so prepend a cd into the command
        if work_dir == "~" or work_dir.startswith("~/"):
            exec_command = f"cd {work_dir} && {exec_command}"
            work_dir = "/"

        assert self._container_id, "Container not started"
        cmd = [self._docker_exe, "exec"]
        if effective_stdin is not None:
            cmd.append("-i")
        cmd.extend(["-w", work_dir])
        # Forwarded env vars: process env wins, ~/.hermes/.env is fallback.
        hermes_env = _load_hermes_env_vars() if self._forward_env else {}
        for key in self._forward_env:
            value = os.getenv(key)
            if value is None:
                value = hermes_env.get(key)
            if value is not None:
                cmd.extend(["-e", f"{key}={value}"])
        cmd.extend([self._container_id, "bash", "-lc", exec_command])

        try:
            _output_chunks = []
            proc = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                stdin=subprocess.PIPE if effective_stdin else subprocess.DEVNULL,
                text=True,
            )
            if effective_stdin:
                try:
                    proc.stdin.write(effective_stdin)
                    proc.stdin.close()
                except Exception:
                    pass

            # Drain stdout on a daemon thread so the poll loop below can
            # watch for interrupts/timeouts without blocking on reads.
            def _drain():
                try:
                    for line in proc.stdout:
                        _output_chunks.append(line)
                except Exception:
                    pass

            reader = threading.Thread(target=_drain, daemon=True)
            reader.start()
            deadline = time.monotonic() + effective_timeout

            while proc.poll() is None:
                if is_interrupted():
                    # Graceful terminate first, hard kill after 1s.
                    proc.terminate()
                    try:
                        proc.wait(timeout=1)
                    except subprocess.TimeoutExpired:
                        proc.kill()
                    reader.join(timeout=2)
                    return {
                        "output": "".join(_output_chunks) + "\n[Command interrupted]",
                        "returncode": 130,
                    }
                if time.monotonic() > deadline:
                    proc.kill()
                    reader.join(timeout=2)
                    return self._timeout_result(effective_timeout)
                time.sleep(0.2)

            reader.join(timeout=5)
            return {"output": "".join(_output_chunks), "returncode": proc.returncode}
        except Exception as e:
            return {"output": f"Docker execution error: {e}", "returncode": 1}

    def cleanup(self):
        """Stop and remove the container. Bind-mount dirs persist if persistent=True."""
        if self._container_id:
            try:
                # Stop in background so cleanup doesn't block
                stop_cmd = (
                    f"(timeout 60 {self._docker_exe} stop {self._container_id} || "
                    f"{self._docker_exe} rm -f {self._container_id}) >/dev/null 2>&1 &"
                )
                subprocess.Popen(stop_cmd, shell=True)
            except Exception as e:
                logger.warning("Failed to stop container %s: %s", self._container_id, e)

            if not self._persistent:
                # Also schedule removal (stop only leaves it as stopped)
                try:
                    subprocess.Popen(
                        f"sleep 3 && {self._docker_exe} rm -f {self._container_id} >/dev/null 2>&1 &",
                        shell=True,
                    )
                except Exception:
                    pass
            self._container_id = None

        if not self._persistent:
            # NOTE(review): _workspace_dir/_home_dir are only assigned in the
            # persistent branch of __init__, so this loop appears to be a
            # no-op in non-persistent mode — confirm intent.
            for d in (self._workspace_dir, self._home_dir):
                if d:
                    shutil.rmtree(d, ignore_errors=True)
|
||||
476
hermes_code/tools/environments/local.py
Normal file
476
hermes_code/tools/environments/local.py
Normal file
|
|
@ -0,0 +1,476 @@
|
|||
"""Local execution environment with interrupt support and non-blocking I/O."""
|
||||
|
||||
import glob
|
||||
import os
|
||||
import platform
|
||||
import shutil
|
||||
import signal
|
||||
import subprocess
|
||||
import threading
|
||||
import time
|
||||
|
||||
_IS_WINDOWS = platform.system() == "Windows"
|
||||
|
||||
from tools.environments.base import BaseEnvironment
|
||||
from tools.environments.persistent_shell import PersistentShellMixin
|
||||
from tools.interrupt import is_interrupted
|
||||
|
||||
# Unique marker to isolate real command output from shell init/exit noise.
|
||||
# printf (no trailing newline) keeps the boundaries clean for splitting.
|
||||
_OUTPUT_FENCE = "__HERMES_FENCE_a9f7b3__"
|
||||
|
||||
# Hermes-internal env vars that should NOT leak into terminal subprocesses.
|
||||
# These are loaded from ~/.hermes/.env for Hermes' own LLM/provider calls
|
||||
# but can break external CLIs (e.g. codex) that also honor them.
|
||||
# See: https://github.com/NousResearch/hermes-agent/issues/1002
|
||||
#
|
||||
# Built dynamically from the provider registry so new providers are
|
||||
# automatically covered without manual blocklist maintenance.
|
||||
_HERMES_PROVIDER_ENV_FORCE_PREFIX = "_HERMES_FORCE_"
|
||||
|
||||
|
||||
def _build_provider_env_blocklist() -> frozenset:
    """Derive the blocklist from provider, tool, and gateway config.

    Automatically picks up api_key_env_vars and base_url_env_var from
    every registered provider, plus tool/messaging env vars from the
    optional config registry, so new Hermes-managed secrets are blocked
    in subprocesses without having to maintain multiple static lists.

    Returns:
        frozenset of environment-variable names that should be stripped
        from subprocess environments.
    """
    blocked: set[str] = set()

    # Provider registry: every provider's API-key vars and base-URL override.
    # ImportError is tolerated so this module stays usable standalone.
    try:
        from hermes_cli.auth import PROVIDER_REGISTRY
        for pconfig in PROVIDER_REGISTRY.values():
            blocked.update(pconfig.api_key_env_vars)
            if pconfig.base_url_env_var:
                blocked.add(pconfig.base_url_env_var)
    except ImportError:
        pass

    # Optional config registry: block tool/messaging vars outright, and
    # "setting" vars only when flagged as passwords.
    try:
        from hermes_cli.config import OPTIONAL_ENV_VARS
        for name, metadata in OPTIONAL_ENV_VARS.items():
            category = metadata.get("category")
            if category in {"tool", "messaging"}:
                blocked.add(name)
            elif category == "setting" and metadata.get("password"):
                blocked.add(name)
    except ImportError:
        pass

    # Vars not covered above but still Hermes-internal / conflict-prone.
    blocked.update({
        "OPENAI_BASE_URL",
        "OPENAI_API_KEY",
        "OPENAI_API_BASE",  # legacy alias
        "OPENAI_ORG_ID",
        "OPENAI_ORGANIZATION",
        "OPENROUTER_API_KEY",
        "ANTHROPIC_BASE_URL",
        "ANTHROPIC_TOKEN",  # OAuth token (not in registry as env var)
        "CLAUDE_CODE_OAUTH_TOKEN",
        "LLM_MODEL",
        # Expanded isolation for other major providers (Issue #1002)
        "GOOGLE_API_KEY",  # Gemini / Google AI Studio
        "DEEPSEEK_API_KEY",  # DeepSeek
        "MISTRAL_API_KEY",  # Mistral AI
        "GROQ_API_KEY",  # Groq
        "TOGETHER_API_KEY",  # Together AI
        "PERPLEXITY_API_KEY",  # Perplexity
        "COHERE_API_KEY",  # Cohere
        "FIREWORKS_API_KEY",  # Fireworks AI
        "XAI_API_KEY",  # xAI (Grok)
        "HELICONE_API_KEY",  # LLM Observability proxy
        "PARALLEL_API_KEY",
        "FIRECRAWL_API_KEY",
        "FIRECRAWL_API_URL",
        # Gateway/runtime config not represented in OPTIONAL_ENV_VARS.
        "TELEGRAM_HOME_CHANNEL",
        "TELEGRAM_HOME_CHANNEL_NAME",
        "DISCORD_HOME_CHANNEL",
        "DISCORD_HOME_CHANNEL_NAME",
        "DISCORD_REQUIRE_MENTION",
        "DISCORD_FREE_RESPONSE_CHANNELS",
        "DISCORD_AUTO_THREAD",
        "SLACK_HOME_CHANNEL",
        "SLACK_HOME_CHANNEL_NAME",
        "SLACK_ALLOWED_USERS",
        "WHATSAPP_ENABLED",
        "WHATSAPP_MODE",
        "WHATSAPP_ALLOWED_USERS",
        "SIGNAL_HTTP_URL",
        "SIGNAL_ACCOUNT",
        "SIGNAL_ALLOWED_USERS",
        "SIGNAL_GROUP_ALLOWED_USERS",
        "SIGNAL_HOME_CHANNEL",
        "SIGNAL_HOME_CHANNEL_NAME",
        "SIGNAL_IGNORE_STORIES",
        "HASS_TOKEN",
        "HASS_URL",
        "EMAIL_ADDRESS",
        "EMAIL_PASSWORD",
        "EMAIL_IMAP_HOST",
        "EMAIL_SMTP_HOST",
        "EMAIL_HOME_ADDRESS",
        "EMAIL_HOME_ADDRESS_NAME",
        "GATEWAY_ALLOWED_USERS",
        # Skills Hub / GitHub app auth paths and aliases.
        "GH_TOKEN",
        "GITHUB_APP_ID",
        "GITHUB_APP_PRIVATE_KEY_PATH",
        "GITHUB_APP_INSTALLATION_ID",
        # Remote sandbox backend credentials.
        "MODAL_TOKEN_ID",
        "MODAL_TOKEN_SECRET",
        "DAYTONA_API_KEY",
    })
    return frozenset(blocked)
|
||||
|
||||
|
||||
_HERMES_PROVIDER_ENV_BLOCKLIST = _build_provider_env_blocklist()
|
||||
|
||||
|
||||
def _sanitize_subprocess_env(base_env: dict | None, extra_env: dict | None = None) -> dict:
    """Filter Hermes-managed secrets out of a subprocess environment.

    A ``_HERMES_FORCE_<VAR>`` entry in ``extra_env`` deliberately opts the
    blocked variable ``<VAR>`` back in.  Variables registered through
    :mod:`tools.env_passthrough` (skill-declared or user-configured) bypass
    the blocklist as well.  FORCE-prefixed keys in ``base_env`` are dropped.
    """
    try:
        from tools.env_passthrough import is_env_passthrough as _is_passthrough
    except Exception:
        _is_passthrough = lambda _: False  # noqa: E731

    def _permitted(name: str) -> bool:
        # Allowed when not blocklisted, or explicitly registered passthrough.
        return name not in _HERMES_PROVIDER_ENV_BLOCKLIST or _is_passthrough(name)

    result: dict[str, str] = {}

    for name, value in (base_env or {}).items():
        if name.startswith(_HERMES_PROVIDER_ENV_FORCE_PREFIX):
            continue  # FORCE markers never flow through from the base env
        if _permitted(name):
            result[name] = value

    for name, value in (extra_env or {}).items():
        if name.startswith(_HERMES_PROVIDER_ENV_FORCE_PREFIX):
            # Intentional opt-in: strip the prefix and force the real var in.
            result[name[len(_HERMES_PROVIDER_ENV_FORCE_PREFIX):]] = value
        elif _permitted(name):
            result[name] = value

    return result
|
||||
|
||||
|
||||
def _find_bash() -> str:
    """Locate a bash executable for command execution.

    The fence wrapper relies on bash syntax (semicolons, $?, printf), so the
    user's $SHELL (fish/zsh/...) cannot be the first choice.
    On Windows: uses Git Bash (bundled with Git for Windows).
    """
    if not _IS_WINDOWS:
        # First truthy candidate wins; mirrors a simple `or` chain.
        for candidate in (
            shutil.which("bash"),
            "/usr/bin/bash" if os.path.isfile("/usr/bin/bash") else None,
            "/bin/bash" if os.path.isfile("/bin/bash") else None,
            os.environ.get("SHELL"),  # last resort: whatever they have
        ):
            if candidate:
                return candidate
        return "/bin/sh"

    # Windows: look for Git Bash (installed with Git for Windows).
    # Allow override via env var (same pattern as Claude Code).
    override = os.environ.get("HERMES_GIT_BASH_PATH")
    if override and os.path.isfile(override):
        return override

    # shutil.which finds bash.exe if Git\bin is on PATH
    located = shutil.which("bash")
    if located:
        return located

    # Check common Git for Windows install locations
    for candidate in (
        os.path.join(os.environ.get("ProgramFiles", r"C:\Program Files"), "Git", "bin", "bash.exe"),
        os.path.join(os.environ.get("ProgramFiles(x86)", r"C:\Program Files (x86)"), "Git", "bin", "bash.exe"),
        os.path.join(os.environ.get("LOCALAPPDATA", ""), "Programs", "Git", "bin", "bash.exe"),
    ):
        if candidate and os.path.isfile(candidate):
            return candidate

    raise RuntimeError(
        "Git Bash not found. Hermes Agent requires Git for Windows on Windows.\n"
        "Install it from: https://git-scm.com/download/win\n"
        "Or set HERMES_GIT_BASH_PATH to your bash.exe location."
    )
|
||||
|
||||
|
||||
# Backward compat — process_registry.py imports this name
|
||||
_find_shell = _find_bash
|
||||
|
||||
|
||||
# Noise lines emitted by interactive shells when stdin is not a terminal.
|
||||
# Used as a fallback when output fence markers are missing.
|
||||
_SHELL_NOISE_SUBSTRINGS = (
|
||||
# bash
|
||||
"bash: cannot set terminal process group",
|
||||
"bash: no job control in this shell",
|
||||
"no job control in this shell",
|
||||
"cannot set terminal process group",
|
||||
"tcsetattr: Inappropriate ioctl for device",
|
||||
# zsh / oh-my-zsh / macOS terminal session
|
||||
"Restored session:",
|
||||
"Saving session...",
|
||||
"Last login:",
|
||||
"command not found:",
|
||||
"Oh My Zsh",
|
||||
"compinit:",
|
||||
)
|
||||
|
||||
|
||||
def _clean_shell_noise(output: str) -> str:
    """Remove known shell startup/exit warnings from the edges of *output*.

    Only leading noise lines and trailing noise/empty lines are dropped;
    matching lines in the middle of the output are left untouched.
    """

    def _is_noise(line: str) -> bool:
        return any(pattern in line for pattern in _SHELL_NOISE_SUBSTRINGS)

    lines = output.split("\n")

    # Advance past leading noise.
    start = 0
    while start < len(lines) and _is_noise(lines[start]):
        start += 1

    # Retreat past trailing noise and the empty strings split() produces.
    stop = len(lines)
    while stop > start and (not lines[stop - 1] or _is_noise(lines[stop - 1])):
        stop -= 1

    if stop <= start:
        return ""

    result = "\n".join(lines[start:stop])

    # Preserve trailing newline if original had one
    if output.endswith("\n") and result and not result.endswith("\n"):
        result += "\n"
    return result
|
||||
|
||||
|
||||
# Standard PATH entries for environments with minimal PATH (e.g. systemd services).
|
||||
# Includes macOS Homebrew paths (/opt/homebrew/* for Apple Silicon).
|
||||
_SANE_PATH = (
|
||||
"/opt/homebrew/bin:/opt/homebrew/sbin:"
|
||||
"/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
|
||||
)
|
||||
|
||||
|
||||
def _make_run_env(env: dict) -> dict:
    """Build a run environment with a sane PATH and provider-var stripping.

    Merges *env* over ``os.environ``, removes blocklisted Hermes secrets
    (honoring FORCE-prefixed opt-ins and registered passthrough vars), and
    appends the standard PATH entries when /usr/bin is missing from PATH.
    """
    try:
        from tools.env_passthrough import is_env_passthrough as _is_passthrough
    except Exception:
        _is_passthrough = lambda _: False  # noqa: E731

    combined = dict(os.environ | env)
    run_env: dict = {}
    for name, value in combined.items():
        if name.startswith(_HERMES_PROVIDER_ENV_FORCE_PREFIX):
            # Explicit opt-in: expose the variable under its real name.
            run_env[name[len(_HERMES_PROVIDER_ENV_FORCE_PREFIX):]] = value
        elif name not in _HERMES_PROVIDER_ENV_BLOCKLIST or _is_passthrough(name):
            run_env[name] = value

    current_path = run_env.get("PATH", "")
    if "/usr/bin" not in current_path.split(":"):
        # Minimal PATH (e.g. systemd service): append the standard entries.
        run_env["PATH"] = f"{current_path}:{_SANE_PATH}" if current_path else _SANE_PATH
    return run_env
|
||||
|
||||
|
||||
def _extract_fenced_output(raw: str) -> str:
    """Return the real command output between the fence markers in *raw*.

    execute() brackets each command with printf(FENCE) markers; the text
    between the first and last fence is the command's own output, free of
    shell init/exit noise.  When no fence is present, fall back to the
    pattern-based _clean_shell_noise scrub.
    """
    first = raw.find(_OUTPUT_FENCE)
    if first < 0:
        return _clean_shell_noise(raw)

    begin = first + len(_OUTPUT_FENCE)
    last = raw.rfind(_OUTPUT_FENCE)

    if last <= first:
        # Closing fence never printed (e.g. user command called `exit`).
        return _clean_shell_noise(raw[begin:])

    return raw[begin:last]
|
||||
|
||||
|
||||
class LocalEnvironment(PersistentShellMixin, BaseEnvironment):
|
||||
"""Run commands directly on the host machine.
|
||||
|
||||
Features:
|
||||
- Popen + polling for interrupt support (user can cancel mid-command)
|
||||
- Background stdout drain thread to prevent pipe buffer deadlocks
|
||||
- stdin_data support for piping content (bypasses ARG_MAX limits)
|
||||
- sudo -S transform via SUDO_PASSWORD env var
|
||||
- Uses interactive login shell so full user env is available
|
||||
- Optional persistent shell mode (cwd/env vars survive across calls)
|
||||
"""
|
||||
|
||||
def __init__(self, cwd: str = "", timeout: int = 60, env: dict = None,
             persistent: bool = False):
    """Set up a host-local environment; spawn the shell if persistent."""
    super().__init__(cwd=cwd or os.getcwd(), timeout=timeout, env=env)
    self.persistent = persistent
    if persistent:
        # Long-lived shell: cwd/env vars survive across execute() calls.
        self._init_persistent_shell()
|
||||
|
||||
@property
def _temp_prefix(self) -> str:
    # Host-side prefix for this session's persistent-shell IPC files.
    return "/tmp/hermes-local-" + self._session_id
|
||||
|
||||
def _spawn_shell_process(self) -> subprocess.Popen:
    """Launch the long-lived login bash used by persistent mode."""
    # stderr is discarded: command stderr is captured via the IPC files,
    # and the shell writes its own results there rather than to stdout.
    return subprocess.Popen(
        [_find_bash(), "-l"],
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.DEVNULL,
        text=True,
        env=_make_run_env(self.env),
        # New session/process group so children can be targeted as a group.
        preexec_fn=None if _IS_WINDOWS else os.setsid,
    )
|
||||
|
||||
def _read_temp_files(self, *paths: str) -> list[str]:
    """Read each path in order; missing files yield an empty string."""
    contents: list[str] = []
    for path in paths:
        if not os.path.exists(path):
            contents.append("")
            continue
        with open(path) as handle:
            contents.append(handle.read())
    return contents
|
||||
|
||||
def _kill_shell_children(self):
    """Terminate the persistent shell's direct children via `pkill -P`."""
    pid = self._shell_pid
    if pid is None:
        return
    try:
        subprocess.run(
            ["pkill", "-P", str(pid)],
            capture_output=True,
            timeout=5,
        )
    except (subprocess.TimeoutExpired, FileNotFoundError):
        # pkill unavailable or hung — nothing more we can safely do.
        pass
|
||||
|
||||
def _cleanup_temp_files(self):
    """Delete every IPC temp file belonging to this session."""
    for path in glob.glob(f"{self._temp_prefix}-*"):
        if os.path.exists(path):
            os.remove(path)
|
||||
|
||||
def _execute_oneshot(self, command: str, cwd: str = "", *,
                     timeout: int | None = None,
                     stdin_data: str | None = None) -> dict:
    """Run one command in a fresh `bash -lic` subprocess.

    Returns {"output": combined stdout+stderr text, "returncode": int}.
    Interruption returns code 130; timeout delegates to
    self._timeout_result().
    """
    work_dir = cwd or self.cwd or os.getcwd()
    effective_timeout = timeout or self.timeout
    # _prepare_command may rewrite sudo usage and hand back a password
    # string that must be fed on stdin (sudo -S transform).
    exec_command, sudo_stdin = self._prepare_command(command)

    # Sudo password (if any) must reach stdin before the caller's data.
    if sudo_stdin is not None and stdin_data is not None:
        effective_stdin = sudo_stdin + stdin_data
    elif sudo_stdin is not None:
        effective_stdin = sudo_stdin
    else:
        effective_stdin = stdin_data

    user_shell = _find_bash()
    # Bracket the command with fence markers so interactive-shell init/exit
    # noise can be stripped later; the real exit code is preserved via $?.
    fenced_cmd = (
        f"printf '{_OUTPUT_FENCE}';"
        f" {exec_command};"
        f" __hermes_rc=$?;"
        f" printf '{_OUTPUT_FENCE}';"
        f" exit $__hermes_rc"
    )
    run_env = _make_run_env(self.env)

    proc = subprocess.Popen(
        # -lic: interactive login shell so the full user env is available.
        [user_shell, "-lic", fenced_cmd],
        text=True,
        cwd=work_dir,
        env=run_env,
        encoding="utf-8",
        errors="replace",
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        stdin=subprocess.PIPE if effective_stdin is not None else subprocess.DEVNULL,
        # Own process group (POSIX) so killpg can take down all children.
        preexec_fn=None if _IS_WINDOWS else os.setsid,
    )

    if effective_stdin is not None:
        # Feed stdin from a thread so a full pipe buffer can't deadlock us.
        def _write_stdin():
            try:
                proc.stdin.write(effective_stdin)
                proc.stdin.close()
            except (BrokenPipeError, OSError):
                pass
        threading.Thread(target=_write_stdin, daemon=True).start()

    _output_chunks: list[str] = []

    # Drain stdout in the background to prevent pipe-buffer deadlocks.
    def _drain_stdout():
        try:
            for line in proc.stdout:
                _output_chunks.append(line)
        except ValueError:
            # Stream closed underneath the iterator.
            pass
        finally:
            try:
                proc.stdout.close()
            except Exception:
                pass

    reader = threading.Thread(target=_drain_stdout, daemon=True)
    reader.start()
    deadline = time.monotonic() + effective_timeout

    # Poll loop: check for user interrupt and timeout every 200ms.
    while proc.poll() is None:
        if is_interrupted():
            try:
                if _IS_WINDOWS:
                    proc.terminate()
                else:
                    # SIGTERM the whole group; escalate to SIGKILL after 1s.
                    pgid = os.getpgid(proc.pid)
                    os.killpg(pgid, signal.SIGTERM)
                    try:
                        proc.wait(timeout=1.0)
                    except subprocess.TimeoutExpired:
                        os.killpg(pgid, signal.SIGKILL)
            except (ProcessLookupError, PermissionError):
                proc.kill()
            reader.join(timeout=2)
            return {
                "output": "".join(_output_chunks) + "\n[Command interrupted — user sent a new message]",
                "returncode": 130,
            }
        if time.monotonic() > deadline:
            try:
                if _IS_WINDOWS:
                    proc.terminate()
                else:
                    os.killpg(os.getpgid(proc.pid), signal.SIGTERM)
            except (ProcessLookupError, PermissionError):
                proc.kill()
            reader.join(timeout=2)
            return self._timeout_result(effective_timeout)
        time.sleep(0.2)

    reader.join(timeout=5)
    output = _extract_fenced_output("".join(_output_chunks))
    return {"output": output, "returncode": proc.returncode}
|
||||
259
hermes_code/tools/environments/modal.py
Normal file
259
hermes_code/tools/environments/modal.py
Normal file
|
|
@ -0,0 +1,259 @@
|
|||
"""Modal cloud execution environment using SWE-ReX directly.
|
||||
|
||||
Supports persistent filesystem snapshots: when enabled, the sandbox's filesystem
|
||||
is snapshotted on cleanup and restored on next creation, so installed packages,
|
||||
project files, and config changes survive across sessions.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import threading
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
from hermes_cli.config import get_hermes_home
|
||||
from tools.environments.base import BaseEnvironment
|
||||
from tools.interrupt import is_interrupted
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_SNAPSHOT_STORE = get_hermes_home() / "modal_snapshots.json"
|
||||
|
||||
|
||||
def _load_snapshots() -> Dict[str, str]:
    """Load the task-id -> snapshot-id mapping from disk."""
    if not _SNAPSHOT_STORE.exists():
        return {}
    try:
        return json.loads(_SNAPSHOT_STORE.read_text())
    except Exception:
        # Corrupt or unreadable store: behave as if no snapshots exist.
        return {}
|
||||
|
||||
|
||||
def _save_snapshots(data: Dict[str, str]) -> None:
    """Write the task-id -> snapshot-id mapping to disk."""
    store = _SNAPSHOT_STORE
    store.parent.mkdir(parents=True, exist_ok=True)
    store.write_text(json.dumps(data, indent=2))
|
||||
|
||||
|
||||
class _AsyncWorker:
|
||||
"""Background thread with its own event loop for async-safe swe-rex calls.
|
||||
|
||||
Allows sync code to submit async coroutines and block for results,
|
||||
even when called from inside another running event loop (e.g. Atropos).
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self._loop: Optional[asyncio.AbstractEventLoop] = None
|
||||
self._thread: Optional[threading.Thread] = None
|
||||
self._started = threading.Event()
|
||||
|
||||
def start(self):
|
||||
self._thread = threading.Thread(target=self._run_loop, daemon=True)
|
||||
self._thread.start()
|
||||
self._started.wait(timeout=30)
|
||||
|
||||
def _run_loop(self):
|
||||
self._loop = asyncio.new_event_loop()
|
||||
asyncio.set_event_loop(self._loop)
|
||||
self._started.set()
|
||||
self._loop.run_forever()
|
||||
|
||||
def run_coroutine(self, coro, timeout=600):
|
||||
if self._loop is None or self._loop.is_closed():
|
||||
raise RuntimeError("AsyncWorker loop is not running")
|
||||
future = asyncio.run_coroutine_threadsafe(coro, self._loop)
|
||||
return future.result(timeout=timeout)
|
||||
|
||||
def stop(self):
|
||||
if self._loop and self._loop.is_running():
|
||||
self._loop.call_soon_threadsafe(self._loop.stop)
|
||||
if self._thread:
|
||||
self._thread.join(timeout=10)
|
||||
|
||||
|
||||
class ModalEnvironment(BaseEnvironment):
    """Modal cloud execution via SWE-ReX.

    Uses swe-rex's ModalDeployment directly for sandbox management.
    Adds sudo -S support, configurable resources (CPU, memory, disk),
    and optional filesystem persistence via Modal's snapshot API.
    """

    def __init__(
        self,
        image: str,
        cwd: str = "/root",
        timeout: int = 60,
        modal_sandbox_kwargs: Optional[Dict[str, Any]] = None,
        persistent_filesystem: bool = True,
        task_id: str = "default",
    ):
        """Create (or restore) the sandbox.

        `task_id` keys the on-disk snapshot store, so distinct tasks get
        distinct persisted filesystems.
        """
        super().__init__(cwd=cwd, timeout=timeout)

        self._persistent = persistent_filesystem
        self._task_id = task_id
        self._base_image = image
        self._deployment = None
        self._worker = _AsyncWorker()

        sandbox_kwargs = dict(modal_sandbox_kwargs or {})

        # If persistent, try to restore from a previous snapshot
        restored_image = None
        if self._persistent:
            snapshot_id = _load_snapshots().get(self._task_id)
            if snapshot_id:
                try:
                    import modal
                    restored_image = modal.Image.from_id(snapshot_id)
                    logger.info("Modal: restoring from snapshot %s", snapshot_id[:20])
                except Exception as e:
                    logger.warning("Modal: failed to restore snapshot, using base image: %s", e)
                    restored_image = None

        effective_image = restored_image if restored_image else image

        # Pre-build a modal.Image with pip fix for Modal's legacy image builder.
        # Some task images have broken pip; fix via ensurepip before Modal uses it.
        # (Only applies to registry-string images; restored snapshot images
        # are already modal.Image objects and skip this wrap.)
        import modal as _modal
        if isinstance(effective_image, str):
            effective_image = _modal.Image.from_registry(
                effective_image,
                setup_dockerfile_commands=[
                    "RUN rm -rf /usr/local/lib/python*/site-packages/pip* 2>/dev/null; "
                    "python -m ensurepip --upgrade --default-pip 2>/dev/null || true",
                ],
            )

        # Start the async worker thread and create the deployment on it
        # so all gRPC channels are bound to the worker's event loop.
        self._worker.start()

        from swerex.deployment.modal import ModalDeployment

        async def _create_and_start():
            deployment = ModalDeployment(
                image=effective_image,
                startup_timeout=180.0,
                runtime_timeout=3600.0,
                deployment_timeout=3600.0,
                install_pipx=True,
                modal_sandbox_kwargs=sandbox_kwargs,
            )
            await deployment.start()
            return deployment

        self._deployment = self._worker.run_coroutine(_create_and_start())

    def execute(self, command: str, cwd: str = "", *,
                timeout: int | None = None,
                stdin_data: str | None = None) -> dict:
        """Run *command* in the sandbox; returns {"output", "returncode"}.

        stdin_data is delivered via a heredoc (a unique EOF marker is
        chosen that does not occur in the data) since the Modal SDK has
        no stdin pipe.
        """
        if stdin_data is not None:
            marker = f"HERMES_EOF_{uuid.uuid4().hex[:8]}"
            while marker in stdin_data:
                marker = f"HERMES_EOF_{uuid.uuid4().hex[:8]}"
            command = f"{command} << '{marker}'\n{stdin_data}\n{marker}"

        exec_command, sudo_stdin = self._prepare_command(command)

        # Modal sandboxes execute commands via the Modal SDK and cannot pipe
        # subprocess stdin directly the way a local Popen can. When a sudo
        # password is present, use a shell-level pipe from printf so that the
        # password feeds sudo -S without appearing as an echo argument embedded
        # in the shell string.
        if sudo_stdin is not None:
            import shlex
            exec_command = (
                f"printf '%s\\n' {shlex.quote(sudo_stdin.rstrip())} | {exec_command}"
            )

        from swerex.runtime.abstract import Command as RexCommand

        effective_cwd = cwd or self.cwd
        effective_timeout = timeout or self.timeout

        # Run in a background thread so we can poll for interrupts
        result_holder = {"value": None, "error": None}

        def _run():
            try:
                async def _do_execute():
                    return await self._deployment.runtime.execute(
                        RexCommand(
                            command=exec_command,
                            shell=True,
                            check=False,
                            cwd=effective_cwd,
                            timeout=effective_timeout,
                            merge_output_streams=True,
                        )
                    )
                output = self._worker.run_coroutine(_do_execute())
                result_holder["value"] = {
                    "output": output.stdout,
                    "returncode": output.exit_code,
                }
            except Exception as e:
                result_holder["error"] = e

        t = threading.Thread(target=_run, daemon=True)
        t.start()
        while t.is_alive():
            t.join(timeout=0.2)
            if is_interrupted():
                # No per-command cancel exists; tear the sandbox down instead.
                try:
                    self._worker.run_coroutine(
                        asyncio.wait_for(self._deployment.stop(), timeout=10),
                        timeout=15,
                    )
                except Exception:
                    pass
                return {
                    "output": "[Command interrupted - Modal sandbox terminated]",
                    "returncode": 130,
                }

        if result_holder["error"]:
            return {"output": f"Modal execution error: {result_holder['error']}", "returncode": 1}
        return result_holder["value"]

    def cleanup(self):
        """Snapshot the filesystem (if persistent) then stop the sandbox."""
        if self._deployment is None:
            return

        if self._persistent:
            try:
                # NOTE(review): reaches into swe-rex's private `_sandbox`
                # attribute — confirm it still exists on swe-rex upgrades.
                sandbox = getattr(self._deployment, '_sandbox', None)
                if sandbox:
                    async def _snapshot():
                        img = await sandbox.snapshot_filesystem.aio()
                        return img.object_id

                    try:
                        snapshot_id = self._worker.run_coroutine(_snapshot(), timeout=60)
                    except Exception:
                        snapshot_id = None

                    if snapshot_id:
                        snapshots = _load_snapshots()
                        snapshots[self._task_id] = snapshot_id
                        _save_snapshots(snapshots)
                        logger.info("Modal: saved filesystem snapshot %s for task %s",
                                    snapshot_id[:20], self._task_id)
            except Exception as e:
                logger.warning("Modal: filesystem snapshot failed: %s", e)

        try:
            self._worker.run_coroutine(
                asyncio.wait_for(self._deployment.stop(), timeout=10),
                timeout=15,
            )
        except Exception:
            pass
        finally:
            # Always shut the worker loop down and drop the deployment ref.
            self._worker.stop()
            self._deployment = None
|
||||
272
hermes_code/tools/environments/persistent_shell.py
Normal file
272
hermes_code/tools/environments/persistent_shell.py
Normal file
|
|
@ -0,0 +1,272 @@
|
|||
"""Persistent shell mixin: file-based IPC protocol for long-lived bash shells."""
|
||||
|
||||
import logging
|
||||
import shlex
|
||||
import subprocess
|
||||
import threading
|
||||
import time
|
||||
import uuid
|
||||
from abc import abstractmethod
|
||||
|
||||
from tools.interrupt import is_interrupted
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class PersistentShellMixin:
|
||||
"""Mixin that adds persistent shell capability to any BaseEnvironment.
|
||||
|
||||
Subclasses must implement ``_spawn_shell_process()``, ``_read_temp_files()``,
|
||||
``_kill_shell_children()``, ``_execute_oneshot()``, and ``_cleanup_temp_files()``.
|
||||
"""
|
||||
|
||||
# Set by the concrete environment's __init__; True routes execute()/cleanup()
# through the long-lived-shell code path.
persistent: bool

@abstractmethod
def _spawn_shell_process(self) -> subprocess.Popen: ...

@abstractmethod
def _read_temp_files(self, *paths: str) -> list[str]: ...

@abstractmethod
def _kill_shell_children(self): ...

@abstractmethod
def _execute_oneshot(self, command: str, cwd: str, *,
                     timeout: int | None = None,
                     stdin_data: str | None = None) -> dict: ...

@abstractmethod
def _cleanup_temp_files(self): ...

# Class-level defaults; _init_persistent_shell() assigns the real session id.
_session_id: str = ""
_poll_interval: float = 0.01

@property
def _temp_prefix(self) -> str:
    # Common prefix for this session's IPC temp files.
    return f"/tmp/hermes-persistent-{self._session_id}"
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Lifecycle
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _init_persistent_shell(self):
    """Start the long-lived shell and perform the PID/cwd handshake."""
    self._shell_lock = threading.Lock()
    self._shell_proc: subprocess.Popen | None = None
    self._shell_alive: bool = False
    self._shell_pid: int | None = None

    self._session_id = uuid.uuid4().hex[:12]
    p = self._temp_prefix
    # File-based IPC channel: the shell writes results into these files.
    self._pshell_stdout = f"{p}-stdout"
    self._pshell_stderr = f"{p}-stderr"
    self._pshell_status = f"{p}-status"
    self._pshell_cwd = f"{p}-cwd"
    self._pshell_pid_file = f"{p}-pid"

    self._shell_proc = self._spawn_shell_process()
    self._shell_alive = True

    # Discard the shell's own stdout so its pipe never fills up.
    self._drain_thread = threading.Thread(
        target=self._drain_shell_output, daemon=True,
    )
    self._drain_thread.start()

    # Prime the shell: create the IPC files, report PID and starting cwd.
    init_script = (
        f"export TERM=${{TERM:-dumb}}\n"
        f"touch {self._pshell_stdout} {self._pshell_stderr} "
        f"{self._pshell_status} {self._pshell_cwd} {self._pshell_pid_file}\n"
        f"echo $$ > {self._pshell_pid_file}\n"
        f"pwd > {self._pshell_cwd}\n"
    )
    self._send_to_shell(init_script)

    # Handshake: wait up to 3 seconds for the shell to report its PID.
    deadline = time.monotonic() + 3.0
    while time.monotonic() < deadline:
        pid_str = self._read_temp_files(self._pshell_pid_file)[0].strip()
        if pid_str.isdigit():
            self._shell_pid = int(pid_str)
            break
        time.sleep(0.05)
    else:
        # No break: the shell never wrote a PID within the deadline.
        logger.warning("Could not read persistent shell PID")
        self._shell_pid = None

    if self._shell_pid:
        logger.info(
            "Persistent shell started (session=%s, pid=%d)",
            self._session_id, self._shell_pid,
        )

    # Adopt the shell's actual working directory as our own.
    reported_cwd = self._read_temp_files(self._pshell_cwd)[0].strip()
    if reported_cwd:
        self.cwd = reported_cwd
|
||||
|
||||
def _cleanup_persistent_shell(self):
    """Tear down the persistent shell process and its IPC temp files."""
    proc = self._shell_proc
    if proc is None:
        return

    if self._session_id:
        self._cleanup_temp_files()

    # Closing stdin lets a well-behaved shell exit on its own.
    try:
        proc.stdin.close()
    except Exception:
        pass
    try:
        proc.terminate()
        proc.wait(timeout=3)
    except subprocess.TimeoutExpired:
        proc.kill()

    self._shell_alive = False
    self._shell_proc = None

    if hasattr(self, "_drain_thread") and self._drain_thread.is_alive():
        self._drain_thread.join(timeout=1.0)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# execute() / cleanup() — shared dispatcher, subclasses inherit
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def execute(self, command: str, cwd: str = "", *,
            timeout: int | None = None,
            stdin_data: str | None = None) -> dict:
    """Dispatch to the persistent shell or a one-shot subprocess."""
    runner = self._execute_persistent if self.persistent else self._execute_oneshot
    return runner(command, cwd, timeout=timeout, stdin_data=stdin_data)
|
||||
|
||||
def cleanup(self):
    """Shut down the persistent shell, if one was started."""
    if not self.persistent:
        return
    self._cleanup_persistent_shell()
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Shell I/O
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _drain_shell_output(self):
    """Consume and discard shell stdout; mark the shell dead when it ends."""
    try:
        for _discard in self._shell_proc.stdout:
            pass
    except Exception:
        pass
    # Stream exhausted or errored — either way the shell is gone.
    self._shell_alive = False
|
||||
|
||||
def _send_to_shell(self, text: str):
    """Write *text* to the shell's stdin; flag the shell dead on pipe errors."""
    proc = self._shell_proc
    if proc is None or not self._shell_alive:
        return
    try:
        proc.stdin.write(text)
        proc.stdin.flush()
    except (BrokenPipeError, OSError):
        self._shell_alive = False
|
||||
|
||||
def _read_persistent_output(self) -> tuple[str, int, str]:
    """Collect (merged output, exit code, cwd) from the IPC temp files."""
    stdout, stderr, status_raw, cwd_raw = self._read_temp_files(
        self._pshell_stdout, self._pshell_stderr,
        self._pshell_status, self._pshell_cwd,
    )
    merged = self._merge_output(stdout, stderr)

    # Status file holds "<cmd_id>:<exit code>"; tolerate a bare code too.
    status = status_raw.strip()
    if ":" in status:
        status = status.partition(":")[2]
    try:
        exit_code = int(status.strip())
    except ValueError:
        exit_code = 1

    return merged, exit_code, cwd_raw.strip()
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Execution
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _execute_persistent(self, command: str, cwd: str, *,
|
||||
timeout: int | None = None,
|
||||
stdin_data: str | None = None) -> dict:
|
||||
if not self._shell_alive:
|
||||
logger.info("Persistent shell died, restarting...")
|
||||
self._init_persistent_shell()
|
||||
|
||||
exec_command, sudo_stdin = self._prepare_command(command)
|
||||
effective_timeout = timeout or self.timeout
|
||||
if stdin_data or sudo_stdin:
|
||||
return self._execute_oneshot(
|
||||
command, cwd, timeout=timeout, stdin_data=stdin_data,
|
||||
)
|
||||
|
||||
with self._shell_lock:
|
||||
return self._execute_persistent_locked(
|
||||
exec_command, cwd, effective_timeout,
|
||||
)
|
||||
|
||||
    def _execute_persistent_locked(self, command: str, cwd: str,
                                   timeout: int) -> dict:
        """Run *command* inside the live persistent shell (caller holds the lock).

        File-based IPC protocol:
          1. Truncate the stdout/stderr/status temp files.
          2. Send a script that cd's to the work dir, eval's the command with
             output redirected into the temp files, then records the working
             directory and ``<cmd_id>:<exit_code>`` in the status file.
          3. Poll the status file until our unique cmd_id appears, honoring
             interrupts and the timeout.

        Returns a dict with ``output`` and ``returncode`` (130 on interrupt,
        124 on timeout — conventional shell exit codes).
        """
        work_dir = cwd or self.cwd
        # Unique id ties the status-file content to THIS command, so stale
        # status from a previous command can't be mistaken for completion.
        cmd_id = uuid.uuid4().hex[:8]
        # ':' is the shell no-op; '>' truncates each IPC file.
        truncate = (
            f": > {self._pshell_stdout}\n"
            f": > {self._pshell_stderr}\n"
            f": > {self._pshell_status}\n"
        )
        self._send_to_shell(truncate)
        # Escape single quotes so the command survives being wrapped in '...'
        escaped = command.replace("'", "'\\''")

        ipc_script = (
            f"cd {shlex.quote(work_dir)}\n"
            # `< /dev/null` keeps the command from stealing the shell's stdin.
            f"eval '{escaped}' < /dev/null > {self._pshell_stdout} 2> {self._pshell_stderr}\n"
            f"__EC=$?\n"
            f"pwd > {self._pshell_cwd}\n"
            f"echo {cmd_id}:$__EC > {self._pshell_status}\n"
        )
        self._send_to_shell(ipc_script)
        deadline = time.monotonic() + timeout
        poll_interval = self._poll_interval

        while True:
            if is_interrupted():
                # Kill only the command's children; the shell itself stays up.
                self._kill_shell_children()
                output, _, _ = self._read_persistent_output()
                return {
                    "output": output + "\n[Command interrupted]",
                    "returncode": 130,
                }

            if time.monotonic() > deadline:
                self._kill_shell_children()
                output, _, _ = self._read_persistent_output()
                # Surface whatever partial output was captured before timing out.
                if output:
                    return {
                        "output": output + f"\n[Command timed out after {timeout}s]",
                        "returncode": 124,
                    }
                return self._timeout_result(timeout)

            if not self._shell_alive:
                return {
                    "output": "Persistent shell died during execution",
                    "returncode": 1,
                }

            # Completion check: the status file must begin with our cmd_id.
            status_content = self._read_temp_files(self._pshell_status)[0].strip()
            if status_content.startswith(cmd_id + ":"):
                break

            time.sleep(poll_interval)

        output, exit_code, new_cwd = self._read_persistent_output()
        # Track any `cd` performed by the command so the next call inherits it.
        if new_cwd:
            self.cwd = new_cwd
        return {"output": output, "returncode": exit_code}
|
||||
|
||||
@staticmethod
|
||||
def _merge_output(stdout: str, stderr: str) -> str:
|
||||
parts = []
|
||||
if stdout.strip():
|
||||
parts.append(stdout.rstrip("\n"))
|
||||
if stderr.strip():
|
||||
parts.append(stderr.rstrip("\n"))
|
||||
return "\n".join(parts)
|
||||
369
hermes_code/tools/environments/singularity.py
Normal file
369
hermes_code/tools/environments/singularity.py
Normal file
|
|
@ -0,0 +1,369 @@
|
|||
"""Singularity/Apptainer persistent container environment.
|
||||
|
||||
Security-hardened with --containall, --no-home, capability dropping.
|
||||
Supports configurable resource limits and optional filesystem persistence
|
||||
via writable overlay directories that survive across sessions.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
import threading
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
from hermes_cli.config import get_hermes_home
|
||||
from tools.environments.base import BaseEnvironment
|
||||
from tools.interrupt import is_interrupted
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_SNAPSHOT_STORE = get_hermes_home() / "singularity_snapshots.json"
|
||||
|
||||
|
||||
def _find_singularity_executable() -> str:
|
||||
"""Locate the apptainer or singularity CLI binary.
|
||||
|
||||
Returns the executable name (``"apptainer"`` or ``"singularity"``).
|
||||
Raises ``RuntimeError`` with install instructions if neither is found.
|
||||
"""
|
||||
if shutil.which("apptainer"):
|
||||
return "apptainer"
|
||||
if shutil.which("singularity"):
|
||||
return "singularity"
|
||||
raise RuntimeError(
|
||||
"Neither 'apptainer' nor 'singularity' was found in PATH. "
|
||||
"Install Apptainer (https://apptainer.org/docs/admin/main/installation.html) "
|
||||
"or Singularity and ensure the CLI is available."
|
||||
)
|
||||
|
||||
|
||||
def _ensure_singularity_available() -> str:
    """Preflight check: resolve the executable and verify it responds.

    Returns the executable name on success.
    Raises ``RuntimeError`` with an actionable message on failure.
    The original exception is chained (``from exc``) so the underlying
    cause stays visible in tracebacks.
    """
    exe = _find_singularity_executable()

    try:
        result = subprocess.run(
            [exe, "version"],
            capture_output=True,
            text=True,
            timeout=10,
        )
    except FileNotFoundError as exc:
        # which() found it but exec failed — e.g. stale PATH entry or a
        # broken symlink.
        raise RuntimeError(
            f"Singularity backend selected but the resolved executable '{exe}' "
            "could not be executed. Check your installation."
        ) from exc
    except subprocess.TimeoutExpired as exc:
        raise RuntimeError(
            f"'{exe} version' timed out. The runtime may be misconfigured."
        ) from exc

    if result.returncode != 0:
        stderr = result.stderr.strip()[:200]  # keep the message short
        raise RuntimeError(
            f"'{exe} version' failed (exit code {result.returncode}): {stderr}"
        )

    return exe
|
||||
|
||||
|
||||
def _load_snapshots() -> Dict[str, str]:
    """Return the persisted task-id -> overlay-path map; {} on any failure."""
    if not _SNAPSHOT_STORE.exists():
        return {}
    try:
        return json.loads(_SNAPSHOT_STORE.read_text())
    except Exception:
        # Corrupt or unreadable store: treat as empty rather than crash.
        return {}
|
||||
|
||||
|
||||
def _save_snapshots(data: Dict[str, str]) -> None:
    """Persist the task-id -> overlay-path map, creating parent dirs as needed."""
    _SNAPSHOT_STORE.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(data, indent=2)
    _SNAPSHOT_STORE.write_text(payload)
|
||||
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
# Singularity helpers (scratch dir, SIF cache, SIF building)
|
||||
# -------------------------------------------------------------------------
|
||||
|
||||
def _get_scratch_dir() -> Path:
    """Get the best directory for Singularity sandboxes.

    Resolution order:
    1. TERMINAL_SCRATCH_DIR (explicit override)
    2. TERMINAL_SANDBOX_DIR / singularity (explicit shared sandbox root)
    3. /scratch (common on HPC clusters)
    4. {HERMES_HOME}/sandboxes/singularity (fallback)
    """
    custom_scratch = os.getenv("TERMINAL_SCRATCH_DIR")
    if custom_scratch:
        scratch_path = Path(custom_scratch)
        scratch_path.mkdir(parents=True, exist_ok=True)
        return scratch_path

    # Local import mirrors the original's placement — presumably to avoid an
    # import cycle with tools.environments.base; confirm before hoisting.
    from tools.environments.base import get_sandbox_dir

    # Fix: the docstring promises TERMINAL_SANDBOX_DIR (step 2) takes
    # precedence over /scratch (step 3), but the old code always preferred
    # /scratch. Honor the user's explicit configuration first. This also
    # avoids get_sandbox_dir()'s mkdir side effect when the result is unused.
    if os.getenv("TERMINAL_SANDBOX_DIR"):
        sandbox = get_sandbox_dir() / "singularity"
        sandbox.mkdir(parents=True, exist_ok=True)
        return sandbox

    scratch = Path("/scratch")
    if scratch.exists() and os.access(scratch, os.W_OK):
        user_scratch = scratch / os.getenv("USER", "hermes") / "hermes-agent"
        user_scratch.mkdir(parents=True, exist_ok=True)
        logger.info("Using /scratch for sandboxes: %s", user_scratch)
        return user_scratch

    sandbox = get_sandbox_dir() / "singularity"
    sandbox.mkdir(parents=True, exist_ok=True)
    return sandbox
|
||||
|
||||
|
||||
def _get_apptainer_cache_dir() -> Path:
    """Return the directory used to cache built SIF images.

    Honors APPTAINER_CACHEDIR when set; otherwise uses a ``.apptainer``
    subdirectory of the scratch dir. The directory is created on demand.
    """
    override = os.getenv("APPTAINER_CACHEDIR")
    target = Path(override) if override else _get_scratch_dir() / ".apptainer"
    target.mkdir(parents=True, exist_ok=True)
    return target
|
||||
|
||||
|
||||
# Serializes SIF image builds: concurrent environments resolving the same
# image must not race while writing into the shared cache directory.
_sif_build_lock = threading.Lock()
|
||||
|
||||
|
||||
def _get_or_build_sif(image: str, executable: str = "apptainer") -> str:
    """Get or build a SIF image from a docker:// URL.

    Returns the path unchanged if it's already a .sif file.
    For docker:// URLs, checks the cache and builds if needed.
    On any build failure the original docker:// URL is returned so the
    caller can still run straight from the registry (slower, but works).
    """
    # Already a built image on disk -- nothing to do.
    if image.endswith('.sif') and Path(image).exists():
        return image
    # Non-docker references (library://, bare paths, ...) pass through untouched.
    if not image.startswith('docker://'):
        return image

    # Flatten the image reference into a filesystem-safe cache filename.
    image_name = image.replace('docker://', '').replace('/', '-').replace(':', '-')
    cache_dir = _get_apptainer_cache_dir()
    sif_path = cache_dir / f"{image_name}.sif"

    # Fast path: cache hit without taking the build lock.
    if sif_path.exists():
        return str(sif_path)

    with _sif_build_lock:
        # Double-checked: another thread may have built it while we waited.
        if sif_path.exists():
            return str(sif_path)

        logger.info("Building SIF image (one-time setup)...")
        logger.info(" Source: %s", image)
        logger.info(" Target: %s", sif_path)

        tmp_dir = cache_dir / "tmp"
        tmp_dir.mkdir(parents=True, exist_ok=True)

        # Point the builder's scratch/cache at our dirs so a small /tmp or a
        # foreign cache location doesn't break the build.
        env = os.environ.copy()
        env["APPTAINER_TMPDIR"] = str(tmp_dir)
        env["APPTAINER_CACHEDIR"] = str(cache_dir)

        try:
            result = subprocess.run(
                [executable, "build", str(sif_path), image],
                capture_output=True, text=True, timeout=600, env=env,
            )
            if result.returncode != 0:
                logger.warning("SIF build failed, falling back to docker:// URL")
                logger.warning(" Error: %s", result.stderr[:500])
                return image
            logger.info("SIF image built successfully")
            return str(sif_path)
        except subprocess.TimeoutExpired:
            logger.warning("SIF build timed out, falling back to docker:// URL")
            # Remove the partially-written SIF so a later call doesn't treat
            # it as a valid cache hit.
            if sif_path.exists():
                sif_path.unlink()
            return image
        except Exception as e:
            logger.warning("SIF build error: %s, falling back to docker:// URL", e)
            return image
|
||||
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
# SingularityEnvironment
|
||||
# -------------------------------------------------------------------------
|
||||
|
||||
class SingularityEnvironment(BaseEnvironment):
    """Hardened Singularity/Apptainer container with resource limits and persistence.

    Security: --containall (isolated PID/IPC/mount namespaces, no host home mount),
    --no-home, writable-tmpfs for scratch space. The container cannot see or modify
    the host filesystem outside of explicitly bound paths.

    Persistence: when enabled, the writable overlay directory is preserved across
    sessions so installed packages and files survive cleanup/restore.
    """

    def __init__(
        self,
        image: str,
        cwd: str = "~",
        timeout: int = 60,
        cpu: float = 0,
        memory: int = 0,
        disk: int = 0,
        persistent_filesystem: bool = False,
        task_id: str = "default",
    ):
        """Resolve the image, optionally set up a persistent overlay, and start the instance.

        Args:
            image: SIF path or docker:// URL (built/cached via _get_or_build_sif).
            cwd: Initial working directory inside the container.
            timeout: Default per-command timeout in seconds.
            cpu: CPU limit passed to --cpus (0 = unlimited).
            memory: Memory limit in MiB passed to --memory (0 = unlimited).
            disk: NOTE(review): accepted but never used anywhere in this class —
                confirm whether a --overlay size limit was intended.
            persistent_filesystem: Keep a writable overlay dir across sessions.
            task_id: Key used to name (and later restore) the overlay dir.

        Raises:
            RuntimeError: if the runtime is unavailable or the instance fails to start.
        """
        super().__init__(cwd=cwd, timeout=timeout)
        self.executable = _ensure_singularity_available()
        self.image = _get_or_build_sif(image, self.executable)
        # Random suffix keeps concurrent instances from colliding.
        self.instance_id = f"hermes_{uuid.uuid4().hex[:12]}"
        self._instance_started = False
        self._persistent = persistent_filesystem
        self._task_id = task_id
        self._overlay_dir: Optional[Path] = None

        # Resource limits
        self._cpu = cpu
        self._memory = memory

        # Persistent overlay directory
        if self._persistent:
            overlay_base = _get_scratch_dir() / "hermes-overlays"
            overlay_base.mkdir(parents=True, exist_ok=True)
            # Keyed by task_id so the same task reuses its overlay next session.
            self._overlay_dir = overlay_base / f"overlay-{task_id}"
            self._overlay_dir.mkdir(parents=True, exist_ok=True)

        self._start_instance()

    def _start_instance(self):
        """Launch the long-lived container instance via `instance start`.

        Raises RuntimeError when the start command fails or times out.
        """
        cmd = [self.executable, "instance", "start"]

        # Security: full isolation from host
        cmd.extend(["--containall", "--no-home"])

        # Writable layer
        if self._persistent and self._overlay_dir:
            # Persistent writable overlay -- survives across restarts
            cmd.extend(["--overlay", str(self._overlay_dir)])
        else:
            # Ephemeral tmpfs: changes vanish when the instance stops.
            cmd.append("--writable-tmpfs")

        # Resource limits (cgroup-based, may require root or appropriate config)
        if self._memory > 0:
            cmd.extend(["--memory", f"{self._memory}M"])
        if self._cpu > 0:
            cmd.extend(["--cpus", str(self._cpu)])

        cmd.extend([str(self.image), self.instance_id])

        try:
            result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
            if result.returncode != 0:
                raise RuntimeError(f"Failed to start instance: {result.stderr}")
            self._instance_started = True
            logger.info("Singularity instance %s started (persistent=%s)",
                        self.instance_id, self._persistent)
        except subprocess.TimeoutExpired:
            raise RuntimeError("Instance start timed out")

    def execute(self, command: str, cwd: str = "", *,
                timeout: int | None = None,
                stdin_data: str | None = None) -> dict:
        """Run *command* inside the running instance via `exec`.

        Streams combined stdout+stderr in a drain thread, polling the process
        so interrupts and the timeout are honored. Returns a dict with
        ``output`` and ``returncode`` (130 on interrupt, 124-style timeout
        result via _timeout_result).
        """
        if not self._instance_started:
            return {"output": "Instance not started", "returncode": -1}

        effective_timeout = timeout or self.timeout
        work_dir = cwd or self.cwd
        exec_command, sudo_stdin = self._prepare_command(command)

        # Merge sudo password (if any) with caller-supplied stdin_data.
        if sudo_stdin is not None and stdin_data is not None:
            effective_stdin = sudo_stdin + stdin_data
        elif sudo_stdin is not None:
            effective_stdin = sudo_stdin
        else:
            effective_stdin = stdin_data

        # apptainer exec --pwd doesn't expand ~, so prepend a cd into the command
        if work_dir == "~" or work_dir.startswith("~/"):
            exec_command = f"cd {work_dir} && {exec_command}"
            work_dir = "/tmp"

        cmd = [self.executable, "exec", "--pwd", work_dir,
               f"instance://{self.instance_id}",
               "bash", "-c", exec_command]

        try:
            import time as _time
            _output_chunks = []
            proc = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                stdin=subprocess.PIPE if effective_stdin else subprocess.DEVNULL,
                text=True,
            )
            if effective_stdin:
                try:
                    proc.stdin.write(effective_stdin)
                    # Close stdin so the command sees EOF and can finish.
                    proc.stdin.close()
                except Exception:
                    pass

            def _drain():
                # Background reader keeps the pipe from filling and blocking
                # the child while we poll for interrupt/timeout below.
                try:
                    for line in proc.stdout:
                        _output_chunks.append(line)
                except Exception:
                    pass

            reader = threading.Thread(target=_drain, daemon=True)
            reader.start()
            deadline = _time.monotonic() + effective_timeout

            while proc.poll() is None:
                if is_interrupted():
                    proc.terminate()
                    try:
                        proc.wait(timeout=1)
                    except subprocess.TimeoutExpired:
                        # Didn't exit after SIGTERM; escalate.
                        proc.kill()
                    reader.join(timeout=2)
                    return {
                        "output": "".join(_output_chunks) + "\n[Command interrupted]",
                        "returncode": 130,
                    }
                if _time.monotonic() > deadline:
                    proc.kill()
                    reader.join(timeout=2)
                    return self._timeout_result(effective_timeout)
                _time.sleep(0.2)

            reader.join(timeout=5)
            return {"output": "".join(_output_chunks), "returncode": proc.returncode}
        except Exception as e:
            return {"output": f"Singularity execution error: {e}", "returncode": 1}

    def cleanup(self):
        """Stop the instance. If persistent, the overlay dir survives for next creation."""
        if self._instance_started:
            try:
                subprocess.run(
                    [self.executable, "instance", "stop", self.instance_id],
                    capture_output=True, text=True, timeout=30,
                )
                logger.info("Singularity instance %s stopped", self.instance_id)
            except Exception as e:
                # Best-effort: a failed stop shouldn't break caller teardown.
                logger.warning("Failed to stop Singularity instance %s: %s", self.instance_id, e)
            self._instance_started = False

        # Record overlay path for persistence restoration
        if self._persistent and self._overlay_dir:
            snapshots = _load_snapshots()
            snapshots[self._task_id] = str(self._overlay_dir)
            _save_snapshots(snapshots)
|
||||
232
hermes_code/tools/environments/ssh.py
Normal file
232
hermes_code/tools/environments/ssh.py
Normal file
|
|
@ -0,0 +1,232 @@
|
|||
"""SSH remote execution environment with ControlMaster connection persistence."""
|
||||
|
||||
import logging
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
import threading
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
from tools.environments.base import BaseEnvironment
|
||||
from tools.environments.persistent_shell import PersistentShellMixin
|
||||
from tools.interrupt import is_interrupted
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _ensure_ssh_available() -> None:
|
||||
"""Fail fast with a clear error when the SSH client is unavailable."""
|
||||
if not shutil.which("ssh"):
|
||||
raise RuntimeError(
|
||||
"SSH is not installed or not in PATH. Install OpenSSH client: apt install openssh-client"
|
||||
)
|
||||
|
||||
|
||||
class SSHEnvironment(PersistentShellMixin, BaseEnvironment):
    """Run commands on a remote machine over SSH.

    Uses SSH ControlMaster for connection persistence so subsequent
    commands are fast. Security benefit: the agent cannot modify its
    own code since execution happens on a separate machine.

    Foreground commands are interruptible: the local ssh process is killed
    and a remote kill is attempted over the ControlMaster socket.

    When ``persistent=True``, a single long-lived bash shell is kept alive
    over SSH and state (cwd, env vars, shell variables) persists across
    ``execute()`` calls. Output capture uses file-based IPC on the remote
    host (stdout/stderr/exit-code written to temp files, polled via fast
    ControlMaster one-shot reads).
    """

    def __init__(self, host: str, user: str, cwd: str = "~",
                 timeout: int = 60, port: int = 22, key_path: str = "",
                 persistent: bool = False):
        """Validate the ssh client, open the ControlMaster connection, and
        (optionally) start the persistent remote shell.

        Raises RuntimeError if ssh is missing or the connection fails.
        """
        super().__init__(cwd=cwd, timeout=timeout)
        self.host = host
        self.user = user
        self.port = port
        self.key_path = key_path
        self.persistent = persistent

        # ControlMaster socket lives under the system temp dir, one socket
        # per (user, host, port) triple so connections are shared.
        self.control_dir = Path(tempfile.gettempdir()) / "hermes-ssh"
        self.control_dir.mkdir(parents=True, exist_ok=True)
        self.control_socket = self.control_dir / f"{user}@{host}:{port}.sock"
        _ensure_ssh_available()
        self._establish_connection()

        if self.persistent:
            self._init_persistent_shell()

    def _build_ssh_command(self, extra_args: list | None = None) -> list:
        """Assemble the base ssh argv with ControlMaster multiplexing options.

        The returned list ends with ``user@host``; callers append the remote
        command string after it.
        """
        cmd = ["ssh"]
        cmd.extend(["-o", f"ControlPath={self.control_socket}"])
        # auto: reuse the master connection if up, create it otherwise.
        cmd.extend(["-o", "ControlMaster=auto"])
        # Keep the master alive 300s after the last session closes.
        cmd.extend(["-o", "ControlPersist=300"])
        # Never prompt for passwords -- fail instead (non-interactive agent).
        cmd.extend(["-o", "BatchMode=yes"])
        cmd.extend(["-o", "StrictHostKeyChecking=accept-new"])
        cmd.extend(["-o", "ConnectTimeout=10"])
        if self.port != 22:
            cmd.extend(["-p", str(self.port)])
        if self.key_path:
            cmd.extend(["-i", self.key_path])
        if extra_args:
            cmd.extend(extra_args)
        cmd.append(f"{self.user}@{self.host}")
        return cmd

    def _establish_connection(self):
        """Open (and verify) the ControlMaster connection with a trivial echo.

        Raises RuntimeError on connection failure or timeout.
        """
        cmd = self._build_ssh_command()
        cmd.append("echo 'SSH connection established'")
        try:
            result = subprocess.run(cmd, capture_output=True, text=True, timeout=15)
            if result.returncode != 0:
                error_msg = result.stderr.strip() or result.stdout.strip()
                raise RuntimeError(f"SSH connection failed: {error_msg}")
        except subprocess.TimeoutExpired:
            raise RuntimeError(f"SSH connection to {self.user}@{self.host} timed out")

    # Polling cadence for the persistent shell's status file (seconds).
    # Slower than local backends since each poll is an ssh round-trip.
    _poll_interval: float = 0.15

    @property
    def _temp_prefix(self) -> str:
        # Remote-side prefix for the persistent shell's IPC temp files.
        return f"/tmp/hermes-ssh-{self._session_id}"

    def _spawn_shell_process(self) -> subprocess.Popen:
        """Start the long-lived remote login shell used by persistent mode."""
        cmd = self._build_ssh_command()
        cmd.append("bash -l")
        return subprocess.Popen(
            cmd,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
            text=True,
        )

    def _read_temp_files(self, *paths: str) -> list[str]:
        """Fetch the contents of remote temp files via one-shot ssh reads.

        Single path: one plain ``cat``. Multiple paths: one ssh round-trip
        that cats every file separated by a unique delimiter, then splits
        locally. Missing files and transport errors map to "".
        """
        if len(paths) == 1:
            cmd = self._build_ssh_command()
            cmd.append(f"cat {paths[0]} 2>/dev/null")
            try:
                result = subprocess.run(
                    cmd, capture_output=True, text=True, timeout=10,
                )
                return [result.stdout]
            except (subprocess.TimeoutExpired, OSError):
                return [""]

        # Session-unique delimiter so file contents can't forge a boundary.
        delim = f"__HERMES_SEP_{self._session_id}__"
        script = "; ".join(
            f"cat {p} 2>/dev/null; echo '{delim}'" for p in paths
        )
        cmd = self._build_ssh_command()
        cmd.append(script)
        try:
            result = subprocess.run(
                cmd, capture_output=True, text=True, timeout=10,
            )
            parts = result.stdout.split(delim + "\n")
            # Pad with "" if fewer parts came back than files requested.
            return [parts[i] if i < len(parts) else "" for i in range(len(paths))]
        except (subprocess.TimeoutExpired, OSError):
            return [""] * len(paths)

    def _kill_shell_children(self):
        """Kill the remote shell's child processes (the running command),
        leaving the shell itself alive. Best-effort; errors are ignored.
        """
        if self._shell_pid is None:
            return
        cmd = self._build_ssh_command()
        # `; true` keeps the ssh exit status clean when pkill finds nothing.
        cmd.append(f"pkill -P {self._shell_pid} 2>/dev/null; true")
        try:
            subprocess.run(cmd, capture_output=True, timeout=5)
        except (subprocess.TimeoutExpired, OSError):
            pass

    def _cleanup_temp_files(self):
        """Best-effort removal of this session's remote IPC temp files."""
        cmd = self._build_ssh_command()
        cmd.append(f"rm -f {self._temp_prefix}-*")
        try:
            subprocess.run(cmd, capture_output=True, timeout=5)
        except (subprocess.TimeoutExpired, OSError):
            pass

    def _execute_oneshot(self, command: str, cwd: str = "", *,
                         timeout: int | None = None,
                         stdin_data: str | None = None) -> dict:
        """Run one command over a fresh ssh session (no persistent state).

        Combined stdout+stderr is streamed by a drain thread while the main
        thread polls for interrupts and the timeout. Returns a dict with
        ``output`` and ``returncode`` (130 on interrupt).
        """
        work_dir = cwd or self.cwd
        exec_command, sudo_stdin = self._prepare_command(command)
        # NOTE(review): work_dir is interpolated unquoted so the remote shell
        # expands `~`; a cwd containing spaces would break -- confirm callers.
        wrapped = f'cd {work_dir} && {exec_command}'
        effective_timeout = timeout or self.timeout

        # Merge sudo password (if any) with caller-supplied stdin data.
        if sudo_stdin is not None and stdin_data is not None:
            effective_stdin = sudo_stdin + stdin_data
        elif sudo_stdin is not None:
            effective_stdin = sudo_stdin
        else:
            effective_stdin = stdin_data

        cmd = self._build_ssh_command()
        cmd.append(wrapped)

        # NOTE(review): `kwargs` is built and trimmed but never passed to
        # Popen below -- looks like leftover from a subprocess.run()
        # implementation; confirm and remove.
        kwargs = self._build_run_kwargs(timeout, effective_stdin)
        kwargs.pop("timeout", None)
        _output_chunks = []
        proc = subprocess.Popen(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            stdin=subprocess.PIPE if effective_stdin else subprocess.DEVNULL,
            text=True,
        )

        if effective_stdin:
            try:
                proc.stdin.write(effective_stdin)
                # Close so the remote command sees EOF and can finish.
                proc.stdin.close()
            except (BrokenPipeError, OSError):
                pass

        def _drain():
            # Background reader keeps the pipe from filling and blocking the
            # child while the main thread polls below.
            try:
                for line in proc.stdout:
                    _output_chunks.append(line)
            except Exception:
                pass

        reader = threading.Thread(target=_drain, daemon=True)
        reader.start()
        deadline = time.monotonic() + effective_timeout

        while proc.poll() is None:
            if is_interrupted():
                proc.terminate()
                try:
                    proc.wait(timeout=1)
                except subprocess.TimeoutExpired:
                    # Didn't exit after SIGTERM; escalate.
                    proc.kill()
                reader.join(timeout=2)
                return {
                    "output": "".join(_output_chunks) + "\n[Command interrupted]",
                    "returncode": 130,
                }
            if time.monotonic() > deadline:
                proc.kill()
                reader.join(timeout=2)
                return self._timeout_result(effective_timeout)
            time.sleep(0.2)

        reader.join(timeout=5)
        return {"output": "".join(_output_chunks), "returncode": proc.returncode}

    def cleanup(self):
        """Stop the persistent shell (via the mixin), then tear down the
        ControlMaster connection and remove its socket. Best-effort.
        """
        super().cleanup()
        if self.control_socket.exists():
            try:
                # `-O exit` asks the master process to shut down cleanly.
                cmd = ["ssh", "-o", f"ControlPath={self.control_socket}",
                       "-O", "exit", f"{self.user}@{self.host}"]
                subprocess.run(cmd, capture_output=True, timeout=5)
            except (OSError, subprocess.SubprocessError):
                pass
            try:
                self.control_socket.unlink()
            except OSError:
                pass
|
||||
Loading…
Add table
Add a link
Reference in a new issue