feat: enhance interrupt handling and container resource configuration

- Introduced a shared interrupt signaling mechanism to allow tools to check for user interrupts during long-running operations.
- Updated the AIAgent to handle interrupts more effectively, ensuring in-progress tool calls are canceled and multiple interrupt messages are combined into one prompt.
- Enhanced the CLI configuration to include container resource limits (CPU, memory, disk) and persistence options for Docker, Singularity, and Modal environments.
- Improved documentation to clarify interrupt behaviors and container resource settings, providing users with better guidance on configuration and usage.
This commit is contained in:
teknium1 2026-02-23 02:11:33 -08:00
parent c7857dc1d4
commit 90af34bc83
18 changed files with 940 additions and 90 deletions

View file

@ -1,22 +1,108 @@
"""Docker execution environment wrapping mini-swe-agent's DockerEnvironment."""
"""Docker execution environment wrapping mini-swe-agent's DockerEnvironment.
Adds security hardening, configurable resource limits (CPU, memory, disk),
and optional filesystem persistence via `docker commit`/`docker create --image`.
"""
import logging
import os
import subprocess
import threading
import time
from typing import Optional
from tools.environments.base import BaseEnvironment
from tools.interrupt import is_interrupted
logger = logging.getLogger(__name__)
# Security flags applied to every container
_SECURITY_ARGS = [
"--read-only",
"--cap-drop", "ALL",
"--security-opt", "no-new-privileges",
"--pids-limit", "256",
"--tmpfs", "/tmp:rw,noexec,nosuid,size=512m",
"--tmpfs", "/var/tmp:rw,noexec,nosuid,size=256m",
"--tmpfs", "/run:rw,noexec,nosuid,size=64m",
]
class DockerEnvironment(BaseEnvironment):
"""Docker container execution via mini-swe-agent.
"""Hardened Docker container execution with resource limits and persistence.
Wraps the upstream DockerEnvironment and adds non-blocking stdin
and sudo -S support.
Security: read-only root, all capabilities dropped, no privilege escalation,
PID limits, tmpfs for writable scratch. Writable overlay for /home and cwd
via tmpfs or bind mounts.
Persistence: when enabled, `docker commit` saves the container state on
cleanup, and the next creation restores from that image.
"""
def __init__(self, image: str, cwd: str = "/", timeout: int = 60):
def __init__(
self,
image: str,
cwd: str = "/",
timeout: int = 60,
cpu: float = 0,
memory: int = 0,
disk: int = 0,
persistent_filesystem: bool = False,
task_id: str = "default",
network: bool = True,
):
super().__init__(cwd=cwd, timeout=timeout)
self._base_image = image
self._persistent = persistent_filesystem
self._task_id = task_id
self._container_id: Optional[str] = None
from minisweagent.environments.docker import DockerEnvironment as _Docker
self._inner = _Docker(image=image, cwd=cwd, timeout=timeout)
# Build resource limit args
resource_args = []
if cpu > 0:
resource_args.extend(["--cpus", str(cpu)])
if memory > 0:
resource_args.extend(["--memory", f"{memory}m"])
if disk > 0:
resource_args.extend(["--storage-opt", f"size={disk}m"])
if not network:
resource_args.append("--network=none")
# Persistent volume for writable workspace that survives container restarts.
# Non-persistent mode uses tmpfs (ephemeral, fast, gone on cleanup).
self._volume_name: Optional[str] = None
if self._persistent:
self._volume_name = f"hermes-workspace-{task_id}"
# Create volume if it doesn't exist
subprocess.run(
["docker", "volume", "create", self._volume_name],
capture_output=True, timeout=10,
)
writable_args = [
"-v", f"{self._volume_name}:{cwd}",
"-v", f"{self._volume_name}-home:/root",
]
else:
writable_args = [
"--tmpfs", f"{cwd}:rw,exec,size=10g",
"--tmpfs", "/home:rw,exec,size=1g",
"--tmpfs", "/root:rw,exec,size=1g",
]
# All containers get full security hardening (read-only root + writable
# mounts for the workspace). Persistence uses Docker volumes, not
# filesystem layer commits, so --read-only is always safe.
all_run_args = list(_SECURITY_ARGS) + writable_args + resource_args
self._inner = _Docker(
image=effective_image, cwd=cwd, timeout=timeout,
run_args=all_run_args,
)
self._container_id = self._inner.container_id
def execute(self, command: str, cwd: str = "", *,
timeout: int | None = None,
@ -38,10 +124,65 @@ class DockerEnvironment(BaseEnvironment):
cmd.extend([self._inner.container_id, "bash", "-lc", exec_command])
try:
result = subprocess.run(cmd, **self._build_run_kwargs(timeout, stdin_data))
return {"output": result.stdout, "returncode": result.returncode}
except subprocess.TimeoutExpired:
return self._timeout_result(effective_timeout)
_output_chunks = []
proc = subprocess.Popen(
cmd,
stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
stdin=subprocess.PIPE if stdin_data else subprocess.DEVNULL,
text=True,
)
if stdin_data:
try:
proc.stdin.write(stdin_data)
proc.stdin.close()
except Exception:
pass
def _drain():
try:
for line in proc.stdout:
_output_chunks.append(line)
except Exception:
pass
reader = threading.Thread(target=_drain, daemon=True)
reader.start()
deadline = time.monotonic() + effective_timeout
while proc.poll() is None:
if is_interrupted():
proc.terminate()
try:
proc.wait(timeout=1)
except subprocess.TimeoutExpired:
proc.kill()
reader.join(timeout=2)
return {
"output": "".join(_output_chunks) + "\n[Command interrupted]",
"returncode": 130,
}
if time.monotonic() > deadline:
proc.kill()
reader.join(timeout=2)
return self._timeout_result(effective_timeout)
time.sleep(0.2)
reader.join(timeout=5)
return {"output": "".join(_output_chunks), "returncode": proc.returncode}
except Exception as e:
return {"output": f"Docker execution error: {e}", "returncode": 1}
def cleanup(self):
"""Stop and remove the container. Volumes persist if persistent=True."""
self._inner.cleanup()
# If NOT persistent, remove the workspace volumes too
# NOTE(review): __init__ assigns self._volume_name only when
# persistent_filesystem is True; in non-persistent mode it stays None and
# tmpfs mounts are used instead of volumes. So `not self._persistent and
# self._volume_name` can never both hold and this removal loop looks
# unreachable — confirm whether the condition is intentional dead code
# or should be restructured.
if not self._persistent and self._volume_name:
for vol in [self._volume_name, f"{self._volume_name}-home"]:
try:
subprocess.run(
["docker", "volume", "rm", "-f", vol],
capture_output=True, timeout=10,
)
except Exception:
# Best-effort removal: a failed `docker volume rm` must not
# break the rest of cleanup.
pass

View file

@ -76,7 +76,12 @@ class LocalEnvironment(BaseEnvironment):
while proc.poll() is None:
if _interrupt_event.is_set():
try:
os.killpg(os.getpgid(proc.pid), signal.SIGTERM)
pgid = os.getpgid(proc.pid)
os.killpg(pgid, signal.SIGTERM)
try:
proc.wait(timeout=1.0)
except subprocess.TimeoutExpired:
os.killpg(pgid, signal.SIGKILL)
except (ProcessLookupError, PermissionError):
proc.kill()
reader.join(timeout=2)

View file

@ -1,21 +1,61 @@
"""Modal cloud execution environment wrapping mini-swe-agent's SwerexModalEnvironment."""
"""Modal cloud execution environment wrapping mini-swe-agent's SwerexModalEnvironment.
Supports persistent filesystem snapshots: when enabled, the sandbox's filesystem
is snapshotted on cleanup and restored on next creation, so installed packages,
project files, and config changes survive across sessions.
"""
import json
import logging
import threading
import time
import uuid
from pathlib import Path
from typing import Any, Dict, Optional
from tools.environments.base import BaseEnvironment
from tools.interrupt import is_interrupted
logger = logging.getLogger(__name__)
_SNAPSHOT_STORE = Path.home() / ".hermes" / "modal_snapshots.json"
def _load_snapshots() -> Dict[str, str]:
"""Load snapshot ID mapping from disk."""
if _SNAPSHOT_STORE.exists():
try:
return json.loads(_SNAPSHOT_STORE.read_text())
except Exception:
pass
return {}
def _save_snapshots(data: Dict[str, str]) -> None:
    """Rewrite the on-disk task-id -> snapshot-id mapping wholesale.

    Creates the ~/.hermes directory on first use.
    """
    store = _SNAPSHOT_STORE
    store.parent.mkdir(parents=True, exist_ok=True)
    store.write_text(json.dumps(data, indent=2))
class ModalEnvironment(BaseEnvironment):
"""Modal cloud execution via mini-swe-agent.
Wraps SwerexModalEnvironment and adds sudo -S support.
Async-safety patches are applied once before first use so Modal
works inside any event loop (Atropos, gateway, etc.).
Wraps SwerexModalEnvironment and adds sudo -S support, configurable
resources (CPU, memory, disk), and optional filesystem persistence
via Modal's snapshot_filesystem() API.
"""
_patches_applied = False
def __init__(self, image: str, cwd: str = "/root", timeout: int = 60):
def __init__(
self,
image: str,
cwd: str = "/root",
timeout: int = 60,
modal_sandbox_kwargs: Optional[Dict[str, Any]] = None,
persistent_filesystem: bool = True,
task_id: str = "default",
):
super().__init__(cwd=cwd, timeout=timeout)
if not ModalEnvironment._patches_applied:
@ -26,10 +66,35 @@ class ModalEnvironment(BaseEnvironment):
pass
ModalEnvironment._patches_applied = True
self._persistent = persistent_filesystem
self._task_id = task_id
self._base_image = image
sandbox_kwargs = dict(modal_sandbox_kwargs or {})
# If persistent, try to restore from a previous snapshot
restored_image = None
if self._persistent:
snapshot_id = _load_snapshots().get(self._task_id)
if snapshot_id:
try:
import modal
restored_image = modal.Image.from_id(snapshot_id)
logger.info("Modal: restoring from snapshot %s", snapshot_id[:20])
except Exception as e:
logger.warning("Modal: failed to restore snapshot, using base image: %s", e)
restored_image = None
effective_image = restored_image if restored_image else image
from minisweagent.environments.extra.swerex_modal import SwerexModalEnvironment
self._inner = SwerexModalEnvironment(
image=image, cwd=cwd, timeout=timeout,
startup_timeout=180.0, runtime_timeout=3600.0,
image=effective_image,
cwd=cwd,
timeout=timeout,
startup_timeout=180.0,
runtime_timeout=3600.0,
modal_sandbox_kwargs=sandbox_kwargs,
)
def execute(self, command: str, cwd: str = "", *,
@ -42,8 +107,61 @@ class ModalEnvironment(BaseEnvironment):
command = f"{command} << '{marker}'\n{stdin_data}\n{marker}"
exec_command = self._prepare_command(command)
return self._inner.execute(exec_command, cwd=cwd, timeout=timeout)
# Run in a background thread so we can poll for interrupts
result_holder = {"value": None, "error": None}
def _run():
try:
result_holder["value"] = self._inner.execute(exec_command, cwd=cwd, timeout=timeout)
except Exception as e:
result_holder["error"] = e
t = threading.Thread(target=_run, daemon=True)
t.start()
while t.is_alive():
t.join(timeout=0.2)
if is_interrupted():
try:
self._inner.stop()
except Exception:
pass
return {
"output": "[Command interrupted - Modal sandbox terminated]",
"returncode": 130,
}
if result_holder["error"]:
return {"output": f"Modal execution error: {result_holder['error']}", "returncode": 1}
return result_holder["value"]
def cleanup(self):
"""Snapshot the filesystem (if persistent) then stop the sandbox."""
if self._persistent:
try:
# Reach into the wrapped environment for the live Modal sandbox.
# NOTE(review): relies on private attributes of
# SwerexModalEnvironment (`deployment._sandbox`) — verify these
# names against the pinned mini-swe-agent / swe-rex versions.
sandbox = getattr(self._inner, 'deployment', None)
sandbox = getattr(sandbox, '_sandbox', None) if sandbox else None
if sandbox:
import asyncio
async def _snapshot():
# snapshot_filesystem.aio() is Modal's async variant; the
# returned image's object_id is what from_id() restores later.
img = await sandbox.snapshot_filesystem.aio()
return img.object_id
try:
# Normal path: no event loop is running in this thread.
snapshot_id = asyncio.run(_snapshot())
except RuntimeError:
# asyncio.run raises RuntimeError when called from a thread
# that already has a running loop; run the coroutine to
# completion on a fresh worker thread instead.
import concurrent.futures
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
snapshot_id = pool.submit(
asyncio.run, _snapshot()
).result(timeout=60)
# Record the snapshot so the next ModalEnvironment for this
# task_id restores from it (see __init__).
snapshots = _load_snapshots()
snapshots[self._task_id] = snapshot_id
_save_snapshots(snapshots)
logger.info("Modal: saved filesystem snapshot %s for task %s",
snapshot_id[:20], self._task_id)
except Exception as e:
# Snapshot failure is non-fatal: log and still stop the sandbox.
logger.warning("Modal: filesystem snapshot failed: %s", e)
if hasattr(self._inner, 'stop'):
self._inner.stop()

View file

@ -1,9 +1,11 @@
"""Singularity/Apptainer persistent container environment.
Also contains the Singularity-specific helpers: scratch dir management,
Apptainer cache, and SIF image building.
Security-hardened with --containall, --no-home, capability dropping.
Supports configurable resource limits and optional filesystem persistence
via writable overlay directories that survive across sessions.
"""
import json
import logging
import os
import shutil
@ -12,11 +14,29 @@ import tempfile
import threading
import uuid
from pathlib import Path
from typing import Any, Dict, Optional
from tools.environments.base import BaseEnvironment
from tools.interrupt import is_interrupted
logger = logging.getLogger(__name__)
_SNAPSHOT_STORE = Path.home() / ".hermes" / "singularity_snapshots.json"
def _load_snapshots() -> Dict[str, str]:
if _SNAPSHOT_STORE.exists():
try:
return json.loads(_SNAPSHOT_STORE.read_text())
except Exception:
pass
return {}
def _save_snapshots(data: Dict[str, str]) -> None:
    """Persist the task-id -> overlay-path mapping, creating ~/.hermes if needed."""
    store = _SNAPSHOT_STORE
    store.parent.mkdir(parents=True, exist_ok=True)
    store.write_text(json.dumps(data, indent=2))
# -------------------------------------------------------------------------
# Singularity helpers (scratch dir, SIF cache, SIF building)
@ -116,32 +136,77 @@ def _get_or_build_sif(image: str, executable: str = "apptainer") -> str:
# -------------------------------------------------------------------------
class SingularityEnvironment(BaseEnvironment):
"""Persistent Singularity/Apptainer container environment.
"""Hardened Singularity/Apptainer container with resource limits and persistence.
Uses ``apptainer instance`` to create a long-running container that persists
state across all commands within a task.
Security: --containall (isolated PID/IPC/mount namespaces, no host home mount),
--no-home, writable-tmpfs for scratch space. The container cannot see or modify
the host filesystem outside of explicitly bound paths.
Persistence: when enabled, the writable overlay directory is preserved across
sessions so installed packages and files survive cleanup/restore.
"""
def __init__(self, image: str, cwd: str = "/root", timeout: int = 60):
def __init__(
self,
image: str,
cwd: str = "/root",
timeout: int = 60,
cpu: float = 0,
memory: int = 0,
disk: int = 0,
persistent_filesystem: bool = False,
task_id: str = "default",
):
super().__init__(cwd=cwd, timeout=timeout)
self.executable = "apptainer" if shutil.which("apptainer") else "singularity"
self.image = _get_or_build_sif(image, self.executable)
self.instance_id = f"hermes_{uuid.uuid4().hex[:12]}"
self._instance_started = False
self._persistent = persistent_filesystem
self._task_id = task_id
self._overlay_dir: Optional[Path] = None
# Resource limits
self._cpu = cpu
self._memory = memory
# Persistent overlay directory
if self._persistent:
overlay_base = _get_scratch_dir() / "hermes-overlays"
overlay_base.mkdir(parents=True, exist_ok=True)
self._overlay_dir = overlay_base / f"overlay-{task_id}"
self._overlay_dir.mkdir(parents=True, exist_ok=True)
self._start_instance()
def _start_instance(self):
cmd = [
self.executable, "instance", "start",
"--writable-tmpfs", "--containall",
str(self.image), self.instance_id,
]
cmd = [self.executable, "instance", "start"]
# Security: full isolation from host
cmd.extend(["--containall", "--no-home"])
# Writable layer
if self._persistent and self._overlay_dir:
# Persistent writable overlay -- survives across restarts
cmd.extend(["--overlay", str(self._overlay_dir)])
else:
cmd.append("--writable-tmpfs")
# Resource limits (cgroup-based, may require root or appropriate config)
if self._memory > 0:
cmd.extend(["--memory", f"{self._memory}M"])
if self._cpu > 0:
cmd.extend(["--cpus", str(self._cpu)])
cmd.extend([str(self.image), self.instance_id])
try:
result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
if result.returncode != 0:
raise RuntimeError(f"Failed to start instance: {result.stderr}")
self._instance_started = True
logger.info("Singularity instance %s started", self.instance_id)
logger.info("Singularity instance %s started (persistent=%s)",
self.instance_id, self._persistent)
except subprocess.TimeoutExpired:
raise RuntimeError("Instance start timed out")
@ -151,17 +216,63 @@ class SingularityEnvironment(BaseEnvironment):
if not self._instance_started:
return {"output": "Instance not started", "returncode": -1}
effective_timeout = timeout or self.timeout
cmd = [self.executable, "exec", "--pwd", cwd or self.cwd,
f"instance://{self.instance_id}",
"bash", "-c", self._prepare_command(command)]
try:
result = subprocess.run(cmd, **self._build_run_kwargs(timeout, stdin_data))
return {"output": result.stdout, "returncode": result.returncode}
except subprocess.TimeoutExpired:
return self._timeout_result(timeout)
import time as _time
_output_chunks = []
proc = subprocess.Popen(
cmd,
stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
stdin=subprocess.PIPE if stdin_data else subprocess.DEVNULL,
text=True,
)
if stdin_data:
try:
proc.stdin.write(stdin_data)
proc.stdin.close()
except Exception:
pass
def _drain():
try:
for line in proc.stdout:
_output_chunks.append(line)
except Exception:
pass
reader = threading.Thread(target=_drain, daemon=True)
reader.start()
deadline = _time.monotonic() + effective_timeout
while proc.poll() is None:
if is_interrupted():
proc.terminate()
try:
proc.wait(timeout=1)
except subprocess.TimeoutExpired:
proc.kill()
reader.join(timeout=2)
return {
"output": "".join(_output_chunks) + "\n[Command interrupted]",
"returncode": 130,
}
if _time.monotonic() > deadline:
proc.kill()
reader.join(timeout=2)
return self._timeout_result(effective_timeout)
_time.sleep(0.2)
reader.join(timeout=5)
return {"output": "".join(_output_chunks), "returncode": proc.returncode}
except Exception as e:
return {"output": f"Singularity execution error: {e}", "returncode": 1}
def cleanup(self):
"""Stop the instance. If persistent, the overlay dir survives for next creation."""
if self._instance_started:
try:
subprocess.run(
@ -172,3 +283,9 @@ class SingularityEnvironment(BaseEnvironment):
except Exception as e:
logger.warning("Failed to stop Singularity instance %s: %s", self.instance_id, e)
self._instance_started = False
# Record overlay path for persistence restoration
if self._persistent and self._overlay_dir:
snapshots = _load_snapshots()
snapshots[self._task_id] = str(self._overlay_dir)
_save_snapshots(snapshots)

View file

@ -3,9 +3,12 @@
import logging
import subprocess
import tempfile
import threading
import time
from pathlib import Path
from tools.environments.base import BaseEnvironment
from tools.interrupt import is_interrupted
logger = logging.getLogger(__name__)
@ -16,6 +19,9 @@ class SSHEnvironment(BaseEnvironment):
Uses SSH ControlMaster for connection persistence so subsequent
commands are fast. Security benefit: the agent cannot modify its
own code since execution happens on a separate machine.
Foreground commands are interruptible: the local ssh process is killed
and a remote kill is attempted over the ControlMaster socket.
"""
def __init__(self, host: str, user: str, cwd: str = "/tmp",
@ -65,15 +71,65 @@ class SSHEnvironment(BaseEnvironment):
work_dir = cwd or self.cwd
exec_command = self._prepare_command(command)
wrapped = f'cd {work_dir} && {exec_command}'
effective_timeout = timeout or self.timeout
cmd = self._build_ssh_command()
cmd.extend(["bash", "-c", wrapped])
try:
result = subprocess.run(cmd, **self._build_run_kwargs(timeout, stdin_data))
return {"output": result.stdout, "returncode": result.returncode}
except subprocess.TimeoutExpired:
return self._timeout_result(timeout)
kwargs = self._build_run_kwargs(timeout, stdin_data)
# Remove timeout from kwargs -- we handle it in the poll loop
kwargs.pop("timeout", None)
_output_chunks = []
proc = subprocess.Popen(
cmd,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
stdin=subprocess.PIPE if stdin_data else subprocess.DEVNULL,
text=True,
)
if stdin_data:
try:
proc.stdin.write(stdin_data)
proc.stdin.close()
except Exception:
pass
def _drain():
try:
for line in proc.stdout:
_output_chunks.append(line)
except Exception:
pass
reader = threading.Thread(target=_drain, daemon=True)
reader.start()
deadline = time.monotonic() + effective_timeout
while proc.poll() is None:
if is_interrupted():
proc.terminate()
try:
proc.wait(timeout=1)
except subprocess.TimeoutExpired:
proc.kill()
reader.join(timeout=2)
return {
"output": "".join(_output_chunks) + "\n[Command interrupted]",
"returncode": 130,
}
if time.monotonic() > deadline:
proc.kill()
reader.join(timeout=2)
return self._timeout_result(effective_timeout)
time.sleep(0.2)
reader.join(timeout=5)
return {"output": "".join(_output_chunks), "returncode": proc.returncode}
except Exception as e:
return {"output": f"SSH execution error: {str(e)}", "returncode": 1}