Refactor file operations and environment management in file_tools and terminal_tool

- Improved the caching mechanism for ShellFileOperations to ensure stale entries are invalidated when environments are cleaned up.
- Enhanced thread safety by refining the use of locks during environment creation and cleanup processes.
- Streamlined the cleanup of inactive environments to prevent blocking other tool calls, ensuring efficient resource management.
- Added error handling and messaging improvements for better user feedback during environment cleanup.
This commit is contained in:
teknium1 2026-02-16 19:37:40 -08:00
parent 01a3a6ab0d
commit 8117d0adab
2 changed files with 146 additions and 124 deletions

View file

@ -17,43 +17,49 @@ def _get_file_ops(task_id: str = "default") -> ShellFileOperations:
Respects the TERMINAL_ENV setting -- if the task_id doesn't have an Respects the TERMINAL_ENV setting -- if the task_id doesn't have an
environment yet, creates one using the configured backend (local, docker, environment yet, creates one using the configured backend (local, docker,
modal, etc.) rather than always defaulting to local. modal, etc.) rather than always defaulting to local.
Thread-safe: uses the same per-task creation locks as terminal_tool to
prevent duplicate sandbox creation from concurrent tool calls.
""" """
from tools.terminal_tool import ( from tools.terminal_tool import (
_active_environments, _env_lock, _create_environment, _active_environments, _env_lock, _create_environment,
_get_env_config, _last_activity, _start_cleanup_thread, _get_env_config, _last_activity, _start_cleanup_thread,
_check_disk_usage_warning, _check_disk_usage_warning,
_creation_locks, _creation_locks_lock,
) )
import time import time
# Fast path: check cache without heavy locks # Fast path: check cache -- but also verify the underlying environment
# is still alive (it may have been killed by the cleanup thread).
with _file_ops_lock: with _file_ops_lock:
if task_id in _file_ops_cache: cached = _file_ops_cache.get(task_id)
return _file_ops_cache[task_id] if cached is not None:
# Check if we need to create a new environment.
# Uses the same per-task creation locks as terminal_tool to prevent
# duplicate sandbox creation from concurrent tool calls.
from tools.terminal_tool import _creation_locks, _creation_locks_lock
needs_creation = False
with _env_lock: with _env_lock:
if task_id not in _active_environments: if task_id in _active_environments:
needs_creation = True _last_activity[task_id] = time.time()
return cached
else:
# Environment was cleaned up -- invalidate stale cache entry
with _file_ops_lock:
_file_ops_cache.pop(task_id, None)
if needs_creation: # Need to ensure the environment exists before building file_ops.
# Per-task lock: only one thread creates the sandbox, others wait # Acquire per-task lock so only one thread creates the sandbox.
with _creation_locks_lock: with _creation_locks_lock:
if task_id not in _creation_locks: if task_id not in _creation_locks:
_creation_locks[task_id] = __import__("threading").Lock() _creation_locks[task_id] = threading.Lock()
task_lock = _creation_locks[task_id] task_lock = _creation_locks[task_id]
with task_lock: with task_lock:
# Double-check after acquiring the per-task lock # Double-check: another thread may have created it while we waited
with _env_lock: with _env_lock:
if task_id in _active_environments: if task_id in _active_environments:
needs_creation = False _last_activity[task_id] = time.time()
terminal_env = _active_environments[task_id]
else:
terminal_env = None
if needs_creation: if terminal_env is None:
from tools.terminal_tool import _task_env_overrides from tools.terminal_tool import _task_env_overrides
config = _get_env_config() config = _get_env_config()
@ -73,7 +79,7 @@ def _get_file_ops(task_id: str = "default") -> ShellFileOperations:
if not os.getenv("HERMES_QUIET"): if not os.getenv("HERMES_QUIET"):
print(f"[FileTools] Creating new {env_type} environment for task {task_id[:8]}...", flush=True) print(f"[FileTools] Creating new {env_type} environment for task {task_id[:8]}...", flush=True)
new_env = _create_environment( terminal_env = _create_environment(
env_type=env_type, env_type=env_type,
image=image, image=image,
cwd=cwd, cwd=cwd,
@ -81,18 +87,14 @@ def _get_file_ops(task_id: str = "default") -> ShellFileOperations:
) )
with _env_lock: with _env_lock:
_active_environments[task_id] = new_env _active_environments[task_id] = terminal_env
_last_activity[task_id] = __import__("time").time() _last_activity[task_id] = time.time()
_start_cleanup_thread() _start_cleanup_thread()
if not os.getenv("HERMES_QUIET"): if not os.getenv("HERMES_QUIET"):
print(f"[FileTools] {env_type} environment ready for task {task_id[:8]}", flush=True) print(f"[FileTools] {env_type} environment ready for task {task_id[:8]}", flush=True)
# Now get the environment and build file_ops # Build file_ops from the (guaranteed live) environment and cache it
with _env_lock:
_last_activity[task_id] = time.time()
terminal_env = _active_environments[task_id]
file_ops = ShellFileOperations(terminal_env) file_ops = ShellFileOperations(terminal_env)
with _file_ops_lock: with _file_ops_lock:
_file_ops_cache[task_id] = file_ops _file_ops_cache[task_id] = file_ops

View file

@ -1274,18 +1274,39 @@ def _cleanup_inactive_envs(lifetime_seconds: int = 300):
global _active_environments, _last_activity global _active_environments, _last_activity
current_time = time.time() current_time = time.time()
tasks_to_cleanup = []
# Phase 1: collect stale entries and remove them from tracking dicts while
# holding the lock. Do NOT call env.cleanup() inside the lock -- Modal and
# Docker teardown can block for 10-15s, which would stall every concurrent
# terminal/file tool call waiting on _env_lock.
envs_to_stop = [] # list of (task_id, env) pairs
with _env_lock: with _env_lock:
for task_id, last_time in list(_last_activity.items()): for task_id, last_time in list(_last_activity.items()):
if current_time - last_time > lifetime_seconds: if current_time - last_time > lifetime_seconds:
tasks_to_cleanup.append(task_id) env = _active_environments.pop(task_id, None)
_last_activity.pop(task_id, None)
_task_workdirs.pop(task_id, None)
if env is not None:
envs_to_stop.append((task_id, env))
# Also purge per-task creation locks for cleaned-up tasks
with _creation_locks_lock:
for task_id, _ in envs_to_stop:
_creation_locks.pop(task_id, None)
# Phase 2: stop the actual sandboxes OUTSIDE the lock so other tool calls
# are not blocked while Modal/Docker sandboxes shut down.
for task_id, env in envs_to_stop:
# Invalidate stale file_ops cache entry (Bug fix: prevents
# ShellFileOperations from referencing a dead sandbox)
try:
from tools.file_tools import clear_file_ops_cache
clear_file_ops_cache(task_id)
except ImportError:
pass
for task_id in tasks_to_cleanup:
try: try:
if task_id in _active_environments:
env = _active_environments[task_id]
# Try various cleanup methods
if hasattr(env, 'cleanup'): if hasattr(env, 'cleanup'):
env.cleanup() env.cleanup()
elif hasattr(env, 'stop'): elif hasattr(env, 'stop'):
@ -1293,15 +1314,9 @@ def _cleanup_inactive_envs(lifetime_seconds: int = 300):
elif hasattr(env, 'terminate'): elif hasattr(env, 'terminate'):
env.terminate() env.terminate()
del _active_environments[task_id]
if not os.getenv("HERMES_QUIET"): if not os.getenv("HERMES_QUIET"):
print(f"[Terminal Cleanup] Cleaned up inactive environment for task: {task_id}") print(f"[Terminal Cleanup] Cleaned up inactive environment for task: {task_id}")
if task_id in _last_activity:
del _last_activity[task_id]
if task_id in _task_workdirs:
del _task_workdirs[task_id]
except Exception as e: except Exception as e:
error_str = str(e) error_str = str(e)
if not os.getenv("HERMES_QUIET"): if not os.getenv("HERMES_QUIET"):
@ -1310,14 +1325,6 @@ def _cleanup_inactive_envs(lifetime_seconds: int = 300):
else: else:
print(f"[Terminal Cleanup] Error cleaning up environment for task {task_id}: {e}") print(f"[Terminal Cleanup] Error cleaning up environment for task {task_id}: {e}")
# Always remove from tracking dicts
if task_id in _active_environments:
del _active_environments[task_id]
if task_id in _last_activity:
del _last_activity[task_id]
if task_id in _task_workdirs:
del _task_workdirs[task_id]
def _cleanup_thread_worker(): def _cleanup_thread_worker():
"""Background thread worker that periodically cleans up inactive environments.""" """Background thread worker that periodically cleans up inactive environments."""
@ -1415,10 +1422,30 @@ def cleanup_vm(task_id: str):
"""Manually clean up a specific environment by task_id.""" """Manually clean up a specific environment by task_id."""
global _active_environments, _last_activity, _task_workdirs global _active_environments, _last_activity, _task_workdirs
# Remove from tracking dicts while holding the lock, but defer the
# actual (potentially slow) env.cleanup() call to outside the lock
# so other tool calls aren't blocked.
env = None
with _env_lock: with _env_lock:
env = _active_environments.pop(task_id, None)
_task_workdirs.pop(task_id, None)
_last_activity.pop(task_id, None)
# Clean up per-task creation lock
with _creation_locks_lock:
_creation_locks.pop(task_id, None)
# Invalidate stale file_ops cache entry
try:
from tools.file_tools import clear_file_ops_cache
clear_file_ops_cache(task_id)
except ImportError:
pass
if env is None:
return
try: try:
if task_id in _active_environments:
env = _active_environments[task_id]
if hasattr(env, 'cleanup'): if hasattr(env, 'cleanup'):
env.cleanup() env.cleanup()
elif hasattr(env, 'stop'): elif hasattr(env, 'stop'):
@ -1426,16 +1453,9 @@ def cleanup_vm(task_id: str):
elif hasattr(env, 'terminate'): elif hasattr(env, 'terminate'):
env.terminate() env.terminate()
del _active_environments[task_id]
if not os.getenv("HERMES_QUIET"): if not os.getenv("HERMES_QUIET"):
print(f"[Terminal Cleanup] Manually cleaned up environment for task: {task_id}") print(f"[Terminal Cleanup] Manually cleaned up environment for task: {task_id}")
if task_id in _task_workdirs:
del _task_workdirs[task_id]
if task_id in _last_activity:
del _last_activity[task_id]
except Exception as e: except Exception as e:
if not os.getenv("HERMES_QUIET"): if not os.getenv("HERMES_QUIET"):
error_str = str(e) error_str = str(e)