Refactor file operations and environment management in file_tools and terminal_tool
- Improved the caching mechanism for ShellFileOperations to ensure stale entries are invalidated when environments are cleaned up. - Enhanced thread safety by refining the use of locks during environment creation and cleanup processes. - Streamlined the cleanup of inactive environments to prevent blocking other tool calls, ensuring efficient resource management. - Added error handling and messaging improvements for better user feedback during environment cleanup.
This commit is contained in:
parent
01a3a6ab0d
commit
8117d0adab
2 changed files with 146 additions and 124 deletions
|
|
@ -17,43 +17,49 @@ def _get_file_ops(task_id: str = "default") -> ShellFileOperations:
|
||||||
Respects the TERMINAL_ENV setting -- if the task_id doesn't have an
|
Respects the TERMINAL_ENV setting -- if the task_id doesn't have an
|
||||||
environment yet, creates one using the configured backend (local, docker,
|
environment yet, creates one using the configured backend (local, docker,
|
||||||
modal, etc.) rather than always defaulting to local.
|
modal, etc.) rather than always defaulting to local.
|
||||||
|
|
||||||
|
Thread-safe: uses the same per-task creation locks as terminal_tool to
|
||||||
|
prevent duplicate sandbox creation from concurrent tool calls.
|
||||||
"""
|
"""
|
||||||
from tools.terminal_tool import (
|
from tools.terminal_tool import (
|
||||||
_active_environments, _env_lock, _create_environment,
|
_active_environments, _env_lock, _create_environment,
|
||||||
_get_env_config, _last_activity, _start_cleanup_thread,
|
_get_env_config, _last_activity, _start_cleanup_thread,
|
||||||
_check_disk_usage_warning,
|
_check_disk_usage_warning,
|
||||||
|
_creation_locks, _creation_locks_lock,
|
||||||
)
|
)
|
||||||
import time
|
import time
|
||||||
|
|
||||||
# Fast path: check cache without heavy locks
|
# Fast path: check cache -- but also verify the underlying environment
|
||||||
|
# is still alive (it may have been killed by the cleanup thread).
|
||||||
with _file_ops_lock:
|
with _file_ops_lock:
|
||||||
if task_id in _file_ops_cache:
|
cached = _file_ops_cache.get(task_id)
|
||||||
return _file_ops_cache[task_id]
|
if cached is not None:
|
||||||
|
|
||||||
# Check if we need to create a new environment.
|
|
||||||
# Uses the same per-task creation locks as terminal_tool to prevent
|
|
||||||
# duplicate sandbox creation from concurrent tool calls.
|
|
||||||
from tools.terminal_tool import _creation_locks, _creation_locks_lock
|
|
||||||
|
|
||||||
needs_creation = False
|
|
||||||
with _env_lock:
|
with _env_lock:
|
||||||
if task_id not in _active_environments:
|
if task_id in _active_environments:
|
||||||
needs_creation = True
|
_last_activity[task_id] = time.time()
|
||||||
|
return cached
|
||||||
|
else:
|
||||||
|
# Environment was cleaned up -- invalidate stale cache entry
|
||||||
|
with _file_ops_lock:
|
||||||
|
_file_ops_cache.pop(task_id, None)
|
||||||
|
|
||||||
if needs_creation:
|
# Need to ensure the environment exists before building file_ops.
|
||||||
# Per-task lock: only one thread creates the sandbox, others wait
|
# Acquire per-task lock so only one thread creates the sandbox.
|
||||||
with _creation_locks_lock:
|
with _creation_locks_lock:
|
||||||
if task_id not in _creation_locks:
|
if task_id not in _creation_locks:
|
||||||
_creation_locks[task_id] = __import__("threading").Lock()
|
_creation_locks[task_id] = threading.Lock()
|
||||||
task_lock = _creation_locks[task_id]
|
task_lock = _creation_locks[task_id]
|
||||||
|
|
||||||
with task_lock:
|
with task_lock:
|
||||||
# Double-check after acquiring the per-task lock
|
# Double-check: another thread may have created it while we waited
|
||||||
with _env_lock:
|
with _env_lock:
|
||||||
if task_id in _active_environments:
|
if task_id in _active_environments:
|
||||||
needs_creation = False
|
_last_activity[task_id] = time.time()
|
||||||
|
terminal_env = _active_environments[task_id]
|
||||||
|
else:
|
||||||
|
terminal_env = None
|
||||||
|
|
||||||
if needs_creation:
|
if terminal_env is None:
|
||||||
from tools.terminal_tool import _task_env_overrides
|
from tools.terminal_tool import _task_env_overrides
|
||||||
|
|
||||||
config = _get_env_config()
|
config = _get_env_config()
|
||||||
|
|
@ -73,7 +79,7 @@ def _get_file_ops(task_id: str = "default") -> ShellFileOperations:
|
||||||
if not os.getenv("HERMES_QUIET"):
|
if not os.getenv("HERMES_QUIET"):
|
||||||
print(f"[FileTools] Creating new {env_type} environment for task {task_id[:8]}...", flush=True)
|
print(f"[FileTools] Creating new {env_type} environment for task {task_id[:8]}...", flush=True)
|
||||||
|
|
||||||
new_env = _create_environment(
|
terminal_env = _create_environment(
|
||||||
env_type=env_type,
|
env_type=env_type,
|
||||||
image=image,
|
image=image,
|
||||||
cwd=cwd,
|
cwd=cwd,
|
||||||
|
|
@ -81,18 +87,14 @@ def _get_file_ops(task_id: str = "default") -> ShellFileOperations:
|
||||||
)
|
)
|
||||||
|
|
||||||
with _env_lock:
|
with _env_lock:
|
||||||
_active_environments[task_id] = new_env
|
_active_environments[task_id] = terminal_env
|
||||||
_last_activity[task_id] = __import__("time").time()
|
_last_activity[task_id] = time.time()
|
||||||
|
|
||||||
_start_cleanup_thread()
|
_start_cleanup_thread()
|
||||||
if not os.getenv("HERMES_QUIET"):
|
if not os.getenv("HERMES_QUIET"):
|
||||||
print(f"[FileTools] {env_type} environment ready for task {task_id[:8]}", flush=True)
|
print(f"[FileTools] {env_type} environment ready for task {task_id[:8]}", flush=True)
|
||||||
|
|
||||||
# Now get the environment and build file_ops
|
# Build file_ops from the (guaranteed live) environment and cache it
|
||||||
with _env_lock:
|
|
||||||
_last_activity[task_id] = time.time()
|
|
||||||
terminal_env = _active_environments[task_id]
|
|
||||||
|
|
||||||
file_ops = ShellFileOperations(terminal_env)
|
file_ops = ShellFileOperations(terminal_env)
|
||||||
with _file_ops_lock:
|
with _file_ops_lock:
|
||||||
_file_ops_cache[task_id] = file_ops
|
_file_ops_cache[task_id] = file_ops
|
||||||
|
|
|
||||||
|
|
@ -1274,18 +1274,39 @@ def _cleanup_inactive_envs(lifetime_seconds: int = 300):
|
||||||
global _active_environments, _last_activity
|
global _active_environments, _last_activity
|
||||||
|
|
||||||
current_time = time.time()
|
current_time = time.time()
|
||||||
tasks_to_cleanup = []
|
|
||||||
|
# Phase 1: collect stale entries and remove them from tracking dicts while
|
||||||
|
# holding the lock. Do NOT call env.cleanup() inside the lock -- Modal and
|
||||||
|
# Docker teardown can block for 10-15s, which would stall every concurrent
|
||||||
|
# terminal/file tool call waiting on _env_lock.
|
||||||
|
envs_to_stop = [] # list of (task_id, env) pairs
|
||||||
|
|
||||||
with _env_lock:
|
with _env_lock:
|
||||||
for task_id, last_time in list(_last_activity.items()):
|
for task_id, last_time in list(_last_activity.items()):
|
||||||
if current_time - last_time > lifetime_seconds:
|
if current_time - last_time > lifetime_seconds:
|
||||||
tasks_to_cleanup.append(task_id)
|
env = _active_environments.pop(task_id, None)
|
||||||
|
_last_activity.pop(task_id, None)
|
||||||
|
_task_workdirs.pop(task_id, None)
|
||||||
|
if env is not None:
|
||||||
|
envs_to_stop.append((task_id, env))
|
||||||
|
|
||||||
|
# Also purge per-task creation locks for cleaned-up tasks
|
||||||
|
with _creation_locks_lock:
|
||||||
|
for task_id, _ in envs_to_stop:
|
||||||
|
_creation_locks.pop(task_id, None)
|
||||||
|
|
||||||
|
# Phase 2: stop the actual sandboxes OUTSIDE the lock so other tool calls
|
||||||
|
# are not blocked while Modal/Docker sandboxes shut down.
|
||||||
|
for task_id, env in envs_to_stop:
|
||||||
|
# Invalidate stale file_ops cache entry (Bug fix: prevents
|
||||||
|
# ShellFileOperations from referencing a dead sandbox)
|
||||||
|
try:
|
||||||
|
from tools.file_tools import clear_file_ops_cache
|
||||||
|
clear_file_ops_cache(task_id)
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
|
||||||
for task_id in tasks_to_cleanup:
|
|
||||||
try:
|
try:
|
||||||
if task_id in _active_environments:
|
|
||||||
env = _active_environments[task_id]
|
|
||||||
# Try various cleanup methods
|
|
||||||
if hasattr(env, 'cleanup'):
|
if hasattr(env, 'cleanup'):
|
||||||
env.cleanup()
|
env.cleanup()
|
||||||
elif hasattr(env, 'stop'):
|
elif hasattr(env, 'stop'):
|
||||||
|
|
@ -1293,15 +1314,9 @@ def _cleanup_inactive_envs(lifetime_seconds: int = 300):
|
||||||
elif hasattr(env, 'terminate'):
|
elif hasattr(env, 'terminate'):
|
||||||
env.terminate()
|
env.terminate()
|
||||||
|
|
||||||
del _active_environments[task_id]
|
|
||||||
if not os.getenv("HERMES_QUIET"):
|
if not os.getenv("HERMES_QUIET"):
|
||||||
print(f"[Terminal Cleanup] Cleaned up inactive environment for task: {task_id}")
|
print(f"[Terminal Cleanup] Cleaned up inactive environment for task: {task_id}")
|
||||||
|
|
||||||
if task_id in _last_activity:
|
|
||||||
del _last_activity[task_id]
|
|
||||||
if task_id in _task_workdirs:
|
|
||||||
del _task_workdirs[task_id]
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
error_str = str(e)
|
error_str = str(e)
|
||||||
if not os.getenv("HERMES_QUIET"):
|
if not os.getenv("HERMES_QUIET"):
|
||||||
|
|
@ -1310,14 +1325,6 @@ def _cleanup_inactive_envs(lifetime_seconds: int = 300):
|
||||||
else:
|
else:
|
||||||
print(f"[Terminal Cleanup] Error cleaning up environment for task {task_id}: {e}")
|
print(f"[Terminal Cleanup] Error cleaning up environment for task {task_id}: {e}")
|
||||||
|
|
||||||
# Always remove from tracking dicts
|
|
||||||
if task_id in _active_environments:
|
|
||||||
del _active_environments[task_id]
|
|
||||||
if task_id in _last_activity:
|
|
||||||
del _last_activity[task_id]
|
|
||||||
if task_id in _task_workdirs:
|
|
||||||
del _task_workdirs[task_id]
|
|
||||||
|
|
||||||
|
|
||||||
def _cleanup_thread_worker():
|
def _cleanup_thread_worker():
|
||||||
"""Background thread worker that periodically cleans up inactive environments."""
|
"""Background thread worker that periodically cleans up inactive environments."""
|
||||||
|
|
@ -1415,10 +1422,30 @@ def cleanup_vm(task_id: str):
|
||||||
"""Manually clean up a specific environment by task_id."""
|
"""Manually clean up a specific environment by task_id."""
|
||||||
global _active_environments, _last_activity, _task_workdirs
|
global _active_environments, _last_activity, _task_workdirs
|
||||||
|
|
||||||
|
# Remove from tracking dicts while holding the lock, but defer the
|
||||||
|
# actual (potentially slow) env.cleanup() call to outside the lock
|
||||||
|
# so other tool calls aren't blocked.
|
||||||
|
env = None
|
||||||
with _env_lock:
|
with _env_lock:
|
||||||
|
env = _active_environments.pop(task_id, None)
|
||||||
|
_task_workdirs.pop(task_id, None)
|
||||||
|
_last_activity.pop(task_id, None)
|
||||||
|
|
||||||
|
# Clean up per-task creation lock
|
||||||
|
with _creation_locks_lock:
|
||||||
|
_creation_locks.pop(task_id, None)
|
||||||
|
|
||||||
|
# Invalidate stale file_ops cache entry
|
||||||
|
try:
|
||||||
|
from tools.file_tools import clear_file_ops_cache
|
||||||
|
clear_file_ops_cache(task_id)
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
if env is None:
|
||||||
|
return
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if task_id in _active_environments:
|
|
||||||
env = _active_environments[task_id]
|
|
||||||
if hasattr(env, 'cleanup'):
|
if hasattr(env, 'cleanup'):
|
||||||
env.cleanup()
|
env.cleanup()
|
||||||
elif hasattr(env, 'stop'):
|
elif hasattr(env, 'stop'):
|
||||||
|
|
@ -1426,16 +1453,9 @@ def cleanup_vm(task_id: str):
|
||||||
elif hasattr(env, 'terminate'):
|
elif hasattr(env, 'terminate'):
|
||||||
env.terminate()
|
env.terminate()
|
||||||
|
|
||||||
del _active_environments[task_id]
|
|
||||||
if not os.getenv("HERMES_QUIET"):
|
if not os.getenv("HERMES_QUIET"):
|
||||||
print(f"[Terminal Cleanup] Manually cleaned up environment for task: {task_id}")
|
print(f"[Terminal Cleanup] Manually cleaned up environment for task: {task_id}")
|
||||||
|
|
||||||
if task_id in _task_workdirs:
|
|
||||||
del _task_workdirs[task_id]
|
|
||||||
|
|
||||||
if task_id in _last_activity:
|
|
||||||
del _last_activity[task_id]
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
if not os.getenv("HERMES_QUIET"):
|
if not os.getenv("HERMES_QUIET"):
|
||||||
error_str = str(e)
|
error_str = str(e)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue