"""Use cases for creating sandbox sessions and cleaning up expired ones."""
from dataclasses import dataclass
from datetime import timedelta
from uuid import UUID, uuid4

from domain.sandbox import SandboxSession
from usecase.interface import (
    Clock,
    Logger,
    Metrics,
    SandboxLifecycleLocker,
    SandboxRuntime,
    SandboxSessionRepository,
    Tracer,
)


@dataclass(frozen=True, slots=True)
class CreateSandboxCommand:
    """Input for ``CreateSandbox.execute``."""

    # Chat the sandbox is requested for; used as the lock key and as the
    # repository lookup key in ``CreateSandbox.execute``.
    chat_id: UUID


class CreateSandbox:
    """Return a chat's active sandbox session, creating one when needed.

    An existing session is reused while its ``expires_at`` is later than the
    current clock reading. Otherwise a new session is created through the
    runtime and saved; an existing but expired session is stopped and deleted
    first.
    """

    def __init__(
        self,
        repository: SandboxSessionRepository,
        locker: SandboxLifecycleLocker,
        runtime: SandboxRuntime,
        clock: Clock,
        logger: Logger,
        metrics: Metrics,
        tracer: Tracer,
        ttl: timedelta,
    ) -> None:
        """Store the collaborators; ``ttl`` is added to the creation time to
        compute a new session's ``expires_at``."""
        self._repository = repository
        self._locker = locker
        self._runtime = runtime
        self._clock = clock
        self._logger = logger
        self._metrics = metrics
        self._tracer = tracer
        self._ttl = ttl

    def execute(self, command: CreateSandboxCommand) -> SandboxSession:
        """Reuse, replace, or create the sandbox session for ``command.chat_id``.

        All repository and runtime calls happen inside
        ``self._locker.lock(chat_id)`` and inside a ``usecase.create_sandbox``
        span. The outcome is recorded on the span as ``sandbox.result`` and on
        the ``sandbox.create.total`` metric as ``result``, with one of
        ``reused``, ``created``, ``replaced`` or ``error``.

        Returns:
            The reused session, or the newly created and saved session.

        Raises:
            Exception: any exception raised inside the span body is recorded
                on the span and the metric, then re-raised unchanged.
        """
        chat_id = command.chat_id
        with self._tracer.start_span(
            'usecase.create_sandbox',
            attrs={'chat.id': str(chat_id)},
        ) as span:
            try:
                with self._locker.lock(chat_id):
                    session = self._repository.get_active_by_chat_id(chat_id)
                    now = self._clock.now()
                    # Reuse path: an active session that has not expired yet
                    # is returned as-is, without touching the runtime.
                    if session is not None and session.expires_at > now:
                        span.set_attribute('session.id', str(session.session_id))
                        span.set_attribute('container.id', session.container_id)
                        span.set_attribute('sandbox.result', 'reused')
                        self._metrics.increment(
                            'sandbox.create.total',
                            attrs=_result_metric_attrs('reused'),
                        )
                        self._logger.info(
                            'sandbox_reused',
                            attrs=_sandbox_attrs(session),
                        )
                        return session

                    result = 'created'
                    new_session_id: UUID | None = None
                    # Replace path: a session exists but failed the expiry
                    # check above, so tear it down before creating a new one.
                    if session is not None:
                        result = 'replaced'
                        # The new id is generated up front so the span can
                        # carry both the previous and the new session ids.
                        new_session_id = _new_session_id()
                        span.set_attribute(
                            'sandbox.previous_session.id',
                            str(session.session_id),
                        )
                        span.set_attribute(
                            'sandbox.previous_container.id',
                            session.container_id,
                        )
                        span.set_attribute(
                            'sandbox.new_session.id',
                            str(new_session_id),
                        )
                        # Logged before the stop/delete calls below are made.
                        self._logger.info(
                            'sandbox_replaced',
                            attrs=_sandbox_attrs(session),
                        )
                        self._runtime.stop(session.container_id)
                        self._repository.delete(session.session_id)
                        # NOTE(review): nesting of this call under the `if`
                        # was reconstructed from a flattened source - confirm.
                        _set_active_count(self._metrics, self._repository)

                    # The clock is read again here, so `created_at` can be
                    # later than the `now` used for the expiry check.
                    created_at = self._clock.now()
                    expires_at = created_at + self._ttl
                    if new_session_id is None:
                        new_session_id = _new_session_id()
                        # NOTE(review): placing this attribute inside the `if`
                        # was reconstructed from a flattened source - confirm.
                        span.set_attribute('session.id', str(new_session_id))
                    new_session = self._runtime.create(
                        session_id=new_session_id,
                        chat_id=chat_id,
                        created_at=created_at,
                        expires_at=expires_at,
                    )
                    if result == 'replaced':
                        span.set_attribute(
                            'sandbox.new_container.id',
                            new_session.container_id,
                        )
                    # Stops the new container and re-raises if saving fails.
                    self._save_created_session(new_session)
                    _set_active_count(self._metrics, self._repository)
                    # On the replace path `session.id` has not been set yet;
                    # it is set only once the new session has been saved.
                    # NOTE(review): only this first attribute is assumed to be
                    # guarded by the `if` - confirm against the original file.
                    if result == 'replaced':
                        span.set_attribute('session.id', str(new_session.session_id))
                    span.set_attribute('container.id', new_session.container_id)
                    span.set_attribute('sandbox.result', result)
                    self._metrics.increment(
                        'sandbox.create.total',
                        attrs=_result_metric_attrs(result),
                    )
                    self._logger.info(
                        'sandbox_created',
                        attrs=_sandbox_attrs(new_session),
                    )
                    return new_session
            except Exception as exc:
                # Record the failure on the span and the metric, then let the
                # original exception propagate to the caller.
                span.set_attribute('sandbox.result', 'error')
                self._metrics.increment(
                    'sandbox.create.total',
                    attrs=_result_metric_attrs('error'),
                )
                span.record_error(exc)
                raise

    def _save_created_session(self, session: SandboxSession) -> None:
        """Save ``session``; on failure stop its container and re-raise.

        The exception that reaches the caller is always the one raised by
        ``repository.save``.
        """
        try:
            self._repository.save(session)
        except Exception as exc:
            self._compensate_save_failure(session, exc)
            raise

    def _compensate_save_failure(
        self,
        session: SandboxSession,
        error: Exception,
    ) -> None:
        """Stop the container of a session that could not be saved.

        The active-count metric is refreshed whether or not the stop
        succeeds. Returns normally when the stop succeeds, leaving the caller
        to re-raise ``error``.

        Raises:
            Exception: ``error`` itself, chained from the stop failure, when
                ``runtime.stop`` raises.
        """
        try:
            self._runtime.stop(session.container_id)
        except Exception as stop_error:
            _set_active_count(self._metrics, self._repository)
            # Keep the save failure as the raised exception; the stop failure
            # is attached to it as `__cause__`.
            raise error from stop_error
        _set_active_count(self._metrics, self._repository)


class CleanupExpiredSandboxes:
    """Stop and delete sandbox sessions that the repository lists as expired."""

    def __init__(
        self,
        repository: SandboxSessionRepository,
        locker: SandboxLifecycleLocker,
        runtime: SandboxRuntime,
        clock: Clock,
        logger: Logger,
        metrics: Metrics,
        tracer: Tracer,
    ) -> None:
        """Store the collaborators used by ``execute``."""
        self._repository = repository
        self._locker = locker
        self._runtime = runtime
        self._clock = clock
        self._logger = logger
        self._metrics = metrics
        self._tracer = tracer

    def execute(self) -> list[SandboxSession]:
        """Clean up every session returned by ``repository.list_expired``.

        Each session is processed in its own ``usecase.cleanup_expired_sandbox``
        span. A failure while cleaning one session is counted, logged and
        recorded on the ``sandbox.cleanup.error.total`` metric, and the loop
        moves on to the next session.

        Returns:
            The sessions that were actually stopped and deleted; skipped and
            failed sessions are not included.

        Raises:
            Exception: whatever ``repository.list_expired`` (or the clock
                read feeding it) raises, after it is recorded on the span
                and the error metric.
        """
        cleaned_sessions: list[SandboxSession] = []
        error_count = 0
        with self._tracer.start_span(
            'usecase.cleanup_expired_sandboxes',
        ) as span:
            try:
                expired_sessions = self._repository.list_expired(self._clock.now())
            except Exception as exc:
                span.set_attribute('sandbox.result', 'error')
                # Listing failures carry an extra `reason` attribute; the
                # per-session failures below do not.
                self._metrics.increment(
                    'sandbox.cleanup.error.total',
                    attrs=_cleanup_error_metric_attrs(
                        type(exc).__name__,
                        'list_expired',
                    ),
                )
                span.record_error(exc)
                raise

            span.set_attribute('sandbox.expired_count', len(expired_sessions))
            for session in expired_sessions:
                with self._tracer.start_span(
                    'usecase.cleanup_expired_sandbox',
                    attrs=_sandbox_span_attrs(session),
                ) as cleanup_span:
                    try:
                        cleaned_session = self._cleanup_session(session)
                    except Exception as exc:
                        # One failing session must not abort the whole run:
                        # record it and continue with the next session.
                        error_count += 1
                        cleanup_span.set_attribute('sandbox.result', 'error')
                        cleanup_span.record_error(exc)
                        self._metrics.increment(
                            'sandbox.cleanup.error.total',
                            attrs=_error_metric_attrs(type(exc).__name__),
                        )
                        attrs = _sandbox_attrs(session)
                        attrs['error'] = type(exc).__name__
                        self._logger.error(
                            'sandbox_clean_failed',
                            attrs=attrs,
                        )
                        continue

                    # `None` means the re-check under the lock decided there
                    # was nothing to clean for this session.
                    if cleaned_session is None:
                        cleanup_span.set_attribute('sandbox.result', 'skipped')
                        continue

                    cleanup_span.set_attribute('sandbox.result', 'cleaned')
                    cleaned_sessions.append(cleaned_session)
                    self._metrics.increment(
                        'sandbox.cleanup.total',
                        attrs=_result_metric_attrs('cleaned'),
                    )
                    self._logger.info(
                        'sandbox_cleaned',
                        attrs=_sandbox_attrs(cleaned_session),
                    )

            span.set_attribute('sandbox.cleaned_count', len(cleaned_sessions))
            span.set_attribute('sandbox.error_count', error_count)
            span.set_attribute(
                'sandbox.result',
                'completed' if error_count == 0 else 'completed_with_errors',
            )
        return cleaned_sessions

    def _cleanup_session(self, session: SandboxSession) -> SandboxSession | None:
        """Stop and delete ``session`` if it is still the chat's expired session.

        The chat's active session is re-read while holding the chat lock,
        because the expired list was fetched without it.

        Returns:
            The session that was stopped and deleted, or ``None`` when the
            chat has no active session, its active session has a different
            ``session_id``, or that session's ``expires_at`` is later than
            the current clock reading.
        """
        with self._locker.lock(session.chat_id):
            current_session = self._repository.get_active_by_chat_id(session.chat_id)
            now = self._clock.now()
            if current_session is None:
                return None
            if current_session.session_id != session.session_id:
                return None
            if current_session.expires_at > now:
                return None

            self._runtime.stop(current_session.container_id)
            self._repository.delete(current_session.session_id)
            _set_active_count(self._metrics, self._repository)
            return current_session


def _new_session_id() -> UUID:
    """Return a fresh random UUID (``uuid4``) for a new session."""
    return uuid4()


def _sandbox_attrs(session: SandboxSession) -> dict[str, str]:
    """Build the log attributes for ``session`` (underscore-style keys)."""
    return {
        'chat_id': str(session.chat_id),
        'session_id': str(session.session_id),
        'container_id': session.container_id,
    }


def _sandbox_span_attrs(session: SandboxSession) -> dict[str, str]:
    """Build the span attributes for ``session`` (dotted keys)."""
    return {
        'chat.id': str(session.chat_id),
        'session.id': str(session.session_id),
        'container.id': session.container_id,
    }


def _result_metric_attrs(result: str) -> dict[str, str]:
    """Build metric attributes carrying only the ``result`` label."""
    return {'result': result}


def _error_metric_attrs(error_type: str) -> dict[str, str]:
    """Build metric attributes carrying only the ``error.type`` label."""
    return {'error.type': error_type}


def _cleanup_error_metric_attrs(
    error_type: str,
    reason: str,
) -> dict[str, str]:
    """Build metric attributes with both ``error.type`` and ``reason``."""
    return {
        'error.type': error_type,
        'reason': reason,
    }


def _set_active_count(
    metrics: Metrics,
    repository: SandboxSessionRepository,
) -> None:
    """Set the ``sandbox.active.count`` metric to ``repository.count_active()``."""
    metrics.set('sandbox.active.count', repository.count_active())