From 4cdf6e45de0363a2db350ca8e78b4125f1103610 Mon Sep 17 00:00:00 2001 From: Azamat Date: Fri, 3 Apr 2026 00:56:37 +0300 Subject: [PATCH] instrument sandbox usecases --- adapter/di/container.py | 4 + tasks.md | 2 +- test/test_create_http.py | 4 + test/test_sandbox_usecase.py | 15 +++ usecase/sandbox.py | 215 +++++++++++++++++++++++++++-------- 5 files changed, 191 insertions(+), 49 deletions(-) diff --git a/adapter/di/container.py b/adapter/di/container.py index ace3a42..4b87b2f 100644 --- a/adapter/di/container.py +++ b/adapter/di/container.py @@ -96,6 +96,8 @@ def build_container( runtime=sandbox_runtime, clock=clock, logger=observability.logger, + metrics=observability.metrics, + tracer=observability.tracer, ttl=timedelta(seconds=app_config.sandbox.ttl_seconds), ), cleanup_expired_sandboxes=CleanupExpiredSandboxes( @@ -104,6 +106,8 @@ def build_container( runtime=sandbox_runtime, clock=clock, logger=observability.logger, + metrics=observability.metrics, + tracer=observability.tracer, ), ) diff --git a/tasks.md b/tasks.md index d24657c..a43d96a 100644 --- a/tasks.md +++ b/tasks.md @@ -245,7 +245,7 @@ ### M20. Трейсы и метрики в sandbox usecases - Субагент: `feature-developer` -- Статус: pending +- Статус: completed - Зависимости: `M19` - Commit required: yes - Commit message: `instrument sandbox usecases` diff --git a/test/test_create_http.py b/test/test_create_http.py index 652644b..a25eaba 100644 --- a/test/test_create_http.py +++ b/test/test_create_http.py @@ -502,6 +502,8 @@ def test_startup_reconciliation_reuses_existing_container_after_restart( runtime=runtime, clock=FakeClock(created_at), logger=logger, + metrics=NoopMetrics(), + tracer=NoopTracer(), ttl=timedelta(minutes=5), ), cleanup_expired_sandboxes=CleanupExpiredSandboxes( @@ -510,6 +512,8 @@ def test_startup_reconciliation_reuses_existing_container_after_restart( runtime=runtime, clock=FakeClock(created_at), logger=logger, + metrics=NoopMetrics(), + tracer=NoopTracer(), ), ) container = AppContainer( diff --git a/test/test_sandbox_usecase.py b/test/test_sandbox_usecase.py index 4fedb21..f744492 100644 --- a/test/test_sandbox_usecase.py +++ b/test/test_sandbox_usecase.py @@ -2,6 +2,7 @@ import threading from datetime import UTC, datetime, timedelta from uuid import UUID +from adapter.observability.noop import NoopMetrics, NoopTracer from domain.sandbox import SandboxSession, SandboxStatus from repository.sandbox_lock import ProcessLocalSandboxLifecycleLocker from repository.sandbox_session import InMemorySandboxSessionRepository @@ -218,6 +219,8 @@ def test_create_sandbox_reuses_active_session_when_not_expired() -> None: runtime=runtime, clock=FakeClock(now), logger=logger, + metrics=NoopMetrics(), + tracer=NoopTracer(), ttl=timedelta(minutes=5), ) @@ -264,6 +267,8 @@ def test_create_sandbox_replaces_expired_session_and_creates_new_one( runtime=runtime, clock=FakeClock(now), logger=logger, + metrics=NoopMetrics(), + tracer=NoopTracer(), ttl=timedelta(minutes=5), ) monkeypatch.setattr('usecase.sandbox._new_session_id', lambda: SESSION_NEW_ID) @@ -323,6 +328,8 @@ def test_create_sandbox_creates_new_session_when_none_exists() -> None: runtime=runtime, clock=FakeClock(now), logger=logger, + metrics=NoopMetrics(), + tracer=NoopTracer(), ttl=timedelta(minutes=5), ) @@ -370,6 +377,8 @@ def test_create_sandbox_serializes_duplicate_concurrent_create_for_chat_id( runtime=runtime, clock=FakeClock(now), logger=logger, + metrics=NoopMetrics(), + tracer=NoopTracer(), ttl=timedelta(minutes=5), ) monkeypatch.setattr('usecase.sandbox._new_session_id', lambda: SESSION_NEW_ID) @@ -473,6 +482,8 @@ def test_cleanup_expired_sandboxes_stops_and_deletes_only_expired_sessions() -> runtime=runtime, clock=FakeClock(now), logger=logger, + metrics=NoopMetrics(), + tracer=NoopTracer(), ) result = usecase.execute() @@ -534,6 +545,8 @@ def test_cleanup_expired_sandboxes_skips_replaced_session_from_stale_snapshot() runtime=runtime, clock=FakeClock(now), logger=logger, + metrics=NoopMetrics(), + tracer=NoopTracer(), ) result = usecase.execute() @@ -575,6 +588,8 @@ def test_cleanup_expired_sandboxes_continues_after_stop_failure() -> None: runtime=runtime, clock=FakeClock(now), logger=logger, + metrics=NoopMetrics(), + tracer=NoopTracer(), ) result = usecase.execute() diff --git a/usecase/sandbox.py b/usecase/sandbox.py index 83ee39d..2bdb369 100644 --- a/usecase/sandbox.py +++ b/usecase/sandbox.py @@ -6,9 +6,11 @@ from domain.sandbox import SandboxSession from usecase.interface import ( Clock, Logger, + Metrics, SandboxLifecycleLocker, SandboxRuntime, SandboxSessionRepository, + Tracer, ) @@ -25,6 +27,8 @@ class CreateSandbox: runtime: SandboxRuntime, clock: Clock, logger: Logger, + metrics: Metrics, + tracer: Tracer, ttl: timedelta, ) -> None: self._repository = repository @@ -32,44 +36,80 @@ class CreateSandbox: self._runtime = runtime self._clock = clock self._logger = logger + self._metrics = metrics + self._tracer = tracer self._ttl = ttl def execute(self, command: CreateSandboxCommand) -> SandboxSession: chat_id = command.chat_id - with self._locker.lock(chat_id): - session = self._repository.get_active_by_chat_id(chat_id) - now = self._clock.now() + with self._tracer.start_span( + 'usecase.create_sandbox', + attrs={'chat.id': str(chat_id)}, + ) as span: + try: + with self._locker.lock(chat_id): + session = self._repository.get_active_by_chat_id(chat_id) + now = self._clock.now() - if session is not None and session.expires_at > now: - self._logger.info( - 'sandbox_reused', - attrs=_sandbox_attrs(session), + if session is not None and session.expires_at > now: + span.set_attribute('session.id', str(session.session_id)) + span.set_attribute('container.id', session.container_id) + span.set_attribute('sandbox.result', 'reused') + self._metrics.increment( + 'sandbox.create.total', + attrs=_result_metric_attrs('reused'), + ) + self._logger.info( + 'sandbox_reused', + attrs=_sandbox_attrs(session), + ) + return session + + result = 'created' + if session is not None: + result = 'replaced' + span.set_attribute('session.id', str(session.session_id)) + span.set_attribute('container.id', session.container_id) + self._logger.info( + 'sandbox_replaced', + attrs=_sandbox_attrs(session), + ) + self._runtime.stop(session.container_id) + self._repository.delete(session.session_id) + _set_active_count(self._metrics, self._repository) + + created_at = self._clock.now() + expires_at = created_at + self._ttl + session_id = _new_session_id() + span.set_attribute('session.id', str(session_id)) + new_session = self._runtime.create( + session_id=session_id, + chat_id=chat_id, + created_at=created_at, + expires_at=expires_at, + ) + self._repository.save(new_session) + _set_active_count(self._metrics, self._repository) + span.set_attribute('container.id', new_session.container_id) + span.set_attribute('sandbox.result', result) + self._metrics.increment( + 'sandbox.create.total', + attrs=_result_metric_attrs(result), + ) + self._logger.info( + 'sandbox_created', + attrs=_sandbox_attrs(new_session), + ) + return new_session + except Exception as exc: + span.set_attribute('sandbox.result', 'error') + self._metrics.increment( + 'sandbox.create.total', + attrs=_result_metric_attrs('error'), ) - return session - - if session is not None: - self._logger.info( - 'sandbox_replaced', - attrs=_sandbox_attrs(session), - ) - self._runtime.stop(session.container_id) - self._repository.delete(session.session_id) - - created_at = self._clock.now() - expires_at = created_at + self._ttl - new_session = self._runtime.create( - session_id=_new_session_id(), - chat_id=chat_id, - created_at=created_at, - expires_at=expires_at, - ) - self._repository.save(new_session) - self._logger.info( - 'sandbox_created', - attrs=_sandbox_attrs(new_session), - ) - return new_session + span.record_error(exc) + raise class CleanupExpiredSandboxes: @@ -80,39 +120,84 @@ class CleanupExpiredSandboxes: runtime: SandboxRuntime, clock: Clock, logger: Logger, + metrics: Metrics, + tracer: Tracer, ) -> None: self._repository = repository self._locker = locker self._runtime = runtime self._clock = clock self._logger = logger + self._metrics = metrics + self._tracer = tracer def execute(self) -> list[SandboxSession]: - expired_sessions = self._repository.list_expired(self._clock.now()) cleaned_sessions: list[SandboxSession] = [] + error_count = 0 - for session in expired_sessions: + with self._tracer.start_span( + 'usecase.cleanup_expired_sandboxes', + ) as span: try: - cleaned_session = self._cleanup_session(session) + expired_sessions = self._repository.list_expired(self._clock.now()) except Exception as exc: - attrs = _sandbox_attrs(session) - attrs['error'] = type(exc).__name__ - self._logger.error( - 'sandbox_clean_failed', - attrs=attrs, + span.set_attribute('sandbox.result', 'error') + self._metrics.increment( + 'sandbox.cleanup.error.total', + attrs=_cleanup_error_metric_attrs( + type(exc).__name__, + 'list_expired', + ), ) - continue + span.record_error(exc) + raise - if cleaned_session is None: - continue + span.set_attribute('sandbox.expired_count', len(expired_sessions)) + for session in expired_sessions: + with self._tracer.start_span( + 'usecase.cleanup_expired_sandbox', + attrs=_sandbox_span_attrs(session), + ) as cleanup_span: + try: + cleaned_session = self._cleanup_session(session) + except Exception as exc: + error_count += 1 + cleanup_span.set_attribute('sandbox.result', 'error') + cleanup_span.record_error(exc) + self._metrics.increment( + 'sandbox.cleanup.error.total', + attrs=_error_metric_attrs(type(exc).__name__), + ) + attrs = _sandbox_attrs(session) + attrs['error'] = type(exc).__name__ + self._logger.error( + 'sandbox_clean_failed', + attrs=attrs, + ) + continue - cleaned_sessions.append(cleaned_session) - self._logger.info( - 'sandbox_cleaned', - attrs=_sandbox_attrs(cleaned_session), + if cleaned_session is None: + cleanup_span.set_attribute('sandbox.result', 'skipped') + continue + + cleanup_span.set_attribute('sandbox.result', 'cleaned') + cleaned_sessions.append(cleaned_session) + self._metrics.increment( + 'sandbox.cleanup.total', + attrs=_result_metric_attrs('cleaned'), + ) + self._logger.info( + 'sandbox_cleaned', + attrs=_sandbox_attrs(cleaned_session), + ) + + span.set_attribute('sandbox.cleaned_count', len(cleaned_sessions)) + span.set_attribute('sandbox.error_count', error_count) + span.set_attribute( + 'sandbox.result', + 'completed' if error_count == 0 else 'completed_with_errors', ) - - return cleaned_sessions + return cleaned_sessions def _cleanup_session(self, session: SandboxSession) -> SandboxSession | None: with self._locker.lock(session.chat_id): @@ -129,6 +214,7 @@ class CleanupExpiredSandboxes: self._runtime.stop(current_session.container_id) self._repository.delete(current_session.session_id) + _set_active_count(self._metrics, self._repository) return current_session @@ -142,3 +228,36 @@ def _sandbox_attrs(session: SandboxSession) -> dict[str, str]: 'session_id': str(session.session_id), 'container_id': session.container_id, } + + +def _sandbox_span_attrs(session: SandboxSession) -> dict[str, str]: + return { + 'chat.id': str(session.chat_id), + 'session.id': str(session.session_id), + 'container.id': session.container_id, + } + + +def _result_metric_attrs(result: str) -> dict[str, str]: + return {'result': result} + + +def _error_metric_attrs(error_type: str) -> dict[str, str]: + return {'error.type': error_type} + + +def _cleanup_error_metric_attrs( + error_type: str, + reason: str, +) -> dict[str, str]: + return { + 'error.type': error_type, + 'reason': reason, + } + + +def _set_active_count( + metrics: Metrics, + repository: SandboxSessionRepository, +) -> None: + metrics.set('sandbox.active.count', repository.count_active())