instrument sandbox usecases

This commit is contained in:
Azamat 2026-04-03 00:56:37 +03:00
parent a86e1ee8c7
commit 4cdf6e45de
5 changed files with 191 additions and 49 deletions

View file

@ -96,6 +96,8 @@ def build_container(
runtime=sandbox_runtime, runtime=sandbox_runtime,
clock=clock, clock=clock,
logger=observability.logger, logger=observability.logger,
metrics=observability.metrics,
tracer=observability.tracer,
ttl=timedelta(seconds=app_config.sandbox.ttl_seconds), ttl=timedelta(seconds=app_config.sandbox.ttl_seconds),
), ),
cleanup_expired_sandboxes=CleanupExpiredSandboxes( cleanup_expired_sandboxes=CleanupExpiredSandboxes(
@ -104,6 +106,8 @@ def build_container(
runtime=sandbox_runtime, runtime=sandbox_runtime,
clock=clock, clock=clock,
logger=observability.logger, logger=observability.logger,
metrics=observability.metrics,
tracer=observability.tracer,
), ),
) )

View file

@ -245,7 +245,7 @@
### M20. Трейсы и метрики в sandbox usecases ### M20. Трейсы и метрики в sandbox usecases
- Субагент: `feature-developer` - Субагент: `feature-developer`
- Статус: pending - Статус: completed
- Зависимости: `M19` - Зависимости: `M19`
- Commit required: yes - Commit required: yes
- Commit message: `instrument sandbox usecases` - Commit message: `instrument sandbox usecases`

View file

@ -502,6 +502,8 @@ def test_startup_reconciliation_reuses_existing_container_after_restart(
runtime=runtime, runtime=runtime,
clock=FakeClock(created_at), clock=FakeClock(created_at),
logger=logger, logger=logger,
metrics=NoopMetrics(),
tracer=NoopTracer(),
ttl=timedelta(minutes=5), ttl=timedelta(minutes=5),
), ),
cleanup_expired_sandboxes=CleanupExpiredSandboxes( cleanup_expired_sandboxes=CleanupExpiredSandboxes(
@ -510,6 +512,8 @@ def test_startup_reconciliation_reuses_existing_container_after_restart(
runtime=runtime, runtime=runtime,
clock=FakeClock(created_at), clock=FakeClock(created_at),
logger=logger, logger=logger,
metrics=NoopMetrics(),
tracer=NoopTracer(),
), ),
) )
container = AppContainer( container = AppContainer(

View file

@ -2,6 +2,7 @@ import threading
from datetime import UTC, datetime, timedelta from datetime import UTC, datetime, timedelta
from uuid import UUID from uuid import UUID
from adapter.observability.noop import NoopMetrics, NoopTracer
from domain.sandbox import SandboxSession, SandboxStatus from domain.sandbox import SandboxSession, SandboxStatus
from repository.sandbox_lock import ProcessLocalSandboxLifecycleLocker from repository.sandbox_lock import ProcessLocalSandboxLifecycleLocker
from repository.sandbox_session import InMemorySandboxSessionRepository from repository.sandbox_session import InMemorySandboxSessionRepository
@ -218,6 +219,8 @@ def test_create_sandbox_reuses_active_session_when_not_expired() -> None:
runtime=runtime, runtime=runtime,
clock=FakeClock(now), clock=FakeClock(now),
logger=logger, logger=logger,
metrics=NoopMetrics(),
tracer=NoopTracer(),
ttl=timedelta(minutes=5), ttl=timedelta(minutes=5),
) )
@ -264,6 +267,8 @@ def test_create_sandbox_replaces_expired_session_and_creates_new_one(
runtime=runtime, runtime=runtime,
clock=FakeClock(now), clock=FakeClock(now),
logger=logger, logger=logger,
metrics=NoopMetrics(),
tracer=NoopTracer(),
ttl=timedelta(minutes=5), ttl=timedelta(minutes=5),
) )
monkeypatch.setattr('usecase.sandbox._new_session_id', lambda: SESSION_NEW_ID) monkeypatch.setattr('usecase.sandbox._new_session_id', lambda: SESSION_NEW_ID)
@ -323,6 +328,8 @@ def test_create_sandbox_creates_new_session_when_none_exists() -> None:
runtime=runtime, runtime=runtime,
clock=FakeClock(now), clock=FakeClock(now),
logger=logger, logger=logger,
metrics=NoopMetrics(),
tracer=NoopTracer(),
ttl=timedelta(minutes=5), ttl=timedelta(minutes=5),
) )
@ -370,6 +377,8 @@ def test_create_sandbox_serializes_duplicate_concurrent_create_for_chat_id(
runtime=runtime, runtime=runtime,
clock=FakeClock(now), clock=FakeClock(now),
logger=logger, logger=logger,
metrics=NoopMetrics(),
tracer=NoopTracer(),
ttl=timedelta(minutes=5), ttl=timedelta(minutes=5),
) )
monkeypatch.setattr('usecase.sandbox._new_session_id', lambda: SESSION_NEW_ID) monkeypatch.setattr('usecase.sandbox._new_session_id', lambda: SESSION_NEW_ID)
@ -473,6 +482,8 @@ def test_cleanup_expired_sandboxes_stops_and_deletes_only_expired_sessions() ->
runtime=runtime, runtime=runtime,
clock=FakeClock(now), clock=FakeClock(now),
logger=logger, logger=logger,
metrics=NoopMetrics(),
tracer=NoopTracer(),
) )
result = usecase.execute() result = usecase.execute()
@ -534,6 +545,8 @@ def test_cleanup_expired_sandboxes_skips_replaced_session_from_stale_snapshot()
runtime=runtime, runtime=runtime,
clock=FakeClock(now), clock=FakeClock(now),
logger=logger, logger=logger,
metrics=NoopMetrics(),
tracer=NoopTracer(),
) )
result = usecase.execute() result = usecase.execute()
@ -575,6 +588,8 @@ def test_cleanup_expired_sandboxes_continues_after_stop_failure() -> None:
runtime=runtime, runtime=runtime,
clock=FakeClock(now), clock=FakeClock(now),
logger=logger, logger=logger,
metrics=NoopMetrics(),
tracer=NoopTracer(),
) )
result = usecase.execute() result = usecase.execute()

View file

@ -6,9 +6,11 @@ from domain.sandbox import SandboxSession
from usecase.interface import ( from usecase.interface import (
Clock, Clock,
Logger, Logger,
Metrics,
SandboxLifecycleLocker, SandboxLifecycleLocker,
SandboxRuntime, SandboxRuntime,
SandboxSessionRepository, SandboxSessionRepository,
Tracer,
) )
@ -25,6 +27,8 @@ class CreateSandbox:
runtime: SandboxRuntime, runtime: SandboxRuntime,
clock: Clock, clock: Clock,
logger: Logger, logger: Logger,
metrics: Metrics,
tracer: Tracer,
ttl: timedelta, ttl: timedelta,
) -> None: ) -> None:
self._repository = repository self._repository = repository
@ -32,44 +36,80 @@ class CreateSandbox:
self._runtime = runtime self._runtime = runtime
self._clock = clock self._clock = clock
self._logger = logger self._logger = logger
self._metrics = metrics
self._tracer = tracer
self._ttl = ttl self._ttl = ttl
def execute(self, command: CreateSandboxCommand) -> SandboxSession: def execute(self, command: CreateSandboxCommand) -> SandboxSession:
chat_id = command.chat_id chat_id = command.chat_id
with self._locker.lock(chat_id): with self._tracer.start_span(
session = self._repository.get_active_by_chat_id(chat_id) 'usecase.create_sandbox',
now = self._clock.now() attrs={'chat.id': str(chat_id)},
) as span:
try:
with self._locker.lock(chat_id):
session = self._repository.get_active_by_chat_id(chat_id)
now = self._clock.now()
if session is not None and session.expires_at > now: if session is not None and session.expires_at > now:
self._logger.info( span.set_attribute('session.id', str(session.session_id))
'sandbox_reused', span.set_attribute('container.id', session.container_id)
attrs=_sandbox_attrs(session), span.set_attribute('sandbox.result', 'reused')
self._metrics.increment(
'sandbox.create.total',
attrs=_result_metric_attrs('reused'),
)
self._logger.info(
'sandbox_reused',
attrs=_sandbox_attrs(session),
)
return session
result = 'created'
if session is not None:
result = 'replaced'
span.set_attribute('session.id', str(session.session_id))
span.set_attribute('container.id', session.container_id)
self._logger.info(
'sandbox_replaced',
attrs=_sandbox_attrs(session),
)
self._runtime.stop(session.container_id)
self._repository.delete(session.session_id)
_set_active_count(self._metrics, self._repository)
created_at = self._clock.now()
expires_at = created_at + self._ttl
session_id = _new_session_id()
span.set_attribute('session.id', str(session_id))
new_session = self._runtime.create(
session_id=session_id,
chat_id=chat_id,
created_at=created_at,
expires_at=expires_at,
)
self._repository.save(new_session)
_set_active_count(self._metrics, self._repository)
span.set_attribute('container.id', new_session.container_id)
span.set_attribute('sandbox.result', result)
self._metrics.increment(
'sandbox.create.total',
attrs=_result_metric_attrs(result),
)
self._logger.info(
'sandbox_created',
attrs=_sandbox_attrs(new_session),
)
return new_session
except Exception as exc:
span.set_attribute('sandbox.result', 'error')
self._metrics.increment(
'sandbox.create.total',
attrs=_result_metric_attrs('error'),
) )
return session span.record_error(exc)
raise
if session is not None:
self._logger.info(
'sandbox_replaced',
attrs=_sandbox_attrs(session),
)
self._runtime.stop(session.container_id)
self._repository.delete(session.session_id)
created_at = self._clock.now()
expires_at = created_at + self._ttl
new_session = self._runtime.create(
session_id=_new_session_id(),
chat_id=chat_id,
created_at=created_at,
expires_at=expires_at,
)
self._repository.save(new_session)
self._logger.info(
'sandbox_created',
attrs=_sandbox_attrs(new_session),
)
return new_session
class CleanupExpiredSandboxes: class CleanupExpiredSandboxes:
@ -80,39 +120,84 @@ class CleanupExpiredSandboxes:
runtime: SandboxRuntime, runtime: SandboxRuntime,
clock: Clock, clock: Clock,
logger: Logger, logger: Logger,
metrics: Metrics,
tracer: Tracer,
) -> None: ) -> None:
self._repository = repository self._repository = repository
self._locker = locker self._locker = locker
self._runtime = runtime self._runtime = runtime
self._clock = clock self._clock = clock
self._logger = logger self._logger = logger
self._metrics = metrics
self._tracer = tracer
def execute(self) -> list[SandboxSession]: def execute(self) -> list[SandboxSession]:
expired_sessions = self._repository.list_expired(self._clock.now())
cleaned_sessions: list[SandboxSession] = [] cleaned_sessions: list[SandboxSession] = []
error_count = 0
for session in expired_sessions: with self._tracer.start_span(
'usecase.cleanup_expired_sandboxes',
) as span:
try: try:
cleaned_session = self._cleanup_session(session) expired_sessions = self._repository.list_expired(self._clock.now())
except Exception as exc: except Exception as exc:
attrs = _sandbox_attrs(session) span.set_attribute('sandbox.result', 'error')
attrs['error'] = type(exc).__name__ self._metrics.increment(
self._logger.error( 'sandbox.cleanup.error.total',
'sandbox_clean_failed', attrs=_cleanup_error_metric_attrs(
attrs=attrs, type(exc).__name__,
'list_expired',
),
) )
continue span.record_error(exc)
raise
if cleaned_session is None: span.set_attribute('sandbox.expired_count', len(expired_sessions))
continue for session in expired_sessions:
with self._tracer.start_span(
'usecase.cleanup_expired_sandbox',
attrs=_sandbox_span_attrs(session),
) as cleanup_span:
try:
cleaned_session = self._cleanup_session(session)
except Exception as exc:
error_count += 1
cleanup_span.set_attribute('sandbox.result', 'error')
cleanup_span.record_error(exc)
self._metrics.increment(
'sandbox.cleanup.error.total',
attrs=_error_metric_attrs(type(exc).__name__),
)
attrs = _sandbox_attrs(session)
attrs['error'] = type(exc).__name__
self._logger.error(
'sandbox_clean_failed',
attrs=attrs,
)
continue
cleaned_sessions.append(cleaned_session) if cleaned_session is None:
self._logger.info( cleanup_span.set_attribute('sandbox.result', 'skipped')
'sandbox_cleaned', continue
attrs=_sandbox_attrs(cleaned_session),
cleanup_span.set_attribute('sandbox.result', 'cleaned')
cleaned_sessions.append(cleaned_session)
self._metrics.increment(
'sandbox.cleanup.total',
attrs=_result_metric_attrs('cleaned'),
)
self._logger.info(
'sandbox_cleaned',
attrs=_sandbox_attrs(cleaned_session),
)
span.set_attribute('sandbox.cleaned_count', len(cleaned_sessions))
span.set_attribute('sandbox.error_count', error_count)
span.set_attribute(
'sandbox.result',
'completed' if error_count == 0 else 'completed_with_errors',
) )
return cleaned_sessions
return cleaned_sessions
def _cleanup_session(self, session: SandboxSession) -> SandboxSession | None: def _cleanup_session(self, session: SandboxSession) -> SandboxSession | None:
with self._locker.lock(session.chat_id): with self._locker.lock(session.chat_id):
@ -129,6 +214,7 @@ class CleanupExpiredSandboxes:
self._runtime.stop(current_session.container_id) self._runtime.stop(current_session.container_id)
self._repository.delete(current_session.session_id) self._repository.delete(current_session.session_id)
_set_active_count(self._metrics, self._repository)
return current_session return current_session
@ -142,3 +228,36 @@ def _sandbox_attrs(session: SandboxSession) -> dict[str, str]:
'session_id': str(session.session_id), 'session_id': str(session.session_id),
'container_id': session.container_id, 'container_id': session.container_id,
} }
def _sandbox_span_attrs(session: SandboxSession) -> dict[str, str]:
return {
'chat.id': str(session.chat_id),
'session.id': str(session.session_id),
'container.id': session.container_id,
}
def _result_metric_attrs(result: str) -> dict[str, str]:
return {'result': result}
def _error_metric_attrs(error_type: str) -> dict[str, str]:
return {'error.type': error_type}
def _cleanup_error_metric_attrs(
error_type: str,
reason: str,
) -> dict[str, str]:
return {
'error.type': error_type,
'reason': reason,
}
def _set_active_count(
metrics: Metrics,
repository: SandboxSessionRepository,
) -> None:
metrics.set('sandbox.active.count', repository.count_active())