From 8d3a080d4575d43bb3878853977ce3613036859d Mon Sep 17 00:00:00 2001 From: Azamat Date: Fri, 3 Apr 2026 01:15:23 +0300 Subject: [PATCH] instrument sandbox docker runtime --- adapter/di/container.py | 9 +- adapter/docker/runtime.py | 225 +++++++++++++++++++++++------- adapter/sandbox/reconciliation.py | 50 ++++--- tasks.md | 2 +- test/test_create_http.py | 175 ++++++++++++++++++++++- test/test_docker_runtime.py | 23 ++- 6 files changed, 411 insertions(+), 73 deletions(-) diff --git a/adapter/di/container.py b/adapter/di/container.py index 4b87b2f..b18382c 100644 --- a/adapter/di/container.py +++ b/adapter/di/container.py @@ -81,11 +81,18 @@ def build_container( sandbox_repository = InMemorySandboxSessionRepository() sandbox_locker = ProcessLocalSandboxLifecycleLocker() - sandbox_runtime = DockerSandboxRuntime(app_config.sandbox, docker_client) + sandbox_runtime = DockerSandboxRuntime( + app_config.sandbox, + docker_client, + observability.metrics, + observability.tracer, + ) sandbox_reconciler = SandboxSessionReconciler( state_source=sandbox_runtime, registry=sandbox_repository, logger=observability.logger, + metrics=observability.metrics, + tracer=observability.tracer, ) repositories = AppRepositories(sandbox_session=sandbox_repository) diff --git a/adapter/docker/runtime.py b/adapter/docker/runtime.py index 3f33466..3c6e93c 100644 --- a/adapter/docker/runtime.py +++ b/adapter/docker/runtime.py @@ -1,3 +1,4 @@ +import time from datetime import datetime from pathlib import Path from uuid import UUID @@ -9,7 +10,7 @@ from docker.types import Mount from adapter.config.model import SandboxConfig from domain.error import SandboxError, SandboxStartError from domain.sandbox import SandboxSession, SandboxStatus -from usecase.interface import SandboxRuntime +from usecase.interface import Metrics, SandboxRuntime, Span, Tracer SANDBOX_LABELS = ('session_id', 'chat_id', 'expires_at') @@ -19,9 +20,13 @@ class DockerSandboxRuntime(SandboxRuntime): self, config: SandboxConfig, client: DockerClient, + metrics: Metrics, + tracer: Tracer, ) -> None: self._config = config self._client = client + self._metrics = metrics + self._tracer = tracer def create( self, @@ -31,62 +36,143 @@ class DockerSandboxRuntime(SandboxRuntime): created_at: datetime, expires_at: datetime, ) -> SandboxSession: - try: - chat_path = self._chat_path(chat_id) - dependencies_path = self._readonly_host_path( - self._config.dependencies_host_path - ) - lambda_tools_path = self._readonly_host_path( - self._config.lambda_tools_host_path - ) - chat_path.mkdir(parents=True, exist_ok=True) - container = self._client.containers.run( - self._config.image, - detach=True, - labels=self._labels(session_id, chat_id, expires_at), - mounts=self._mounts(chat_path, dependencies_path, lambda_tools_path), - ) - except (DockerException, OSError, ValueError) as exc: - raise SandboxStartError(str(chat_id)) from exc + started_at = time.perf_counter() + result = 'error' - container_id = str(getattr(container, 'id', '')).strip() - if not container_id: - raise SandboxStartError(str(chat_id)) + with self._tracer.start_span( + 'adapter.docker.create_sandbox', + attrs={ + 'chat.id': str(chat_id), + 'session.id': str(session_id), + }, + ) as span: + try: + try: + chat_path = self._chat_path(chat_id) + dependencies_path = self._readonly_host_path( + self._config.dependencies_host_path + ) + lambda_tools_path = self._readonly_host_path( + self._config.lambda_tools_host_path + ) + chat_path.mkdir(parents=True, exist_ok=True) + container = self._client.containers.run( + self._config.image, + detach=True, + labels=self._labels(session_id, chat_id, expires_at), + mounts=self._mounts( + chat_path, + dependencies_path, + lambda_tools_path, + ), + ) + except (DockerException, OSError, ValueError) as exc: + raise SandboxStartError(str(chat_id)) from exc - return SandboxSession( - session_id=session_id, - chat_id=chat_id, - container_id=container_id, - status=SandboxStatus.RUNNING, - created_at=created_at, - expires_at=expires_at, - ) + container_id = str(getattr(container, 'id', '')).strip() + if not container_id: + raise SandboxStartError(str(chat_id)) + + result = 'created' + span.set_attribute('container.id', container_id) + span.set_attribute('sandbox.result', result) + return SandboxSession( + session_id=session_id, + chat_id=chat_id, + container_id=container_id, + status=SandboxStatus.RUNNING, + created_at=created_at, + expires_at=expires_at, + ) + except Exception as exc: + span.set_attribute('sandbox.result', result) + span.record_error(exc) + self._metrics.increment( + 'sandbox.runtime.error.total', + attrs=_runtime_error_metric_attrs('create', _error_type(exc)), + ) + raise + finally: + self._metrics.record( + 'sandbox.runtime.create.duration_ms', + _duration_ms(started_at), + attrs=_runtime_metric_attrs('create', result), + ) def stop(self, container_id: str) -> None: - try: - container = self._client.containers.get(container_id) - container.stop() - except NotFound: - return - except DockerException as exc: - raise SandboxError('sandbox_stop_failed') from exc + started_at = time.perf_counter() + result = 'error' + + with self._tracer.start_span( + 'adapter.docker.stop_sandbox', + attrs={'container.id': container_id}, + ) as span: + try: + container = self._client.containers.get(container_id) + _set_span_container_attrs(span, container) + container.stop() + result = 'stopped' + span.set_attribute('sandbox.result', result) + except NotFound: + result = 'not_found' + span.set_attribute('sandbox.result', result) + return + except DockerException as exc: + span.set_attribute('sandbox.result', result) + span.record_error(exc) + self._metrics.increment( + 'sandbox.runtime.error.total', + attrs=_runtime_error_metric_attrs('stop', type(exc).__name__), + ) + raise SandboxError('sandbox_stop_failed') from exc + finally: + self._metrics.record( + 'sandbox.runtime.stop.duration_ms', + _duration_ms(started_at), + attrs=_runtime_metric_attrs('stop', result), + ) def list_active_sessions(self) -> list[SandboxSession]: - try: - containers = self._client.containers.list( - filters={'label': list(SANDBOX_LABELS)} - ) - except DockerException as exc: - raise SandboxError('sandbox_list_failed') from exc + started_at = time.perf_counter() + result = 'error' - sessions: list[SandboxSession] = [] - for container in containers: - session = self._session_from_container(container) - if session is None: - continue - sessions.append(session) + with self._tracer.start_span( + 'adapter.docker.list_active_sandboxes', + ) as span: + try: + try: + containers = self._client.containers.list( + filters={'label': list(SANDBOX_LABELS)} + ) + except DockerException as exc: + raise SandboxError('sandbox_list_failed') from exc - return sessions + sessions: list[SandboxSession] = [] + for container in containers: + session = self._session_from_container(container) + if session is None: + continue + sessions.append(session) + + result = 'listed' + span.set_attribute('sandbox.container_count', len(containers)) + span.set_attribute('sandbox.active_count', len(sessions)) + span.set_attribute('sandbox.result', result) + return sessions + except Exception as exc: + span.set_attribute('sandbox.result', result) + span.record_error(exc) + self._metrics.increment( + 'sandbox.runtime.error.total', + attrs=_runtime_error_metric_attrs('list_active', _error_type(exc)), + ) + raise + finally: + self._metrics.record( + 'sandbox.runtime.list_active.duration_ms', + _duration_ms(started_at), + attrs=_runtime_metric_attrs('list_active', result), + ) def _labels( self, @@ -186,3 +272,44 @@ class DockerSandboxRuntime(SandboxRuntime): def _parse_datetime(value: str) -> datetime: normalized = f'{value[:-1]}+00:00' if value.endswith('Z') else value return datetime.fromisoformat(normalized) + + +def _duration_ms(started_at: float) -> float: + return (time.perf_counter() - started_at) * 1000 + + +def _runtime_metric_attrs(operation: str, result: str) -> dict[str, str]: + return { + 'operation': operation, + 'result': result, + } + + +def _runtime_error_metric_attrs( + operation: str, + error_type: str, +) -> dict[str, str]: + return { + 'operation': operation, + 'error.type': error_type, + } + + +def _error_type(error: Exception) -> str: + if isinstance(error.__cause__, Exception): + return type(error.__cause__).__name__ + return type(error).__name__ + + +def _set_span_container_attrs(span: Span, container: object) -> None: + labels = getattr(container, 'labels', None) + if not isinstance(labels, dict): + return + + session_id = labels.get('session_id') + if isinstance(session_id, str) and session_id: + span.set_attribute('session.id', session_id) + + chat_id = labels.get('chat_id') + if isinstance(chat_id, str) and chat_id: + span.set_attribute('chat.id', chat_id) diff --git a/adapter/sandbox/reconciliation.py b/adapter/sandbox/reconciliation.py index 2d04ca5..81cdb75 100644 --- a/adapter/sandbox/reconciliation.py +++ b/adapter/sandbox/reconciliation.py @@ -3,7 +3,7 @@ from typing import Protocol from uuid import UUID from domain.sandbox import SandboxSession -from usecase.interface import Logger +from usecase.interface import Logger, Metrics, Tracer class SandboxSessionStateSource(Protocol): @@ -13,27 +13,45 @@ class SandboxSessionStateSource(Protocol): class SandboxSessionRegistry(Protocol): def replace_all(self, sessions: list[SandboxSession]) -> None: ... + def count_active(self) -> int: ... + @dataclass(frozen=True, slots=True) class SandboxSessionReconciler: state_source: SandboxSessionStateSource registry: SandboxSessionRegistry logger: Logger + metrics: Metrics + tracer: Tracer def execute(self) -> list[SandboxSession]: - sessions_by_chat_id: dict[UUID, SandboxSession] = {} - for session in sorted( - self.state_source.list_active_sessions(), - key=lambda item: item.created_at, - ): - sessions_by_chat_id[session.chat_id] = session + with self.tracer.start_span( + 'adapter.sandbox.reconcile_sessions', + ) as span: + try: + sessions_by_chat_id: dict[UUID, SandboxSession] = {} + discovered_sessions = self.state_source.list_active_sessions() + span.set_attribute('sandbox.discovered_count', len(discovered_sessions)) + for session in sorted( + discovered_sessions, + key=lambda item: item.created_at, + ): + sessions_by_chat_id[session.chat_id] = session - sessions = list(sessions_by_chat_id.values()) - self.registry.replace_all(sessions) - self.logger.info( - 'sandbox_reconciled', - attrs={ - 'session_count': len(sessions), - }, - ) - return sessions + sessions = list(sessions_by_chat_id.values()) + self.registry.replace_all(sessions) + active_count = self.registry.count_active() + self.metrics.set('sandbox.active.count', active_count) + span.set_attribute('sandbox.active_count', active_count) + span.set_attribute('sandbox.result', 'reconciled') + self.logger.info( + 'sandbox_reconciled', + attrs={ + 'session_count': active_count, + }, + ) + return sessions + except Exception as exc: + span.set_attribute('sandbox.result', 'error') + span.record_error(exc) + raise diff --git a/tasks.md b/tasks.md index a43d96a..012111d 100644 --- a/tasks.md +++ b/tasks.md @@ -257,7 +257,7 @@ ### M21. Трейсы и runtime metrics в Docker adapter и reconciliation - Субагент: `feature-developer` -- Статус: pending +- Статус: completed - Зависимости: `M19` - Commit required: yes - Commit message: `instrument sandbox docker runtime` diff --git a/test/test_create_http.py b/test/test_create_http.py index a25eaba..e8686c4 100644 --- a/test/test_create_http.py +++ b/test/test_create_http.py @@ -7,6 +7,7 @@ from docker import DockerClient from fastapi import FastAPI from starlette.types import Message, Scope +import adapter.di.container as container_module from adapter.config.model import ( AppConfig, AppSectionConfig, @@ -20,6 +21,7 @@ from adapter.config.model import ( TracingConfig, ) from adapter.di.container import AppContainer, AppRepositories, AppUsecases +from adapter.docker.runtime import DockerSandboxRuntime from adapter.http.fastapi import app as app_module from adapter.observability.noop import NoopMetrics, NoopTracer from adapter.observability.runtime import ObservabilityRuntime @@ -80,7 +82,8 @@ class FakeCleanupExpiredSandboxes(CleanupExpiredSandboxes): class FakeDockerClient(DockerClient): - def __init__(self) -> None: + def __init__(self, base_url: str | None = None) -> None: + self.base_url = base_url self.close_calls = 0 def close(self) -> None: @@ -104,6 +107,79 @@ class FakeClock: return self._now +class RecordingMetrics: + def __init__(self) -> None: + self.increment_calls: list[tuple[str, int, Attrs | None]] = [] + self.record_calls: list[tuple[str, float, Attrs | None]] = [] + self.set_calls: list[tuple[str, int | float, Attrs | None]] = [] + + def increment( + self, + name: str, + value: int = 1, + attrs: Attrs | None = None, + ) -> None: + self.increment_calls.append((name, value, attrs)) + + def record( + self, + name: str, + value: float, + attrs: Attrs | None = None, + ) -> None: + self.record_calls.append((name, value, attrs)) + + def set( + self, + name: str, + value: int | float, + attrs: Attrs | None = None, + ) -> None: + self.set_calls.append((name, value, attrs)) + + +class RecordingSpan: + def __init__(self) -> None: + self.attrs: dict[str, str | int | float | bool] = {} + self.errors: list[Exception] = [] + + def set_attribute(self, name: str, value: str | int | float | bool) -> None: + self.attrs[name] = value + + def record_error(self, error: Exception) -> None: + self.errors.append(error) + + +class RecordingSpanContext: + def __init__(self, span: RecordingSpan) -> None: + self._span = span + + def __enter__(self) -> RecordingSpan: + return self._span + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc: BaseException | None, + traceback: object, + ) -> bool | None: + return None + + +class RecordingTracer: + def __init__(self) -> None: + self.spans: list[tuple[str, Attrs | None, RecordingSpan]] = [] + + def start_span( + self, + name: str, + attrs: Attrs | None = None, + ) -> RecordingSpanContext: + span = RecordingSpan() + self.spans.append((name, attrs, span)) + return RecordingSpanContext(span) + + class FakeLifecycleRuntime: def __init__(self, sessions: list[SandboxSession]) -> None: self._sessions = list(sessions) @@ -142,6 +218,26 @@ class FakeLifecycleRuntime: self.stop_calls.append(container_id) +class FixedSandboxState: + def __init__(self, sessions: list[SandboxSession]) -> None: + self._sessions = list(sessions) + + def list_active_sessions(self) -> list[SandboxSession]: + return list(self._sessions) + + +class CountingRegistry: + def __init__(self, count_active_result: int) -> None: + self._count_active_result = count_active_result + self.replaced_sessions: list[SandboxSession] = [] + + def replace_all(self, sessions: list[SandboxSession]) -> None: + self.replaced_sessions = list(sessions) + + def count_active(self) -> int: + return self._count_active_result + + def build_config() -> AppConfig: return AppConfig( app=AppSectionConfig(name='master', env='test'), @@ -198,6 +294,8 @@ def build_container( state_source=EmptySandboxState(), registry=repositories.sandbox_session, logger=logger, + metrics=observability.metrics, + tracer=observability.tracer, ) usecases = AppUsecases( create_sandbox=create_sandbox_usecase, @@ -494,6 +592,8 @@ def test_startup_reconciliation_reuses_existing_container_after_restart( state_source=runtime, registry=repository, logger=logger, + metrics=observability.metrics, + tracer=observability.tracer, ) usecases = AppUsecases( create_sandbox=CreateSandbox( @@ -586,3 +686,76 @@ def test_removed_user_endpoint_returns_not_found(monkeypatch) -> None: assert status_code == 404 assert response == {'detail': 'Not Found'} assert docker_client.close_calls == 1 + + +def test_reconciliation_uses_registry_backed_active_count_metric() -> None: + logger = FakeLogger() + metrics = RecordingMetrics() + tracer = RecordingTracer() + created_at = datetime(2026, 4, 2, 12, 0, tzinfo=UTC) + session = SandboxSession( + session_id=SESSION_ID, + chat_id=CHAT_ID, + container_id='container-123', + status=SandboxStatus.RUNNING, + created_at=created_at, + expires_at=created_at + timedelta(minutes=5), + ) + registry = CountingRegistry(count_active_result=7) + reconciler = SandboxSessionReconciler( + state_source=FixedSandboxState([session]), + registry=registry, + logger=logger, + metrics=metrics, + tracer=tracer, + ) + + sessions = reconciler.execute() + + assert sessions == [session] + assert registry.replaced_sessions == [session] + assert metrics.set_calls == [('sandbox.active.count', 7, None)] + assert tracer.spans[0][0] == 'adapter.sandbox.reconcile_sessions' + assert tracer.spans[0][2].attrs['sandbox.active_count'] == 7 + + +def test_build_container_wires_observability_into_runtime_and_reconciler( + monkeypatch, +) -> None: + logger = FakeLogger() + metrics = RecordingMetrics() + tracer = RecordingTracer() + observability = ObservabilityRuntime( + logger=logger, + metrics=metrics, + tracer=tracer, + ) + docker_client = FakeDockerClient() + monkeypatch.setattr( + container_module, 'build_observability', lambda config: observability + ) + monkeypatch.setattr( + container_module.docker, + 'DockerClient', + lambda base_url: docker_client, + ) + + container = container_module.build_container(config=build_config()) + + runtime = container.sandbox_reconciler.state_source + assert isinstance(runtime, DockerSandboxRuntime) + assert runtime._metrics is metrics + assert runtime._tracer is tracer + assert container.sandbox_reconciler.metrics is metrics + assert container.sandbox_reconciler.tracer is tracer + assert container.usecases.create_sandbox._runtime is runtime + assert container.usecases.create_sandbox._metrics is metrics + assert container.usecases.create_sandbox._tracer is tracer + assert container.usecases.cleanup_expired_sandboxes._runtime is runtime + assert container.usecases.cleanup_expired_sandboxes._metrics is metrics + assert container.usecases.cleanup_expired_sandboxes._tracer is tracer + assert container._docker_client is docker_client + + container.shutdown() + + assert docker_client.close_calls == 1 diff --git a/test/test_docker_runtime.py b/test/test_docker_runtime.py index 1e207f3..ee6a2a4 100644 --- a/test/test_docker_runtime.py +++ b/test/test_docker_runtime.py @@ -10,6 +10,7 @@ from docker.types import Mount from adapter.config.model import SandboxConfig from adapter.docker.runtime import DockerSandboxRuntime +from adapter.observability.noop import NoopMetrics, NoopTracer from domain.error import SandboxError, SandboxStartError from domain.sandbox import SandboxSession, SandboxStatus @@ -116,6 +117,18 @@ def build_config(tmp_path: Path) -> SandboxConfig: ) +def build_runtime( + config: SandboxConfig, + containers: FakeContainers, +) -> DockerSandboxRuntime: + return DockerSandboxRuntime( + config, + FakeDockerClient(containers), + NoopMetrics(), + NoopTracer(), + ) + + def test_runtime_create_applies_mount_policy_and_labels_with_canonical_chat_id( tmp_path: Path, ) -> None: @@ -123,7 +136,7 @@ def test_runtime_create_applies_mount_policy_and_labels_with_canonical_chat_id( (tmp_path / 'dependencies').mkdir() (tmp_path / 'lambda-tools').mkdir() containers = FakeContainers() - runtime = DockerSandboxRuntime(config, FakeDockerClient(containers)) + runtime = build_runtime(config, containers) created_at = datetime(2026, 4, 2, 12, 0, tzinfo=UTC) expires_at = created_at + timedelta(minutes=5) @@ -181,7 +194,7 @@ def test_runtime_create_raises_start_error_when_container_id_is_missing( (tmp_path / 'dependencies').mkdir() (tmp_path / 'lambda-tools').mkdir() containers = FakeContainers(run_result=FakeContainer('')) - runtime = DockerSandboxRuntime(config, FakeDockerClient(containers)) + runtime = build_runtime(config, containers) with pytest.raises(SandboxStartError) as excinfo: runtime.create( @@ -199,7 +212,7 @@ def test_runtime_stop_ignores_missing_container(tmp_path: Path) -> None: config = build_config(tmp_path) containers = FakeContainers() containers.get_result = NotFound('missing') - runtime = DockerSandboxRuntime(config, FakeDockerClient(containers)) + runtime = build_runtime(config, containers) runtime.stop('container-123') @@ -210,7 +223,7 @@ def test_runtime_stop_wraps_docker_errors(tmp_path: Path) -> None: config = build_config(tmp_path) containers = FakeContainers() containers.get_result = DockerException('boom') - runtime = DockerSandboxRuntime(config, FakeDockerClient(containers)) + runtime = build_runtime(config, containers) with pytest.raises(SandboxError) as excinfo: runtime.stop('container-123') @@ -243,7 +256,7 @@ def test_runtime_list_active_sessions_reads_valid_labeled_containers( created_at='2026-04-02T12:01:00Z', ), ] - runtime = DockerSandboxRuntime(config, FakeDockerClient(containers)) + runtime = build_runtime(config, containers) sessions = runtime.list_active_sessions()