instrument sandbox docker runtime

This commit is contained in:
Azamat 2026-04-03 01:15:23 +03:00
parent 4cdf6e45de
commit 8d3a080d45
6 changed files with 411 additions and 73 deletions

View file

@ -81,11 +81,18 @@ def build_container(
sandbox_repository = InMemorySandboxSessionRepository() sandbox_repository = InMemorySandboxSessionRepository()
sandbox_locker = ProcessLocalSandboxLifecycleLocker() sandbox_locker = ProcessLocalSandboxLifecycleLocker()
sandbox_runtime = DockerSandboxRuntime(app_config.sandbox, docker_client) sandbox_runtime = DockerSandboxRuntime(
app_config.sandbox,
docker_client,
observability.metrics,
observability.tracer,
)
sandbox_reconciler = SandboxSessionReconciler( sandbox_reconciler = SandboxSessionReconciler(
state_source=sandbox_runtime, state_source=sandbox_runtime,
registry=sandbox_repository, registry=sandbox_repository,
logger=observability.logger, logger=observability.logger,
metrics=observability.metrics,
tracer=observability.tracer,
) )
repositories = AppRepositories(sandbox_session=sandbox_repository) repositories = AppRepositories(sandbox_session=sandbox_repository)

View file

@ -1,3 +1,4 @@
import time
from datetime import datetime from datetime import datetime
from pathlib import Path from pathlib import Path
from uuid import UUID from uuid import UUID
@ -9,7 +10,7 @@ from docker.types import Mount
from adapter.config.model import SandboxConfig from adapter.config.model import SandboxConfig
from domain.error import SandboxError, SandboxStartError from domain.error import SandboxError, SandboxStartError
from domain.sandbox import SandboxSession, SandboxStatus from domain.sandbox import SandboxSession, SandboxStatus
from usecase.interface import SandboxRuntime from usecase.interface import Metrics, SandboxRuntime, Span, Tracer
SANDBOX_LABELS = ('session_id', 'chat_id', 'expires_at') SANDBOX_LABELS = ('session_id', 'chat_id', 'expires_at')
@ -19,9 +20,13 @@ class DockerSandboxRuntime(SandboxRuntime):
self, self,
config: SandboxConfig, config: SandboxConfig,
client: DockerClient, client: DockerClient,
metrics: Metrics,
tracer: Tracer,
) -> None: ) -> None:
self._config = config self._config = config
self._client = client self._client = client
self._metrics = metrics
self._tracer = tracer
def create( def create(
self, self,
@ -31,6 +36,17 @@ class DockerSandboxRuntime(SandboxRuntime):
created_at: datetime, created_at: datetime,
expires_at: datetime, expires_at: datetime,
) -> SandboxSession: ) -> SandboxSession:
started_at = time.perf_counter()
result = 'error'
with self._tracer.start_span(
'adapter.docker.create_sandbox',
attrs={
'chat.id': str(chat_id),
'session.id': str(session_id),
},
) as span:
try:
try: try:
chat_path = self._chat_path(chat_id) chat_path = self._chat_path(chat_id)
dependencies_path = self._readonly_host_path( dependencies_path = self._readonly_host_path(
@ -44,7 +60,11 @@ class DockerSandboxRuntime(SandboxRuntime):
self._config.image, self._config.image,
detach=True, detach=True,
labels=self._labels(session_id, chat_id, expires_at), labels=self._labels(session_id, chat_id, expires_at),
mounts=self._mounts(chat_path, dependencies_path, lambda_tools_path), mounts=self._mounts(
chat_path,
dependencies_path,
lambda_tools_path,
),
) )
except (DockerException, OSError, ValueError) as exc: except (DockerException, OSError, ValueError) as exc:
raise SandboxStartError(str(chat_id)) from exc raise SandboxStartError(str(chat_id)) from exc
@ -53,6 +73,9 @@ class DockerSandboxRuntime(SandboxRuntime):
if not container_id: if not container_id:
raise SandboxStartError(str(chat_id)) raise SandboxStartError(str(chat_id))
result = 'created'
span.set_attribute('container.id', container_id)
span.set_attribute('sandbox.result', result)
return SandboxSession( return SandboxSession(
session_id=session_id, session_id=session_id,
chat_id=chat_id, chat_id=chat_id,
@ -61,17 +84,62 @@ class DockerSandboxRuntime(SandboxRuntime):
created_at=created_at, created_at=created_at,
expires_at=expires_at, expires_at=expires_at,
) )
except Exception as exc:
span.set_attribute('sandbox.result', result)
span.record_error(exc)
self._metrics.increment(
'sandbox.runtime.error.total',
attrs=_runtime_error_metric_attrs('create', _error_type(exc)),
)
raise
finally:
self._metrics.record(
'sandbox.runtime.create.duration_ms',
_duration_ms(started_at),
attrs=_runtime_metric_attrs('create', result),
)
# Stop the Docker container identified by ``container_id``, inside an
# 'adapter.docker.stop_sandbox' span, and record how long the attempt took.
# NOTE(review): this text appears to be a side-by-side diff rendering -- a line
# that repeats its statement is unchanged context (old side, then new side),
# while a line holding a single statement exists only in the new revision.
def stop(self, container_id: str) -> None: def stop(self, container_id: str) -> None:
started_at = time.perf_counter()
# Start from the failure label; only the success and not-found paths overwrite it.
result = 'error'
with self._tracer.start_span(
'adapter.docker.stop_sandbox',
attrs={'container.id': container_id},
) as span:
try: try:
container = self._client.containers.get(container_id) container = self._client.containers.get(container_id)
# Copy the session/chat labels of the container onto the span before stopping it.
_set_span_container_attrs(span, container)
container.stop() container.stop()
result = 'stopped'
span.set_attribute('sandbox.result', result)
# A missing container is reported as 'not_found' and the method returns
# without raising or incrementing the error counter.
except NotFound: except NotFound:
result = 'not_found'
span.set_attribute('sandbox.result', result)
return return
# Any other Docker failure is recorded on the span, counted under
# 'sandbox.runtime.error.total', and re-raised as SandboxError.
except DockerException as exc: except DockerException as exc:
span.set_attribute('sandbox.result', result)
span.record_error(exc)
self._metrics.increment(
'sandbox.runtime.error.total',
attrs=_runtime_error_metric_attrs('stop', type(exc).__name__),
)
raise SandboxError('sandbox_stop_failed') from exc raise SandboxError('sandbox_stop_failed') from exc
# The duration is recorded on every path, labelled with the final ``result``.
finally:
self._metrics.record(
'sandbox.runtime.stop.duration_ms',
_duration_ms(started_at),
attrs=_runtime_metric_attrs('stop', result),
)
def list_active_sessions(self) -> list[SandboxSession]: def list_active_sessions(self) -> list[SandboxSession]:
started_at = time.perf_counter()
result = 'error'
with self._tracer.start_span(
'adapter.docker.list_active_sandboxes',
) as span:
try:
try: try:
containers = self._client.containers.list( containers = self._client.containers.list(
filters={'label': list(SANDBOX_LABELS)} filters={'label': list(SANDBOX_LABELS)}
@ -86,7 +154,25 @@ class DockerSandboxRuntime(SandboxRuntime):
continue continue
sessions.append(session) sessions.append(session)
result = 'listed'
span.set_attribute('sandbox.container_count', len(containers))
span.set_attribute('sandbox.active_count', len(sessions))
span.set_attribute('sandbox.result', result)
return sessions return sessions
except Exception as exc:
span.set_attribute('sandbox.result', result)
span.record_error(exc)
self._metrics.increment(
'sandbox.runtime.error.total',
attrs=_runtime_error_metric_attrs('list_active', _error_type(exc)),
)
raise
finally:
self._metrics.record(
'sandbox.runtime.list_active.duration_ms',
_duration_ms(started_at),
attrs=_runtime_metric_attrs('list_active', result),
)
def _labels( def _labels(
self, self,
@ -186,3 +272,44 @@ class DockerSandboxRuntime(SandboxRuntime):
def _parse_datetime(value: str) -> datetime: def _parse_datetime(value: str) -> datetime:
normalized = f'{value[:-1]}+00:00' if value.endswith('Z') else value normalized = f'{value[:-1]}+00:00' if value.endswith('Z') else value
return datetime.fromisoformat(normalized) return datetime.fromisoformat(normalized)
def _duration_ms(started_at: float) -> float:
    """Return the milliseconds elapsed since ``started_at``.

    ``started_at`` is compared against a fresh ``time.perf_counter()``
    reading, so it should come from the same clock.
    """
    elapsed_seconds = time.perf_counter() - started_at
    return elapsed_seconds * 1000
def _runtime_metric_attrs(operation: str, result: str) -> dict[str, str]:
    """Build the attribute mapping for a runtime duration metric.

    The mapping holds the ``operation`` name first and its ``result`` second.
    """
    return dict(operation=operation, result=result)
def _runtime_error_metric_attrs(
    operation: str,
    error_type: str,
) -> dict[str, str]:
    """Build the attribute mapping for a runtime error counter.

    The mapping holds the ``operation`` name and, under ``error.type``,
    the supplied error type name.
    """
    attrs: dict[str, str] = {'operation': operation}
    attrs['error.type'] = error_type
    return attrs
def _error_type(error: Exception) -> str:
    """Return the class name to report for ``error``.

    When ``error`` was raised ``from`` another ``Exception``, the name of
    that cause is returned; otherwise the name of ``error`` itself.
    """
    cause = error.__cause__
    reported = cause if isinstance(cause, Exception) else error
    return type(reported).__name__
def _set_span_container_attrs(span: Span, container: object) -> None:
    """Copy the session and chat labels of ``container`` onto ``span``.

    Nothing is set when ``container`` has no dict-valued ``labels``
    attribute. A label is copied only when it is a non-empty string:
    ``session_id`` becomes ``session.id`` and ``chat_id`` becomes ``chat.id``.
    """
    labels = getattr(container, 'labels', None)
    if isinstance(labels, dict):
        # (label key, span attribute name), processed in this order.
        for label_key, attribute_name in (
            ('session_id', 'session.id'),
            ('chat_id', 'chat.id'),
        ):
            label_value = labels.get(label_key)
            if isinstance(label_value, str) and label_value:
                span.set_attribute(attribute_name, label_value)

View file

@ -3,7 +3,7 @@ from typing import Protocol
from uuid import UUID from uuid import UUID
from domain.sandbox import SandboxSession from domain.sandbox import SandboxSession
from usecase.interface import Logger from usecase.interface import Logger, Metrics, Tracer
class SandboxSessionStateSource(Protocol): class SandboxSessionStateSource(Protocol):
@ -13,27 +13,45 @@ class SandboxSessionStateSource(Protocol):
class SandboxSessionRegistry(Protocol): class SandboxSessionRegistry(Protocol):
def replace_all(self, sessions: list[SandboxSession]) -> None: ... def replace_all(self, sessions: list[SandboxSession]) -> None: ...
def count_active(self) -> int: ...
# Frozen dataclass whose ``execute`` reads the sessions reported by
# ``state_source``, keeps one session per chat id, and hands the result to
# ``registry.replace_all``.
# NOTE(review): this text appears to be a side-by-side diff rendering -- a line
# that repeats its statement is unchanged context (old side, then new side), a
# line holding a single statement exists only in the new revision, and a line
# with two different statements shows the old text followed by its replacement.
@dataclass(frozen=True, slots=True) @dataclass(frozen=True, slots=True)
class SandboxSessionReconciler: class SandboxSessionReconciler:
state_source: SandboxSessionStateSource state_source: SandboxSessionStateSource
registry: SandboxSessionRegistry registry: SandboxSessionRegistry
logger: Logger logger: Logger
metrics: Metrics
tracer: Tracer
# Runs one reconciliation pass inside an 'adapter.sandbox.reconcile_sessions'
# span and returns the de-duplicated session list.
def execute(self) -> list[SandboxSession]: def execute(self) -> list[SandboxSession]:
with self.tracer.start_span(
'adapter.sandbox.reconcile_sessions',
) as span:
try:
sessions_by_chat_id: dict[UUID, SandboxSession] = {} sessions_by_chat_id: dict[UUID, SandboxSession] = {}
# The new side fetches the sessions once so the raw count can be put on the span.
discovered_sessions = self.state_source.list_active_sessions()
span.set_attribute('sandbox.discovered_count', len(discovered_sessions))
# Sorting by created_at ascending means a later-created session overwrites
# an earlier one that shares the same chat_id.
for session in sorted( for session in sorted(
self.state_source.list_active_sessions(), discovered_sessions,
key=lambda item: item.created_at, key=lambda item: item.created_at,
): ):
sessions_by_chat_id[session.chat_id] = session sessions_by_chat_id[session.chat_id] = session
sessions = list(sessions_by_chat_id.values()) sessions = list(sessions_by_chat_id.values())
self.registry.replace_all(sessions) self.registry.replace_all(sessions)
# The reported count is read back from the registry rather than taken from len(sessions).
active_count = self.registry.count_active()
self.metrics.set('sandbox.active.count', active_count)
span.set_attribute('sandbox.active_count', active_count)
span.set_attribute('sandbox.result', 'reconciled')
self.logger.info( self.logger.info(
'sandbox_reconciled', 'sandbox_reconciled',
attrs={ attrs={
'session_count': len(sessions), 'session_count': active_count,
}, },
) )
return sessions return sessions
# Any failure is marked on the span and re-raised unchanged.
except Exception as exc:
span.set_attribute('sandbox.result', 'error')
span.record_error(exc)
raise

View file

@ -257,7 +257,7 @@
### M21. Трейсы и runtime metrics в Docker adapter и reconciliation ### M21. Трейсы и runtime metrics в Docker adapter и reconciliation
- Субагент: `feature-developer` - Субагент: `feature-developer`
- Статус: pending - Статус: completed
- Зависимости: `M19` - Зависимости: `M19`
- Commit required: yes - Commit required: yes
- Commit message: `instrument sandbox docker runtime` - Commit message: `instrument sandbox docker runtime`

View file

@ -7,6 +7,7 @@ from docker import DockerClient
from fastapi import FastAPI from fastapi import FastAPI
from starlette.types import Message, Scope from starlette.types import Message, Scope
import adapter.di.container as container_module
from adapter.config.model import ( from adapter.config.model import (
AppConfig, AppConfig,
AppSectionConfig, AppSectionConfig,
@ -20,6 +21,7 @@ from adapter.config.model import (
TracingConfig, TracingConfig,
) )
from adapter.di.container import AppContainer, AppRepositories, AppUsecases from adapter.di.container import AppContainer, AppRepositories, AppUsecases
from adapter.docker.runtime import DockerSandboxRuntime
from adapter.http.fastapi import app as app_module from adapter.http.fastapi import app as app_module
from adapter.observability.noop import NoopMetrics, NoopTracer from adapter.observability.noop import NoopMetrics, NoopTracer
from adapter.observability.runtime import ObservabilityRuntime from adapter.observability.runtime import ObservabilityRuntime
@ -80,7 +82,8 @@ class FakeCleanupExpiredSandboxes(CleanupExpiredSandboxes):
class FakeDockerClient(DockerClient): class FakeDockerClient(DockerClient):
def __init__(self) -> None: def __init__(self, base_url: str | None = None) -> None:
self.base_url = base_url
self.close_calls = 0 self.close_calls = 0
def close(self) -> None: def close(self) -> None:
@ -104,6 +107,79 @@ class FakeClock:
return self._now return self._now
class RecordingMetrics:
    """Test double that remembers every metric call it receives.

    Each call is stored as a ``(name, value, attrs)`` tuple in the list
    matching the method that was invoked.
    """

    def __init__(self) -> None:
        self.increment_calls: list[tuple[str, int, Attrs | None]] = []
        self.record_calls: list[tuple[str, float, Attrs | None]] = []
        self.set_calls: list[tuple[str, int | float, Attrs | None]] = []

    def increment(
        self,
        name: str,
        value: int = 1,
        attrs: Attrs | None = None,
    ) -> None:
        """Remember an ``increment`` call; ``value`` defaults to 1."""
        call = (name, value, attrs)
        self.increment_calls.append(call)

    def record(
        self,
        name: str,
        value: float,
        attrs: Attrs | None = None,
    ) -> None:
        """Remember a ``record`` call."""
        call = (name, value, attrs)
        self.record_calls.append(call)

    def set(
        self,
        name: str,
        value: int | float,
        attrs: Attrs | None = None,
    ) -> None:
        """Remember a ``set`` call."""
        call = (name, value, attrs)
        self.set_calls.append(call)
class RecordingSpan:
def __init__(self) -> None:
self.attrs: dict[str, str | int | float | bool] = {}
self.errors: list[Exception] = []
def set_attribute(self, name: str, value: str | int | float | bool) -> None:
self.attrs[name] = value
def record_error(self, error: Exception) -> None:
self.errors.append(error)
class RecordingSpanContext:
    """Context manager that yields a pre-built span and never suppresses errors."""

    def __init__(self, span: RecordingSpan) -> None:
        self._span = span

    def __enter__(self) -> RecordingSpan:
        """Hand back the span supplied at construction time."""
        entered_span = self._span
        return entered_span

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc: BaseException | None,
        traceback: object,
    ) -> bool | None:
        """Return ``None`` so an exception raised in the body propagates."""
        return None
class RecordingTracer:
    """Test double that remembers every span it is asked to start.

    ``spans`` holds one ``(name, attrs, span)`` tuple per ``start_span`` call.
    """

    def __init__(self) -> None:
        self.spans: list[tuple[str, Attrs | None, RecordingSpan]] = []

    def start_span(
        self,
        name: str,
        attrs: Attrs | None = None,
    ) -> RecordingSpanContext:
        """Create a fresh ``RecordingSpan``, remember it, and wrap it in a context."""
        recorded_span = RecordingSpan()
        entry = (name, attrs, recorded_span)
        self.spans.append(entry)
        return RecordingSpanContext(recorded_span)
class FakeLifecycleRuntime: class FakeLifecycleRuntime:
def __init__(self, sessions: list[SandboxSession]) -> None: def __init__(self, sessions: list[SandboxSession]) -> None:
self._sessions = list(sessions) self._sessions = list(sessions)
@ -142,6 +218,26 @@ class FakeLifecycleRuntime:
self.stop_calls.append(container_id) self.stop_calls.append(container_id)
class FixedSandboxState:
    """State source stub that always reports the sessions given at construction."""

    def __init__(self, sessions: list[SandboxSession]) -> None:
        # Keep a private copy so later changes to the caller's list are not seen.
        self._sessions = [*sessions]

    def list_active_sessions(self) -> list[SandboxSession]:
        """Return a new list holding the stored sessions."""
        return [*self._sessions]
class CountingRegistry:
    """Registry stub whose active count is fixed independently of its contents."""

    def __init__(self, count_active_result: int) -> None:
        self.replaced_sessions: list[SandboxSession] = []
        self._count_active_result = count_active_result

    def replace_all(self, sessions: list[SandboxSession]) -> None:
        """Remember a copy of ``sessions`` as the most recent replacement."""
        self.replaced_sessions = [*sessions]

    def count_active(self) -> int:
        """Return the count supplied at construction time."""
        return self._count_active_result
def build_config() -> AppConfig: def build_config() -> AppConfig:
return AppConfig( return AppConfig(
app=AppSectionConfig(name='master', env='test'), app=AppSectionConfig(name='master', env='test'),
@ -198,6 +294,8 @@ def build_container(
state_source=EmptySandboxState(), state_source=EmptySandboxState(),
registry=repositories.sandbox_session, registry=repositories.sandbox_session,
logger=logger, logger=logger,
metrics=observability.metrics,
tracer=observability.tracer,
) )
usecases = AppUsecases( usecases = AppUsecases(
create_sandbox=create_sandbox_usecase, create_sandbox=create_sandbox_usecase,
@ -494,6 +592,8 @@ def test_startup_reconciliation_reuses_existing_container_after_restart(
state_source=runtime, state_source=runtime,
registry=repository, registry=repository,
logger=logger, logger=logger,
metrics=observability.metrics,
tracer=observability.tracer,
) )
usecases = AppUsecases( usecases = AppUsecases(
create_sandbox=CreateSandbox( create_sandbox=CreateSandbox(
@ -586,3 +686,76 @@ def test_removed_user_endpoint_returns_not_found(monkeypatch) -> None:
assert status_code == 404 assert status_code == 404
assert response == {'detail': 'Not Found'} assert response == {'detail': 'Not Found'}
assert docker_client.close_calls == 1 assert docker_client.close_calls == 1
def test_reconciliation_uses_registry_backed_active_count_metric() -> None:
    """The active-count metric and span attribute come from the registry."""
    started = datetime(2026, 4, 2, 12, 0, tzinfo=UTC)
    running_session = SandboxSession(
        session_id=SESSION_ID,
        chat_id=CHAT_ID,
        container_id='container-123',
        status=SandboxStatus.RUNNING,
        created_at=started,
        expires_at=started + timedelta(minutes=5),
    )
    recording_metrics = RecordingMetrics()
    recording_tracer = RecordingTracer()
    # The registry reports 7 although a single session is supplied, so the
    # assertions below can tell the registry count from the input size.
    counting_registry = CountingRegistry(count_active_result=7)

    reconciled = SandboxSessionReconciler(
        state_source=FixedSandboxState([running_session]),
        registry=counting_registry,
        logger=FakeLogger(),
        metrics=recording_metrics,
        tracer=recording_tracer,
    ).execute()

    assert reconciled == [running_session]
    assert counting_registry.replaced_sessions == [running_session]
    assert recording_metrics.set_calls == [('sandbox.active.count', 7, None)]
    span_name, _span_attrs, span = recording_tracer.spans[0]
    assert span_name == 'adapter.sandbox.reconcile_sessions'
    assert span.attrs['sandbox.active_count'] == 7
def test_build_container_wires_observability_into_runtime_and_reconciler(
    monkeypatch,
) -> None:
    """build_container passes one metrics/tracer pair to the runtime, reconciler and use cases."""
    metrics = RecordingMetrics()
    tracer = RecordingTracer()
    observability = ObservabilityRuntime(
        logger=FakeLogger(),
        metrics=metrics,
        tracer=tracer,
    )
    docker_client = FakeDockerClient()
    # Patch build_observability and docker.DockerClient on the container
    # module so that the fakes created above are returned instead.
    monkeypatch.setattr(
        container_module,
        'build_observability',
        lambda config: observability,
    )
    monkeypatch.setattr(
        container_module.docker,
        'DockerClient',
        lambda base_url: docker_client,
    )

    container = container_module.build_container(config=build_config())

    reconciler = container.sandbox_reconciler
    runtime = reconciler.state_source
    assert isinstance(runtime, DockerSandboxRuntime)
    assert runtime._metrics is metrics
    assert runtime._tracer is tracer
    assert reconciler.metrics is metrics
    assert reconciler.tracer is tracer
    # Both use cases must share the runtime and the observability objects.
    for usecase in (
        container.usecases.create_sandbox,
        container.usecases.cleanup_expired_sandboxes,
    ):
        assert usecase._runtime is runtime
        assert usecase._metrics is metrics
        assert usecase._tracer is tracer
    assert container._docker_client is docker_client

    container.shutdown()

    assert docker_client.close_calls == 1

View file

@ -10,6 +10,7 @@ from docker.types import Mount
from adapter.config.model import SandboxConfig from adapter.config.model import SandboxConfig
from adapter.docker.runtime import DockerSandboxRuntime from adapter.docker.runtime import DockerSandboxRuntime
from adapter.observability.noop import NoopMetrics, NoopTracer
from domain.error import SandboxError, SandboxStartError from domain.error import SandboxError, SandboxStartError
from domain.sandbox import SandboxSession, SandboxStatus from domain.sandbox import SandboxSession, SandboxStatus
@ -116,6 +117,18 @@ def build_config(tmp_path: Path) -> SandboxConfig:
) )
def build_runtime(
    config: SandboxConfig,
    containers: FakeContainers,
) -> DockerSandboxRuntime:
    """Build a ``DockerSandboxRuntime`` over fake containers with no-op observability."""
    docker_client = FakeDockerClient(containers)
    metrics = NoopMetrics()
    tracer = NoopTracer()
    return DockerSandboxRuntime(config, docker_client, metrics, tracer)
def test_runtime_create_applies_mount_policy_and_labels_with_canonical_chat_id( def test_runtime_create_applies_mount_policy_and_labels_with_canonical_chat_id(
tmp_path: Path, tmp_path: Path,
) -> None: ) -> None:
@ -123,7 +136,7 @@ def test_runtime_create_applies_mount_policy_and_labels_with_canonical_chat_id(
(tmp_path / 'dependencies').mkdir() (tmp_path / 'dependencies').mkdir()
(tmp_path / 'lambda-tools').mkdir() (tmp_path / 'lambda-tools').mkdir()
containers = FakeContainers() containers = FakeContainers()
runtime = DockerSandboxRuntime(config, FakeDockerClient(containers)) runtime = build_runtime(config, containers)
created_at = datetime(2026, 4, 2, 12, 0, tzinfo=UTC) created_at = datetime(2026, 4, 2, 12, 0, tzinfo=UTC)
expires_at = created_at + timedelta(minutes=5) expires_at = created_at + timedelta(minutes=5)
@ -181,7 +194,7 @@ def test_runtime_create_raises_start_error_when_container_id_is_missing(
(tmp_path / 'dependencies').mkdir() (tmp_path / 'dependencies').mkdir()
(tmp_path / 'lambda-tools').mkdir() (tmp_path / 'lambda-tools').mkdir()
containers = FakeContainers(run_result=FakeContainer('')) containers = FakeContainers(run_result=FakeContainer(''))
runtime = DockerSandboxRuntime(config, FakeDockerClient(containers)) runtime = build_runtime(config, containers)
with pytest.raises(SandboxStartError) as excinfo: with pytest.raises(SandboxStartError) as excinfo:
runtime.create( runtime.create(
@ -199,7 +212,7 @@ def test_runtime_stop_ignores_missing_container(tmp_path: Path) -> None:
config = build_config(tmp_path) config = build_config(tmp_path)
containers = FakeContainers() containers = FakeContainers()
containers.get_result = NotFound('missing') containers.get_result = NotFound('missing')
runtime = DockerSandboxRuntime(config, FakeDockerClient(containers)) runtime = build_runtime(config, containers)
runtime.stop('container-123') runtime.stop('container-123')
@ -210,7 +223,7 @@ def test_runtime_stop_wraps_docker_errors(tmp_path: Path) -> None:
config = build_config(tmp_path) config = build_config(tmp_path)
containers = FakeContainers() containers = FakeContainers()
containers.get_result = DockerException('boom') containers.get_result = DockerException('boom')
runtime = DockerSandboxRuntime(config, FakeDockerClient(containers)) runtime = build_runtime(config, containers)
with pytest.raises(SandboxError) as excinfo: with pytest.raises(SandboxError) as excinfo:
runtime.stop('container-123') runtime.stop('container-123')
@ -243,7 +256,7 @@ def test_runtime_list_active_sessions_reads_valid_labeled_containers(
created_at='2026-04-02T12:01:00Z', created_at='2026-04-02T12:01:00Z',
), ),
] ]
runtime = DockerSandboxRuntime(config, FakeDockerClient(containers)) runtime = build_runtime(config, containers)
sessions = runtime.list_active_sessions() sessions = runtime.list_active_sessions()