instrument sandbox docker runtime

This commit is contained in:
Azamat 2026-04-03 01:15:23 +03:00
parent 4cdf6e45de
commit 8d3a080d45
6 changed files with 411 additions and 73 deletions

View file

@ -1,3 +1,4 @@
import time
from datetime import datetime
from pathlib import Path
from uuid import UUID
@ -9,7 +10,7 @@ from docker.types import Mount
from adapter.config.model import SandboxConfig
from domain.error import SandboxError, SandboxStartError
from domain.sandbox import SandboxSession, SandboxStatus
from usecase.interface import SandboxRuntime
from usecase.interface import Metrics, SandboxRuntime, Span, Tracer
SANDBOX_LABELS = ('session_id', 'chat_id', 'expires_at')
@ -19,9 +20,13 @@ class DockerSandboxRuntime(SandboxRuntime):
self,
config: SandboxConfig,
client: DockerClient,
metrics: Metrics,
tracer: Tracer,
) -> None:
self._config = config
self._client = client
self._metrics = metrics
self._tracer = tracer
def create(
self,
@ -31,62 +36,143 @@ class DockerSandboxRuntime(SandboxRuntime):
created_at: datetime,
expires_at: datetime,
) -> SandboxSession:
try:
chat_path = self._chat_path(chat_id)
dependencies_path = self._readonly_host_path(
self._config.dependencies_host_path
)
lambda_tools_path = self._readonly_host_path(
self._config.lambda_tools_host_path
)
chat_path.mkdir(parents=True, exist_ok=True)
container = self._client.containers.run(
self._config.image,
detach=True,
labels=self._labels(session_id, chat_id, expires_at),
mounts=self._mounts(chat_path, dependencies_path, lambda_tools_path),
)
except (DockerException, OSError, ValueError) as exc:
raise SandboxStartError(str(chat_id)) from exc
started_at = time.perf_counter()
result = 'error'
container_id = str(getattr(container, 'id', '')).strip()
if not container_id:
raise SandboxStartError(str(chat_id))
with self._tracer.start_span(
'adapter.docker.create_sandbox',
attrs={
'chat.id': str(chat_id),
'session.id': str(session_id),
},
) as span:
try:
try:
chat_path = self._chat_path(chat_id)
dependencies_path = self._readonly_host_path(
self._config.dependencies_host_path
)
lambda_tools_path = self._readonly_host_path(
self._config.lambda_tools_host_path
)
chat_path.mkdir(parents=True, exist_ok=True)
container = self._client.containers.run(
self._config.image,
detach=True,
labels=self._labels(session_id, chat_id, expires_at),
mounts=self._mounts(
chat_path,
dependencies_path,
lambda_tools_path,
),
)
except (DockerException, OSError, ValueError) as exc:
raise SandboxStartError(str(chat_id)) from exc
return SandboxSession(
session_id=session_id,
chat_id=chat_id,
container_id=container_id,
status=SandboxStatus.RUNNING,
created_at=created_at,
expires_at=expires_at,
)
container_id = str(getattr(container, 'id', '')).strip()
if not container_id:
raise SandboxStartError(str(chat_id))
result = 'created'
span.set_attribute('container.id', container_id)
span.set_attribute('sandbox.result', result)
return SandboxSession(
session_id=session_id,
chat_id=chat_id,
container_id=container_id,
status=SandboxStatus.RUNNING,
created_at=created_at,
expires_at=expires_at,
)
except Exception as exc:
span.set_attribute('sandbox.result', result)
span.record_error(exc)
self._metrics.increment(
'sandbox.runtime.error.total',
attrs=_runtime_error_metric_attrs('create', _error_type(exc)),
)
raise
finally:
self._metrics.record(
'sandbox.runtime.create.duration_ms',
_duration_ms(started_at),
attrs=_runtime_metric_attrs('create', result),
)
def stop(self, container_id: str) -> None:
try:
container = self._client.containers.get(container_id)
container.stop()
except NotFound:
return
except DockerException as exc:
raise SandboxError('sandbox_stop_failed') from exc
started_at = time.perf_counter()
result = 'error'
with self._tracer.start_span(
'adapter.docker.stop_sandbox',
attrs={'container.id': container_id},
) as span:
try:
container = self._client.containers.get(container_id)
_set_span_container_attrs(span, container)
container.stop()
result = 'stopped'
span.set_attribute('sandbox.result', result)
except NotFound:
result = 'not_found'
span.set_attribute('sandbox.result', result)
return
except DockerException as exc:
span.set_attribute('sandbox.result', result)
span.record_error(exc)
self._metrics.increment(
'sandbox.runtime.error.total',
attrs=_runtime_error_metric_attrs('stop', type(exc).__name__),
)
raise SandboxError('sandbox_stop_failed') from exc
finally:
self._metrics.record(
'sandbox.runtime.stop.duration_ms',
_duration_ms(started_at),
attrs=_runtime_metric_attrs('stop', result),
)
def list_active_sessions(self) -> list[SandboxSession]:
try:
containers = self._client.containers.list(
filters={'label': list(SANDBOX_LABELS)}
)
except DockerException as exc:
raise SandboxError('sandbox_list_failed') from exc
started_at = time.perf_counter()
result = 'error'
sessions: list[SandboxSession] = []
for container in containers:
session = self._session_from_container(container)
if session is None:
continue
sessions.append(session)
with self._tracer.start_span(
'adapter.docker.list_active_sandboxes',
) as span:
try:
try:
containers = self._client.containers.list(
filters={'label': list(SANDBOX_LABELS)}
)
except DockerException as exc:
raise SandboxError('sandbox_list_failed') from exc
return sessions
sessions: list[SandboxSession] = []
for container in containers:
session = self._session_from_container(container)
if session is None:
continue
sessions.append(session)
result = 'listed'
span.set_attribute('sandbox.container_count', len(containers))
span.set_attribute('sandbox.active_count', len(sessions))
span.set_attribute('sandbox.result', result)
return sessions
except Exception as exc:
span.set_attribute('sandbox.result', result)
span.record_error(exc)
self._metrics.increment(
'sandbox.runtime.error.total',
attrs=_runtime_error_metric_attrs('list_active', _error_type(exc)),
)
raise
finally:
self._metrics.record(
'sandbox.runtime.list_active.duration_ms',
_duration_ms(started_at),
attrs=_runtime_metric_attrs('list_active', result),
)
def _labels(
self,
@ -186,3 +272,44 @@ class DockerSandboxRuntime(SandboxRuntime):
def _parse_datetime(value: str) -> datetime:
normalized = f'{value[:-1]}+00:00' if value.endswith('Z') else value
return datetime.fromisoformat(normalized)
def _duration_ms(started_at: float) -> float:
return (time.perf_counter() - started_at) * 1000
def _runtime_metric_attrs(operation: str, result: str) -> dict[str, str]:
return {
'operation': operation,
'result': result,
}
def _runtime_error_metric_attrs(
operation: str,
error_type: str,
) -> dict[str, str]:
return {
'operation': operation,
'error.type': error_type,
}
def _error_type(error: Exception) -> str:
if isinstance(error.__cause__, Exception):
return type(error.__cause__).__name__
return type(error).__name__
def _set_span_container_attrs(span: Span, container: object) -> None:
labels = getattr(container, 'labels', None)
if not isinstance(labels, dict):
return
session_id = labels.get('session_id')
if isinstance(session_id, str) and session_id:
span.set_attribute('session.id', session_id)
chat_id = labels.get('chat_id')
if isinstance(chat_id, str) and chat_id:
span.set_attribute('chat.id', chat_id)