instrument sandbox docker runtime
This commit is contained in:
parent
4cdf6e45de
commit
8d3a080d45
6 changed files with 411 additions and 73 deletions
|
|
@ -81,11 +81,18 @@ def build_container(
|
|||
|
||||
sandbox_repository = InMemorySandboxSessionRepository()
|
||||
sandbox_locker = ProcessLocalSandboxLifecycleLocker()
|
||||
sandbox_runtime = DockerSandboxRuntime(app_config.sandbox, docker_client)
|
||||
sandbox_runtime = DockerSandboxRuntime(
|
||||
app_config.sandbox,
|
||||
docker_client,
|
||||
observability.metrics,
|
||||
observability.tracer,
|
||||
)
|
||||
sandbox_reconciler = SandboxSessionReconciler(
|
||||
state_source=sandbox_runtime,
|
||||
registry=sandbox_repository,
|
||||
logger=observability.logger,
|
||||
metrics=observability.metrics,
|
||||
tracer=observability.tracer,
|
||||
)
|
||||
|
||||
repositories = AppRepositories(sandbox_session=sandbox_repository)
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
import time
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from uuid import UUID
|
||||
|
|
@ -9,7 +10,7 @@ from docker.types import Mount
|
|||
from adapter.config.model import SandboxConfig
|
||||
from domain.error import SandboxError, SandboxStartError
|
||||
from domain.sandbox import SandboxSession, SandboxStatus
|
||||
from usecase.interface import SandboxRuntime
|
||||
from usecase.interface import Metrics, SandboxRuntime, Span, Tracer
|
||||
|
||||
SANDBOX_LABELS = ('session_id', 'chat_id', 'expires_at')
|
||||
|
||||
|
|
@ -19,9 +20,13 @@ class DockerSandboxRuntime(SandboxRuntime):
|
|||
self,
|
||||
config: SandboxConfig,
|
||||
client: DockerClient,
|
||||
metrics: Metrics,
|
||||
tracer: Tracer,
|
||||
) -> None:
|
||||
self._config = config
|
||||
self._client = client
|
||||
self._metrics = metrics
|
||||
self._tracer = tracer
|
||||
|
||||
def create(
|
||||
self,
|
||||
|
|
@ -31,62 +36,143 @@ class DockerSandboxRuntime(SandboxRuntime):
|
|||
created_at: datetime,
|
||||
expires_at: datetime,
|
||||
) -> SandboxSession:
|
||||
try:
|
||||
chat_path = self._chat_path(chat_id)
|
||||
dependencies_path = self._readonly_host_path(
|
||||
self._config.dependencies_host_path
|
||||
)
|
||||
lambda_tools_path = self._readonly_host_path(
|
||||
self._config.lambda_tools_host_path
|
||||
)
|
||||
chat_path.mkdir(parents=True, exist_ok=True)
|
||||
container = self._client.containers.run(
|
||||
self._config.image,
|
||||
detach=True,
|
||||
labels=self._labels(session_id, chat_id, expires_at),
|
||||
mounts=self._mounts(chat_path, dependencies_path, lambda_tools_path),
|
||||
)
|
||||
except (DockerException, OSError, ValueError) as exc:
|
||||
raise SandboxStartError(str(chat_id)) from exc
|
||||
started_at = time.perf_counter()
|
||||
result = 'error'
|
||||
|
||||
container_id = str(getattr(container, 'id', '')).strip()
|
||||
if not container_id:
|
||||
raise SandboxStartError(str(chat_id))
|
||||
with self._tracer.start_span(
|
||||
'adapter.docker.create_sandbox',
|
||||
attrs={
|
||||
'chat.id': str(chat_id),
|
||||
'session.id': str(session_id),
|
||||
},
|
||||
) as span:
|
||||
try:
|
||||
try:
|
||||
chat_path = self._chat_path(chat_id)
|
||||
dependencies_path = self._readonly_host_path(
|
||||
self._config.dependencies_host_path
|
||||
)
|
||||
lambda_tools_path = self._readonly_host_path(
|
||||
self._config.lambda_tools_host_path
|
||||
)
|
||||
chat_path.mkdir(parents=True, exist_ok=True)
|
||||
container = self._client.containers.run(
|
||||
self._config.image,
|
||||
detach=True,
|
||||
labels=self._labels(session_id, chat_id, expires_at),
|
||||
mounts=self._mounts(
|
||||
chat_path,
|
||||
dependencies_path,
|
||||
lambda_tools_path,
|
||||
),
|
||||
)
|
||||
except (DockerException, OSError, ValueError) as exc:
|
||||
raise SandboxStartError(str(chat_id)) from exc
|
||||
|
||||
return SandboxSession(
|
||||
session_id=session_id,
|
||||
chat_id=chat_id,
|
||||
container_id=container_id,
|
||||
status=SandboxStatus.RUNNING,
|
||||
created_at=created_at,
|
||||
expires_at=expires_at,
|
||||
)
|
||||
container_id = str(getattr(container, 'id', '')).strip()
|
||||
if not container_id:
|
||||
raise SandboxStartError(str(chat_id))
|
||||
|
||||
result = 'created'
|
||||
span.set_attribute('container.id', container_id)
|
||||
span.set_attribute('sandbox.result', result)
|
||||
return SandboxSession(
|
||||
session_id=session_id,
|
||||
chat_id=chat_id,
|
||||
container_id=container_id,
|
||||
status=SandboxStatus.RUNNING,
|
||||
created_at=created_at,
|
||||
expires_at=expires_at,
|
||||
)
|
||||
except Exception as exc:
|
||||
span.set_attribute('sandbox.result', result)
|
||||
span.record_error(exc)
|
||||
self._metrics.increment(
|
||||
'sandbox.runtime.error.total',
|
||||
attrs=_runtime_error_metric_attrs('create', _error_type(exc)),
|
||||
)
|
||||
raise
|
||||
finally:
|
||||
self._metrics.record(
|
||||
'sandbox.runtime.create.duration_ms',
|
||||
_duration_ms(started_at),
|
||||
attrs=_runtime_metric_attrs('create', result),
|
||||
)
|
||||
|
||||
def stop(self, container_id: str) -> None:
    """Stop the sandbox container identified by *container_id*.

    The call runs inside an ``adapter.docker.stop_sandbox`` span and always
    records ``sandbox.runtime.stop.duration_ms`` with a ``result`` of
    ``stopped``, ``not_found`` or ``error``.  A container Docker no longer
    knows about is treated as already stopped and is not an error.

    Args:
        container_id: Docker identifier of the container to stop.

    Raises:
        SandboxError: ``sandbox_stop_failed`` when Docker reports any
            failure other than the container being absent.
    """
    started_at = time.perf_counter()
    # Pessimistic default: only overwritten once an outcome is known, so the
    # duration metric in ``finally`` is labelled correctly on every path.
    result = 'error'

    with self._tracer.start_span(
        'adapter.docker.stop_sandbox',
        attrs={'container.id': container_id},
    ) as span:
        try:
            container = self._client.containers.get(container_id)
            # Enrich the span with session/chat ids read from the labels.
            _set_span_container_attrs(span, container)
            container.stop()
            result = 'stopped'
            span.set_attribute('sandbox.result', result)
        except NotFound:
            result = 'not_found'
            span.set_attribute('sandbox.result', result)
            return
        except DockerException as exc:
            span.set_attribute('sandbox.result', result)
            span.record_error(exc)
            self._metrics.increment(
                'sandbox.runtime.error.total',
                attrs=_runtime_error_metric_attrs('stop', type(exc).__name__),
            )
            raise SandboxError('sandbox_stop_failed') from exc
        finally:
            self._metrics.record(
                'sandbox.runtime.stop.duration_ms',
                _duration_ms(started_at),
                attrs=_runtime_metric_attrs('stop', result),
            )
|
||||
|
||||
def list_active_sessions(self) -> list[SandboxSession]:
    """Return the sandbox sessions reconstructed from labelled containers.

    Containers are selected with a label filter built from
    ``SANDBOX_LABELS``; each is converted with ``_session_from_container``
    and containers for which that returns ``None`` are skipped.  The call
    runs inside an ``adapter.docker.list_active_sandboxes`` span and always
    records ``sandbox.runtime.list_active.duration_ms`` with a ``result`` of
    ``listed`` or ``error``.

    Returns:
        The sessions that could be rebuilt, in the order Docker listed them.

    Raises:
        SandboxError: ``sandbox_list_failed`` when the Docker list call fails.
    """
    started_at = time.perf_counter()
    # Pessimistic default so the duration metric in ``finally`` is labelled
    # ``error`` unless the listing completes.
    result = 'error'

    with self._tracer.start_span(
        'adapter.docker.list_active_sandboxes',
    ) as span:
        try:
            try:
                containers = self._client.containers.list(
                    filters={'label': list(SANDBOX_LABELS)}
                )
            except DockerException as exc:
                raise SandboxError('sandbox_list_failed') from exc

            sessions: list[SandboxSession] = []
            for container in containers:
                session = self._session_from_container(container)
                if session is not None:
                    sessions.append(session)

            result = 'listed'
            span.set_attribute('sandbox.container_count', len(containers))
            span.set_attribute('sandbox.active_count', len(sessions))
            span.set_attribute('sandbox.result', result)
            return sessions
        except Exception as exc:
            # Observability boundary: record the failure, then re-raise it
            # unchanged so callers see the original exception.
            span.set_attribute('sandbox.result', result)
            span.record_error(exc)
            self._metrics.increment(
                'sandbox.runtime.error.total',
                attrs=_runtime_error_metric_attrs(
                    'list_active',
                    _error_type(exc),
                ),
            )
            raise
        finally:
            self._metrics.record(
                'sandbox.runtime.list_active.duration_ms',
                _duration_ms(started_at),
                attrs=_runtime_metric_attrs('list_active', result),
            )
|
||||
|
||||
def _labels(
|
||||
self,
|
||||
|
|
@ -186,3 +272,44 @@ class DockerSandboxRuntime(SandboxRuntime):
|
|||
def _parse_datetime(value: str) -> datetime:
    """Parse an ISO-8601 timestamp, accepting a trailing ``Z`` as UTC.

    Args:
        value: ISO-8601 text, optionally ending in ``Z``.

    Returns:
        The parsed datetime; it is timezone-aware when the text carries an
        offset or a trailing ``Z``, and naive otherwise.

    Raises:
        ValueError: If ``datetime.fromisoformat`` rejects the text.
    """
    # ``fromisoformat`` only understands the ``Z`` designator from Python
    # 3.11 on, so rewrite it as an explicit ``+00:00`` offset first.
    normalized = f'{value[:-1]}+00:00' if value.endswith('Z') else value
    return datetime.fromisoformat(normalized)
|
||||
|
||||
|
||||
def _duration_ms(started_at: float) -> float:
    """Return the milliseconds elapsed since *started_at*.

    Args:
        started_at: A reading previously taken from ``time.perf_counter()``.

    Returns:
        The elapsed time in milliseconds as a float.
    """
    return (time.perf_counter() - started_at) * 1000
|
||||
|
||||
|
||||
def _runtime_metric_attrs(operation: str, result: str) -> dict[str, str]:
    """Build the attribute set attached to a runtime duration metric.

    Args:
        operation: Name of the runtime operation (e.g. ``'create'``).
        result: Outcome label of that operation (e.g. ``'error'``).

    Returns:
        A new dict with the keys ``operation`` and ``result``.
    """
    return {
        'operation': operation,
        'result': result,
    }
|
||||
|
||||
|
||||
def _runtime_error_metric_attrs(
    operation: str,
    error_type: str,
) -> dict[str, str]:
    """Build the attribute set attached to a runtime error counter.

    Args:
        operation: Name of the runtime operation that failed.
        error_type: Class name of the exception that caused the failure.

    Returns:
        A new dict with the keys ``operation`` and ``error.type``.
    """
    return {
        'operation': operation,
        'error.type': error_type,
    }
|
||||
|
||||
|
||||
def _error_type(error: Exception) -> str:
    """Return the class name used to label *error* in error metrics.

    When *error* was raised with ``raise ... from cause`` and the cause is an
    ``Exception``, the cause's class name is returned, so a wrapping error is
    labelled by the underlying failure.  Otherwise the class name of *error*
    itself is returned.

    Args:
        error: The exception that was caught.

    Returns:
        The chosen exception class name.
    """
    if isinstance(error.__cause__, Exception):
        return type(error.__cause__).__name__
    return type(error).__name__
|
||||
|
||||
|
||||
def _set_span_container_attrs(span: Span, container: object) -> None:
    """Copy the sandbox identity labels of *container* onto *span*.

    Reads the container's ``labels`` attribute and, for each of the
    ``session_id`` and ``chat_id`` labels that holds a non-empty string, sets
    the matching ``session.id`` / ``chat.id`` span attribute.  Nothing is set
    when ``labels`` is missing or is not a dict.

    Args:
        span: Span that receives the attributes.
        container: Object that may expose a ``labels`` mapping.
    """
    labels = getattr(container, 'labels', None)
    if not isinstance(labels, dict):
        return

    # (container label, span attribute) pairs, applied in this order.
    for label, attribute in (
        ('session_id', 'session.id'),
        ('chat_id', 'chat.id'),
    ):
        value = labels.get(label)
        if isinstance(value, str) and value:
            span.set_attribute(attribute, value)
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@ from typing import Protocol
|
|||
from uuid import UUID
|
||||
|
||||
from domain.sandbox import SandboxSession
|
||||
from usecase.interface import Logger
|
||||
from usecase.interface import Logger, Metrics, Tracer
|
||||
|
||||
|
||||
class SandboxSessionStateSource(Protocol):
|
||||
|
|
@ -13,27 +13,45 @@ class SandboxSessionStateSource(Protocol):
|
|||
class SandboxSessionRegistry(Protocol):
    """Structural interface of the session store the reconciler writes to.

    The reconciler in this module calls ``replace_all`` with the sessions it
    kept and then reads ``count_active`` to publish the
    ``sandbox.active.count`` gauge.
    """

    def replace_all(self, sessions: list[SandboxSession]) -> None:
        """Receive the full list of sessions produced by a reconciliation."""
        ...

    def count_active(self) -> int:
        """Return the session count the reconciler reports as active."""
        ...
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
|
||||
class SandboxSessionReconciler:
|
||||
state_source: SandboxSessionStateSource
|
||||
registry: SandboxSessionRegistry
|
||||
logger: Logger
|
||||
metrics: Metrics
|
||||
tracer: Tracer
|
||||
|
||||
def execute(self) -> list[SandboxSession]:
|
||||
sessions_by_chat_id: dict[UUID, SandboxSession] = {}
|
||||
for session in sorted(
|
||||
self.state_source.list_active_sessions(),
|
||||
key=lambda item: item.created_at,
|
||||
):
|
||||
sessions_by_chat_id[session.chat_id] = session
|
||||
with self.tracer.start_span(
|
||||
'adapter.sandbox.reconcile_sessions',
|
||||
) as span:
|
||||
try:
|
||||
sessions_by_chat_id: dict[UUID, SandboxSession] = {}
|
||||
discovered_sessions = self.state_source.list_active_sessions()
|
||||
span.set_attribute('sandbox.discovered_count', len(discovered_sessions))
|
||||
for session in sorted(
|
||||
discovered_sessions,
|
||||
key=lambda item: item.created_at,
|
||||
):
|
||||
sessions_by_chat_id[session.chat_id] = session
|
||||
|
||||
sessions = list(sessions_by_chat_id.values())
|
||||
self.registry.replace_all(sessions)
|
||||
self.logger.info(
|
||||
'sandbox_reconciled',
|
||||
attrs={
|
||||
'session_count': len(sessions),
|
||||
},
|
||||
)
|
||||
return sessions
|
||||
sessions = list(sessions_by_chat_id.values())
|
||||
self.registry.replace_all(sessions)
|
||||
active_count = self.registry.count_active()
|
||||
self.metrics.set('sandbox.active.count', active_count)
|
||||
span.set_attribute('sandbox.active_count', active_count)
|
||||
span.set_attribute('sandbox.result', 'reconciled')
|
||||
self.logger.info(
|
||||
'sandbox_reconciled',
|
||||
attrs={
|
||||
'session_count': active_count,
|
||||
},
|
||||
)
|
||||
return sessions
|
||||
except Exception as exc:
|
||||
span.set_attribute('sandbox.result', 'error')
|
||||
span.record_error(exc)
|
||||
raise
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue