instrument sandbox docker runtime

This commit is contained in:
Azamat 2026-04-03 01:15:23 +03:00
parent 4cdf6e45de
commit 8d3a080d45
6 changed files with 411 additions and 73 deletions

View file

@ -81,11 +81,18 @@ def build_container(
sandbox_repository = InMemorySandboxSessionRepository()
sandbox_locker = ProcessLocalSandboxLifecycleLocker()
sandbox_runtime = DockerSandboxRuntime(app_config.sandbox, docker_client)
sandbox_runtime = DockerSandboxRuntime(
app_config.sandbox,
docker_client,
observability.metrics,
observability.tracer,
)
sandbox_reconciler = SandboxSessionReconciler(
state_source=sandbox_runtime,
registry=sandbox_repository,
logger=observability.logger,
metrics=observability.metrics,
tracer=observability.tracer,
)
repositories = AppRepositories(sandbox_session=sandbox_repository)

View file

@ -1,3 +1,4 @@
import time
from datetime import datetime
from pathlib import Path
from uuid import UUID
@ -9,7 +10,7 @@ from docker.types import Mount
from adapter.config.model import SandboxConfig
from domain.error import SandboxError, SandboxStartError
from domain.sandbox import SandboxSession, SandboxStatus
from usecase.interface import SandboxRuntime
from usecase.interface import Metrics, SandboxRuntime, Span, Tracer
SANDBOX_LABELS = ('session_id', 'chat_id', 'expires_at')
@ -19,9 +20,13 @@ class DockerSandboxRuntime(SandboxRuntime):
self,
config: SandboxConfig,
client: DockerClient,
metrics: Metrics,
tracer: Tracer,
) -> None:
self._config = config
self._client = client
self._metrics = metrics
self._tracer = tracer
def create(
self,
@ -31,6 +36,17 @@ class DockerSandboxRuntime(SandboxRuntime):
created_at: datetime,
expires_at: datetime,
) -> SandboxSession:
started_at = time.perf_counter()
result = 'error'
with self._tracer.start_span(
'adapter.docker.create_sandbox',
attrs={
'chat.id': str(chat_id),
'session.id': str(session_id),
},
) as span:
try:
try:
chat_path = self._chat_path(chat_id)
dependencies_path = self._readonly_host_path(
@ -44,7 +60,11 @@ class DockerSandboxRuntime(SandboxRuntime):
self._config.image,
detach=True,
labels=self._labels(session_id, chat_id, expires_at),
mounts=self._mounts(chat_path, dependencies_path, lambda_tools_path),
mounts=self._mounts(
chat_path,
dependencies_path,
lambda_tools_path,
),
)
except (DockerException, OSError, ValueError) as exc:
raise SandboxStartError(str(chat_id)) from exc
@ -53,6 +73,9 @@ class DockerSandboxRuntime(SandboxRuntime):
if not container_id:
raise SandboxStartError(str(chat_id))
result = 'created'
span.set_attribute('container.id', container_id)
span.set_attribute('sandbox.result', result)
return SandboxSession(
session_id=session_id,
chat_id=chat_id,
@ -61,17 +84,62 @@ class DockerSandboxRuntime(SandboxRuntime):
created_at=created_at,
expires_at=expires_at,
)
except Exception as exc:
span.set_attribute('sandbox.result', result)
span.record_error(exc)
self._metrics.increment(
'sandbox.runtime.error.total',
attrs=_runtime_error_metric_attrs('create', _error_type(exc)),
)
raise
finally:
self._metrics.record(
'sandbox.runtime.create.duration_ms',
_duration_ms(started_at),
attrs=_runtime_metric_attrs('create', result),
)
def stop(self, container_id: str) -> None:
started_at = time.perf_counter()
result = 'error'
with self._tracer.start_span(
'adapter.docker.stop_sandbox',
attrs={'container.id': container_id},
) as span:
try:
container = self._client.containers.get(container_id)
_set_span_container_attrs(span, container)
container.stop()
result = 'stopped'
span.set_attribute('sandbox.result', result)
except NotFound:
result = 'not_found'
span.set_attribute('sandbox.result', result)
return
except DockerException as exc:
span.set_attribute('sandbox.result', result)
span.record_error(exc)
self._metrics.increment(
'sandbox.runtime.error.total',
attrs=_runtime_error_metric_attrs('stop', type(exc).__name__),
)
raise SandboxError('sandbox_stop_failed') from exc
finally:
self._metrics.record(
'sandbox.runtime.stop.duration_ms',
_duration_ms(started_at),
attrs=_runtime_metric_attrs('stop', result),
)
def list_active_sessions(self) -> list[SandboxSession]:
started_at = time.perf_counter()
result = 'error'
with self._tracer.start_span(
'adapter.docker.list_active_sandboxes',
) as span:
try:
try:
containers = self._client.containers.list(
filters={'label': list(SANDBOX_LABELS)}
@ -86,7 +154,25 @@ class DockerSandboxRuntime(SandboxRuntime):
continue
sessions.append(session)
result = 'listed'
span.set_attribute('sandbox.container_count', len(containers))
span.set_attribute('sandbox.active_count', len(sessions))
span.set_attribute('sandbox.result', result)
return sessions
except Exception as exc:
span.set_attribute('sandbox.result', result)
span.record_error(exc)
self._metrics.increment(
'sandbox.runtime.error.total',
attrs=_runtime_error_metric_attrs('list_active', _error_type(exc)),
)
raise
finally:
self._metrics.record(
'sandbox.runtime.list_active.duration_ms',
_duration_ms(started_at),
attrs=_runtime_metric_attrs('list_active', result),
)
def _labels(
self,
@ -186,3 +272,44 @@ class DockerSandboxRuntime(SandboxRuntime):
def _parse_datetime(value: str) -> datetime:
normalized = f'{value[:-1]}+00:00' if value.endswith('Z') else value
return datetime.fromisoformat(normalized)
def _duration_ms(started_at: float) -> float:
return (time.perf_counter() - started_at) * 1000
def _runtime_metric_attrs(operation: str, result: str) -> dict[str, str]:
return {
'operation': operation,
'result': result,
}
def _runtime_error_metric_attrs(
operation: str,
error_type: str,
) -> dict[str, str]:
return {
'operation': operation,
'error.type': error_type,
}
def _error_type(error: Exception) -> str:
if isinstance(error.__cause__, Exception):
return type(error.__cause__).__name__
return type(error).__name__
def _set_span_container_attrs(span: Span, container: object) -> None:
labels = getattr(container, 'labels', None)
if not isinstance(labels, dict):
return
session_id = labels.get('session_id')
if isinstance(session_id, str) and session_id:
span.set_attribute('session.id', session_id)
chat_id = labels.get('chat_id')
if isinstance(chat_id, str) and chat_id:
span.set_attribute('chat.id', chat_id)

View file

@ -3,7 +3,7 @@ from typing import Protocol
from uuid import UUID
from domain.sandbox import SandboxSession
from usecase.interface import Logger
from usecase.interface import Logger, Metrics, Tracer
class SandboxSessionStateSource(Protocol):
@ -13,27 +13,45 @@ class SandboxSessionStateSource(Protocol):
class SandboxSessionRegistry(Protocol):
def replace_all(self, sessions: list[SandboxSession]) -> None: ...
def count_active(self) -> int: ...
@dataclass(frozen=True, slots=True)
class SandboxSessionReconciler:
state_source: SandboxSessionStateSource
registry: SandboxSessionRegistry
logger: Logger
metrics: Metrics
tracer: Tracer
def execute(self) -> list[SandboxSession]:
with self.tracer.start_span(
'adapter.sandbox.reconcile_sessions',
) as span:
try:
sessions_by_chat_id: dict[UUID, SandboxSession] = {}
discovered_sessions = self.state_source.list_active_sessions()
span.set_attribute('sandbox.discovered_count', len(discovered_sessions))
for session in sorted(
self.state_source.list_active_sessions(),
discovered_sessions,
key=lambda item: item.created_at,
):
sessions_by_chat_id[session.chat_id] = session
sessions = list(sessions_by_chat_id.values())
self.registry.replace_all(sessions)
active_count = self.registry.count_active()
self.metrics.set('sandbox.active.count', active_count)
span.set_attribute('sandbox.active_count', active_count)
span.set_attribute('sandbox.result', 'reconciled')
self.logger.info(
'sandbox_reconciled',
attrs={
'session_count': len(sessions),
'session_count': active_count,
},
)
return sessions
except Exception as exc:
span.set_attribute('sandbox.result', 'error')
span.record_error(exc)
raise

View file

@ -257,7 +257,7 @@
### M21. Трейсы и runtime metrics в Docker adapter и reconciliation
- Субагент: `feature-developer`
- Статус: pending
- Статус: completed
- Зависимости: `M19`
- Commit required: yes
- Commit message: `instrument sandbox docker runtime`

View file

@ -7,6 +7,7 @@ from docker import DockerClient
from fastapi import FastAPI
from starlette.types import Message, Scope
import adapter.di.container as container_module
from adapter.config.model import (
AppConfig,
AppSectionConfig,
@ -20,6 +21,7 @@ from adapter.config.model import (
TracingConfig,
)
from adapter.di.container import AppContainer, AppRepositories, AppUsecases
from adapter.docker.runtime import DockerSandboxRuntime
from adapter.http.fastapi import app as app_module
from adapter.observability.noop import NoopMetrics, NoopTracer
from adapter.observability.runtime import ObservabilityRuntime
@ -80,7 +82,8 @@ class FakeCleanupExpiredSandboxes(CleanupExpiredSandboxes):
class FakeDockerClient(DockerClient):
def __init__(self) -> None:
def __init__(self, base_url: str | None = None) -> None:
self.base_url = base_url
self.close_calls = 0
def close(self) -> None:
@ -104,6 +107,79 @@ class FakeClock:
return self._now
class RecordingMetrics:
def __init__(self) -> None:
self.increment_calls: list[tuple[str, int, Attrs | None]] = []
self.record_calls: list[tuple[str, float, Attrs | None]] = []
self.set_calls: list[tuple[str, int | float, Attrs | None]] = []
def increment(
self,
name: str,
value: int = 1,
attrs: Attrs | None = None,
) -> None:
self.increment_calls.append((name, value, attrs))
def record(
self,
name: str,
value: float,
attrs: Attrs | None = None,
) -> None:
self.record_calls.append((name, value, attrs))
def set(
self,
name: str,
value: int | float,
attrs: Attrs | None = None,
) -> None:
self.set_calls.append((name, value, attrs))
class RecordingSpan:
def __init__(self) -> None:
self.attrs: dict[str, str | int | float | bool] = {}
self.errors: list[Exception] = []
def set_attribute(self, name: str, value: str | int | float | bool) -> None:
self.attrs[name] = value
def record_error(self, error: Exception) -> None:
self.errors.append(error)
class RecordingSpanContext:
def __init__(self, span: RecordingSpan) -> None:
self._span = span
def __enter__(self) -> RecordingSpan:
return self._span
def __exit__(
self,
exc_type: type[BaseException] | None,
exc: BaseException | None,
traceback: object,
) -> bool | None:
return None
class RecordingTracer:
def __init__(self) -> None:
self.spans: list[tuple[str, Attrs | None, RecordingSpan]] = []
def start_span(
self,
name: str,
attrs: Attrs | None = None,
) -> RecordingSpanContext:
span = RecordingSpan()
self.spans.append((name, attrs, span))
return RecordingSpanContext(span)
class FakeLifecycleRuntime:
def __init__(self, sessions: list[SandboxSession]) -> None:
self._sessions = list(sessions)
@ -142,6 +218,26 @@ class FakeLifecycleRuntime:
self.stop_calls.append(container_id)
class FixedSandboxState:
def __init__(self, sessions: list[SandboxSession]) -> None:
self._sessions = list(sessions)
def list_active_sessions(self) -> list[SandboxSession]:
return list(self._sessions)
class CountingRegistry:
def __init__(self, count_active_result: int) -> None:
self._count_active_result = count_active_result
self.replaced_sessions: list[SandboxSession] = []
def replace_all(self, sessions: list[SandboxSession]) -> None:
self.replaced_sessions = list(sessions)
def count_active(self) -> int:
return self._count_active_result
def build_config() -> AppConfig:
return AppConfig(
app=AppSectionConfig(name='master', env='test'),
@ -198,6 +294,8 @@ def build_container(
state_source=EmptySandboxState(),
registry=repositories.sandbox_session,
logger=logger,
metrics=observability.metrics,
tracer=observability.tracer,
)
usecases = AppUsecases(
create_sandbox=create_sandbox_usecase,
@ -494,6 +592,8 @@ def test_startup_reconciliation_reuses_existing_container_after_restart(
state_source=runtime,
registry=repository,
logger=logger,
metrics=observability.metrics,
tracer=observability.tracer,
)
usecases = AppUsecases(
create_sandbox=CreateSandbox(
@ -586,3 +686,76 @@ def test_removed_user_endpoint_returns_not_found(monkeypatch) -> None:
assert status_code == 404
assert response == {'detail': 'Not Found'}
assert docker_client.close_calls == 1
def test_reconciliation_uses_registry_backed_active_count_metric() -> None:
logger = FakeLogger()
metrics = RecordingMetrics()
tracer = RecordingTracer()
created_at = datetime(2026, 4, 2, 12, 0, tzinfo=UTC)
session = SandboxSession(
session_id=SESSION_ID,
chat_id=CHAT_ID,
container_id='container-123',
status=SandboxStatus.RUNNING,
created_at=created_at,
expires_at=created_at + timedelta(minutes=5),
)
registry = CountingRegistry(count_active_result=7)
reconciler = SandboxSessionReconciler(
state_source=FixedSandboxState([session]),
registry=registry,
logger=logger,
metrics=metrics,
tracer=tracer,
)
sessions = reconciler.execute()
assert sessions == [session]
assert registry.replaced_sessions == [session]
assert metrics.set_calls == [('sandbox.active.count', 7, None)]
assert tracer.spans[0][0] == 'adapter.sandbox.reconcile_sessions'
assert tracer.spans[0][2].attrs['sandbox.active_count'] == 7
def test_build_container_wires_observability_into_runtime_and_reconciler(
monkeypatch,
) -> None:
logger = FakeLogger()
metrics = RecordingMetrics()
tracer = RecordingTracer()
observability = ObservabilityRuntime(
logger=logger,
metrics=metrics,
tracer=tracer,
)
docker_client = FakeDockerClient()
monkeypatch.setattr(
container_module, 'build_observability', lambda config: observability
)
monkeypatch.setattr(
container_module.docker,
'DockerClient',
lambda base_url: docker_client,
)
container = container_module.build_container(config=build_config())
runtime = container.sandbox_reconciler.state_source
assert isinstance(runtime, DockerSandboxRuntime)
assert runtime._metrics is metrics
assert runtime._tracer is tracer
assert container.sandbox_reconciler.metrics is metrics
assert container.sandbox_reconciler.tracer is tracer
assert container.usecases.create_sandbox._runtime is runtime
assert container.usecases.create_sandbox._metrics is metrics
assert container.usecases.create_sandbox._tracer is tracer
assert container.usecases.cleanup_expired_sandboxes._runtime is runtime
assert container.usecases.cleanup_expired_sandboxes._metrics is metrics
assert container.usecases.cleanup_expired_sandboxes._tracer is tracer
assert container._docker_client is docker_client
container.shutdown()
assert docker_client.close_calls == 1

View file

@ -10,6 +10,7 @@ from docker.types import Mount
from adapter.config.model import SandboxConfig
from adapter.docker.runtime import DockerSandboxRuntime
from adapter.observability.noop import NoopMetrics, NoopTracer
from domain.error import SandboxError, SandboxStartError
from domain.sandbox import SandboxSession, SandboxStatus
@ -116,6 +117,18 @@ def build_config(tmp_path: Path) -> SandboxConfig:
)
def build_runtime(
config: SandboxConfig,
containers: FakeContainers,
) -> DockerSandboxRuntime:
return DockerSandboxRuntime(
config,
FakeDockerClient(containers),
NoopMetrics(),
NoopTracer(),
)
def test_runtime_create_applies_mount_policy_and_labels_with_canonical_chat_id(
tmp_path: Path,
) -> None:
@ -123,7 +136,7 @@ def test_runtime_create_applies_mount_policy_and_labels_with_canonical_chat_id(
(tmp_path / 'dependencies').mkdir()
(tmp_path / 'lambda-tools').mkdir()
containers = FakeContainers()
runtime = DockerSandboxRuntime(config, FakeDockerClient(containers))
runtime = build_runtime(config, containers)
created_at = datetime(2026, 4, 2, 12, 0, tzinfo=UTC)
expires_at = created_at + timedelta(minutes=5)
@ -181,7 +194,7 @@ def test_runtime_create_raises_start_error_when_container_id_is_missing(
(tmp_path / 'dependencies').mkdir()
(tmp_path / 'lambda-tools').mkdir()
containers = FakeContainers(run_result=FakeContainer(''))
runtime = DockerSandboxRuntime(config, FakeDockerClient(containers))
runtime = build_runtime(config, containers)
with pytest.raises(SandboxStartError) as excinfo:
runtime.create(
@ -199,7 +212,7 @@ def test_runtime_stop_ignores_missing_container(tmp_path: Path) -> None:
config = build_config(tmp_path)
containers = FakeContainers()
containers.get_result = NotFound('missing')
runtime = DockerSandboxRuntime(config, FakeDockerClient(containers))
runtime = build_runtime(config, containers)
runtime.stop('container-123')
@ -210,7 +223,7 @@ def test_runtime_stop_wraps_docker_errors(tmp_path: Path) -> None:
config = build_config(tmp_path)
containers = FakeContainers()
containers.get_result = DockerException('boom')
runtime = DockerSandboxRuntime(config, FakeDockerClient(containers))
runtime = build_runtime(config, containers)
with pytest.raises(SandboxError) as excinfo:
runtime.stop('container-123')
@ -243,7 +256,7 @@ def test_runtime_list_active_sessions_reads_valid_labeled_containers(
created_at='2026-04-02T12:01:00Z',
),
]
runtime = DockerSandboxRuntime(config, FakeDockerClient(containers))
runtime = build_runtime(config, containers)
sessions = runtime.list_active_sessions()