instrument sandbox usecases
This commit is contained in:
parent
a86e1ee8c7
commit
4cdf6e45de
5 changed files with 191 additions and 49 deletions
|
|
@ -96,6 +96,8 @@ def build_container(
|
||||||
runtime=sandbox_runtime,
|
runtime=sandbox_runtime,
|
||||||
clock=clock,
|
clock=clock,
|
||||||
logger=observability.logger,
|
logger=observability.logger,
|
||||||
|
metrics=observability.metrics,
|
||||||
|
tracer=observability.tracer,
|
||||||
ttl=timedelta(seconds=app_config.sandbox.ttl_seconds),
|
ttl=timedelta(seconds=app_config.sandbox.ttl_seconds),
|
||||||
),
|
),
|
||||||
cleanup_expired_sandboxes=CleanupExpiredSandboxes(
|
cleanup_expired_sandboxes=CleanupExpiredSandboxes(
|
||||||
|
|
@ -104,6 +106,8 @@ def build_container(
|
||||||
runtime=sandbox_runtime,
|
runtime=sandbox_runtime,
|
||||||
clock=clock,
|
clock=clock,
|
||||||
logger=observability.logger,
|
logger=observability.logger,
|
||||||
|
metrics=observability.metrics,
|
||||||
|
tracer=observability.tracer,
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
||||||
2
tasks.md
2
tasks.md
|
|
@ -245,7 +245,7 @@
|
||||||
### M20. Трейсы и метрики в sandbox usecases
|
### M20. Трейсы и метрики в sandbox usecases
|
||||||
|
|
||||||
- Субагент: `feature-developer`
|
- Субагент: `feature-developer`
|
||||||
- Статус: pending
|
- Статус: completed
|
||||||
- Зависимости: `M19`
|
- Зависимости: `M19`
|
||||||
- Commit required: yes
|
- Commit required: yes
|
||||||
- Commit message: `instrument sandbox usecases`
|
- Commit message: `instrument sandbox usecases`
|
||||||
|
|
|
||||||
|
|
@ -502,6 +502,8 @@ def test_startup_reconciliation_reuses_existing_container_after_restart(
|
||||||
runtime=runtime,
|
runtime=runtime,
|
||||||
clock=FakeClock(created_at),
|
clock=FakeClock(created_at),
|
||||||
logger=logger,
|
logger=logger,
|
||||||
|
metrics=NoopMetrics(),
|
||||||
|
tracer=NoopTracer(),
|
||||||
ttl=timedelta(minutes=5),
|
ttl=timedelta(minutes=5),
|
||||||
),
|
),
|
||||||
cleanup_expired_sandboxes=CleanupExpiredSandboxes(
|
cleanup_expired_sandboxes=CleanupExpiredSandboxes(
|
||||||
|
|
@ -510,6 +512,8 @@ def test_startup_reconciliation_reuses_existing_container_after_restart(
|
||||||
runtime=runtime,
|
runtime=runtime,
|
||||||
clock=FakeClock(created_at),
|
clock=FakeClock(created_at),
|
||||||
logger=logger,
|
logger=logger,
|
||||||
|
metrics=NoopMetrics(),
|
||||||
|
tracer=NoopTracer(),
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
container = AppContainer(
|
container = AppContainer(
|
||||||
|
|
|
||||||
|
|
@ -2,6 +2,7 @@ import threading
|
||||||
from datetime import UTC, datetime, timedelta
|
from datetime import UTC, datetime, timedelta
|
||||||
from uuid import UUID
|
from uuid import UUID
|
||||||
|
|
||||||
|
from adapter.observability.noop import NoopMetrics, NoopTracer
|
||||||
from domain.sandbox import SandboxSession, SandboxStatus
|
from domain.sandbox import SandboxSession, SandboxStatus
|
||||||
from repository.sandbox_lock import ProcessLocalSandboxLifecycleLocker
|
from repository.sandbox_lock import ProcessLocalSandboxLifecycleLocker
|
||||||
from repository.sandbox_session import InMemorySandboxSessionRepository
|
from repository.sandbox_session import InMemorySandboxSessionRepository
|
||||||
|
|
@ -218,6 +219,8 @@ def test_create_sandbox_reuses_active_session_when_not_expired() -> None:
|
||||||
runtime=runtime,
|
runtime=runtime,
|
||||||
clock=FakeClock(now),
|
clock=FakeClock(now),
|
||||||
logger=logger,
|
logger=logger,
|
||||||
|
metrics=NoopMetrics(),
|
||||||
|
tracer=NoopTracer(),
|
||||||
ttl=timedelta(minutes=5),
|
ttl=timedelta(minutes=5),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -264,6 +267,8 @@ def test_create_sandbox_replaces_expired_session_and_creates_new_one(
|
||||||
runtime=runtime,
|
runtime=runtime,
|
||||||
clock=FakeClock(now),
|
clock=FakeClock(now),
|
||||||
logger=logger,
|
logger=logger,
|
||||||
|
metrics=NoopMetrics(),
|
||||||
|
tracer=NoopTracer(),
|
||||||
ttl=timedelta(minutes=5),
|
ttl=timedelta(minutes=5),
|
||||||
)
|
)
|
||||||
monkeypatch.setattr('usecase.sandbox._new_session_id', lambda: SESSION_NEW_ID)
|
monkeypatch.setattr('usecase.sandbox._new_session_id', lambda: SESSION_NEW_ID)
|
||||||
|
|
@ -323,6 +328,8 @@ def test_create_sandbox_creates_new_session_when_none_exists() -> None:
|
||||||
runtime=runtime,
|
runtime=runtime,
|
||||||
clock=FakeClock(now),
|
clock=FakeClock(now),
|
||||||
logger=logger,
|
logger=logger,
|
||||||
|
metrics=NoopMetrics(),
|
||||||
|
tracer=NoopTracer(),
|
||||||
ttl=timedelta(minutes=5),
|
ttl=timedelta(minutes=5),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -370,6 +377,8 @@ def test_create_sandbox_serializes_duplicate_concurrent_create_for_chat_id(
|
||||||
runtime=runtime,
|
runtime=runtime,
|
||||||
clock=FakeClock(now),
|
clock=FakeClock(now),
|
||||||
logger=logger,
|
logger=logger,
|
||||||
|
metrics=NoopMetrics(),
|
||||||
|
tracer=NoopTracer(),
|
||||||
ttl=timedelta(minutes=5),
|
ttl=timedelta(minutes=5),
|
||||||
)
|
)
|
||||||
monkeypatch.setattr('usecase.sandbox._new_session_id', lambda: SESSION_NEW_ID)
|
monkeypatch.setattr('usecase.sandbox._new_session_id', lambda: SESSION_NEW_ID)
|
||||||
|
|
@ -473,6 +482,8 @@ def test_cleanup_expired_sandboxes_stops_and_deletes_only_expired_sessions() ->
|
||||||
runtime=runtime,
|
runtime=runtime,
|
||||||
clock=FakeClock(now),
|
clock=FakeClock(now),
|
||||||
logger=logger,
|
logger=logger,
|
||||||
|
metrics=NoopMetrics(),
|
||||||
|
tracer=NoopTracer(),
|
||||||
)
|
)
|
||||||
|
|
||||||
result = usecase.execute()
|
result = usecase.execute()
|
||||||
|
|
@ -534,6 +545,8 @@ def test_cleanup_expired_sandboxes_skips_replaced_session_from_stale_snapshot()
|
||||||
runtime=runtime,
|
runtime=runtime,
|
||||||
clock=FakeClock(now),
|
clock=FakeClock(now),
|
||||||
logger=logger,
|
logger=logger,
|
||||||
|
metrics=NoopMetrics(),
|
||||||
|
tracer=NoopTracer(),
|
||||||
)
|
)
|
||||||
|
|
||||||
result = usecase.execute()
|
result = usecase.execute()
|
||||||
|
|
@ -575,6 +588,8 @@ def test_cleanup_expired_sandboxes_continues_after_stop_failure() -> None:
|
||||||
runtime=runtime,
|
runtime=runtime,
|
||||||
clock=FakeClock(now),
|
clock=FakeClock(now),
|
||||||
logger=logger,
|
logger=logger,
|
||||||
|
metrics=NoopMetrics(),
|
||||||
|
tracer=NoopTracer(),
|
||||||
)
|
)
|
||||||
|
|
||||||
result = usecase.execute()
|
result = usecase.execute()
|
||||||
|
|
|
||||||
|
|
@ -6,9 +6,11 @@ from domain.sandbox import SandboxSession
|
||||||
from usecase.interface import (
|
from usecase.interface import (
|
||||||
Clock,
|
Clock,
|
||||||
Logger,
|
Logger,
|
||||||
|
Metrics,
|
||||||
SandboxLifecycleLocker,
|
SandboxLifecycleLocker,
|
||||||
SandboxRuntime,
|
SandboxRuntime,
|
||||||
SandboxSessionRepository,
|
SandboxSessionRepository,
|
||||||
|
Tracer,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -25,6 +27,8 @@ class CreateSandbox:
|
||||||
runtime: SandboxRuntime,
|
runtime: SandboxRuntime,
|
||||||
clock: Clock,
|
clock: Clock,
|
||||||
logger: Logger,
|
logger: Logger,
|
||||||
|
metrics: Metrics,
|
||||||
|
tracer: Tracer,
|
||||||
ttl: timedelta,
|
ttl: timedelta,
|
||||||
) -> None:
|
) -> None:
|
||||||
self._repository = repository
|
self._repository = repository
|
||||||
|
|
@ -32,44 +36,80 @@ class CreateSandbox:
|
||||||
self._runtime = runtime
|
self._runtime = runtime
|
||||||
self._clock = clock
|
self._clock = clock
|
||||||
self._logger = logger
|
self._logger = logger
|
||||||
|
self._metrics = metrics
|
||||||
|
self._tracer = tracer
|
||||||
self._ttl = ttl
|
self._ttl = ttl
|
||||||
|
|
||||||
def execute(self, command: CreateSandboxCommand) -> SandboxSession:
|
def execute(self, command: CreateSandboxCommand) -> SandboxSession:
|
||||||
chat_id = command.chat_id
|
chat_id = command.chat_id
|
||||||
|
|
||||||
with self._locker.lock(chat_id):
|
with self._tracer.start_span(
|
||||||
session = self._repository.get_active_by_chat_id(chat_id)
|
'usecase.create_sandbox',
|
||||||
now = self._clock.now()
|
attrs={'chat.id': str(chat_id)},
|
||||||
|
) as span:
|
||||||
|
try:
|
||||||
|
with self._locker.lock(chat_id):
|
||||||
|
session = self._repository.get_active_by_chat_id(chat_id)
|
||||||
|
now = self._clock.now()
|
||||||
|
|
||||||
if session is not None and session.expires_at > now:
|
if session is not None and session.expires_at > now:
|
||||||
self._logger.info(
|
span.set_attribute('session.id', str(session.session_id))
|
||||||
'sandbox_reused',
|
span.set_attribute('container.id', session.container_id)
|
||||||
attrs=_sandbox_attrs(session),
|
span.set_attribute('sandbox.result', 'reused')
|
||||||
|
self._metrics.increment(
|
||||||
|
'sandbox.create.total',
|
||||||
|
attrs=_result_metric_attrs('reused'),
|
||||||
|
)
|
||||||
|
self._logger.info(
|
||||||
|
'sandbox_reused',
|
||||||
|
attrs=_sandbox_attrs(session),
|
||||||
|
)
|
||||||
|
return session
|
||||||
|
|
||||||
|
result = 'created'
|
||||||
|
if session is not None:
|
||||||
|
result = 'replaced'
|
||||||
|
span.set_attribute('session.id', str(session.session_id))
|
||||||
|
span.set_attribute('container.id', session.container_id)
|
||||||
|
self._logger.info(
|
||||||
|
'sandbox_replaced',
|
||||||
|
attrs=_sandbox_attrs(session),
|
||||||
|
)
|
||||||
|
self._runtime.stop(session.container_id)
|
||||||
|
self._repository.delete(session.session_id)
|
||||||
|
_set_active_count(self._metrics, self._repository)
|
||||||
|
|
||||||
|
created_at = self._clock.now()
|
||||||
|
expires_at = created_at + self._ttl
|
||||||
|
session_id = _new_session_id()
|
||||||
|
span.set_attribute('session.id', str(session_id))
|
||||||
|
new_session = self._runtime.create(
|
||||||
|
session_id=session_id,
|
||||||
|
chat_id=chat_id,
|
||||||
|
created_at=created_at,
|
||||||
|
expires_at=expires_at,
|
||||||
|
)
|
||||||
|
self._repository.save(new_session)
|
||||||
|
_set_active_count(self._metrics, self._repository)
|
||||||
|
span.set_attribute('container.id', new_session.container_id)
|
||||||
|
span.set_attribute('sandbox.result', result)
|
||||||
|
self._metrics.increment(
|
||||||
|
'sandbox.create.total',
|
||||||
|
attrs=_result_metric_attrs(result),
|
||||||
|
)
|
||||||
|
self._logger.info(
|
||||||
|
'sandbox_created',
|
||||||
|
attrs=_sandbox_attrs(new_session),
|
||||||
|
)
|
||||||
|
return new_session
|
||||||
|
except Exception as exc:
|
||||||
|
span.set_attribute('sandbox.result', 'error')
|
||||||
|
self._metrics.increment(
|
||||||
|
'sandbox.create.total',
|
||||||
|
attrs=_result_metric_attrs('error'),
|
||||||
)
|
)
|
||||||
return session
|
span.record_error(exc)
|
||||||
|
raise
|
||||||
if session is not None:
|
|
||||||
self._logger.info(
|
|
||||||
'sandbox_replaced',
|
|
||||||
attrs=_sandbox_attrs(session),
|
|
||||||
)
|
|
||||||
self._runtime.stop(session.container_id)
|
|
||||||
self._repository.delete(session.session_id)
|
|
||||||
|
|
||||||
created_at = self._clock.now()
|
|
||||||
expires_at = created_at + self._ttl
|
|
||||||
new_session = self._runtime.create(
|
|
||||||
session_id=_new_session_id(),
|
|
||||||
chat_id=chat_id,
|
|
||||||
created_at=created_at,
|
|
||||||
expires_at=expires_at,
|
|
||||||
)
|
|
||||||
self._repository.save(new_session)
|
|
||||||
self._logger.info(
|
|
||||||
'sandbox_created',
|
|
||||||
attrs=_sandbox_attrs(new_session),
|
|
||||||
)
|
|
||||||
return new_session
|
|
||||||
|
|
||||||
|
|
||||||
class CleanupExpiredSandboxes:
|
class CleanupExpiredSandboxes:
|
||||||
|
|
@ -80,39 +120,84 @@ class CleanupExpiredSandboxes:
|
||||||
runtime: SandboxRuntime,
|
runtime: SandboxRuntime,
|
||||||
clock: Clock,
|
clock: Clock,
|
||||||
logger: Logger,
|
logger: Logger,
|
||||||
|
metrics: Metrics,
|
||||||
|
tracer: Tracer,
|
||||||
) -> None:
|
) -> None:
|
||||||
self._repository = repository
|
self._repository = repository
|
||||||
self._locker = locker
|
self._locker = locker
|
||||||
self._runtime = runtime
|
self._runtime = runtime
|
||||||
self._clock = clock
|
self._clock = clock
|
||||||
self._logger = logger
|
self._logger = logger
|
||||||
|
self._metrics = metrics
|
||||||
|
self._tracer = tracer
|
||||||
|
|
||||||
def execute(self) -> list[SandboxSession]:
|
def execute(self) -> list[SandboxSession]:
|
||||||
expired_sessions = self._repository.list_expired(self._clock.now())
|
|
||||||
cleaned_sessions: list[SandboxSession] = []
|
cleaned_sessions: list[SandboxSession] = []
|
||||||
|
error_count = 0
|
||||||
|
|
||||||
for session in expired_sessions:
|
with self._tracer.start_span(
|
||||||
|
'usecase.cleanup_expired_sandboxes',
|
||||||
|
) as span:
|
||||||
try:
|
try:
|
||||||
cleaned_session = self._cleanup_session(session)
|
expired_sessions = self._repository.list_expired(self._clock.now())
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
attrs = _sandbox_attrs(session)
|
span.set_attribute('sandbox.result', 'error')
|
||||||
attrs['error'] = type(exc).__name__
|
self._metrics.increment(
|
||||||
self._logger.error(
|
'sandbox.cleanup.error.total',
|
||||||
'sandbox_clean_failed',
|
attrs=_cleanup_error_metric_attrs(
|
||||||
attrs=attrs,
|
type(exc).__name__,
|
||||||
|
'list_expired',
|
||||||
|
),
|
||||||
)
|
)
|
||||||
continue
|
span.record_error(exc)
|
||||||
|
raise
|
||||||
|
|
||||||
if cleaned_session is None:
|
span.set_attribute('sandbox.expired_count', len(expired_sessions))
|
||||||
continue
|
for session in expired_sessions:
|
||||||
|
with self._tracer.start_span(
|
||||||
|
'usecase.cleanup_expired_sandbox',
|
||||||
|
attrs=_sandbox_span_attrs(session),
|
||||||
|
) as cleanup_span:
|
||||||
|
try:
|
||||||
|
cleaned_session = self._cleanup_session(session)
|
||||||
|
except Exception as exc:
|
||||||
|
error_count += 1
|
||||||
|
cleanup_span.set_attribute('sandbox.result', 'error')
|
||||||
|
cleanup_span.record_error(exc)
|
||||||
|
self._metrics.increment(
|
||||||
|
'sandbox.cleanup.error.total',
|
||||||
|
attrs=_error_metric_attrs(type(exc).__name__),
|
||||||
|
)
|
||||||
|
attrs = _sandbox_attrs(session)
|
||||||
|
attrs['error'] = type(exc).__name__
|
||||||
|
self._logger.error(
|
||||||
|
'sandbox_clean_failed',
|
||||||
|
attrs=attrs,
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
|
||||||
cleaned_sessions.append(cleaned_session)
|
if cleaned_session is None:
|
||||||
self._logger.info(
|
cleanup_span.set_attribute('sandbox.result', 'skipped')
|
||||||
'sandbox_cleaned',
|
continue
|
||||||
attrs=_sandbox_attrs(cleaned_session),
|
|
||||||
|
cleanup_span.set_attribute('sandbox.result', 'cleaned')
|
||||||
|
cleaned_sessions.append(cleaned_session)
|
||||||
|
self._metrics.increment(
|
||||||
|
'sandbox.cleanup.total',
|
||||||
|
attrs=_result_metric_attrs('cleaned'),
|
||||||
|
)
|
||||||
|
self._logger.info(
|
||||||
|
'sandbox_cleaned',
|
||||||
|
attrs=_sandbox_attrs(cleaned_session),
|
||||||
|
)
|
||||||
|
|
||||||
|
span.set_attribute('sandbox.cleaned_count', len(cleaned_sessions))
|
||||||
|
span.set_attribute('sandbox.error_count', error_count)
|
||||||
|
span.set_attribute(
|
||||||
|
'sandbox.result',
|
||||||
|
'completed' if error_count == 0 else 'completed_with_errors',
|
||||||
)
|
)
|
||||||
|
return cleaned_sessions
|
||||||
return cleaned_sessions
|
|
||||||
|
|
||||||
def _cleanup_session(self, session: SandboxSession) -> SandboxSession | None:
|
def _cleanup_session(self, session: SandboxSession) -> SandboxSession | None:
|
||||||
with self._locker.lock(session.chat_id):
|
with self._locker.lock(session.chat_id):
|
||||||
|
|
@ -129,6 +214,7 @@ class CleanupExpiredSandboxes:
|
||||||
|
|
||||||
self._runtime.stop(current_session.container_id)
|
self._runtime.stop(current_session.container_id)
|
||||||
self._repository.delete(current_session.session_id)
|
self._repository.delete(current_session.session_id)
|
||||||
|
_set_active_count(self._metrics, self._repository)
|
||||||
return current_session
|
return current_session
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -142,3 +228,36 @@ def _sandbox_attrs(session: SandboxSession) -> dict[str, str]:
|
||||||
'session_id': str(session.session_id),
|
'session_id': str(session.session_id),
|
||||||
'container_id': session.container_id,
|
'container_id': session.container_id,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _sandbox_span_attrs(session: SandboxSession) -> dict[str, str]:
|
||||||
|
return {
|
||||||
|
'chat.id': str(session.chat_id),
|
||||||
|
'session.id': str(session.session_id),
|
||||||
|
'container.id': session.container_id,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _result_metric_attrs(result: str) -> dict[str, str]:
|
||||||
|
return {'result': result}
|
||||||
|
|
||||||
|
|
||||||
|
def _error_metric_attrs(error_type: str) -> dict[str, str]:
|
||||||
|
return {'error.type': error_type}
|
||||||
|
|
||||||
|
|
||||||
|
def _cleanup_error_metric_attrs(
|
||||||
|
error_type: str,
|
||||||
|
reason: str,
|
||||||
|
) -> dict[str, str]:
|
||||||
|
return {
|
||||||
|
'error.type': error_type,
|
||||||
|
'reason': reason,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _set_active_count(
|
||||||
|
metrics: Metrics,
|
||||||
|
repository: SandboxSessionRepository,
|
||||||
|
) -> None:
|
||||||
|
metrics.set('sandbox.active.count', repository.count_active())
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue