add sandbox observability failure tests

This commit is contained in:
Azamat 2026-04-03 02:04:51 +03:00
parent 02770bce7d
commit b4a2a9ceea
3 changed files with 268 additions and 6 deletions

View file

@ -281,6 +281,22 @@ class StaleSnapshotRepository(InMemorySandboxSessionRepository):
return [self._snapshot]
class FailingSaveRepository(InMemorySandboxSessionRepository):
    """In-memory session repository whose next ``save`` can be made to raise.

    The failure is one-shot: once an armed ``save`` has raised the configured
    error, subsequent saves are delegated to the parent class again.
    """

    def __init__(self, error: Exception) -> None:
        """Remember *error* as the exception an armed ``save`` will raise."""
        super().__init__()
        self._error = error
        self._fail_next_save = False

    def fail_next_save(self) -> None:
        """Arm the repository so that the next ``save`` call raises."""
        self._fail_next_save = True

    def save(self, session: SandboxSession) -> None:
        """Raise the configured error if armed, otherwise store *session*."""
        # Read and clear the flag in one step so the failure fires only once.
        armed, self._fail_next_save = self._fail_next_save, False
        if armed:
            raise self._error
        super().save(session)
class FakeRuntime:
def __init__(self) -> None:
self.create_calls: list[dict[str, object]] = []
@ -706,6 +722,64 @@ def test_create_sandbox_replace_stop_failure_preserves_separate_identities(
assert excinfo.value in span.errors
def test_create_sandbox_replace_save_failure_records_stage_safe_trace_ids(
    monkeypatch,
) -> None:
    """Check observability when saving the replacement sandbox fails.

    An expired session is seeded, then the repository is armed so the next
    save raises. The test asserts that the old container was stopped, exactly
    one create call was made, no active session remains for the chat, an
    error result metric was incremented, and the use-case span carries
    separate previous/new identifiers (not the generic ``session.id`` /
    ``container.id`` keys) together with the raised exception.
    """
    current_time = datetime(2026, 4, 2, 12, 0, tzinfo=UTC)
    failing_repository = FailingSaveRepository(RuntimeError('save_failed'))
    # Seed the expired session before arming: only the save that follows
    # ``fail_next_save()`` is supposed to raise.
    failing_repository.save(
        SandboxSession(
            session_id=SESSION_OLD_ID,
            chat_id=CHAT_ID,
            container_id='container-old',
            status=SandboxStatus.RUNNING,
            created_at=current_time - timedelta(minutes=10),
            expires_at=current_time,
        )
    )
    failing_repository.fail_next_save()

    recorded_metrics = RecordingMetrics()
    recording_tracer = RecordingTracer()
    fake_runtime = FakeRuntime()
    create_sandbox = CreateSandbox(
        repository=failing_repository,
        locker=FakeLocker(),
        runtime=fake_runtime,
        clock=FakeClock(current_time),
        logger=FakeLogger(),
        metrics=recorded_metrics,
        tracer=recording_tracer,
        ttl=timedelta(minutes=5),
    )
    # Pin the generated session id so the span attributes are predictable.
    monkeypatch.setattr('usecase.sandbox._new_session_id', lambda: SESSION_NEW_ID)

    with pytest.raises(RuntimeError, match='save_failed') as raised:
        create_sandbox.execute(CreateSandboxCommand(chat_id=CHAT_ID))

    assert fake_runtime.stop_calls == ['container-old']
    assert len(fake_runtime.create_calls) == 1
    assert failing_repository.get_active_by_chat_id(CHAT_ID) is None
    _assert_increment_metric_present(
        recorded_metrics,
        'sandbox.create.total',
        attrs={'result': 'error'},
    )

    expected_span_attrs = {
        'sandbox.previous_session.id': str(SESSION_OLD_ID),
        'sandbox.previous_container.id': 'container-old',
        'sandbox.new_session.id': str(SESSION_NEW_ID),
        'sandbox.new_container.id': f'container-{SESSION_NEW_ID}',
        'sandbox.result': 'error',
    }
    create_span = _find_span(
        recording_tracer,
        'usecase.create_sandbox',
        {'chat.id': str(CHAT_ID)},
        expected_span_attrs,
    )
    # The generic identity keys must be absent from the span attributes.
    for generic_key in ('session.id', 'container.id'):
        assert generic_key not in create_span.attrs
    assert raised.value in create_span.errors
def test_create_sandbox_serializes_duplicate_concurrent_create_for_chat_id(
monkeypatch,
) -> None:
@ -982,6 +1056,8 @@ def test_cleanup_expired_sandboxes_continues_after_stop_failure() -> None:
repository.save(cleaned_session)
runtime = FailingStopRuntime('container-fail')
logger = FakeLogger()
metrics = RecordingMetrics()
tracer = RecordingTracer()
locker = FakeLocker()
usecase = CleanupExpiredSandboxes(
repository=repository,
@ -989,8 +1065,8 @@ def test_cleanup_expired_sandboxes_continues_after_stop_failure() -> None:
runtime=runtime,
clock=FakeClock(now),
logger=logger,
metrics=NoopMetrics(),
tracer=NoopTracer(),
metrics=metrics,
tracer=tracer,
)
result = usecase.execute()
@ -1021,3 +1097,48 @@ def test_cleanup_expired_sandboxes_continues_after_stop_failure() -> None:
},
),
]
_assert_increment_metric_present(
metrics,
'sandbox.cleanup.error.total',
attrs={'error.type': 'RuntimeError'},
)
_assert_increment_metric_present(
metrics,
'sandbox.cleanup.total',
attrs={'result': 'cleaned'},
)
assert _active_count_values(metrics)
assert _active_count_values(metrics)[-1] == 1
root_span = _find_span(
tracer,
'usecase.cleanup_expired_sandboxes',
span_attrs={
'sandbox.expired_count': 2,
'sandbox.cleaned_count': 1,
'sandbox.error_count': 1,
'sandbox.result': 'completed_with_errors',
},
)
assert not root_span.errors
failed_span = _find_span(
tracer,
'usecase.cleanup_expired_sandbox',
{
'chat.id': str(FAIL_CHAT_ID),
'session.id': str(SESSION_FAIL_ID),
'container.id': 'container-fail',
},
{'sandbox.result': 'error'},
)
assert [str(error) for error in failed_span.errors] == ['stop_failed']
cleaned_span = _find_span(
tracer,
'usecase.cleanup_expired_sandbox',
{
'chat.id': str(CLEAN_CHAT_ID),
'session.id': str(SESSION_CLEAN_ID),
'container.id': 'container-clean',
},
{'sandbox.result': 'cleaned'},
)
assert not cleaned_span.errors